forked from tesseract-ocr/tesseract
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@482 d0cd1f9f-072b-0410-8dd7-cf729c803f20
- Loading branch information
joregan
committed
Sep 30, 2010
1 parent
9943e96
commit 0759ee7
Showing
6 changed files
with
265 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
CNTRAINING(1) | ||
============= | ||
|
||
NAME | ||
---- | ||
cntraining - character normalization training for Tesseract | ||
|
||
SYNOPSIS | ||
-------- | ||
*cntraining* 'FILE'... | ||
|
||
DESCRIPTION | ||
----------- | ||
cntraining takes a list of .tr files, from which it generates the | ||
normproto data file (the character normalization sensitivity prototypes). | ||
|
||
SEE ALSO | ||
-------- | ||
tesseract(1), mftraining(1) | ||
|
||
<http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3> | ||
|
||
COPYING | ||
------- | ||
Copyright (c) Hewlett-Packard Company, 1988 | ||
Licensed under the Apache License, Version 2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd"> | ||
<?asciidoc-toc?> | ||
<?asciidoc-numbered?> | ||
<refentry lang="en"> | ||
<refmeta> | ||
<refentrytitle>cntraining</refentrytitle> | ||
<manvolnum>1</manvolnum> | ||
<refmiscinfo class="source"> </refmiscinfo> | ||
<refmiscinfo class="manual"> </refmiscinfo> | ||
</refmeta> | ||
<refnamediv> | ||
<refname>cntraining</refname> | ||
<refpurpose>character normalization training for Tesseract</refpurpose> | ||
</refnamediv> | ||
<refsynopsisdiv id="_synopsis"> | ||
<simpara><emphasis role="strong">cntraining</emphasis> <emphasis>FILE</emphasis>…</simpara> | ||
</refsynopsisdiv> | ||
<refsect1 id="_description"> | ||
<title>DESCRIPTION</title> | ||
<simpara>cntraining takes a list of .tr files, from which it generates the | ||
normproto data file (the character normalization sensitivity prototypes).</simpara> | ||
</refsect1> | ||
<refsect1 id="_see_also"> | ||
<title>SEE ALSO</title> | ||
<simpara>tesseract(1), mftraining(1)</simpara> | ||
<simpara><ulink url="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</ulink></simpara> | ||
</refsect1> | ||
<refsect1 id="_copying"> | ||
<title>COPYING</title> | ||
<simpara>Copyright (c) Hewlett-Packard Company, 1988 | ||
Licensed under the Apache License, Version 2.0</simpara> | ||
</refsect1> | ||
</refentry> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
MFTRAINING(1) | ||
============= | ||
|
||
NAME | ||
---- | ||
mftraining - feature training for Tesseract | ||
|
||
SYNOPSIS | ||
-------- | ||
*mftraining* -U 'unicharset' -O 'lang.unicharset' 'FILE'... | ||
|
||
DESCRIPTION | ||
----------- | ||
mftraining takes a list of .tr files, from which it generates the | ||
files inttemp (the shape prototypes) and pffmtable (the number of | ||
expected features for each character). (A third file called Microfeat | ||
is also written by this program, but it is not used.) | ||
|
||
OPTIONS | ||
------- | ||
'-U' 'FILE':: | ||
    The unicharset generated by unicharset_extractor. | ||
|
||
'-O' 'FILE':: | ||
    The output unicharset that will be given to combine_tessdata. | ||
|
||
SEE ALSO | ||
-------- | ||
tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1) | ||
|
||
COPYING | ||
------- | ||
Copyright (c) Hewlett-Packard Company, 1988 | ||
Licensed under the Apache License, Version 2.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd"> | ||
<?asciidoc-toc?> | ||
<?asciidoc-numbered?> | ||
<refentry lang="en"> | ||
<refmeta> | ||
<refentrytitle>mftraining</refentrytitle> | ||
<manvolnum>1</manvolnum> | ||
<refmiscinfo class="source"> </refmiscinfo> | ||
<refmiscinfo class="manual"> </refmiscinfo> | ||
</refmeta> | ||
<refnamediv> | ||
<refname>mftraining</refname> | ||
<refpurpose>feature training for Tesseract</refpurpose> | ||
</refnamediv> | ||
<refsynopsisdiv id="_synopsis"> | ||
<simpara>mftraining -U <emphasis>unicharset</emphasis> -O <emphasis>lang.unicharset</emphasis> <emphasis>FILE</emphasis>…</simpara> | ||
</refsynopsisdiv> | ||
<refsect1 id="_description"> | ||
<title>DESCRIPTION</title> | ||
<simpara>mftraining takes a list of .tr files, from which it generates the | ||
files inttemp (the shape prototypes) and pffmtable (the number of | ||
expected features for each character). (A third file called Microfeat | ||
is also written by this program, but it is not used.)</simpara> | ||
</refsect1> | ||
<refsect1 id="_options"> | ||
<title>OPTIONS</title> | ||
<simpara><emphasis>-U</emphasis> FILE | ||
The unicharset generated by unicharset_extractor</simpara> | ||
<simpara><emphasis>-O</emphasis> FILE | ||
The output unicharset that will be given to combine_tessdata.</simpara> | ||
</refsect1> | ||
<refsect1 id="_see_also"> | ||
<title>SEE ALSO</title> | ||
<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1)</simpara> | ||
</refsect1> | ||
<refsect1 id="_copying"> | ||
<title>COPYING</title> | ||
<simpara>Copyright (c) Hewlett-Packard Company, 1988 | ||
Licensed under the Apache License, Version 2.0</simpara> | ||
</refsect1> | ||
</refentry> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
UNICHARSET(5) | ||
============= | ||
|
||
NAME | ||
---- | ||
unicharset - character properties for use by Tesseract | ||
|
||
DESCRIPTION | ||
----------- | ||
Tesseract needs to have access to the character properties isalpha, | ||
isdigit, isupper, islower, ispunctuation. This data must be encoded | ||
in the unicharset data file. Each line of this file corresponds to | ||
one character. The character in UTF-8 is followed by a hexadecimal | ||
number representing a binary mask that encodes the properties. | ||
Each bit corresponds to a property. If the bit is set to 1, it | ||
means that the property is true. The bit ordering is (from least | ||
significant bit to most significant bit): isalpha, islower, isupper, | ||
isdigit, ispunctuation. | ||
|
||
Each line in the unicharset file has four space-separated fields: | ||
...................................... | ||
[character] [properties] [script] [id] | ||
...................................... | ||
|
||
EXAMPLE | ||
------- | ||
.............. | ||
; 10 Common 46 | ||
b 3 Latin 59 | ||
W 5 Latin 40 | ||
7 8 Common 66 | ||
= 0 Common 93 | ||
.............. | ||
|
||
";" is a punctuation character. Its properties are thus represented by the binary number | ||
10000 (10 in hexadecimal). | ||
|
||
"b" is an alphabetic character and a lower case character. Its properties are thus | ||
represented by the binary number 00011 (3 in hexadecimal). | ||
|
||
"W" is an alphabetic character and an upper case character. Its properties are thus | ||
represented by the binary number 00101 (5 in hexadecimal). | ||
|
||
"7" is just a digit. Its properties are thus represented by the binary number 01000 | ||
(8 in hexadecimal). | ||
|
||
"=" is not punctuation nor a digit nor an alphabetic character. Its properties are | ||
thus represented by the binary number 00000 (0 in hexadecimal). | ||
|
||
Japanese or Chinese alphabetic character properties are represented by the binary number | ||
00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case. | ||
|
||
The last two columns represent the type of script (Latin, Common, Greek, Cyrillic, Han, | ||
null) and id code of the character. | ||
|
||
HISTORY | ||
------- | ||
The unicharset format first appeared with Tesseract 2.00, which was the first version | ||
to support languages other than English. The unicharset file contained only the first | ||
two fields, and the "ispunctuation" property was absent (punctuation was regarded as | ||
"0", as "=" is in the above example). | ||
|
||
SEE ALSO | ||
-------- | ||
tesseract(1), unicharset_extractor(1) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
<?xml version="1.0" encoding="UTF-8"?> | ||
<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd"> | ||
<?asciidoc-toc?> | ||
<?asciidoc-numbered?> | ||
<refentry lang="en"> | ||
<refmeta> | ||
<refentrytitle>unicharset</refentrytitle> | ||
<manvolnum>5</manvolnum> | ||
<refmiscinfo class="source"> </refmiscinfo> | ||
<refmiscinfo class="manual"> </refmiscinfo> | ||
</refmeta> | ||
<refnamediv> | ||
<refname>unicharset</refname> | ||
<refpurpose>character properties for use by Tesseract</refpurpose> | ||
</refnamediv> | ||
<refsect1 id="_description"> | ||
<title>DESCRIPTION</title> | ||
<simpara>Tesseract needs to have access to the character properties isalpha, | ||
isdigit, isupper, islower, ispunctuation. This data must be encoded | ||
in the unicharset data file. Each line of this file corresponds to | ||
one character. The character in UTF-8 is followed by a hexadecimal | ||
number representing a binary mask that encodes the properties. | ||
Each bit corresponds to a property. If the bit is set to 1, it | ||
means that the property is true. The bit ordering is (from least | ||
significant bit to most significant bit): isalpha, islower, isupper, | ||
isdigit, ispunctuation.</simpara> | ||
<simpara>Each line in the unicharset file has four space-separated fields: | ||
[character] [properties] [script] [id]</simpara> | ||
</refsect1> | ||
<refsect1 id="_example"> | ||
<title>EXAMPLE</title> | ||
<literallayout class="monospaced">; 10 Common 46 | ||
b 3 Latin 59 | ||
W 5 Latin 40 | ||
7 8 Common 66 | ||
= 0 Common 93</literallayout> | ||
<simpara>";" is a punctuation character. Its properties are thus represented by the binary number | ||
10000 (10 in hexadecimal).</simpara> | ||
<simpara>"b" is an alphabetic character and a lower case character. Its properties are thus | ||
represented by the binary number 00011 (3 in hexadecimal).</simpara> | ||
<simpara>"W" is an alphabetic character and an upper case character. Its properties are thus | ||
represented by the binary number 00101 (5 in hexadecimal).</simpara> | ||
<simpara>"7" is just a digit. Its properties are thus represented by the binary number 01000 | ||
(8 in hexadecimal).</simpara> | ||
<simpara>"=" is not punctuation nor a digit nor an alphabetic character. Its properties are | ||
thus represented by the binary number 00000 (0 in hexadecimal).</simpara> | ||
<simpara>Japanese or Chinese alphabetic character properties are represented by the binary number | ||
00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case.</simpara> | ||
<simpara>The last two columns represent the type of script (Latin, Common, Greek, Cyrillic, Han, | ||
null) and id code of the character.</simpara> | ||
</refsect1> | ||
<refsect1 id="_history"> | ||
<title>HISTORY</title> | ||
<simpara>The unicharset format first appeared with Tesseract 2.00, which was the first version | ||
to support languages other than English. The unicharset file contained only the first | ||
two fields, and the "ispunctuation" property was absent (punctuation was regarded as | ||
"0", as "=" is in the above example).</simpara> | ||
</refsect1> | ||
<refsect1 id="_see_also"> | ||
<title>SEE ALSO</title> | ||
<simpara>tesseract(1), unicharset_extractor(1)</simpara> | ||
</refsect1> | ||
</refentry> |