more docs

git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@482 d0cd1f9f-072b-0410-8dd7-cf729c803f20
allen8807 · Sep 30, 2010 · 0759ee7 · 0759ee7
1 parent 9943e96
commit 0759ee7
Show file tree

Hide file tree

Showing 6 changed files with 265 additions and 0 deletions.
diff --git a/doc/cntraining.1.asc b/doc/cntraining.1.asc
@@ -0,0 +1,26 @@
+CNTRAINING(1)
+=============
+
+NAME
+----
+cntraining - character normalization training for Tesseract
+
+SYNOPSIS
+--------
+*cntraining* 'FILE'...
+
+DESCRIPTION
+-----------
+cntraining takes a list of .tr files, from which it generates the
+normproto data file (the character normalization sensitivity prototypes).
+
+SEE ALSO
+--------
+tesseract(1), mftraining(1)
+
+<http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3>
+
+COPYING
+-------
+Copyright (c) Hewlett-Packard Company, 1988
+Licensed under the Apache License, Version 2.0
diff --git a/doc/cntraining.1.xml b/doc/cntraining.1.xml
@@ -0,0 +1,34 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<?asciidoc-toc?>
+<?asciidoc-numbered?>
+<refentry lang="en">
+<refmeta>
+<refentrytitle>cntraining</refentrytitle>
+<manvolnum>1</manvolnum>
+<refmiscinfo class="source">&nbsp;</refmiscinfo>
+<refmiscinfo class="manual">&nbsp;</refmiscinfo>
+</refmeta>
+<refnamediv>
+    <refname>cntraining</refname>
+    <refpurpose>character normalization training for Tesseract</refpurpose>
+</refnamediv>
+<refsynopsisdiv id="_synopsis">
+<simpara><emphasis role="strong">cntraining</emphasis> <emphasis>FILE</emphasis>&#8230;</simpara>
+</refsynopsisdiv>
+<refsect1 id="_description">
+<title>DESCRIPTION</title>
+<simpara>cntraining takes a list of .tr files, from which it generates the
+normproto data file (the character normalization sensitivity prototypes).</simpara>
+</refsect1>
+<refsect1 id="_see_also">
+<title>SEE ALSO</title>
+<simpara>tesseract(1), mftraining(1)</simpara>
+<simpara><ulink url="http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3">http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3</ulink></simpara>
+</refsect1>
+<refsect1 id="_copying">
+<title>COPYING</title>
+<simpara>Copyright (c) Hewlett-Packard Company, 1988
+Licensed under the Apache License, Version 2.0</simpara>
+</refsect1>
+</refentry>
diff --git a/doc/mftraining.1.asc b/doc/mftraining.1.asc
@@ -0,0 +1,34 @@
+MFTRAINING(1)
+=============
+
+NAME
+----
+mftraining - feature training for Tesseract
+
+SYNOPSIS
+--------
+*mftraining* -U 'unicharset' -O 'lang.unicharset' 'FILE'...
+
+DESCRIPTION
+-----------
+mftraining takes a list of .tr files, from which it generates the
+files inttemp (the shape prototypes) and pffmtable (the number of 
+expected features for each character). (A third file called Microfeat 
+is also written by this program, but it is not used.)
+
+OPTIONS
+-------
+'-U' FILE::
+	The unicharset generated by unicharset_extractor.
+
+'-O' FILE::
+	The output unicharset that will be given to combine_tessdata.
+
+SEE ALSO
+--------
+tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1)
+
+COPYING
+-------
+Copyright (c) Hewlett-Packard Company, 1988
+Licensed under the Apache License, Version 2.0
diff --git a/doc/mftraining.1.xml b/doc/mftraining.1.xml
@@ -0,0 +1,42 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<?asciidoc-toc?>
+<?asciidoc-numbered?>
+<refentry lang="en">
+<refmeta>
+<refentrytitle>mftraining</refentrytitle>
+<manvolnum>1</manvolnum>
+<refmiscinfo class="source">&nbsp;</refmiscinfo>
+<refmiscinfo class="manual">&nbsp;</refmiscinfo>
+</refmeta>
+<refnamediv>
+    <refname>mftraining</refname>
+    <refpurpose>feature training for Tesseract</refpurpose>
+</refnamediv>
+<refsynopsisdiv id="_synopsis">
+<simpara><emphasis role="strong">mftraining</emphasis> -U <emphasis>unicharset</emphasis> -O <emphasis>lang.unicharset</emphasis> <emphasis>FILE</emphasis>&#8230;</simpara>
+</refsynopsisdiv>
+<refsect1 id="_description">
+<title>DESCRIPTION</title>
+<simpara>mftraining takes a list of .tr files, from which it generates the
+files inttemp (the shape prototypes) and pffmtable (the number of
+expected features for each character). (A third file called Microfeat
+is also written by this program, but it is not used.)</simpara>
+</refsect1>
+<refsect1 id="_options">
+<title>OPTIONS</title>
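+<variablelist>
+<varlistentry>
+<term><emphasis>-U</emphasis> FILE</term>
+<listitem>
+<simpara>The unicharset generated by unicharset_extractor.</simpara>
+</listitem>
+</varlistentry>
+<varlistentry>
+<term><emphasis>-O</emphasis> FILE</term>
+<listitem>
+<simpara>The output unicharset that will be given to combine_tessdata.</simpara>
+</listitem>
+</varlistentry>
+</variablelist>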
+</refsect1>
+<refsect1 id="_see_also">
+<title>SEE ALSO</title>
+<simpara>tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1)</simpara>
+</refsect1>
+<refsect1 id="_copying">
+<title>COPYING</title>
+<simpara>Copyright (c) Hewlett-Packard Company, 1988
+Licensed under the Apache License, Version 2.0</simpara>
+</refsect1>
+</refentry>
diff --git a/doc/unicharset.5.asc b/doc/unicharset.5.asc
@@ -0,0 +1,66 @@
+UNICHARSET(5)
+=============
+
+NAME
+----
+unicharset - character properties for use by Tesseract
+
+DESCRIPTION
+-----------
+Tesseract needs to have access to the character properties isalpha, 
+isdigit, isupper, islower, ispunctuation. This data must be encoded 
+in the unicharset data file. Each line of this file corresponds to 
+one character. The character in UTF-8 is followed by a hexadecimal 
+number representing a binary mask that encodes the properties. 
+Each bit corresponds to a property. If the bit is set to 1, it 
+means that the property is true. The bit ordering is (from least 
+significant bit to most significant bit): isalpha, islower, isupper, 
+isdigit, ispunctuation.
+
+Each line in the unicharset file has four space-separated fields:
+......................................
+[character] [properties] [script] [id]
+......................................
+
+EXAMPLE
+-------
+..............
+; 10 Common 46
+b 3 Latin 59
+W 5 Latin 40
+7 8 Common 66
+= 0 Common 93
+..............
+
+";" is a punctuation character. Its properties are thus represented by the binary number 
+10000 (10 in hexadecimal).
+
+"b" is an alphabetic character and a lower case character. Its properties are thus 
+represented by the binary number 00011 (3 in hexadecimal).
+
+"W" is an alphabetic character and an upper case character. Its properties are thus 
+represented by the binary number 00101 (5 in hexadecimal).
+
+"7" is just a digit. Its properties are thus represented by the binary number 01000 
+(8 in hexadecimal).
+
+"=" is neither punctuation nor a digit nor an alphabetic character. Its properties are
+thus represented by the binary number 00000 (0 in hexadecimal).
+
+Japanese or Chinese alphabetic character properties are represented by the binary number 
+00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case.
+
+The last two columns represent the type of script (Latin, Common, Greek, Cyrillic, Han, 
+null) and id code of the character.
+
+HISTORY
+-------
+The unicharset format first appeared with Tesseract 2.00, which was the first version
+to support languages other than English. At that time the unicharset file contained only
+the first two fields, and the "ispunctuation" property was absent (punctuation was
+regarded as "0", as "=" is in the above example).
+
+SEE ALSO
+--------
+tesseract(1), unicharset_extractor(1)
+
diff --git a/doc/unicharset.5.xml b/doc/unicharset.5.xml
@@ -0,0 +1,63 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE refentry PUBLIC "-//OASIS//DTD DocBook XML V4.5//EN" "http://www.oasis-open.org/docbook/xml/4.5/docbookx.dtd">
+<?asciidoc-toc?>
+<?asciidoc-numbered?>
+<refentry lang="en">
+<refmeta>
+<refentrytitle>unicharset</refentrytitle>
+<manvolnum>5</manvolnum>
+<refmiscinfo class="source">&nbsp;</refmiscinfo>
+<refmiscinfo class="manual">&nbsp;</refmiscinfo>
+</refmeta>
+<refnamediv>
+    <refname>unicharset</refname>
+    <refpurpose>character properties for use by Tesseract</refpurpose>
+</refnamediv>
+<refsect1 id="_description">
+<title>DESCRIPTION</title>
+<simpara>Tesseract needs to have access to the character properties isalpha,
+isdigit, isupper, islower, ispunctuation. This data must be encoded
+in the unicharset data file. Each line of this file corresponds to
+one character. The character in UTF-8 is followed by a hexadecimal
+number representing a binary mask that encodes the properties.
+Each bit corresponds to a property. If the bit is set to 1, it
+means that the property is true. The bit ordering is (from least
+significant bit to most significant bit): isalpha, islower, isupper,
+isdigit, ispunctuation.</simpara>
+<simpara>Each line in the unicharset file has four space-separated fields:
+   [character] [properties] [script] [id]</simpara>
+</refsect1>
+<refsect1 id="_example">
+<title>EXAMPLE</title>
+<literallayout class="monospaced">; 10 Common 46
+b 3 Latin 59
+W 5 Latin 40
+7 8 Common 66
+= 0 Common 93</literallayout>
+<simpara>";" is a punctuation character. Its properties are thus represented by the binary number
+10000 (10 in hexadecimal).</simpara>
+<simpara>"b" is an alphabetic character and a lower case character. Its properties are thus
+represented by the binary number 00011 (3 in hexadecimal).</simpara>
+<simpara>"W" is an alphabetic character and an upper case character. Its properties are thus
+represented by the binary number 00101 (5 in hexadecimal).</simpara>
+<simpara>"7" is just a digit. Its properties are thus represented by the binary number 01000
+(8 in hexadecimal).</simpara>
+<simpara>"=" is neither punctuation nor a digit nor an alphabetic character. Its properties are
+thus represented by the binary number 00000 (0 in hexadecimal).</simpara>
+<simpara>Japanese or Chinese alphabetic character properties are represented by the binary number
+00001 (1 in hexadecimal): they are alphabetic, but neither upper nor lower case.</simpara>
+<simpara>The last two columns represent the type of script (Latin, Common, Greek, Cyrillic, Han,
+null) and id code of the character.</simpara>
+</refsect1>
+<refsect1 id="_history">
+<title>HISTORY</title>
+<simpara>The unicharset format first appeared with Tesseract 2.00, which was the first version
+to support languages other than English. At that time the unicharset file contained only
+the first two fields, and the "ispunctuation" property was absent (punctuation was
+regarded as "0", as "=" is in the above example).</simpara>
+</refsect1>
+<refsect1 id="_see_also">
+<title>SEE ALSO</title>
+<simpara>tesseract(1), unicharset_extractor(1)</simpara>
+</refsect1>
+</refentry>