From 5575d8db029ae3ce6414dea78d7d7cc9043fa041 Mon Sep 17 00:00:00 2001 From: joregan Date: Thu, 30 Sep 2010 02:18:45 +0000 Subject: [PATCH] last one git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@483 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- doc/wordlist2dawg.1.asc | 41 ++++++++++++++++++++++++++++++++++++ doc/wordlist2dawg.1.xml | 46 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+) create mode 100644 doc/wordlist2dawg.1.asc create mode 100644 doc/wordlist2dawg.1.xml diff --git a/doc/wordlist2dawg.1.asc b/doc/wordlist2dawg.1.asc new file mode 100644 index 0000000000..d3aad35781 --- /dev/null +++ b/doc/wordlist2dawg.1.asc @@ -0,0 +1,41 @@ +WORDLIST2DAWG(1) +================ + +NAME +---- +wordlist2dawg - convert a wordlist to a DAWG for Tesseract + +SYNOPSIS +-------- +*wordlist2dawg* 'WORDLIST' 'DAWG' 'lang.unicharset' + +DESCRIPTION +----------- +wordlist2dawg(1) converts a wordlist to a Directed Acyclic +Word Graph (DAWG) for use with Tesseract. + +The wordlists are split into two: one with high frequency +words, and one with the rest. + +OPTIONS +------- +'WORDLIST' + A plain text file in UTF-8, one word per line + +'DAWG' + The output DAWG to write + +'lang.unicharset' + The unicharset of the language. This is the unicharset + generated by mftraining(1) + +SEE ALSO +-------- +tesseract(1), mftraining(1) + + + +COPYING +------- +Copyright (c) 2006 Google, Inc. +Licensed under the Apache License, Version 2.0 diff --git a/doc/wordlist2dawg.1.xml b/doc/wordlist2dawg.1.xml new file mode 100644 index 0000000000..d8ba0cbed0 --- /dev/null +++ b/doc/wordlist2dawg.1.xml @@ -0,0 +1,46 @@ + + + + + + +wordlist2dawg +1 +  +  + + + wordlist2dawg + convert a wordlist to a DAWG for Tesseract + + +wordlist2dawg WORDLIST DAWG lang.unicharset + + +DESCRIPTION +wordlist2dawg(1) converts a wordlist to a Directed Acyclic +Word Graph (DAWG) for use with Tesseract. +The wordlists are split into two: one with high frequency +words, and one with the rest. + + +OPTIONS +WORDLIST + A plain text file in UTF-8, one word per line +DAWG + The output DAWG to write +lang.unicharset + The unicharset of the language. This is the unicharset + generated by mftraining(1) + + +SEE ALSO +tesseract(1), mftraining(1) +http://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3 + + +COPYING +Copyright (c) 2006 Google, Inc. +Licensed under the Apache License, Version 2.0 + +