Skip to content

Commit

Permalink
Merge branch 'master' of github.com:tesseract-ocr/tesseract
Browse files Browse the repository at this point in the history
  • Loading branch information
egorpugin committed Oct 2, 2015
2 parents 25136e4 + 2e7a633 commit f369585
Show file tree
Hide file tree
Showing 78 changed files with 271 additions and 285 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ training/wordlist2dawg
*.o
*.Plo
*.a
*.class
*.jar

# tessdata
*.cube.*
Expand Down
2 changes: 1 addition & 1 deletion COPYING
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
This package contains the Tesseract Open Source OCR Engine.
Orignally developed at Hewlett Packard Laboratories Bristol and
Originally developed at Hewlett Packard Laboratories Bristol and
at Hewlett Packard Co, Greeley Colorado, all the code
in this distribution is now licensed under the Apache License:

Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ find its data directory. You must either:
./autogen.sh
./configure
make
make install
sudo make install
sudo ldconfig

to move the data files to the standard place, or:
Expand Down
2 changes: 1 addition & 1 deletion api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() {
word->word->space() > 0 &&
!word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)) {
/* Write a space to separate from preceeding good text */
/* Write a space to separate from preceding good text */
*ptr++ = ' ';
last_char_was_tilde = false;
}
Expand Down
2 changes: 1 addition & 1 deletion api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) {
AppendString((const char *)data);
}

// Helper function to prevent us from accidentaly writing
// Helper function to prevent us from accidentally writing
// scientific notation to an HOCR or PDF file. Besides, three
// decimal places are all you really need.
double prec(double x) {
Expand Down
2 changes: 1 addition & 1 deletion api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ int main(int argc, char **argv) {
}

// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatability reasons, the
// the command line. For backwards compatibility reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default
Expand Down
4 changes: 2 additions & 2 deletions ccmain/control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
word->fix_quotes();
if (tessedit_fix_hyphens)
word->fix_hyphens();
/* Dont trust fix_quotes! - though I think I've fixed the bug */
/* Don't trust fix_quotes! - though I think I've fixed the bug */
if (word->best_choice->length() != word->box_word->length()) {
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
" #Blobs=%d\n",
Expand Down Expand Up @@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
goto not_a_word;
/*
Allow a single hyphen in a lower case word
- dont trust upper case - I've seen several cases of "H" -> "I-I"
- don't trust upper case - I've seen several cases of "H" -> "I-I"
*/
if (lengths[i] == 1 && s[offset] == '-') {
hyphen_pos = i;
Expand Down
6 changes: 3 additions & 3 deletions ccmain/docqual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
int expected_outline_count;

if (STRING (outlines_odd).contains (c))
return 0; //Dont use this char
return 0; //Don't use this char
else if (STRING (outlines_2).contains (c))
expected_outline_count = 2;
else
Expand Down Expand Up @@ -157,7 +157,7 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
* - Word segmentation is the same as the original image
* - All characters have the expected number of outlines
* NOTE - the rejection counts are recalculated after unrejection
* - CANT do it in a single pass without a bit of fiddling
* - CAN'T do it in a single pass without a bit of fiddling
* - keep it simple but inefficient
*************************************************************************/
void Tesseract::unrej_good_quality_words( //unreject potential
Expand Down Expand Up @@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks

/*************************************************************************
* reject_whole_page()
* Dont believe any of it - set the reject map to 00..00 in all words
* Don't believe any of it - set the reject map to 00..00 in all words
*
*************************************************************************/

Expand Down
10 changes: 5 additions & 5 deletions ccmain/fixspace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
WERD_RES *word_res;
WERD_RES_LIST fuzzy_space_words;
inT16 new_length;
BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
inT32 word_index; // current word

block_res_it.set_to_list(&page_res->block_res_list);
Expand Down Expand Up @@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
* the same as "56163", though given our knowledge that the space is fuzzy, and
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
* is prefered.
* is preferred.
*
* The solution is to NOT COUNT the score of any word which has a digit at one
* end and a "1Il" as the character the other side of the space.
Expand Down Expand Up @@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
} else {
/*
Can we add the prev word score and potentially count this word?
Yes IF it didnt end in a 1 when the first char of this word is a digit
AND it didnt end in a digit when the first char of this word is a 1
Yes IF it didn't end in a 1 when the first char of this word is a digit
AND it didn't end in a digit when the first char of this word is a 1
*/
word_len = word->reject_map.length();
current_word_ok_so_far = FALSE;
Expand Down Expand Up @@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {

/*
Use all the standard pass 2 conditions for mode 5 in set_done() in
reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
CARE WHETHER WE HAVE of/at on/an etc.
*/
if (fixsp_done_mode > 0 &&
Expand Down
4 changes: 2 additions & 2 deletions ccmain/output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
/*************************************************************************
* SUSPECT LEVELS
*
* 0 - dont reject ANYTHING
* 0 - don't reject ANYTHING
* 1,2 - partial rejection
* 3 - BEST
*
Expand Down Expand Up @@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
rating_per_ch = word.rating() / word_res->reject_map.length();

if (rating_per_ch >= suspect_rating_per_ch)
return; //Dont touch bad ratings
return; //Don't touch bad ratings

if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
Expand Down
4 changes: 2 additions & 2 deletions ccmain/paramsd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename,
fclose(fp);
sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename);
int a = sv_window_->ShowYesNoDialog(msg_str);
if (a == 'n') { return; } // dont write
if (a == 'n') { return; } // don't write
}


fp = fopen (filename, "wb"); // can we write to it?
if (fp == NULL) {
sv_window_->AddMessage("Cant write to file " "%s" "", filename);
sv_window_->AddMessage("Can't write to file " "%s" "", filename);
return;
}

Expand Down
4 changes: 2 additions & 2 deletions ccmain/reject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,

/*************************************************************************
* dont_allow_1Il()
* Dont unreject LONE accepted 1Il conflict set chars
* Don't unreject LONE accepted 1Il conflict set chars
*************************************************************************/
void Tesseract::dont_allow_1Il(WERD_RES *word) {
int i = 0;
Expand Down Expand Up @@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
next_left = 9999;
else
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
// Dont touch small or touching blobs - it is too dangerous.
// Don't touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
aspect_ratio = out_box.width() / (float) out_box.height();
Expand Down
20 changes: 10 additions & 10 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Tesseract::Tesseract()
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces", this->params()),
BOOL_MEMBER(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility", this->params()),
"Don't bother with word plausibility", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
Expand Down Expand Up @@ -310,19 +310,19 @@ Tesseract::Tesseract()
this->params()),
INT_MEMBER(crunch_pot_indicators, 1,
"How many potential indicators needed", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
this->params()),
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
this->params()),
BOOL_MEMBER(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings", this->params()),
"Don't pot crunch sensible strings", this->params()),
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
this->params()),
INT_MEMBER(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_leave_uc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_long_repetitions, 3,
"Crunch words with long repetitions", this->params()),
Expand Down Expand Up @@ -393,21 +393,21 @@ Tesseract::Tesseract()
INT_MEMBER(suspect_space_level, 100,
"Min suspect level for rejecting spaces", this->params()),
INT_MEMBER(suspect_short_words, 2,
"Dont Suspect dict wds longer than this", this->params()),
"Don't suspect dict wds longer than this", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit",
this->params()),
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
this->params()),
BOOL_MEMBER(tessedit_minimal_rejection, false,
"Only reject tess failures", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
this->params()),
BOOL_MEMBER(tessedit_word_for_word, false,
"Make output have exactly one word per WERD", this->params()),
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL", this->params()),
"Don't reject ANYTHING AT ALL", this->params()),
BOOL_MEMBER(tessedit_consistent_reps, true,
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
Expand All @@ -424,7 +424,7 @@ Tesseract::Tesseract()
"Use DOC dawg in 11l conf. detector", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
this->params()),
Expand Down
22 changes: 11 additions & 11 deletions ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -733,7 +733,7 @@ class Tesseract : public Wordrec {
GenericVector<UNICHAR_ID>* class_ids);
// Resegments the word to achieve the target_text from the classifier.
// Returns false if the re-segmentation fails.
// Uses brute-force combination of upto kMaxGroupSize adjacent blobs, and
// Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and
// applies a full search on the classifier results to find the best classified
// segmentation. As a compromise to obtain better recall, 1-1 ambiguity
// substitutions ARE used.
Expand Down Expand Up @@ -833,7 +833,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces");
BOOL_VAR_H(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility");
"Don't bother with word plausibility");
BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?");
BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height");
BOOL_VAR_H(tessedit_enable_doc_dict, true,
Expand Down Expand Up @@ -954,15 +954,15 @@ class Tesseract : public Wordrec {
double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this");
INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch");
INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed");
BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings");
BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings");
BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring");
BOOL_VAR_H(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings");
"Don't pot crunch sensible strings");
BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures");
INT_VAR_H(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings");
"Don't crunch words with long lower case strings");
INT_VAR_H(crunch_leave_uc_strings, 4,
"Dont crunch words with long lower case strings");
"Don't crunch words with long lower case strings");
INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions");
INT_VAR_H(crunch_debug, 0, "As it says");
INT_VAR_H(fixsp_non_noise_limit, 1,
Expand Down Expand Up @@ -1010,16 +1010,16 @@ class Tesseract : public Wordrec {
INT_VAR_H(suspect_space_level, 100,
"Min suspect level for rejecting spaces");
INT_VAR_H(suspect_short_words, 2,
"Dont Suspect dict wds longer than this");
"Don't Suspect dict wds longer than this");
BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected");
double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit");
double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit");
double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit");
BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures");
BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING");
BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING");
BOOL_VAR_H(tessedit_word_for_word, false,
"Make output have exactly one word per WERD");
BOOL_VAR_H(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL");
"Don't reject ANYTHING AT ALL");
BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same");
INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm");
BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug");
Expand All @@ -1030,7 +1030,7 @@ class Tesseract : public Wordrec {
"Aspect ratio dot/hyphen test");
BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector");
BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test");
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check");
BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check");
BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control");
BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control");
BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control");
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/blobbox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK)

// Upto 30 degrees is allowed for rotations of diacritic blobs.
// Up to 30 degrees is allowed for rotations of diacritic blobs.
const double kCosSmallAngle = 0.866;
// Min aspect ratio for a joined word to indicate an obvious flow direction.
const double kDefiniteAspectRatio = 2.0;
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/boxread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ FILE* OpenBoxFile(const STRING& fname) {
FILE* box_file = NULL;
if (!(box_file = fopen(filename.string(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT,
"Cant open box file %s",
"Can't open box file %s",
filename.string());
}
return box_file;
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/normalis.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -382,7 +382,7 @@ void DENORM::LocalDenormTransform(const FCOORD& pt, FCOORD* original) const {
}

// Transforms the given coords all the way back to source image space using
// the full transformation sequence defined by this and its predecesors
// the full transformation sequence defined by this and its predecessors
// recursively, shallowest first, and finally any block re_rotation.
// If last_denorm is not NULL, then the last transformation used will
// be last_denorm, and the block re_rotation will never be executed.
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/normalis.h
Original file line number Diff line number Diff line change
Expand Up @@ -218,7 +218,7 @@ class DENORM {
void LocalDenormTransform(const TPOINT& pt, TPOINT* original) const;
void LocalDenormTransform(const FCOORD& pt, FCOORD* original) const;
// Transforms the given coords all the way back to source image space using
// the full transformation sequence defined by this and its predecesors
// the full transformation sequence defined by this and its predecessors
// recursively, shallowest first, and finally any block re_rotation.
// If last_denorm is not NULL, then the last transformation used will
// be last_denorm, and the block re_rotation will never be executed.
Expand Down
2 changes: 1 addition & 1 deletion ccstruct/pdblock.h
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ class PDBLK
PDBLK & operator= (const PDBLK & source);

protected:
POLY_BLOCK *hand_poly; //< wierd as well
POLY_BLOCK *hand_poly; //< weird as well
ICOORDELT_LIST leftside; //< left side vertices
ICOORDELT_LIST rightside; //< right side vertices
TBOX box; //< bounding box
Expand Down
Loading

0 comments on commit f369585

Please sign in to comment.