diff --git a/.gitignore b/.gitignore index 9081efccc9..cbf9cfeef9 100644 --- a/.gitignore +++ b/.gitignore @@ -59,6 +59,8 @@ training/wordlist2dawg *.o *.Plo *.a +*.class +*.jar # tessdata *.cube.* diff --git a/COPYING b/COPYING index 096aaafb27..8d8d48cf91 100644 --- a/COPYING +++ b/COPYING @@ -1,5 +1,5 @@ This package contains the Tesseract Open Source OCR Engine. -Orignally developed at Hewlett Packard Laboratories Bristol and +Originally developed at Hewlett Packard Laboratories Bristol and at Hewlett Packard Co, Greeley Colorado, all the code in this distribution is now licensed under the Apache License: diff --git a/README.md b/README.md index 5eff4402f2..c555506280 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ find its data directory. You must either: ./autogen.sh ./configure make - make install + sudo make install sudo ldconfig to move the data files to the standard place, or: diff --git a/api/baseapi.cpp b/api/baseapi.cpp index bdc02bfe86..fa38d29001 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() { word->word->space() > 0 && !word->word->flag(W_FUZZY_NON) && !word->word->flag(W_FUZZY_SP)) { - /* Write a space to separate from preceeding good text */ + /* Write a space to separate from preceding good text */ *ptr++ = ' '; last_char_was_tilde = false; } diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index e96f67c481..4f6afbe32e 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) { AppendString((const char *)data); } -// Helper function to prevent us from accidentaly writing +// Helper function to prevent us from accidentally writing // scientific notation to an HOCR or PDF file. Besides, three // decimal points are all you really need. 
double prec(double x) { diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index e7abadf3d0..501b66c42c 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -227,7 +227,7 @@ int main(int argc, char **argv) { } // We have 2 possible sources of pagesegmode: a config file and - // the command line. For backwards compatability reasons, the + // the command line. For backwards compatibility reasons, the // default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the // default for this program is tesseract::PSM_AUTO. We will let // the config file take priority, so the command-line default diff --git a/ccmain/control.cpp b/ccmain/control.cpp index d40c26329b..66a2a8bb3e 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word, word->fix_quotes(); if (tessedit_fix_hyphens) word->fix_hyphens(); - /* Dont trust fix_quotes! - though I think I've fixed the bug */ + /* Don't trust fix_quotes! - though I think I've fixed the bug */ if (word->best_choice->length() != word->box_word->length()) { tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;" " #Blobs=%d\n", @@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string( goto not_a_word; /* Allow a single hyphen in a lower case word - - dont trust upper case - I've seen several cases of "H" -> "I-I" + - don't trust upper case - I've seen several cases of "H" -> "I-I" */ if (lengths[i] == 1 && s[offset] == '-') { hyphen_pos = i; diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp index 6a7e6e67ef..327d7cbc55 100644 --- a/ccmain/docqual.cpp +++ b/ccmain/docqual.cpp @@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { int expected_outline_count; if (STRING (outlines_odd).contains (c)) - return 0; //Dont use this char + return 0; //Don't use this char else if (STRING (outlines_2).contains (c)) expected_outline_count = 2; else @@ -157,7 +157,7 @@ void 
Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, * - Word segmentation is the same as the original image * - All characters have the expected number of outlines * NOTE - the rejection counts are recalculated after unrejection - * - CANT do it in a single pass without a bit of fiddling + * - CAN'T do it in a single pass without a bit of fiddling * - keep it simple but inefficient *************************************************************************/ void Tesseract::unrej_good_quality_words( //unreject potential @@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks /************************************************************************* * reject_whole_page() - * Dont believe any of it - set the reject map to 00..00 in all words + * Don't believe any of it - set the reject map to 00..00 in all words * *************************************************************************/ diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 0a561ac9a0..e42617c053 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor, WERD_RES *word_res; WERD_RES_LIST fuzzy_space_words; inT16 new_length; - BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds + BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds inT32 word_index; // current word block_res_it.set_to_list(&page_res->block_res_list); @@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, * fuzzy spaces. The problem with the basic measure is that "561 63" would score * the same as "56163", though given our knowledge that the space is fuzzy, and * that there is a "1" next to the fuzzy space, we need to ensure that "56163" - * is prefered. + * is preferred. * * The solution is to NOT COUNT the score of any word which has a digit at one * end and a "1Il" as the character the other side of the space. 
@@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) { } else { /* Can we add the prev word score and potentially count this word? - Yes IF it didnt end in a 1 when the first char of this word is a digit - AND it didnt end in a digit when the first char of this word is a 1 + Yes IF it didn't end in a 1 when the first char of this word is a digit + AND it didn't end in a digit when the first char of this word is a 1 */ word_len = word->reject_map.length(); current_word_ok_so_far = FALSE; @@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) { /* Use all the standard pass 2 conditions for mode 5 in set_done() in - reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT + reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T CARE WHETHER WE HAVE of/at on/an etc. */ if (fixsp_done_mode > 0 && diff --git a/ccmain/output.cpp b/ccmain/output.cpp index 42623b9ec8..ddfcfc54b6 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated? /************************************************************************* * SUSPECT LEVELS * - * 0 - dont reject ANYTHING + * 0 - don't reject ANYTHING * 1,2 - partial rejection * 3 - BEST * @@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { rating_per_ch = word.rating() / word_res->reject_map.length(); if (rating_per_ch >= suspect_rating_per_ch) - return; //Dont touch bad ratings + return; //Don't touch bad ratings if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ diff --git a/ccmain/paramsd.cpp b/ccmain/paramsd.cpp index b141bede62..7784f85361 100644 --- a/ccmain/paramsd.cpp +++ b/ccmain/paramsd.cpp @@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename, fclose(fp); sprintf (msg_str, "Overwrite file " "%s" "? 
(Y/N)", filename); int a = sv_window_->ShowYesNoDialog(msg_str); - if (a == 'n') { return; } // dont write + if (a == 'n') { return; } // don't write } fp = fopen (filename, "wb"); // can we write to it? if (fp == NULL) { - sv_window_->AddMessage("Cant write to file " "%s" "", filename); + sv_window_->AddMessage("Can't write to file " "%s" "", filename); return; } diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index 607b84179c..aacc80dd6e 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word, /************************************************************************* * dont_allow_1Il() - * Dont unreject LONE accepted 1Il conflict set chars + * Don't unreject LONE accepted 1Il conflict set chars *************************************************************************/ void Tesseract::dont_allow_1Il(WERD_RES *word) { int i = 0; @@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) { next_left = 9999; else next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left(); - // Dont touch small or touching blobs - it is too dangerous. + // Don't touch small or touching blobs - it is too dangerous. 
if ((out_box.width() > 8 * word_res->denorm.x_scale()) && (out_box.left() > prev_right) && (out_box.right() < next_left)) { aspect_ratio = out_box.width() / (float) out_box.height(); diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 0c52f0efd9..e348c93f98 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -136,7 +136,7 @@ Tesseract::Tesseract() BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces", this->params()), BOOL_MEMBER(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility", this->params()), + "Don't bother with word plausibility", this->params()), BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", this->params()), BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", @@ -310,19 +310,19 @@ Tesseract::Tesseract() this->params()), INT_MEMBER(crunch_pot_indicators, 1, "How many potential indicators needed", this->params()), - BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", + BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings", this->params()), BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", this->params()), BOOL_MEMBER(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings", this->params()), + "Don't pot crunch sensible strings", this->params()), BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", this->params()), INT_MEMBER(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings", + "Don't crunch words with long lower case strings", this->params()), INT_MEMBER(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings", + "Don't crunch words with long upper case strings", this->params()), INT_MEMBER(crunch_long_repetitions, 3, "Crunch words with long repetitions", this->params()), @@ -393,21 +393,21 @@ Tesseract::Tesseract() INT_MEMBER(suspect_space_level, 100, "Min suspect level for rejecting spaces",
this->params()), INT_MEMBER(suspect_short_words, 2, - "Dont Suspect dict wds longer than this", this->params()), + "Don't suspect dict wds longer than this", this->params()), BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", + double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", this->params()), double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params()), BOOL_MEMBER(tessedit_minimal_rejection, false, "Only reject tess failures", this->params()), - BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", + BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING", this->params()), BOOL_MEMBER(tessedit_word_for_word, false, "Make output have exactly one word per WERD", this->params()), BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL", this->params()), + "Don't reject ANYTHING AT ALL", this->params()), BOOL_MEMBER(tessedit_consistent_reps, true, "Force all rep chars the same", this->params()), INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", @@ -424,7 +424,7 @@ Tesseract::Tesseract() "Use DOC dawg in 11l conf. detector", this->params()), BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", this->params()), - BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", + BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check", this->params()), BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", this->params()), diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 50141bf942..6666dec36b 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -733,7 +733,7 @@ class Tesseract : public Wordrec { GenericVector* class_ids); // Resegments the word to achieve the target_text from the classifier. // Returns false if the re-segmentation fails. 
- // Uses brute-force combination of upto kMaxGroupSize adjacent blobs, and + // Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and // applies a full search on the classifier results to find the best classified // segmentation. As a compromise to obtain better recall, 1-1 ambigiguity // substitutions ARE used. @@ -833,7 +833,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_fix_fuzzy_spaces, true, "Try to improve fuzzy spaces"); BOOL_VAR_H(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility"); + "Don't bother with word plausibility"); BOOL_VAR_H(tessedit_fix_hyphens, true, "Crunch double hyphens?"); BOOL_VAR_H(tessedit_redo_xheight, true, "Check/Correct x-height"); BOOL_VAR_H(tessedit_enable_doc_dict, true, @@ -954,15 +954,15 @@ class Tesseract : public Wordrec { double_VAR_H(crunch_small_outlines_size, 0.6, "Small if lt xht x this"); INT_VAR_H(crunch_rating_max, 10, "For adj length in rating per ch"); INT_VAR_H(crunch_pot_indicators, 1, "How many potential indicators needed"); - BOOL_VAR_H(crunch_leave_ok_strings, true, "Dont touch sensible strings"); + BOOL_VAR_H(crunch_leave_ok_strings, true, "Don't touch sensible strings"); BOOL_VAR_H(crunch_accept_ok, true, "Use acceptability in okstring"); BOOL_VAR_H(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings"); + "Don't pot crunch sensible strings"); BOOL_VAR_H(crunch_include_numerals, false, "Fiddle alpha figures"); INT_VAR_H(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings"); + "Don't crunch words with long lower case strings"); INT_VAR_H(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings"); + "Don't crunch words with long upper case strings"); INT_VAR_H(crunch_long_repetitions, 3, "Crunch words with long repetitions"); INT_VAR_H(crunch_debug, 0, "As it says"); INT_VAR_H(fixsp_non_noise_limit, 1, @@ -1010,16 +1010,16 @@ class Tesseract : public Wordrec { INT_VAR_H(suspect_space_level, 100,
"Min suspect level for rejecting spaces"); INT_VAR_H(suspect_short_words, 2, - "Dont Suspect dict wds longer than this"); + "Don't suspect dict wds longer than this"); BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); - double_VAR_H(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit"); + double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit"); double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); BOOL_VAR_H(tessedit_minimal_rejection, false, "Only reject tess failures"); - BOOL_VAR_H(tessedit_zero_rejection, false, "Dont reject ANYTHING"); + BOOL_VAR_H(tessedit_zero_rejection, false, "Don't reject ANYTHING"); BOOL_VAR_H(tessedit_word_for_word, false, "Make output have exactly one word per WERD"); BOOL_VAR_H(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL"); + "Don't reject ANYTHING AT ALL"); BOOL_VAR_H(tessedit_consistent_reps, true, "Force all rep chars the same"); INT_VAR_H(tessedit_reject_mode, 0, "Rejection algorithm"); BOOL_VAR_H(tessedit_rejection_debug, false, "Adaption debug"); @@ -1030,7 +1030,7 @@ class Tesseract : public Wordrec { "Aspect ratio dot/hyphen test"); BOOL_VAR_H(rej_trust_doc_dawg, false, "Use DOC dawg in 11l conf. detector"); BOOL_VAR_H(rej_1Il_use_dict_word, false, "Use dictword test"); - BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Dont double check"); + BOOL_VAR_H(rej_1Il_trust_permuter_type, true, "Don't double check"); BOOL_VAR_H(rej_use_tess_accepted, true, "Individual rejection control"); BOOL_VAR_H(rej_use_tess_blanks, true, "Individual rejection control"); BOOL_VAR_H(rej_use_good_perm, true, "Individual rejection control"); diff --git a/ccstruct/blobbox.cpp b/ccstruct/blobbox.cpp index 322171f0dc..280096b5d3 100644 --- a/ccstruct/blobbox.cpp +++ b/ccstruct/blobbox.cpp @@ -33,7 +33,7 @@ ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK) -// Upto 30 degrees is allowed for rotations of diacritic blobs.
+// Up to 30 degrees is allowed for rotations of diacritic blobs. const double kCosSmallAngle = 0.866; // Min aspect ratio for a joined word to indicate an obvious flow direction. const double kDefiniteAspectRatio = 2.0; diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp index 947fcc02fe..f4aedca5b3 100644 --- a/ccstruct/boxread.cpp +++ b/ccstruct/boxread.cpp @@ -35,7 +35,7 @@ FILE* OpenBoxFile(const STRING& fname) { FILE* box_file = NULL; if (!(box_file = fopen(filename.string(), "rb"))) { CANTOPENFILE.error("read_next_box", TESSEXIT, - "Cant open box file %s", + "Can't open box file %s", filename.string()); } return box_file; diff --git a/ccstruct/normalis.cpp b/ccstruct/normalis.cpp index d43a1459cb..ddf6dbf3b1 100644 --- a/ccstruct/normalis.cpp +++ b/ccstruct/normalis.cpp @@ -382,7 +382,7 @@ void DENORM::LocalDenormTransform(const FCOORD& pt, FCOORD* original) const { } // Transforms the given coords all the way back to source image space using -// the full transformation sequence defined by this and its predecesors +// the full transformation sequence defined by this and its predecessors // recursively, shallowest first, and finally any block re_rotation. // If last_denorm is not NULL, then the last transformation used will // be last_denorm, and the block re_rotation will never be executed. diff --git a/ccstruct/normalis.h b/ccstruct/normalis.h index c8ce7cd28b..2d75412078 100644 --- a/ccstruct/normalis.h +++ b/ccstruct/normalis.h @@ -218,7 +218,7 @@ class DENORM { void LocalDenormTransform(const TPOINT& pt, TPOINT* original) const; void LocalDenormTransform(const FCOORD& pt, FCOORD* original) const; // Transforms the given coords all the way back to source image space using - // the full transformation sequence defined by this and its predecesors + // the full transformation sequence defined by this and its predecessors // recursively, shallowest first, and finally any block re_rotation. 
// If last_denorm is not NULL, then the last transformation used will // be last_denorm, and the block re_rotation will never be executed. diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index 0dd0bf2ef8..b64eff36d0 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -108,7 +108,7 @@ class PDBLK PDBLK & operator= (const PDBLK & source); protected: - POLY_BLOCK *hand_poly; //< wierd as well + POLY_BLOCK *hand_poly; //< weird as well ICOORDELT_LIST leftside; //< left side vertices ICOORDELT_LIST rightside; //< right side vertices TBOX box; //< bounding box diff --git a/ccstruct/rejctmap.h b/ccstruct/rejctmap.h index 4b27bab49b..d945dda1fa 100644 --- a/ccstruct/rejctmap.h +++ b/ccstruct/rejctmap.h @@ -16,7 +16,7 @@ ** limitations under the License. * -This module may look unneccessarily verbose, but here's the philosophy... +This module may look unnecessarily verbose, but here's the philosophy... ALL processing of the reject map is done in this module. There are lots of separate calls to set reject/accept flags. These have DELIBERATELY been kept @@ -51,7 +51,7 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! 
enum REJ_FLAGS { /* Reject modes which are NEVER overridden */ - R_TESS_FAILURE, // PERM Tess didnt classify + R_TESS_FAILURE, // PERM Tess didn't classify R_SMALL_XHT, // PERM Xht too small R_EDGE_CHAR, // PERM Too close to edge of image R_1IL_CONFLICT, // PERM 1Il confusion @@ -62,7 +62,7 @@ enum REJ_FLAGS /* Initial reject modes (pre NN_ACCEPT) */ R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) - R_NOT_TESS_ACCEPTED, // TEMP Tess didnt accept WERD + R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD @@ -82,7 +82,7 @@ enum REJ_FLAGS R_ROW_REJ, // TEMP Row rejection R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space - /* Accept modes which occur inbetween the above rejection groups */ + /* Accept modes which occur between the above rejection groups */ R_NN_ACCEPT, //NN acceptance R_HYPHEN_ACCEPT, //Hyphen acceptance R_MM_ACCEPT, //Matrix match acceptance diff --git a/ccstruct/statistc.cpp b/ccstruct/statistc.cpp index 63676c2fca..39d5edd180 100644 --- a/ccstruct/statistc.cpp +++ b/ccstruct/statistc.cpp @@ -204,7 +204,7 @@ double STATS::ile(double frac) const { /********************************************************************** * STATS::min_bucket * - * Find REAL minimum bucket - ile(0.0) isnt necessarily correct + * Find REAL minimum bucket - ile(0.0) isn't necessarily correct **********************************************************************/ inT32 STATS::min_bucket() const { // Find min if (buckets_ == NULL || total_count_ == 0) { @@ -219,7 +219,7 @@ inT32 STATS::min_bucket() const { // Find min /********************************************************************** * STATS::max_bucket * - * Find REAL maximum bucket - ile(1.0) isnt necessarily correct + * Find REAL maximum bucket - ile(1.0) isn't necessarily correct **********************************************************************/ inT32 STATS::max_bucket() const { // Find 
max @@ -249,7 +249,7 @@ double STATS::median() const { //get median if ((total_count_ > 1) && (pile_count(median_pile) == 0)) { inT32 min_pile; inT32 max_pile; - /* Find preceeding non zero pile */ + /* Find preceding non zero pile */ for (min_pile = median_pile; pile_count(min_pile) == 0; min_pile--); /* Find following non zero pile */ for (max_pile = median_pile; pile_count(max_pile) == 0; max_pile++); diff --git a/ccstruct/vecfuncs.cpp b/ccstruct/vecfuncs.cpp index 8357c9aabe..bafca55d60 100644 --- a/ccstruct/vecfuncs.cpp +++ b/ccstruct/vecfuncs.cpp @@ -23,7 +23,7 @@ * ******************************************************************************** * Revision 5.1 89/07/27 11:47:50 11:47:50 ray () - * Added ratings acces methods. + * Added ratings access methods. * This version ready for independent development. */ /*---------------------------------------------------------------------- diff --git a/ccutil/clst.cpp b/ccutil/clst.cpp index 60f88d3706..e71cc20100 100644 --- a/ccutil/clst.cpp +++ b/ccutil/clst.cpp @@ -190,7 +190,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_data to // keep the list sorted according to the same comparison function. -// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique, then don't add duplicate entries. 
@@ -513,7 +513,7 @@ CLIST_LINK *CLIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("CLIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/clst.h b/ccutil/clst.h index 89c4369949..a209ac11cc 100644 --- a/ccutil/clst.h +++ b/ccutil/clst.h @@ -51,11 +51,11 @@ class DLLSYM CLIST_LINK } CLIST_LINK( //copy constructor - const CLIST_LINK &) { //dont copy link + const CLIST_LINK &) { //don't copy link data = next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const CLIST_LINK &) { data = next = NULL; } @@ -89,7 +89,7 @@ class DLLSYM CLIST void internal_deep_clear ( //destroy all links void (*zapper) (void *)); //ptr to zapper functn - void shallow_clear(); //clear list but dont + void shallow_clear(); //clear list but don't //delete data elements bool empty() const { //is list empty? @@ -117,7 +117,7 @@ class DLLSYM CLIST // Assuming list has been sorted already, insert new_data to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique, then don't add duplicate entries. @@ -232,7 +232,7 @@ class DLLSYM CLIST_ITERATOR BOOL8 cycled_list(); //Completed a cycle? 
void add_to_end( //add at end & - void *new_data); //dont move + void *new_data); //don't move void exchange( //positions of 2 links CLIST_ITERATOR *other_it); //other iterator @@ -437,7 +437,7 @@ inline void CLIST_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * CLIST_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element. **********************************************************************/ @@ -485,7 +485,7 @@ inline void CLIST_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * CLIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -836,7 +836,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class to be used in a CONS list -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. The macro generates: @@ -885,7 +885,7 @@ public: \ CLASSNAME##_CLIST():CLIST() {} \ /* constructor */ \ \ - CLASSNAME##_CLIST( /* dont construct */ \ + CLASSNAME##_CLIST( /* don't construct */ \ const CLASSNAME##_CLIST&) /*by initial assign*/ \ { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_CLIST ), \ ABORT, NULL ); } \ @@ -963,7 +963,7 @@ CLISTIZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. 
This is passed to the \ * generic deep_clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. \ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ diff --git a/ccutil/elst.cpp b/ccutil/elst.cpp index 7762220d6e..67a8ab0cbe 100644 --- a/ccutil/elst.cpp +++ b/ccutil/elst.cpp @@ -117,7 +117,7 @@ inT32 ELIST::length() const { // count elements * ELIST::sort * * Sort elements on list - * NB If you dont like the const declarations in the comparator, coerce yours: + * NB If you don't like the const declarations in the comparator, coerce yours: * ( int (*)(const void *, const void *) **********************************************************************/ @@ -161,7 +161,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. -// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique is set to true and comparator() returns 0 (an entry with the @@ -455,7 +455,7 @@ ELIST_LINK *ELIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("ELIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst.h b/ccutil/elst.h index dcc1552d16..492c03acb3 100644 --- a/ccutil/elst.h +++ b/ccutil/elst.h @@ -67,7 +67,7 @@ The implementation of lists is very careful about space and speed overheads. 
This is why many embedded lists are provided. The same concerns mean that in-line type coercion is done, rather than use virtual functions. This is cumbersome in that each data type to be listed requires its own iterator and -list class - though macros can gererate these. It also prevents heterogenous +list class - though macros can generate these. It also prevents heterogeneous lists. **********************************************************************/ @@ -98,7 +98,7 @@ class DLLSYM ELIST_LINK next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const ELIST_LINK &) { next = NULL; } @@ -158,7 +158,7 @@ class DLLSYM ELIST // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. // If unique is set to true and comparator() returns 0 (an entry with the @@ -274,7 +274,7 @@ class DLLSYM ELIST_ITERATOR bool cycled_list(); //Completed a cycle? void add_to_end( //add at end & - ELIST_LINK *new_link); //dont move + ELIST_LINK *new_link); //don't move void exchange( //positions of 2 links ELIST_ITERATOR *other_it); //other iterator @@ -470,7 +470,7 @@ inline void ELIST_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * ELIST_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element.
**********************************************************************/ @@ -515,7 +515,7 @@ inline void ELIST_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * ELIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -868,7 +868,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class which has a baseclass of ELIST_LINK. -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. The macros generate: @@ -999,7 +999,7 @@ ELISTIZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. This is passed to the \ * generic clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. 
\ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_zapper(ELIST_LINK* link) { \ diff --git a/ccutil/elst2.cpp b/ccutil/elst2.cpp index 7055686fb5..fe5b77e256 100644 --- a/ccutil/elst2.cpp +++ b/ccutil/elst2.cpp @@ -118,7 +118,7 @@ inT32 ELIST2::length() const { // count elements * ELIST2::sort * * Sort elements on list - * NB If you dont like the const declarations in the comparator, coerce yours: + * NB If you don't like the const declarations in the comparator, coerce yours: * ( int (*)(const void *, const void *) **********************************************************************/ @@ -162,7 +162,7 @@ const void *, const void *)) { // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. -// Comparision function is the same as used by sort, i.e. uses double +// Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. 
void ELIST2::add_sorted(int comparator(const void*, const void*), @@ -475,7 +475,7 @@ ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( //fr temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //cant find end pt + if (temp_it.cycled_list ()) //can't find end pt BAD_SUBLIST.error ("ELIST2_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst2.h b/ccutil/elst2.h index 7201750dcb..f7ea6ed07c 100644 --- a/ccutil/elst2.h +++ b/ccutil/elst2.h @@ -69,11 +69,11 @@ class DLLSYM ELIST2_LINK } ELIST2_LINK( //copy constructor - const ELIST2_LINK &) { //dont copy link + const ELIST2_LINK &) { //don't copy link prev = next = NULL; } - void operator= ( //dont copy links + void operator= ( //don't copy links const ELIST2_LINK &) { prev = next = NULL; } @@ -133,7 +133,7 @@ class DLLSYM ELIST2 // Assuming list has been sorted already, insert new_link to // keep the list sorted according to the same comparison function. - // Comparision function is the same as used by sort, i.e. uses double + // Comparison function is the same as used by sort, i.e. uses double // indirection. Time is O(1) to add to beginning or end. // Time is linear to add pre-sorted items to an empty list. void add_sorted(int comparator(const void*, const void*), @@ -241,7 +241,7 @@ class DLLSYM ELIST2_ITERATOR BOOL8 cycled_list(); //Completed a cycle? void add_to_end( //add at end & - ELIST2_LINK *new_link); //dont move + ELIST2_LINK *new_link); //don't move void exchange( //positions of 2 links ELIST2_ITERATOR *other_it); //other iterator @@ -450,7 +450,7 @@ inline void ELIST2_ITERATOR::add_before_then_move( // element to add /*********************************************************************** * ELIST2_ITERATOR::add_before_stay_put * - * Add a new element to the list before the current element but dont move the + * Add a new element to the list before the current element but don't move the * iterator to the new element. 
**********************************************************************/ @@ -500,7 +500,7 @@ inline void ELIST2_ITERATOR::add_before_stay_put( // element to add /*********************************************************************** * ELIST2_ITERATOR::add_list_after * - * Insert another list to this list after the current element but dont move the + * Insert another list to this list after the current element but don't move the * iterator. **********************************************************************/ @@ -883,7 +883,7 @@ Replace with "". may be an arbitrary number of tokens CLASSNAME is assumed to be the name of a class which has a baseclass of ELIST2_LINK. -NOTE: Because we dont use virtual functions in the list code, the list code +NOTE: Because we don't use virtual functions in the list code, the list code will NOT work correctly for classes derived from this. The macro generates: @@ -927,7 +927,7 @@ public: \ CLASSNAME##_LIST():ELIST2() {} \ /* constructor */ \ \ - CLASSNAME##_LIST( /* dont construct */ \ + CLASSNAME##_LIST( /* don't construct */ \ const CLASSNAME##_LIST&) /*by initial assign*/\ { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_LIST ), \ ABORT, NULL ); } \ @@ -1015,7 +1015,7 @@ ELIST2IZEH_C( CLASSNAME ) * A function which can delete a CLASSNAME element. This is passed to the \ * generic clear list member function so that when a list is cleared the \ * elements on the list are properly destroyed from the base class, even \ -* though we dont use a virtual destructor function. \ +* though we don't use a virtual destructor function. 
\ **********************************************************************/ \ \ DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ diff --git a/ccutil/errcode.h b/ccutil/errcode.h index 89385d2b93..69d4187a37 100644 --- a/ccutil/errcode.h +++ b/ccutil/errcode.h @@ -53,7 +53,7 @@ enum TessErrorLogCode { #define LOC_DOC_BLK_REJ 22 #define LOC_WRITE_RESULTS 23 #define LOC_ADAPTIVE 24 -/* DONT DEFINE ANY LOCATION > 31 !!! */ +/* DON'T DEFINE ANY LOCATION > 31 !!! */ /* Sub locatation determines whether pass2 was in normal mode or fix xht mode*/ #define SUBLOC_NORM 0 diff --git a/ccutil/genericvector.h b/ccutil/genericvector.h index 8433966bf9..a0ca9e2926 100644 --- a/ccutil/genericvector.h +++ b/ccutil/genericvector.h @@ -949,7 +949,7 @@ bool GenericVector::SerializeClasses(tesseract::TFile* fp) const { // Reads a vector of classes from the given file. Assumes the existence of // bool T::Deserialize(bool swap, FILE* fp) that returns false in case of -// error. Alse needs T::T() and T::T(constT&), as init_to_size is used in +// error. Also needs T::T() and T::T(constT&), as init_to_size is used in // this function. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. template diff --git a/ccutil/helpers.h b/ccutil/helpers.h index 480929c955..022a2c3066 100644 --- a/ccutil/helpers.h +++ b/ccutil/helpers.h @@ -61,8 +61,8 @@ class TRand { private: // Steps the generator to the next value. void Iterate() { - seed_ *= 6364136223846793005; - seed_ += 1442695040888963407; + seed_ *= 6364136223846793005ULL; + seed_ += 1442695040888963407ULL; } // The current value of the seed. 
diff --git a/ccutil/lsterr.h b/ccutil/lsterr.h index 6bcd7fead1..42ed07e326 100644 --- a/ccutil/lsterr.h +++ b/ccutil/lsterr.h @@ -38,6 +38,6 @@ const ERRCODE NULL_PREV = "Previous element on the list is NULL"; const ERRCODE EMPTY_LIST = "List is empty"; const ERRCODE BAD_PARAMETER = "List parameter error"; const ERRCODE STILL_LINKED = -"Attemting to add an element with non NULL links, to a list"; +"Attempting to add an element with non NULL links, to a list"; #endif #endif diff --git a/ccutil/ocrclass.h b/ccutil/ocrclass.h index 37556b30b2..9be184d591 100644 --- a/ccutil/ocrclass.h +++ b/ccutil/ocrclass.h @@ -21,7 +21,7 @@ * the HP OCR interface. * The code is designed to be used with either a C or C++ compiler. * The structures are designed to allow them to be used with any - * structure alignment upto 8. + * structure alignment up to 8. **********************************************************************/ #ifndef CCUTIL_OCRCLASS_H_ diff --git a/ccutil/strngs.cpp b/ccutil/strngs.cpp index 1c9769978a..b44c541246 100644 --- a/ccutil/strngs.cpp +++ b/ccutil/strngs.cpp @@ -45,7 +45,7 @@ const int kMaxDoubleSize = 15; * * The collection of MACROS provide different implementations depending * on whether the string keeps track of its strlen or not so that this - * feature can be added in later when consumers dont modifify the string + * feature can be added in later when consumers don't modify the string **********************************************************************/ // Smallest string to allocate by default @@ -339,7 +339,7 @@ STRING& STRING::operator=(const STRING& str) { const STRING_HEADER* str_header = str.GetHeader(); int str_used = str_header->used_; - GetHeader()->used_ = 0; // clear since ensure doesnt need to copy data + GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data char* this_cstr = ensure_cstr(str_used); STRING_HEADER* this_header = GetHeader(); @@ -398,7 +398,7 @@ STRING & STRING::operator=(const char* cstr) { if (cstr) { 
int len = strlen(cstr) + 1; - this_header->used_ = 0; // dont bother copying data if need to realloc + this_header->used_ = 0; // don't bother copying data if need to realloc char* this_cstr = ensure_cstr(len); this_header = GetHeader(); // for realloc memcpy(this_cstr, cstr, len); @@ -416,7 +416,7 @@ STRING & STRING::operator=(const char* cstr) { void STRING::assign(const char *cstr, int len) { STRING_HEADER* this_header = GetHeader(); - this_header->used_ = 0; // dont bother copying data if need to realloc + this_header->used_ = 0; // don't bother copying data if need to realloc char* this_cstr = ensure_cstr(len + 1); // +1 for '\0' this_header = GetHeader(); // for realloc diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 032d5fee61..23d029bb42 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -51,7 +51,7 @@ bool TessdataManager::Init(const char *data_file_name, int debug_level) { sizeof(actual_tessdata_num_entries_)); } if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { - // For forward compatability, truncate to the number we can handle. + // For forward compatibility, truncate to the number we can handle. actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; } fread(offset_table_, sizeof(inT64), diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index de3e599025..fd2685a1d8 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -282,7 +282,7 @@ class TessdataManager { * same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger, * since then it would be impossible to interpret the type of tessdata at * indices same and higher than TESSDATA_NUM_ENTRIES. - * This parameter is used to allow for backward compatiblity + * This parameter is used to allow for backward compatibility * when new tessdata types are introduced. 
*/ inT32 actual_tessdata_num_entries_; diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 3a6ef1c498..b89f1cb7ae 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -515,7 +515,7 @@ void Classify::EndAdaptiveClassifier() { * load_pre_trained_templates Indicates whether the pre-trained * templates (inttemp, normproto and pffmtable components) * should be lodaded. Should only be set to true if the - * necesary classifier components are present in the + * necessary classifier components are present in the * [lang].traineddata file. * Globals: * BuiltInTemplatesFile file to get built-in temps from @@ -1720,7 +1720,7 @@ bool Classify::LooksLikeGarbage(TBLOB *blob) { * * Globals: * - * @return Number of features extracted or 0 if an error occured. + * @return Number of features extracted or 0 if an error occurred. * @note Exceptions: none * @note History: Tue May 28 10:40:52 1991, DSJ, Created. */ @@ -2082,7 +2082,7 @@ void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) { /*---------------------------------------------------------------------------*/ /** - * This routine steps thru each matching class in Results + * This routine steps through each matching class in Results * and removes it from the match list if its rating * is worse than the BestRating plus a pad. 
In other words, * all good matches get moved to the front of the classes diff --git a/classify/classify.cpp b/classify/classify.cpp index c68fc27643..436efd1f2d 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -151,7 +151,7 @@ Classify::Classify() INT_MEMBER(classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: ", this->params()), EnableLearning(true), - INT_MEMBER(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word", + INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word", this->params()), BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()), diff --git a/classify/classify.h b/classify/classify.h index e952394630..0de8441527 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -495,7 +495,7 @@ class Classify : public CCStruct { // font combinations that the shape represents. UnicityTable fontset_table_; - INT_VAR_H(il1_adaption_test, 0, "Dont adapt to i/I at beginning of word"); + INT_VAR_H(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word"); BOOL_VAR_H(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9]."); double_VAR_H(speckle_large_max_size, 0.30, "Max large speckle size"); diff --git a/classify/cluster.cpp b/classify/cluster.cpp index ef46f77c21..b723bfa82e 100644 --- a/classify/cluster.cpp +++ b/classify/cluster.cpp @@ -182,7 +182,7 @@ struct BUCKETS { FLOAT64 ChiSquared; // test threshold uinT16 NumberOfBuckets; // number of cells in histogram uinT16 Bucket[BUCKETTABLESIZE];// mapping to histogram buckets - uinT32 *Count; // frequency of occurence histogram + uinT32 *Count; // frequency of occurrence histogram FLOAT32 *ExpectedCount; // expected histogram }; diff --git a/classify/clusttool.h b/classify/clusttool.h index a4f3b8351d..e82fa1ef48 100644 --- a/classify/clusttool.h +++ b/classify/clusttool.h @@ -24,7 +24,7 @@ #include /*------------------------------------------------------------------------- - Public 
Funtion Prototype + Public Function Prototype --------------------------------------------------------------------------*/ uinT16 ReadSampleSize(FILE *File); diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index cf9e551509..ad7b799675 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -285,7 +285,7 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, /*---------------------------------------------------------------------------*/ /** - * Search thru all features currently defined and return + * Search through all features currently defined and return * the feature type for the feature with the specified short * name. Trap an error if the specified name is not found. * diff --git a/classify/intfx.cpp b/classify/intfx.cpp index 12966aa195..78aa59bbc9 100644 --- a/classify/intfx.cpp +++ b/classify/intfx.cpp @@ -44,7 +44,7 @@ using tesseract::TrainingSample; // The entries are in binary degrees where a full circle is 256 binary degrees. static float cos_table[INT_CHAR_NORM_RANGE]; static float sin_table[INT_CHAR_NORM_RANGE]; -// Guards write access to AtanTable so we dont create it more than once. +// Guards write access to AtanTable so we don't create it more than once. tesseract::CCUtilMutex atan_table_mutex; diff --git a/classify/kdtree.cpp b/classify/kdtree.cpp index 8d05149cc1..61a94f66cc 100644 --- a/classify/kdtree.cpp +++ b/classify/kdtree.cpp @@ -521,7 +521,7 @@ bool KDTreeSearch::BoxIntersectsSearch(FLOAT32 *lower, FLOAT32 *upper) { * Walk a tree, calling action once on each node. 
* * Operation: - * This routine walks thru the specified sub_tree and invokes action + * This routine walks through the specified sub_tree and invokes action * action at each node as follows: * action(context, data, level) * data the data contents of the node being visited, diff --git a/classify/mfoutline.cpp b/classify/mfoutline.cpp index 7f1b04ad44..511c34d41f 100644 --- a/classify/mfoutline.cpp +++ b/classify/mfoutline.cpp @@ -104,7 +104,7 @@ LIST ConvertOutlines(TESSLINE *outline, /*---------------------------------------------------------------------------*/ /** - * This routine searches thru the specified outline, computes + * This routine searches through the specified outline, computes * a slope for each vector in the outline, and marks each * vector as having one of the following directions: * N, S, E, W, NE, NW, SE, SW @@ -182,7 +182,7 @@ void FreeOutlines(LIST Outlines) { /*---------------------------------------------------------------------------*/ /** - * This routine searches thru the specified outline and finds + * This routine searches through the specified outline and finds * the points at which the outline changes direction. These * points are then marked as "extremities". This routine is * used as an alternative to FindExtremities(). It forces the diff --git a/classify/picofeat.cpp b/classify/picofeat.cpp index fea3b14121..74beb18f35 100644 --- a/classify/picofeat.cpp +++ b/classify/picofeat.cpp @@ -147,7 +147,7 @@ void ConvertSegmentToPicoFeat(FPOINT *Start, /*---------------------------------------------------------------------------*/ /** - * This routine steps thru the specified outline and cuts it + * This routine steps through the specified outline and cuts it * up into pieces of equal length. These pieces become the * desired pico-features. Each segment in the outline * is converted into an integral number of pico-features. 
diff --git a/cube/beam_search.cpp b/cube/beam_search.cpp index a89b15d8a9..fd17a1d59f 100644 --- a/cube/beam_search.cpp +++ b/cube/beam_search.cpp @@ -93,7 +93,7 @@ void BeamSearch::CreateChildren(SearchColumn *out_col, LangModel *lang_mod, } // lm_edges } -// Performs a beam seach in the specified search using the specified +// Performs a beam search in the specified search using the specified // language model; returns an alternate list of possible words as a result. WordAltList * BeamSearch::Search(SearchObject *srch_obj, LangModel *lang_mod) { // verifications diff --git a/cube/beam_search.h b/cube/beam_search.h index a39f5b1349..cd8fc0110d 100644 --- a/cube/beam_search.h +++ b/cube/beam_search.h @@ -45,7 +45,7 @@ class BeamSearch { public: explicit BeamSearch(CubeRecoContext *cntxt, bool word_mode = true); ~BeamSearch(); - // Performs a beam seach in the specified search using the specified + // Performs a beam search in the specified search using the specified // language model; returns an alternate list of possible words as a result. WordAltList *Search(SearchObject *srch_obj, LangModel *lang_mod = NULL); // Returns the best node in the last column of last performed search. diff --git a/cube/conv_net_classifier.cpp b/cube/conv_net_classifier.cpp index d6ae692e7b..ac33cd33b1 100644 --- a/cube/conv_net_classifier.cpp +++ b/cube/conv_net_classifier.cpp @@ -72,7 +72,7 @@ bool ConvNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { /** * A secondary function needed for training. Allows the trainer to set the - * value of any train-time paramter. This function is currently not + * value of any train-time parameter. This function is currently not * implemented. 
TODO(ahmadab): implement end-2-end training */ bool ConvNetCharClassifier::SetLearnParam(char *var_name, float val) { diff --git a/cube/conv_net_classifier.h b/cube/conv_net_classifier.h index e9bcd8c2cc..b9e7692c28 100644 --- a/cube/conv_net_classifier.h +++ b/cube/conv_net_classifier.h @@ -55,7 +55,7 @@ class ConvNetCharClassifier : public CharClassifier { // is currently not implemented. TODO(ahmadab): implement end-2-end training virtual bool Train(CharSamp *char_samp, int ClassID); // A secondary function needed for training. Allows the trainer to set the - // value of any train-time paramter. This function is currently not + // value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training virtual bool SetLearnParam(char *var_name, float val); // Externally sets the Neural Net used by the classifier. Used for training diff --git a/cube/cube_line_object.cpp b/cube/cube_line_object.cpp index 64b90cadff..0325453740 100644 --- a/cube/cube_line_object.cpp +++ b/cube/cube_line_object.cpp @@ -247,7 +247,7 @@ int CubeLineObject::ComputeWordBreakThreshold(int con_comp_cnt, word_break_threshold--; } while (!valid && word_break_threshold > 0); - // failed to find a threshold that acheives the target aspect ratio. + // failed to find a threshold that achieves the target aspect ratio. 
// Just use the default threshold return static_cast(line_pix_->h * cntxt_->Params()->MaxSpaceHeightRatio()); diff --git a/cube/cube_line_segmenter.cpp b/cube/cube_line_segmenter.cpp index 82f8c8ede4..278011f090 100644 --- a/cube/cube_line_segmenter.cpp +++ b/cube/cube_line_segmenter.cpp @@ -237,7 +237,7 @@ Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix, return NULL; } -// split a line continously until valid or fail +// split a line continuously until valid or fail Pixa *CubeLineSegmenter::SplitLine(Pix *line_mask_pix, Box *line_box) { // clone the line mask Pix *line_pix = pixClone(line_mask_pix); @@ -739,7 +739,7 @@ bool CubeLineSegmenter::LineSegment() { return true; } -// Estimate the paramters of the font(s) used in the page +// Estimate the parameters of the font(s) used in the page bool CubeLineSegmenter::EstimateFontParams() { int hgt_hist[kHgtBins]; int max_hgt; diff --git a/cube/cube_search_object.cpp b/cube/cube_search_object.cpp index 0cf54e31a9..61294f26b6 100644 --- a/cube/cube_search_object.cpp +++ b/cube/cube_search_object.cpp @@ -212,7 +212,7 @@ CharSamp *CubeSearchObject::CharSample(int start_pt, int end_pt) { samp->SetLastChar(last_char ? 255 : 0); } else { // for non cursive languages, these features correspond - // to whether the charsamp is at the begining or end of the word + // to whether the charsamp is at the beginning or end of the word samp->SetFirstChar((start_pt == -1) ? 255 : 0); samp->SetLastChar((end_pt == (segment_cnt_ - 1)) ? 
255 : 0); } diff --git a/cube/cube_search_object.h b/cube/cube_search_object.h index 8452417a69..0a6c3ce20b 100644 --- a/cube/cube_search_object.h +++ b/cube/cube_search_object.h @@ -114,7 +114,7 @@ class CubeSearchObject : public SearchObject { end_pt <= (start_pt + max_seg_per_char_)); } // computes the space and no space costs at gaps between segments - // return true on sucess + // return true on success bool ComputeSpaceCosts(); }; } diff --git a/cube/hybrid_neural_net_classifier.cpp b/cube/hybrid_neural_net_classifier.cpp index b5822f6f22..671a74acdf 100644 --- a/cube/hybrid_neural_net_classifier.cpp +++ b/cube/hybrid_neural_net_classifier.cpp @@ -72,7 +72,7 @@ bool HybridNeuralNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { } // A secondary function needed for training. Allows the trainer to set the -// value of any train-time paramter. This function is currently not +// value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training bool HybridNeuralNetCharClassifier::SetLearnParam(char *var_name, float val) { // TODO(ahmadab): implementation of parameter initializing. @@ -151,7 +151,7 @@ bool HybridNeuralNetCharClassifier::RunNets(CharSamp *char_samp) { return false; } - // go thru all the nets + // go through all the nets memset(net_output_, 0, class_cnt * sizeof(*net_output_)); float *inputs = net_input_; for (int net_idx = 0; net_idx < nets_.size(); net_idx++) { diff --git a/cube/hybrid_neural_net_classifier.h b/cube/hybrid_neural_net_classifier.h index 0ab9ba1235..6ad6233f43 100644 --- a/cube/hybrid_neural_net_classifier.h +++ b/cube/hybrid_neural_net_classifier.h @@ -48,7 +48,7 @@ class HybridNeuralNetCharClassifier : public CharClassifier { // is currently not implemented. TODO(ahmadab): implement end-2-end training virtual bool Train(CharSamp *char_samp, int ClassID); // A secondary function needed for training. 
Allows the trainer to set the - // value of any train-time paramter. This function is currently not + // value of any train-time parameter. This function is currently not // implemented. TODO(ahmadab): implement end-2-end training virtual bool SetLearnParam(char *var_name, float val); // Externally sets the Neural Net used by the classifier. Used for training diff --git a/cube/tess_lang_model.cpp b/cube/tess_lang_model.cpp index 8b4ff68ee4..5113207260 100644 --- a/cube/tess_lang_model.cpp +++ b/cube/tess_lang_model.cpp @@ -397,7 +397,7 @@ int TessLangModel::NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array) { return 0; } - // go thru all valid transitions from the state + // go through all valid transitions from the state int edge_cnt = 0; EDGE_REF new_edge_ref; diff --git a/cutil/listio.h b/cutil/listio.h index e758c9bcb2..7d9c19f777 100644 --- a/cutil/listio.h +++ b/cutil/listio.h @@ -37,7 +37,7 @@ #include "oldlist.h" /*---------------------------------------------------------------------------- - Public Funtion Prototypes + Public Function Prototypes --------------------------------------------------------------------------*/ LIST read_list(const char *filename); #endif diff --git a/cutil/oldlist.cpp b/cutil/oldlist.cpp index cf93ffb518..52c0d8680a 100644 --- a/cutil/oldlist.cpp +++ b/cutil/oldlist.cpp @@ -407,7 +407,7 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { * * Search list, return NIL_LIST if not found. Return the list starting from * the item if found. The compare routine "is_equal" is passed in as - * the third paramter to this routine. If the value NULL is supplied + * the third parameter to this routine. If the value NULL is supplied * for is_equal, the is_key routine will be used. 
**********************************************************************/ LIST search(LIST list, void *key, int_compare is_equal) { diff --git a/cutil/oldlist.h b/cutil/oldlist.h index 103dd72592..a0130ae061 100644 --- a/cutil/oldlist.h +++ b/cutil/oldlist.h @@ -234,7 +234,7 @@ first_node (list_rest (l)) first_node (list_rest (list_rest (l))) /*---------------------------------------------------------------------- - Public Funtion Prototypes + Public Function Prototypes ----------------------------------------------------------------------*/ int count(LIST var_list); diff --git a/dict/context.cpp b/dict/context.cpp index 206447d98f..a9acb137c3 100644 --- a/dict/context.cpp +++ b/dict/context.cpp @@ -33,7 +33,7 @@ static const int kMinAbsoluteGarbageWordLength = 10; static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; const int case_state_table[6][4] = { { - /* 0. Begining of word */ + /* 0. Beginning of word */ /* P U L D */ /* -1. Error on case */ 0, 1, 5, 4 diff --git a/dict/dawg.h b/dict/dawg.h index a487d3fd1c..b37e771503 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -447,7 +447,7 @@ class SquishedDawg : public Dawg { EDGE_REF edge = node; if (!edge_occupied(edge) || edge == NO_EDGE) return; assert(forward_edge(edge)); // we don't expect any backward edges to - do { // be present when this funciton is called + do { // be present when this function is called if (!word_end || end_of_word_from_edge_rec(edges_[edge])) { vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge)); } diff --git a/dict/dict.cpp b/dict/dict.cpp index 8df5b63bb4..e59b00d58b 100644 --- a/dict/dict.cpp +++ b/dict/dict.cpp @@ -127,7 +127,7 @@ Dict::Dict(CCUtil* ccutil) " when there is a need to explore all segmentations", getCCUtil()->params()), BOOL_MEMBER(save_raw_choices, false, - "Deprecated- backward compatablity only", + "Deprecated- backward compatibility only", getCCUtil()->params()), INT_MEMBER(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list", diff 
--git a/dict/dict.h b/dict/dict.h index 7556bc5460..938ca3a332 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -614,7 +614,7 @@ class Dict { "Make AcceptableChoice() always return false. Useful" " when there is a need to explore all segmentations"); BOOL_VAR_H(save_raw_choices, false, - "Deprecated- backward compatability only"); + "Deprecated- backward compatibility only"); INT_VAR_H(tessedit_truncate_wordchoice_log, 10, "Max words to keep in list"); STRING_VAR_H(word_to_debug, "", "Word for which stopper debug information" " should be printed to stdout"); diff --git a/dict/permdawg.cpp b/dict/permdawg.cpp index 7d60d395ff..71e2deca43 100644 --- a/dict/permdawg.cpp +++ b/dict/permdawg.cpp @@ -303,7 +303,7 @@ void Dict::append_choices( * * The given prev_char_frag_info contains: * - fragment: if not NULL contains information about immediately - * preceeding fragmented character choice + * preceding fragmented character choice * - num_fragments: number of fragments that have been used so far * to construct a character * - certainty: certainty of the current choice or minimum diff --git a/doc/Doxyfile b/doc/Doxyfile index 673defaf10..c4f496be39 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -1657,7 +1657,7 @@ EXTRA_PACKAGES = # following commands have a special meaning inside the header: $title, # $datetime, $date, $doxygenversion, $projectname, $projectnumber, # $projectbrief, $projectlogo. Doxygen will replace $title with the empy string, -# for the replacement values of the other commands the user is refered to +# for the replacement values of the other commands the user is referred to # HTML_HEADER. # This tag requires that the tag GENERATE_LATEX is set to YES. 
diff --git a/java/Makefile.am b/java/Makefile.am index 3ed962dfcc..fddbc6f9ec 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -42,18 +42,22 @@ SCROLLVIEW_LIBS = \ CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar ScrollView.jar : $(SCROLLVIEW_CLASSES) - $(JAR) cf $@ com/google/scrollview/*.class \ + $(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \ com/google/scrollview/events/*.class com/google/scrollview/ui/*.class $(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir) +fetch-jars : + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar + curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar + .PHONY: install-jars install-jars : ScrollView.jar @if [ ! -d $(scrollview_path) ]; then mkdir -p $(scrollview_path); fi; $(INSTALL) -m 644 $(SCROLLVIEW_LIBS) $(scrollview_path); $(INSTALL) -m 644 ScrollView.jar $(scrollview_path); - @echo "Don't forget to set eviroment variable SCROLLVIEW_PATH to $(scrollview_path)"; + @echo "Don't forget to set environment variable SCROLLVIEW_PATH to $(scrollview_path)"; uninstall: rm -f $(scrollview_path)/*.jar diff --git a/java/Manifest.txt b/java/Manifest.txt new file mode 100644 index 0000000000..bc0b707bd8 --- /dev/null +++ b/java/Manifest.txt @@ -0,0 +1,2 @@ +Main-Class: com/google/scrollview/ScrollView +Class-Path: ScrollView.jar piccolo2d-core-3.0.jar piccolo2d-extras-3.0.jar diff --git a/java/com/google/scrollview/ui/SVMenuBar.java b/java/com/google/scrollview/ui/SVMenuBar.java index 7c2f5d9af8..9a87524ef0 100644 --- a/java/com/google/scrollview/ui/SVMenuBar.java +++ b/java/com/google/scrollview/ui/SVMenuBar.java @@ -50,7 +50,7 @@ public SVMenuBar(SVWindow scrollView) { /** - * A click on one of the items in our menubar has 
occured. Forward it + * A click on one of the items in our menubar has occurred. Forward it * to the item itself to let it decide what happens. */ public void actionPerformed(ActionEvent e) { @@ -111,7 +111,7 @@ else if (id == -1) { * @param name The caption of the new entry. * @param id The Id of the new entry. If it is -1, the entry will be treated * as a menu. - * @param b Whether the entry is initally flagged. + * @param b Whether the entry is initially flagged. * */ diff --git a/java/com/google/scrollview/ui/SVPopupMenu.java b/java/com/google/scrollview/ui/SVPopupMenu.java index 6427c0ef85..14c8b3acd3 100644 --- a/java/com/google/scrollview/ui/SVPopupMenu.java +++ b/java/com/google/scrollview/ui/SVPopupMenu.java @@ -123,7 +123,7 @@ public void add(String parent, String name, int id, String value, String desc) { /** - * A click on one of the items in our menubar has occured. Forward it + * A click on one of the items in our menubar has occurred. Forward it * to the item itself to let it decide what happens. */ public void actionPerformed(ActionEvent e) { diff --git a/java/com/google/scrollview/ui/SVWindow.java b/java/com/google/scrollview/ui/SVWindow.java index f4960276f6..267bfdda03 100644 --- a/java/com/google/scrollview/ui/SVWindow.java +++ b/java/com/google/scrollview/ui/SVWindow.java @@ -298,7 +298,7 @@ public void addMessageBox() { ta.setEditable(false); getContentPane().add(ta, BorderLayout.SOUTH); } - // We need to make the window bigger to accomodate the message box. + // We need to make the window bigger to accommodate the message box. 
winSizeY += DEF_MESSAGEBOX_HEIGHT; setSize(winSizeX, winSizeY); } diff --git a/training/language-specific.sh b/training/language-specific.sh index bc64f67c88..23dee3e1cd 100755 --- a/training/language-specific.sh +++ b/training/language-specific.sh @@ -780,7 +780,7 @@ VERTICAL_FONTS=( \ # holds the text corpus file for the language, used in phase F # ${FONTS[@]} # holds a sequence of applicable fonts for the language, used in -# phase F & I +# phase F & I. only set if not already set, i.e. from command line # ${TRAINING_DATA_ARGUMENTS} # non-default arguments to the training_data program used in phase T # ${FILTER_ARGUMENTS} - @@ -794,7 +794,6 @@ set_lang_specific_parameters() { local lang=$1 # The default text location is now given directly from the language code. TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt" - FONTS=( "${LATIN_FONTS[@]}" ) FILTER_ARGUMENTS="" WORDLIST2DAWG_ARGUMENTS="" # These dawg factors represent the fraction of the corpus not covered by the @@ -816,30 +815,30 @@ set_lang_specific_parameters() { case ${lang} in # Latin languages. enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt" # Make long-s substitutions for Middle French text FILTER_ARGUMENTS="--make_early_language_variant=fra" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt" - FONTS=( "${FRAKTUR_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${FRAKTUR_FONTS[@]}" );; ita_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt" # Make long-s substitutions for Early Italian text FILTER_ARGUMENTS="--make_early_language_variant=ita" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. 
- FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; spa_old ) TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt" # Make long-s substitutions for Early Spanish text FILTER_ARGUMENTS="--make_early_language_variant=spa" TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; srp_latn ) TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;; vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; # Highly inflective languages get a bigger dawg size. # TODO(rays) Add more here! hun ) WORD_DAWG_SIZE=1000000 ;; @@ -899,14 +898,14 @@ set_lang_specific_parameters() { # Strip unrenderable words as not all fonts will render the extended # latin symbols found in Vietnamese text. WORD_DAWG_SIZE=1000000 - FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; # Cyrillic script-based languages. - rus ) FONTS=( "${RUSSIAN_FONTS[@]}" ) + rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_SIZE=1000000 ;; aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) - FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; # Special code for performing Cyrillic language-id that is trained on # Russian, Serbian, Ukranian, Belarusian, Macedonian, Tajik and Mongolian @@ -916,70 +915,70 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" GENERATE_WORD_BIGRAMS=0 WORD_DAWG_SIZE=1000000 - FONTS=( "${RUSSIAN_FONTS[@]}" );; + test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" );; # South Asian scripts mostly have a lot of different graphemes, so trim # down the MEAN_COUNT so as not to get a huge amount of text. 
asm | ben ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${BENGALI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BENGALI_FONTS[@]}" ) ;; bih | hin | mar | nep | san ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; bod ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; dzo ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TIBETAN_FONTS[@]}" ) ;; guj ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 - FONTS=( "${GUJARATI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GUJARATI_FONTS[@]}" ) ;; kan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${KANNADA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KANNADA_FONTS[@]}" ) ;; mal ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; ori ) WORD_DAWG_FACTOR=0.01 - FONTS=( "${ORIYA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${ORIYA_FONTS[@]}" ) ;; pan ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${PUNJABI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PUNJABI_FONTS[@]}" ) ;; sin ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.01 - FONTS=( "${SINHALA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${SINHALA_FONTS[@]}" ) ;; tam ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TAMIL_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${TAMIL_FONTS[@]}" ) ;; tel ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" - FONTS=( "${TELUGU_FONTS[@]}" ) ;; + test -z "$FONTS" 
&& FONTS=( "${TELUGU_FONTS[@]}" ) ;; # SouthEast Asian scripts. khm ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${KHMER_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KHMER_FONTS[@]}" ) ;; lao ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; mya ) MEAN_COUNT="12" WORD_DAWG_FACTOR=0.15 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" - FONTS=( "${BURMESE_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${BURMESE_FONTS[@]}" ) ;; tha ) MEAN_COUNT="30" WORD_DAWG_FACTOR=0.01 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" @@ -987,7 +986,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" AMBIGS_FILTER_DENOMINATOR="1000" LEADING=48 - FONTS=( "${THAI_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${THAI_FONTS[@]}" ) ;; # CJK chi_sim ) @@ -998,7 +997,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim" - FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; chi_tra ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 @@ -1006,14 +1005,14 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra" - FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; jpn ) MEAN_COUNT="15" WORD_DAWG_FACTOR=0.015 GENERATE_WORD_BIGRAMS=0 TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn" - FONTS=( 
"${JPN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${JPN_FONTS[@]}" ) ;; kor ) MEAN_COUNT="20" WORD_DAWG_FACTOR=0.015 NUMBER_DAWG_FACTOR=0.05 @@ -1021,38 +1020,38 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS+=" --desired_bigrams=" GENERATE_WORD_BIGRAMS=0 FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor" - FONTS=( "${KOREAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${KOREAN_FONTS[@]}" ) ;; # Middle-Eastern scripts. - ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;; - div ) FONTS=( "${THAANA_FONTS[@]}" ) ;; + ara ) test -z "$FONTS" && FONTS=( "${ARABIC_FONTS[@]}" ) ;; + div ) test -z "$FONTS" && FONTS=( "${THAANA_FONTS[@]}" ) ;; fas | pus | snd | uig | urd ) - FONTS=( "${PERSIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${PERSIAN_FONTS[@]}" ) ;; heb | yid ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${HEBREW_FONTS[@]}" ) ;; - syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${HEBREW_FONTS[@]}" ) ;; + syr ) test -z "$FONTS" && FONTS=( "${SYRIAC_FONTS[@]}" ) ;; # Other scripts. 
amh | tir) - FONTS=( "${AMHARIC_FONTS[@]}" ) ;; - chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ + test -z "$FONTS" && FONTS=( "${AMHARIC_FONTS[@]}" ) ;; + chr ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ "Noto Sans Cherokee" \ ) ;; ell | grc ) NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_FACTOR=0.08 - FONTS=( "${GREEK_FONTS[@]}" ) ;; - hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; - iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; - kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; + test -z "$FONTS" && FONTS=( "${GREEK_FONTS[@]}" ) ;; + hye ) test -z "$FONTS" && FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; + iku ) test -z "$FONTS" && FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; + kat) test -z "$FONTS" && FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; kat_old) TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt" - FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; - kir ) FONTS=( "${KYRGYZ_FONTS[@]}" ) + test -z "$FONTS" && FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; + kir ) test -z "$FONTS" && FONTS=( "${KYRGYZ_FONTS[@]}" ) TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; - kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;; + kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; *) err "Error: ${lang} is not a valid language code" esac @@ -1061,6 +1060,8 @@ set_lang_specific_parameters() { elif [[ ! -z ${MEAN_COUNT} ]]; then TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}" fi + # Default to Latin fonts if none have been set + test -z "$FONTS" && FONTS=( "${LATIN_FONTS[@]}" ) } #============================================================================= diff --git a/training/tesstrain.sh b/training/tesstrain.sh index ecf2072083..c1af1e86c1 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -17,7 +17,6 @@ # USAGE: # # tesstrain.sh -# --bin_dir PATH # Location of training program. # --fontlist FONTS_STR # A plus-separated list of fontnames to train on. # --fonts_dir FONTS_PATH # Path to font files. 
# --lang LANG_CODE # ISO 639 code. @@ -25,6 +24,7 @@ # --output_dir OUTPUTDIR # Location of output traineddata file. # --overwrite # Safe to overwrite files in output_dir. # --run_shape_clustering # Run shape clustering (use for Indic langs). +# --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1"). # # OPTIONAL flags for input data. If unspecified we will look for them in # the langdata_dir directory. @@ -49,11 +49,8 @@ source `dirname $0`/tesstrain_utils.sh ARGV=("$@") parse_flags -tlog "\n=== Starting training for language '${LANG_CODE}'" - -tlog "Cleaning workspace directory ${TRAINING_DIR}..." mkdir -p ${TRAINING_DIR} -rm -fr ${TRAINING_DIR}/* +tlog "\n=== Starting training for language '${LANG_CODE}'" source `dirname $0`/language-specific.sh set_lang_specific_parameters ${LANG_CODE} diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index a3ad7f5142..30006bc1f7 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -16,10 +16,6 @@ # # USAGE: source tesstrain_utils.sh -FONTS=( - "Arial" \ - "Times New Roman," \ -) if [ "$(uname)" == "Darwin" ];then FONTS_DIR="/Library/Fonts/" else @@ -29,7 +25,8 @@ OUTPUT_DIR="/tmp/tesstrain/tessdata" OVERWRITE=0 RUN_SHAPE_CLUSTERING=0 EXTRACT_FONT_PROPERTIES=1 -WORKSPACE_DIR="/tmp/tesstrain" +WORKSPACE_DIR=`mktemp -d` +EXPOSURES=0 # Logging helper functions. tlog() { @@ -45,11 +42,11 @@ err_exit() { # if the program file is not found. # Usage: run_command CMD ARG1 ARG2... run_command() { - local cmd=$1 - shift - if [[ ! -x ${cmd} ]]; then - err_exit "File ${cmd} not found" + local cmd=`which $1` + if [[ -z ${cmd} ]]; then + err_exit "$1 not found" fi + shift tlog "[$(date)] ${cmd} $@" ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE} # check completion status @@ -69,22 +66,6 @@ check_file_readable() { done } -# Set global path variables that are based on parsed flags. 
-set_prog_paths() { - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify location of program files" - fi - CN_TRAINING_EXE=${BINDIR}/cntraining - COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata - MF_TRAINING_EXE=${BINDIR}/mftraining - SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties - SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering - TESSERACT_EXE=${BINDIR}/tesseract - TEXT2IMAGE_EXE=${BINDIR}/text2image - UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor - WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg -} - # Sets the named variable to given value. Aborts if the value is missing or # if it looks like a flag. # Usage: parse_value VAR_NAME VALUE @@ -109,9 +90,6 @@ parse_flags() { case ${ARGV[$i]} in --) break;; - --bin_dir) - parse_value "BINDIR" ${ARGV[$j]} - i=$j ;; --fontlist) # Expect a plus-separated list of names if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then err_exit "Invalid value passed to --fontlist" @@ -121,6 +99,16 @@ parse_flags() { FONTS=( ${ARGV[$j]} ) IFS=$ofs i=$j ;; + --exposures) + exp="" + while test $j -lt ${#ARGV[@]}; do + test -z ${ARGV[$j]} && break + test `echo ${ARGV[$j]} | cut -c -2` = "--" && break + exp="$exp ${ARGV[$j]}" + j=$((j+1)) + done + parse_value "EXPOSURES" "$exp" + i=$((j-1)) ;; --fonts_dir) parse_value "FONTS_DIR" ${ARGV[$j]} i=$j ;; @@ -156,9 +144,6 @@ parse_flags() { if [[ -z ${LANG_CODE} ]]; then err_exit "Need to specify a language --lang" fi - if [[ -z ${BINDIR} ]]; then - err_exit "Need to specify path to built binaries --bin_dir" - fi if [[ -z ${LANGDATA_ROOT} ]]; then err_exit "Need to specify path to language files --langdata_dir" fi @@ -171,8 +156,6 @@ parse_flags() { fi fi - set_prog_paths - # Location where intermediate files will be created. TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE} # Location of log file for the whole run. 
@@ -200,8 +183,8 @@ initialize_fontconfig() { export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX) local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt echo "Text" >${sample_path} - run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ - --font="Arial" --outputbase=${sample_path} --text=${sample_path} \ + run_command text2image --fonts_dir=${FONTS_DIR} \ + --font="${FONTS[0]}" --outputbase=${sample_path} --text=${sample_path} \ --fontconfig_tmpdir=${FONT_CONFIG_CACHE} } @@ -228,14 +211,14 @@ generate_font_image() { fi done - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS} check_file_readable ${outbase}.box ${outbase}.tif if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${TRAIN_NGRAMS_FILE} ]]; then tlog "Extracting font properties of ${font}" - run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + run_command text2image ${common_args} --font="${font}" \ --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ --only_extract_font_properties --ptsize=32 check_file_readable ${outbase}.fontinfo @@ -254,35 +237,36 @@ phase_I_generate_image() { err_exit "Could not find training text file ${TRAINING_TEXT}" fi CHAR_SPACING="0.0" - EXPOSURE="0" - - if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then - # Parse .bigram_freqs file and compose a .train_ngrams file with text - # for tesseract to recognize during training. Take only the ngrams whose - # combined weight accounts for 95% of all the bigrams in the language. 
- NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2}; END {print (s/100)*p}' p=99) - cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ - | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ - x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} - check_file_readable ${TRAIN_NGRAMS_FILE} - fi - local counter=0 - for font in "${FONTS[@]}"; do - generate_font_image "${font}" & - let counter=counter+1 - let rem=counter%par_factor - if [[ "${rem}" -eq 0 ]]; then - wait + for EXPOSURE in $EXPOSURES; do + if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. Take only the ngrams whose + # combined weight accounts for 95% of all the bigrams in the language. + NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2}; END {print (s/100)*p}' p=99) + cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ + | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ + x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} + check_file_readable ${TRAIN_NGRAMS_FILE} fi - done - wait - # Check that each process was successful. - for font in "${FONTS[@]}"; do - local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - check_file_readable ${outbase}.box ${outbase}.tif + + local counter=0 + for font in "${FONTS[@]}"; do + generate_font_image "${font}" & + let counter=counter+1 + let rem=counter%par_factor + if [[ "${rem}" -eq 0 ]]; then + wait + fi + done + wait + # Check that each process was successful. 
+ for font in "${FONTS[@]}"; do + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + check_file_readable ${outbase}.box ${outbase}.tif + done done } @@ -291,7 +275,7 @@ phase_UP_generate_unicharset() { tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" local box_files=$(ls ${TRAINING_DIR}/*.box) - run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files} + run_command unicharset_extractor -D "${TRAINING_DIR}/" ${box_files} local outfile=${TRAINING_DIR}/unicharset UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" check_file_readable ${outfile} @@ -299,7 +283,7 @@ phase_UP_generate_unicharset() { XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" check_file_readable ${UNICHARSET_FILE} - run_command ${SET_UNICHARSET_PROPERTIES_EXE} \ + run_command set_unicharset_properties \ -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ --script_dir=${LANGDATA_ROOT} check_file_readable ${XHEIGHTS_FILE} @@ -327,7 +311,7 @@ phase_D_generate_dawg() { if [[ -s ${WORDLIST_FILE} ]]; then tlog "Generating word Dawg" check_file_readable ${UNICHARSET_FILE} - run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ + run_command wordlist2dawg -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ ${UNICHARSET_FILE} check_file_readable ${WORD_DAWG} @@ -339,13 +323,13 @@ phase_D_generate_dawg() { if [[ -s ${freq_wordlist_file} ]]; then check_file_readable ${UNICHARSET_FILE} tlog "Generating frequent-word Dawg" - run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} \ + run_command wordlist2dawg -r 1 ${freq_wordlist_file} \ ${FREQ_DAWG} ${UNICHARSET_FILE} check_file_readable ${FREQ_DAWG} fi # Punctuation DAWG - # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy + # -r arguments to wordlist2dawg denote RTL reverse policy # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). 
# We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, @@ -360,20 +344,20 @@ phase_D_generate_dawg() { PUNC_FILE="${LANGDATA_ROOT}/common.punc" fi check_file_readable ${PUNC_FILE} - run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \ + run_command wordlist2dawg -r ${punc_reverse_policy} \ ${PUNC_FILE} ${PUNC_DAWG} ${UNICHARSET_FILE} check_file_readable ${PUNC_DAWG} # Numbers DAWG if [[ -s ${NUMBERS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 0 \ + run_command wordlist2dawg -r 0 \ ${NUMBERS_FILE} ${NUMBER_DAWG} ${UNICHARSET_FILE} check_file_readable ${NUMBER_DAWG} fi # Bigram dawg if [[ -s ${WORD_BIGRAMS_FILE} ]]; then - run_command ${WORDLIST2DAWG_EXE} -r 1 \ + run_command wordlist2dawg -r 1 \ ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} check_file_readable ${BIGRAM_DAWG} fi @@ -387,10 +371,9 @@ phase_E_extract_features() { par_factor=1 fi tlog "\n=== Phase E: Extracting features ===" - TRAIN_EXPOSURES='0' local img_files="" - for exposure in ${TRAIN_EXPOSURES}; do + for exposure in ${EXPOSURES}; do img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) done @@ -405,7 +388,7 @@ phase_E_extract_features() { tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" local counter=0 for img_file in ${img_files}; do - run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \ + run_command tesseract ${img_file} ${img_file%.*} \ ${box_config} ${config} & let counter=counter+1 let rem=counter%par_factor @@ -427,7 +410,7 @@ phase_C_cluster_prototypes() { tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" local out_normproto=$1 - run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \ + run_command cntraining -D "${TRAINING_DIR}/" \ $(ls ${TRAINING_DIR}/*.tr) check_file_readable ${TRAINING_DIR}/normproto @@ -447,7 +430,7 @@ phase_S_cluster_shapes() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${SHAPE_TRAINING_EXE} \ + 
run_command shapeclustering \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -468,7 +451,7 @@ phase_M_cluster_microfeatures() { font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" fi - run_command ${MF_TRAINING_EXE} \ + run_command mftraining \ -D "${TRAINING_DIR}/" \ -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ @@ -528,7 +511,7 @@ make__traineddata() { fi # Compose the traineddata file. - run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}. + run_command combine_tessdata ${TRAINING_DIR}/${LANG_CODE}. # Copy it to the output dir, overwriting only if allowed by the cmdline flag. if [[ ! -d ${OUTPUT_DIR} ]]; then diff --git a/viewer/svutil.cpp b/viewer/svutil.cpp index a820eafbc5..f94c1c86d5 100644 --- a/viewer/svutil.cpp +++ b/viewer/svutil.cpp @@ -127,7 +127,7 @@ SVSemaphore::SVSemaphore() { semaphore_ = CreateSemaphore(0, 0, 10, 0); #elif defined(__APPLE__) char name[50]; - snprintf(name, sizeof(name), "%d", random()); + snprintf(name, sizeof(name), "%ld", random()); sem_unlink(name); semaphore_ = sem_open(name, O_CREAT , S_IWUSR, 0); if (semaphore_ == SEM_FAILED) { @@ -296,14 +296,11 @@ static std::string ScrollViewCommand(std::string scrollview_path) { // this unnecessary. // Also the path has to be separated by ; on windows and : otherwise. 
#ifdef _WIN32 - const char* cmd_template = "-Djava.library.path=%s -cp %s/ScrollView.jar;" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView"; + const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar"; + #else const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java " - "-Xms1024m -Xmx2048m -Djava.library.path=%s -cp %s/ScrollView.jar:" - "%s/piccolo2d-core-3.0.jar:%s/piccolo2d-extras-3.0.jar" - " com.google.scrollview.ScrollView" + "-Xms1024m -Xmx2048m -jar %s/ScrollView.jar" " & wait\""; #endif int cmdlen = strlen(cmd_template) + 4*strlen(scrollview_path.c_str()) + 1; @@ -374,7 +371,7 @@ static int GetAddrInfo(const char* hostname, int port, struct addrinfo** address) { #if defined(__linux__) char port_str[40]; - snprintf(port_str, 40, "%d", port); + snprintf(port_str, 40, "%ld", (long)port); return getaddrinfo(hostname, port_str, NULL, address); #else return GetAddrInfoNonLinux(hostname, port, address); diff --git a/wordrec/lm_state.h b/wordrec/lm_state.h index c87745b75a..623bbb5e7f 100644 --- a/wordrec/lm_state.h +++ b/wordrec/lm_state.h @@ -177,11 +177,11 @@ struct ViterbiStateEntry : public ELIST_LINK { /// the smallest rating or lower/upper case letters). LanguageModelFlagsType top_choice_flags; - /// Extra information maintained by Dawg laguage model component + /// Extra information maintained by Dawg language model component /// (owned by ViterbiStateEntry). LanguageModelDawgInfo *dawg_info; - /// Extra information maintained by Ngram laguage model component + /// Extra information maintained by Ngram language model component /// (owned by ViterbiStateEntry). 
LanguageModelNgramInfo *ngram_info; diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index 1818478c66..04e340396e 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -273,7 +273,7 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, * * Recursively go through the ratings matrix to find lists of fragments * to be merged in the function merge_and_put_fragment_lists. - * current_frag is the postion of the piece we are looking for. + * current_frag is the position of the piece we are looking for. * current_row is the row in the rating matrix we are currently at. * start is the row we started initially, so that we can know where * to append the results to the matrix. num_frag_parts is the total diff --git a/wordrec/wordrec.h b/wordrec/wordrec.h index 38f09f23d2..fb54ccae08 100644 --- a/wordrec/wordrec.h +++ b/wordrec/wordrec.h @@ -375,7 +375,7 @@ class Wordrec : public Classify { inT16 num_blobs); // Recursively go through the ratings matrix to find lists of fragments // to be merged in the function merge_and_put_fragment_lists. - // current_frag is the postion of the piece we are looking for. + // current_frag is the position of the piece we are looking for. // current_row is the row in the rating matrix we are currently at. // start is the row we started initially, so that we can know where // to append the results to the matrix. num_frag_parts is the total