Skip to content

Commit

Permalink
Fixed issue 1207
Browse files Browse the repository at this point in the history
  • Loading branch information
theraysmith committed Oct 9, 2014
1 parent d0cb107 commit f927728
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 7 deletions.
29 changes: 25 additions & 4 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,23 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.cpp
// Description: An instance of Tesseract. For thread safety, *every*
// global variable goes in here, directly, or indirectly.
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// variable that was previously global or static (except for
// constant data, and some visual debugging flags) has been moved
// in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Some global functions remain, but they are isolated re-entrant
// functions that operate on their arguments. Functions that work
// on variable data have been moved to an appropriate class based
// mostly on the directory hierarchy. For more information see
// slide 6 of "2ArchitectureAndDataStructures" in
// https://drive.google.com/file/d/0B7l10Bj_LprhbUlIUFlCdGtDYkE/edit?usp=sharing
// Some global data and related functions still exist in the
// training-related code, but they don't interfere with normal
// recognition operation.
// Author: Ray Smith
// Created: Fri Mar 07 08:17:01 PST 2008
//
Expand Down Expand Up @@ -65,6 +81,9 @@ Tesseract::Tesseract()
"Blacklist of chars not to recognize", this->params()),
STRING_MEMBER(tessedit_char_whitelist, "",
"Whitelist of chars to recognize", this->params()),
STRING_MEMBER(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist",
this->params()),
BOOL_MEMBER(tessedit_ambigs_training, false,
"Perform training for ambiguities", this->params()),
INT_MEMBER(pageseg_devanagari_split_strategy,
Expand Down Expand Up @@ -578,11 +597,13 @@ void Tesseract::ResetDocumentDictionary() {
// Pushes this instance's tessedit_char_blacklist, tessedit_char_whitelist and
// tessedit_char_unblacklist parameter strings into its own unicharset and
// into the unicharset of every entry in sub_langs_. Note that the
// sub-languages receive this (main) instance's parameter values, not their
// own copies of those parameters.
void Tesseract::SetBlackAndWhitelist() {
// Set the white and blacklists (if any)
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
tessedit_char_whitelist.string());
tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
// Black and white lists should apply to all loaded classifiers.
for (int i = 0; i < sub_langs_.size(); ++i) {
sub_langs_[i]->unicharset.set_black_and_whitelist(
tessedit_char_blacklist.string(), tessedit_char_whitelist.string());
tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
tessedit_char_unblacklist.string());
}
}

Expand Down
9 changes: 8 additions & 1 deletion ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
///////////////////////////////////////////////////////////////////////
// File: tesseractclass.h
// Description: An instance of Tesseract. For thread safety, *every*
// Description: The Tesseract class. It holds/owns everything needed
// to run Tesseract on a single language, and also a set of
// sub-Tesseracts to run sub-languages. For thread safety, *every*
// global variable goes in here, directly, or indirectly.
// This makes it safe to run multiple Tesseracts in different
// threads in parallel, and keeps the different language
// instances separate.
// Author: Ray Smith
// Created: Fri Mar 07 08:17:01 PST 2008
//
Expand Down Expand Up @@ -743,6 +748,8 @@ class Tesseract : public Wordrec {
"Blacklist of chars not to recognize");
STRING_VAR_H(tessedit_char_whitelist, "",
"Whitelist of chars to recognize");
STRING_VAR_H(tessedit_char_unblacklist, "",
"List of chars to override tessedit_char_blacklist");
BOOL_VAR_H(tessedit_ambigs_training, false,
"Perform training for ambiguities");
INT_VAR_H(pageseg_devanagari_split_strategy,
Expand Down
13 changes: 12 additions & 1 deletion ccutil/unicharset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -985,8 +985,10 @@ bool UNICHARSET::major_right_to_left() const {
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// An empty or NULL unblacklist has no effect.
void UNICHARSET::set_black_and_whitelist(const char* blacklist,
const char* whitelist) {
const char* whitelist,
const char* unblacklist) {
bool def_enabled = whitelist == NULL || whitelist[0] == '\0';
// Set everything to default
for (int ch = 0; ch < size_used; ++ch)
Expand All @@ -1009,6 +1011,15 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
unichars[encoding[i]].properties.enabled = false;
}
}
if (unblacklist != NULL && unblacklist[0] != '\0') {
// Re-enable the unblacklist.
GenericVector<UNICHAR_ID> encoding;
encode_string(unblacklist, false, &encoding, NULL, NULL);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
unichars[encoding[i]].properties.enabled = true;
}
}
}

int UNICHARSET::add_script(const char* script) {
Expand Down
5 changes: 4 additions & 1 deletion ccutil/unicharset.h
Original file line number Diff line number Diff line change
Expand Up @@ -381,11 +381,14 @@ class UNICHARSET {
// Set a whitelist and/or blacklist of characters to recognize.
// An empty or NULL whitelist enables everything (minus any blacklist).
// An empty or NULL blacklist disables nothing.
// An empty or NULL unblacklist has no effect.
// The blacklist overrides the whitelist.
// The unblacklist overrides the blacklist.
// Each list is a string of utf8 character strings. Boundaries between
// unicharset units are worked out automatically, and characters not in
// the unicharset are silently ignored.
void set_black_and_whitelist(const char* blacklist, const char* whitelist);
void set_black_and_whitelist(const char* blacklist, const char* whitelist,
const char* unblacklist);

// Set the isalpha property of the given unichar to the given value.
void set_isalpha(UNICHAR_ID unichar_id, bool value) {
Expand Down

0 comments on commit f927728

Please sign in to comment.