From 25d0968d094a8f6d4ec52f1cc9b869f4a650e3b7 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 14:59:14 -0700 Subject: [PATCH 01/15] Major refactor to improve speed on difficult images, especially when running a heap checker. SEAM and SPLIT have been begging for a refactor for a *LONG* time. This change does most of the work of turning them into proper classes: Moved relevant code into SEAM/SPLIT/TBLOB/EDGEPT etc from global helper functions. Made the splits full data members of SEAM in an array instead of 3 separate pointers. This greatly reduces the amount of new/delete happening in the chopper, which is the main goal. Deleted redundant files: olutil.*, makechop.* Brought other code into SEAM in order to keep its data members private with only priority having accessors. --- ccmain/applybox.cpp | 2 +- ccmain/tfacepp.cpp | 2 +- ccstruct/blobs.cpp | 78 +++++- ccstruct/blobs.h | 108 +++++++- ccstruct/pageres.cpp | 18 +- ccstruct/seam.cpp | 597 ++++++++++++---------------------------- ccstruct/seam.h | 228 +++++++++------ ccstruct/split.cpp | 271 ++++++++++++------ ccstruct/split.h | 101 ++++--- ccstruct/vecfuncs.cpp | 1 + ccstruct/vecfuncs.h | 1 - classify/adaptmatch.cpp | 7 +- wordrec/chop.cpp | 65 +---- wordrec/chopper.cpp | 76 +---- wordrec/chopper.h | 2 - wordrec/findseam.cpp | 216 +++------------ wordrec/gradechop.cpp | 152 +--------- wordrec/gradechop.h | 19 -- wordrec/makechop.cpp | 226 --------------- wordrec/makechop.h | 71 ----- wordrec/olutil.cpp | 102 ------- wordrec/olutil.h | 82 ------ wordrec/outlines.cpp | 93 ------- wordrec/pieces.cpp | 4 +- wordrec/plotedges.cpp | 17 -- wordrec/plotedges.h | 2 - wordrec/segsearch.cpp | 3 +- wordrec/wordrec.h | 27 +- 28 files changed, 821 insertions(+), 1750 deletions(-) delete mode 100644 wordrec/makechop.cpp delete mode 100644 wordrec/makechop.h delete mode 100644 wordrec/olutil.cpp delete mode 100644 wordrec/olutil.h diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index b9a28fa35c..9c067e7932 
100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -582,7 +582,7 @@ bool Tesseract::FindSegmentation(const GenericVector& target_text, int blob_count = 1; for (int s = 0; s < word_res->seam_array.size(); ++s) { SEAM* seam = word_res->seam_array[s]; - if (seam->split1 == NULL) { + if (!seam->HasAnySplits()) { word_res->best_state.push_back(blob_count); blob_count = 1; } else { diff --git a/ccmain/tfacepp.cpp b/ccmain/tfacepp.cpp index 45775fe40e..e1dc778f34 100644 --- a/ccmain/tfacepp.cpp +++ b/ccmain/tfacepp.cpp @@ -254,7 +254,7 @@ void Tesseract::join_words(WERD_RES *word, // Move the word2 seams onto the end of the word1 seam_array. // Since the seam list is one element short, an empty seam marking the // end of the last blob in the first word is needed first. - word->seam_array.push_back(new SEAM(0.0f, split_pt, NULL, NULL, NULL)); + word->seam_array.push_back(new SEAM(0.0f, split_pt)); word->seam_array += word2->seam_array; word2->seam_array.truncate(0); // Fix widths and gaps. diff --git a/ccstruct/blobs.cpp b/ccstruct/blobs.cpp index a0e6dc7b4c..97f95eba2a 100644 --- a/ccstruct/blobs.cpp +++ b/ccstruct/blobs.cpp @@ -64,6 +64,42 @@ const TPOINT kDivisibleVerticalItalic(1, 5); CLISTIZE(EDGEPT); +// Returns true when the two line segments cross each other. +// (Moved from outlines.cpp). +// Finds where the projected lines would cross and then checks to see if the +// point of intersection lies on both of the line segments. If it does +// then these two segments cross. 
+/* static */ +bool TPOINT::IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0, + const TPOINT& b1) { + int b0a1xb0b1, b0b1xb0a0; + int a1b1xa1a0, a1a0xa1b0; + + TPOINT b0a1, b0a0, a1b1, b0b1, a1a0; + + b0a1.x = a1.x - b0.x; + b0a0.x = a0.x - b0.x; + a1b1.x = b1.x - a1.x; + b0b1.x = b1.x - b0.x; + a1a0.x = a0.x - a1.x; + b0a1.y = a1.y - b0.y; + b0a0.y = a0.y - b0.y; + a1b1.y = b1.y - a1.y; + b0b1.y = b1.y - b0.y; + a1a0.y = a0.y - a1.y; + + b0a1xb0b1 = CROSS(b0a1, b0b1); + b0b1xb0a0 = CROSS(b0b1, b0a0); + a1b1xa1a0 = CROSS(a1b1, a1a0); + // For clarity, we want CROSS(a1a0,a1b0) here but we have b0a1 instead of a1b0 + // so use -CROSS(a1b0,b0a1) instead, which is the same. + a1a0xa1b0 = -CROSS(a1a0, b0a1); + + return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) || + (b0a1xb0b1 < 0 && b0b1xb0a0 < 0)) && + ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) || (a1b1xa1a0 < 0 && a1a0xa1b0 < 0)); +} + // Consume the circular list of EDGEPTs to make a TESSLINE. TESSLINE* TESSLINE::BuildFromOutlineList(EDGEPT* outline) { TESSLINE* result = new TESSLINE; @@ -454,6 +490,36 @@ TBOX TBLOB::bounding_box() const { return box; } +// Finds and deletes any duplicate outlines in this blob, without deleting +// their EDGEPTs. +void TBLOB::EliminateDuplicateOutlines() { + for (TESSLINE* outline = outlines; outline != NULL; outline = outline->next) { + TESSLINE* last_outline = outline; + for (TESSLINE* other_outline = outline->next; other_outline != NULL; + last_outline = other_outline, other_outline = other_outline->next) { + if (outline->SameBox(*other_outline)) { + last_outline->next = other_outline->next; + // This doesn't leak - the outlines share the EDGEPTs. + other_outline->loop = NULL; + delete other_outline; + other_outline = last_outline; + // If it is part of a cut, then it can't be a hole any more. + outline->is_hole = false; + } + } + } +} + +// Swaps the outlines of *this and next if needed to keep the centers in +// increasing x. 
+void TBLOB::CorrectBlobOrder(TBLOB* next) { + TBOX box = bounding_box(); + TBOX next_box = next->bounding_box(); + if (box.x_middle() > next_box.x_middle()) { + Swap(&outlines, &next->outlines); + } +} + #ifndef GRAPHICS_DISABLED void TBLOB::plot(ScrollView* window, ScrollView::Color color, ScrollView::Color child_color) { @@ -858,18 +924,6 @@ void TWERD::plot(ScrollView* window) { } #endif // GRAPHICS_DISABLED -/********************************************************************** - * blob_origin - * - * Compute the origin of a compound blob, define to be the centre - * of the bounding box. - **********************************************************************/ -void blob_origin(TBLOB *blob, /*blob to compute on */ - TPOINT *origin) { /*return value */ - TBOX bbox = blob->bounding_box(); - *origin = (bbox.topleft() + bbox.botright()) / 2; -} - /********************************************************************** * divisible_blob * diff --git a/ccstruct/blobs.h b/ccstruct/blobs.h index e39761b170..1fd9683ef9 100644 --- a/ccstruct/blobs.h +++ b/ccstruct/blobs.h @@ -60,6 +60,13 @@ struct TPOINT { x /= divisor; y /= divisor; } + bool operator==(const TPOINT& other) const { + return x == other.x && y == other.y; + } + // Returns true when the two line segments cross each other. + // (Moved from outlines.cpp). + static bool IsCrossed(const TPOINT& a0, const TPOINT& a1, const TPOINT& b0, + const TPOINT& b1); inT16 x; // absolute x coord. inT16 y; // absolute y coord. @@ -87,6 +94,55 @@ struct EDGEPT { start_step = src.start_step; step_count = src.step_count; } + // Returns the squared distance between the points, with the x-component + // weighted by x_factor. + int WeightedDistance(const EDGEPT& other, int x_factor) const { + int x_dist = pos.x - other.pos.x; + int y_dist = pos.y - other.pos.y; + return x_dist * x_dist * x_factor + y_dist * y_dist; + } + // Returns true if the positions are equal. 
+ bool EqualPos(const EDGEPT& other) const { return pos == other.pos; } + // Returns the bounding box of the outline segment from *this to *end. + // Ignores hidden edge flags. + TBOX SegmentBox(const EDGEPT* end) const { + TBOX box(pos.x, pos.y, pos.x, pos.y); + const EDGEPT* pt = this; + do { + pt = pt->next; + if (pt->pos.x < box.left()) box.set_left(pt->pos.x); + if (pt->pos.x > box.right()) box.set_right(pt->pos.x); + if (pt->pos.y < box.bottom()) box.set_bottom(pt->pos.y); + if (pt->pos.y > box.top()) box.set_top(pt->pos.y); + } while (pt != end && pt != this); + return box; + } + // Returns the area of the outline segment from *this to *end. + // Ignores hidden edge flags. + int SegmentArea(const EDGEPT* end) const { + int area = 0; + const EDGEPT* pt = this->next; + do { + TPOINT origin_vec(pt->pos.x - pos.x, pt->pos.y - pos.y); + area += CROSS(origin_vec, pt->vec); + pt = pt->next; + } while (pt != end && pt != this); + return area; + } + // Returns true if the number of points in the outline segment from *this to + // *end is less that min_points and false if we get back to *this first. + // Ignores hidden edge flags. + bool ShortNonCircularSegment(int min_points, const EDGEPT* end) const { + int count = 0; + const EDGEPT* pt = this; + do { + if (pt == end) return true; + pt = pt->next; + ++count; + } while (pt != this && count <= min_points); + return false; + } + // Accessors to hide or reveal a cut edge from feature extractors. void Hide() { flags[0] = true; @@ -100,9 +156,6 @@ struct EDGEPT { void MarkChop() { flags[2] = true; } - void UnmarkChop() { - flags[2] = false; - } bool IsChopPt() const { return flags[2] != 0; } @@ -162,8 +215,23 @@ struct TESSLINE { void MinMaxCrossProduct(const TPOINT vec, int* min_xp, int* max_xp) const; TBOX bounding_box() const; + // Returns true if *this and other have equal bounding boxes. 
+ bool SameBox(const TESSLINE& other) const { + return topleft == other.topleft && botright == other.botright; + } + // Returns true if the given line segment crosses any outline of this blob. + bool SegmentCrosses(const TPOINT& pt1, const TPOINT& pt2) const { + if (Contains(pt1) && Contains(pt2)) { + EDGEPT* pt = loop; + do { + if (TPOINT::IsCrossed(pt1, pt2, pt->pos, pt->next->pos)) return true; + pt = pt->next; + } while (pt != loop); + } + return false; + } // Returns true if the point is contained within the outline box. - bool Contains(const TPOINT& pt) { + bool Contains(const TPOINT& pt) const { return topleft.x <= pt.x && pt.x <= botright.x && botright.y <= pt.y && pt.y <= topleft.y; } @@ -244,6 +312,31 @@ struct TBLOB { TBOX bounding_box() const; + // Returns true if the given line segment crosses any outline of this blob. + bool SegmentCrossesOutline(const TPOINT& pt1, const TPOINT& pt2) const { + for (const TESSLINE* outline = outlines; outline != NULL; + outline = outline->next) { + if (outline->SegmentCrosses(pt1, pt2)) return true; + } + return false; + } + // Returns true if the point is contained within any of the outline boxes. + bool Contains(const TPOINT& pt) const { + for (const TESSLINE* outline = outlines; outline != NULL; + outline = outline->next) { + if (outline->Contains(pt)) return true; + } + return false; + } + + // Finds and deletes any duplicate outlines in this blob, without deleting + // their EDGEPTs. + void EliminateDuplicateOutlines(); + + // Swaps the outlines of *this and next if needed to keep the centers in + // increasing x. 
+ void CorrectBlobOrder(TBLOB* next); + const DENORM& denorm() const { return denorm_; } @@ -358,12 +451,7 @@ if (w) memfree (w) /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ -// TODO(rays) This will become a member of TBLOB when TBLOB's definition -// moves to blobs.h - -// Returns the center of blob's bounding box in origin. -void blob_origin(TBLOB *blob, TPOINT *origin); - +// TODO(rays) Make divisible_blob and divide_blobs members of TBLOB. bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT* location); void divide_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 6a7f7a0255..58f7d8a838 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -404,7 +404,8 @@ void WERD_RES::SetupBlobWidthsAndGaps() { // as the blob widths and gaps. void WERD_RES::InsertSeam(int blob_number, SEAM* seam) { // Insert the seam into the SEAMS array. - insert_seam(chopped_word, blob_number, seam, &seam_array); + seam->PrepareToInsertSeam(seam_array, chopped_word->blobs, blob_number, true); + seam_array.insert(seam, blob_number); if (ratings != NULL) { // Expand the ratings matrix. 
ratings = ratings->ConsumeAndMakeBigger(blob_number); @@ -804,12 +805,16 @@ void WERD_RES::RebuildBestState() { for (int i = 0; i < best_choice->length(); ++i) { int length = best_choice->state(i); best_state.push_back(length); - if (length > 1) - join_pieces(seam_array, start, start + length - 1, chopped_word); + if (length > 1) { + SEAM::JoinPieces(seam_array, chopped_word->blobs, start, + start + length - 1); + } TBLOB* blob = chopped_word->blobs[start]; rebuild_word->blobs.push_back(new TBLOB(*blob)); - if (length > 1) - break_pieces(seam_array, start, start + length - 1, chopped_word); + if (length > 1) { + SEAM::BreakPieces(seam_array, chopped_word->blobs, start, + start + length - 1); + } start += length; } } @@ -1065,8 +1070,7 @@ bool WERD_RES::PiecesAllNatural(int start, int count) const { for (int index = start; index < start + count - 1; ++index) { if (index >= 0 && index < seam_array.size()) { SEAM* seam = seam_array[index]; - if (seam != NULL && seam->split1 != NULL) - return false; + if (seam != NULL && seam->HasAnySplits()) return false; } } return true; diff --git a/ccstruct/seam.cpp b/ccstruct/seam.cpp index e05fac9a96..3d70eafcd9 100644 --- a/ccstruct/seam.cpp +++ b/ccstruct/seam.cpp @@ -27,375 +27,182 @@ ----------------------------------------------------------------------*/ #include "seam.h" #include "blobs.h" -#include "freelist.h" #include "tprintf.h" -#ifdef __UNIX__ -#include -#endif - -/*---------------------------------------------------------------------- - V a r i a b l e s -----------------------------------------------------------------------*/ -#define NUM_STARTING_SEAMS 20 - /*---------------------------------------------------------------------- Public Function Code ----------------------------------------------------------------------*/ -/** - * @name point_in_split - * - * Check to see if either of these points are present in the current - * split. - * @returns TRUE if one of them is split. 
- */ -bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2) { - return ((split) ? ((exact_point (split->point1, point1) || - exact_point (split->point1, point2) || - exact_point (split->point2, point1) || - exact_point (split->point2, point2)) ? TRUE : FALSE) - : FALSE); -} - - -/** - * @name point_in_seam - * - * Check to see if either of these points are present in the current - * seam. - * @returns TRUE if one of them is. - */ -bool point_in_seam(const SEAM *seam, SPLIT *split) { - return (point_in_split(seam->split1, split->point1, split->point2) || - point_in_split(seam->split2, split->point1, split->point2) || - point_in_split(seam->split3, split->point1, split->point2)); -} - -/** - * @name point_used_by_split - * - * Return whether this particular EDGEPT * is used in a given split. - * @returns TRUE if the edgept is used by the split. - */ -bool point_used_by_split(SPLIT *split, EDGEPT *point) { - if (split == NULL) return false; - return point == split->point1 || point == split->point2; -} -/** - * @name point_used_by_seam - * - * Return whether this particular EDGEPT * is used in a given seam. - * @returns TRUE if the edgept is used by the seam. - */ -bool point_used_by_seam(SEAM *seam, EDGEPT *point) { - if (seam == NULL) return false; - return point_used_by_split(seam->split1, point) || - point_used_by_split(seam->split2, point) || - point_used_by_split(seam->split3, point); +// Returns the bounding box of all the points in the seam. +TBOX SEAM::bounding_box() const { + TBOX box(location_.x, location_.y, location_.x, location_.y); + for (int s = 0; s < num_splits_; ++s) { + box += splits_[s].bounding_box(); + } + return box; } -/** - * @name combine_seam - * - * Combine two seam records into a single seam. Move the split - * references from the second seam to the first one. The argument - * convention is patterned after strcpy. 
- */ -void combine_seams(SEAM *dest_seam, SEAM *source_seam) { - dest_seam->priority += source_seam->priority; - dest_seam->location += source_seam->location; - dest_seam->location /= 2; - - if (source_seam->split1) { - if (!dest_seam->split1) - dest_seam->split1 = source_seam->split1; - else if (!dest_seam->split2) - dest_seam->split2 = source_seam->split1; - else if (!dest_seam->split3) - dest_seam->split3 = source_seam->split1; - else - delete source_seam->split1; // Wouldn't have fitted. - source_seam->split1 = NULL; - } - if (source_seam->split2) { - if (!dest_seam->split2) - dest_seam->split2 = source_seam->split2; - else if (!dest_seam->split3) - dest_seam->split3 = source_seam->split2; - else - delete source_seam->split2; // Wouldn't have fitted. - source_seam->split2 = NULL; - } - if (source_seam->split3) { - if (!dest_seam->split3) - dest_seam->split3 = source_seam->split3; - else - delete source_seam->split3; // Wouldn't have fitted. - source_seam->split3 = NULL; +// Returns true if other can be combined into *this. +bool SEAM::CombineableWith(const SEAM& other, int max_x_dist, + float max_total_priority) const { + int dist = location_.x - other.location_.x; + if (-max_x_dist < dist && dist < max_x_dist && + num_splits_ + other.num_splits_ <= kMaxNumSplits && + priority_ + other.priority_ < max_total_priority && + !OverlappingSplits(other) && !SharesPosition(other)) { + return true; + } else { + return false; } - delete source_seam; } -/** - * @name start_seam_list - * - * Initialize a list of seams that match the original number of blobs - * present in the starting segmentation. Each of the seams created - * by this routine have location information only. - */ -void start_seam_list(TWERD *word, GenericVector* seam_array) { - seam_array->truncate(0); - TPOINT location; +// Combines other into *this. Only works if CombinableWith returned true. 
+void SEAM::CombineWith(const SEAM& other) { + priority_ += other.priority_; + location_ += other.location_; + location_ /= 2; - for (int b = 1; b < word->NumBlobs(); ++b) { - TBOX bbox = word->blobs[b - 1]->bounding_box(); - TBOX nbox = word->blobs[b]->bounding_box(); - location.x = (bbox.right() + nbox.left()) / 2; - location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4; - seam_array->push_back(new SEAM(0.0f, location, NULL, NULL, NULL)); - } + for (int s = 0; s < other.num_splits_ && num_splits_ < kMaxNumSplits; ++s) + splits_[num_splits_++] = other.splits_[s]; } +// Returns true if the splits in *this SEAM appear OK in the sense that they +// do not cross any outlines and do not chop off any ridiculously small +// pieces. +bool SEAM::IsHealthy(const TBLOB& blob, int min_points, int min_area) const { + // TODO(rays) Try testing all the splits. Duplicating original code for now, + // which tested only the first. + return num_splits_ == 0 || splits_[0].IsHealthy(blob, min_points, min_area); +} -/** - * @name test_insert_seam - * - * @returns true if insert_seam will succeed. - */ -bool test_insert_seam(const GenericVector& seam_array, - TWERD *word, int index) { - SEAM *test_seam; - int list_length = seam_array.size(); - for (int test_index = 0; test_index < index; ++test_index) { - test_seam = seam_array[test_index]; - if (test_index + test_seam->widthp < index && - test_seam->widthp + test_index == index - 1 && - account_splits(test_seam, word, test_index + 1, 1) < 0) - return false; +// Computes the widthp_/widthn_ range for all existing SEAMs and for *this +// seam, which is about to be inserted at insert_index. Returns false if +// any of the computations fails, as this indicates an invalid chop. +// widthn_/widthp_ are only changed if modify is true. 
+bool SEAM::PrepareToInsertSeam(const GenericVector& seams, + const GenericVector& blobs, + int insert_index, bool modify) { + for (int s = 0; s < insert_index; ++s) { + if (!seams[s]->FindBlobWidth(blobs, s, modify)) return false; } - for (int test_index = index; test_index < list_length; test_index++) { - test_seam = seam_array[test_index]; - if (test_index - test_seam->widthn >= index && - test_index - test_seam->widthn == index && - account_splits(test_seam, word, test_index + 1, -1) < 0) - return false; + if (!FindBlobWidth(blobs, insert_index, modify)) return false; + for (int s = insert_index; s < seams.size(); ++s) { + if (!seams[s]->FindBlobWidth(blobs, s + 1, modify)) return false; } return true; } -/** - * @name insert_seam - * - * Add another seam to a collection of seams at a particular location - * in the seam array. - */ -void insert_seam(const TWERD* word, int index, SEAM *seam, - GenericVector* seam_array) { - SEAM *test_seam; - int list_length = seam_array->size(); - for (int test_index = 0; test_index < index; ++test_index) { - test_seam = seam_array->get(test_index); - if (test_index + test_seam->widthp >= index) { - test_seam->widthp++; /*got in the way */ - } else if (test_seam->widthp + test_index == index - 1) { - test_seam->widthp = account_splits(test_seam, word, test_index + 1, 1); - if (test_seam->widthp < 0) { - tprintf("Failed to find any right blob for a split!\n"); - print_seam("New dud seam", seam); - print_seam("Failed seam", test_seam); - } - } +// Computes the widthp_/widthn_ range. Returns false if not all the splits +// are accounted for. widthn_/widthp_ are only changed if modify is true. 
+bool SEAM::FindBlobWidth(const GenericVector& blobs, int index, + bool modify) { + int num_found = 0; + if (modify) { + widthp_ = 0; + widthn_ = 0; } - for (int test_index = index; test_index < list_length; test_index++) { - test_seam = seam_array->get(test_index); - if (test_index - test_seam->widthn < index) { - test_seam->widthn++; /*got in the way */ - } else if (test_index - test_seam->widthn == index) { - test_seam->widthn = account_splits(test_seam, word, test_index + 1, -1); - if (test_seam->widthn < 0) { - tprintf("Failed to find any left blob for a split!\n"); - print_seam("New dud seam", seam); - print_seam("Failed seam", test_seam); - } + for (int s = 0; s < num_splits_; ++s) { + const SPLIT& split = splits_[s]; + bool found_split = split.ContainedByBlob(*blobs[index]); + // Look right. + for (int b = index + 1; !found_split && b < blobs.size(); ++b) { + found_split = split.ContainedByBlob(*blobs[b]); + if (found_split && b - index > widthp_ && modify) widthp_ = b - index; } + // Look left. + for (int b = index - 1; !found_split && b >= 0; --b) { + found_split = split.ContainedByBlob(*blobs[b]); + if (found_split && index - b > widthn_ && modify) widthn_ = index - b; + } + if (found_split) ++num_found; } - seam_array->insert(seam, index); + return num_found == num_splits_; } +// Splits this blob into two blobs by applying the splits included in +// *this SEAM +void SEAM::ApplySeam(bool italic_blob, TBLOB* blob, TBLOB* other_blob) const { + for (int s = 0; s < num_splits_; ++s) { + splits_[s].SplitOutlineList(blob->outlines); + } + blob->ComputeBoundingBoxes(); -/** - * @name account_splits - * - * Account for all the splits by looking to the right (blob_direction == 1), - * or to the left (blob_direction == -1) in the word. 
- */ -int account_splits(const SEAM *seam, const TWERD *word, int blob_index, - int blob_direction) { - inT8 found_em[3]; - inT8 width; - - found_em[0] = seam->split1 == NULL; - found_em[1] = seam->split2 == NULL; - found_em[2] = seam->split3 == NULL; - if (found_em[0] && found_em[1] && found_em[2]) - return 0; - width = 0; - do { - TBLOB* blob = word->blobs[blob_index]; - if (!found_em[0]) - found_em[0] = find_split_in_blob(seam->split1, blob); - if (!found_em[1]) - found_em[1] = find_split_in_blob(seam->split2, blob); - if (!found_em[2]) - found_em[2] = find_split_in_blob(seam->split3, blob); - if (found_em[0] && found_em[1] && found_em[2]) { - return width; - } - width++; - blob_index += blob_direction; - } while (0 <= blob_index && blob_index < word->NumBlobs()); - return -1; -} - + divide_blobs(blob, other_blob, italic_blob, location_); -/** - * @name find_split_in_blob - * - * @returns TRUE if the split is somewhere in this blob. - */ -bool find_split_in_blob(SPLIT *split, TBLOB *blob) { - TESSLINE *outline; + blob->EliminateDuplicateOutlines(); + other_blob->EliminateDuplicateOutlines(); - for (outline = blob->outlines; outline != NULL; outline = outline->next) - if (outline->Contains(split->point1->pos)) - break; - if (outline == NULL) - return FALSE; - for (outline = blob->outlines; outline != NULL; outline = outline->next) - if (outline->Contains(split->point2->pos)) - return TRUE; - return FALSE; + blob->CorrectBlobOrder(other_blob); } +// Undoes ApplySeam by removing the seam between these two blobs. +// Produces one blob as a result, and deletes other_blob. +void SEAM::UndoSeam(TBLOB* blob, TBLOB* other_blob) const { + if (blob->outlines == NULL) { + blob->outlines = other_blob->outlines; + other_blob->outlines = NULL; + } -/** - * @name join_two_seams - * - * Merge these two seams into a new seam. Duplicate the split records - * in both of the input seams. Return the resultant seam. 
- */ -SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2) { - SEAM *result = NULL; - SEAM *temp; - - assert(seam1 &&seam2); + TESSLINE* outline = blob->outlines; + while (outline->next) outline = outline->next; + outline->next = other_blob->outlines; + other_blob->outlines = NULL; + delete other_blob; - if (((seam1->split3 == NULL && seam2->split2 == NULL) || - (seam1->split2 == NULL && seam2->split3 == NULL) || - seam1->split1 == NULL || seam2->split1 == NULL) && - (!shared_split_points(seam1, seam2))) { - result = new SEAM(*seam1); - temp = new SEAM(*seam2); - combine_seams(result, temp); + for (int s = 0; s < num_splits_; ++s) { + splits_[s].UnsplitOutlineList(blob); } - return (result); + blob->ComputeBoundingBoxes(); + blob->EliminateDuplicateOutlines(); } -/** - * @name print_seam - * - * Print a list of splits. Show the coordinates of both points in - * each split. - */ -void print_seam(const char *label, SEAM *seam) { - if (seam) { - tprintf(label); - tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ", - seam->priority, seam->location.x, seam->location.y, - seam->widthp, seam->widthn); - print_split(seam->split1); - - if (seam->split2) { - tprintf(", "); - print_split (seam->split2); - if (seam->split3) { - tprintf(", "); - print_split (seam->split3); - } - } - tprintf("\n"); +// Prints everything in *this SEAM. +void SEAM::Print(const char* label) const { + tprintf(label); + tprintf(" %6.2f @ (%d,%d), p=%d, n=%d ", priority_, location_.x, location_.y, + widthp_, widthn_); + for (int s = 0; s < num_splits_; ++s) { + splits_[s].Print(); + if (s + 1 < num_splits_) tprintf(", "); } + tprintf("\n"); } - -/** - * @name print_seams - * - * Print a list of splits. Show the coordinates of both points in - * each split. - */ -void print_seams(const char *label, const GenericVector& seams) { - char number[CHARS_PER_LINE]; - +// Prints a collection of SEAMs. 
+/* static */ +void SEAM::PrintSeams(const char* label, const GenericVector& seams) { if (!seams.empty()) { tprintf("%s\n", label); for (int x = 0; x < seams.size(); ++x) { - sprintf(number, "%2d: ", x); - print_seam(number, seams[x]); + tprintf("%2d: ", x); + seams[x]->Print(""); } tprintf("\n"); } } - -/** - * @name shared_split_points - * - * Check these two seams to make sure that neither of them have two - * points in common. Return TRUE if any of the same points are present - * in any of the splits of both seams. - */ -int shared_split_points(const SEAM *seam1, const SEAM *seam2) { - if (seam1 == NULL || seam2 == NULL) - return (FALSE); - - if (seam2->split1 == NULL) - return (FALSE); - if (point_in_seam(seam1, seam2->split1)) - return (TRUE); - - if (seam2->split2 == NULL) - return (FALSE); - if (point_in_seam(seam1, seam2->split2)) - return (TRUE); - - if (seam2->split3 == NULL) - return (FALSE); - if (point_in_seam(seam1, seam2->split3)) - return (TRUE); - - return (FALSE); +#ifndef GRAPHICS_DISABLED +// Draws the seam in the given window. +void SEAM::Mark(ScrollView* window) const { + for (int s = 0; s < num_splits_; ++s) splits_[s].Mark(window); } +#endif -/********************************************************************** - * break_pieces - * - * Break up the blobs in this chain so that they are all independent. - * This operation should undo the affect of join_pieces. - **********************************************************************/ -void break_pieces(const GenericVector& seams, int first, int last, - TWERD *word) { - for (int x = first; x < last; ++x) - reveal_seam(seams[x]); +// Break up the blobs in this chain so that they are all independent. +// This operation should undo the affect of join_pieces. 
+/* static */ +void SEAM::BreakPieces(const GenericVector& seams, + const GenericVector& blobs, int first, + int last) { + for (int x = first; x < last; ++x) seams[x]->Reveal(); - TESSLINE *outline = word->blobs[first]->outlines; + TESSLINE* outline = blobs[first]->outlines; int next_blob = first + 1; while (outline != NULL && next_blob <= last) { - if (outline->next == word->blobs[next_blob]->outlines) { + if (outline->next == blobs[next_blob]->outlines) { outline->next = NULL; - outline = word->blobs[next_blob]->outlines; + outline = blobs[next_blob]->outlines; ++next_blob; } else { outline = outline->next; @@ -403,131 +210,71 @@ void break_pieces(const GenericVector& seams, int first, int last, } } - -/********************************************************************** - * join_pieces - * - * Join a group of base level pieces into a single blob that can then - * be classified. - **********************************************************************/ -void join_pieces(const GenericVector& seams, int first, int last, - TWERD *word) { - TESSLINE *outline = word->blobs[first]->outlines; +// Join a group of base level pieces into a single blob that can then +// be classified. +/* static */ +void SEAM::JoinPieces(const GenericVector& seams, + const GenericVector& blobs, int first, int last) { + TESSLINE* outline = blobs[first]->outlines; if (!outline) return; for (int x = first; x < last; ++x) { SEAM *seam = seams[x]; - if (x - seam->widthn >= first && x + seam->widthp < last) - hide_seam(seam); - while (outline->next) - outline = outline->next; - outline->next = word->blobs[x + 1]->outlines; + if (x - seam->widthn_ >= first && x + seam->widthp_ < last) seam->Hide(); + while (outline->next) outline = outline->next; + outline->next = blobs[x + 1]->outlines; } } - -/********************************************************************** - * hide_seam - * - * Change the edge points that are referenced by this seam to make - * them hidden edges. 
- **********************************************************************/ -void hide_seam(SEAM *seam) { - if (seam == NULL || seam->split1 == NULL) - return; - hide_edge_pair (seam->split1->point1, seam->split1->point2); - - if (seam->split2 == NULL) - return; - hide_edge_pair (seam->split2->point1, seam->split2->point2); - - if (seam->split3 == NULL) - return; - hide_edge_pair (seam->split3->point1, seam->split3->point2); +// Hides the seam so the outlines appear not to be cut by it. +void SEAM::Hide() const { + for (int s = 0; s < num_splits_; ++s) { + splits_[s].Hide(); + } } - -/********************************************************************** - * hide_edge_pair - * - * Change the edge points that are referenced by this seam to make - * them hidden edges. - **********************************************************************/ -void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2) { - EDGEPT *edgept; - - edgept = pt1; - do { - edgept->Hide(); - edgept = edgept->next; - } - while (!exact_point (edgept, pt2) && edgept != pt1); - if (edgept == pt1) { - /* tprintf("Hid entire outline at (%d,%d)!!\n", - edgept->pos.x,edgept->pos.y); */ - } - edgept = pt2; - do { - edgept->Hide(); - edgept = edgept->next; - } - while (!exact_point (edgept, pt1) && edgept != pt2); - if (edgept == pt2) { - /* tprintf("Hid entire outline at (%d,%d)!!\n", - edgept->pos.x,edgept->pos.y); */ +// Undoes hide, so the outlines are cut by the seam. +void SEAM::Reveal() const { + for (int s = 0; s < num_splits_; ++s) { + splits_[s].Reveal(); } } - -/********************************************************************** - * reveal_seam - * - * Change the edge points that are referenced by this seam to make - * them hidden edges. 
- **********************************************************************/ -void reveal_seam(SEAM *seam) { - if (seam == NULL || seam->split1 == NULL) - return; - reveal_edge_pair (seam->split1->point1, seam->split1->point2); - - if (seam->split2 == NULL) - return; - reveal_edge_pair (seam->split2->point1, seam->split2->point2); - - if (seam->split3 == NULL) - return; - reveal_edge_pair (seam->split3->point1, seam->split3->point2); +// Computes and returns, but does not set, the full priority of *this SEAM. +float SEAM::FullPriority(int xmin, int xmax, double overlap_knob, + int centered_maxwidth, double center_knob, + double width_change_knob) const { + if (num_splits_ == 0) return 0.0f; + for (int s = 1; s < num_splits_; ++s) { + splits_[s].SplitOutline(); + } + float full_priority = + priority_ + + splits_[0].FullPriority(xmin, xmax, overlap_knob, centered_maxwidth, + center_knob, width_change_knob); + for (int s = num_splits_ - 1; s >= 1; --s) { + splits_[s].UnsplitOutlines(); + } + return full_priority; } - -/********************************************************************** - * reveal_edge_pair +/** + * @name start_seam_list * - * Change the edge points that are referenced by this seam to make - * them hidden edges. - **********************************************************************/ -void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2) { - EDGEPT *edgept; + * Initialize a list of seams that match the original number of blobs + * present in the starting segmentation. Each of the seams created + * by this routine have location information only. 
+ */ +void start_seam_list(TWERD* word, GenericVector* seam_array) { + seam_array->truncate(0); + TPOINT location; - edgept = pt1; - do { - edgept->Reveal(); - edgept = edgept->next; - } - while (!exact_point (edgept, pt2) && edgept != pt1); - if (edgept == pt1) { - /* tprintf("Hid entire outline at (%d,%d)!!\n", - edgept->pos.x,edgept->pos.y); */ - } - edgept = pt2; - do { - edgept->Reveal(); - edgept = edgept->next; - } - while (!exact_point (edgept, pt1) && edgept != pt2); - if (edgept == pt2) { - /* tprintf("Hid entire outline at (%d,%d)!!\n", - edgept->pos.x,edgept->pos.y); */ + for (int b = 1; b < word->NumBlobs(); ++b) { + TBOX bbox = word->blobs[b - 1]->bounding_box(); + TBOX nbox = word->blobs[b]->bounding_box(); + location.x = (bbox.right() + nbox.left()) / 2; + location.y = (bbox.bottom() + bbox.top() + nbox.bottom() + nbox.top()) / 4; + seam_array->push_back(new SEAM(0.0f, location)); } } diff --git a/ccstruct/seam.h b/ccstruct/seam.h index 23b7bc71f5..9ae63148ed 100644 --- a/ccstruct/seam.h +++ b/ccstruct/seam.h @@ -36,95 +36,163 @@ ----------------------------------------------------------------------*/ typedef float PRIORITY; /* PRIORITY */ -struct SEAM { - // Constructor that was formerly new_seam. - SEAM(PRIORITY priority0, const TPOINT& location0, - SPLIT *splita, SPLIT *splitb, SPLIT *splitc) - : priority(priority0), widthp(0), widthn(0), location(location0), - split1(splita), split2(splitb), split3(splitc) {} - // Copy constructor that was formerly clone_seam. - SEAM(const SEAM& src) - : priority(src.priority), widthp(src.widthp), widthn(src.widthn), - location(src.location) { - clone_split(split1, src.split1); - clone_split(split2, src.split2); - clone_split(split3, src.split3); +class SEAM { + public: + // A seam with no splits + SEAM(float priority, const TPOINT& location) + : priority_(priority), + location_(location), + widthp_(0), + widthn_(0), + num_splits_(0) {} + // A seam with a single split point. 
+ SEAM(float priority, const TPOINT& location, const SPLIT& split) + : priority_(priority), + location_(location), + widthp_(0), + widthn_(0), + num_splits_(1) { + splits_[0] = split; } - // Destructor was delete_seam. - ~SEAM() { - if (split1) - delete_split(split1); - if (split2) - delete_split(split2); - if (split3) - delete_split(split3); + // Default copy constructor, operator= and destructor are OK! + + // Accessors. + float priority() const { return priority_; } + void set_priority(float priority) { priority_ = priority; } + bool HasAnySplits() const { return num_splits_ > 0; } + + // Returns the bounding box of all the points in the seam. + TBOX bounding_box() const; + + // Returns true if other can be combined into *this. + bool CombineableWith(const SEAM& other, int max_x_dist, + float max_total_priority) const; + // Combines other into *this. Only works if CombinableWith returned true. + void CombineWith(const SEAM& other); + + // Returns true if the given blob contains all splits of *this SEAM. + bool ContainedByBlob(const TBLOB& blob) const { + for (int s = 0; s < num_splits_; ++s) { + if (!splits_[s].ContainedByBlob(blob)) return false; + } + return true; } - PRIORITY priority; - inT8 widthp; - inT8 widthn; - TPOINT location; - SPLIT *split1; - SPLIT *split2; - SPLIT *split3; -}; + // Returns true if the given EDGEPT is used by this SEAM, checking only + // the EDGEPT pointer, not the coordinates. + bool UsesPoint(const EDGEPT* point) const { + for (int s = 0; s < num_splits_; ++s) { + if (splits_[s].UsesPoint(point)) return true; + } + return false; + } + // Returns true if *this and other share any common point, by coordinates. + bool SharesPosition(const SEAM& other) const { + for (int s = 0; s < num_splits_; ++s) { + for (int t = 0; t < other.num_splits_; ++t) + if (splits_[s].SharesPosition(other.splits_[t])) return true; + } + return false; + } + // Returns true if *this and other have any vertically overlapping splits. 
+ bool OverlappingSplits(const SEAM& other) const { + for (int s = 0; s < num_splits_; ++s) { + TBOX split1_box = splits_[s].bounding_box(); + for (int t = 0; t < other.num_splits_; ++t) { + TBOX split2_box = other.splits_[t].bounding_box(); + if (split1_box.y_overlap(split2_box)) return true; + } + } + return false; + } -/** - * exact_point - * - * Return TRUE if the point positions are the exactly the same. The - * parameters must be of type (EDGEPT*). - */ + // Marks the edgepts used by the seam so the segments made by the cut + // never get split further by another seam in the future. + void Finalize() { + for (int s = 0; s < num_splits_; ++s) { + splits_[s].point1->MarkChop(); + splits_[s].point2->MarkChop(); + } + } + + // Returns true if the splits in *this SEAM appear OK in the sense that they + // do not cross any outlines and do not chop off any ridiculously small + // pieces. + bool IsHealthy(const TBLOB& blob, int min_points, int min_area) const; + + // Computes the widthp_/widthn_ range for all existing SEAMs and for *this + // seam, which is about to be inserted at insert_index. Returns false if + // any of the computations fails, as this indicates an invalid chop. + // widthn_/widthp_ are only changed if modify is true. + bool PrepareToInsertSeam(const GenericVector& seams, + const GenericVector& blobs, int insert_index, + bool modify); + // Computes the widthp_/widthn_ range. Returns false if not all the splits + // are accounted for. widthn_/widthp_ are only changed if modify is true. + bool FindBlobWidth(const GenericVector& blobs, int index, + bool modify); + + // Splits this blob into two blobs by applying the splits included in + // *this SEAM + void ApplySeam(bool italic_blob, TBLOB* blob, TBLOB* other_blob) const; + // Undoes ApplySeam by removing the seam between these two blobs. + // Produces one blob as a result, and deletes other_blob. + void UndoSeam(TBLOB* blob, TBLOB* other_blob) const; + + // Prints everything in *this SEAM. 
+ void Print(const char* label) const; + // Prints a collection of SEAMs. + static void PrintSeams(const char* label, const GenericVector& seams); +#ifndef GRAPHICS_DISABLED + // Draws the seam in the given window. + void Mark(ScrollView* window) const; +#endif -#define exact_point(p1,p2) \ - (! ((p1->pos.x - p2->pos.x) || (p1->pos.y - p2->pos.y))) + // Break up the blobs in this chain so that they are all independent. + // This operation should undo the affect of join_pieces. + static void BreakPieces(const GenericVector& seams, + const GenericVector& blobs, int first, + int last); + // Join a group of base level pieces into a single blob that can then + // be classified. + static void JoinPieces(const GenericVector& seams, + const GenericVector& blobs, int first, + int last); + + // Hides the seam so the outlines appear not to be cut by it. + void Hide() const; + // Undoes hide, so the outlines are cut by the seam. + void Reveal() const; + + // Computes and returns, but does not set, the full priority of *this SEAM. + // The arguments here are config parameters defined in Wordrec. Add chop_ + // to the beginning of the name. + float FullPriority(int xmin, int xmax, double overlap_knob, + int centered_maxwidth, double center_knob, + double width_change_knob) const; + + private: + // Maximum number of splits that a SEAM can hold. + static const int kMaxNumSplits = 3; + // Priority of this split. Lower is better. + float priority_; + // Position of the middle of the seam. + TPOINT location_; + // A range such that all splits in *this SEAM are contained within blobs in + // the range [index - widthn_,index + widthp_] where index is the index of + // this SEAM in the seams vector. + inT8 widthp_; + inT8 widthn_; + // Number of splits_ that are used. + inT8 num_splits_; + // Set of pairs of points that are the ends of each split in the SEAM. 
+ SPLIT splits_[kMaxNumSplits]; +}; /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ -bool point_in_split(SPLIT *split, EDGEPT *point1, EDGEPT *point2); - -bool point_in_seam(const SEAM *seam, SPLIT *split); - -bool point_used_by_split(SPLIT *split, EDGEPT *point); - -bool point_used_by_seam(SEAM *seam, EDGEPT *point); - -void combine_seams(SEAM *dest_seam, SEAM *source_seam); - -void start_seam_list(TWERD *word, GenericVector* seam_array); - -bool test_insert_seam(const GenericVector& seam_array, - TWERD *word, int index); - -void insert_seam(const TWERD *word, int index, SEAM *seam, - GenericVector* seam_array); - -int account_splits(const SEAM *seam, const TWERD *word, int blob_index, - int blob_direction); - -bool find_split_in_blob(SPLIT *split, TBLOB *blob); - -SEAM *join_two_seams(const SEAM *seam1, const SEAM *seam2); - -void print_seam(const char *label, SEAM *seam); - -void print_seams(const char *label, const GenericVector& seams); - -int shared_split_points(const SEAM *seam1, const SEAM *seam2); - -void break_pieces(const GenericVector& seams, - int first, int last, TWERD *word); - -void join_pieces(const GenericVector& seams, - int first, int last, TWERD *word); - -void hide_seam(SEAM *seam); - -void hide_edge_pair(EDGEPT *pt1, EDGEPT *pt2); - -void reveal_seam(SEAM *seam); -void reveal_edge_pair(EDGEPT *pt1, EDGEPT *pt2); +void start_seam_list(TWERD* word, GenericVector* seam_array); #endif diff --git a/ccstruct/split.cpp b/ccstruct/split.cpp index a2e974ef1c..24650d4f7a 100644 --- a/ccstruct/split.cpp +++ b/ccstruct/split.cpp @@ -36,23 +36,103 @@ /*---------------------------------------------------------------------- V a r i a b l e s ----------------------------------------------------------------------*/ +// Limit on the amount of penalty for the chop being off-center. 
+const int kCenterGradeCap = 25; +// Ridiculously large priority for splits that are no use. +const double kBadPriority = 999.0; + BOOL_VAR(wordrec_display_splits, 0, "Display splits"); -/*---------------------------------------------------------------------- - F u n c t i o n s -----------------------------------------------------------------------*/ +// Returns the bounding box of all the points in the split. +TBOX SPLIT::bounding_box() const { + return TBOX( + MIN(point1->pos.x, point2->pos.x), MIN(point1->pos.y, point2->pos.y), + MAX(point1->pos.x, point2->pos.x), MAX(point1->pos.y, point2->pos.y)); +} -/********************************************************************** - * delete_split - * - * Remove this split from existence. - **********************************************************************/ -void delete_split(SPLIT *split) { - if (split) { - delete split; +// Hides the SPLIT so the outlines appear not to be cut by it. +void SPLIT::Hide() const { + EDGEPT* edgept = point1; + do { + edgept->Hide(); + edgept = edgept->next; + } while (!edgept->EqualPos(*point2) && edgept != point1); + edgept = point2; + do { + edgept->Hide(); + edgept = edgept->next; + } while (!edgept->EqualPos(*point1) && edgept != point2); +} + +// Undoes hide, so the outlines are cut by the SPLIT. +void SPLIT::Reveal() const { + EDGEPT* edgept = point1; + do { + edgept->Reveal(); + edgept = edgept->next; + } while (!edgept->EqualPos(*point2) && edgept != point1); + edgept = point2; + do { + edgept->Reveal(); + edgept = edgept->next; + } while (!edgept->EqualPos(*point1) && edgept != point2); +} + +// Compute a split priority based on the bounding boxes of the parts. +// The arguments here are config parameters defined in Wordrec. Add chop_ +// to the beginning of the name. 
+float SPLIT::FullPriority(int xmin, int xmax, double overlap_knob, + int centered_maxwidth, double center_knob, + double width_change_knob) const { + TBOX box1 = Box12(); + TBOX box2 = Box21(); + int min_left = MIN(box1.left(), box2.left()); + int max_right = MAX(box1.right(), box2.right()); + if (xmin < min_left && xmax > max_right) return kBadPriority; + + float grade = 0.0f; + // grade_overlap. + int width1 = box1.width(); + int width2 = box2.width(); + int min_width = MIN(width1, width2); + int overlap = -box1.x_gap(box2); + if (overlap == min_width) { + grade += 100.0f; // Total overlap. + } else { + if (2 * overlap > min_width) overlap += 2 * overlap - min_width; + if (overlap > 0) grade += overlap_knob * overlap; + } + // grade_center_of_blob. + if (width1 <= centered_maxwidth || width2 <= centered_maxwidth) { + grade += MIN(kCenterGradeCap, center_knob * abs(width1 - width2)); } + // grade_width_change. + float width_change_grade = 20 - (max_right - min_left - MAX(width1, width2)); + if (width_change_grade > 0.0f) + grade += width_change_grade * width_change_knob; + return grade; +} + +// Returns true if *this SPLIT appears OK in the sense that it does not cross +// any outlines and does not chop off any ridiculously small pieces. +bool SPLIT::IsHealthy(const TBLOB& blob, int min_points, int min_area) const { + return !IsLittleChunk(min_points, min_area) && + !blob.SegmentCrossesOutline(point1->pos, point2->pos); } +// Returns true if the split generates a small chunk in terms of either area +// or number of points. 
+bool SPLIT::IsLittleChunk(int min_points, int min_area) const { + if (point1->ShortNonCircularSegment(min_points, point2) && + point1->SegmentArea(point2) < min_area) { + return true; + } + if (point2->ShortNonCircularSegment(min_points, point1) && + point2->SegmentArea(point1) < min_area) { + return true; + } + return false; +} /********************************************************************** * make_edgept @@ -135,102 +215,113 @@ void remove_edgept(EDGEPT *point) { } /********************************************************************** - * new_split + * Print * - * Create a new split record and initialize it. Put it on the display - * list. + * Shows the coordinates of both points in a split. **********************************************************************/ -SPLIT *new_split(EDGEPT *point1, EDGEPT *point2) { - SPLIT *s = new SPLIT; - s->point1 = point1; - s->point2 = point2; - return (s); +void SPLIT::Print() const { + if (this != NULL) { + tprintf("(%d,%d)--(%d,%d)", point1->pos.x, point1->pos.y, point2->pos.x, + point2->pos.y); + } } - -/********************************************************************** - * print_split - * - * Print a list of splits. Show the coordinates of both points in - * each split. - **********************************************************************/ -void print_split(SPLIT *split) { - if (split) { - tprintf("(%d,%d)--(%d,%d)", - split->point1->pos.x, split->point1->pos.y, - split->point2->pos.x, split->point2->pos.y); - } +#ifndef GRAPHICS_DISABLED +// Draws the split in the given window. +void SPLIT::Mark(ScrollView* window) const { + window->Pen(ScrollView::GREEN); + window->Line(point1->pos.x, point1->pos.y, point2->pos.x, point2->pos.y); + window->UpdateWindow(); } +#endif +// Creates two outlines out of one by splitting the original one in half. +// Inserts the resulting outlines into the given list. 
+void SPLIT::SplitOutlineList(TESSLINE* outlines) const { + SplitOutline(); + while (outlines->next != NULL) outlines = outlines->next; -/********************************************************************** - * split_outline - * - * Split between these two edge points. - **********************************************************************/ -void split_outline(EDGEPT *join_point1, EDGEPT *join_point2) { - assert(join_point1 != join_point2); + outlines->next = new TESSLINE; + outlines->next->loop = point1; + outlines->next->ComputeBoundingBox(); + + outlines = outlines->next; + + outlines->next = new TESSLINE; + outlines->next->loop = point2; + outlines->next->ComputeBoundingBox(); - EDGEPT* temp2 = join_point2->next; - EDGEPT* temp1 = join_point1->next; + outlines->next->next = NULL; +} + +// Makes a split between these two edge points, but does not affect the +// outlines to which they belong. +void SPLIT::SplitOutline() const { + EDGEPT* temp2 = point2->next; + EDGEPT* temp1 = point1->next; /* Create two new points */ - EDGEPT* new_point1 = make_edgept(join_point1->pos.x, join_point1->pos.y, - temp1, join_point2); - EDGEPT* new_point2 = make_edgept(join_point2->pos.x, join_point2->pos.y, - temp2, join_point1); - // Join_point1 and 2 are now cross-over points, so they must have NULL + EDGEPT* new_point1 = make_edgept(point1->pos.x, point1->pos.y, temp1, point2); + EDGEPT* new_point2 = make_edgept(point2->pos.x, point2->pos.y, temp2, point1); + // point1 and 2 are now cross-over points, so they must have NULL // src_outlines and give their src_outline information their new // replacements. 
- new_point1->src_outline = join_point1->src_outline; - new_point1->start_step = join_point1->start_step; - new_point1->step_count = join_point1->step_count; - new_point2->src_outline = join_point2->src_outline; - new_point2->start_step = join_point2->start_step; - new_point2->step_count = join_point2->step_count; - join_point1->src_outline = NULL; - join_point1->start_step = 0; - join_point1->step_count = 0; - join_point2->src_outline = NULL; - join_point2->start_step = 0; - join_point2->step_count = 0; - join_point1->MarkChop(); - join_point2->MarkChop(); + new_point1->src_outline = point1->src_outline; + new_point1->start_step = point1->start_step; + new_point1->step_count = point1->step_count; + new_point2->src_outline = point2->src_outline; + new_point2->start_step = point2->start_step; + new_point2->step_count = point2->step_count; + point1->src_outline = NULL; + point1->start_step = 0; + point1->step_count = 0; + point2->src_outline = NULL; + point2->start_step = 0; + point2->step_count = 0; } +// Undoes the effect of SplitOutlineList, correcting the outlines for undoing +// the split, but possibly leaving some duplicate outlines. +void SPLIT::UnsplitOutlineList(TBLOB* blob) const { + /* Modify edge points */ + UnsplitOutlines(); -/********************************************************************** - * unsplit_outlines - * - * Remove the split that was put between these two points. - **********************************************************************/ -void unsplit_outlines(EDGEPT *p1, EDGEPT *p2) { - EDGEPT *tmp1 = p1->next; - EDGEPT *tmp2 = p2->next; - - assert (p1 != p2); - - tmp1->next->prev = p2; - tmp2->next->prev = p1; - - // tmp2 is coincident with p1. p1 takes tmp2's place as tmp2 is deleted. - p1->next = tmp2->next; - p1->src_outline = tmp2->src_outline; - p1->start_step = tmp2->start_step; - p1->step_count = tmp2->step_count; - // Likewise p2 takes tmp1's place. 
- p2->next = tmp1->next; - p2->src_outline = tmp1->src_outline; - p2->start_step = tmp1->start_step; - p2->step_count = tmp1->step_count; - p1->UnmarkChop(); - p2->UnmarkChop(); + TESSLINE* outline1 = new TESSLINE; + outline1->next = blob->outlines; + blob->outlines = outline1; + outline1->loop = point1; + + TESSLINE* outline2 = new TESSLINE; + outline2->next = blob->outlines; + blob->outlines = outline2; + outline2->loop = point2; +} + +// Removes the split that was put between these two points. +void SPLIT::UnsplitOutlines() const { + EDGEPT* tmp1 = point1->next; + EDGEPT* tmp2 = point2->next; + + tmp1->next->prev = point2; + tmp2->next->prev = point1; + + // tmp2 is coincident with point1. point1 takes tmp2's place as tmp2 is + // deleted. + point1->next = tmp2->next; + point1->src_outline = tmp2->src_outline; + point1->start_step = tmp2->start_step; + point1->step_count = tmp2->step_count; + // Likewise point2 takes tmp1's place. + point2->next = tmp1->next; + point2->src_outline = tmp1->src_outline; + point2->start_step = tmp1->start_step; + point2->step_count = tmp1->step_count; delete tmp1; delete tmp2; - p1->vec.x = p1->next->pos.x - p1->pos.x; - p1->vec.y = p1->next->pos.y - p1->pos.y; + point1->vec.x = point1->next->pos.x - point1->pos.x; + point1->vec.y = point1->next->pos.y - point1->pos.y; - p2->vec.x = p2->next->pos.x - p2->pos.x; - p2->vec.y = p2->next->pos.y - p2->pos.y; + point2->vec.x = point2->next->pos.x - point2->pos.x; + point2->vec.y = point2->next->pos.y - point2->pos.y; } diff --git a/ccstruct/split.h b/ccstruct/split.h index 7291b4cf82..2642474860 100644 --- a/ccstruct/split.h +++ b/ccstruct/split.h @@ -29,18 +29,80 @@ I n c l u d e s ----------------------------------------------------------------------*/ #include "blobs.h" -#include "oldlist.h" +#include "scrollview.h" /*---------------------------------------------------------------------- T y p e s ----------------------------------------------------------------------*/ -typedef struct 
split_record -{ /* SPLIT */ +struct SPLIT { + SPLIT() : point1(NULL), point2(NULL) {} + SPLIT(EDGEPT* pt1, EDGEPT* pt2) : point1(pt1), point2(pt2) {} + + // Returns the bounding box of all the points in the split. + TBOX bounding_box() const; + // Returns the bounding box of the outline from point1 to point2. + TBOX Box12() const { return point1->SegmentBox(point2); } + // Returns the bounding box of the outline from point1 to point1. + TBOX Box21() const { return point2->SegmentBox(point1); } + // Returns the bounding box of the out + + // Hides the SPLIT so the outlines appear not to be cut by it. + void Hide() const; + // Undoes hide, so the outlines are cut by the SPLIT. + void Reveal() const; + + // Returns true if the given EDGEPT is used by this SPLIT, checking only + // the EDGEPT pointer, not the coordinates. + bool UsesPoint(const EDGEPT* point) const { + return point1 == point || point2 == point; + } + // Returns true if the other SPLIT has any position shared with *this. + bool SharesPosition(const SPLIT& other) const { + return point1->EqualPos(*other.point1) || point1->EqualPos(*other.point2) || + point2->EqualPos(*other.point1) || point2->EqualPos(*other.point2); + } + // Returns true if both points are contained within the blob. + bool ContainedByBlob(const TBLOB& blob) const { + return blob.Contains(point1->pos) && blob.Contains(point2->pos); + } + // Returns true if both points are contained within the outline. + bool ContainedByOutline(const TESSLINE& outline) const { + return outline.Contains(point1->pos) && outline.Contains(point2->pos); + } + // Compute a split priority based on the bounding boxes of the parts. + // The arguments here are config parameters defined in Wordrec. Add chop_ + // to the beginning of the name. 
+ float FullPriority(int xmin, int xmax, double overlap_knob, + int centered_maxwidth, double center_knob, + double width_change_knob) const; + // Returns true if *this SPLIT appears OK in the sense that it does not cross + // any outlines and does not chop off any ridiculously small pieces. + bool IsHealthy(const TBLOB& blob, int min_points, int min_area) const; + // Returns true if the split generates a small chunk in terms of either area + // or number of points. + bool IsLittleChunk(int min_points, int min_area) const; + + void Print() const; +#ifndef GRAPHICS_DISABLED + // Draws the split in the given window. + void Mark(ScrollView* window) const; +#endif + + // Creates two outlines out of one by splitting the original one in half. + // Inserts the resulting outlines into the given list. + void SplitOutlineList(TESSLINE* outlines) const; + // Makes a split between these two edge points, but does not affect the + // outlines to which they belong. + void SplitOutline() const; + // Undoes the effect of SplitOutlineList, correcting the outlines for undoing + // the split, but possibly leaving some duplicate outlines. + void UnsplitOutlineList(TBLOB* blob) const; + // Removes the split that was put between these two points. + void UnsplitOutlines() const; + EDGEPT *point1; EDGEPT *point2; -} SPLIT; - -typedef LIST SPLITS; /* SPLITS */ +}; /*---------------------------------------------------------------------- V a r i a b l e s @@ -48,38 +110,11 @@ typedef LIST SPLITS; /* SPLITS */ extern BOOL_VAR_H(wordrec_display_splits, 0, "Display splits"); -/*---------------------------------------------------------------------- - M a c r o s -----------------------------------------------------------------------*/ -/********************************************************************** - * clone_split - * - * Create a new split record and set the contents equal to the contents - * of this record. 
- **********************************************************************/ - -#define clone_split(dest,source) \ -if (source) \ - (dest) = new_split ((source)->point1, (source)->point2); \ -else \ - (dest) = (SPLIT*) NULL \ - - /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ -void delete_split(SPLIT *split); - EDGEPT *make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev); void remove_edgept(EDGEPT *point); -SPLIT *new_split(EDGEPT *point1, EDGEPT *point2); - -void print_split(SPLIT *split); - -void split_outline(EDGEPT *join_point1, EDGEPT *join_point2); - -void unsplit_outlines(EDGEPT *p1, EDGEPT *p2); - #endif diff --git a/ccstruct/vecfuncs.cpp b/ccstruct/vecfuncs.cpp index 3f8251738c..8357c9aabe 100644 --- a/ccstruct/vecfuncs.cpp +++ b/ccstruct/vecfuncs.cpp @@ -30,6 +30,7 @@ I n c l u d e s ----------------------------------------------------------------------*/ #include "vecfuncs.h" +#include "blobs.h" /*---------------------------------------------------------------------- F u n c t i o n s diff --git a/ccstruct/vecfuncs.h b/ccstruct/vecfuncs.h index 91bbb08810..55cf310874 100644 --- a/ccstruct/vecfuncs.h +++ b/ccstruct/vecfuncs.h @@ -26,7 +26,6 @@ #define VECFUNCS_H #include -#include "blobs.h" struct EDGEPT; diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 0eaf144000..7bbc84719d 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -359,8 +359,8 @@ void Classify::LearnPieces(const char* filename, int start, int length, return; if (length > 1) { - join_pieces(word->seam_array, start, start + length - 1, - word->chopped_word); + SEAM::JoinPieces(word->seam_array, word->chopped_word->blobs, start, + start + length - 1); } TBLOB* blob = word->chopped_word->blobs[start]; // Rotate the blob if needed for classification. 
@@ -413,7 +413,8 @@ void Classify::LearnPieces(const char* filename, int start, int length, delete rotated_blob; } - break_pieces(word->seam_array, start, start + length - 1, word->chopped_word); + SEAM::BreakPieces(word->seam_array, word->chopped_word->blobs, start, + start + length - 1); } // LearnPieces. /*---------------------------------------------------------------------------*/ diff --git a/wordrec/chop.cpp b/wordrec/chop.cpp index 9ae61bb932..c731005260 100644 --- a/wordrec/chop.cpp +++ b/wordrec/chop.cpp @@ -29,7 +29,6 @@ #include "chop.h" #include "outlines.h" -#include "olutil.h" #include "callcpp.h" #include "plotedges.h" #include "const.h" @@ -74,6 +73,11 @@ void Wordrec::add_point_to_list(PointHeap* point_heap, EDGEPT *point) { #endif } +// Returns true if the edgept supplied as input is an inside angle. This +// is determined by the angular change of the vectors from point to point. +bool Wordrec::is_inside_angle(EDGEPT *pt) { + return angle_change(pt->prev, pt, pt->next) < chop_inside_angle; +} /** * @name angle_change @@ -111,65 +115,6 @@ int Wordrec::angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) { return (angle); } -/** - * @name is_little_chunk - * - * Return TRUE if one of the pieces resulting from this split would - * less than some number of edge points. 
- */ -int Wordrec::is_little_chunk(EDGEPT *point1, EDGEPT *point2) { - EDGEPT *p = point1; /* Iterator */ - int counter = 0; - - do { - /* Go from P1 to P2 */ - if (is_same_edgept (point2, p)) { - if (is_small_area (point1, point2)) - return (TRUE); - else - break; - } - p = p->next; - } - while ((p != point1) && (counter++ < chop_min_outline_points)); - /* Go from P2 to P1 */ - p = point2; - counter = 0; - do { - if (is_same_edgept (point1, p)) { - return (is_small_area (point2, point1)); - } - p = p->next; - } - while ((p != point2) && (counter++ < chop_min_outline_points)); - - return (FALSE); -} - - -/** - * @name is_small_area - * - * Test the area defined by a split accross this outline. - */ -int Wordrec::is_small_area(EDGEPT *point1, EDGEPT *point2) { - EDGEPT *p = point1->next; /* Iterator */ - int area = 0; - TPOINT origin; - - do { - /* Go from P1 to P2 */ - origin.x = p->pos.x - point1->pos.x; - origin.y = p->pos.y - point1->pos.y; - area += CROSS (origin, p->vec); - p = p->next; - } - while (!is_same_edgept (point2, p)); - - return (area < chop_min_outline_area); -} - - /** * @name pick_close_point * diff --git a/wordrec/chopper.cpp b/wordrec/chopper.cpp index cf39ceb6fe..c1a57fcd27 100644 --- a/wordrec/chopper.cpp +++ b/wordrec/chopper.cpp @@ -39,7 +39,6 @@ #include "findseam.h" #include "freelist.h" #include "globals.h" -#include "makechop.h" #include "render.h" #include "pageres.h" #include "seam.h" @@ -135,18 +134,14 @@ void restore_outline_tree(TESSLINE *srcline) { static SEAM* CheckSeam(int debug_level, inT32 blob_number, TWERD* word, TBLOB* blob, TBLOB* other_blob, const GenericVector& seams, SEAM* seam) { - if (seam == NULL || - blob->outlines == NULL || - other_blob->outlines == NULL || - total_containment(blob, other_blob) || - check_blob(other_blob) || - !(check_seam_order(blob, seam) && - check_seam_order(other_blob, seam)) || + if (seam == NULL || blob->outlines == NULL || other_blob->outlines == NULL || + total_containment(blob, 
other_blob) || check_blob(other_blob) || + !seam->ContainedByBlob(*blob) || !seam->ContainedByBlob(*other_blob) || any_shared_split_points(seams, seam) || - !test_insert_seam(seams, word, blob_number)) { + !seam->PrepareToInsertSeam(seams, word->blobs, blob_number, false)) { word->blobs.remove(blob_number + 1); if (seam) { - undo_seam(blob, other_blob, seam); + seam->UndoSeam(blob, other_blob); delete seam; seam = NULL; #ifndef GRAPHICS_DISABLED @@ -185,19 +180,19 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, if (prioritize_division) { TPOINT location; if (divisible_blob(blob, italic_blob, &location)) { - seam = new SEAM(0.0f, location, NULL, NULL, NULL); + seam = new SEAM(0.0f, location); } } if (seam == NULL) seam = pick_good_seam(blob); if (chop_debug) { if (seam != NULL) - print_seam("Good seam picked=", seam); + seam->Print("Good seam picked="); else tprintf("\n** no seam picked *** \n"); } if (seam) { - apply_seam(blob, other_blob, italic_blob, seam); + seam->ApplySeam(italic_blob, blob, other_blob); } seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, @@ -211,13 +206,17 @@ SEAM *Wordrec::attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, if (divisible_blob(blob, italic_blob, &location)) { other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */ word->blobs.insert(other_blob, blob_number + 1); - seam = new SEAM(0.0f, location, NULL, NULL, NULL); - apply_seam(blob, other_blob, italic_blob, seam); + seam = new SEAM(0.0f, location); + seam->ApplySeam(italic_blob, blob, other_blob); seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob, seams, seam); } } } + if (seam != NULL) { + // Make sure this seam doesn't get chopped again. 
+ seam->Finalize(); + } return seam; } @@ -286,8 +285,7 @@ int any_shared_split_points(const GenericVector& seams, SEAM *seam) { length = seams.size(); for (index = 0; index < length; index++) - if (shared_split_points(seams[index], seam)) - return TRUE; + if (seam->SharesPosition(*seams[index])) return TRUE; return FALSE; } @@ -384,50 +382,6 @@ SEAM* Wordrec::chop_one_blob(const GenericVector& boxes, blob_number); } } -} // namespace tesseract - -/** - * @name check_seam_order - * - * Make sure that each of the splits in this seam match to outlines - * in this blob. If any of the splits could not correspond to this - * blob then there is a problem (and FALSE should be returned to the - * caller). - */ -inT16 check_seam_order(TBLOB *blob, SEAM *seam) { - TESSLINE *outline; - inT8 found_em[3]; - - if (seam->split1 == NULL || blob == NULL) - return (TRUE); - - found_em[0] = found_em[1] = found_em[2] = FALSE; - - for (outline = blob->outlines; outline; outline = outline->next) { - if (!found_em[0] && - ((seam->split1 == NULL) || - is_split_outline (outline, seam->split1))) { - found_em[0] = TRUE; - } - if (!found_em[1] && - ((seam->split2 == NULL) || - is_split_outline (outline, seam->split2))) { - found_em[1] = TRUE; - } - if (!found_em[2] && - ((seam->split3 == NULL) || - is_split_outline (outline, seam->split3))) { - found_em[2] = TRUE; - } - } - - if (!found_em[0] || !found_em[1] || !found_em[2]) - return (FALSE); - else - return (TRUE); -} - -namespace tesseract { /** * @name chop_word_main diff --git a/wordrec/chopper.h b/wordrec/chopper.h index 7955a51f1a..4bfbf653af 100644 --- a/wordrec/chopper.h +++ b/wordrec/chopper.h @@ -44,7 +44,5 @@ int any_shared_split_points(const GenericVector& seams, SEAM *seam); int check_blob(TBLOB *blob); -inT16 check_seam_order(TBLOB *blob, SEAM *seam); - inT16 total_containment(TBLOB *blob1, TBLOB *blob2); #endif diff --git a/wordrec/findseam.cpp b/wordrec/findseam.cpp index 786393c510..dd2de6e699 100644 --- 
a/wordrec/findseam.cpp +++ b/wordrec/findseam.cpp @@ -27,7 +27,6 @@ ----------------------------------------------------------------------*/ #include "findseam.h" #include "gradechop.h" -#include "olutil.h" #include "plotedges.h" #include "outlines.h" #include "freelist.h" @@ -67,7 +66,7 @@ void Wordrec::add_seam_to_queue(float new_priority, SEAM *new_seam, if (new_seam == NULL) return; if (chop_debug) { tprintf("Pushing new seam with priority %g :", new_priority); - print_seam("seam: ", new_seam); + new_seam->Print("seam: "); } if (seams->size() >= MAX_NUM_SEAMS) { SeamPair old_pair(0, NULL); @@ -101,12 +100,9 @@ void Wordrec::add_seam_to_queue(float new_priority, SEAM *new_seam, * a split of NULL, then no further splits can be supplied by the * caller. **********************************************************************/ -void Wordrec::choose_best_seam(SeamQueue* seam_queue, - SPLIT *split, - PRIORITY priority, - SEAM **seam_result, - TBLOB *blob, - SeamPile* seam_pile) { +void Wordrec::choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, + PRIORITY priority, SEAM **seam_result, + TBLOB *blob, SeamPile *seam_pile) { SEAM *seam; char str[80]; float my_priority; @@ -116,9 +112,8 @@ void Wordrec::choose_best_seam(SeamQueue* seam_queue, TPOINT split_point = split->point1->pos; split_point += split->point2->pos; split_point /= 2; - seam = new SEAM(my_priority, split_point, split, NULL, NULL); - if (chop_debug > 1) - print_seam ("Partial priority ", seam); + seam = new SEAM(my_priority, split_point, *split); + if (chop_debug > 1) seam->Print("Partial priority "); add_seam_to_queue(my_priority, seam, seam_queue); if (my_priority > chop_good_split) @@ -132,19 +127,22 @@ void Wordrec::choose_best_seam(SeamQueue* seam_queue, seam_queue->Pop(&seam_pair); seam = seam_pair.extract_data(); /* Set full priority */ - my_priority = seam_priority(seam, bbox.left(), bbox.right()); + my_priority = seam->FullPriority(bbox.left(), bbox.right(), + chop_overlap_knob, 
chop_centered_maxwidth, + chop_center_knob, chop_width_change_knob); if (chop_debug) { sprintf (str, "Full my_priority %0.0f, ", my_priority); - print_seam(str, seam); + seam->Print(str); } - if ((*seam_result == NULL || (*seam_result)->priority > my_priority) && + if ((*seam_result == NULL || (*seam_result)->priority() > my_priority) && my_priority < chop_ok_split) { /* No crossing */ - if (constrained_split(seam->split1, blob)) { + if (seam->IsHealthy(*blob, chop_min_outline_points, + chop_min_outline_area)) { delete *seam_result; *seam_result = new SEAM(*seam); - (*seam_result)->priority = my_priority; + (*seam_result)->set_priority(my_priority); } else { delete seam; seam = NULL; @@ -198,104 +196,17 @@ void Wordrec::choose_best_seam(SeamQueue* seam_queue, **********************************************************************/ void Wordrec::combine_seam(const SeamPile& seam_pile, const SEAM* seam, SeamQueue* seam_queue) { - register inT16 dist; - inT16 bottom1, top1; - inT16 bottom2, top2; - - SEAM *new_one; - const SEAM *this_one; - - bottom1 = seam->split1->point1->pos.y; - if (seam->split1->point2->pos.y >= bottom1) - top1 = seam->split1->point2->pos.y; - else { - top1 = bottom1; - bottom1 = seam->split1->point2->pos.y; - } - if (seam->split2 != NULL) { - bottom2 = seam->split2->point1->pos.y; - if (seam->split2->point2->pos.y >= bottom2) - top2 = seam->split2->point2->pos.y; - else { - top2 = bottom2; - bottom2 = seam->split2->point2->pos.y; - } - } - else { - bottom2 = bottom1; - top2 = top1; - } for (int x = 0; x < seam_pile.size(); ++x) { - this_one = seam_pile.get(x).data(); - dist = seam->location.x - this_one->location.x; - if (-SPLIT_CLOSENESS < dist && - dist < SPLIT_CLOSENESS && - seam->priority + this_one->priority < chop_ok_split) { - inT16 split1_point1_y = this_one->split1->point1->pos.y; - inT16 split1_point2_y = this_one->split1->point2->pos.y; - inT16 split2_point1_y = 0; - inT16 split2_point2_y = 0; - if (this_one->split2) { - 
split2_point1_y = this_one->split2->point1->pos.y; - split2_point2_y = this_one->split2->point2->pos.y; - } - if ( - /*!tessedit_fix_sideways_chops || */ - ( - /* this_one->split1 always exists */ - ( - ((split1_point1_y >= top1 && split1_point2_y >= top1) || - (split1_point1_y <= bottom1 && split1_point2_y <= bottom1)) - && - ((split1_point1_y >= top2 && split1_point2_y >= top2) || - (split1_point1_y <= bottom2 && split1_point2_y <= bottom2)) - ) - ) - && - ( - this_one->split2 == NULL || - ( - ((split2_point1_y >= top1 && split2_point2_y >= top1) || - (split2_point1_y <= bottom1 && split2_point2_y <= bottom1)) - && - ((split2_point1_y >= top2 && split2_point2_y >= top2) || - (split2_point1_y <= bottom2 && split2_point2_y <= bottom2)) - ) - ) - ) { - new_one = join_two_seams (seam, this_one); - if (new_one != NULL) { - if (chop_debug > 1) - print_seam ("Combo priority ", new_one); - add_seam_to_queue(new_one->priority, new_one, seam_queue); - } - } + const SEAM *this_one = seam_pile.get(x).data(); + if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) { + SEAM *new_one = new SEAM(*seam); + new_one->CombineWith(*this_one); + if (chop_debug > 1) new_one->Print("Combo priority "); + add_seam_to_queue(new_one->priority(), new_one, seam_queue); } } } - -/********************************************************************** - * constrained_split - * - * Constrain this split to obey certain rules. It must not cross any - * inner outline. It must not cut off a small chunk of the outline. 
- **********************************************************************/ -inT16 Wordrec::constrained_split(SPLIT *split, TBLOB *blob) { - TESSLINE *outline; - - if (is_little_chunk (split->point1, split->point2)) - return (FALSE); - - for (outline = blob->outlines; outline; outline = outline->next) { - if (split_bounds_overlap (split, outline) && - crosses_outline (split->point1, split->point2, outline->loop)) { - return (FALSE); - } - } - return (TRUE); -} - /********************************************************************** * pick_good_seam * @@ -335,16 +246,15 @@ SEAM *Wordrec::pick_good_seam(TBLOB *blob) { if (seam == NULL) { choose_best_seam(&seam_queue, NULL, BAD_PRIORITY, &seam, blob, &seam_pile); - } - else if (seam->priority > chop_good_split) { - choose_best_seam(&seam_queue, NULL, seam->priority, - &seam, blob, &seam_pile); + } else if (seam->priority() > chop_good_split) { + choose_best_seam(&seam_queue, NULL, seam->priority(), &seam, blob, + &seam_pile); } EDGEPT_C_IT it(&new_points); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { EDGEPT *inserted_point = it.data(); - if (!point_used_by_seam(seam, inserted_point)) { + if (seam == NULL || !seam->UsesPoint(inserted_point)) { for (outline = blob->outlines; outline; outline = outline->next) { if (outline->loop == inserted_point) { outline->loop = outline->loop->next; @@ -355,18 +265,13 @@ SEAM *Wordrec::pick_good_seam(TBLOB *blob) { } if (seam) { - if (seam->priority > chop_ok_split) { + if (seam->priority() > chop_ok_split) { delete seam; seam = NULL; } #ifndef GRAPHICS_DISABLED else if (wordrec_display_splits) { - if (seam->split1) - mark_split (seam->split1); - if (seam->split2) - mark_split (seam->split2); - if (seam->split3) - mark_split (seam->split3); + seam->Mark(edge_window); if (chop_debug > 2) { update_edge_window(); edge_window_wait(); @@ -382,42 +287,6 @@ SEAM *Wordrec::pick_good_seam(TBLOB *blob) { } -/********************************************************************** - 
* seam_priority - * - * Assign a full priority value to the seam. - **********************************************************************/ -PRIORITY Wordrec::seam_priority(SEAM *seam, inT16 xmin, inT16 xmax) { - PRIORITY priority; - - if (seam->split1 == NULL) - priority = 0; - - else if (seam->split2 == NULL) { - priority = (seam->priority + - full_split_priority (seam->split1, xmin, xmax)); - } - - else if (seam->split3 == NULL) { - split_outline (seam->split2->point1, seam->split2->point2); - priority = (seam->priority + - full_split_priority (seam->split1, xmin, xmax)); - unsplit_outlines (seam->split2->point1, seam->split2->point2); - } - - else { - split_outline (seam->split2->point1, seam->split2->point2); - split_outline (seam->split3->point1, seam->split3->point2); - priority = (seam->priority + - full_split_priority (seam->split1, xmin, xmax)); - unsplit_outlines (seam->split3->point1, seam->split3->point2); - unsplit_outlines (seam->split2->point1, seam->split2->point2); - } - - return (priority); -} - - /********************************************************************** * try_point_pairs * @@ -433,23 +302,20 @@ void Wordrec::try_point_pairs(EDGEPT * points[MAX_NUM_POINTS], TBLOB * blob) { inT16 x; inT16 y; - SPLIT *split; PRIORITY priority; for (x = 0; x < num_points; x++) { for (y = x + 1; y < num_points; y++) { - if (points[y] && - weighted_edgept_dist(points[x], points[y], - chop_x_y_weight) < chop_split_length && - points[x] != points[y]->next && - points[y] != points[x]->next && + points[x]->WeightedDistance(*points[y], chop_x_y_weight) < + chop_split_length && + points[x] != points[y]->next && points[y] != points[x]->next && !is_exterior_point(points[x], points[y]) && !is_exterior_point(points[y], points[x])) { - split = new_split (points[x], points[y]); - priority = partial_split_priority (split); + SPLIT split(points[x], points[y]); + priority = partial_split_priority(&split); - choose_best_seam(seam_queue, split, priority, seam, blob, 
seam_pile); + choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile); } } } @@ -474,7 +340,6 @@ void Wordrec::try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS], SEAM ** seam, TBLOB * blob) { EDGEPT *vertical_point = NULL; - SPLIT *split; inT16 x; PRIORITY priority; TESSLINE *outline; @@ -486,16 +351,13 @@ void Wordrec::try_vertical_splits(EDGEPT * points[MAX_NUM_POINTS], &vertical_point, new_points); } - if (vertical_point && - points[x] != vertical_point->next && - vertical_point != points[x]->next && - weighted_edgept_dist(points[x], vertical_point, - chop_x_y_weight) < chop_split_length) { - - split = new_split (points[x], vertical_point); - priority = partial_split_priority (split); - - choose_best_seam(seam_queue, split, priority, seam, blob, seam_pile); + if (vertical_point && points[x] != vertical_point->next && + vertical_point != points[x]->next && + points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) < + chop_split_length) { + SPLIT split(points[x], vertical_point); + priority = partial_split_priority(&split); + choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile); } } } diff --git a/wordrec/gradechop.cpp b/wordrec/gradechop.cpp index dce35ba5fb..ace8dfc5fd 100644 --- a/wordrec/gradechop.cpp +++ b/wordrec/gradechop.cpp @@ -27,120 +27,19 @@ ----------------------------------------------------------------------*/ #include "gradechop.h" #include "wordrec.h" -#include "olutil.h" #include "chop.h" #include "ndminx.h" #include -/*---------------------------------------------------------------------- - T y p e s -----------------------------------------------------------------------*/ -#define CENTER_GRADE_CAP 25.0 - /*---------------------------------------------------------------------- M a c r o s ----------------------------------------------------------------------*/ -/********************************************************************** - * find_bounds_loop - * - * This is a macro to be used by 
set_outline_bounds. - **********************************************************************/ - -#define find_bounds_loop(point1,point2,x_min,x_max) \ - x_min = point2->pos.x; \ - x_max = point2->pos.x; \ - \ - this_point = point1; \ - do { \ - x_min = MIN (this_point->pos.x, x_min); \ - x_max = MAX (this_point->pos.x, x_max); \ - this_point = this_point->next; \ - } \ - while (this_point != point2 && this_point != point1) \ - namespace tesseract { /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ -/********************************************************************** - * full_split_priority - * - * Assign a priority to this split based on the features that it has. - * Part of the priority has already been calculated so just return the - * additional amount for the bounding box type information. - **********************************************************************/ -PRIORITY Wordrec::full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax) { - BOUNDS_RECT rect; - - set_outline_bounds (split->point1, split->point2, rect); - - if (xmin < MIN (rect[0], rect[2]) && xmax > MAX (rect[1], rect[3])) - return (999.0); - - return (grade_overlap (rect) + - grade_center_of_blob (rect) + grade_width_change (rect)); -} - - -/********************************************************************** - * grade_center_of_blob - * - * Return a grade for the a split. 
Rank it on closeness to the center - * of the original blob - * 0 = "perfect" - * 100 = "no way jay" - **********************************************************************/ -PRIORITY Wordrec::grade_center_of_blob(register BOUNDS_RECT rect) { - register PRIORITY grade; - int width1 = rect[1] - rect[0]; - int width2 = rect[3] - rect[2]; - - if (width1 > chop_centered_maxwidth && - width2 > chop_centered_maxwidth) { - return 0.0; - } - - grade = width1 - width2; - if (grade < 0) - grade = -grade; - - grade *= chop_center_knob; - grade = MIN (CENTER_GRADE_CAP, grade); - return (MAX (0.0, grade)); -} - - -/********************************************************************** - * grade_overlap - * - * Return a grade for this split for the overlap of the resultant blobs. - * 0 = "perfect" - * 100 = "no way jay" - **********************************************************************/ -PRIORITY Wordrec::grade_overlap(register BOUNDS_RECT rect) { - register PRIORITY grade; - register inT16 width1; - register inT16 width2; - register inT16 overlap; - - width1 = rect[3] - rect[2]; - width2 = rect[1] - rect[0]; - - overlap = MIN (rect[1], rect[3]) - MAX (rect[0], rect[2]); - width1 = MIN (width1, width2); - if (overlap == width1) - return (100.0); /* Total overlap */ - - width1 = 2 * overlap - width1; /* Extra penalty for too */ - overlap += MAX (0, width1); /* much overlap */ - - grade = overlap * chop_overlap_knob; - - return (MAX (0.0, grade)); -} - /********************************************************************** * grade_split_length @@ -153,8 +52,8 @@ PRIORITY Wordrec::grade_split_length(register SPLIT *split) { register PRIORITY grade; register float split_length; - split_length = weighted_edgept_dist (split->point1, split->point2, - chop_x_y_weight); + split_length = + split->point1->WeightedDistance(*split->point2, chop_x_y_weight); if (split_length <= 0) grade = 0; @@ -188,51 +87,4 @@ PRIORITY Wordrec::grade_sharpness(register SPLIT *split) { } 
-/********************************************************************** - * grade_width_change - * - * Return a grade for the change in width of the resultant blobs. - * 0 = "perfect" - * 100 = "no way jay" - **********************************************************************/ -PRIORITY Wordrec::grade_width_change(register BOUNDS_RECT rect) { - register PRIORITY grade; - register inT32 width1; - register inT32 width2; - - width1 = rect[3] - rect[2]; - width2 = rect[1] - rect[0]; - - grade = 20 - (MAX (rect[1], rect[3]) - - MIN (rect[0], rect[2]) - MAX (width1, width2)); - - grade *= chop_width_change_knob; - - return (MAX (0.0, grade)); -} - - -/********************************************************************** - * set_outline_bounds - * - * Set up the limits for the x coordinate of the outline. - **********************************************************************/ -void Wordrec::set_outline_bounds(register EDGEPT *point1, - register EDGEPT *point2, - BOUNDS_RECT rect) { - register EDGEPT *this_point; - register inT16 x_min; - register inT16 x_max; - - find_bounds_loop(point1, point2, x_min, x_max); - - rect[0] = x_min; - rect[1] = x_max; - - find_bounds_loop(point2, point1, x_min, x_max); - - rect[2] = x_min; - rect[3] = x_max; -} - } // namespace tesseract diff --git a/wordrec/gradechop.h b/wordrec/gradechop.h index 469a140b3d..01e5bf2641 100644 --- a/wordrec/gradechop.h +++ b/wordrec/gradechop.h @@ -32,11 +32,6 @@ #include "seam.h" #include "ndminx.h" -/*---------------------------------------------------------------------- - T y p e s -----------------------------------------------------------------------*/ -typedef inT16 BOUNDS_RECT[4]; - /*---------------------------------------------------------------------- M a c r o s ----------------------------------------------------------------------*/ @@ -52,18 +47,4 @@ typedef inT16 BOUNDS_RECT[4]; (grade_split_length (split) + \ grade_sharpness (split)) \ - 
-/********************************************************************** - * split_bounds_overlap - * - * Check to see if this split might overlap with this outline. Return - * TRUE if there is a positive overlap in the bounding boxes of the two. - **********************************************************************/ - -#define split_bounds_overlap(split,outline) \ -(outline->topleft.x <= MAX (split->point1->pos.x,split->point2->pos.x) && \ - outline->botright.x >= MIN (split->point1->pos.x,split->point2->pos.x) && \ - outline->botright.y <= MAX (split->point1->pos.y,split->point2->pos.y) && \ - outline->topleft.y >= MIN (split->point1->pos.y,split->point2->pos.y)) - #endif diff --git a/wordrec/makechop.cpp b/wordrec/makechop.cpp deleted file mode 100644 index d6795bc3e7..0000000000 --- a/wordrec/makechop.cpp +++ /dev/null @@ -1,226 +0,0 @@ -/* -*-C-*- - ******************************************************************************** - * - * File: makechop.c (Formerly makechop.c) - * Description: - * Author: Mark Seaman, OCR Technology - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Mon Jul 29 15:50:42 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - *********************************************************************************/ -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ - -#include "makechop.h" -#include "blobs.h" -#include "render.h" -#include "structures.h" -#ifdef __UNIX__ -#include -#include -#endif - -// Include automatically generated configuration file if running autoconf. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif - -/*---------------------------------------------------------------------- - Public Function Code -----------------------------------------------------------------------*/ -/********************************************************************** - * apply_seam - * - * Split this blob into two blobs by applying the splits included in - * the seam description. - **********************************************************************/ -void apply_seam(TBLOB *blob, TBLOB *other_blob, bool italic_blob, SEAM *seam) { - if (seam->split1 == NULL) { - divide_blobs(blob, other_blob, italic_blob, seam->location); - } - else if (seam->split2 == NULL) { - make_split_blobs(blob, other_blob, italic_blob, seam); - } - else if (seam->split3 == NULL) { - make_double_split(blob, other_blob, italic_blob, seam); - } - else { - make_triple_split(blob, other_blob, italic_blob, seam); - } -} - - -/********************************************************************** - * form_two_blobs - * - * Group the outlines from the first blob into both of them. Do so - * according to the information about the split. 
- **********************************************************************/ -void form_two_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - const TPOINT& location) { - setup_blob_outlines(blob); - - divide_blobs(blob, other_blob, italic_blob, location); - - eliminate_duplicate_outlines(blob); - eliminate_duplicate_outlines(other_blob); - - correct_blob_order(blob, other_blob); -} - - -/********************************************************************** - * make_double_split - * - * Create two blobs out of one by splitting the original one in half. - * Return the resultant blobs for classification. - **********************************************************************/ -void make_double_split(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam) { - make_single_split(blob->outlines, seam->split1); - make_single_split(blob->outlines, seam->split2); - form_two_blobs(blob, other_blob, italic_blob, seam->location); -} - - -/********************************************************************** - * make_single_split - * - * Create two outlines out of one by splitting the original one in half. - * Return the resultant outlines. - **********************************************************************/ -void make_single_split(TESSLINE *outlines, SPLIT *split) { - assert (outlines != NULL); - - split_outline (split->point1, split->point2); - - while (outlines->next != NULL) - outlines = outlines->next; - - outlines->next = new TESSLINE; - outlines->next->loop = split->point1; - outlines->next->ComputeBoundingBox(); - - outlines = outlines->next; - - outlines->next = new TESSLINE; - outlines->next->loop = split->point2; - outlines->next->ComputeBoundingBox(); - - outlines->next->next = NULL; -} - - -/********************************************************************** - * make_split_blobs - * - * Create two blobs out of one by splitting the original one in half. - * Return the resultant blobs for classification. 
- **********************************************************************/ -void make_split_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam) { - make_single_split(blob->outlines, seam->split1); - - form_two_blobs (blob, other_blob, italic_blob, seam->location); -} - - -/********************************************************************** - * make_triple_split - * - * Create two blobs out of one by splitting the original one in half. - * This splitting is accomplished by applying three separate splits on - * the outlines. Three of the starting outlines will produce two ending - * outlines. Return the resultant blobs for classification. - **********************************************************************/ -void make_triple_split(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam) { - make_single_split(blob->outlines, seam->split1); - make_single_split(blob->outlines, seam->split2); - make_single_split(blob->outlines, seam->split3); - - form_two_blobs(blob, other_blob, italic_blob, seam->location); -} - - -/********************************************************************** - * undo_seam - * - * Remove the seam between these two blobs. Produce one blob as a - * result. The seam may consist of one, two, or three splits. Each - * of these split must be removed from the outlines. 
- **********************************************************************/ -void undo_seam(TBLOB *blob, TBLOB *other_blob, SEAM *seam) { - TESSLINE *outline; - - if (!seam) - return; /* Append other blob outlines */ - if (blob->outlines == NULL) { - blob->outlines = other_blob->outlines; - other_blob->outlines = NULL; - } - - outline = blob->outlines; - while (outline->next) - outline = outline->next; - outline->next = other_blob->outlines; - other_blob->outlines = NULL; - delete other_blob; - - if (seam->split1 == NULL) { - } - else if (seam->split2 == NULL) { - undo_single_split (blob, seam->split1); - } - else if (seam->split3 == NULL) { - undo_single_split (blob, seam->split1); - undo_single_split (blob, seam->split2); - } - else { - undo_single_split (blob, seam->split3); - undo_single_split (blob, seam->split2); - undo_single_split (blob, seam->split1); - } - - setup_blob_outlines(blob); - eliminate_duplicate_outlines(blob); -} - - -/********************************************************************** - * undo_single_split - * - * Undo a seam that is made by a single split. Perform the correct - * magic to reconstruct the appropriate set of outline data structures. 
- **********************************************************************/ -void undo_single_split(TBLOB *blob, SPLIT *split) { - TESSLINE *outline1; - TESSLINE *outline2; - /* Modify edge points */ - unsplit_outlines (split->point1, split->point2); - - outline1 = new TESSLINE; - outline1->next = blob->outlines; - blob->outlines = outline1; - outline1->loop = split->point1; - - outline2 = new TESSLINE; - outline2->next = blob->outlines; - blob->outlines = outline2; - outline2->loop = split->point2; -} diff --git a/wordrec/makechop.h b/wordrec/makechop.h deleted file mode 100644 index 1f2639cd48..0000000000 --- a/wordrec/makechop.h +++ /dev/null @@ -1,71 +0,0 @@ -/* -*-C-*- - ******************************************************************************** - * - * File: makechop.h (Formerly makechop.h) - * Description: - * Author: Mark Seaman, SW Productivity - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Mon Jul 29 13:33:23 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - *********************************************************************************/ -#ifndef MAKECHOP_H -#define MAKECHOP_H - -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ -#include "chop.h" -#include "olutil.h" - -/*---------------------------------------------------------------------- - M a c r o s ----------------------------------------------------------------------*/ -/********************************************************************** - * is_split_outline - * - * Check to see if both sides of the split fall within the bounding - * box of this outline. - **********************************************************************/ - -#define is_split_outline(outline,split) \ -(outline->Contains(split->point1->pos) && \ - outline->Contains(split->point2->pos)) \ - - -/*---------------------------------------------------------------------- - Public Function Prototypes -----------------------------------------------------------------------*/ -void apply_seam(TBLOB *blob, TBLOB *other_blob, bool italic_blob, SEAM *seam); - -void form_two_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - const TPOINT& location); - -void make_double_split(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam); - -void make_single_split(TESSLINE *outlines, SPLIT *split); - -void make_split_blobs(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam); - -void make_triple_split(TBLOB *blob, TBLOB *other_blob, bool italic_blob, - SEAM *seam); - -void undo_seam(TBLOB *blob, TBLOB *other_blob, SEAM *seam); - -void undo_single_split(TBLOB *blob, SPLIT *split); -#endif diff --git a/wordrec/olutil.cpp b/wordrec/olutil.cpp deleted file mode 100644 index dadf51af89..0000000000 --- a/wordrec/olutil.cpp +++ /dev/null @@ -1,102 +0,0 @@ -/* -*-C-*- - ******************************************************************************** - * - * File: olutil.c 
(Formerly olutil.c) - * Description: - * Author: Mark Seaman, OCR Technology - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Fri May 17 13:11:24 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - *********************************************************************************/ -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ -#include "olutil.h" -#include "structures.h" -#include "blobs.h" -#include "const.h" - -#ifdef __UNIX__ -#include -#endif - -/*---------------------------------------------------------------------- - F u n c t i o n s -----------------------------------------------------------------------*/ -/********************************************************************** - * correct_blob_order - * - * Check to see if the blobs are in the correct order. If they are not - * then swap which outlines are attached to which blobs. 
- **********************************************************************/ -void correct_blob_order(TBLOB *blob1, TBLOB *blob2) { - TPOINT origin1; - TPOINT origin2; - TESSLINE *temp; - - blob_origin(blob1, &origin1); - blob_origin(blob2, &origin2); - - if (origin1.x > origin2.x) { - temp = blob2->outlines; - blob2->outlines = blob1->outlines; - blob1->outlines = temp; - } -} - - -/********************************************************************** - * eliminate_duplicate_outlines - * - * Find and delete any duplicate outline records in this blob. - **********************************************************************/ -void eliminate_duplicate_outlines(TBLOB *blob) { - TESSLINE *outline; - TESSLINE *other_outline; - TESSLINE *last_outline; - - for (outline = blob->outlines; outline; outline = outline->next) { - - for (last_outline = outline, other_outline = outline->next; - other_outline; - last_outline = other_outline, other_outline = other_outline->next) { - - if (same_outline_bounds (outline, other_outline)) { - last_outline->next = other_outline->next; - // This doesn't leak - the outlines share the EDGEPTs. - other_outline->loop = NULL; - delete other_outline; - other_outline = last_outline; - // If it is part of a cut, then it can't be a hole any more. - outline->is_hole = false; - } - } - } -} - -/********************************************************************** - * setup_blob_outlines - * - * Set up each of the outlines in this blob. 
- **********************************************************************/ -void setup_blob_outlines(TBLOB *blob) { - TESSLINE *outline; - - for (outline = blob->outlines; outline; outline = outline->next) { - outline->ComputeBoundingBox(); - } -} diff --git a/wordrec/olutil.h b/wordrec/olutil.h deleted file mode 100644 index c7eeecd235..0000000000 --- a/wordrec/olutil.h +++ /dev/null @@ -1,82 +0,0 @@ -/* -*-C-*- - ******************************************************************************** - * - * File: olutil.h (Formerly olutil.h) - * Description: - * Author: Mark Seaman, SW Productivity - * Created: Fri Oct 16 14:37:00 1987 - * Modified: Wed Jul 10 14:21:55 1991 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Reusable Software Component - * - * (c) Copyright 1987, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - *********************************************************************************/ -#ifndef OLUTIL_H -#define OLUTIL_H - -/*---------------------------------------------------------------------- - I n c l u d e s -----------------------------------------------------------------------*/ -#include "blobs.h" - -/*---------------------------------------------------------------------- - M a c r o s -----------------------------------------------------------------------*/ -/********************************************************************** - * is_inside_angle - * - * Return true if the edgept supplied as input is an inside angle. This - * is determined by the angular change of the vectors from point to - * point. - - **********************************************************************/ - -#define is_inside_angle(pt) \ -(angle_change ((pt)->prev, (pt), (pt)->next) < chop_inside_angle) - -/********************************************************************** - * same_outline_bounds - * - * Return TRUE if these two outlines have the same bounds. - **********************************************************************/ - -#define same_outline_bounds(outline,other_outline) \ -(outline->topleft.x == other_outline->topleft.x && \ - outline->topleft.y == other_outline->topleft.y && \ - outline->botright.x == other_outline->botright.x && \ - outline->botright.y == other_outline->botright.y) \ - - -/********************************************************************** - * weighted_edgept_dist - * - * Return the distance (squared) between the two edge points. 
- **********************************************************************/ - -#define weighted_edgept_dist(p1,p2,chop_x_y_weight) \ -(((p1)->pos.x - (p2)->pos.x) * \ - ((p1)->pos.x - (p2)->pos.x) * chop_x_y_weight + \ - ((p1)->pos.y - (p2)->pos.y) * \ - ((p1)->pos.y - (p2)->pos.y)) - -/*---------------------------------------------------------------------- - F u n c t i o n s -----------------------------------------------------------------------*/ -void correct_blob_order(TBLOB *blob1, TBLOB *blob2); - -void eliminate_duplicate_outlines(TBLOB *blob); - -void setup_blob_outlines(TBLOB *blob); - -#endif diff --git a/wordrec/outlines.cpp b/wordrec/outlines.cpp index 3d31a67cbf..fdcedfc73b 100644 --- a/wordrec/outlines.cpp +++ b/wordrec/outlines.cpp @@ -39,73 +39,6 @@ namespace tesseract { /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ -/********************************************************************** - * crosses_outline - * - * Check to see if this line crosses over this outline. If it does - * return TRUE. - **********************************************************************/ -int Wordrec::crosses_outline(EDGEPT *p0, /* Start of line */ - EDGEPT *p1, /* End of line */ - EDGEPT *outline) { /* Outline to check */ - EDGEPT *pt = outline; - do { - if (is_crossed (p0->pos, p1->pos, pt->pos, pt->next->pos)) - return (TRUE); - pt = pt->next; - } - while (pt != outline); - return (FALSE); -} - - -/********************************************************************** - * is_crossed - * - * Return TRUE when the two line segments cross each other. Find out - * where the projected lines would cross and then check to see if the - * point of intersection lies on both of the line segments. If it does - * then these two segments cross. 
- **********************************************************************/ -int Wordrec::is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1) { - int b0a1xb0b1, b0b1xb0a0; - int a1b1xa1a0, a1a0xa1b0; - - TPOINT b0a1, b0a0, a1b1, b0b1, a1a0; - - b0a1.x = a1.x - b0.x; - b0a0.x = a0.x - b0.x; - a1b1.x = b1.x - a1.x; - b0b1.x = b1.x - b0.x; - a1a0.x = a0.x - a1.x; - b0a1.y = a1.y - b0.y; - b0a0.y = a0.y - b0.y; - a1b1.y = b1.y - a1.y; - b0b1.y = b1.y - b0.y; - a1a0.y = a0.y - a1.y; - - b0a1xb0b1 = CROSS (b0a1, b0b1); - b0b1xb0a0 = CROSS (b0b1, b0a0); - a1b1xa1a0 = CROSS (a1b1, a1a0); - /*a1a0xa1b0=CROSS(a1a0,a1b0); */ - a1a0xa1b0 = -CROSS (a1a0, b0a1); - - return ((b0a1xb0b1 > 0 && b0b1xb0a0 > 0) - || (b0a1xb0b1 < 0 && b0b1xb0a0 < 0)) - && ((a1b1xa1a0 > 0 && a1a0xa1b0 > 0) || (a1b1xa1a0 < 0 && a1a0xa1b0 < 0)); -} - - -/********************************************************************** - * is_same_edgept - * - * Return true if the points are identical. - **********************************************************************/ -int Wordrec::is_same_edgept(EDGEPT *p1, EDGEPT *p2) { - return (p1 == p2); -} - - /********************************************************************** * near_point * @@ -153,30 +86,4 @@ bool Wordrec::near_point(EDGEPT *point, } } - -/********************************************************************** - * reverse_outline - * - * Change the direction of the outline. If it was clockwise make it - * counter-clockwise and vice versa. Do this by swapping each of the - * next and prev fields of each edge point. 
- **********************************************************************/ -void Wordrec::reverse_outline(EDGEPT *outline) { - EDGEPT *edgept = outline; - EDGEPT *temp; - - do { - /* Swap next and prev */ - temp = edgept->prev; - edgept->prev = edgept->next; - edgept->next = temp; - /* Set up vec field */ - edgept->vec.x = edgept->next->pos.x - edgept->pos.x; - edgept->vec.y = edgept->next->pos.y - edgept->pos.y; - - edgept = edgept->prev; /* Go to next point */ - } - while (edgept != outline); -} - } // namespace tesseract diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index 35462ea22c..f920534051 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -58,7 +58,7 @@ BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector& seams, const char* description, TWERD *word, BlamerBundle *blamer_bundle) { - if (end > start) join_pieces(seams, start, end, word); + if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end); BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description, White, blamer_bundle); // Set the matrix_cell_ entries in all the BLOB_CHOICES. @@ -67,7 +67,7 @@ BLOB_CHOICE_LIST *Wordrec::classify_piece(const GenericVector& seams, bc_it.data()->set_matrix_cell(start, end); } - if (end > start) break_pieces(seams, start, end, word); + if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end); return (choices); } diff --git a/wordrec/plotedges.cpp b/wordrec/plotedges.cpp index 0aa02c37fb..f7fbacee28 100644 --- a/wordrec/plotedges.cpp +++ b/wordrec/plotedges.cpp @@ -119,21 +119,4 @@ void mark_outline(EDGEPT *edgept) { /* Start of point list */ c_make_current(window); } - -/********************************************************************** - * mark_split - * - * Set up the marks list to be displayed in subsequent updates and draw - * the marks in the current window. The marks are stored in the second - * sublist. The first sublist is left unmodified. 
- **********************************************************************/ -void mark_split(SPLIT *split) { - void *window = edge_window; - - c_line_color_index(window, Green); - c_move (window, (float) split->point1->pos.x, (float) split->point1->pos.y); - c_draw (window, (float) split->point2->pos.x, (float) split->point2->pos.y); - c_make_current(window); -} - #endif // GRAPHICS_DISABLED diff --git a/wordrec/plotedges.h b/wordrec/plotedges.h index d0ca40be77..91521de734 100644 --- a/wordrec/plotedges.h +++ b/wordrec/plotedges.h @@ -28,7 +28,6 @@ #include "callcpp.h" #include "oldlist.h" #include "blobs.h" -#include "split.h" /*---------------------------------------------------------------------- V a r i a b l e s @@ -67,5 +66,4 @@ void draw_blob_edges(TBLOB *blob); void mark_outline(EDGEPT *edgept); -void mark_split(SPLIT *split); #endif diff --git a/wordrec/segsearch.cpp b/wordrec/segsearch.cpp index 29d03702e4..a6fe10ff30 100644 --- a/wordrec/segsearch.cpp +++ b/wordrec/segsearch.cpp @@ -53,8 +53,7 @@ void Wordrec::SegSearch(WERD_RES* word_res, improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle, blamer_bundle, &pain_points, &pending); } - if (chop_debug) - print_seams("Final seam list:", word_res->seam_array); + if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array); if (blamer_bundle != NULL && !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) { diff --git a/wordrec/wordrec.h b/wordrec/wordrec.h index a69026b173..38f09f23d2 100644 --- a/wordrec/wordrec.h +++ b/wordrec/wordrec.h @@ -290,9 +290,10 @@ class Wordrec : public Classify { // chop.cpp PRIORITY point_priority(EDGEPT *point); void add_point_to_list(PointHeap* point_heap, EDGEPT *point); + // Returns true if the edgept supplied as input is an inside angle. This + // is determined by the angular change of the vectors from point to point. 
+ bool is_inside_angle(EDGEPT *pt); int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3); - int is_little_chunk(EDGEPT *point1, EDGEPT *point2); - int is_small_area(EDGEPT *point1, EDGEPT *point2); EDGEPT *pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist); @@ -335,17 +336,12 @@ class Wordrec : public Classify { // findseam.cpp void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue* seams); - void choose_best_seam(SeamQueue* seam_queue, - SPLIT *split, - PRIORITY priority, - SEAM **seam_result, - TBLOB *blob, - SeamPile* seam_pile); + void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, + PRIORITY priority, SEAM **seam_result, TBLOB *blob, + SeamPile *seam_pile); void combine_seam(const SeamPile& seam_pile, const SEAM* seam, SeamQueue* seam_queue); - inT16 constrained_split(SPLIT *split, TBLOB *blob); SEAM *pick_good_seam(TBLOB *blob); - PRIORITY seam_priority(SEAM *seam, inT16 xmin, inT16 xmax); void try_point_pairs (EDGEPT * points[MAX_NUM_POINTS], inT16 num_points, SeamQueue* seam_queue, @@ -359,23 +355,12 @@ class Wordrec : public Classify { SEAM ** seam, TBLOB * blob); // gradechop.cpp - PRIORITY full_split_priority(SPLIT *split, inT16 xmin, inT16 xmax); - PRIORITY grade_center_of_blob(register BOUNDS_RECT rect); - PRIORITY grade_overlap(register BOUNDS_RECT rect); PRIORITY grade_split_length(register SPLIT *split); PRIORITY grade_sharpness(register SPLIT *split); - PRIORITY grade_width_change(register BOUNDS_RECT rect); - void set_outline_bounds(register EDGEPT *point1, - register EDGEPT *point2, - BOUNDS_RECT rect); // outlines.cpp - int crosses_outline(EDGEPT *p0, EDGEPT *p1, EDGEPT *outline); - int is_crossed(TPOINT a0, TPOINT a1, TPOINT b0, TPOINT b1); - int is_same_edgept(EDGEPT *p1, EDGEPT *p2); bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt); - void reverse_outline(EDGEPT *outline); // pieces.cpp virtual BLOB_CHOICE_LIST *classify_piece(const 
GenericVector& seams, From e735a9017b14309259015030c0ef2f4be5bfe9ca Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 15:05:56 -0700 Subject: [PATCH 02/15] Makefile.am change for Split/seam refactor --- wordrec/Makefile.am | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/wordrec/Makefile.am b/wordrec/Makefile.am index 83c14a98db..a18e63a030 100644 --- a/wordrec/Makefile.am +++ b/wordrec/Makefile.am @@ -13,8 +13,8 @@ noinst_HEADERS = \ associate.h chop.h \ chopper.h drawfx.h findseam.h gradechop.h \ language_model.h lm_consistency.h lm_pain_points.h lm_state.h \ - makechop.h measure.h \ - olutil.h outlines.h params_model.h plotedges.h \ + measure.h \ + outlines.h params_model.h plotedges.h \ render.h \ wordrec.h @@ -36,7 +36,6 @@ libtesseract_wordrec_la_SOURCES = \ associate.cpp chop.cpp chopper.cpp \ drawfx.cpp findseam.cpp gradechop.cpp \ language_model.cpp lm_consistency.cpp lm_pain_points.cpp lm_state.cpp \ - makechop.cpp \ - olutil.cpp outlines.cpp params_model.cpp pieces.cpp \ + outlines.cpp params_model.cpp pieces.cpp \ plotedges.cpp render.cpp segsearch.cpp \ tface.cpp wordclass.cpp wordrec.cpp From 53fc4456cc0a105c5ac93aaa6d27a7124da669ef Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 15:22:34 -0700 Subject: [PATCH 03/15] Fixed issue 1252: Refactored LearnBlob and its call hierarchy to make it a member of Classify. Eliminated the flexfx scheme for calling global feature extractor functions through an array of function pointers. Deleted dead code I found as a by-product. This CL does not change BlobToTrainingSample or ExtractFeatures to be full members of Classify (the eventual goal) as that would make it even bigger, since there are a lot of callers to these functions. When ExtractFeatures and BlobToTrainingSample are members of Classify they will be able to access control parameters in Classify, which will greatly simplify developing variations to the feature extraction process. 
--- api/baseapi.cpp | 29 +++++++-- api/baseapi.h | 8 ++- ccmain/applybox.cpp | 6 +- classify/adaptmatch.cpp | 37 +++++------ classify/blobclass.cpp | 136 ++++++++++++++++----------------------- classify/blobclass.h | 24 +++---- classify/classify.h | 43 +++++++++---- classify/extern.h | 32 --------- classify/extract.cpp | 74 --------------------- classify/extract.h | 40 ------------ classify/featdefs.cpp | 28 ++++---- classify/featdefs.h | 5 +- classify/flexfx.cpp | 72 --------------------- classify/flexfx.h | 36 ----------- classify/fxdefs.cpp | 45 ------------- classify/fxdefs.h | 25 ------- classify/intfx.cpp | 6 +- classify/mf.cpp | 7 +- classify/mf.h | 4 +- classify/mfdefs.h | 1 - classify/mfx.cpp | 6 +- classify/mfx.h | 5 +- classify/normfeat.cpp | 4 +- classify/normfeat.h | 4 +- classify/ocrfeatures.cpp | 69 ++++++++++---------- classify/ocrfeatures.h | 7 -- classify/picofeat.cpp | 23 ++++--- classify/picofeat.h | 7 -- classify/xform2d.cpp | 120 ---------------------------------- classify/xform2d.h | 60 ----------------- wordrec/tface.cpp | 2 - 31 files changed, 220 insertions(+), 745 deletions(-) delete mode 100644 classify/extern.h delete mode 100644 classify/extract.cpp delete mode 100644 classify/extract.h delete mode 100644 classify/flexfx.cpp delete mode 100644 classify/flexfx.h delete mode 100644 classify/fxdefs.cpp delete mode 100644 classify/fxdefs.h delete mode 100644 classify/xform2d.cpp delete mode 100644 classify/xform2d.h diff --git a/api/baseapi.cpp b/api/baseapi.cpp index af14be31f9..f502d24f5e 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -51,6 +51,7 @@ #include "allheaders.h" #include "baseapi.h" +#include "blobclass.h" #include "resultiterator.h" #include "mutableiterator.h" #include "thresholder.h" @@ -870,7 +871,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { page_res_ = NULL; return -1; } else if (tesseract_->tessedit_train_from_boxes) { - tesseract_->ApplyBoxTraining(*output_file_, page_res_); + STRING fontname; + 
ExtractFontName(*output_file_, &fontname); + tesseract_->ApplyBoxTraining(fontname, page_res_); } else if (tesseract_->tessedit_ambigs_training) { FILE *training_output_file = tesseract_->init_recog_training(*input_file_); // OCR the page segmented into words by tesseract. @@ -1051,6 +1054,23 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, return true; } +// Master ProcessPages calls ProcessPagesInternal and then does any post- +// processing required due to being in a training mode. +bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer) { + bool result = + ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer); + if (result) { + if (tesseract_->tessedit_train_from_boxes && + !tesseract_->WriteTRFile(*output_file_)) { + tprintf("Write of TR file failed: %s\n", output_file_->string()); + return false; + } + } + return result; +} + // In the ideal scenario, Tesseract will start working on data as soon // as it can. For example, if you steam a filelist through stdin, we // should start the OCR process as soon as the first filename is @@ -1063,9 +1083,10 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, // identify the scenario that really matters: filelists on // stdin. We'll still do our best if the user likes pipes. That means // piling up any data coming into stdin into a memory buffer. 
-bool TessBaseAPI::ProcessPages(const char* filename, - const char* retry_config, int timeout_millisec, - TessResultRenderer* renderer) { +bool TessBaseAPI::ProcessPagesInternal(const char* filename, + const char* retry_config, + int timeout_millisec, + TessResultRenderer* renderer) { PERF_COUNT_START("ProcessPages") bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); if (stdInput) { diff --git a/api/baseapi.h b/api/baseapi.h index f06ec7da20..0b8c83fd3a 100644 --- a/api/baseapi.h +++ b/api/baseapi.h @@ -538,9 +538,11 @@ class TESS_API TessBaseAPI { * * Returns true if successful, false on error. */ - bool ProcessPages(const char* filename, - const char* retry_config, int timeout_millisec, - TessResultRenderer* renderer); + bool ProcessPages(const char* filename, const char* retry_config, + int timeout_millisec, TessResultRenderer* renderer); + // Does the real work of ProcessPages. + bool ProcessPagesInternal(const char* filename, const char* retry_config, + int timeout_millisec, TessResultRenderer* renderer); /** * Turn a single image into symbolic text. diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 9c067e7932..6a94ab3796 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -775,13 +775,13 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { } // Calls LearnWord to extract features for labelled blobs within each word. -// Features are written to the given filename. -void Tesseract::ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res) { +// Features are stored in an internal buffer. 
+void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) { PAGE_RES_IT pr_it(page_res); int word_count = 0; for (WERD_RES *word_res = pr_it.word(); word_res != NULL; word_res = pr_it.forward()) { - LearnWord(filename.string(), word_res); + LearnWord(fontname.string(), word_res); ++word_count; } tprintf("Generated training data for %d words\n", word_count); diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 7bbc84719d..86305404e3 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -220,17 +220,15 @@ void Classify::RefreshDebugWindow(ScrollView **win, const char *msg, // Learns the given word using its chopped_word, seam_array, denorm, // box_word, best_state, and correct_text to learn both correctly and -// incorrectly segmented blobs. If filename is not NULL, then LearnBlob -// is called and the data will be written to a file for static training. +// incorrectly segmented blobs. If fontname is not NULL, then LearnBlob +// is called and the data will be saved in an internal buffer. // Otherwise AdaptToBlob is called for adaption within a document. -// If rejmap is not NULL, then only chars with a rejmap entry of '1' will -// be learned, otherwise all chars with good correct_text are learned. -void Classify::LearnWord(const char* filename, WERD_RES *word) { +void Classify::LearnWord(const char* fontname, WERD_RES* word) { int word_len = word->correct_text.size(); if (word_len == 0) return; float* thresholds = NULL; - if (filename == NULL) { + if (fontname == NULL) { // Adaption mode. if (!EnableLearning || word->best_choice == NULL) return; // Can't or won't adapt. @@ -267,8 +265,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) { if (word->correct_text[ch].length() > 0) { float threshold = thresholds != NULL ? 
thresholds[ch] : 0.0f; - LearnPieces(filename, start_blob, word->best_state[ch], - threshold, CST_WHOLE, word->correct_text[ch].string(), word); + LearnPieces(fontname, start_blob, word->best_state[ch], threshold, + CST_WHOLE, word->correct_text[ch].string(), word); if (word->best_state[ch] > 1 && !disable_character_fragments) { // Check that the character breaks into meaningful fragments @@ -301,8 +299,8 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) { if (i != tokens.size() - 1) full_string += ' '; } - LearnPieces(filename, start_blob + frag, 1, - threshold, CST_FRAGMENT, full_string.string(), word); + LearnPieces(fontname, start_blob + frag, 1, threshold, + CST_FRAGMENT, full_string.string(), word); } } } @@ -314,13 +312,13 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) { if (word->best_state[ch] > 1) { // If the next blob is good, make junk with the rightmost fragment. if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { - LearnPieces(filename, start_blob + word->best_state[ch] - 1, + LearnPieces(fontname, start_blob + word->best_state[ch] - 1, word->best_state[ch + 1] + 1, threshold, CST_IMPROPER, INVALID_UNICHAR, word); } // If the previous blob is good, make junk with the leftmost fragment. 
if (ch > 0 && word->correct_text[ch - 1].length() > 0) { - LearnPieces(filename, start_blob - word->best_state[ch - 1], + LearnPieces(fontname, start_blob - word->best_state[ch - 1], word->best_state[ch - 1] + 1, threshold, CST_IMPROPER, INVALID_UNICHAR, word); } @@ -329,7 +327,7 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) { if (ch + 1 < word_len && word->correct_text[ch + 1].length() > 0) { STRING joined_text = word->correct_text[ch]; joined_text += word->correct_text[ch + 1]; - LearnPieces(filename, start_blob, + LearnPieces(fontname, start_blob, word->best_state[ch] + word->best_state[ch + 1], threshold, CST_NGRAM, joined_text.string(), word); } @@ -342,16 +340,16 @@ void Classify::LearnWord(const char* filename, WERD_RES *word) { // Builds a blob of length fragments, from the word, starting at start, // and then learns it, as having the given correct_text. -// If filename is not NULL, then LearnBlob -// is called and the data will be written to a file for static training. +// If fontname is not NULL, then LearnBlob is called and the data will be +// saved in an internal buffer for static training. // Otherwise AdaptToBlob is called for adaption within a document. // threshold is a magic number required by AdaptToChar and generated by // ComputeAdaptionThresholds. // Although it can be partly inferred from the string, segmentation is // provided to explicitly clarify the character segmentation. -void Classify::LearnPieces(const char* filename, int start, int length, +void Classify::LearnPieces(const char* fontname, int start, int length, float threshold, CharSegmentationType segmentation, - const char* correct_text, WERD_RES *word) { + const char* correct_text, WERD_RES* word) { // TODO(daria) Remove/modify this if/when we want // to train and/or adapt to n-grams. 
if (segmentation != CST_WHOLE && @@ -385,7 +383,7 @@ void Classify::LearnPieces(const char* filename, int start, int length, } #endif // GRAPHICS_DISABLED - if (filename != NULL) { + if (fontname != NULL) { classify_norm_method.set_value(character); // force char norm spc 30/11/93 tess_bn_matching.set_value(false); // turn it off tess_cn_matching.set_value(false); @@ -393,8 +391,7 @@ void Classify::LearnPieces(const char* filename, int start, int length, INT_FX_RESULT_STRUCT fx_info; SetupBLCNDenorms(*rotated_blob, classify_nonlinear_norm, &bl_denorm, &cn_denorm, &fx_info); - LearnBlob(feature_defs_, filename, rotated_blob, bl_denorm, cn_denorm, - fx_info, correct_text); + LearnBlob(fontname, rotated_blob, cn_denorm, fx_info, correct_text); } else if (unicharset.contains_unichar(correct_text)) { UNICHAR_ID class_id = unicharset.unichar_to_id(correct_text); int font_id = word->fontinfo != NULL diff --git a/classify/blobclass.cpp b/classify/blobclass.cpp index 5861f53e84..314a71f726 100644 --- a/classify/blobclass.cpp +++ b/classify/blobclass.cpp @@ -20,63 +20,32 @@ Include Files and Type Defines ----------------------------------------------------------------------------**/ #include "blobclass.h" -#include "extract.h" -#include "efio.h" -#include "featdefs.h" -#include "callcpp.h" -#include #include -#include -#define MAXFILENAME 80 -#define MAXMATCHES 10 +#include "classify.h" +#include "efio.h" +#include "featdefs.h" +#include "mf.h" +#include "normfeat.h" static const char kUnknownFontName[] = "UnknownFont"; STRING_VAR(classify_font_name, kUnknownFontName, "Default font name to be used in training"); -/**---------------------------------------------------------------------------- - Global Data Definitions and Declarations -----------------------------------------------------------------------------**/ -/* name of current image file being processed */ -extern char imagefile[]; - +namespace tesseract { 
/**---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------**/ - -/*---------------------------------------------------------------------------*/ -// As all TBLOBs, Blob is in baseline normalized coords. -// See SetupBLCNDenorms in intfx.cpp for other args. -void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, - TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, const char* BlobText) { -/* - ** Parameters: - ** Blob blob whose micro-features are to be learned - ** Row row of text that blob came from - ** BlobText text that corresponds to blob - ** TextLength number of characters in blob - ** Globals: - ** imagefile base filename of the page being learned - ** classify_font_name - ** name of font currently being trained on - ** Operation: - ** Extract micro-features from the specified blob and append - ** them to the appropriate file. - ** Return: none - ** Exceptions: none - ** History: 7/28/89, DSJ, Created. - */ -#define TRAIN_SUFFIX ".tr" - static FILE *FeatureFile = NULL; - STRING Filename(filename); - - // If no fontname was set, try to extract it from the filename - STRING CurrFontName = classify_font_name; - if (CurrFontName == kUnknownFontName) { +// Finds the name of the training font and returns it in fontname, by cutting +// it out based on the expectation that the filename is of the form: +// /path/to/dir/[lang].[fontname].exp[num] +// The [lang], [fontname] and [num] fields should not have '.' characters. +// If the global parameter classify_font_name is set, its value is used instead. +void ExtractFontName(const STRING& filename, STRING* fontname) { + *fontname = classify_font_name; + if (*fontname == kUnknownFontName) { // filename is expected to be of the form [lang].[fontname].exp[num] // The [lang], [fontname] and [num] fields should not have '.' characters. 
const char *basename = strrchr(filename.string(), '/'); @@ -84,47 +53,56 @@ void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, const char *lastdot = strrchr(filename.string(), '.'); if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) { ++firstdot; - CurrFontName = firstdot; - CurrFontName[lastdot - firstdot] = '\0'; + *fontname = firstdot; + fontname->truncate_at(lastdot - firstdot); } } +} - // if a feature file is not yet open, open it - // the name of the file is the name of the image plus TRAIN_SUFFIX - if (FeatureFile == NULL) { - Filename += TRAIN_SUFFIX; - FeatureFile = Efopen(Filename.string(), "wb"); - cprintf("TRAINING ... Font name = %s\n", CurrFontName.string()); - } - - LearnBlob(FeatureDefs, FeatureFile, Blob, bl_denorm, cn_denorm, fx_info, - BlobText, CurrFontName.string()); -} // LearnBlob - -void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* FeatureFile, - TBLOB* Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, - const char* BlobText, const char* FontName) { - CHAR_DESC CharDesc; - - ASSERT_HOST(FeatureFile != NULL); - - CharDesc = ExtractBlobFeatures(FeatureDefs, bl_denorm, cn_denorm, fx_info, - Blob); - if (CharDesc == NULL) { - cprintf("LearnBLob: CharDesc was NULL. Aborting.\n"); - return; - } - - if (ValidCharDescription(FeatureDefs, CharDesc)) { - // label the features with a class name and font name - fprintf(FeatureFile, "\n%s %s\n", FontName, BlobText); +/*---------------------------------------------------------------------------*/ +// Extracts features from the given blob and saves them in the tr_file_data_ +// member variable. +// fontname: Name of font that this blob was printed in. +// cn_denorm: Character normalization transformation to apply to the blob. +// fx_info: Character normalization parameters computed with cn_denorm. +// blob_text: Ground truth text for the blob. 
+void Classify::LearnBlob(const STRING& fontname, TBLOB* blob, + const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, + const char* blob_text) { + CHAR_DESC CharDesc = NewCharDescription(feature_defs_); + CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm); + CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info); + CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info); + CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info); + + if (ValidCharDescription(feature_defs_, CharDesc)) { + // Label the features with a class name and font name. + tr_file_data_ += "\n"; + tr_file_data_ += fontname; + tr_file_data_ += " "; + tr_file_data_ += blob_text; + tr_file_data_ += "\n"; // write micro-features to file and clean up - WriteCharDescription(FeatureDefs, FeatureFile, CharDesc); + WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_); } else { tprintf("Blob learned was invalid!\n"); } FreeCharDescription(CharDesc); - } // LearnBlob + +// Writes stored training data to a .tr file based on the given filename. +// Returns false on error. +bool Classify::WriteTRFile(const STRING& filename) { + STRING tr_filename = filename + ".tr"; + FILE* fp = Efopen(tr_filename.string(), "wb"); + int len = tr_file_data_.length(); + bool result = + fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len; + fclose(fp); + tr_file_data_.truncate_at(0); + return result; +} + +} // namespace tesseract. 
diff --git a/classify/blobclass.h b/classify/blobclass.h index 95510a2f96..be09465bd4 100644 --- a/classify/blobclass.h +++ b/classify/blobclass.h @@ -21,9 +21,7 @@ /**---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------**/ -#include "featdefs.h" -#include "oldlist.h" -#include "blobs.h" +#include "strngs.h" /*--------------------------------------------------------------------------- Macros @@ -39,18 +37,14 @@ /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ -void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, const STRING& filename, - TBLOB * Blob, const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, - const char* BlobText); +namespace tesseract { +// Finds the name of the training font and returns it in fontname, by cutting +// it out based on the expectation that the filename is of the form: +// /path/to/dir/[lang].[fontname].exp[num] +// The [lang], [fontname] and [num] fields should not have '.' characters. +// If the global parameter classify_font_name is set, its value is used instead. +void ExtractFontName(const STRING& filename, STRING* fontname); -void LearnBlob(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE* File, TBLOB* Blob, - const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, - const char* BlobText, const char* FontName); +} // namespace tesseract. 
-/**---------------------------------------------------------------------------- - Global Data Definitions and Declarations -----------------------------------------------------------------------------**/ -/*parameter used to turn on/off output of recognized chars to the screen */ #endif diff --git a/classify/classify.h b/classify/classify.h index de62bbf867..f105fc60d0 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -25,6 +25,7 @@ #include "dict.h" #include "featdefs.h" #include "fontinfo.h" +#include "imagedata.h" #include "intfx.h" #include "intmatcher.h" #include "normalis.h" @@ -119,25 +120,25 @@ class Classify : public CCStruct { const UNICHARSET& target_unicharset); /* adaptmatch.cpp ***********************************************************/ - // Learn the given word using its chopped_word, seam_array, denorm, + // Learns the given word using its chopped_word, seam_array, denorm, // box_word, best_state, and correct_text to learn both correctly and - // incorrectly segmented blobs. If filename is not NULL, then LearnBlob - // is called and the data will be written to a file for static training. + // incorrectly segmented blobs. If fontname is not NULL, then LearnBlob + // is called and the data will be saved in an internal buffer. // Otherwise AdaptToBlob is called for adaption within a document. - void LearnWord(const char* filename, WERD_RES *word); + void LearnWord(const char* fontname, WERD_RES* word); // Builds a blob of length fragments, from the word, starting at start, - // and then learn it, as having the given correct_text. - // If filename is not NULL, then LearnBlob - // is called and the data will be written to a file for static training. + // and then learns it, as having the given correct_text. + // If fontname is not NULL, then LearnBlob is called and the data will be + // saved in an internal buffer for static training. // Otherwise AdaptToBlob is called for adaption within a document. 
// threshold is a magic number required by AdaptToChar and generated by - // GetAdaptThresholds. + // ComputeAdaptionThresholds. // Although it can be partly inferred from the string, segmentation is // provided to explicitly clarify the character segmentation. - void LearnPieces(const char* filename, int start, int length, - float threshold, CharSegmentationType segmentation, - const char* correct_text, WERD_RES *word); + void LearnPieces(const char* fontname, int start, int length, float threshold, + CharSegmentationType segmentation, const char* correct_text, + WERD_RES* word); void InitAdaptiveClassifier(bool load_pre_trained_templates); void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, @@ -361,7 +362,22 @@ class Classify : public CCStruct { FEATURE_SET ExtractOutlineFeatures(TBLOB *Blob); /* picofeat.cpp ***********************************************************/ FEATURE_SET ExtractPicoFeatures(TBLOB *Blob); - + FEATURE_SET ExtractIntCNFeatures(const TBLOB& blob, + const INT_FX_RESULT_STRUCT& fx_info); + FEATURE_SET ExtractIntGeoFeatures(const TBLOB& blob, + const INT_FX_RESULT_STRUCT& fx_info); + /* blobclass.cpp ***********************************************************/ + // Extracts features from the given blob and saves them in the tr_file_data_ + // member variable. + // fontname: Name of font that this blob was printed in. + // cn_denorm: Character normalization transformation to apply to the blob. + // fx_info: Character normalization parameters computed with cn_denorm. + // blob_text: Ground truth text for the blob. + void LearnBlob(const STRING& fontname, TBLOB* Blob, const DENORM& cn_denorm, + const INT_FX_RESULT_STRUCT& fx_info, const char* blob_text); + // Writes stored training data to a .tr file based on the given filename. + // Returns false on error. + bool WriteTRFile(const STRING& filename); // Member variables. 
@@ -498,6 +514,9 @@ class Classify : public CCStruct { /* variables used to hold performance statistics */ int NumAdaptationsFailed; + // Training data gathered here for all the images in a document. + STRING tr_file_data_; + // Expected number of features in the class pruner, used to penalize // unknowns that have too few features (like a c being classified as e) so // it doesn't recognize everything as '@' or '#'. diff --git a/classify/extern.h b/classify/extern.h deleted file mode 100644 index ebbe4dfe3a..0000000000 --- a/classify/extern.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef EXTERN_H -#define EXTERN_H - -/* -*-C-*- - ******************************************************************************** - * - * File: extern.h (Formerly extern.h) - * Description: External definitions for C or C++ - * Author: Mark Seaman, OCR Technology - * Created: Tue Mar 20 14:01:22 1990 - * Modified: Tue Mar 20 14:02:09 1990 (Mark Seaman) marks@hpgrlt - * Language: C - * Package: N/A - * Status: Experimental (Do Not Distribute) - * - * (c) Copyright 1990, Hewlett-Packard Company. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - ******************************************************************************** - */ - -#define EXTERN extern - -#endif diff --git a/classify/extract.cpp b/classify/extract.cpp deleted file mode 100644 index 822c733e4e..0000000000 --- a/classify/extract.cpp +++ /dev/null @@ -1,74 +0,0 @@ -/****************************************************************************** - ** Filename: extract.c - ** Purpose: Generic high level feature extractor routines. - ** Author: Dan Johnson - ** History: Sun Jan 21 09:44:08 1990, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -/*----------------------------------------------------------------------------- - Include Files and Type Defines ------------------------------------------------------------------------------*/ -#include "extract.h" -#include "flexfx.h" -#include "danerror.h" - -typedef CHAR_FEATURES (*CF_FUNC) (); - -/*----------------------------------------------------------------------------- - Private Function Prototypes ------------------------------------------------------------------------------*/ -void ExtractorStub(); - -/*----------------------------------------------------------------------------- - Public Code ------------------------------------------------------------------------------*/ -/*---------------------------------------------------------------------------*/ -/** - * Extract features from Blob by calling the feature - * extractor which is currently being used. This routine - * simply provides a high level interface to feature - * extraction. The caller can extract any type of features - * from a blob without understanding any lower level details. - * - * @param FeatureDefs definitions of feature types/extractors - * @param denorm Normalize/denormalize to access original image - * @param Blob blob to extract features from - * - * @return The character features extracted from Blob. - * @note Exceptions: none - * @note History: Sun Jan 21 10:07:28 1990, DSJ, Created. 
- */ -CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, - TBLOB *Blob) { - return ExtractFlexFeatures(FeatureDefs, Blob, bl_denorm, cn_denorm, fx_info); -} /* ExtractBlobFeatures */ - -/*----------------------------------------------------------------------------- - Private Code ------------------------------------------------------------------------------*/ -/*---------------------------------------------------------------------------*/ -void -ExtractorStub () -/** - * This routine is used to stub out feature extractors - * that are no longer used. It simply calls DoError. - * - * @note Exceptions: none - * @note History: Wed Jan 2 14:16:49 1991, DSJ, Created. - */ -#define DUMMY_ERROR 1 -{ - DoError (DUMMY_ERROR, "Selected feature extractor has been stubbed out!"); -} /* ExtractorStub */ diff --git a/classify/extract.h b/classify/extract.h deleted file mode 100644 index 1f80c20e42..0000000000 --- a/classify/extract.h +++ /dev/null @@ -1,40 +0,0 @@ -/****************************************************************************** - ** Filename: extract.h - ** Purpose: Interface to high level generic feature extraction. - ** Author: Dan Johnson - ** History: 1/21/90, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -#ifndef EXTRACT_H -#define EXTRACT_H - -#include "featdefs.h" -#include - -class DENORM; - -/*----------------------------------------------------------------------------- - Public Function Prototypes ------------------------------------------------------------------------------*/ -// Deprecated! Will be deleted soon! -// In the meantime, as all TBLOBs, Blob is in baseline normalized coords. -// See SetupBLCNDenorms in intfx.cpp for other args. -CHAR_DESC ExtractBlobFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - const DENORM& bl_denorm, const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info, TBLOB *Blob); - -/*--------------------------------------------------------------------------- - Private Function Prototypes -----------------------------------------------------------------------------*/ -void ExtractorStub(); -#endif diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index 5226c6aae5..cf9e551509 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -178,7 +178,7 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs) { /*---------------------------------------------------------------------------*/ /** - * Write a textual representation of CharDesc to File. + * Appends a textual representation of CharDesc to str. * The format used is to write out the number of feature * sets which will be written followed by a representation of * each feature set. @@ -187,18 +187,15 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs) { * by a description of the feature set. Feature sets which are * not present are not written. 
* - * Globals: - * - none - * * @param FeatureDefs definitions of feature types/extractors - * @param File open text file to write CharDesc to - * @param CharDesc character description to write to File + * @param str string to append CharDesc to + * @param CharDesc character description to write to File * * @note Exceptions: none * @note History: Wed May 23 17:21:18 1990, DSJ, Created. */ -void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, - FILE *File, CHAR_DESC CharDesc) { +void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs, + CHAR_DESC CharDesc, STRING* str) { int Type; int NumSetsToWrite = 0; @@ -206,11 +203,14 @@ void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, if (CharDesc->FeatureSets[Type]) NumSetsToWrite++; - fprintf (File, " %d\n", NumSetsToWrite); - for (Type = 0; Type < CharDesc->NumFeatureSets; Type++) - if (CharDesc->FeatureSets[Type]) { - fprintf (File, "%s ", (FeatureDefs.FeatureDesc[Type])->ShortName); - WriteFeatureSet (File, CharDesc->FeatureSets[Type]); + str->add_str_int(" ", NumSetsToWrite); + *str += "\n"; + for (Type = 0; Type < CharDesc->NumFeatureSets; Type++) { + if (CharDesc->FeatureSets[Type]) { + *str += FeatureDefs.FeatureDesc[Type]->ShortName; + *str += " "; + WriteFeatureSet(CharDesc->FeatureSets[Type], str); + } } } /* WriteCharDescription */ @@ -231,6 +231,8 @@ bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, anything_written = true; } } + } else { + return false; } } return anything_written && well_formed; diff --git a/classify/featdefs.h b/classify/featdefs.h index e187091061..704bbdfde2 100644 --- a/classify/featdefs.h +++ b/classify/featdefs.h @@ -48,7 +48,6 @@ typedef CHAR_DESC_STRUCT *CHAR_DESC; struct FEATURE_DEFS_STRUCT { inT32 NumFeatureTypes; const FEATURE_DESC_STRUCT* FeatureDesc[NUM_FEATURE_TYPES]; - const FEATURE_EXT_STRUCT* FeatureExtractors[NUM_FEATURE_TYPES]; int FeatureEnabled[NUM_FEATURE_TYPES]; }; typedef FEATURE_DEFS_STRUCT *FEATURE_DEFS; @@ -65,8 
+64,8 @@ CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs); bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc); -void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, - FILE *File, CHAR_DESC CharDesc); +void WriteCharDescription(const FEATURE_DEFS_STRUCT& FeatureDefs, + CHAR_DESC CharDesc, STRING* str); CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File); diff --git a/classify/flexfx.cpp b/classify/flexfx.cpp deleted file mode 100644 index 2ddbe3a025..0000000000 --- a/classify/flexfx.cpp +++ /dev/null @@ -1,72 +0,0 @@ -/****************************************************************************** - ** Filename: flexfx.c - ** Purpose: Interface to flexible feature extractor. - ** Author: Dan Johnson - ** History: Wed May 23 13:45:10 1990, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -/**---------------------------------------------------------------------------- - Include Files and Type Defines -----------------------------------------------------------------------------**/ -#include "flexfx.h" -#include "featdefs.h" -#include "emalloc.h" -#include -#include - -/**---------------------------------------------------------------------------- - Public Code -----------------------------------------------------------------------------**/ -/*---------------------------------------------------------------------------*/ -// Deprecated! Will be deleted soon! -// In the meantime, as all TBLOBs, Blob is in baseline normalized coords. -// See SetupBLCNDenorms in intfx.cpp for other args. -CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { -/* - ** Parameters: - ** Blob blob to extract features from - ** denorm control parameter for feature extractor - ** Globals: none - ** Operation: Allocate a new character descriptor and fill it in by - ** calling all feature extractors which are enabled. - ** Return: Structure containing features extracted from Blob. - ** Exceptions: none - ** History: Wed May 23 13:46:22 1990, DSJ, Created. 
- */ - int Type; - CHAR_DESC CharDesc; - - CharDesc = NewCharDescription(FeatureDefs); - - for (Type = 0; Type < CharDesc->NumFeatureSets; Type++) - if (FeatureDefs.FeatureExtractors[Type] != NULL && - FeatureDefs.FeatureExtractors[Type]->Extractor != NULL) { - CharDesc->FeatureSets[Type] = - (FeatureDefs.FeatureExtractors[Type])->Extractor(Blob, - bl_denorm, - cn_denorm, - fx_info); - if (CharDesc->FeatureSets[Type] == NULL) { - tprintf("Feature extractor for type %d = %s returned NULL!\n", - Type, FeatureDefs.FeatureDesc[Type]->ShortName); - FreeCharDescription(CharDesc); - return NULL; - } - } - - return (CharDesc); - -} /* ExtractFlexFeatures */ diff --git a/classify/flexfx.h b/classify/flexfx.h deleted file mode 100644 index 21c4fa2619..0000000000 --- a/classify/flexfx.h +++ /dev/null @@ -1,36 +0,0 @@ -/****************************************************************************** - ** Filename: flexfx.h - ** Purpose: Interface to flexible feature extractor. - ** Author: Dan Johnson - ** History: Wed May 23 13:36:58 1990, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -#ifndef FLEXFX_H -#define FLEXFX_H - -/**---------------------------------------------------------------------------- - Include Files and Type Defines -----------------------------------------------------------------------------**/ -#include "featdefs.h" -#include - -/**---------------------------------------------------------------------------- - Public Function Prototypes -----------------------------------------------------------------------------**/ -// As with all TBLOBs this one is also baseline normalized. -CHAR_DESC ExtractFlexFeatures(const FEATURE_DEFS_STRUCT &FeatureDefs, - TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); - -#endif diff --git a/classify/fxdefs.cpp b/classify/fxdefs.cpp deleted file mode 100644 index c4f9cd5599..0000000000 --- a/classify/fxdefs.cpp +++ /dev/null @@ -1,45 +0,0 @@ -/****************************************************************************** - ** Filename: fxdefs.c - ** Purpose: Utility functions to be used by feature extractors. - ** Author: Dan Johnson - ** History: Sun Jan 21 15:29:02 1990, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -#include "fxdefs.h" -#include "featdefs.h" -#include "mf.h" -#include "outfeat.h" -#include "picofeat.h" -#include "normfeat.h" - -/*----------------------------------------------------------------------------- - Global Data Definitions and Declarations ------------------------------------------------------------------------------*/ -// Definitions of extractors separated from feature definitions. -const FEATURE_EXT_STRUCT MicroFeatureExt = { ExtractMicros }; -const FEATURE_EXT_STRUCT CharNormExt = { ExtractCharNormFeatures }; -const FEATURE_EXT_STRUCT IntFeatExt = { ExtractIntCNFeatures }; -const FEATURE_EXT_STRUCT GeoFeatExt = { ExtractIntGeoFeatures }; - -// MUST be kept in-sync with DescDefs in featdefs.cpp. -const FEATURE_EXT_STRUCT* ExtractorDefs[NUM_FEATURE_TYPES] = { - &MicroFeatureExt, - &CharNormExt, - &IntFeatExt, - &GeoFeatExt -}; - -void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs) { - for (int i = 0; i < NUM_FEATURE_TYPES; ++i) - FeatureDefs->FeatureExtractors[i] = ExtractorDefs[i]; -} diff --git a/classify/fxdefs.h b/classify/fxdefs.h deleted file mode 100644 index 67f1b2b3e9..0000000000 --- a/classify/fxdefs.h +++ /dev/null @@ -1,25 +0,0 @@ -/****************************************************************************** - ** Filename: fxdefs.h - ** Purpose: Generic interface definitions for feature extractors - ** Author: Dan Johnson - ** History: Fri Jan 19 09:04:14 1990, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - ******************************************************************************/ -#ifndef FXDEFS_H -#define FXDEFS_H - -#include "featdefs.h" - -void SetupExtractors(FEATURE_DEFS_STRUCT *FeatureDefs); - -#endif diff --git a/classify/intfx.cpp b/classify/intfx.cpp index 496cdad2c9..12966aa195 100644 --- a/classify/intfx.cpp +++ b/classify/intfx.cpp @@ -75,9 +75,9 @@ namespace tesseract { // Generates a TrainingSample from a TBLOB. Extracts features and sets // the bounding box, so classifiers that operate on the image can work. -// TODO(rays) BlobToTrainingSample must remain a global function until -// the FlexFx and FeatureDescription code can be removed and LearnBlob -// made a member of Classify. +// TODO(rays) Make BlobToTrainingSample a member of Classify now that +// the FlexFx and FeatureDescription code have been removed and LearnBlob +// is now a member of Classify. 
TrainingSample* BlobToTrainingSample( const TBLOB& blob, bool nonlinear_norm, INT_FX_RESULT_STRUCT* fx_info, GenericVector* bl_features) { diff --git a/classify/mf.cpp b/classify/mf.cpp index ad1ba285f5..f657fb19d6 100644 --- a/classify/mf.cpp +++ b/classify/mf.cpp @@ -33,9 +33,7 @@ Private Code ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { +FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm) { /* ** Parameters: ** Blob blob to extract micro-features from @@ -54,8 +52,7 @@ FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm, FEATURE Feature; MICROFEATURE OldFeature; - OldFeatures = (MICROFEATURES)BlobMicroFeatures(Blob, bl_denorm, cn_denorm, - fx_info); + OldFeatures = BlobMicroFeatures(Blob, cn_denorm); if (OldFeatures == NULL) return NULL; NumFeatures = count (OldFeatures); diff --git a/classify/mf.h b/classify/mf.h index 716f5b8c0b..4c06a5625b 100644 --- a/classify/mf.h +++ b/classify/mf.h @@ -34,8 +34,6 @@ typedef float MicroFeature[MFCount]; /*---------------------------------------------------------------------------- Private Function Prototypes -----------------------------------------------------------------------------*/ -FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); +FEATURE_SET ExtractMicros(TBLOB* Blob, const DENORM& cn_denorm); #endif diff --git a/classify/mfdefs.h b/classify/mfdefs.h index 6202a17424..20a3e2189b 100644 --- a/classify/mfdefs.h +++ b/classify/mfdefs.h @@ -23,7 +23,6 @@ ----------------------------------------------------------------------------**/ #include "oldlist.h" #include "matchdefs.h" -#include "xform2d.h" /* definition of a list of micro-features */ typedef LIST MICROFEATURES; diff 
--git a/classify/mfx.cpp b/classify/mfx.cpp index 9f3e3d2426..d81aca0cf0 100644 --- a/classify/mfx.cpp +++ b/classify/mfx.cpp @@ -59,9 +59,7 @@ MICROFEATURE ExtractMicroFeature(MFOUTLINE Start, MFOUTLINE End); ----------------------------------------------------------------------------**/ /*---------------------------------------------------------------------------*/ -CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { +MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm) { /* ** Parameters: ** Blob blob to extract micro-features from @@ -98,7 +96,7 @@ CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm, } FreeOutlines(Outlines); } - return ((CHAR_FEATURES) MicroFeatures); + return MicroFeatures; } /* BlobMicroFeatures */ diff --git a/classify/mfx.h b/classify/mfx.h index 7e7fe1cfb6..9dcedc1a3b 100644 --- a/classify/mfx.h +++ b/classify/mfx.h @@ -21,6 +21,7 @@ /**---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------**/ +#include "mfdefs.h" #include "params.h" /**---------------------------------------------------------------------------- Variables @@ -35,8 +36,6 @@ extern double_VAR_H(classify_max_slope, 2.414213562, /**---------------------------------------------------------------------------- Public Function Prototypes ----------------------------------------------------------------------------**/ -CHAR_FEATURES BlobMicroFeatures(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); +MICROFEATURES BlobMicroFeatures(TBLOB* Blob, const DENORM& cn_denorm); #endif diff --git a/classify/normfeat.cpp b/classify/normfeat.cpp index 3f8013aa3c..7f383cb81a 100644 --- a/classify/normfeat.cpp +++ b/classify/normfeat.cpp @@ -59,9 +59,7 @@ FLOAT32 ActualOutlineLength(FEATURE 
Feature) { // the x center of the grapheme's bounding box. // English: [0.011, 0.31] // -FEATURE_SET ExtractCharNormFeatures(TBLOB *blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { +FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT& fx_info) { FEATURE_SET feature_set = NewFeatureSet(1); FEATURE feature = NewFeature(&CharNormDesc); diff --git a/classify/normfeat.h b/classify/normfeat.h index 59703a517d..1478b827d4 100644 --- a/classify/normfeat.h +++ b/classify/normfeat.h @@ -34,8 +34,6 @@ typedef enum { ----------------------------------------------------------------------------**/ FLOAT32 ActualOutlineLength(FEATURE Feature); -FEATURE_SET ExtractCharNormFeatures(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); +FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT& fx_info); #endif diff --git a/classify/ocrfeatures.cpp b/classify/ocrfeatures.cpp index 1a75648cbc..3900540988 100644 --- a/classify/ocrfeatures.cpp +++ b/classify/ocrfeatures.cpp @@ -209,55 +209,52 @@ FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { /*---------------------------------------------------------------------------*/ -void WriteFeature(FILE *File, FEATURE Feature) { /* - ** Parameters: - ** File open text file to write Feature to - ** Feature feature to write out to File - ** Globals: none - ** Operation: Write a textual representation of Feature to File. - ** This representation is simply a list of the N parameters - ** of the feature, terminated with a newline. It is assumed - ** that the ExtraPenalty field can be reconstructed from the - ** parameters of the feature. It is also assumed that the - ** feature type information is specified or assumed elsewhere. - ** Return: none - ** Exceptions: none - ** History: Wed May 23 09:28:18 1990, DSJ, Created. 
+ ** Parameters: + ** Feature: feature to write out to str + ** str: string to write Feature to + ** Operation: Appends a textual representation of Feature to str. + ** This representation is simply a list of the N parameters + ** of the feature, terminated with a newline. It is assumed + ** that the ExtraPenalty field can be reconstructed from the + ** parameters of the feature. It is also assumed that the + ** feature type information is specified or assumed elsewhere. + ** Return: none + ** Exceptions: none + ** History: Wed May 23 09:28:18 1990, DSJ, Created. */ - int i; - - for (i = 0; i < Feature->Type->NumParams; i++) { +void WriteFeature(FEATURE Feature, STRING* str) { + for (int i = 0; i < Feature->Type->NumParams; i++) { #ifndef WIN32 assert(!isnan(Feature->Params[i])); #endif - fprintf(File, " %g", Feature->Params[i]); + str->add_str_double(" ", Feature->Params[i]); } - fprintf(File, "\n"); + *str += "\n"; } /* WriteFeature */ /*---------------------------------------------------------------------------*/ -void WriteFeatureSet(FILE *File, FEATURE_SET FeatureSet) { /* - ** Parameters: - ** File open text file to write FeatureSet to - ** FeatureSet feature set to write to File - ** Globals: none - ** Operation: Write a textual representation of FeatureSet to File. - ** This representation is an integer specifying the number of - ** features in the set, followed by a newline, followed by - ** text representations for each feature in the set. - ** Return: none - ** Exceptions: none - ** History: Wed May 23 10:06:03 1990, DSJ, Created. + ** Parameters: + ** FeatureSet: feature set to write to File + ** str: string to write Feature to + ** Globals: none + ** Operation: Write a textual representation of FeatureSet to File. + ** This representation is an integer specifying the number of + ** features in the set, followed by a newline, followed by + ** text representations for each feature in the set. 
+ ** Return: none + ** Exceptions: none + ** History: Wed May 23 10:06:03 1990, DSJ, Created. */ - int i; - +void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str) { if (FeatureSet) { - fprintf (File, "%d\n", FeatureSet->NumFeatures); - for (i = 0; i < FeatureSet->NumFeatures; i++) - WriteFeature (File, FeatureSet->Features[i]); + str->add_str_int("", FeatureSet->NumFeatures); + *str += "\n"; + for (int i = 0; i < FeatureSet->NumFeatures; i++) { + WriteFeature(FeatureSet->Features[i], str); + } } } /* WriteFeatureSet */ diff --git a/classify/ocrfeatures.h b/classify/ocrfeatures.h index 734b4ff07b..7d6ba95dab 100644 --- a/classify/ocrfeatures.h +++ b/classify/ocrfeatures.h @@ -79,13 +79,6 @@ typedef FEATURE_SET_STRUCT *FEATURE_SET; // classifier does not need to know the details of this data structure. typedef char *CHAR_FEATURES; -typedef FEATURE_SET (*FX_FUNC)(TBLOB *, const DENORM&, const DENORM&, - const INT_FX_RESULT_STRUCT&); - -struct FEATURE_EXT_STRUCT { - FX_FUNC Extractor; // func to extract features -}; - /*---------------------------------------------------------------------- Macros for defining the parameters of a new features ----------------------------------------------------------------------*/ diff --git a/classify/picofeat.cpp b/classify/picofeat.cpp index aff0c24b12..fa2eedbb3a 100644 --- a/classify/picofeat.cpp +++ b/classify/picofeat.cpp @@ -223,10 +223,10 @@ void NormalizePicoX(FEATURE_SET FeatureSet) { } } /* NormalizePicoX */ +namespace tesseract { /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { +FEATURE_SET Classify::ExtractIntCNFeatures( + const TBLOB& blob, const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** blob blob to extract features from @@ -237,9 +237,8 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& bl_denorm, */ INT_FX_RESULT_STRUCT 
local_fx_info(fx_info); GenericVector bl_features; - tesseract::TrainingSample* sample = - tesseract::BlobToTrainingSample(*blob, false, &local_fx_info, - &bl_features); + tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample( + blob, false, &local_fx_info, &bl_features); if (sample == NULL) return NULL; int num_features = sample->num_features(); @@ -259,9 +258,8 @@ FEATURE_SET ExtractIntCNFeatures(TBLOB *blob, const DENORM& bl_denorm, } /* ExtractIntCNFeatures */ /*---------------------------------------------------------------------------*/ -FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info) { +FEATURE_SET Classify::ExtractIntGeoFeatures( + const TBLOB& blob, const INT_FX_RESULT_STRUCT& fx_info) { /* ** Parameters: ** blob blob to extract features from @@ -272,9 +270,8 @@ FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& bl_denorm, */ INT_FX_RESULT_STRUCT local_fx_info(fx_info); GenericVector bl_features; - tesseract::TrainingSample* sample = - tesseract::BlobToTrainingSample(*blob, false, &local_fx_info, - &bl_features); + tesseract::TrainingSample* sample = tesseract::BlobToTrainingSample( + blob, false, &local_fx_info, &bl_features); if (sample == NULL) return NULL; FEATURE_SET feature_set = NewFeatureSet(1); @@ -288,3 +285,5 @@ FEATURE_SET ExtractIntGeoFeatures(TBLOB *blob, const DENORM& bl_denorm, return feature_set; } /* ExtractIntGeoFeatures */ + +} // namespace tesseract. 
diff --git a/classify/picofeat.h b/classify/picofeat.h index ab37ba0388..208b7e7708 100644 --- a/classify/picofeat.h +++ b/classify/picofeat.h @@ -58,13 +58,6 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length"); ----------------------------------------------------------------------------**/ #define GetPicoFeatureLength() (PicoFeatureLength) -FEATURE_SET ExtractIntCNFeatures(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); -FEATURE_SET ExtractIntGeoFeatures(TBLOB *Blob, const DENORM& bl_denorm, - const DENORM& cn_denorm, - const INT_FX_RESULT_STRUCT& fx_info); - /**---------------------------------------------------------------------------- Global Data Definitions and Declarations ----------------------------------------------------------------------------**/ diff --git a/classify/xform2d.cpp b/classify/xform2d.cpp deleted file mode 100644 index 05ec126326..0000000000 --- a/classify/xform2d.cpp +++ /dev/null @@ -1,120 +0,0 @@ -/****************************************************************************** - ** Filename: xform2d.c - ** Purpose: Library routines for performing 2D point transformations - ** Author: Dan Johnson - ** History: Fri Sep 22 09:54:17 1989, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -/**---------------------------------------------------------------------------- - Include Files and Type Defines -----------------------------------------------------------------------------**/ -#include "xform2d.h" -#include - -/**---------------------------------------------------------------------------- - Public Code -----------------------------------------------------------------------------**/ - -void InitMatrix(MATRIX_2D *M) { - M->a = 1; - M->b = 0; - M->c = 0; - M->d = 1; - M->tx = 0; - M->ty = 0; -} - -void CopyMatrix(MATRIX_2D *A, MATRIX_2D *B) { - B->a = A->a; - B->b = A->b; - B->c = A->c; - B->d = A->d; - B->tx = A->tx; - B->ty = A->ty; -} - -void TranslateMatrix(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y) { - M->tx += M->a * X + M->c * Y; - M->ty += M->b * X + M->d * Y; -} - -void ScaleMatrix(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y) { - M->a *= X; - M->b *= X; - M->c *= Y; - M->d *= Y; -} - -void MirrorMatrixInX(MATRIX_2D *M) {ScaleMatrix(M, -1, 1);} -void MirrorMatrixInY(MATRIX_2D *M) {ScaleMatrix(M, 1, -1);} -void MirrorMatrixInXY(MATRIX_2D *M) {ScaleMatrix(M, -1, -1);} - -FLOAT32 MapX(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y) { - return M->a * (X) + (M)->c * (Y) + (M)->tx; -} - -FLOAT32 MapY(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y) { - return M->b * X + M->d * Y + M->ty; -} - -void MapPoint(MATRIX_2D *M, const FPOINT &A, FPOINT* B) { - B->x = MapX(M, A.x, A.y); - B->y = MapY(M, A.x, A.y); -} - -FLOAT32 MapDx(MATRIX_2D *M, FLOAT32 DX, FLOAT32 DY) { - return M->a * DX + M->c * DY; -} - -FLOAT32 MapDy(MATRIX_2D *M, FLOAT32 DX, FLOAT32 DY) { - return M->b * DX + M->d * DY; -} - - -/*---------------------------------------------------------------------------*/ -void RotateMatrix(MATRIX_2D_PTR Matrix, FLOAT32 Angle) { -/* - ** Parameters: - ** Matrix transformation matrix to rotate - ** Angle angle to rotate matrix - ** Globals: none - ** Operation: - ** Rotate the coordinate system (as 
specified by Matrix) about - ** its origin by Angle radians. In matrix notation the - ** effect is as follows: - ** - ** Matrix = R X Matrix - ** - ** where R is the following matrix - ** - ** cos Angle sin Angle 0 - ** -sin Angle cos Angle 0 - ** 0 0 1 - ** Return: none - ** Exceptions: none - ** History: 7/27/89, DSJ, Create. - */ - FLOAT32 Cos, Sin; - FLOAT32 NewA, NewB; - - Cos = cos ((double) Angle); - Sin = sin ((double) Angle); - - NewA = Matrix->a * Cos + Matrix->c * Sin; - NewB = Matrix->b * Cos + Matrix->d * Sin; - Matrix->c = Matrix->a * -Sin + Matrix->c * Cos; - Matrix->d = Matrix->b * -Sin + Matrix->d * Cos; - Matrix->a = NewA; - Matrix->b = NewB; - -} /* RotateMatrix */ diff --git a/classify/xform2d.h b/classify/xform2d.h deleted file mode 100644 index 1cd3bb1211..0000000000 --- a/classify/xform2d.h +++ /dev/null @@ -1,60 +0,0 @@ -/****************************************************************************** - ** Filename: xform2d.h - ** Purpose: Definitions for using 2D point transformation library - ** Author: Dan Johnson - ** History: Fri Sep 22 09:57:08 1989, DSJ, Created. - ** - ** (c) Copyright Hewlett-Packard Company, 1988. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- ******************************************************************************/ -#ifndef XFORM2D_H -#define XFORM2D_H - -/**---------------------------------------------------------------------------- - Include Files and Type Defines -----------------------------------------------------------------------------**/ -#include "fpoint.h" - -typedef struct -{ - FLOAT32 a, b, c, d, tx, ty; -} - - -MATRIX_2D, *MATRIX_2D_PTR; - -/**---------------------------------------------------------------------------- - Public Function Prototypes -----------------------------------------------------------------------------**/ - -void InitMatrix(MATRIX_2D *M); -void CopyMatrix(MATRIX_2D *A, MATRIX_2D *B); - -/* matrix scaling, translation, rotation, mirroring, etc.*/ -void TranslateMatrix(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y); -void ScaleMatrix(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y); - -void MirrorMatrixInX(MATRIX_2D *M); -void MirrorMatrixInY(MATRIX_2D *M); -void MirrorMatrixInXY(MATRIX_2D *M); - -/* using a matrix to map points*/ -FLOAT32 MapX(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y); - -FLOAT32 MapY(MATRIX_2D *M, FLOAT32 X, FLOAT32 Y); - -void MapPoint(MATRIX_2D *M, const FPOINT &A, FPOINT* B); - -FLOAT32 MapDx(MATRIX_2D *M, FLOAT32 DX, FLOAT32 DY); -FLOAT32 MapDy(MATRIX_2D M, FLOAT32 DX, FLOAT32 DY); - - void RotateMatrix(MATRIX_2D_PTR Matrix, FLOAT32 Angle); -#endif diff --git a/wordrec/tface.cpp b/wordrec/tface.cpp index 383505bfcd..e21fcb8829 100644 --- a/wordrec/tface.cpp +++ b/wordrec/tface.cpp @@ -21,7 +21,6 @@ #include "chop.h" #include "chopper.h" #include "danerror.h" -#include "fxdefs.h" #include "globals.h" #include "gradechop.h" #include "pageres.h" @@ -49,7 +48,6 @@ void Wordrec::program_editup(const char *textbase, bool init_dict) { if (textbase != NULL) imagefile = textbase; InitFeatureDefs(&feature_defs_); - SetupExtractors(&feature_defs_); InitAdaptiveClassifier(init_classifier); if (init_dict) getDict().Load(Dict::GlobalDawgCache()); pass2_ok_split = chop_ok_split; From 
2eec97957768aa892fe71c6021655a48a9c7cb3e Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 15:25:00 -0700 Subject: [PATCH 04/15] Makefile.am for fix to issue 1252 --- classify/Makefile.am | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/classify/Makefile.am b/classify/Makefile.am index 59450fa158..f620e3bfff 100644 --- a/classify/Makefile.am +++ b/classify/Makefile.am @@ -11,15 +11,15 @@ endif noinst_HEADERS = \ adaptive.h blobclass.h \ classify.h cluster.h clusttool.h cutoffs.h \ - errorcounter.h extern.h extract.h \ - featdefs.h flexfx.h float2int.h fpoint.h fxdefs.h \ + errorcounter.h \ + featdefs.h float2int.h fpoint.h \ intfeaturedist.h intfeaturemap.h intfeaturespace.h \ intfx.h intmatcher.h intproto.h kdtree.h \ mastertrainer.h mf.h mfdefs.h mfoutline.h mfx.h \ normfeat.h normmatch.h \ ocrfeatures.h outfeat.h picofeat.h protos.h \ sampleiterator.h shapeclassifier.h shapetable.h \ - tessclassifier.h trainingsample.h trainingsampleset.h xform2d.h + tessclassifier.h trainingsample.h trainingsampleset.h if !USING_MULTIPLELIBS noinst_LTLIBRARIES = libtesseract_classify.la @@ -37,14 +37,14 @@ endif libtesseract_classify_la_SOURCES = \ adaptive.cpp adaptmatch.cpp blobclass.cpp \ classify.cpp cluster.cpp clusttool.cpp cutoffs.cpp \ - errorcounter.cpp extract.cpp \ - featdefs.cpp flexfx.cpp float2int.cpp fpoint.cpp fxdefs.cpp \ + errorcounter.cpp \ + featdefs.cpp float2int.cpp fpoint.cpp \ intfeaturedist.cpp intfeaturemap.cpp intfeaturespace.cpp \ intfx.cpp intmatcher.cpp intproto.cpp kdtree.cpp \ mastertrainer.cpp mf.cpp mfdefs.cpp mfoutline.cpp mfx.cpp \ normfeat.cpp normmatch.cpp \ ocrfeatures.cpp outfeat.cpp picofeat.cpp protos.cpp \ sampleiterator.cpp shapeclassifier.cpp shapetable.cpp \ - tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp xform2d.cpp + tessclassifier.cpp trainingsample.cpp trainingsampleset.cpp From 4a3caefd92e2a15a384249735f09a3cd58bdb0e7 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 
12 May 2015 15:41:15 -0700 Subject: [PATCH 05/15] Add ability to build under android (without cube or scrollview). --- android/AndroidManifest.xml | 4 +++ android/Makefile.am | 1 + android/jni/Android.mk | 57 +++++++++++++++++++++++++++++++++++++ android/jni/Application.mk | 13 +++++++++ api/baseapi.cpp | 14 +++++++++ ccmain/control.cpp | 4 +++ ccmain/paramsd.h | 2 ++ ccmain/tessedit.cpp | 9 ++++-- ccmain/tesseractclass.cpp | 8 ++++++ ccmain/tesseractclass.h | 14 +++++++-- ccstruct/imagedata.cpp | 4 +++ textord/drawedg.h | 2 ++ 12 files changed, 127 insertions(+), 5 deletions(-) create mode 100644 android/AndroidManifest.xml create mode 100644 android/Makefile.am create mode 100644 android/jni/Android.mk create mode 100644 android/jni/Application.mk diff --git a/android/AndroidManifest.xml b/android/AndroidManifest.xml new file mode 100644 index 0000000000..d5bf0998fd --- /dev/null +++ b/android/AndroidManifest.xml @@ -0,0 +1,4 @@ + \ No newline at end of file diff --git a/android/Makefile.am b/android/Makefile.am new file mode 100644 index 0000000000..9b822f6b9a --- /dev/null +++ b/android/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = AndroidManifest.xml jni/Android.mk jni/Application.mk diff --git a/android/jni/Android.mk b/android/jni/Android.mk new file mode 100644 index 0000000000..d8f557e6a1 --- /dev/null +++ b/android/jni/Android.mk @@ -0,0 +1,57 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +LOCAL_MODULE := tesseract-$(APP_ABI) + +LOCAL_STATIC_LIBRARIES := \ + mobile_base \ + leptonica-$(APP_ABI) + +LOCAL_C_INCLUDES := $(APP_C_INCLUDES) + +LOCAL_C_INCLUDES += \ + $(LOCAL_PATH)/../../api \ + $(LOCAL_PATH)/../../ccmain\ + $(LOCAL_PATH)/../../ccstruct\ + $(LOCAL_PATH)/../../ccutil\ + $(LOCAL_PATH)/../../classify\ + $(LOCAL_PATH)/../../cutil\ + $(LOCAL_PATH)/../../dict\ + $(LOCAL_PATH)/../../image\ + $(LOCAL_PATH)/../../textord\ + $(LOCAL_PATH)/../../third_party\ + $(LOCAL_PATH)/../../wordrec\ + $(LOCAL_PATH)/../../opencl\ + $(LOCAL_PATH)/../../viewer\ + 
$(LOCAL_PATH)/../../../leptonica/include + +$(info local c includes=$(LOCAL_C_INCLUDES)) +$(info local path=$(LOCAL_PATH)) +LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../ccmain/*.cpp $(LOCAL_PATH)/../../ccstruct/*.cpp $(LOCAL_PATH)/../../ccutil/*.cpp $(LOCAL_PATH)/../../classify/*.cpp $(LOCAL_PATH)/../../cutil/*.cpp $(LOCAL_PATH)/../../dict/*.cpp $(LOCAL_PATH)/../../image/*.cpp $(LOCAL_PATH)/../../textord/*.cpp $(LOCAL_PATH)/../../viewer/*.cpp $(LOCAL_PATH)/../../wordrec/*.cpp) + +EXPLICIT_SRC_EXCLUDES := \ + $(LOCAL_PATH)/../../ccmain/cubeclassifier.cpp \ + $(LOCAL_PATH)/../../ccmain/cubeclassifier.h \ + $(LOCAL_PATH)/../../ccmain/cube_control.cpp \ + $(LOCAL_PATH)/../../ccmain/cube_reco_context.cpp \ + $(LOCAL_PATH)/../../ccmain/cube_reco_context.h \ + $(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.cpp \ + $(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.h \ + $(LOCAL_PATH)/../../api/pdfrenderer.cpp \ + $(LOCAL_PATH)/../../api/tesseractmain.cpp \ + +LOCAL_SRC_FILES := $(filter-out $(EXPLICIT_SRC_EXCLUDES), $(LOCAL_SRC_FILES)) + +LOCAL_SRC_FILES := $(LOCAL_SRC_FILES:$(LOCAL_PATH)/%=%) + +$(info local src files = $(LOCAL_SRC_FILES)) + +LOCAL_LDLIBS := -ldl -llog -ljnigraphics +LOCAL_CFLAGS := -DANDROID_BUILD -DGRAPHICS_DISABLED + +include $(BUILD_SHARED_LIBRARY) + +$(call import-module,mobile/base) +$(call import-module,mobile/base) +$(call import-module,mobile/util/hash) +$(call import-module,third_party/leptonica/android/jni) diff --git a/android/jni/Application.mk b/android/jni/Application.mk new file mode 100644 index 0000000000..ef8a2153f5 --- /dev/null +++ b/android/jni/Application.mk @@ -0,0 +1,13 @@ +# Include common.mk for building google3 native code. 
+DEPOT_PATH := $(firstword $(subst /google3, ,$(abspath $(call my-dir)))) +ifneq ($(wildcard $(DEPOT_PATH)/google3/mobile/build/common.mk),) + include $(DEPOT_PATH)/google3/mobile/build/common.mk +else + include $(DEPOT_PATH)/READONLY/google3/mobile/build/common.mk +endif + +# Specify the hash namespace that we're using, based on the APP_STL we're using. +APP_CFLAGS += -Werror -DHASH_NAMESPACE=__gnu_cxx -Wno-error=deprecated-register +APP_PLATFORM := android-16 +APP_STL := gnustl_static +NDK_TOOLCHAIN_VERSION := clang diff --git a/api/baseapi.cpp b/api/baseapi.cpp index f502d24f5e..26398491f4 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -742,6 +742,7 @@ void TessBaseAPI::DumpPGM(const char* filename) { fclose(fp); } +#ifndef ANDROID_BUILD /** * Placeholder for call to Cube and test that the input data is correct. * reskew is the direction of baselines in the skewed image in @@ -786,6 +787,7 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks, ASSERT_HOST(pr_word == word_count); return 0; } +#endif /** * Runs page layout analysis in the mode set by SetPageSegMode. 
@@ -1022,6 +1024,7 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, int timeout_millisec, TessResultRenderer* renderer, int tessedit_page_number) { +#ifndef ANDROID_BUILD Pix *pix = NULL; #ifdef USE_OPENCL OpenclDevice od; @@ -1052,6 +1055,9 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, if (tessedit_page_number >= 0) break; } return true; +#else + return false; +#endif } // Master ProcessPages calls ProcessPagesInternal and then does any post- @@ -1087,6 +1093,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, const char* retry_config, int timeout_millisec, TessResultRenderer* renderer) { +#ifndef ANDROID_BUILD PERF_COUNT_START("ProcessPages") bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); if (stdInput) { @@ -1174,6 +1181,9 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, } PERF_COUNT_END return true; +#else + return false; +#endif } bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, @@ -1207,8 +1217,10 @@ bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, failed = Recognize(NULL) < 0; } if (tesseract_->tessedit_write_images) { +#ifndef ANDROID_BUILD Pix* page_pix = GetThresholdedImage(); pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4); +#endif } if (failed && retry_config != NULL && retry_config[0] != '\0') { // Save current config variables before switching modes. @@ -2613,10 +2625,12 @@ int TessBaseAPI::NumDawgs() const { return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); } +#ifndef ANDROID_BUILD /** Return a pointer to underlying CubeRecoContext object if present. */ CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const { return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext(); } +#endif /** Escape a char string - remove <>&"' with HTML codes. 
*/ STRING HOcrEscape(const char* text) { diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 0dbc3fa3d0..28f6c82488 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -387,12 +387,14 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res, // ****************** Pass 5,6 ******************* rejection_passes(page_res, monitor, target_word_box, word_config); +#ifndef ANDROID_BUILD // ****************** Pass 7 ******************* // Cube combiner. // If cube is loaded and its combiner is present, run it. if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { run_cube_combiner(page_res); } +#endif // ****************** Pass 8 ******************* font_recognition_pass(page_res); @@ -986,11 +988,13 @@ void Tesseract::classify_word_pass1(const WordData& word_data, BLOCK* block = word_data.block; prev_word_best_choice_ = word_data.prev_word != NULL ? word_data.prev_word->word->best_choice : NULL; +#ifndef ANDROID_BUILD // If we only intend to run cube - run it and return. if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { cube_word_pass1(block, row, *in_word); return; } +#endif WERD_RES* word = *in_word; match_word_pass_n(1, word, row, block); if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) { diff --git a/ccmain/paramsd.h b/ccmain/paramsd.h index 164b146371..0214652e5b 100644 --- a/ccmain/paramsd.h +++ b/ccmain/paramsd.h @@ -24,7 +24,9 @@ #define VARABLED_H #include "elst.h" +#ifndef ANDROID_BUILD #include "scrollview.h" +#endif #include "params.h" #include "tesseractclass.h" diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp index e85660507e..f5fb037904 100644 --- a/ccmain/tessedit.cpp +++ b/ccmain/tessedit.cpp @@ -194,7 +194,11 @@ bool Tesseract::init_tesseract_lang_data( if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); } - // Load Cube objects if necessary. + // The various OcrEngineMode settings (see publictypes.h) determine which + // engine-specific data files need to be loaded. 
Currently everything needs + // the base tesseract data, which supplies other useful information, but + // alternative engines, such as cube and LSTM are optional. +#ifndef ANDROID_BUILD if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); if (tessdata_manager_debug_level) @@ -204,7 +208,7 @@ bool Tesseract::init_tesseract_lang_data( if (tessdata_manager_debug_level) tprintf("Loaded Cube with combiner\n"); } - +#endif // Init ParamsModel. // Load pass1 and pass2 weights (for now these two sets are the same, but in // the future separate sets of weights can be generated). @@ -475,5 +479,4 @@ enum CMD_EVENTS RECOG_PSEUDO, ACTION_2_CMD_EVENT }; - } // namespace tesseract diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index e50699b541..c262bbc95e 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -37,11 +37,15 @@ #include "tesseractclass.h" #include "allheaders.h" +#ifndef ANDROID_BUILD #include "cube_reco_context.h" +#endif #include "edgblob.h" #include "equationdetect.h" #include "globals.h" +#ifndef ANDROID_BUILD #include "tesseract_cube_combiner.h" +#endif // Include automatically generated configuration file if running autoconf. #ifdef HAVE_CONFIG_H @@ -547,8 +551,10 @@ Tesseract::Tesseract() reskew_(1.0f, 0.0f), most_recently_used_(this), font_table_size_(0), +#ifndef ANDROID_BUILD cube_cntxt_(NULL), tess_cube_combiner_(NULL), +#endif equ_detect_(NULL) { } @@ -556,6 +562,7 @@ Tesseract::~Tesseract() { Clear(); end_tesseract(); sub_langs_.delete_data_pointers(); +#ifndef ANDROID_BUILD // Delete cube objects. 
if (cube_cntxt_ != NULL) { delete cube_cntxt_; @@ -565,6 +572,7 @@ Tesseract::~Tesseract() { delete tess_cube_combiner_; tess_cube_combiner_ = NULL; } +#endif } void Tesseract::Clear() { diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index c3fe50e07b..087e995e15 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -97,12 +97,16 @@ class WERD_RES; namespace tesseract { class ColumnFinder; +#ifndef ANDROID_BUILD class CubeLineObject; class CubeObject; class CubeRecoContext; +#endif class EquationDetect; class Tesseract; +#ifndef ANDROID_BUILD class TesseractCubeCombiner; +#endif // A collection of various variables for statistics and debugging. struct TesseractStats { @@ -382,6 +386,7 @@ class Tesseract : public Wordrec { int *right_ok) const; //// cube_control.cpp /////////////////////////////////////////////////// +#ifndef ANDROID_BUILD bool init_cube_objects(bool load_combiner, TessdataManager *tessdata_manager); // Iterates through tesseract's results and calls cube on each word, @@ -407,6 +412,7 @@ class Tesseract : public Wordrec { Boxa** char_boxes, CharSamp*** char_samples); bool create_cube_box_word(Boxa *char_boxes, int num_chars, TBOX word_box, BoxWord* box_word); +#endif //// output.h ////////////////////////////////////////////////////////// void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); @@ -713,8 +719,8 @@ class Tesseract : public Wordrec { // Creates a fake best_choice entry in each WERD_RES with the correct text. void CorrectClassifyWords(PAGE_RES* page_res); // Call LearnWord to extract features for labelled blobs within each word. - // Features are written to the given filename. - void ApplyBoxTraining(const STRING& filename, PAGE_RES* page_res); + // Features are stored in an internal buffer. + void ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res); //// fixxht.cpp /////////////////////////////////////////////////////// // Returns the number of misfit blob tops in this word. 
@@ -1089,7 +1095,9 @@ class Tesseract : public Wordrec { PAGE_RES_IT* pr_it, FILE *output_file); +#ifndef ANDROID_BUILD inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } +#endif private: // The filename of a backup config file. If not null, then we currently @@ -1129,9 +1137,11 @@ class Tesseract : public Wordrec { Tesseract* most_recently_used_; // The size of the font table, ie max possible font id + 1. int font_table_size_; +#ifndef ANDROID_BUILD // Cube objects. CubeRecoContext* cube_cntxt_; TesseractCubeCombiner *tess_cube_combiner_; +#endif // Equation detector. Note: this pointer is NOT owned by the class. EquationDetect* equ_detect_; }; diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp index 0186cd0948..4016a92b5b 100644 --- a/ccstruct/imagedata.cpp +++ b/ccstruct/imagedata.cpp @@ -51,6 +51,7 @@ void WordFeature::ComputeSize(const GenericVector& features, // Draws the features in the given window. void WordFeature::Draw(const GenericVector& features, ScrollView* window) { +#ifndef GRAPHICS_DISABLED for (int f = 0; f < features.size(); ++f) { FCOORD pos(features[f].x_, features[f].y_); FCOORD dir; @@ -61,6 +62,7 @@ void WordFeature::Draw(const GenericVector& features, window->DrawTo(IntCastRounded(pos.x() + dir.x()), IntCastRounded(pos.y() + dir.y())); } +#endif } // Writes to the given file. Returns false in case of error. @@ -244,6 +246,7 @@ int ImageData::MemoryUsed() const { // Draws the data in a new window. void ImageData::Display() const { +#ifndef GRAPHICS_DISABLED const int kTextSize = 64; // Draw the image. 
Pix* pix = GetPix(); @@ -274,6 +277,7 @@ void ImageData::Display() const { win->Pen(ScrollView::GREEN); win->Update(); window_wait(win); +#endif } // Adds the supplied boxes and transcriptions that correspond to the correct diff --git a/textord/drawedg.h b/textord/drawedg.h index 339432fd13..0d4903ba19 100644 --- a/textord/drawedg.h +++ b/textord/drawedg.h @@ -19,6 +19,7 @@ #ifndef DRAWEDG_H #define DRAWEDG_H +#ifndef ANDROID_BUILD #include "scrollview.h" #include "crakedge.h" @@ -32,3 +33,4 @@ void draw_raw_edge( //draw the cracks ScrollView::Color colour //colour to draw in ); #endif +#endif From b6d0184806c1f0c5f8b434657465861752f862b0 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 15:53:45 -0700 Subject: [PATCH 06/15] Fixed problems with shifted baselines so recognition can recover from layout analysis errors. --- ccmain/control.cpp | 108 +++++++++++++++++++---------- ccmain/fixxht.cpp | 149 ++++++++++++++++++++++++++-------------- ccmain/tesseractclass.h | 7 +- ccstruct/blobs.cpp | 8 +-- ccstruct/blobs.h | 2 +- ccstruct/normalis.cpp | 2 +- ccstruct/pageres.cpp | 6 +- ccstruct/pageres.h | 1 + 8 files changed, 186 insertions(+), 97 deletions(-) diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 28f6c82488..a765a97c8a 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1044,45 +1044,77 @@ bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) { int original_misfits = CountMisfitTops(word); if (original_misfits == 0) return false; - float new_x_ht = ComputeCompatibleXheight(word); - if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { - WERD_RES new_x_ht_word(word->word); - if (word->blamer_bundle != NULL) { - new_x_ht_word.blamer_bundle = new BlamerBundle(); - new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); - } - new_x_ht_word.x_height = new_x_ht; - new_x_ht_word.caps_height = 0.0; - new_x_ht_word.SetupForRecognition( - unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, - 
classify_bln_numeric_mode, textord_use_cjk_fp_model, - poly_allow_detailed_fx, row, block); - match_word_pass_n(2, &new_x_ht_word, row, block); - if (!new_x_ht_word.tess_failed) { - int new_misfits = CountMisfitTops(&new_x_ht_word); - if (debug_x_ht_level >= 1) { - tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", - original_misfits, word->x_height, - new_misfits, new_x_ht); - tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", - word->best_choice->rating(), word->best_choice->certainty(), - new_x_ht_word.best_choice->rating(), - new_x_ht_word.best_choice->certainty()); - } - // The misfits must improve and either the rating or certainty. - accept_new_x_ht = new_misfits < original_misfits && - (new_x_ht_word.best_choice->certainty() > - word->best_choice->certainty() || - new_x_ht_word.best_choice->rating() < - word->best_choice->rating()); - if (debug_x_ht_level >= 1) { - ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); + float baseline_shift = 0.0f; + float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift); + if (baseline_shift != 0.0f) { + // Try the shift on its own first. + if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height, + word, block, row)) + return false; + original_misfits = CountMisfitTops(word); + if (original_misfits > 0) { + float new_baseline_shift; + // Now recompute the new x_height. + new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift); + if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { + // No test of return value here, as we are definitely making a change + // to the word by shifting the baseline. 
+ TestNewNormalization(original_misfits, baseline_shift, new_x_ht, + word, block, row); } } - if (accept_new_x_ht) { - word->ConsumeWordResults(&new_x_ht_word); - return true; + return true; + } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) { + return TestNewNormalization(original_misfits, 0.0f, new_x_ht, + word, block, row); + } else { + return false; + } +} + +// Runs recognition with the test baseline shift and x-height and returns true +// if there was an improvement in recognition result. +bool Tesseract::TestNewNormalization(int original_misfits, + float baseline_shift, float new_x_ht, + WERD_RES *word, BLOCK* block, ROW *row) { + bool accept_new_x_ht = false; + WERD_RES new_x_ht_word(word->word); + if (word->blamer_bundle != NULL) { + new_x_ht_word.blamer_bundle = new BlamerBundle(); + new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle)); + } + new_x_ht_word.x_height = new_x_ht; + new_x_ht_word.baseline_shift = baseline_shift; + new_x_ht_word.caps_height = 0.0; + new_x_ht_word.SetupForRecognition( + unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL, + classify_bln_numeric_mode, textord_use_cjk_fp_model, + poly_allow_detailed_fx, row, block); + match_word_pass_n(2, &new_x_ht_word, row, block); + if (!new_x_ht_word.tess_failed) { + int new_misfits = CountMisfitTops(&new_x_ht_word); + if (debug_x_ht_level >= 1) { + tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n", + original_misfits, word->x_height, + new_misfits, new_x_ht); + tprintf("Old rating= %f, certainty=%f, new=%f, %f\n", + word->best_choice->rating(), word->best_choice->certainty(), + new_x_ht_word.best_choice->rating(), + new_x_ht_word.best_choice->certainty()); } + // The misfits must improve and either the rating or certainty. 
+ accept_new_x_ht = new_misfits < original_misfits && + (new_x_ht_word.best_choice->certainty() > + word->best_choice->certainty() || + new_x_ht_word.best_choice->rating() < + word->best_choice->rating()); + if (debug_x_ht_level >= 1) { + ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word); + } + } + if (accept_new_x_ht) { + word->ConsumeWordResults(&new_x_ht_word); + return true; } return false; } @@ -1380,13 +1412,13 @@ BOOL8 Tesseract::check_debug_pt(WERD_RES *word, int location) { return FALSE; tessedit_rejection_debug.set_value (FALSE); - debug_x_ht_level.set_value (0); + debug_x_ht_level.set_value(0); if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) { if (location < 0) return TRUE; // For breakpoint use tessedit_rejection_debug.set_value (TRUE); - debug_x_ht_level.set_value (20); + debug_x_ht_level.set_value(2); tprintf ("\n\nTESTWD::"); switch (location) { case 0: diff --git a/ccmain/fixxht.cpp b/ccmain/fixxht.cpp index b82f0ca503..c066f80e27 100644 --- a/ccmain/fixxht.cpp +++ b/ccmain/fixxht.cpp @@ -35,6 +35,8 @@ namespace tesseract { // guessed that the blob tops are caps and will have placed the xheight too low. // 3. Noise/logos beside words, or changes in font size on a line. Such // things can blow the statistics and cause an incorrect estimate. +// 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged. +// In this case the x-height is often still correct. // // Algorithm. // Compare the vertical position (top only) of alphnumerics in a word with @@ -54,6 +56,10 @@ namespace tesseract { // even if the x-height is incorrect. This is not a terrible assumption, but // it is not great. An improvement would be to use a classifier that does // not care about vertical position or scaling at all. +// Separately collect stats on shifted baselines and apply the same logic to +// computing a best-fit shift to fix the error. 
If the baseline needs to be +// shifted, but the x-height is OK, returns the original x-height along with +// the baseline shift to indicate that recognition needs to re-run. // If the max-min top of a unicharset char is bigger than kMaxCharTopRange // then the char top cannot be used to judge misfits or suggest a new top. @@ -92,65 +98,108 @@ int Tesseract::CountMisfitTops(WERD_RES *word_res) { // Returns a new x-height maximally compatible with the result in word_res. // See comment above for overall algorithm. -float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) { +float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res, + float* baseline_shift) { STATS top_stats(0, MAX_UINT8); + STATS shift_stats(-MAX_UINT8, MAX_UINT8); + int bottom_shift = 0; int num_blobs = word_res->rebuild_word->NumBlobs(); - for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { - TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; - UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); - if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) { - int top = blob->bounding_box().top(); - // Clip the top to the limit of normalized feature space. - if (top >= INT_FEAT_RANGE) - top = INT_FEAT_RANGE - 1; - int bottom = blob->bounding_box().bottom(); - int min_bottom, max_bottom, min_top, max_top; - unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, - &min_top, &max_top); - // Chars with a wild top range would mess up the result so ignore them. 
- if (max_top - min_top > kMaxCharTopRange) - continue; - int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, - top - (max_top + x_ht_acceptance_tolerance)); - int height = top - kBlnBaselineOffset; - if (debug_x_ht_level >= 20) { - tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d : ", - unicharset.id_to_unichar(class_id), - height, min_bottom, max_bottom, min_top, max_top, - bottom, top); - } - // Use only chars that fit in the expected bottom range, and where - // the range of tops is sensibly near the xheight. - if (min_bottom <= bottom + x_ht_acceptance_tolerance && - bottom - x_ht_acceptance_tolerance <= max_bottom && - min_top > kBlnBaselineOffset && - max_top - kBlnBaselineOffset >= kBlnXHeight && - misfit_dist > 0) { - // Compute the x-height position using proportionality between the - // actual height and expected height. - int min_xht = DivRounded(height * kBlnXHeight, - max_top - kBlnBaselineOffset); - int max_xht = DivRounded(height * kBlnXHeight, - min_top - kBlnBaselineOffset); - if (debug_x_ht_level >= 20) { - tprintf(" xht range min=%d, max=%d\n", - min_xht, max_xht); + do { + top_stats.clear(); + shift_stats.clear(); + for (int blob_id = 0; blob_id < num_blobs; ++blob_id) { + TBLOB* blob = word_res->rebuild_word->blobs[blob_id]; + UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id); + if (unicharset.get_isalpha(class_id) || + unicharset.get_isdigit(class_id)) { + int top = blob->bounding_box().top() + bottom_shift; + // Clip the top to the limit of normalized feature space. + if (top >= INT_FEAT_RANGE) + top = INT_FEAT_RANGE - 1; + int bottom = blob->bounding_box().bottom() + bottom_shift; + int min_bottom, max_bottom, min_top, max_top; + unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom, + &min_top, &max_top); + // Chars with a wild top range would mess up the result so ignore them. 
+ if (max_top - min_top > kMaxCharTopRange) + continue; + int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top, + top - (max_top + x_ht_acceptance_tolerance)); + int height = top - kBlnBaselineOffset; + if (debug_x_ht_level >= 2) { + tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ", + unicharset.id_to_unichar(class_id), + height, min_bottom, max_bottom, min_top, max_top, + bottom, top); + } + // Use only chars that fit in the expected bottom range, and where + // the range of tops is sensibly near the xheight. + if (min_bottom <= bottom + x_ht_acceptance_tolerance && + bottom - x_ht_acceptance_tolerance <= max_bottom && + min_top > kBlnBaselineOffset && + max_top - kBlnBaselineOffset >= kBlnXHeight && + misfit_dist > 0) { + // Compute the x-height position using proportionality between the + // actual height and expected height. + int min_xht = DivRounded(height * kBlnXHeight, + max_top - kBlnBaselineOffset); + int max_xht = DivRounded(height * kBlnXHeight, + min_top - kBlnBaselineOffset); + if (debug_x_ht_level >= 2) { + tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht); + } + // The range of expected heights gets a vote equal to the distance + // of the actual top from the expected top. + for (int y = min_xht; y <= max_xht; ++y) + top_stats.add(y, misfit_dist); + } else if ((min_bottom > bottom + x_ht_acceptance_tolerance || + bottom - x_ht_acceptance_tolerance > max_bottom) && + bottom_shift == 0) { + // Get the range of required bottom shift. + int min_shift = min_bottom - bottom; + int max_shift = max_bottom - bottom; + if (debug_x_ht_level >= 2) { + tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift); + } + // The range of expected shifts gets a vote equal to the min distance + // of the actual bottom from the expected bottom, spread over the + // range of its acceptance. 
+ int misfit_weight = abs(min_shift); + if (max_shift > min_shift) + misfit_weight /= max_shift - min_shift; + for (int y = min_shift; y <= max_shift; ++y) + shift_stats.add(y, misfit_weight); + } else { + if (bottom_shift == 0) { + // Things with bottoms that are already ok need to say so, on the + // 1st iteration only. + shift_stats.add(0, kBlnBaselineOffset); + } + if (debug_x_ht_level >= 2) { + tprintf(" already OK\n"); + } } - // The range of expected heights gets a vote equal to the distance - // of the actual top from the expected top. - for (int y = min_xht; y <= max_xht; ++y) - top_stats.add(y, misfit_dist); - } else if (debug_x_ht_level >= 20) { - tprintf(" already OK\n"); } } + if (shift_stats.get_total() > top_stats.get_total()) { + bottom_shift = IntCastRounded(shift_stats.median()); + if (debug_x_ht_level >= 2) { + tprintf("Applying bottom shift=%d\n", bottom_shift); + } + } + } while (bottom_shift != 0 && + top_stats.get_total() < shift_stats.get_total()); + // Baseline shift is opposite sign to the bottom shift. + *baseline_shift = -bottom_shift / word_res->denorm.y_scale(); + if (debug_x_ht_level >= 2) { + tprintf("baseline shift=%g\n", *baseline_shift); } if (top_stats.get_total() == 0) - return 0.0f; + return bottom_shift != 0 ? word_res->x_height : 0.0f; // The new xheight is just the median vote, which is then scaled out // of BLN space back to pixel space to get the x-height in pixel space. float new_xht = top_stats.median(); - if (debug_x_ht_level >= 20) { + if (debug_x_ht_level >= 2) { tprintf("Median xht=%f\n", new_xht); tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n", new_xht, new_xht / word_res->denorm.y_scale()); @@ -159,7 +208,7 @@ float Tesseract::ComputeCompatibleXheight(WERD_RES *word_res) { if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change) return new_xht / word_res->denorm.y_scale(); else - return 0.0f; + return bottom_shift != 0 ? 
word_res->x_height : 0.0f; } } // namespace tesseract diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 087e995e15..bd03fff642 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -350,6 +350,11 @@ class Tesseract : public Wordrec { WERD_RES* word, WERD_RES* new_word); bool RunOldFixXht(WERD_RES *word, BLOCK* block, ROW *row); bool TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row); + // Runs recognition with the test baseline shift and x-height and returns true + // if there was an improvement in recognition result. + bool TestNewNormalization(int original_misfits, float baseline_shift, + float new_x_ht, WERD_RES *word, BLOCK* block, + ROW *row); BOOL8 recog_interactive(PAGE_RES_IT* pr_it); // Set fonts of this word. @@ -729,7 +734,7 @@ class Tesseract : public Wordrec { // maximally compatible with the result in word_res. // Returns 0.0f if no x-height is found that is better than the current // estimate. - float ComputeCompatibleXheight(WERD_RES *word_res); + float ComputeCompatibleXheight(WERD_RES *word_res, float* baseline_shift); //// Data members /////////////////////////////////////////////////////// // TODO(ocr-team): Find and remove obsolete parameters. BOOL_VAR_H(tessedit_resegment_from_boxes, false, diff --git a/ccstruct/blobs.cpp b/ccstruct/blobs.cpp index 97f95eba2a..ad4994079e 100644 --- a/ccstruct/blobs.cpp +++ b/ccstruct/blobs.cpp @@ -805,8 +805,8 @@ TWERD* TWERD::PolygonalCopy(bool allow_detailed_fx, WERD* src) { // Baseline normalizes the blobs in-place, recording the normalization in the // DENORMs in the blobs. 
void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, - bool inverse, float x_height, bool numeric_mode, - tesseract::OcrEngineMode hint, + bool inverse, float x_height, float baseline_shift, + bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX* norm_box, DENORM* word_denorm) { TBOX word_box = bounding_box(); @@ -822,7 +822,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, if (hint == tesseract::OEM_CUBE_ONLY) scale = 1.0f; } else { - input_y_offset = row->base_line(word_middle); + input_y_offset = row->base_line(word_middle) + baseline_shift; } for (int b = 0; b < blobs.size(); ++b) { TBLOB* blob = blobs[b]; @@ -835,7 +835,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), scale, scale * 1.5f); } else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) { - baseline = row->base_line(mid_x); + baseline = row->base_line(mid_x) + baseline_shift; } // The image will be 8-bit grey if the input was grey or color. Note that in // a grey image 0 is black and 255 is white. If the input was binary, then diff --git a/ccstruct/blobs.h b/ccstruct/blobs.h index 1fd9683ef9..c7118a1774 100644 --- a/ccstruct/blobs.h +++ b/ccstruct/blobs.h @@ -410,7 +410,7 @@ struct TWERD { // Baseline normalizes the blobs in-place, recording the normalization in the // DENORMs in the blobs. 
void BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, bool inverse, - float x_height, bool numeric_mode, + float x_height, float baseline_shift, bool numeric_mode, tesseract::OcrEngineMode hint, const TBOX* norm_box, DENORM* word_denorm); diff --git a/ccstruct/normalis.cpp b/ccstruct/normalis.cpp index 638be619ea..d43a1459cb 100644 --- a/ccstruct/normalis.cpp +++ b/ccstruct/normalis.cpp @@ -487,7 +487,7 @@ void DENORM::XHeightRange(int unichar_id, const UNICHARSET& unicharset, top > kBlnCellHeight - kBlnBaselineOffset / 2) max_top += kBlnBaselineOffset; top -= bln_yshift; - int height = top - kBlnBaselineOffset - bottom_shift; + int height = top - kBlnBaselineOffset; double min_height = min_top - kBlnBaselineOffset - tolerance; double max_height = max_top - kBlnBaselineOffset + tolerance; diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 58f7d8a838..5304451929 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -252,6 +252,7 @@ void WERD_RES::CopySimpleFields(const WERD_RES& source) { fontinfo_id2_count = source.fontinfo_id2_count; x_height = source.x_height; caps_height = source.caps_height; + baseline_shift = source.baseline_shift; guessed_x_ht = source.guessed_x_ht; guessed_caps_ht = source.guessed_caps_ht; reject_spaces = source.reject_spaces; @@ -314,8 +315,8 @@ bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in, float word_xheight = use_body_size && row != NULL && row->body_size() > 0.0f ? 
row->body_size() : x_height; chopped_word->BLNormalize(block, row, pix, word->flag(W_INVERSE), - word_xheight, numeric_mode, norm_mode_hint, - norm_box, &denorm); + word_xheight, baseline_shift, numeric_mode, + norm_mode_hint, norm_box, &denorm); blob_row = row; SetupBasicsFromChoppedWord(unicharset_in); SetupBlamerBundle(); @@ -1100,6 +1101,7 @@ void WERD_RES::InitNonPointers() { fontinfo_id2_count = 0; x_height = 0.0; caps_height = 0.0; + baseline_shift = 0.0f; guessed_x_ht = TRUE; guessed_caps_ht = TRUE; combination = FALSE; diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index 5e165e5195..75798113d4 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -294,6 +294,7 @@ class WERD_RES : public ELIST_LINK { CRUNCH_MODE unlv_crunch_mode; float x_height; // post match estimate float caps_height; // post match estimate + float baseline_shift; // post match estimate. /* To deal with fuzzy spaces we need to be able to combine "words" to form From 0e868ef377103c448cfe58d873957f966a53ce28 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 16:47:02 -0700 Subject: [PATCH 07/15] Major change to improve layout analysis for heavily diacritic languages: Tha, Vie, Kan, Tel etc. There is a new overlap detector that detects when diacritics cause a big increase in textline overlap. In such cases, diacritics from overlap regions are kept separate from layout analysis completely, allowing textline formation to happen without them. The diacritics are then assigned to 0, 1 or 2 close words at the end of layout analysis, using and modifying an old noise detection data path. The stored diacritics are used or not during recognition according to the character classifier's liking for them. 
--- ccmain/control.cpp | 378 +++++++++++- ccmain/fixspace.cpp | 3 +- ccmain/pageiterator.cpp | 132 +++-- ccmain/pageiterator.h | 18 + ccmain/pagesegmain.cpp | 40 +- ccmain/pgedit.cpp | 3 +- ccmain/recogtraining.cpp | 3 +- ccmain/tesseractclass.cpp | 1054 ++++++++++++++++++---------------- ccmain/tesseractclass.h | 64 ++- ccstruct/blobbox.h | 9 + ccstruct/ocrblock.cpp | 12 + ccstruct/ocrblock.h | 8 +- ccstruct/ocrrow.cpp | 11 + ccstruct/ocrrow.h | 3 + ccstruct/pageres.cpp | 44 +- ccstruct/pageres.h | 6 +- ccstruct/pdblock.cpp | 4 +- ccstruct/pdblock.h | 4 +- ccstruct/werd.cpp | 130 ++++- ccstruct/werd.h | 28 +- textord/colfind.cpp | 30 +- textord/colfind.h | 12 +- textord/colpartition.cpp | 27 + textord/colpartition.h | 5 + textord/colpartitiongrid.cpp | 59 ++ textord/colpartitiongrid.h | 9 + textord/strokewidth.cpp | 112 +++- textord/strokewidth.h | 42 +- textord/tablefind.cpp | 4 +- textord/textord.cpp | 6 +- textord/textord.h | 53 +- textord/topitch.cpp | 27 +- textord/tordmain.cpp | 239 ++++++-- textord/tordmain.h | 21 +- 34 files changed, 1856 insertions(+), 744 deletions(-) diff --git a/ccmain/control.cpp b/ccmain/control.cpp index a765a97c8a..3abf216e34 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -93,8 +93,7 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) { WordData word_data(*pr_it); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, pr_it, - &word_data); + classify_word_and_language(2, pr_it, &word_data); if (tessedit_debug_quality_metrics) { WERD_RES* word_res = pr_it->word(); word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual); @@ -190,6 +189,7 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) { if (word->word->x_height == 0.0f) word->word->x_height = word->row->x_height(); } + word->lang_words.truncate(0); for (int s = 0; s <= sub_langs_.size(); ++s) { // The sub_langs_.size() entry is for the master language. 
Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this; @@ -249,15 +249,23 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, while (pr_it->word() != NULL && pr_it->word() != word->word) pr_it->forward(); ASSERT_HOST(pr_it->word() != NULL); - WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 - : &Tesseract::classify_word_pass2; - classify_word_and_language(recognizer, pr_it, word); - if (tessedit_dump_choices) { + bool make_next_word_fuzzy = false; + if (!AnyLSTMLang() && + ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { + // Needs to be setup again to see the new outlines in the chopped_word. + SetupWordPassN(pass_n, word); + } + + classify_word_and_language(pass_n, pr_it, word); + if (tessedit_dump_choices || debug_noise_removal) { tprintf("Pass%d: %s [%s]\n", pass_n, word->word->best_choice->unichar_string().string(), word->word->best_choice->debug_string().string()); } pr_it->forward(); + if (make_next_word_fuzzy && pr_it->word() != NULL) { + pr_it->MakeCurrentWordFuzzy(); + } } return true; } @@ -898,6 +906,359 @@ static bool WordsAcceptable(const PointerVector& words) { return true; } +// Moves good-looking "noise"/diacritics from the reject list to the main +// blob list on the current word. Returns true if anything was done, and +// sets make_next_word_fuzzy if blob(s) were added to the end of the word. +bool Tesseract::ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, + bool* make_next_word_fuzzy) { + *make_next_word_fuzzy = false; + WERD* real_word = pr_it->word()->word; + if (real_word->rej_cblob_list()->empty() || + real_word->cblob_list()->empty() || + real_word->rej_cblob_list()->length() > noise_maxperword) + return false; + real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle); + // Get the noise outlines into a vector with matching bool map. 
+ GenericVector outlines; + real_word->GetNoiseOutlines(&outlines); + GenericVector word_wanted; + GenericVector overlapped_any_blob; + GenericVector target_blobs; + AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, + &word_wanted, &overlapped_any_blob, + &target_blobs); + // Filter the outlines that overlapped any blob and put them into the word + // now. This simplifies the remaining task and also makes it more accurate + // as it has more completed blobs to work on. + GenericVector wanted; + GenericVector wanted_blobs; + GenericVector wanted_outlines; + int num_overlapped = 0; + int num_overlapped_used = 0; + for (int i = 0; i < overlapped_any_blob.size(); ++i) { + if (overlapped_any_blob[i]) { + ++num_overlapped; + if (word_wanted[i]) ++num_overlapped_used; + wanted.push_back(word_wanted[i]); + wanted_blobs.push_back(target_blobs[i]); + wanted_outlines.push_back(outlines[i]); + outlines[i] = NULL; + } + } + real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL); + AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted, + &target_blobs); + int non_overlapped = 0; + int non_overlapped_used = 0; + for (int i = 0; i < word_wanted.size(); ++i) { + if (word_wanted[i]) ++non_overlapped_used; + if (outlines[i] != NULL) ++non_overlapped_used; + } + if (debug_noise_removal) { + tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:", + num_overlapped_used, num_overlapped, non_overlapped_used, + non_overlapped); + real_word->bounding_box().print(); + } + // Now we have decided which outlines we want, put them into the real_word. + if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines, + make_next_word_fuzzy)) { + pr_it->MakeCurrentWordFuzzy(); + } + // TODO(rays) Parts of combos have a deep copy of the real word, and need + // to have their noise outlines moved/assigned in the same way!! 
+ return num_overlapped_used != 0 || non_overlapped_used != 0; +} + +// Attempts to put noise/diacritic outlines into the blobs that they overlap. +// Input: a set of noisy outlines that probably belong to the real_word. +// Output: word_wanted indicates which outlines are to be assigned to a blob, +// target_blobs indicates which to assign to, and overlapped_any_blob is +// true for all outlines that overlapped a blob. +void Tesseract::AssignDiacriticsToOverlappingBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* overlapped_any_blob, + GenericVector* target_blobs) { + GenericVector blob_wanted; + word_wanted->init_to_size(outlines.size(), false); + overlapped_any_blob->init_to_size(outlines.size(), false); + target_blobs->init_to_size(outlines.size(), NULL); + // For each real blob, find the outlines that seriously overlap it. + // A single blob could be several merged characters, so there can be quite + // a few outlines overlapping, and the full engine needs to be used to chop + // and join to get a sensible result. + C_BLOB_IT blob_it(real_word->cblob_list()); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + TBOX blob_box = blob->bounding_box(); + blob_wanted.init_to_size(outlines.size(), false); + int num_blob_outlines = 0; + for (int i = 0; i < outlines.size(); ++i) { + if (blob_box.major_x_overlap(outlines[i]->bounding_box()) && + !(*word_wanted)[i]) { + blob_wanted[i] = true; + (*overlapped_any_blob)[i] = true; + ++num_blob_outlines; + } + } + if (debug_noise_removal) { + tprintf("%d noise outlines overlap blob at:", num_blob_outlines); + blob_box.print(); + } + // If any outlines overlap the blob, and not too many, classify the blob + // (using the full engine, languages and all), and choose the maximal + // combination of outlines that doesn't hurt the end-result classification + // by too much. 
Mark them as wanted. + if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) { + if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob, + outlines, num_blob_outlines, + &blob_wanted)) { + for (int i = 0; i < blob_wanted.size(); ++i) { + if (blob_wanted[i]) { + // Claim the outline and record where it is going. + (*word_wanted)[i] = true; + (*target_blobs)[i] = blob; + } + } + } + } + } +} + +// Attempts to assign non-overlapping outlines to their nearest blobs or +// make new blobs out of them. +void Tesseract::AssignDiacriticsToNewBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* target_blobs) { + GenericVector blob_wanted; + word_wanted->init_to_size(outlines.size(), false); + target_blobs->init_to_size(outlines.size(), NULL); + // Check for outlines that need to be turned into stand-alone blobs. + for (int i = 0; i < outlines.size(); ++i) { + if (outlines[i] == NULL) continue; + // Get a set of adjacent outlines that don't overlap any existing blob. + blob_wanted.init_to_size(outlines.size(), false); + int num_blob_outlines = 0; + TBOX total_ol_box(outlines[i]->bounding_box()); + while (i < outlines.size() && outlines[i] != NULL) { + blob_wanted[i] = true; + total_ol_box += outlines[i]->bounding_box(); + ++i; + ++num_blob_outlines; + } + // Find the insertion point. + C_BLOB_IT blob_it(real_word->cblob_list()); + while (!blob_it.at_last() && + blob_it.data_relative(1)->bounding_box().left() <= + total_ol_box.left()) { + blob_it.forward(); + } + // Choose which combination of them we actually want and where to put + // them. + if (debug_noise_removal) + tprintf("Num blobless outlines = %d\n", num_blob_outlines); + C_BLOB* left_blob = blob_it.data(); + TBOX left_box = left_blob->bounding_box(); + C_BLOB* right_blob = blob_it.at_last() ? 
NULL : blob_it.data_relative(1); + if ((left_box.x_overlap(total_ol_box) || right_blob == NULL || + !right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob, + outlines, num_blob_outlines, + &blob_wanted)) { + if (debug_noise_removal) tprintf("Added to left blob\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = left_blob; + } + } + } else if (right_blob != NULL && + (!left_box.x_overlap(total_ol_box) || + right_blob->bounding_box().x_overlap(total_ol_box)) && + SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, + right_blob, outlines, + num_blob_outlines, &blob_wanted)) { + if (debug_noise_removal) tprintf("Added to right blob\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = right_blob; + } + } + } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL, + outlines, num_blob_outlines, + &blob_wanted)) { + if (debug_noise_removal) tprintf("Fitted between blobs\n"); + for (int j = 0; j < blob_wanted.size(); ++j) { + if (blob_wanted[j]) { + (*word_wanted)[j] = true; + (*target_blobs)[j] = NULL; + } + } + } + } +} + +// Starting with ok_outlines set to indicate which outlines overlap the blob, +// chooses the optimal set (approximately) and returns true if any outlines +// are desired, in which case ok_outlines indicates which ones. 
+bool Tesseract::SelectGoodDiacriticOutlines( + int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob, + const GenericVector& outlines, int num_outlines, + GenericVector* ok_outlines) { + STRING best_str; + float target_cert = certainty_threshold; + if (blob != NULL) { + float target_c2; + target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2); + if (debug_noise_removal) { + tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(), + target_cert, target_c2); + blob->bounding_box().print(); + } + target_cert -= (target_cert - certainty_threshold) * noise_cert_factor; + } + GenericVector test_outlines = *ok_outlines; + // Start with all the outlines in. + STRING all_str; + GenericVector best_outlines = *ok_outlines; + float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, + pr_it, blob, &all_str); + if (debug_noise_removal) { + TBOX ol_box; + for (int i = 0; i < test_outlines.size(); ++i) { + if (test_outlines[i]) ol_box += outlines[i]->bounding_box(); + } + tprintf("All Noise blob classified as %s=%g, delta=%g at:", + all_str.string(), best_cert, best_cert - target_cert); + ol_box.print(); + } + // Iteratively zero out the bit that improves the certainty the most, until + // we get past the threshold, have zero bits, or fail to improve. + int best_index = 0; // To zero out. + while (num_outlines > 1 && best_index >= 0 && + (blob == NULL || best_cert < target_cert || blob != NULL)) { + // Find the best bit to zero out. 
+ best_index = -1; + for (int i = 0; i < outlines.size(); ++i) { + if (test_outlines[i]) { + test_outlines[i] = false; + STRING str; + float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass, + pr_it, blob, &str); + if (debug_noise_removal) { + TBOX ol_box; + for (int j = 0; j < outlines.size(); ++j) { + if (test_outlines[j]) ol_box += outlines[j]->bounding_box(); + tprintf("%d", test_outlines[j]); + } + tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(), + cert, cert - target_cert); + ol_box.print(); + } + if (cert > best_cert) { + best_cert = cert; + best_index = i; + best_outlines = test_outlines; + } + test_outlines[i] = true; + } + } + if (best_index >= 0) { + test_outlines[best_index] = false; + --num_outlines; + } + } + if (best_cert >= target_cert) { + // Save the best combination. + *ok_outlines = best_outlines; + if (debug_noise_removal) { + tprintf("%s noise combination ", blob ? "Adding" : "New"); + for (int i = 0; i < best_outlines.size(); ++i) { + tprintf("%d", best_outlines[i]); + } + tprintf(" yields certainty %g, beating target of %g\n", best_cert, + target_cert); + } + return true; + } + return false; +} + +// Classifies the given blob plus the outlines flagged by ok_outlines, undoes +// the inclusion of the outlines, and returns the certainty of the raw choice. +float Tesseract::ClassifyBlobPlusOutlines( + const GenericVector& ok_outlines, + const GenericVector& outlines, int pass_n, PAGE_RES_IT* pr_it, + C_BLOB* blob, STRING* best_str) { + C_OUTLINE_IT ol_it; + C_OUTLINE* first_to_keep = NULL; + if (blob != NULL) { + // Add the required outlines to the blob. + ol_it.set_to_list(blob->out_list()); + first_to_keep = ol_it.data(); + } + for (int i = 0; i < ok_outlines.size(); ++i) { + if (ok_outlines[i]) { + // This outline is to be added. 
+ if (blob == NULL) { + blob = new C_BLOB(outlines[i]); + ol_it.set_to_list(blob->out_list()); + } else { + ol_it.add_before_stay_put(outlines[i]); + } + } + } + float c2; + float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2); + ol_it.move_to_first(); + if (first_to_keep == NULL) { + // We created blob. Empty its outlines and delete it. + for (; !ol_it.empty(); ol_it.forward()) ol_it.extract(); + delete blob; + cert = -c2; + } else { + // Remove the outlines that we put in. + for (; ol_it.data() != first_to_keep; ol_it.forward()) { + ol_it.extract(); + } + } + return cert; +} + +// Classifies the given blob (part of word_data->word->word) as an individual +// word, using languages, chopper etc, returning only the certainty of the +// best raw choice, and undoing all the work done to fake out the word. +float Tesseract::ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, + C_BLOB* blob, STRING* best_str, float* c2) { + WERD* real_word = pr_it->word()->word; + WERD* word = real_word->ConstructFromSingleBlob( + real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob)); + WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word); + // Get a new iterator that points to the new word. + PAGE_RES_IT it(pr_it->page_res); + while (it.word() != word_res && it.word() != NULL) it.forward(); + ASSERT_HOST(it.word() == word_res); + WordData wd(it); + // Force full initialization. + SetupWordPassN(1, &wd); + classify_word_and_language(pass_n, &it, &wd); + if (debug_noise_removal) { + tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height, + wd.row->x_height(), wd.word->raw_choice->min_x_height(), + wd.word->raw_choice->max_x_height()); + } + float cert = wd.word->raw_choice->certainty(); + float rat = wd.word->raw_choice->rating(); + *c2 = rat > 0.0f ? 
cert * cert / rat : 0.0f; + *best_str = wd.word->raw_choice->unichar_string(); + it.DeleteCurrentWord(); + pr_it->ResetWordIterator(); + return cert; +} + // Generic function for classifying a word. Can be used either for pass1 or // pass2 according to the function passed to recognizer. // word_data holds the word to be recognized, and its block and row, and @@ -906,9 +1267,10 @@ static bool WordsAcceptable(const PointerVector& words) { // Recognizes in the current language, and if successful that is all. // If recognition was not successful, tries all available languages until // it gets a successful result or runs out of languages. Keeps the best result. -void Tesseract::classify_word_and_language(WordRecognizer recognizer, - PAGE_RES_IT* pr_it, +void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data) { + WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1 + : &Tesseract::classify_word_pass2; // Best result so far. PointerVector best_words; // Points to the best result. May be word or in lang_words. 
diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index 17c4f96ed1..0a561ac9a0 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, if ((!word->part_of_combo) && (word->box_word == NULL)) { WordData word_data(block, row, word); SetupWordPassN(2, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass2, NULL, - &word_data); + classify_word_and_language(2, NULL, &word_data); } prev_word_best_choice_ = word->best_choice; } diff --git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp index c8e025c13f..ed03ceaba5 100644 --- a/ccmain/pageiterator.cpp +++ b/ccmain/pageiterator.cpp @@ -26,15 +26,23 @@ namespace tesseract { -PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, - int scale, int scaled_yres, - int rect_left, int rect_top, +PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale, + int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height) - : page_res_(page_res), tesseract_(tesseract), - word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL), - scale_(scale), scaled_yres_(scaled_yres), - rect_left_(rect_left), rect_top_(rect_top), - rect_width_(rect_width), rect_height_(rect_height) { + : page_res_(page_res), + tesseract_(tesseract), + word_(NULL), + word_length_(0), + blob_index_(0), + cblob_it_(NULL), + include_upper_dots_(false), + include_lower_dots_(false), + scale_(scale), + scaled_yres_(scaled_yres), + rect_left_(rect_left), + rect_top_(rect_top), + rect_width_(rect_width), + rect_height_(rect_height) { it_ = new PAGE_RES_IT(page_res); PageIterator::Begin(); } @@ -50,12 +58,20 @@ PageIterator::~PageIterator() { * objects at a higher level. 
*/ PageIterator::PageIterator(const PageIterator& src) - : page_res_(src.page_res_), tesseract_(src.tesseract_), - word_(NULL), word_length_(src.word_length_), - blob_index_(src.blob_index_), cblob_it_(NULL), - scale_(src.scale_), scaled_yres_(src.scaled_yres_), - rect_left_(src.rect_left_), rect_top_(src.rect_top_), - rect_width_(src.rect_width_), rect_height_(src.rect_height_) { + : page_res_(src.page_res_), + tesseract_(src.tesseract_), + word_(NULL), + word_length_(src.word_length_), + blob_index_(src.blob_index_), + cblob_it_(NULL), + include_upper_dots_(src.include_upper_dots_), + include_lower_dots_(src.include_lower_dots_), + scale_(src.scale_), + scaled_yres_(src.scaled_yres_), + rect_left_(src.rect_left_), + rect_top_(src.rect_top_), + rect_width_(src.rect_width_), + rect_height_(src.rect_height_) { it_ = new PAGE_RES_IT(*src.it_); BeginWord(src.blob_index_); } @@ -63,6 +79,8 @@ PageIterator::PageIterator(const PageIterator& src) const PageIterator& PageIterator::operator=(const PageIterator& src) { page_res_ = src.page_res_; tesseract_ = src.tesseract_; + include_upper_dots_ = src.include_upper_dots_; + include_lower_dots_ = src.include_lower_dots_; scale_ = src.scale_; scaled_yres_ = src.scaled_yres_; rect_left_ = src.rect_left_; @@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level, PARA *para = NULL; switch (level) { case RIL_BLOCK: - box = it_->block()->block->bounding_box(); + box = it_->block()->block->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_PARA: para = it_->row()->row->para(); // explicit fall-through. 
case RIL_TEXTLINE: - box = it_->row()->row->bounding_box(); + box = it_->row()->row->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_WORD: - box = it_->word()->word->bounding_box(); + box = it_->word()->word->restricted_bounding_box(include_upper_dots_, + include_lower_dots_); break; case RIL_SYMBOL: if (cblob_it_ == NULL) @@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const { int left, top, right, bottom; if (!BoundingBoxInternal(level, &left, &top, &right, &bottom)) return NULL; - Pix* pix = NULL; - switch (level) { - case RIL_BLOCK: - case RIL_PARA: - int bleft, btop, bright, bbottom; - BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom); - pix = it_->block()->block->render_mask(); - // AND the mask and the image. - pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix), - PIX_SRC & PIX_DST, tesseract_->pix_binary(), - bleft, btop); - if (level == RIL_PARA) { - // RIL_PARA needs further attention: - // clip the paragraph from the block mask. - Box* box = boxCreate(left - bleft, top - btop, - right - left, bottom - top); - Pix* pix2 = pixClipRectangle(pix, box, NULL); - boxDestroy(&box); - pixDestroy(&pix); - pix = pix2; - } - break; - case RIL_TEXTLINE: - case RIL_WORD: - case RIL_SYMBOL: - if (level == RIL_SYMBOL && cblob_it_ != NULL && - cblob_it_->data()->area() != 0) - return cblob_it_->data()->render(); - // Just clip from the bounding box. - Box* box = boxCreate(left, top, right - left, bottom - top); - pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); - boxDestroy(&box); - break; + if (level == RIL_SYMBOL && cblob_it_ != NULL && + cblob_it_->data()->area() != 0) + return cblob_it_->data()->render(); + Box* box = boxCreate(left, top, right - left, bottom - top); + Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL); + boxDestroy(&box); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. 
+ TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + int mask_x = left - mask_box.left(); + int mask_y = top - (tesseract_->ImageHeight() - mask_box.top()); + // AND the mask and pix, putting the result in pix. + pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix), + pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x), + MAX(0, mask_y)); + pixDestroy(&mask); } return pix; } @@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding, Box* box = boxCreate(*left, *top, right - *left, bottom - *top); Pix* grey_pix = pixClipRectangle(original_img, box, NULL); boxDestroy(&box); - if (level == RIL_BLOCK) { - Pix* mask = it_->block()->block->render_mask(); - Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1); - pixRasterop(expanded_mask, padding, padding, - pixGetWidth(mask), pixGetHeight(mask), - PIX_SRC, mask, 0, 0); + if (level == RIL_BLOCK || level == RIL_PARA) { + // Clip to the block polygon as well. + TBOX mask_box; + Pix* mask = it_->block()->block->render_mask(&mask_box); + // Copy the mask registered correctly into an image the size of grey_pix. 
+ int mask_x = *left - mask_box.left(); + int mask_y = *top - (pixGetHeight(original_img) - mask_box.top()); + int width = pixGetWidth(grey_pix); + int height = pixGetHeight(grey_pix); + Pix* resized_mask = pixCreate(width, height, 1); + pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height, + PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y)); pixDestroy(&mask); - pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1); - pixInvert(expanded_mask, expanded_mask); - pixSetMasked(grey_pix, expanded_mask, MAX_UINT32); - pixDestroy(&expanded_mask); + pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1, + 2 * padding + 1); + pixInvert(resized_mask, resized_mask); + pixSetMasked(grey_pix, resized_mask, MAX_UINT32); + pixDestroy(&resized_mask); } return grey_pix; } diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h index 27b02ddf8f..56c78150a8 100644 --- a/ccmain/pageiterator.h +++ b/ccmain/pageiterator.h @@ -179,6 +179,21 @@ class TESS_API PageIterator { // If an image rectangle has been set in the API, then returned coordinates // relate to the original (full) image, rather than the rectangle. + /** + * Controls what to include in a bounding box. Bounding boxes of all levels + * between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics. + * Between layout analysis and recognition, it isn't known where all + * diacritics belong, so this control is used to include or exclude some + * diacritics that are above or below the main body of the word. In most cases + * where the placement is obvious, and after recognition, it doesn't make as + * much difference, as the diacritics will already be included in the word. + */ + void SetBoundingBoxComponents(bool include_upper_dots, + bool include_lower_dots) { + include_upper_dots_ = include_upper_dots; + include_lower_dots_ = include_lower_dots; + } + /** * Returns the bounding rectangle of the current object at the given level. 
* See comment on coordinate system above. @@ -332,6 +347,9 @@ class TESS_API PageIterator { * Owned by this ResultIterator. */ C_BLOB_IT* cblob_it_; + /** Control over what to include in bounding boxes. */ + bool include_upper_dots_; + bool include_lower_dots_; /** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/ int scale_; int scaled_yres_; diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index 396be13048..6ced2d4c40 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, // UNLV file present. Use PSM_SINGLE_BLOCK. pageseg_mode = PSM_SINGLE_BLOCK; } + // The diacritic_blobs holds noise blobs that may be diacritics. They + // are separated out on areas of the image that seem noisy and short-circuit + // the layout process, going straight from the initial partition creation + // right through to after word segmentation, where they are added to the + // rej_cblobs list of the most appropriate word. From there classification + // will determine whether they are used. + BLOBNBOX_LIST diacritic_blobs; int auto_page_seg_ret_val = 0; TO_BLOCK_LIST to_blocks; if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) || PSM_SPARSE(pageseg_mode)) { - auto_page_seg_ret_val = - AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr); + auto_page_seg_ret_val = AutoPageSeg( + pageseg_mode, blocks, &to_blocks, + enable_noise_removal ? 
&diacritic_blobs : NULL, osd_tess, osr); if (pageseg_mode == PSM_OSD_ONLY) return auto_page_seg_ret_val; // To create blobs from the image region bounds uncomment this line: @@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_, pix_thresholds_, pix_grey_, splitting || cjk_mode, - blocks, &to_blocks); + &diacritic_blobs, blocks, &to_blocks); return auto_page_seg_ret_val; } @@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { pixDestroy(&grey_pix); } - /** * Auto page segmentation. Divide the page image into blocks of uniform * text linespacing and images. @@ -207,9 +214,14 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * The output goes in the blocks list with corresponding TO_BLOCKs in the * to_blocks list. * - * If single_column is true, then no attempt is made to divide the image - * into columns, but multiple blocks are still made if the text is of - * non-uniform linespacing. + * If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide + * the image into columns, but multiple blocks are still made if the text is + * of non-uniform linespacing. + * + * If diacritic_blobs is non-null, then diacritics/noise blobs, that would + * confuse layout analysis by causing textline overlap, are placed there, + * with the expectation that they will be reassigned to words later and + * noise/diacriticness determined via classification. * * If osd (orientation and script detection) is true then that is performed * as well. If only_osd is true, then only orientation and script detection is @@ -217,9 +229,10 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { * another Tesseract that was initialized especially for osd, and the results * will be output into osr (orientation and script result).
*/ -int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, - Tesseract* osd_tess, OSResults* osr) { +int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, + BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, + OSResults* osr) { if (textord_debug_images) { WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); } @@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, if (equ_detect_) { finder->SetEquationDetect(equ_detect_); } - result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, - to_block, photomask_pix, - pix_thresholds_, pix_grey_, - &found_blocks, to_blocks); + result = finder->FindBlocks( + pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix, + pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks); if (result >= 0) finder->GetDeskewVectors(&deskew_, &reskew_); delete finder; diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index 7c8f626b6b..ea44ead7c9 100644 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) { FCOORD pt(x, y); PAGE_RES_IT pr_it(page_res); - char msg[160]; + const int kBufsize = 512; + char msg[kBufsize]; char *msg_ptr = msg; msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y); diff --git a/ccmain/recogtraining.cpp b/ccmain/recogtraining.cpp index 2dc94886ed..27d7e97ea0 100644 --- a/ccmain/recogtraining.cpp +++ b/ccmain/recogtraining.cpp @@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label, fflush(stdout); WordData word_data(*pr_it); SetupWordPassN(1, &word_data); - classify_word_and_language(&Tesseract::classify_word_pass1, - pr_it, &word_data); + classify_word_and_language(1, pr_it, &word_data); WERD_RES* werd_res = word_data.word; WERD_CHOICE *best_choice = werd_res->best_choice; ASSERT_HOST(best_choice != NULL); diff --git a/ccmain/tesseractclass.cpp 
b/ccmain/tesseractclass.cpp index c262bbc95e..25819e8cdd 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -55,507 +55,569 @@ namespace tesseract { Tesseract::Tesseract() - : BOOL_MEMBER(tessedit_resegment_from_boxes, false, - "Take segmentation and labeling from box file", - this->params()), - BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, - "Conversion of word/line box file to char box file", - this->params()), - BOOL_MEMBER(tessedit_train_from_boxes, false, - "Generate training data from boxed chars", this->params()), - BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, - "Generate more boxes from boxed chars", this->params()), - BOOL_MEMBER(tessedit_dump_pageseg_images, false, - "Dump intermediate images made during page segmentation", - this->params()), - // The default for pageseg_mode is the old behaviour, so as not to - // upset anything that relies on that. - INT_MEMBER(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, - "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," - " 5=line, 6=word, 7=char" - " (Values from PageSegMode enum in publictypes.h)", - this->params()), - INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, - "Which OCR engine(s) to run (Tesseract, Cube, both)." - " Defaults to loading and running only Tesseract" - " (no Cube,no combiner)." 
- " Values from OcrEngineMode enum in tesseractclass.h)", - this->params()), - STRING_MEMBER(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize", this->params()), - STRING_MEMBER(tessedit_char_whitelist, "", - "Whitelist of chars to recognize", this->params()), - STRING_MEMBER(tessedit_char_unblacklist, "", - "List of chars to override tessedit_char_blacklist", - this->params()), - BOOL_MEMBER(tessedit_ambigs_training, false, - "Perform training for ambiguities", this->params()), - INT_MEMBER(pageseg_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing page-segmentation.", this->params()), - INT_MEMBER(ocr_devanagari_split_strategy, - tesseract::ShiroRekhaSplitter::NO_SPLIT, - "Whether to use the top-line splitting process for Devanagari " - "documents while performing ocr.", this->params()), - STRING_MEMBER(tessedit_write_params_to_file, "", - "Write all parameters to the given file.", this->params()), - BOOL_MEMBER(tessedit_adaption_debug, false, "Generate and print debug" - " information for adaption", this->params()), - INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), - INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), - INT_MEMBER(applybox_page, 0, - "Page number to apply boxes from", this->params()), - STRING_MEMBER(applybox_exposure_pattern, ".exp", "Exposure value follows" - " this pattern in the image filename. The name of the image" - " files are expected to be in the form" - " [lang].[fontname].exp[num].tif", this->params()), - BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, - "Learn both character fragments (as is done in the" - " special low exposure mode) as well as unfragmented" - " characters.", this->params()), - BOOL_MEMBER(applybox_learn_ngrams_mode, false, "Each bounding box" - " is assumed to contain ngrams. 
Only learn the ngrams" - " whose outlines overlap horizontally.", this->params()), - BOOL_MEMBER(tessedit_display_outwords, false, - "Draw output words", this->params()), - BOOL_MEMBER(tessedit_dump_choices, false, - "Dump char choices", this->params()), - BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", - this->params()), - BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, - "Try to improve fuzzy spaces", this->params()), - BOOL_MEMBER(tessedit_unrej_any_wd, false, - "Dont bother with word plausibility", this->params()), - BOOL_MEMBER(tessedit_fix_hyphens, true, - "Crunch double hyphens?", this->params()), - BOOL_MEMBER(tessedit_redo_xheight, true, - "Check/Correct x-height", this->params()), - BOOL_MEMBER(tessedit_enable_doc_dict, true, - "Add words to the document dictionary", this->params()), - BOOL_MEMBER(tessedit_debug_fonts, false, - "Output font info per char", this->params()), - BOOL_MEMBER(tessedit_debug_block_rejection, false, - "Block and Row stats", this->params()), - BOOL_MEMBER(tessedit_enable_bigram_correction, true, - "Enable correction based on the word bigram dictionary.", - this->params()), - BOOL_MEMBER(tessedit_enable_dict_correction, false, - "Enable single word correction based on the dictionary.", - this->params()), - INT_MEMBER(tessedit_bigram_debug, 0, - "Amount of debug output for bigram correction.", - this->params()), - INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), - BOOL_MEMBER(debug_acceptable_wds, false, - "Dump word pass/fail chk", this->params()), - STRING_MEMBER(chs_leading_punct, "('`\"", - "Leading punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct1, ").,;:?!", - "1st Trailing punctuation", this->params()), - STRING_MEMBER(chs_trailing_punct2, ")'`\"", - "2nd Trailing punctuation", this->params()), - double_MEMBER(quality_rej_pc, 0.08, - "good_quality_doc lte rejection limit", this->params()), - double_MEMBER(quality_blob_pc, 0.0, - "good_quality_doc gte good blobs limit", 
this->params()), - double_MEMBER(quality_outline_pc, 1.0, - "good_quality_doc lte outline error limit", this->params()), - double_MEMBER(quality_char_pc, 0.95, - "good_quality_doc gte good char limit", this->params()), - INT_MEMBER(quality_min_initial_alphas_reqd, 2, - "alphas in a good word", this->params()), - INT_MEMBER(tessedit_tess_adaption_mode, 0x27, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(tessedit_minimal_rej_pass1, false, - "Do minimal rejection on pass 1 output", this->params()), - BOOL_MEMBER(tessedit_test_adaption, false, - "Test adaption criteria", this->params()), - BOOL_MEMBER(tessedit_matcher_log, false, - "Log matcher activity", this->params()), - INT_MEMBER(tessedit_test_adaption_mode, 3, - "Adaptation decision algorithm for tess", this->params()), - BOOL_MEMBER(test_pt, false, "Test for point", this->params()), - double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), - double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), - INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", - this->params()), - BOOL_MEMBER(paragraph_text_based, true, - "Run paragraph detection on the post-text-recognition " - "(more accurate)", this->params()), - INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), - STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", - this->params()), - STRING_MEMBER(outlines_2, "ij!?%\":;", - "Non standard number of outlines", this->params()), - BOOL_MEMBER(docqual_excuse_outline_errs, false, - "Allow outline errs in unrejection?", this->params()), - BOOL_MEMBER(tessedit_good_quality_unrej, true, - "Reduce rejection on good docs", this->params()), - BOOL_MEMBER(tessedit_use_reject_spaces, true, - "Reject spaces?", this->params()), - double_MEMBER(tessedit_reject_doc_percent, 65.00, - "%rej allowed before rej whole doc", this->params()), - double_MEMBER(tessedit_reject_block_percent, 45.00, - "%rej allowed before rej whole block", 
this->params()), - double_MEMBER(tessedit_reject_row_percent, 40.00, - "%rej allowed before rej whole row", this->params()), - double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, - "Number of row rejects in whole word rejects" - "which prevents whole row rejection", this->params()), - BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, - "Only rej partially rejected words in block rejection", - this->params()), - BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, - "Only rej partially rejected words in row rejection", - this->params()), - BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, - "Use word segmentation quality metric", this->params()), - INT_MEMBER(tessedit_preserve_min_wd_len, 2, - "Only preserve wds longer than this", this->params()), - BOOL_MEMBER(tessedit_row_rej_good_docs, true, - "Apply row rejection to good docs", this->params()), - double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, - "rej good doc wd if more than this fraction rejected", - this->params()), - BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, - "Reject all bad quality wds", this->params()), - BOOL_MEMBER(tessedit_debug_doc_rejection, false, - "Page stats", this->params()), - BOOL_MEMBER(tessedit_debug_quality_metrics, false, - "Output data to debug file", this->params()), - BOOL_MEMBER(bland_unrej, false, - "unrej potential with no chekcs", this->params()), - double_MEMBER(quality_rowrej_pc, 1.1, - "good_quality_doc gte good char limit", this->params()), - BOOL_MEMBER(unlv_tilde_crunching, true, - "Mark v.bad words for tilde crunch", this->params()), - BOOL_MEMBER(hocr_font_info, false, - "Add font info to hocr output", this->params()), - BOOL_MEMBER(crunch_early_merge_tess_fails, true, - "Before word crunch?", this->params()), - BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, - "Take out ~^ early?", this->params()), - 
double_MEMBER(crunch_terrible_rating, 80.0, - "crunch rating lt this", this->params()), - BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), - double_MEMBER(crunch_poor_garbage_cert, -9.0, - "crunch garbage cert lt this", this->params()), - double_MEMBER(crunch_poor_garbage_rate, 60, - "crunch garbage rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_rate, 40, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_pot_poor_cert, -8.0, - "POTENTIAL crunch cert lt this", this->params()), - BOOL_MEMBER(crunch_pot_garbage, true, - "POTENTIAL crunch garbage", this->params()), - double_MEMBER(crunch_del_rating, 60, - "POTENTIAL crunch rating lt this", this->params()), - double_MEMBER(crunch_del_cert, -10.0, - "POTENTIAL crunch cert lt this", this->params()), - double_MEMBER(crunch_del_min_ht, 0.7, - "Del if word ht lt xht x this", this->params()), - double_MEMBER(crunch_del_max_ht, 3.0, - "Del if word ht gt xht x this", this->params()), - double_MEMBER(crunch_del_min_width, 3.0, - "Del if word width lt xht x this", this->params()), - double_MEMBER(crunch_del_high_word, 1.5, - "Del if word gt xht x this above bl", this->params()), - double_MEMBER(crunch_del_low_word, 0.5, - "Del if word gt xht x this below bl", this->params()), - double_MEMBER(crunch_small_outlines_size, 0.6, - "Small if lt xht x this", this->params()), - INT_MEMBER(crunch_rating_max, 10, - "For adj length in rating per ch", this->params()), - INT_MEMBER(crunch_pot_indicators, 1, - "How many potential indicators needed", this->params()), - BOOL_MEMBER(crunch_leave_ok_strings, true, - "Dont touch sensible strings", this->params()), - BOOL_MEMBER(crunch_accept_ok, true, - "Use acceptability in okstring", this->params()), - BOOL_MEMBER(crunch_leave_accept_strings, false, - "Dont pot crunch sensible strings", this->params()), - BOOL_MEMBER(crunch_include_numerals, false, - "Fiddle alpha figures", this->params()), - 
INT_MEMBER(crunch_leave_lc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_leave_uc_strings, 4, - "Dont crunch words with long lower case strings", - this->params()), - INT_MEMBER(crunch_long_repetitions, 3, - "Crunch words with long repetitions", this->params()), - INT_MEMBER(crunch_debug, 0, "As it says", this->params()), - INT_MEMBER(fixsp_non_noise_limit, 1, - "How many non-noise blbs either side?", this->params()), - double_MEMBER(fixsp_small_outlines_size, 0.28, - "Small if lt xht x this", this->params()), - BOOL_MEMBER(tessedit_prefer_joined_punct, false, - "Reward punctation joins", this->params()), - INT_MEMBER(fixsp_done_mode, 1, - "What constitues done for spacing", this->params()), - INT_MEMBER(debug_fix_space_level, 0, - "Contextual fixspace debug", this->params()), - STRING_MEMBER(numeric_punctuation, ".,", - "Punct. chs expected WITHIN numbers", this->params()), - INT_MEMBER(x_ht_acceptance_tolerance, 8, - "Max allowed deviation of blob top outside of font data", - this->params()), - INT_MEMBER(x_ht_min_change, 8, - "Min change in xht before actually trying it", this->params()), - INT_MEMBER(superscript_debug, 0, "Debug level for sub & superscript fixer", - this->params()), - double_MEMBER(superscript_worse_certainty, 2.0, "How many times worse " - "certainty does a superscript position glyph need to be for " - "us to try classifying it as a char with a different " - "baseline?", this->params()), - double_MEMBER(superscript_bettered_certainty, 0.97, "What reduction in " - "badness do we think sufficient to choose a superscript " - "over what we'd thought. For example, a value of 0.6 means " - "we want to reduce badness of certainty by at least 40%", - this->params()), - double_MEMBER(superscript_scaledown_ratio, 0.4, - "A superscript scaled down more than this is unbelievably " - "small. 
For example, 0.3 means we expect the font size to " - "be no smaller than 30% of the text line font size.", - this->params()), - double_MEMBER(subscript_max_y_top, 0.5, - "Maximum top of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a subscript.", this->params()), - double_MEMBER(superscript_min_y_bottom, 0.3, - "Minimum bottom of a character measured as a multiple of " - "x-height above the baseline for us to reconsider whether " - "it's a superscript.", this->params()), - BOOL_MEMBER(tessedit_write_block_separators, false, - "Write block separators in output", this->params()), - BOOL_MEMBER(tessedit_write_rep_codes, false, - "Write repetition char code", this->params()), - BOOL_MEMBER(tessedit_write_unlv, false, - "Write .unlv output file", this->params()), - BOOL_MEMBER(tessedit_create_txt, true, - "Write .txt output file", this->params()), - BOOL_MEMBER(tessedit_create_hocr, false, - "Write .html hOCR output file", this->params()), - BOOL_MEMBER(tessedit_create_pdf, false, - "Write .pdf output file", this->params()), - STRING_MEMBER(unrecognised_char, "|", - "Output char for unidentified blobs", this->params()), - INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), - INT_MEMBER(suspect_space_level, 100, - "Min suspect level for rejecting spaces", this->params()), - INT_MEMBER(suspect_short_words, 2, - "Dont Suspect dict wds longer than this", this->params()), - BOOL_MEMBER(suspect_constrain_1Il, false, - "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, - "Dont touch bad rating limit", this->params()), - double_MEMBER(suspect_accept_rating, -999.9, - "Accept good rating limit", this->params()), - BOOL_MEMBER(tessedit_minimal_rejection, false, - "Only reject tess failures", this->params()), - BOOL_MEMBER(tessedit_zero_rejection, false, - "Dont reject ANYTHING", this->params()), - BOOL_MEMBER(tessedit_word_for_word, false, - "Make output 
have exactly one word per WERD", this->params()), - BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, - "Dont reject ANYTHING AT ALL", this->params()), - BOOL_MEMBER(tessedit_consistent_reps, true, - "Force all rep chars the same", this->params()), - INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", this->params()), - BOOL_MEMBER(tessedit_rejection_debug, false, - "Adaption debug", this->params()), - BOOL_MEMBER(tessedit_flip_0O, true, - "Contextual 0O O0 flips", this->params()), - double_MEMBER(tessedit_lower_flip_hyphen, 1.5, - "Aspect ratio dot/hyphen test", this->params()), - double_MEMBER(tessedit_upper_flip_hyphen, 1.8, - "Aspect ratio dot/hyphen test", this->params()), - BOOL_MEMBER(rej_trust_doc_dawg, false, - "Use DOC dawg in 11l conf. detector", this->params()), - BOOL_MEMBER(rej_1Il_use_dict_word, false, - "Use dictword test", this->params()), - BOOL_MEMBER(rej_1Il_trust_permuter_type, true, - "Dont double check", this->params()), - BOOL_MEMBER(rej_use_tess_accepted, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_tess_blanks, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_good_perm, true, - "Individual rejection control", this->params()), - BOOL_MEMBER(rej_use_sensible_wd, false, - "Extend permuter check", this->params()), - BOOL_MEMBER(rej_alphas_in_number_perm, false, - "Extend permuter check", this->params()), - double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, - "if >this fract", this->params()), - INT_MEMBER(tessedit_image_border, 2, - "Rej blbs near image edge limit", this->params()), - STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", - "Allow NN to unrej", this->params()), - STRING_MEMBER(conflict_set_I_l_1, "Il1[]", - "Il1 conflict set", this->params()), - INT_MEMBER(min_sane_x_ht_pixels, 8, - "Reject any x-ht lt or eq than this", this->params()), - BOOL_MEMBER(tessedit_create_boxfile, false, - "Output text with boxes", this->params()), - 
INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages" - " , else specifc page to process", this->params()), - BOOL_MEMBER(tessedit_write_images, false, - "Capture the image from the IPE", this->params()), - BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", - this->params()), - STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), - BOOL_MEMBER(tessedit_override_permuter, true, - "According to dict_word", this->params()), - INT_MEMBER(tessdata_manager_debug_level, 0, "Debug level for" - " TessdataManager functions.", this->params()), - STRING_MEMBER(tessedit_load_sublangs, "", - "List of languages to load with this one", this->params()), - BOOL_MEMBER(tessedit_use_primary_params_model, false, - "In multilingual mode use params model of the" - " primary language", this->params()), - double_MEMBER(min_orientation_margin, 7.0, - "Min acceptable orientation margin", this->params()), - BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", - this->params()), - BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", - this->params()), - BOOL_MEMBER(poly_allow_detailed_fx, false, - "Allow feature extractors to see the original outline", - this->params()), - BOOL_INIT_MEMBER(tessedit_init_config_only, false, - "Only initialize with the config file. 
Useful if the " - "instance is not going to be used for OCR but say only " - "for layout analysis.", this->params()), - BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", - this->params()), - BOOL_MEMBER(textord_tabfind_vertical_text, true, - "Enable vertical detection", this->params()), - BOOL_MEMBER(textord_tabfind_force_vertical_text, false, - "Force using vertical text page mode", this->params()), - double_MEMBER(textord_tabfind_vertical_text_ratio, 0.5, - "Fraction of textlines deemed vertical to use vertical page " - "mode", this->params()), - double_MEMBER(textord_tabfind_aligned_gap_fraction, 0.75, - "Fraction of height used as a minimum gap for aligned blobs.", - this->params()), - INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", - this->params()), - BOOL_MEMBER(preserve_interword_spaces, false, - "Preserve multiple interword spaces", this->params()), - BOOL_MEMBER(include_page_breaks, FALSE, - "Include page separator string in output text after each " - "image/page.", this->params()), - STRING_MEMBER(page_separator, "\f", - "Page separator (default is form feed control character)", + : BOOL_MEMBER(tessedit_resegment_from_boxes, false, + "Take segmentation and labeling from box file", this->params()), + BOOL_MEMBER(tessedit_resegment_from_line_boxes, false, + "Conversion of word/line box file to char box file", + this->params()), + BOOL_MEMBER(tessedit_train_from_boxes, false, + "Generate training data from boxed chars", this->params()), + BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, + "Generate more boxes from boxed chars", this->params()), + BOOL_MEMBER(tessedit_dump_pageseg_images, false, + "Dump intermediate images made during page segmentation", + this->params()), + // The default for pageseg_mode is the old behaviour, so as not to + // upset anything that relies on that. 
+ INT_MEMBER( + tessedit_pageseg_mode, PSM_SINGLE_BLOCK, + "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," + " 5=line, 6=word, 7=char" + " (Values from PageSegMode enum in publictypes.h)", + this->params()), + INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, + "Which OCR engine(s) to run (Tesseract, Cube, both)." + " Defaults to loading and running only Tesseract" + " (no Cube,no combiner)." + " Values from OcrEngineMode enum in tesseractclass.h)", + this->params()), + STRING_MEMBER(tessedit_char_blacklist, "", + "Blacklist of chars not to recognize", this->params()), + STRING_MEMBER(tessedit_char_whitelist, "", + "Whitelist of chars to recognize", this->params()), + STRING_MEMBER(tessedit_char_unblacklist, "", + "List of chars to override tessedit_char_blacklist", + this->params()), + BOOL_MEMBER(tessedit_ambigs_training, false, + "Perform training for ambiguities", this->params()), + INT_MEMBER(pageseg_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing page-segmentation.", + this->params()), + INT_MEMBER(ocr_devanagari_split_strategy, + tesseract::ShiroRekhaSplitter::NO_SPLIT, + "Whether to use the top-line splitting process for Devanagari " + "documents while performing ocr.", + this->params()), + STRING_MEMBER(tessedit_write_params_to_file, "", + "Write all parameters to the given file.", this->params()), + BOOL_MEMBER(tessedit_adaption_debug, false, + "Generate and print debug" + " information for adaption", + this->params()), + INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()), + INT_MEMBER(applybox_debug, 1, "Debug level", this->params()), + INT_MEMBER(applybox_page, 0, "Page number to apply boxes from", + this->params()), + STRING_MEMBER(applybox_exposure_pattern, ".exp", + "Exposure value follows" + " this pattern in the image filename. 
The name of the image" + " files are expected to be in the form" + " [lang].[fontname].exp[num].tif", + this->params()), + BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false, + "Learn both character fragments (as is done in the" + " special low exposure mode) as well as unfragmented" + " characters.", + this->params()), + BOOL_MEMBER(applybox_learn_ngrams_mode, false, + "Each bounding box" + " is assumed to contain ngrams. Only learn the ngrams" + " whose outlines overlap horizontally.", + this->params()), + BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words", + this->params()), + BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices", + this->params()), + BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats", + this->params()), + BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true, + "Try to improve fuzzy spaces", this->params()), + BOOL_MEMBER(tessedit_unrej_any_wd, false, + "Dont bother with word plausibility", this->params()), + BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?", + this->params()), + BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height", + this->params()), + BOOL_MEMBER(tessedit_enable_doc_dict, true, + "Add words to the document dictionary", this->params()), + BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char", + this->params()), + BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats", + this->params()), + BOOL_MEMBER(tessedit_enable_bigram_correction, true, + "Enable correction based on the word bigram dictionary.", + this->params()), + BOOL_MEMBER(tessedit_enable_dict_correction, false, + "Enable single word correction based on the dictionary.", + this->params()), + INT_MEMBER(tessedit_bigram_debug, 0, + "Amount of debug output for bigram correction.", + this->params()), + BOOL_MEMBER(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise", + 
this->params()), + INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines", + this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make the + // base + // character worse and still be included. + double_MEMBER(noise_cert_basechar, -8.0, + "Hingepoint for base char certainty", this->params()), + // Worst (min) certainty, for which a non-overlapping diacritic is allowed + // to make the base character worse and still be included. + double_MEMBER(noise_cert_disjoint, -1.0, + "Hingepoint for disjoint certainty", this->params()), + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_MEMBER(noise_cert_punc, -3.0, + "Threshold for new punc char certainty", this->params()), + // Factor of certainty margin for adding diacritics to not count as worse. + double_MEMBER(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint", + this->params()), + INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob", + this->params()), + INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word", + this->params()), + INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()), + BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk", + this->params()), + STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation", + this->params()), + STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation", + this->params()), + double_MEMBER(quality_rej_pc, 0.08, + "good_quality_doc lte rejection limit", this->params()), + double_MEMBER(quality_blob_pc, 0.0, + "good_quality_doc gte good blobs limit", this->params()), + double_MEMBER(quality_outline_pc, 1.0, + "good_quality_doc lte outline error limit", this->params()), + double_MEMBER(quality_char_pc, 0.95, + "good_quality_doc gte good char limit", this->params()), + 
INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word", + this->params()), + INT_MEMBER(tessedit_tess_adaption_mode, 0x27, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(tessedit_minimal_rej_pass1, false, + "Do minimal rejection on pass 1 output", this->params()), + BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria", + this->params()), + BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity", + this->params()), + INT_MEMBER(tessedit_test_adaption_mode, 3, + "Adaptation decision algorithm for tess", this->params()), + BOOL_MEMBER(test_pt, false, "Test for point", this->params()), + double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), + double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), + INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", + this->params()), + BOOL_MEMBER(paragraph_text_based, true, + "Run paragraph detection on the post-text-recognition " + "(more accurate)", + this->params()), + INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), + STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", + this->params()), + STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", + this->params()), + BOOL_MEMBER(docqual_excuse_outline_errs, false, + "Allow outline errs in unrejection?", this->params()), + BOOL_MEMBER(tessedit_good_quality_unrej, true, + "Reduce rejection on good docs", this->params()), + BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?", + this->params()), + double_MEMBER(tessedit_reject_doc_percent, 65.00, + "%rej allowed before rej whole doc", this->params()), + double_MEMBER(tessedit_reject_block_percent, 45.00, + "%rej allowed before rej whole block", this->params()), + double_MEMBER(tessedit_reject_row_percent, 40.00, + "%rej allowed before rej whole row", this->params()), + double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00, + "Number of row rejects in whole word 
rejects" + "which prevents whole row rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true, + "Only rej partially rejected words in block rejection", + this->params()), + BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true, + "Only rej partially rejected words in row rejection", + this->params()), + BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false, + "Use word segmentation quality metric", this->params()), + INT_MEMBER(tessedit_preserve_min_wd_len, 2, + "Only preserve wds longer than this", this->params()), + BOOL_MEMBER(tessedit_row_rej_good_docs, true, + "Apply row rejection to good docs", this->params()), + double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1, + "rej good doc wd if more than this fraction rejected", + this->params()), + BOOL_MEMBER(tessedit_reject_bad_qual_wds, true, + "Reject all bad quality wds", this->params()), + BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats", + this->params()), + BOOL_MEMBER(tessedit_debug_quality_metrics, false, + "Output data to debug file", this->params()), + BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs", + this->params()), + double_MEMBER(quality_rowrej_pc, 1.1, + "good_quality_doc gte good char limit", this->params()), + BOOL_MEMBER(unlv_tilde_crunching, true, + "Mark v.bad words for tilde crunch", this->params()), + BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output", + this->params()), + BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?", + this->params()), + BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false, + "Take out ~^ early?", this->params()), + double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this", + this->params()), + BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()), + double_MEMBER(crunch_poor_garbage_cert, -9.0, + "crunch garbage cert lt this", 
this->params()), + double_MEMBER(crunch_poor_garbage_rate, 60, + "crunch garbage rating lt this", this->params()), + double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this", + this->params()), + BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage", + this->params()), + double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this", + this->params()), + double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this", + this->params()), + double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this", + this->params()), + double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this", + this->params()), + double_MEMBER(crunch_del_min_width, 3.0, + "Del if word width lt xht x this", this->params()), + double_MEMBER(crunch_del_high_word, 1.5, + "Del if word gt xht x this above bl", this->params()), + double_MEMBER(crunch_del_low_word, 0.5, + "Del if word gt xht x this below bl", this->params()), + double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this", + this->params()), + INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch", + this->params()), + INT_MEMBER(crunch_pot_indicators, 1, + "How many potential indicators needed", this->params()), + BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings", + this->params()), + BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring", + this->params()), + BOOL_MEMBER(crunch_leave_accept_strings, false, + "Dont pot crunch sensible strings", this->params()), + BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures", + this->params()), + INT_MEMBER(crunch_leave_lc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_leave_uc_strings, 4, + "Dont crunch words with long lower case strings", + this->params()), + INT_MEMBER(crunch_long_repetitions, 3, + "Crunch words 
with long repetitions", this->params()), + INT_MEMBER(crunch_debug, 0, "As it says", this->params()), + INT_MEMBER(fixsp_non_noise_limit, 1, + "How many non-noise blbs either side?", this->params()), + double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this", + this->params()), + BOOL_MEMBER(tessedit_prefer_joined_punct, false, + "Reward punctation joins", this->params()), + INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing", + this->params()), + INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug", + this->params()), + STRING_MEMBER(numeric_punctuation, ".,", + "Punct. chs expected WITHIN numbers", this->params()), + INT_MEMBER(x_ht_acceptance_tolerance, 8, + "Max allowed deviation of blob top outside of font data", + this->params()), + INT_MEMBER(x_ht_min_change, 8, + "Min change in xht before actually trying it", this->params()), + INT_MEMBER(superscript_debug, 0, + "Debug level for sub & superscript fixer", this->params()), + double_MEMBER( + superscript_worse_certainty, 2.0, + "How many times worse " + "certainty does a superscript position glyph need to be for " + "us to try classifying it as a char with a different " + "baseline?", + this->params()), + double_MEMBER( + superscript_bettered_certainty, 0.97, + "What reduction in " + "badness do we think sufficient to choose a superscript " + "over what we'd thought. For example, a value of 0.6 means " + "we want to reduce badness of certainty by at least 40%", + this->params()), + double_MEMBER(superscript_scaledown_ratio, 0.4, + "A superscript scaled down more than this is unbelievably " + "small. 
For example, 0.3 means we expect the font size to " + "be no smaller than 30% of the text line font size.", + this->params()), + double_MEMBER(subscript_max_y_top, 0.5, + "Maximum top of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a subscript.", + this->params()), + double_MEMBER(superscript_min_y_bottom, 0.3, + "Minimum bottom of a character measured as a multiple of " + "x-height above the baseline for us to reconsider whether " + "it's a superscript.", + this->params()), + BOOL_MEMBER(tessedit_write_block_separators, false, + "Write block separators in output", this->params()), + BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code", + this->params()), + BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file", + this->params()), + BOOL_MEMBER(tessedit_create_txt, true, "Write .txt output file", + this->params()), + BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file", + this->params()), + BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", + this->params()), + STRING_MEMBER(unrecognised_char, "|", + "Output char for unidentified blobs", this->params()), + INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), + INT_MEMBER(suspect_space_level, 100, + "Min suspect level for rejecting spaces", this->params()), + INT_MEMBER(suspect_short_words, 2, + "Dont Suspect dict wds longer than this", this->params()), + BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", + this->params()), + double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit", + this->params()), + double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", + this->params()), + BOOL_MEMBER(tessedit_minimal_rejection, false, + "Only reject tess failures", this->params()), + BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING", + this->params()), + BOOL_MEMBER(tessedit_word_for_word, false, + "Make 
output have exactly one word per WERD", this->params()), + BOOL_MEMBER(tessedit_zero_kelvin_rejection, false, + "Dont reject ANYTHING AT ALL", this->params()), + BOOL_MEMBER(tessedit_consistent_reps, true, + "Force all rep chars the same", this->params()), + INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm", + this->params()), + BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug", + this->params()), + BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips", + this->params()), + double_MEMBER(tessedit_lower_flip_hyphen, 1.5, + "Aspect ratio dot/hyphen test", this->params()), + double_MEMBER(tessedit_upper_flip_hyphen, 1.8, + "Aspect ratio dot/hyphen test", this->params()), + BOOL_MEMBER(rej_trust_doc_dawg, false, + "Use DOC dawg in 11l conf. detector", this->params()), + BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test", + this->params()), + BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check", + this->params()), + BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control", + this->params()), + BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check", + this->params()), + BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check", + this->params()), + double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85, + "if >this fract", this->params()), + INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit", + this->params()), + STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075", + "Allow NN to unrej", this->params()), + STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set", + this->params()), + INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this", + this->params()), + BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes", + this->params()), + 
INT_MEMBER(tessedit_page_number, -1, + "-1 -> All pages" + " , else specifc page to process", + this->params()), + BOOL_MEMBER(tessedit_write_images, false, + "Capture the image from the IPE", this->params()), + BOOL_MEMBER(interactive_display_mode, false, "Run interactively?", + this->params()), + STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), + BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", + this->params()), + INT_MEMBER(tessdata_manager_debug_level, 0, + "Debug level for" + " TessdataManager functions.", + this->params()), + STRING_MEMBER(tessedit_load_sublangs, "", + "List of languages to load with this one", this->params()), + BOOL_MEMBER(tessedit_use_primary_params_model, false, + "In multilingual mode use params model of the" + " primary language", + this->params()), + double_MEMBER(min_orientation_margin, 7.0, + "Min acceptable orientation margin", this->params()), + BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding", + this->params()), + BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model", + this->params()), + BOOL_MEMBER(poly_allow_detailed_fx, false, + "Allow feature extractors to see the original outline", + this->params()), + BOOL_INIT_MEMBER(tessedit_init_config_only, false, + "Only initialize with the config file. 
Useful if the " + "instance is not going to be used for OCR but say only " + "for layout analysis.", + this->params()), + BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector", + this->params()), + BOOL_MEMBER(textord_tabfind_vertical_text, true, + "Enable vertical detection", this->params()), + BOOL_MEMBER(textord_tabfind_force_vertical_text, false, + "Force using vertical text page mode", this->params()), + double_MEMBER( + textord_tabfind_vertical_text_ratio, 0.5, + "Fraction of textlines deemed vertical to use vertical page " + "mode", + this->params()), + double_MEMBER( + textord_tabfind_aligned_gap_fraction, 0.75, + "Fraction of height used as a minimum gap for aligned blobs.", + this->params()), + INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible", + this->params()), + BOOL_MEMBER(preserve_interword_spaces, false, + "Preserve multiple interword spaces", this->params()), + BOOL_MEMBER(include_page_breaks, FALSE, + "Include page separator string in output text after each " + "image/page.", + this->params()), + STRING_MEMBER(page_separator, "\f", + "Page separator (default is form feed control character)", + this->params()), - // The following parameters were deprecated and removed from their original - // locations. The parameters are temporarily kept here to give Tesseract - // users a chance to updated their [lang].traineddata and config files - // without introducing failures during Tesseract initialization. - // TODO(ocr-team): remove these parameters from the code once we are - // reasonably sure that Tesseract users have updated their data files. - // - // BEGIN DEPRECATED PARAMETERS - BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, - "find horizontal lines such as headers in vertical page mode", - this->params()), - INT_MEMBER(tessedit_ok_mode, 5, - "Acceptance decision algorithm", this->params()), - BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, "Load fixed length dawgs" - " (e.g. 
for non-space delimited languages)", - this->params()), - INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", - this->params()), - BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", - this->params()), - double_MEMBER(bestrate_pruning_factor, 2.0, "Multiplying factor of" - " current best rate to prune other hypotheses", - this->params()), - BOOL_MEMBER(permute_script_word, 0, - "Turn on word script consistency permuter", - this->params()), - BOOL_MEMBER(segment_segcost_rating, 0, - "incorporate segmentation cost in word rating?", - this->params()), - double_MEMBER(segment_reward_script, 0.95, - "Score multipler for script consistency within a word. " - "Being a 'reward' factor, it should be <= 1. " - "Smaller value implies bigger reward.", - this->params()), - BOOL_MEMBER(permute_fixed_length_dawg, 0, - "Turn on fixed-length phrasebook search permuter", - this->params()), - BOOL_MEMBER(permute_chartype_word, 0, - "Turn on character type (property) consistency permuter", - this->params()), - double_MEMBER(segment_reward_chartype, 0.97, - "Score multipler for char type consistency within a word. ", - this->params()), - double_MEMBER(segment_reward_ngram_best_choice, 0.99, - "Score multipler for ngram permuter's best choice" - " (only used in the Han script path).", - this->params()), - BOOL_MEMBER(ngram_permuter_activated, false, - "Activate character-level n-gram-based permuter", - this->params()), - BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", - this->params()), - INT_MEMBER(language_model_fixed_length_choices_depth, 3, - "Depth of blob choice lists to explore" - " when fixed length dawgs are on", - this->params()), - BOOL_MEMBER(use_new_state_cost, FALSE, - "use new state cost heuristics for segmentation state" - " evaluation", this->params()), - double_MEMBER(heuristic_segcost_rating_base, 1.25, - "base factor for adding segmentation cost into word rating." 
- "It's a multiplying factor, the larger the value above 1, " - "the bigger the effect of segmentation cost.", - this->params()), - double_MEMBER(heuristic_weight_rating, 1.0, - "weight associated with char rating in combined cost of" - "state", this->params()), - double_MEMBER(heuristic_weight_width, 1000.0, - "weight associated with width evidence in combined cost of" - " state", this->params()), - double_MEMBER(heuristic_weight_seamcut, 0.0, - "weight associated with seam cut in combined cost of state", - this->params()), - double_MEMBER(heuristic_max_char_wh_ratio, 2.0, - "max char width-to-height ratio allowed in segmentation", - this->params()), - BOOL_MEMBER(enable_new_segsearch, true, - "Enable new segmentation search path.", this->params()), - double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, - "Maximum character width-to-height ratio for" - " fixed-pitch fonts", - this->params()), - // END DEPRECATED PARAMETERS + // The following parameters were deprecated and removed from their + // original + // locations. The parameters are temporarily kept here to give Tesseract + // users a chance to updated their [lang].traineddata and config files + // without introducing failures during Tesseract initialization. + // TODO(ocr-team): remove these parameters from the code once we are + // reasonably sure that Tesseract users have updated their data files. + // + // BEGIN DEPRECATED PARAMETERS + BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true, + "find horizontal lines such as headers in vertical page mode", + this->params()), + INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm", + this->params()), + BOOL_INIT_MEMBER(load_fixed_length_dawgs, true, + "Load fixed length dawgs" + " (e.g. 
for non-space delimited languages)", + this->params()), + INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process", + this->params()), + BOOL_MEMBER(permute_debug, 0, "Debug char permutation process", + this->params()), + double_MEMBER(bestrate_pruning_factor, 2.0, + "Multiplying factor of" + " current best rate to prune other hypotheses", + this->params()), + BOOL_MEMBER(permute_script_word, 0, + "Turn on word script consistency permuter", this->params()), + BOOL_MEMBER(segment_segcost_rating, 0, + "incorporate segmentation cost in word rating?", + this->params()), + double_MEMBER(segment_reward_script, 0.95, + "Score multipler for script consistency within a word. " + "Being a 'reward' factor, it should be <= 1. " + "Smaller value implies bigger reward.", + this->params()), + BOOL_MEMBER(permute_fixed_length_dawg, 0, + "Turn on fixed-length phrasebook search permuter", + this->params()), + BOOL_MEMBER(permute_chartype_word, 0, + "Turn on character type (property) consistency permuter", + this->params()), + double_MEMBER(segment_reward_chartype, 0.97, + "Score multipler for char type consistency within a word. ", + this->params()), + double_MEMBER(segment_reward_ngram_best_choice, 0.99, + "Score multipler for ngram permuter's best choice" + " (only used in the Han script path).", + this->params()), + BOOL_MEMBER(ngram_permuter_activated, false, + "Activate character-level n-gram-based permuter", + this->params()), + BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter", + this->params()), + INT_MEMBER(language_model_fixed_length_choices_depth, 3, + "Depth of blob choice lists to explore" + " when fixed length dawgs are on", + this->params()), + BOOL_MEMBER(use_new_state_cost, FALSE, + "use new state cost heuristics for segmentation state" + " evaluation", + this->params()), + double_MEMBER(heuristic_segcost_rating_base, 1.25, + "base factor for adding segmentation cost into word rating." 
+ "It's a multiplying factor, the larger the value above 1, " + "the bigger the effect of segmentation cost.", + this->params()), + double_MEMBER(heuristic_weight_rating, 1.0, + "weight associated with char rating in combined cost of" + "state", + this->params()), + double_MEMBER(heuristic_weight_width, 1000.0, + "weight associated with width evidence in combined cost of" + " state", + this->params()), + double_MEMBER(heuristic_weight_seamcut, 0.0, + "weight associated with seam cut in combined cost of state", + this->params()), + double_MEMBER(heuristic_max_char_wh_ratio, 2.0, + "max char width-to-height ratio allowed in segmentation", + this->params()), + BOOL_MEMBER(enable_new_segsearch, true, + "Enable new segmentation search path.", this->params()), + double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0, + "Maximum character width-to-height ratio for" + " fixed-pitch fonts", + this->params()), + // END DEPRECATED PARAMETERS - backup_config_file_(NULL), - pix_binary_(NULL), - cube_binary_(NULL), - pix_grey_(NULL), - pix_thresholds_(NULL), - source_resolution_(0), - textord_(this), - right_to_left_(false), - scaled_color_(NULL), - scaled_factor_(-1), - deskew_(1.0f, 0.0f), - reskew_(1.0f, 0.0f), - most_recently_used_(this), - font_table_size_(0), + backup_config_file_(NULL), + pix_binary_(NULL), + cube_binary_(NULL), + pix_grey_(NULL), + pix_thresholds_(NULL), + source_resolution_(0), + textord_(this), + right_to_left_(false), + scaled_color_(NULL), + scaled_factor_(-1), + deskew_(1.0f, 0.0f), + reskew_(1.0f, 0.0f), + most_recently_used_(this), + font_table_size_(0), #ifndef ANDROID_BUILD - cube_cntxt_(NULL), - tess_cube_combiner_(NULL), + cube_cntxt_(NULL), + tess_cube_combiner_(NULL), #endif - equ_detect_(NULL) { + equ_detect_(NULL) { } Tesseract::~Tesseract() { diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index bd03fff642..d488fd30f3 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -283,8 +283,8 @@ class Tesseract 
: public Wordrec { int SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, Tesseract* osd_tess, OSResults* osr); void SetupWordScripts(BLOCK_LIST* blocks); - int AutoPageSeg(PageSegMode pageseg_mode, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, + int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, + TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, OSResults* osr); ColumnFinder* SetupPageSegAndDetectOrientation( bool single_column, bool osd, bool only_osd, @@ -328,8 +328,46 @@ class Tesseract : public Wordrec { WordRecognizer recognizer, WERD_RES** in_word, PointerVector* best_words); - void classify_word_and_language(WordRecognizer recognizer, - PAGE_RES_IT* pr_it, + // Moves good-looking "noise"/diacritics from the reject list to the main + // blob list on the current word. Returns true if anything was done, and + // sets make_next_word_fuzzy if blob(s) were added to the end of the word. + bool ReassignDiacritics(int pass, PAGE_RES_IT* pr_it, + bool* make_next_word_fuzzy); + // Attempts to put noise/diacritic outlines into the blobs that they overlap. + // Input: a set of noisy outlines that probably belong to the real_word. + // Output: outlines that overlapped blobs are set to NULL and put back into + // the word, either in the blobs or in the reject list. + void AssignDiacriticsToOverlappingBlobs( + const GenericVector& outlines, int pass, WERD* real_word, + PAGE_RES_IT* pr_it, GenericVector* word_wanted, + GenericVector* overlapped_any_blob, + GenericVector* target_blobs); + // Attempts to assign non-overlapping outlines to their nearest blobs or + // make new blobs out of them. 
+ void AssignDiacriticsToNewBlobs(const GenericVector& outlines, + int pass, WERD* real_word, PAGE_RES_IT* pr_it, + GenericVector* word_wanted, + GenericVector* target_blobs); + // Starting with ok_outlines set to indicate which outlines overlap the blob, + // chooses the optimal set (approximately) and returns true if any outlines + // are desired, in which case ok_outlines indicates which ones. + bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, + PAGE_RES_IT* pr_it, C_BLOB* blob, + const GenericVector& outlines, + int num_outlines, + GenericVector* ok_outlines); + // Classifies the given blob plus the outlines flagged by ok_outlines, undoes + // the inclusion of the outlines, and returns the certainty of the raw choice. + float ClassifyBlobPlusOutlines(const GenericVector& ok_outlines, + const GenericVector& outlines, + int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str); + // Classifies the given blob (part of word_data->word->word) as an individual + // word, using languages, chopper etc, returning only the certainty of the + // best raw choice, and undoing all the work done to fake out the word. + float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT* pr_it, C_BLOB* blob, + STRING* best_str, float* c2); + void classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, WordData* word_data); void classify_word_pass1(const WordData& word_data, WERD_RES** in_word, @@ -808,6 +846,24 @@ class Tesseract : public Wordrec { "Enable single word correction based on the dictionary."); INT_VAR_H(tessedit_bigram_debug, 0, "Amount of debug output for bigram " "correction."); + BOOL_VAR_H(enable_noise_removal, true, + "Remove and conditionally reassign small outlines when they" + " confuse layout analysis, determining diacritics vs noise"); + INT_VAR_H(debug_noise_removal, 0, "Debug reassignment of small outlines"); + // Worst (min) certainty, for which a diacritic is allowed to make the base + // character worse and still be included. 
+ double_VAR_H(noise_cert_basechar, -8.0, "Hingepoint for base char certainty"); + // Worst (min) certainty, for which a non-overlapping diacritic is allowed to + // make the base character worse and still be included. + double_VAR_H(noise_cert_disjoint, -2.5, "Hingepoint for disjoint certainty"); + // Worst (min) certainty, for which a diacritic is allowed to make a new + // stand-alone blob. + double_VAR_H(noise_cert_punc, -2.5, "Threshold for new punc char certainty"); + // Factor of certainty margin for adding diacritics to not count as worse. + double_VAR_H(noise_cert_factor, 0.375, + "Scaling on certainty diff from Hingepoint"); + INT_VAR_H(noise_maxperblob, 8, "Max diacritics to apply to a blob"); + INT_VAR_H(noise_maxperword, 16, "Max diacritics to apply to a word"); INT_VAR_H(debug_x_ht_level, 0, "Reestimate debug"); BOOL_VAR_H(debug_acceptable_wds, false, "Dump word pass/fail chk"); STRING_VAR_H(chs_leading_punct, "('`\"", "Leading punctuation"); diff --git a/ccstruct/blobbox.h b/ccstruct/blobbox.h index bd26e1be95..b09d82f4da 100644 --- a/ccstruct/blobbox.h +++ b/ccstruct/blobbox.h @@ -137,6 +137,9 @@ class BLOBNBOX:public ELIST_LINK cblob_ptr = srcblob; area = static_cast(srcblob->area()); } + ~BLOBNBOX() { + if (owns_cblob_) delete cblob_ptr; + } static BLOBNBOX* RealBlob(C_OUTLINE* outline) { C_BLOB* blob = new C_BLOB(outline); return new BLOBNBOX(blob); @@ -387,6 +390,7 @@ class BLOBNBOX:public ELIST_LINK void set_base_char_blob(BLOBNBOX* blob) { base_char_blob_ = blob; } + void set_owns_cblob(bool value) { owns_cblob_ = value; } bool UniquelyVertical() const { return vert_possible_ && !horz_possible_; @@ -450,6 +454,7 @@ class BLOBNBOX:public ELIST_LINK // construction time. void ConstructionInit() { cblob_ptr = NULL; + owns_cblob_ = false; area = 0; area_stroke_width_ = 0.0f; horz_stroke_width_ = 0.0f; @@ -525,6 +530,10 @@ class BLOBNBOX:public ELIST_LINK bool vert_possible_; // Could be part of vertical flow. 
bool leader_on_left_; // There is a leader to the left. bool leader_on_right_; // There is a leader to the right. + // Iff true, then the destructor should delete the cblob_ptr. + // TODO(rays) migrate all uses to correctly setting this flag instead of + // deleting the C_BLOB before deleting the BLOBNBOX. + bool owns_cblob_; }; class TO_ROW: public ELIST2_LINK diff --git a/ccstruct/ocrblock.cpp b/ccstruct/ocrblock.cpp index a328e03887..ad7893b05a 100644 --- a/ccstruct/ocrblock.cpp +++ b/ccstruct/ocrblock.cpp @@ -86,6 +86,18 @@ void BLOCK::rotate(const FCOORD& rotation) { box = *poly_block()->bounding_box(); } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX BLOCK::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the rows in the block. + ROW_IT it(const_cast(&rows)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} + /** * BLOCK::reflect_polygon_in_y_axis * diff --git a/ccstruct/ocrblock.h b/ccstruct/ocrblock.h index 207c1e8579..c93aaf8a4c 100644 --- a/ccstruct/ocrblock.h +++ b/ccstruct/ocrblock.h @@ -161,10 +161,14 @@ class BLOCK:public ELIST_LINK, public PDBLK median_size_.set_y(y); } - Pix* render_mask() { - return PDBLK::render_mask(re_rotation_); + Pix* render_mask(TBOX* mask_box) { + return PDBLK::render_mask(re_rotation_, mask_box); } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Reflects the polygon in the y-axis and recomputes the bounding_box. // Does nothing to any contained rows/words/blobs etc. 
void reflect_polygon_in_y_axis(); diff --git a/ccstruct/ocrrow.cpp b/ccstruct/ocrrow.cpp index a7ad6ba791..c6f919ca12 100644 --- a/ccstruct/ocrrow.cpp +++ b/ccstruct/ocrrow.cpp @@ -80,6 +80,17 @@ ROW::ROW( //constructor rmargin_ = 0; } +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX ROW::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box; + // This is a read-only iteration of the words in the row. + WERD_IT it(const_cast(&words)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + box += it.data()->restricted_bounding_box(upper_dots, lower_dots); + } + return box; +} /********************************************************************** * ROW::recalc_bounding_box diff --git a/ccstruct/ocrrow.h b/ccstruct/ocrrow.h index 1a23889279..45384b710f 100644 --- a/ccstruct/ocrrow.h +++ b/ccstruct/ocrrow.h @@ -85,6 +85,9 @@ class ROW:public ELIST_LINK TBOX bounding_box() const { //return bounding box return bound_box; } + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; void set_lmargin(inT16 lmargin) { lmargin_ = lmargin; diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 5304451929..9c1b13c5c3 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -1258,23 +1258,16 @@ int PAGE_RES_IT::cmp(const PAGE_RES_IT &other) const { return 0; } -// Inserts the new_word and a corresponding WERD_RES before the current -// position. The simple fields of the WERD_RES are copied from clone_res and -// the resulting WERD_RES is returned for further setup with best_choice etc. +// Inserts the new_word as a combination owned by a corresponding WERD_RES +// before the current position. The simple fields of the WERD_RES are copied +// from clone_res and the resulting WERD_RES is returned for further setup +// with best_choice etc. 
WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res, WERD* new_word) { - // Insert new_word into the ROW. - WERD_IT w_it(row()->row->word_list()); - for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { - WERD* word = w_it.data(); - if (word == word_res->word) - break; - } - ASSERT_HOST(!w_it.cycled_list()); - w_it.add_before_then_move(new_word); // Make a WERD_RES for the new_word. WERD_RES* new_res = new WERD_RES(new_word); new_res->CopySimpleFields(clone_res); + new_res->combination = true; // Insert into the appropriate place in the ROW_RES. WERD_RES_IT wr_it(&row()->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { @@ -1477,6 +1470,33 @@ void PAGE_RES_IT::DeleteCurrentWord() { ResetWordIterator(); } +// Makes the current word a fuzzy space if not already fuzzy. Updates +// corresponding part of combo if required. +void PAGE_RES_IT::MakeCurrentWordFuzzy() { + WERD* real_word = word_res->word; + if (!real_word->flag(W_FUZZY_SP) && !real_word->flag(W_FUZZY_NON)) { + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made word fuzzy at:"); + real_word->bounding_box().print(); + if (word_res->combination) { + // The next word should be the corresponding part of combo, but we have + // already stepped past it, so find it by search. 
+ WERD_RES_IT wr_it(&row()->word_res_list); + for (wr_it.mark_cycle_pt(); + !wr_it.cycled_list() && wr_it.data() != word_res; wr_it.forward()) { + } + wr_it.forward(); + ASSERT_HOST(wr_it.data()->part_of_combo); + real_word = wr_it.data()->word; + ASSERT_HOST(!real_word->flag(W_FUZZY_SP) && + !real_word->flag(W_FUZZY_NON)); + real_word->set_flag(W_FUZZY_SP, true); + tprintf("Made part of combo word fuzzy at:"); + real_word->bounding_box().print(); + } + } +} + /************************************************************************* * PAGE_RES_IT::restart_page * diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index 75798113d4..a6a8404275 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -708,6 +708,10 @@ class PAGE_RES_IT { // Deletes the current WERD_RES and its underlying WERD. void DeleteCurrentWord(); + // Makes the current word a fuzzy space if not already fuzzy. Updates + // corresponding part of combo if required. + void MakeCurrentWordFuzzy(); + WERD_RES *forward() { // Get next word. return internal_forward(false, false); } @@ -747,9 +751,9 @@ class PAGE_RES_IT { return next_block_res; } void rej_stat_word(); // for page/block/row + void ResetWordIterator(); private: - void ResetWordIterator(); WERD_RES *internal_forward(bool new_block, bool empty_ok); WERD_RES * prev_word_res; // previous word diff --git a/ccstruct/pdblock.cpp b/ccstruct/pdblock.cpp index 97365b53e7..cf3289f2e7 100644 --- a/ccstruct/pdblock.cpp +++ b/ccstruct/pdblock.cpp @@ -77,7 +77,6 @@ void PDBLK::set_sides( //set vertex lists right_it.add_list_before (right); } - /********************************************************************** * PDBLK::contains * @@ -126,7 +125,7 @@ void PDBLK::move( // reposition block // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. 
-Pix* PDBLK::render_mask(const FCOORD& rerotation) { +Pix* PDBLK::render_mask(const FCOORD& rerotation, TBOX* mask_box) { TBOX rotated_box(box); rotated_box.rotate(rerotation); Pix* pix = pixCreate(rotated_box.width(), rotated_box.height(), 1); @@ -163,6 +162,7 @@ Pix* PDBLK::render_mask(const FCOORD& rerotation) { pixRasterop(pix, 0, 0, rotated_box.width(), rotated_box.height(), PIX_SET, NULL, 0, 0); } + if (mask_box != NULL) *mask_box = rotated_box; return pix; } diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index 34f5518e3c..0dd0bf2ef8 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -89,7 +89,9 @@ class PDBLK // Returns a binary Pix mask with a 1 pixel for every pixel within the // block. Rotates the coordinate system by rerotation prior to rendering. - Pix* render_mask(const FCOORD& rerotation); + // If not NULL, mask_box is filled with the position box of the returned + // mask image. + Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box); #ifndef GRAPHICS_DISABLED ///draw histogram diff --git a/ccstruct/werd.cpp b/ccstruct/werd.cpp index 24c8a41b33..aaaee9cc23 100644 --- a/ccstruct/werd.cpp +++ b/ccstruct/werd.cpp @@ -160,23 +160,37 @@ WERD* WERD::ConstructFromSingleBlob(bool bol, bool eol, C_BLOB* blob) { * row being marked as FUZZY space. */ -TBOX WERD::bounding_box() { - TBOX box; // box being built - C_BLOB_IT rej_cblob_it = &rej_cblobs; // rejected blobs - - for (rej_cblob_it.mark_cycle_pt(); !rej_cblob_it.cycled_list(); - rej_cblob_it.forward()) { - box += rej_cblob_it.data()->bounding_box(); +TBOX WERD::bounding_box() const { return restricted_bounding_box(true, true); } + +// Returns the bounding box including the desired combination of upper and +// lower noise/diacritic elements. +TBOX WERD::restricted_bounding_box(bool upper_dots, bool lower_dots) const { + TBOX box = true_bounding_box(); + int bottom = box.bottom(); + int top = box.top(); + // This is a read-only iteration of the rejected blobs. 
+ C_BLOB_IT it(const_cast(&rej_cblobs)); + for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { + TBOX dot_box = it.data()->bounding_box(); + if ((upper_dots || dot_box.bottom() <= top) && + (lower_dots || dot_box.top() >= bottom)) { + box += dot_box; + } } + return box; +} - C_BLOB_IT it = &cblobs; // blobs of WERD +// Returns the bounding box of only the good blobs. +TBOX WERD::true_bounding_box() const { + TBOX box; // box being built + // This is a read-only iteration of the good blobs. + C_BLOB_IT it(const_cast(&cblobs)); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { box += it.data()->bounding_box(); } return box; } - /** * WERD::move * @@ -489,3 +503,101 @@ WERD* WERD::ConstructWerdWithNewBlobs(C_BLOB_LIST* all_blobs, } return new_werd; } + +// Removes noise from the word by moving small outlines to the rej_cblobs +// list, based on the size_threshold. +void WERD::CleanNoise(float size_threshold) { + C_BLOB_IT blob_it(&cblobs); + C_BLOB_IT rej_it(&rej_cblobs); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + C_BLOB* blob = blob_it.data(); + C_OUTLINE_IT ol_it(blob->out_list()); + for (ol_it.mark_cycle_pt(); !ol_it.cycled_list(); ol_it.forward()) { + C_OUTLINE* outline = ol_it.data(); + TBOX ol_box = outline->bounding_box(); + int ol_size = + ol_box.width() > ol_box.height() ? ol_box.width() : ol_box.height(); + if (ol_size < size_threshold) { + // This outline is too small. Move it to a separate blob in the + // reject blobs list. + C_BLOB* rej_blob = new C_BLOB(ol_it.extract()); + rej_it.add_after_then_move(rej_blob); + } + } + if (blob->out_list()->empty()) delete blob_it.extract(); + } +} + +// Extracts all the noise outlines and stuffs the pointers into the given +// vector of outlines. Afterwards, the outlines vector owns the pointers. 
+void WERD::GetNoiseOutlines(GenericVector<C_OUTLINE*>* outlines) {
+  C_BLOB_IT rej_it(&rej_cblobs);
+  for (rej_it.mark_cycle_pt(); !rej_it.empty(); rej_it.forward()) {
+    C_BLOB* blob = rej_it.extract();
+    C_OUTLINE_IT ol_it(blob->out_list());
+    outlines->push_back(ol_it.extract());
+    delete blob;
+  }
+}
+
+// Adds the selected outlines to the indicated real blobs, and puts the rest
+// back in rej_cblobs where they came from. Where the target_blobs entry is
+// NULL, a run of wanted outlines is put into a single new blob.
+// Ownership of the outlines is transferred back to the word. (Hence
+// GenericVector and not PointerVector.)
+// Returns true if any new blob was added to the start of the word, which
+// suggests that it might need joining to the word before it, and likewise
+// sets make_next_word_fuzzy true if any new blob was added to the end.
+bool WERD::AddSelectedOutlines(const GenericVector<bool>& wanted,
+                               const GenericVector<C_BLOB*>& target_blobs,
+                               const GenericVector<C_OUTLINE*>& outlines,
+                               bool* make_next_word_fuzzy) {
+  bool outline_added_to_start = false;
+  if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = false;
+  C_BLOB_IT rej_it(&rej_cblobs);
+  for (int i = 0; i < outlines.size(); ++i) {
+    C_OUTLINE* outline = outlines[i];
+    if (outline == NULL) continue;  // Already used it.
+    if (wanted[i]) {
+      C_BLOB* target_blob = target_blobs[i];
+      TBOX noise_box = outline->bounding_box();
+      if (target_blob == NULL) {
+        target_blob = new C_BLOB(outline);
+        // Need to find the insertion point.
+        C_BLOB_IT blob_it(&cblobs);
+        for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
+             blob_it.forward()) {
+          C_BLOB* blob = blob_it.data();
+          TBOX blob_box = blob->bounding_box();
+          if (blob_box.left() > noise_box.left()) {
+            if (blob_it.at_first() && !flag(W_FUZZY_SP) && !flag(W_FUZZY_NON)) {
+              // We might want to join this word to its predecessor.
+ outline_added_to_start = true; + } + blob_it.add_before_stay_put(target_blob); + break; + } + } + if (blob_it.cycled_list()) { + blob_it.add_to_end(target_blob); + if (make_next_word_fuzzy != NULL) *make_next_word_fuzzy = true; + } + // Add all consecutive wanted, but null-blob outlines to same blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + while (i + 1 < outlines.size() && wanted[i + 1] && + target_blobs[i + 1] == NULL) { + ++i; + ol_it.add_to_end(outlines[i]); + } + } else { + // Insert outline into this blob. + C_OUTLINE_IT ol_it(target_blob->out_list()); + ol_it.add_to_end(outline); + } + } else { + // Put back on noise list. + rej_it.add_to_end(new C_BLOB(outline)); + } + } + return outline_added_to_start; +} diff --git a/ccstruct/werd.h b/ccstruct/werd.h index 43ecb84b6e..f9a89fb5b5 100644 --- a/ccstruct/werd.h +++ b/ccstruct/werd.h @@ -114,7 +114,13 @@ class WERD : public ELIST2_LINK { script_id_ = id; } - TBOX bounding_box(); // compute bounding box + // Returns the (default) bounding box including all the dots. + TBOX bounding_box() const; // compute bounding box + // Returns the bounding box including the desired combination of upper and + // lower noise/diacritic elements. + TBOX restricted_bounding_box(bool upper_dots, bool lower_dots) const; + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const; const char *text() const { return correct.string(); } void set_text(const char *new_text) { correct = new_text; } @@ -155,6 +161,26 @@ class WERD : public ELIST2_LINK { void plot_rej_blobs(ScrollView *window); #endif // GRAPHICS_DISABLED + // Removes noise from the word by moving small outlines to the rej_cblobs + // list, based on the size_threshold. + void CleanNoise(float size_threshold); + + // Extracts all the noise outlines and stuffs the pointers into the given + // vector of outlines. Afterwards, the outlines vector owns the pointers. 
+  void GetNoiseOutlines(GenericVector<C_OUTLINE*> *outlines);
+  // Adds the selected outlines to the indicated real blobs, and puts the rest
+  // back in rej_cblobs where they came from. Where the target_blobs entry is
+  // NULL, a run of wanted outlines is put into a single new blob.
+  // Ownership of the outlines is transferred back to the word. (Hence
+  // GenericVector and not PointerVector.)
+  // Returns true if any new blob was added to the start of the word, which
+  // suggests that it might need joining to the word before it, and likewise
+  // sets make_next_word_fuzzy true if any new blob was added to the end.
+  bool AddSelectedOutlines(const GenericVector<bool> &wanted,
+                           const GenericVector<C_BLOB*> &target_blobs,
+                           const GenericVector<C_OUTLINE*> &outlines,
+                           bool *make_next_word_fuzzy);
+
  private:
   uinT8 blanks;       // no of blanks
   uinT8 dummy;        // padding
diff --git a/textord/colfind.cpp b/textord/colfind.cpp
index b9b10649af..41b3895602 100644
--- a/textord/colfind.cpp
+++ b/textord/colfind.cpp
@@ -286,22 +286,27 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block,
 // thresholds_pix is expected to be present iff grey_pix is present and
 // can be an integer factor reduction of the grey_pix. It represents the
 // thresholds that were used to create the binary_pix from the grey_pix.
+// If diacritic_blobs is non-null, then diacritics/noise blobs, that would
+// confuse layout analysis by causing textline overlap, are placed there,
+// with the expectation that they will be reassigned to words later and
+// noise/diacriticness determined via classification.
 // Returns -1 if the user hits the 'd' key in the blocks window while running
 // in debug mode, which requests a retry with more debug info.
-int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* input_block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { +int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, + int scaled_factor, TO_BLOCK* input_block, + Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, + TO_BLOCK_LIST* to_blocks) { pixOr(photo_mask_pix, photo_mask_pix, nontext_map_); stroke_width_->FindLeaderPartitions(input_block, &part_grid_); stroke_width_->RemoveLineResidue(&big_parts_); FindInitialTabVectors(NULL, min_gutter_width_, tabfind_aligned_gap_fraction_, input_block); SetBlockRuleEdges(input_block); - stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_, - denorm_, cjk_script_, &projection_, - &part_grid_, &big_parts_); + stroke_width_->GradeBlobsIntoPartitions( + rerotate_, input_block, nontext_map_, denorm_, cjk_script_, &projection_, + diacritic_blobs, &part_grid_, &big_parts_); if (!PSM_SPARSE(pageseg_mode)) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, input_block, this, &part_grid_, &big_parts_); @@ -1134,9 +1139,13 @@ void ColumnFinder::GridMergePartitions() { neighbour->Print(); } rsearch.RemoveBBox(); - gsearch.RepositionIterator(); + if (!modified_box) { + // We are going to modify part, so remove it and re-insert it after. + gsearch.RemoveBBox(); + rsearch.RepositionIterator(); + modified_box = true; + } part->Absorb(neighbour, WidthCB()); - modified_box = true; } else if (debug) { tprintf("Neighbour failed hgap test\n"); } @@ -1151,7 +1160,6 @@ void ColumnFinder::GridMergePartitions() { // or it will never be found by a full search. // Because the box has changed, it has to be removed first, otherwise // add_sorted may fail to keep a single copy of the pointer. 
- gsearch.RemoveBBox(); part_grid_.InsertBBox(true, true, part); gsearch.RepositionIterator(); } diff --git a/textord/colfind.h b/textord/colfind.h index 04ad1684de..eedd4c407e 100644 --- a/textord/colfind.h +++ b/textord/colfind.h @@ -155,13 +155,15 @@ class ColumnFinder : public TabFind { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. + // Small blobs that confuse the segmentation into lines are placed into + // diacritic_blobs, with the intention that they be put into the most + // appropriate word after the rest of layout analysis. // Returns -1 if the user hits the 'd' key in the blocks window while running // in debug mode, which requests a retry with more debug info. - int FindBlocks(PageSegMode pageseg_mode, - Pix* scaled_color, int scaled_factor, - TO_BLOCK* block, Pix* photo_mask_pix, - Pix* thresholds_pix, Pix* grey_pix, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, + TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix, + Pix* grey_pix, BLOCK_LIST* blocks, + BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks); // Get the rotation required to deskew, and its inverse rotation. void GetDeskewVectors(FCOORD* deskew, FCOORD* reskew); diff --git a/textord/colpartition.cpp b/textord/colpartition.cpp index e9ce568aa3..565c660bb2 100644 --- a/textord/colpartition.cpp +++ b/textord/colpartition.cpp @@ -297,6 +297,25 @@ void ColPartition::DisownBoxesNoAssert() { } } +// NULLs the owner of the blobs in this partition that are owned by this +// partition and not leader blobs, removing them from the boxes_ list, thus +// turning this partition back to a leader partition if it contains a leader, +// or otherwise leaving it empty. Returns true if any boxes remain. 
+bool ColPartition::ReleaseNonLeaderBoxes() { + BLOBNBOX_C_IT bb_it(&boxes_); + for (bb_it.mark_cycle_pt(); !bb_it.cycled_list(); bb_it.forward()) { + BLOBNBOX* bblob = bb_it.data(); + if (bblob->flow() != BTFT_LEADER) { + if (bblob->owner() == this) bblob->set_owner(NULL); + bb_it.extract(); + } + } + if (bb_it.empty()) return false; + flow_ = BTFT_LEADER; + ComputeLimits(); + return true; +} + // Delete the boxes that this partition owns. void ColPartition::DeleteBoxes() { // Although the boxes_ list is a C_LIST, in some cases it owns the @@ -831,6 +850,10 @@ ColPartition* ColPartition::SplitAt(int split_x) { bbox->set_owner(split_part); } } + if (it.empty()) { + // Possible if split-x passes through the first blob. + it.add_list_after(&split_part->boxes_); + } ASSERT_HOST(!it.empty()); if (split_part->IsEmpty()) { // Split part ended up with nothing. Possible if split_x passes @@ -1130,6 +1153,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { if (best_end != NULL && best_end->total_cost() < blob_count) { // Good enough. Call it a leader. 
result = true; + bool modified_blob_list = false; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* blob = it.data(); TBOX box = blob->bounding_box(); @@ -1139,6 +1163,7 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { blob->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; continue; } } @@ -1147,12 +1172,14 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { it.data_relative(-1)->bounding_box().right(); if (blob->bounding_box().width() + gap > max_step) { it.extract(); + modified_blob_list = true; break; } } blob->set_region_type(BRT_TEXT); blob->set_flow(BTFT_LEADER); } + if (modified_blob_list) ComputeLimits(); blob_type_ = BRT_TEXT; flow_ = BTFT_LEADER; } else if (textord_debug_tabfind) { diff --git a/textord/colpartition.h b/textord/colpartition.h index 7f6cd64328..1b35d48545 100644 --- a/textord/colpartition.h +++ b/textord/colpartition.h @@ -481,6 +481,11 @@ class ColPartition : public ELIST2_LINK { // Any blobs that are not owned by this partition get to keep their owner // without an assert failure. void DisownBoxesNoAssert(); + // NULLs the owner of the blobs in this partition that are owned by this + // partition and not leader blobs, removing them from the boxes_ list, thus + // turning this partition back to a leader partition if it contains a leader, + // or otherwise leaving it empty. Returns true if any boxes remain. + bool ReleaseNonLeaderBoxes(); // Delete the boxes that this partition owns. void DeleteBoxes(); diff --git a/textord/colpartitiongrid.cpp b/textord/colpartitiongrid.cpp index 6cd8f31c93..800cbcb3c9 100644 --- a/textord/colpartitiongrid.cpp +++ b/textord/colpartitiongrid.cpp @@ -324,6 +324,40 @@ static bool TestCompatibleCandidates(const ColPartition& part, bool debug, return true; } +// Computes and returns the total overlap of all partitions in the grid. 
+// If overlap_grid is non-null, it is filled with a grid that holds empty +// partitions representing the union of all overlapped partitions. +int ColPartitionGrid::ComputeTotalOverlap(ColPartitionGrid** overlap_grid) { + int total_overlap = 0; + // Iterate the ColPartitions in the grid. + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + ColPartition_CLIST neighbors; + const TBOX& part_box = part->bounding_box(); + FindOverlappingPartitions(part_box, part, &neighbors); + ColPartition_C_IT n_it(&neighbors); + bool any_part_overlap = false; + for (n_it.mark_cycle_pt(); !n_it.cycled_list(); n_it.forward()) { + const TBOX& n_box = n_it.data()->bounding_box(); + int overlap = n_box.intersection(part_box).area(); + if (overlap > 0 && overlap_grid != NULL) { + if (*overlap_grid == NULL) { + *overlap_grid = new ColPartitionGrid(gridsize(), bleft(), tright()); + } + (*overlap_grid)->InsertBBox(true, true, n_it.data()->ShallowCopy()); + if (!any_part_overlap) { + (*overlap_grid)->InsertBBox(true, true, part->ShallowCopy()); + } + } + any_part_overlap = true; + total_overlap += overlap; + } + } + return total_overlap; +} + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. 
@@ -901,6 +935,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { while ((part = gsearch.NextFullSearch()) != NULL) { BlobRegionType blob_type = part->blob_type(); BlobTextFlowType flow = part->flow(); + bool any_blobs_moved = false; if (blob_type == BRT_POLYIMAGE || blob_type == BRT_RECTIMAGE) { BLOBNBOX_C_IT blob_it(part->boxes()); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { @@ -918,6 +953,7 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { ASSERT_HOST(blob->cblob()->area() != 0); blob->set_owner(NULL); blob_it.extract(); + any_blobs_moved = true; } else { blob->set_region_type(blob_type); if (blob->flow() != BTFT_LEADER) @@ -938,6 +974,11 @@ void ColPartitionGrid::ReTypeBlobs(BLOBNBOX_LIST* im_blobs) { delete blob; } } + } else if (any_blobs_moved) { + gsearch.RemoveBBox(); + part->ComputeLimits(); + InsertBBox(true, true, part); + gsearch.RepositionIterator(); } } } @@ -1048,6 +1089,24 @@ void ColPartitionGrid::DeleteUnknownParts(TO_BLOCK* block) { block->DeleteUnownedNoise(); } +// Deletes all the partitions in the grid that are NOT of flow type BTFT_LEADER. +void ColPartitionGrid::DeleteNonLeaderParts() { + ColPartitionGridSearch gsearch(this); + gsearch.StartFullSearch(); + ColPartition* part; + while ((part = gsearch.NextFullSearch()) != NULL) { + if (part->flow() != BTFT_LEADER) { + gsearch.RemoveBBox(); + if (part->ReleaseNonLeaderBoxes()) { + InsertBBox(true, true, part); + gsearch.RepositionIterator(); + } else { + delete part; + } + } + } +} + // Finds and marks text partitions that represent figure captions. 
void ColPartitionGrid::FindFigureCaptions() { // For each image region find its best candidate text caption region, diff --git a/textord/colpartitiongrid.h b/textord/colpartitiongrid.h index 40946e5746..94e7da2c43 100644 --- a/textord/colpartitiongrid.h +++ b/textord/colpartitiongrid.h @@ -63,6 +63,11 @@ class ColPartitionGrid : public BBGrid* confirm_cb, ColPartition* part); + // Computes and returns the total overlap of all partitions in the grid. + // If overlap_grid is non-null, it is filled with a grid that holds empty + // partitions representing the union of all overlapped partitions. + int ComputeTotalOverlap(ColPartitionGrid** overlap_grid); + // Finds all the ColPartitions in the grid that overlap with the given // box and returns them SortByBoxLeft(ed) and uniqued in the given list. // Any partition equal to not_this (may be NULL) is excluded. @@ -165,6 +170,10 @@ class ColPartitionGrid : public BBGridConstructProjection(block, rerotation, nontext_map_); if (textord_tabfind_show_strokewidths) { ScrollView* line_blobs_win = MakeWindow(0, 0, "Initial textline Blobs"); @@ -375,7 +379,19 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, // Clear and re Insert to take advantage of the removed diacritics. Clear(); InsertBlobs(block); - FindInitialPartitions(rerotation, block, part_grid, big_parts); + FCOORD skew; + FindTextlineFlowDirection(true); + PartitionFindResult r = FindInitialPartitions( + rerotation, true, block, diacritic_blobs, part_grid, big_parts, &skew); + if (r == PFR_NOISE) { + tprintf("Detected %d diacritics\n", diacritic_blobs->length()); + // Noise was found, and removed. 
+ Clear(); + InsertBlobs(block); + FindTextlineFlowDirection(true); + r = FindInitialPartitions(rerotation, false, block, diacritic_blobs, + part_grid, big_parts, &skew); + } nontext_map_ = NULL; projection_ = NULL; denorm_ = NULL; @@ -1220,10 +1236,17 @@ void StrokeWidth::SmoothNeighbourTypes(BLOBNBOX* blob, bool reset_all) { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. -void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts) { +// If find_problems is true, detects possible noise pollution by the amount +// of partition overlap that is created by the diacritics. If excessive, the +// noise is separated out into diacritic blobs, and PFR_NOISE is returned. +// [TODO(rays): if the partition overlap is caused by heavy skew, deskews +// the components, saves the skew_angle and returns PFR_SKEW.] If the return +// is not PFR_OK, the job is incomplete, and FindInitialPartitions must be +// called again after cleaning up the partly done work. +PartitionFindResult StrokeWidth::FindInitialPartitions( + const FCOORD& rerotation, bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, FCOORD* skew_angle) { FindVerticalTextChains(part_grid); FindHorizontalTextChains(part_grid); if (textord_tabfind_show_strokewidths) { @@ -1231,6 +1254,10 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, part_grid->DisplayBoxes(chains_win_); projection_->DisplayProjection(); } + if (find_problems) { + // TODO(rays) Do something to find skew, set skew_angle and return if there + // is some. 
+ } part_grid->SplitOverlappingPartitions(big_parts); EasyMerges(part_grid); RemoveLargeUnusedBlobs(block, part_grid, big_parts); @@ -1239,8 +1266,14 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, rerotation)); while (part_grid->GridSmoothNeighbours(BTFT_NEIGHBOURS, nontext_map_, grid_box, rerotation)); + int pre_overlap = part_grid->ComputeTotalOverlap(NULL); TestDiacritics(part_grid, block); MergeDiacritics(block, part_grid); + if (find_problems && diacritic_blobs != NULL && + DetectAndRemoveNoise(pre_overlap, grid_box, block, part_grid, + diacritic_blobs)) { + return PFR_NOISE; + } if (textord_tabfind_show_strokewidths) { textlines_win_ = MakeWindow(400, 400, "GoodTextline blobs"); part_grid->DisplayBoxes(textlines_win_); @@ -1260,6 +1293,57 @@ void StrokeWidth::FindInitialPartitions(const FCOORD& rerotation, smoothed_win_ = MakeWindow(800, 400, "Smoothed blobs"); part_grid->DisplayBoxes(smoothed_win_); } + return PFR_OK; +} + +// Detects noise by a significant increase in partition overlap from +// pre_overlap to now, and removes noise from the union of all the overlapping +// partitions, placing the blobs in diacritic_blobs. Returns true if any noise +// was found and removed. +bool StrokeWidth::DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, + ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs) { + ColPartitionGrid* noise_grid = NULL; + int post_overlap = part_grid->ComputeTotalOverlap(&noise_grid); + if (pre_overlap == 0) pre_overlap = 1; + BLOBNBOX_IT diacritic_it(diacritic_blobs); + if (noise_grid != NULL) { + if (post_overlap > pre_overlap * kNoiseOverlapGrowthFactor && + post_overlap > grid_box.area() * kNoiseOverlapAreaFactor) { + // This is noisy enough to fix. 
+ if (textord_tabfind_show_strokewidths) { + ScrollView* noise_win = MakeWindow(1000, 500, "Noise Areas"); + noise_grid->DisplayBoxes(noise_win); + } + part_grid->DeleteNonLeaderParts(); + BLOBNBOX_IT blob_it(&block->noise_blobs); + ColPartitionGridSearch rsearch(noise_grid); + for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { + BLOBNBOX* blob = blob_it.data(); + blob->ClearNeighbours(); + if (!blob->IsDiacritic() || blob->owner() != NULL) + continue; // Not a noise candidate. + TBOX blob_box(blob->bounding_box()); + TBOX search_box(blob->bounding_box()); + search_box.pad(gridsize(), gridsize()); + rsearch.StartRectSearch(search_box); + ColPartition* part = rsearch.NextRectSearch(); + if (part != NULL) { + // Consider blob as possible noise. + blob->set_owns_cblob(true); + blob->compute_bounding_box(); + diacritic_it.add_after_then_move(blob_it.extract()); + } + } + noise_grid->DeleteParts(); + delete noise_grid; + return true; + } + noise_grid->DeleteParts(); + delete noise_grid; + } + return false; } // Helper verifies that blob's neighbour in direction dir is good to add to a diff --git a/textord/strokewidth.h b/textord/strokewidth.h index 5d649b5708..12cb3c91f6 100644 --- a/textord/strokewidth.h +++ b/textord/strokewidth.h @@ -41,6 +41,14 @@ enum LeftOrRight { LR_RIGHT }; +// Return value from FindInitialPartitions indicates detection of severe +// skew or noise. +enum PartitionFindResult { + PFR_OK, // Everything is OK. + PFR_SKEW, // Skew was detected and rotated. + PFR_NOISE // Noise was detected and removed. +}; + /** * The StrokeWidth class holds all the normal and large blobs. * It is used to find good large blobs and move them to the normal blobs @@ -110,12 +118,10 @@ class StrokeWidth : public BlobGrid { // part_grid is the output grid of textline partitions. // Large blobs that cause overlap are put in separate partitions and added // to the big_parts list. 
- void GradeBlobsIntoPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - Pix* nontext_pix, - const DENORM* denorm, - bool cjk_script, - TextlineProjection* projection, + void GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block, + Pix* nontext_pix, const DENORM* denorm, + bool cjk_script, TextlineProjection* projection, + BLOBNBOX_LIST* diacritic_blobs, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts); @@ -205,10 +211,26 @@ class StrokeWidth : public BlobGrid { // minimize overlap and smoothes the types with neighbours and the color // image if provided. rerotation is used to rotate the coordinate space // back to the nontext_map_ image. - void FindInitialPartitions(const FCOORD& rerotation, - TO_BLOCK* block, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts); + // If find_problems is true, detects possible noise pollution by the amount + // of partition overlap that is created by the diacritics. If excessive, the + // noise is separated out into diacritic blobs, and PFR_NOISE is returned. + // [TODO(rays): if the partition overlap is caused by heavy skew, deskews + // the components, saves the skew_angle and returns PFR_SKEW.] If the return + // is not PFR_OK, the job is incomplete, and FindInitialPartitions must be + // called again after cleaning up the partly done work. + PartitionFindResult FindInitialPartitions(const FCOORD& rerotation, + bool find_problems, TO_BLOCK* block, + BLOBNBOX_LIST* diacritic_blobs, + ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts, + FCOORD* skew_angle); + // Detects noise by a significant increase in partition overlap from + // pre_overlap to now, and removes noise from the union of all the overlapping + // partitions, placing the blobs in diacritic_blobs. Returns true if any noise + // was found and removed. 
+ bool DetectAndRemoveNoise(int pre_overlap, const TBOX& grid_box, + TO_BLOCK* block, ColPartitionGrid* part_grid, + BLOBNBOX_LIST* diacritic_blobs); // Finds vertical chains of text-like blobs and puts them in ColPartitions. void FindVerticalTextChains(ColPartitionGrid* part_grid); // Finds horizontal chains of text-like blobs and puts them in ColPartitions. diff --git a/textord/tablefind.cpp b/textord/tablefind.cpp index 888fe145f5..2e38bada0b 100644 --- a/textord/tablefind.cpp +++ b/textord/tablefind.cpp @@ -974,12 +974,12 @@ bool TableFinder::HasLeaderAdjacent(const ColPartition& part) { hsearch.StartSideSearch(x, bottom, top); ColPartition* leader = NULL; while ((leader = hsearch.NextSideSearch(right_to_left)) != NULL) { - // This should not happen, they are in different grids. - ASSERT_HOST(&part != leader); // The leader could be a horizontal ruling in the grid. // Make sure it is actually a leader. if (leader->flow() != BTFT_LEADER) continue; + // This should not happen, they are in different grids. + ASSERT_HOST(&part != leader); // Make sure the leader shares a page column with the partition, // otherwise we are spreading across columns. if (!part.IsInSameColumnAs(*leader)) diff --git a/textord/textord.cpp b/textord/textord.cpp index cf2fc04fe3..6156e45b3b 100644 --- a/textord/textord.cpp +++ b/textord/textord.cpp @@ -268,7 +268,7 @@ Textord::~Textord() { void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, int width, int height, Pix* binary_pix, Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, + bool use_box_bottoms, BLOBNBOX_LIST* diacritic_blobs, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks) { page_tr_.set_x(width); page_tr_.set_y(height); @@ -340,9 +340,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, make_single_word(pageseg_mode == PSM_SINGLE_CHAR, to_block->get_rows(), to_block->block->row_list()); } - cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); // Remove empties. 
- + cleanup_blocks(PSM_WORD_FIND_ENABLED(pageseg_mode), blocks); + TransferDiacriticsToBlockGroups(diacritic_blobs, blocks); // Compute the margins for each row in the block, to be used later for // paragraph detection. BLOCK_IT b_it(blocks); diff --git a/textord/textord.h b/textord/textord.h index b99541efce..cc9cb1d341 100644 --- a/textord/textord.h +++ b/textord/textord.h @@ -22,6 +22,7 @@ #define TESSERACT_TEXTORD_TEXTORD_H__ #include "ccstruct.h" +#include "bbgrid.h" #include "blobbox.h" #include "gap_map.h" #include "publictypes.h" // For PageSegMode. @@ -35,6 +36,35 @@ class ScrollView; namespace tesseract { +// A simple class that can be used by BBGrid to hold a word and an expanded +// bounding box that makes it easy to find words to put diacritics. +class WordWithBox { + public: + WordWithBox() : word_(NULL) {} + explicit WordWithBox(WERD *word) + : word_(word), bounding_box_(word->bounding_box()) { + int height = bounding_box_.height(); + bounding_box_.pad(height, height); + } + + const TBOX &bounding_box() const { return bounding_box_; } + // Returns the bounding box of only the good blobs. + TBOX true_bounding_box() const { return word_->true_bounding_box(); } + C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); } + const WERD *word() const { return word_; } + + private: + // Borrowed pointer to a real word somewhere that must outlive this class. + WERD *word_; + // Cached expanded bounding box of the word, padded all round by its height. + TBOX bounding_box_; +}; + +// Make it usable by BBGrid. +CLISTIZEH(WordWithBox) +typedef BBGrid WordGrid; +typedef GridSearch WordSearch; + class Textord { public: explicit Textord(CCStruct* ccstruct); @@ -47,11 +77,13 @@ class Textord { // thresholds_pix is expected to be present iff grey_pix is present and // can be an integer factor reduction of the grey_pix. It represents the // thresholds that were used to create the binary_pix from the grey_pix. 
- void TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, - int width, int height, Pix* binary_pix, - Pix* thresholds_pix, Pix* grey_pix, - bool use_box_bottoms, - BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks); + // diacritic_blobs contain small confusing components that should be added + // to the appropriate word(s) in case they are really diacritics. + void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, + int height, Pix *binary_pix, Pix *thresholds_pix, + Pix *grey_pix, bool use_box_bottoms, + BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, + TO_BLOCK_LIST *to_blocks); // If we were supposed to return only a single textline, and there is more // than one, clean up and leave only the best. @@ -212,6 +244,17 @@ class Textord { // Remove outlines that are a tiny fraction in either width or height // of the word height. void clean_small_noise_from_words(ROW *row); + // Groups blocks by rotation, then, for each group, makes a WordGrid and calls + // TransferDiacriticsToWords to copy the diacritic blobs to the most + // appropriate words in the group of blocks. Source blobs are not touched. + void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks); + // Places a copy of blobs that are near a word (after applying rotation to the + // blob) in the most appropriate word, unless there is doubt, in which case a + // blob can end up in two words. Source blobs are not touched. 
+ void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs, + const FCOORD &rotation, WordGrid *word_grid); + public: // makerow.cpp /////////////////////////////////////////// BOOL_VAR_H(textord_single_height_mode, false, diff --git a/textord/topitch.cpp b/textord/topitch.cpp index 3136a9417e..e918f14c36 100644 --- a/textord/topitch.cpp +++ b/textord/topitch.cpp @@ -283,12 +283,13 @@ void fix_row_pitch(TO_ROW *bad_row, // row to fix bad_row->space_threshold = (bad_row->min_space + bad_row->max_nonspace) / 2; bad_row->space_size = bad_row->fixed_pitch; - if (bad_row->char_cells.empty ()) + if (bad_row->char_cells.empty() && !bad_row->blob_list()->empty()) { tune_row_pitch (bad_row, &bad_row->projection, bad_row->projection_left, bad_row->projection_right, (bad_row->fixed_pitch + bad_row->max_nonspace * 3) / 4, bad_row->fixed_pitch, sp_sd, mid_cuts, &bad_row->char_cells, FALSE); + } } else if (bad_row->pitch_decision == PITCH_CORR_PROP || bad_row->pitch_decision == PITCH_DEF_PROP) { @@ -1279,13 +1280,13 @@ float tune_row_pitch2( //find fp cells best_sp_sd = initial_pitch; - if (textord_disable_pitch_test) { + best_pitch = static_cast(initial_pitch); + if (textord_disable_pitch_test || best_pitch <= textord_pitch_range) { return initial_pitch; } sum_proj = new STATS[textord_pitch_range * 2 + 1]; if (sum_proj == NULL) return initial_pitch; - best_pitch = (inT32) initial_pitch; for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) @@ -1293,12 +1294,12 @@ float tune_row_pitch2( //find fp cells best_pitch + pitch_delta + 1); for (pixel = projection_left; pixel <= projection_right; pixel++) { - for (pitch_delta = -textord_pitch_range; - pitch_delta <= textord_pitch_range; pitch_delta++) - sum_proj[textord_pitch_range + - pitch_delta].add ((pixel - projection_left) % (best_pitch + - pitch_delta), - projection->pile_count (pixel)); + for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; + pitch_delta++) { + 
sum_proj[textord_pitch_range + pitch_delta].add( + (pixel - projection_left) % (best_pitch + pitch_delta), + projection->pile_count(pixel)); + } } best_count = sum_proj[textord_pitch_range].pile_count (0); best_delta = 0; @@ -1427,7 +1428,7 @@ float compute_pitch_sd( //find fp cells if (blob_it.empty ()) return space_size * 10; #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { blob_box = blob_it.data ()->bounding_box (); projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); @@ -1476,7 +1477,7 @@ float compute_pitch_sd( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); @@ -1566,7 +1567,7 @@ float compute_pitch_sd2( //find fp cells return initial_pitch * 10; } #ifndef GRAPHICS_DISABLED - if (testing_on && to_win > 0) { + if (testing_on && to_win != NULL) { projection->plot (to_win, projection_left, row->intercept (), 1.0f, -1.0f, ScrollView::CORAL); } @@ -1602,7 +1603,7 @@ float compute_pitch_sd2( //find fp cells tprintf ("\n"); } #ifndef GRAPHICS_DISABLED - if (textord_show_fixed_cuts && blob_count > 0 && to_win > 0) + if (textord_show_fixed_cuts && blob_count > 0 && to_win != NULL) plot_fp_cells2(to_win, ScrollView::GOLDENROD, row, &seg_list); #endif seg_it.set_to_list (&seg_list); diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp index eb229eaa1a..e9e59261da 100644 --- a/textord/tordmain.cpp +++ b/textord/tordmain.cpp @@ -38,13 +38,18 @@ #include "allheaders.h" -const ERRCODE BLOCKLESS_BLOBS = "Warning:some blobs assigned to no block"; +// Gridsize for word grid when reassigning diacritics to words. Not critical. 
+const int kWordGridSize = 50; #undef EXTERN #define EXTERN #define MAX_NEAREST_DIST 600 //for block skew stats +namespace tesseract { + +CLISTIZE(WordWithBox) + /********************************************************************** * SetBlobStrokeWidth * @@ -143,7 +148,6 @@ void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob) { } } - /********************************************************************** * assign_blobs_to_blocks2 * @@ -193,7 +197,6 @@ void assign_blobs_to_blocks2(Pix* pix, } } -namespace tesseract { /********************************************************************** * find_components * @@ -400,7 +403,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) { * Delete empty blocks, rows from the page. **********************************************************************/ -void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { +void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST* blocks) { BLOCK_IT block_it = blocks; //iterator ROW_IT row_it; //row iterator @@ -420,18 +423,18 @@ void Textord::cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks) { if (clean_noise) { row_it.set_to_list(block->row_list()); for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); ++num_rows_all; - clean_small_noise_from_words(row_it.data()); - if ((textord_noise_rejrows && !row_it.data()->word_list()->empty() && - clean_noise_from_row(row_it.data())) || - row_it.data()->word_list()->empty()) { + clean_small_noise_from_words(row); + if ((textord_noise_rejrows && !row->word_list()->empty() && + clean_noise_from_row(row)) || + row->word_list()->empty()) { delete row_it.extract(); // lose empty row. 
} else { if (textord_noise_rejwords) clean_noise_from_words(row_it.data()); if (textord_blshift_maxshift >= 0) - tweak_row_baseline(row_it.data(), - textord_blshift_maxshift, + tweak_row_baseline(row, textord_blshift_maxshift, textord_blshift_xfraction); ++num_rows; } @@ -640,16 +643,16 @@ void Textord::clean_noise_from_words( //remove empties && (!word_it.at_first () || !blob_it.at_first ())) dot_count += 2; } - if (dot_count > 2) { + if (dot_count > 2 && !word->flag(W_REP_CHAR)) { if (dot_count > norm_count * textord_noise_normratio * 2) word_dud[word_index] = 2; else if (dot_count > norm_count * textord_noise_normratio) word_dud[word_index] = 1; else word_dud[word_index] = 0; - } - else + } else { word_dud[word_index] = 0; + } if (word_dud[word_index] == 2) dud_words++; else @@ -661,11 +664,11 @@ void Textord::clean_noise_from_words( //remove empties for (word_it.mark_cycle_pt (); !word_it.cycled_list (); word_it.forward ()) { if (word_dud[word_index] == 2 || (word_dud[word_index] == 1 && dud_words > ok_words)) { - word = word_it.data (); //current word - //rejected blobs - blob_it.set_to_list (word->rej_cblob_list ()); - //move from blobs - blob_it.add_list_after (word->cblob_list ()); + word = word_it.data(); // Current word. + // Previously we threw away the entire word. + // Now just aggressively throw all small blobs into the reject list, where + // the classifier can decide whether they are actually needed. + word->CleanNoise(textord_noise_sizelimit * row->x_height()); } word_index++; } @@ -705,6 +708,176 @@ void Textord::clean_small_noise_from_words(ROW *row) { } } } + +// Local struct to hold a group of blocks. +struct BlockGroup { + BlockGroup() : rotation(1.0f, 0.0f), angle(0.0f), min_xheight(1.0f) {} + explicit BlockGroup(BLOCK* block) + : bounding_box(block->bounding_box()), + rotation(block->re_rotation()), + angle(block->re_rotation().angle()), + min_xheight(block->x_height()) { + blocks.push_back(block); + } + // Union of block bounding boxes. 
+ TBOX bounding_box; + // Common rotation of the blocks. + FCOORD rotation; + // Angle of rotation. + float angle; + // Min xheight of the blocks. + float min_xheight; + // Collection of borrowed pointers to the blocks in the group. + GenericVector blocks; +}; + +// Groups blocks by rotation, then, for each group, makes a WordGrid and calls +// TransferDiacriticsToWords to copy the diacritic blobs to the most +// appropriate words in the group of blocks. Source blobs are not touched. +void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, + BLOCK_LIST* blocks) { + // Angle difference larger than this is too much to consider equal. + // They should only be in multiples of M_PI/2 anyway. + const double kMaxAngleDiff = 0.01; // About 0.6 degrees. + PointerVector groups; + BLOCK_IT bk_it(blocks); + for (bk_it.mark_cycle_pt(); !bk_it.cycled_list(); bk_it.forward()) { + BLOCK* block = bk_it.data(); + if (block->poly_block() != NULL && !block->poly_block()->IsText()) { + continue; + } + // Linear search of the groups to find a matching rotation. + float block_angle = block->re_rotation().angle(); + int best_g = 0; + float best_angle_diff = MAX_FLOAT32; + for (int g = 0; g < groups.size(); ++g) { + double angle_diff = fabs(block_angle - groups[g]->angle); + if (angle_diff > M_PI) angle_diff = fabs(angle_diff - 2.0 * M_PI); + if (angle_diff < best_angle_diff) { + best_angle_diff = angle_diff; + best_g = g; + } + } + if (best_angle_diff > kMaxAngleDiff) { + groups.push_back(new BlockGroup(block)); + } else { + groups[best_g]->blocks.push_back(block); + groups[best_g]->bounding_box += block->bounding_box(); + float x_height = block->x_height(); + if (x_height < groups[best_g]->min_xheight) + groups[best_g]->min_xheight = x_height; + } + } + // Now process each group of blocks. 
+ PointerVector word_ptrs; + for (int g = 0; g < groups.size(); ++g) { + const BlockGroup* group = groups[g]; + tprintf("group %d, xh=%g, %d blocks\n", g, group->min_xheight, + group->blocks.size()); + WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), + group->bounding_box.topright()); + for (int b = 0; b < group->blocks.size(); ++b) { + tprintf("block %d, %d rows\n", b, group->blocks[b]->row_list()->length()); + ROW_IT row_it(group->blocks[b]->row_list()); + for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) { + ROW* row = row_it.data(); + tprintf("%d words in row\n", row->word_list()->length()); + // Put the words of the row into the grid. + WERD_IT w_it(row->word_list()); + for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) { + WERD* word = w_it.data(); + WordWithBox* box_word = new WordWithBox(word); + word_grid.InsertBBox(true, true, box_word); + // Save the pointer where it will be auto-deleted. + word_ptrs.push_back(box_word); + } + } + } + FCOORD rotation = group->rotation; + // Make it a forward rotation that will transform blob coords to block. + rotation.set_y(-rotation.y()); + TransferDiacriticsToWords(diacritic_blobs, rotation, &word_grid); + } +} + +// Places a copy of blobs that are near a word (after applying rotation to the +// blob) in the most appropriate word, unless there is doubt, in which case a +// blob can end up in two words. Source blobs are not touched. +void Textord::TransferDiacriticsToWords(BLOBNBOX_LIST* diacritic_blobs, + const FCOORD& rotation, + WordGrid* word_grid) { + WordSearch ws(word_grid); + BLOBNBOX_IT b_it(diacritic_blobs); + // Apply rotation to each blob before finding the nearest words. The rotation + // allows us to only consider above/below placement and not left/right on + // vertical text, because all text is horizontal here. 
+ for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOBNBOX* blobnbox = b_it.data(); + TBOX blob_box = blobnbox->bounding_box(); + blob_box.rotate(rotation); + ws.StartRectSearch(blob_box); + // Above/below refer to word position relative to diacritic. Since some + // scripts eg Kannada/Telugu habitually put diacritics below words, and + // others eg Thai/Vietnamese/Latin put most diacritics above words, try + // for both if there isn't much in it. + WordWithBox* best_above_word = NULL; + WordWithBox* best_below_word = NULL; + int best_above_distance = 0; + int best_below_distance = 0; + for (WordWithBox* word = ws.NextRectSearch(); word != NULL; + word = ws.NextRectSearch()) { + if (word->word()->flag(W_REP_CHAR)) continue; + TBOX word_box = word->true_bounding_box(); + int x_distance = blob_box.x_gap(word_box); + int y_distance = blob_box.y_gap(word_box); + if (x_distance > 0) { + // Arbitrarily divide x-distance by 2 if there is a major y overlap, + // and the word is to the left of the diacritic. If the + // diacritic is a dropped broken character between two words, this will + // help send all the pieces to a single word, instead of splitting them + // over the 2 words. 
+ if (word_box.major_y_overlap(blob_box) && + blob_box.left() > word_box.right()) { + x_distance /= 2; + } + y_distance += x_distance; + } + if (word_box.y_middle() > blob_box.y_middle() && + (best_above_word == NULL || y_distance < best_above_distance)) { + best_above_word = word; + best_above_distance = y_distance; + } + if (word_box.y_middle() <= blob_box.y_middle() && + (best_below_word == NULL || y_distance < best_below_distance)) { + best_below_word = word; + best_below_distance = y_distance; + } + } + bool above_good = + best_above_word != NULL && + (best_below_word == NULL || + best_above_distance < best_below_distance + blob_box.height()); + bool below_good = + best_below_word != NULL && best_below_word != best_above_word && + (best_above_word == NULL || + best_below_distance < best_above_distance + blob_box.height()); + if (below_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_below_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + if (above_good) { + C_BLOB* copied_blob = C_BLOB::deep_copy(blobnbox->cblob()); + copied_blob->rotate(rotation); + // Put the blob into the word's reject blobs list. + C_BLOB_IT blob_it(best_above_word->RejBlobs()); + blob_it.add_to_end(copied_blob); + } + } +} + } // tesseract /********************************************************************** @@ -820,33 +993,3 @@ void tweak_row_baseline(ROW *row, free_mem(xstarts); free_mem(coeffs); } - -/********************************************************************** - * blob_y_order - * - * Sort function to sort blobs in y from page top. 
- **********************************************************************/ - -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2) { - //converted ptr - BLOBNBOX *blob1 = *(BLOBNBOX **) item1; - //converted ptr - BLOBNBOX *blob2 = *(BLOBNBOX **) item2; - - if (blob1->bounding_box ().bottom () > blob2->bounding_box ().bottom ()) - return -1; - else if (blob1->bounding_box ().bottom () < - blob2->bounding_box ().bottom ()) - return 1; - else { - if (blob1->bounding_box ().left () < blob2->bounding_box ().left ()) - return -1; - else if (blob1->bounding_box ().left () > - blob2->bounding_box ().left ()) - return 1; - else - return 0; - } -} diff --git a/textord/tordmain.h b/textord/tordmain.h index 340ff1aabe..cb5a6a1ef2 100644 --- a/textord/tordmain.h +++ b/textord/tordmain.h @@ -29,29 +29,14 @@ struct Pix; namespace tesseract { class Tesseract; -} -void make_blocks_from_blobs( //convert & textord - TBLOB *tessblobs, //tess style input - const char *filename, //blob file - ICOORD page_tr, //top right - BOOL8 do_shift, //shift tess coords - BLOCK_LIST *blocks //block list - ); void SetBlobStrokeWidth(Pix* pix, BLOBNBOX* blob); void assign_blobs_to_blocks2(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *port_blocks); -void textord_page( //make rows & words - ICOORD page_tr, //top right - BLOCK_LIST *blocks, //block list - TO_BLOCK_LIST *land_blocks, //rotated for landscape - TO_BLOCK_LIST *port_blocks, //output list - tesseract::Tesseract* - ); +} // namespace tesseract + void tweak_row_baseline(ROW *row, double blshift_maxshift, double blshift_xfraction); -inT32 blob_y_order( //sort function - void *item1, //items to compare - void *item2); + #endif From 84920b92b393c6848575952f14269cd9457e3c4d Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 17:24:34 -0700 Subject: [PATCH 08/15] Font and classifier output structure cleanup. 
Font recognition was poor, due to forcing a 1st and 2nd choice at a character level, when the total score for the correct font is often correct at the word level, so allowed the propagation of a full set of fonts and scores to the word recognizer, which can now decide word level fonts using the scores instead of simple votes. Change precipitated a cleanup of output data structures for classifier results, eliminating ScoredClass and INT_RESULT_STRUCT, with a few extra elements going in UnicharRating, and using that wherever possible. That added the extra complexity of 1-rating due to a flip between 0 is good and 0 is bad for the internal classifier scores before they are converted to rating and certainty. --- ccmain/applybox.cpp | 6 +- ccmain/control.cpp | 92 ++++---- ccmain/cube_control.cpp | 2 +- ccstruct/fontinfo.cpp | 10 +- ccstruct/fontinfo.h | 22 +- ccstruct/pageres.cpp | 5 +- ccstruct/pageres.h | 2 - ccstruct/ratngs.cpp | 7 +- ccstruct/ratngs.h | 31 ++- ccutil/genericvector.h | 12 +- classify/adaptmatch.cpp | 500 +++++++++++++++++++--------------------- classify/classify.cpp | 2 +- classify/classify.h | 13 +- classify/intmatcher.cpp | 55 +++-- classify/intmatcher.h | 23 +- classify/shapetable.cpp | 6 +- classify/shapetable.h | 29 ++- dict/stopper.cpp | 5 +- wordrec/pieces.cpp | 41 +++- 19 files changed, 432 insertions(+), 431 deletions(-) diff --git a/ccmain/applybox.cpp b/ccmain/applybox.cpp index 6a94ab3796..af237a02c9 100644 --- a/ccmain/applybox.cpp +++ b/ccmain/applybox.cpp @@ -272,7 +272,7 @@ void Tesseract::MaximallyChopWord(const GenericVector& boxes, // limited by the ability of the chopper to find suitable chop points, // and not by the value of the certainties. 
BLOB_CHOICE* choice = - new BLOB_CHOICE(0, rating, -rating, -1, -1, 0, 0, 0, 0, BCC_FAKE); + new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE); blob_choices.push_back(choice); rating -= 0.125f; } @@ -291,8 +291,8 @@ void Tesseract::MaximallyChopWord(const GenericVector& boxes, left_choice->set_certainty(-rating); // combine confidence w/ serial # BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index, - rating - 0.125f, -rating, - -1, -1, 0, 0, 0, 0, BCC_FAKE); + rating - 0.125f, -rating, -1, + 0.0f, 0.0f, 0.0f, BCC_FAKE); blob_choices.insert(right_choice, blob_number + 1); } } diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 3abf216e34..ce15e4da6e 100644 --- a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1885,62 +1885,54 @@ void Tesseract::set_word_fonts(WERD_RES *word) { if (word->chopped_word == NULL) return; ASSERT_HOST(word->best_choice != NULL); - inT32 index; // char id index - // character iterator - BLOB_CHOICE_IT choice_it; // choice iterator int fontinfo_size = get_fontinfo_table().size(); - int fontset_size = get_fontset_table().size(); - if (fontinfo_size == 0 || fontset_size == 0) return; - STATS fonts(0, fontinfo_size); // font counters + if (fontinfo_size == 0) return; + GenericVector font_total_score; + font_total_score.init_to_size(fontinfo_size, 0); word->italic = 0; word->bold = 0; - if (!word->best_choice_fontinfo_ids.empty()) { - word->best_choice_fontinfo_ids.clear(); + // Compute the font scores for the word + if (tessedit_debug_fonts) { + tprintf("Examining fonts in %s\n", + word->best_choice->debug_string().string()); + } + for (int b = 0; b < word->best_choice->length(); ++b) { + BLOB_CHOICE* choice = word->GetBlobChoice(b); + if (choice == NULL) continue; + const GenericVector& fonts = choice->fonts(); + for (int f = 0; f < fonts.size(); ++f) { + int fontinfo_id = fonts[f].fontinfo_id; + if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { + font_total_score[fontinfo_id] += fonts[f].score; + } + } 
} - // Compute the modal font for the word - for (index = 0; index < word->best_choice->length(); ++index) { - UNICHAR_ID word_ch_id = word->best_choice->unichar_id(index); - choice_it.set_to_list(word->GetBlobChoices(index)); - if (tessedit_debug_fonts) { - tprintf("Examining fonts in %s\n", - word->best_choice->debug_string().string()); + // Find the top and 2nd choice for the word. + int score1 = 0, score2 = 0; + inT16 font_id1 = -1, font_id2 = -1; + for (int f = 0; f < fontinfo_size; ++f) { + if (tessedit_debug_fonts && font_total_score[f] > 0) { + tprintf("Font %s, total score = %d\n", + fontinfo_table_.get(f).name, font_total_score[f]); } - for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); - choice_it.forward()) { - UNICHAR_ID blob_ch_id = choice_it.data()->unichar_id(); - if (blob_ch_id == word_ch_id) { - if (tessedit_debug_fonts) { - tprintf("%s font %s (%d) font2 %s (%d)\n", - word->uch_set->id_to_unichar(blob_ch_id), - choice_it.data()->fontinfo_id() < 0 ? "unknown" : - fontinfo_table_.get(choice_it.data()->fontinfo_id()).name, - choice_it.data()->fontinfo_id(), - choice_it.data()->fontinfo_id2() < 0 ? "unknown" : - fontinfo_table_.get(choice_it.data()->fontinfo_id2()).name, - choice_it.data()->fontinfo_id2()); - } - // 1st choice font gets 2 pts, 2nd choice 1 pt. - if (choice_it.data()->fontinfo_id() >= 0) { - fonts.add(choice_it.data()->fontinfo_id(), 2); - } - if (choice_it.data()->fontinfo_id2() >= 0) { - fonts.add(choice_it.data()->fontinfo_id2(), 1); - } - break; - } + if (font_total_score[f] > score1) { + score2 = score1; + font_id2 = font_id1; + score1 = font_total_score[f]; + font_id1 = f; + } else if (font_total_score[f] > score2) { + score2 = font_total_score[f]; + font_id2 = f; } } - inT16 font_id1, font_id2; - find_modal_font(&fonts, &font_id1, &word->fontinfo_id_count); - find_modal_font(&fonts, &font_id2, &word->fontinfo_id2_count); word->fontinfo = font_id1 >= 0 ? 
&fontinfo_table_.get(font_id1) : NULL; word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL; - // All the blobs get the word's best choice font. - for (int i = 0; i < word->best_choice->length(); ++i) { - word->best_choice_fontinfo_ids.push_back(font_id1); - } - if (word->fontinfo_id_count > 0) { + // Each score has a limit of MAX_UINT16, so divide by that to get the number + // of "votes" for that font, ie number of perfect scores. + word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8); + word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8); + if (score1 > 0) { FontInfo fi = fontinfo_table_.get(font_id1); if (tessedit_debug_fonts) { if (word->fontinfo_id2_count > 0) { @@ -1953,9 +1945,8 @@ void Tesseract::set_word_fonts(WERD_RES *word) { fi.name, word->fontinfo_id_count); } } - // 1st choices got 2 pts, so we need to halve the score for the mode. - word->italic = (fi.is_italic() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; - word->bold = (fi.is_bold() ? 1 : -1) * (word->fontinfo_id_count + 1) / 2; + word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count; + word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count; } } @@ -2009,8 +2000,7 @@ void Tesseract::font_recognition_pass(PAGE_RES* page_res) { word = page_res_it.word(); int length = word->best_choice->length(); - // 1st choices got 2 pts, so we need to halve the score for the mode. - int count = (word->fontinfo_id_count + 1) / 2; + int count = word->fontinfo_id_count; if (!(count == length || (length > 3 && count >= length * 3 / 4))) { word->fontinfo = modal_font; // Counts only get 1 as it came from the doc. 
diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp index f7f5138857..4dcc3621fc 100644 --- a/ccmain/cube_control.cpp +++ b/ccmain/cube_control.cpp @@ -384,7 +384,7 @@ bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block, UNICHAR_ID uch_id = cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel()); choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty, - -1, -1, 0, 0, 0, 0, BCC_STATIC_CLASSIFIER); + -1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER); } word->FakeClassifyWord(num_chars, choices); // within a word, cube recognizes the word in reading order. diff --git a/ccstruct/fontinfo.cpp b/ccstruct/fontinfo.cpp index b5cb31085b..d3e6f3756e 100644 --- a/ccstruct/fontinfo.cpp +++ b/ccstruct/fontinfo.cpp @@ -59,10 +59,10 @@ bool FontInfoTable::DeSerialize(bool swap, FILE* fp) { // Returns true if the given set of fonts includes one with the same // properties as font_id. bool FontInfoTable::SetContainsFontProperties( - int font_id, const GenericVector& font_set) const { + int font_id, const GenericVector& font_set) const { uinT32 properties = get(font_id).properties; for (int f = 0; f < font_set.size(); ++f) { - if (get(font_set[f]).properties == properties) + if (get(font_set[f].fontinfo_id).properties == properties) return true; } return false; @@ -70,12 +70,12 @@ bool FontInfoTable::SetContainsFontProperties( // Returns true if the given set of fonts includes multiple properties. 
bool FontInfoTable::SetContainsMultipleFontProperties( - const GenericVector& font_set) const { + const GenericVector& font_set) const { if (font_set.empty()) return false; - int first_font = font_set[0]; + int first_font = font_set[0].fontinfo_id; uinT32 properties = get(first_font).properties; for (int f = 1; f < font_set.size(); ++f) { - if (get(font_set[f]).properties != properties) + if (get(font_set[f].fontinfo_id).properties != properties) return true; } return false; diff --git a/ccstruct/fontinfo.h b/ccstruct/fontinfo.h index 8b90381470..5f2d420852 100644 --- a/ccstruct/fontinfo.h +++ b/ccstruct/fontinfo.h @@ -31,6 +31,22 @@ namespace tesseract { class BitVector; +// Simple struct to hold a font and a score. The scores come from the low-level +// integer matcher, so they are in the uinT16 range. Fonts are an index to +// fontinfo_table. +// These get copied around a lot, so best to keep them small. +struct ScoredFont { + ScoredFont() : fontinfo_id(-1), score(0) {} + ScoredFont(int font_id, uinT16 classifier_score) + : fontinfo_id(font_id), score(classifier_score) {} + + // Index into fontinfo table, but inside the classifier, may be a shapetable + // index. + inT32 fontinfo_id; + // Raw score from the low-level classifier. + uinT16 score; +}; + // Struct for information about spacing between characters in a particular font. struct FontSpacingInfo { inT16 x_gap_before; @@ -140,11 +156,11 @@ class FontInfoTable : public GenericVector { // Returns true if the given set of fonts includes one with the same // properties as font_id. - bool SetContainsFontProperties(int font_id, - const GenericVector& font_set) const; + bool SetContainsFontProperties( + int font_id, const GenericVector& font_set) const; // Returns true if the given set of fonts includes multiple properties. bool SetContainsMultipleFontProperties( - const GenericVector& font_set) const; + const GenericVector& font_set) const; // Moves any non-empty FontSpacingInfo entries from other to this. 
void MoveSpacingInfoFrom(FontInfoTable* other); diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 9c1b13c5c3..00467225bc 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -148,6 +148,7 @@ ROW_RES::ROW_RES(bool merge_similar_words, ROW *the_row) { add_next_word = false; } } + next_word->set_flag(W_FUZZY_NON, add_next_word); } else { add_next_word = next_word->flag(W_FUZZY_NON); } @@ -206,12 +207,8 @@ WERD_RES& WERD_RES::operator=(const WERD_RES & source) { if (!wc_dest_it.empty()) { wc_dest_it.move_to_first(); best_choice = wc_dest_it.data(); - best_choice_fontinfo_ids = source.best_choice_fontinfo_ids; } else { best_choice = NULL; - if (!best_choice_fontinfo_ids.empty()) { - best_choice_fontinfo_ids.clear(); - } } if (source.raw_choice != NULL) { diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index a6a8404275..b8cd4dc31a 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -315,8 +315,6 @@ class WERD_RES : public ELIST_LINK { BOOL8 combination; //of two fuzzy gap wds BOOL8 part_of_combo; //part of a combo BOOL8 reject_spaces; //Reject spacing? - // FontInfo ids for each unichar in best_choice. 
- GenericVector best_choice_fontinfo_ids; WERD_RES() { InitNonPointers(); diff --git a/ccstruct/ratngs.cpp b/ccstruct/ratngs.cpp index 52023dcf4e..908c00bcec 100644 --- a/ccstruct/ratngs.cpp +++ b/ccstruct/ratngs.cpp @@ -90,8 +90,6 @@ static const char * const kPermuterTypeNames[] = { BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id float src_rating, // rating float src_cert, // certainty - inT16 src_fontinfo_id, // font - inT16 src_fontinfo_id2, // 2nd choice font int src_script_id, // script float min_xheight, // min xheight allowed float max_xheight, // max xheight by this char @@ -100,8 +98,8 @@ BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id unichar_id_ = src_unichar_id; rating_ = src_rating; certainty_ = src_cert; - fontinfo_id_ = src_fontinfo_id; - fontinfo_id2_ = src_fontinfo_id2; + fontinfo_id_ = -1; + fontinfo_id2_ = -1; script_id_ = src_script_id; min_xheight_ = min_xheight; max_xheight_ = max_xheight; @@ -126,6 +124,7 @@ BLOB_CHOICE::BLOB_CHOICE(const BLOB_CHOICE &other) { max_xheight_ = other.max_xheight_; yshift_ = other.yshift(); classifier_ = other.classifier_; + fonts_ = other.fonts_; } // Returns true if *this and other agree on the baseline and x-height diff --git a/ccstruct/ratngs.h b/ccstruct/ratngs.h index f777a87187..54e625a56c 100644 --- a/ccstruct/ratngs.h +++ b/ccstruct/ratngs.h @@ -24,6 +24,7 @@ #include "clst.h" #include "elst.h" +#include "fontinfo.h" #include "genericvector.h" #include "matrix.h" #include "unichar.h" @@ -64,8 +65,6 @@ class BLOB_CHOICE: public ELIST_LINK BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id float src_rating, // rating float src_cert, // certainty - inT16 src_fontinfo_id, // font - inT16 src_fontinfo_id2, // 2nd choice font int script_id, // script float min_xheight, // min xheight in image pixel units float max_xheight, // max xheight allowed by this char @@ -89,6 +88,26 @@ class BLOB_CHOICE: public ELIST_LINK inT16 fontinfo_id2() const { return fontinfo_id2_; } 
+ const GenericVector& fonts() const { + return fonts_; + } + void set_fonts(const GenericVector& fonts) { + fonts_ = fonts; + int score1 = 0, score2 = 0; + fontinfo_id_ = -1; + fontinfo_id2_ = -1; + for (int f = 0; f < fonts_.size(); ++f) { + if (fonts_[f].score > score1) { + score2 = score1; + fontinfo_id2_ = fontinfo_id_; + score1 = fonts_[f].score; + fontinfo_id_ = fonts_[f].fontinfo_id; + } else if (fonts_[f].score > score2) { + score2 = fonts_[f].score; + fontinfo_id2_ = fonts_[f].fontinfo_id; + } + } + } int script_id() const { return script_id_; } @@ -131,12 +150,6 @@ class BLOB_CHOICE: public ELIST_LINK void set_certainty(float newrat) { certainty_ = newrat; } - void set_fontinfo_id(inT16 newfont) { - fontinfo_id_ = newfont; - } - void set_fontinfo_id2(inT16 newfont) { - fontinfo_id2_ = newfont; - } void set_script(int newscript_id) { script_id_ = newscript_id; } @@ -186,6 +199,8 @@ class BLOB_CHOICE: public ELIST_LINK private: UNICHAR_ID unichar_id_; // unichar id + // Fonts and scores. Allowed to be empty. 
+ GenericVector fonts_; inT16 fontinfo_id_; // char font information inT16 fontinfo_id2_; // 2nd choice font information // Rating is the classifier distance weighted by the length of the outline diff --git a/ccutil/genericvector.h b/ccutil/genericvector.h index 0cc55109a0..8433966bf9 100644 --- a/ccutil/genericvector.h +++ b/ccutil/genericvector.h @@ -445,8 +445,10 @@ class PointerVector : public GenericVector { } PointerVector& operator=(const PointerVector& other) { - this->truncate(0); - this->operator+=(other); + if (&other != this) { + this->truncate(0); + this->operator+=(other); + } return *this; } @@ -777,8 +779,10 @@ GenericVector &GenericVector::operator+=(const GenericVector& other) { template GenericVector &GenericVector::operator=(const GenericVector& other) { - this->truncate(0); - this->operator+=(other); + if (&other != this) { + this->truncate(0); + this->operator+=(other); + } return *this; } diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index 86305404e3..e9ff913b77 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -73,37 +73,39 @@ #define Y_DIM_OFFSET (Y_SHIFT - BASELINE_Y_SHIFT) -#define WORST_POSSIBLE_RATING (1.0) - -struct ScoredClass { - CLASS_ID unichar_id; - int shape_id; - FLOAT32 rating; - bool adapted; - inT16 config; - inT16 fontinfo_id; - inT16 fontinfo_id2; -}; +#define WORST_POSSIBLE_RATING (0.0f) + +using tesseract::UnicharRating; +using tesseract::ScoredFont; struct ADAPT_RESULTS { inT32 BlobLength; bool HasNonfragment; - GenericVector match; - ScoredClass best_match; + UNICHAR_ID best_unichar_id; + int best_match_index; + FLOAT32 best_rating; + GenericVector match; GenericVector CPResults; /// Initializes data members to the default values. Sets the initial /// rating of each class to be the worst possible rating (1.0). 
inline void Initialize() { - BlobLength = MAX_INT32; - HasNonfragment = false; - best_match.unichar_id = NO_CLASS; - best_match.shape_id = -1; - best_match.rating = WORST_POSSIBLE_RATING; - best_match.adapted = false; - best_match.config = 0; - best_match.fontinfo_id = kBlankFontinfoId; - best_match.fontinfo_id2 = kBlankFontinfoId; + BlobLength = MAX_INT32; + HasNonfragment = false; + ComputeBest(); + } + // Computes best_unichar_id, best_match_index and best_rating. + void ComputeBest() { + best_unichar_id = INVALID_UNICHAR_ID; + best_match_index = -1; + best_rating = WORST_POSSIBLE_RATING; + for (int i = 0; i < match.size(); ++i) { + if (match[i].rating > best_rating) { + best_rating = match[i].rating; + best_unichar_id = match[i].unichar_id; + best_match_index = i; + } + } } }; @@ -116,17 +118,30 @@ struct PROTO_KEY { /*----------------------------------------------------------------------------- Private Macros -----------------------------------------------------------------------------*/ -#define MarginalMatch(Rating) \ -((Rating) > matcher_great_threshold) +inline bool MarginalMatch(float confidence, float matcher_great_threshold) { + return (1.0f - confidence) > matcher_great_threshold; +} /*----------------------------------------------------------------------------- Private Function Prototypes -----------------------------------------------------------------------------*/ -int CompareByRating(const void *arg1, const void *arg2); - -ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id); +// Returns the index of the given id in results, if present, or the size of the +// vector (index it will go at) if not present. 
+static int FindScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) { + for (int i = 0; i < results.match.size(); i++) { + if (results.match[i].unichar_id == id) + return i; + } + return results.match.size(); +} -ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id); +// Returns the current rating for a unichar id if we have rated it, defaulting +// to WORST_POSSIBLE_RATING. +static float ScoredUnichar(UNICHAR_ID id, const ADAPT_RESULTS& results) { + int index = FindScoredUnichar(id, results); + if (index >= results.match.size()) return WORST_POSSIBLE_RATING; + return results.match[index].rating; +} void InitMatcherRatings(register FLOAT32 *Rating); @@ -176,14 +191,19 @@ void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) { DoAdaptiveMatch(Blob, Results); RemoveBadMatches(Results); - Results->match.sort(CompareByRating); + Results->match.sort(&UnicharRating::SortDescendingRating); RemoveExtraPuncs(Results); + Results->ComputeBest(); ConvertMatchesToChoices(Blob->denorm(), Blob->bounding_box(), Results, Choices); + // TODO(rays) Move to before ConvertMatchesToChoices! 
+ if (LargeSpeckle(*Blob) || Choices->length() == 0) + AddLargeSpeckleTo(Results->BlobLength, Choices); + if (matcher_debug_level >= 1) { - cprintf ("AD Matches = "); - PrintAdaptiveMatchResults(stdout, Results); + tprintf("AD Matches = "); + PrintAdaptiveMatchResults(*Results); } if (LargeSpeckle(*Blob) || Choices->length() == 0) @@ -724,8 +744,8 @@ void Classify::InitAdaptedClass(TBLOB *Blob, ConvertConfig (AllProtosOn, 0, IClass); if (classify_learning_debug_level >= 1) { - cprintf ("Added new class '%s' with class id %d and %d protos.\n", - unicharset.id_to_unichar(ClassId), ClassId, NumFeatures); + tprintf("Added new class '%s' with class id %d and %d protos.\n", + unicharset.id_to_unichar(ClassId), ClassId, NumFeatures); if (classify_learning_debug_level > 1) DisplayAdaptedChar(Blob, IClass); } @@ -837,7 +857,7 @@ void Classify::AdaptToChar(TBLOB *Blob, FLOAT32 Threshold) { int NumFeatures; INT_FEATURE_ARRAY IntFeatures; - INT_RESULT_STRUCT IntResult; + UnicharRating int_result; INT_CLASS IClass; ADAPT_CLASS Class; TEMP_CONFIG TempConfig; @@ -847,13 +867,13 @@ void Classify::AdaptToChar(TBLOB *Blob, if (!LegalClassId (ClassId)) return; + int_result.unichar_id = ClassId; Class = AdaptedTemplates->Class[ClassId]; assert(Class != NULL); if (IsEmptyAdaptedClass(Class)) { InitAdaptedClass(Blob, ClassId, FontinfoId, Class, AdaptedTemplates); - } - else { - IClass = ClassForClassId (AdaptedTemplates->Templates, ClassId); + } else { + IClass = ClassForClassId(AdaptedTemplates->Templates, ClassId); NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures); if (NumFeatures <= 0) @@ -870,39 +890,38 @@ void Classify::AdaptToChar(TBLOB *Blob, } im_.Match(IClass, AllProtosOn, MatchingFontConfigs, NumFeatures, IntFeatures, - &IntResult, classify_adapt_feature_threshold, + &int_result, classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows); FreeBitVector(MatchingFontConfigs); SetAdaptiveThreshold(Threshold); - if (IntResult.Rating <= 
Threshold) { - if (ConfigIsPermanent (Class, IntResult.Config)) { + if (1.0f - int_result.rating <= Threshold) { + if (ConfigIsPermanent(Class, int_result.config)) { if (classify_learning_debug_level >= 1) - cprintf ("Found good match to perm config %d = %4.1f%%.\n", - IntResult.Config, (1.0 - IntResult.Rating) * 100.0); + tprintf("Found good match to perm config %d = %4.1f%%.\n", + int_result.config, int_result.rating * 100.0); FreeFeatureSet(FloatFeatures); return; } - TempConfig = TempConfigFor (Class, IntResult.Config); + TempConfig = TempConfigFor(Class, int_result.config); IncreaseConfidence(TempConfig); if (TempConfig->NumTimesSeen > Class->MaxNumTimesSeen) { Class->MaxNumTimesSeen = TempConfig->NumTimesSeen; } if (classify_learning_debug_level >= 1) - cprintf ("Increasing reliability of temp config %d to %d.\n", - IntResult.Config, TempConfig->NumTimesSeen); + tprintf("Increasing reliability of temp config %d to %d.\n", + int_result.config, TempConfig->NumTimesSeen); if (TempConfigReliable(ClassId, TempConfig)) { - MakePermanent(AdaptedTemplates, ClassId, IntResult.Config, Blob); + MakePermanent(AdaptedTemplates, ClassId, int_result.config, Blob); UpdateAmbigsGroup(ClassId, Blob); } - } - else { + } else { if (classify_learning_debug_level >= 1) { - cprintf ("Found poor match to temp config %d = %4.1f%%.\n", - IntResult.Config, (1.0 - IntResult.Rating) * 100.0); + tprintf("Found poor match to temp config %d = %4.1f%%.\n", + int_result.config, int_result.rating * 100.0); if (classify_learning_debug_level > 2) DisplayAdaptedChar(Blob, IClass); } @@ -937,20 +956,20 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) { &bl_features); if (sample == NULL) return; - INT_RESULT_STRUCT IntResult; + UnicharRating int_result; im_.Match(int_class, AllProtosOn, AllConfigsOn, bl_features.size(), &bl_features[0], - &IntResult, classify_adapt_feature_threshold, + &int_result, classify_adapt_feature_threshold, NO_DEBUG, 
matcher_debug_separate_windows); - cprintf ("Best match to temp config %d = %4.1f%%.\n", - IntResult.Config, (1.0 - IntResult.Rating) * 100.0); + tprintf("Best match to temp config %d = %4.1f%%.\n", + int_result.config, int_result.rating * 100.0); if (classify_learning_debug_level >= 2) { uinT32 ConfigMask; - ConfigMask = 1 << IntResult.Config; + ConfigMask = 1 << int_result.config; ShowMatchDisplay(); im_.Match(int_class, AllProtosOn, (BIT_VECTOR)&ConfigMask, bl_features.size(), &bl_features[0], - &IntResult, classify_adapt_feature_threshold, + &int_result, classify_adapt_feature_threshold, 6 | 0x19, matcher_debug_separate_windows); UpdateMatchDisplay(); } @@ -986,44 +1005,34 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) { * @note Exceptions: none * @note History: Tue Mar 12 18:19:29 1991, DSJ, Created. */ -void Classify::AddNewResult(ADAPT_RESULTS *results, - CLASS_ID class_id, - int shape_id, - FLOAT32 rating, - bool adapted, - int config, - int fontinfo_id, - int fontinfo_id2) { - ScoredClass *old_match = FindScoredUnichar(results, class_id); - ScoredClass match = - { class_id, - shape_id, - rating, - adapted, - static_cast(config), - static_cast(fontinfo_id), - static_cast(fontinfo_id2) }; - - if (rating > results->best_match.rating + matcher_bad_match_pad || - (old_match && rating >= old_match->rating)) - return; +void Classify::AddNewResult(const UnicharRating& new_result, + ADAPT_RESULTS *results) { + int old_match = FindScoredUnichar(new_result.unichar_id, *results); + + if (new_result.rating + matcher_bad_match_pad < results->best_rating || + (old_match < results->match.size() && + new_result.rating <= results->match[old_match].rating)) + return; // New one not good enough. 
- if (!unicharset.get_fragment(class_id)) + if (!unicharset.get_fragment(new_result.unichar_id)) results->HasNonfragment = true; - if (old_match) - old_match->rating = rating; - else - results->match.push_back(match); + if (old_match < results->match.size()) { + results->match[old_match].rating = new_result.rating; + } else { + results->match.push_back(new_result); + } - if (rating < results->best_match.rating && + if (new_result.rating > results->best_rating && // Ensure that fragments do not affect best rating, class and config. // This is needed so that at least one non-fragmented character is // always present in the results. // TODO(daria): verify that this helps accuracy and does not // hurt performance. - !unicharset.get_fragment(class_id)) { - results->best_match = match; + !unicharset.get_fragment(new_result.unichar_id)) { + results->best_match_index = old_match; + results->best_rating = new_result.rating; + results->best_unichar_id = new_result.unichar_id; } } /* AddNewResult */ @@ -1058,7 +1067,7 @@ void Classify::AmbigClassifier( ADAPT_RESULTS *results) { if (int_features.empty()) return; uinT8* CharNormArray = new uinT8[unicharset.size()]; - INT_RESULT_STRUCT IntResult; + UnicharRating int_result; results->BlobLength = GetCharNormFeature(fx_info, templates, NULL, CharNormArray); @@ -1071,17 +1080,18 @@ void Classify::AmbigClassifier( while (*ambiguities >= 0) { CLASS_ID class_id = *ambiguities; + int_result.unichar_id = class_id; im_.Match(ClassForClassId(templates, class_id), AllProtosOn, AllConfigsOn, int_features.size(), &int_features[0], - &IntResult, + &int_result, classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows); ExpandShapesAndApplyCorrections(NULL, debug, class_id, bottom, top, 0, results->BlobLength, classify_integer_matcher_multiplier, - CharNormArray, IntResult, results); + CharNormArray, &int_result, results); ambiguities++; } delete [] CharNormArray; @@ -1102,14 +1112,15 @@ void 
Classify::MasterMatcher(INT_TEMPLATES templates, ADAPT_RESULTS* final_results) { int top = blob_box.top(); int bottom = blob_box.bottom(); + UnicharRating int_result; for (int c = 0; c < results.size(); c++) { CLASS_ID class_id = results[c].Class; - INT_RESULT_STRUCT& int_result = results[c].IMResult; BIT_VECTOR protos = classes != NULL ? classes[class_id]->PermProtos : AllProtosOn; BIT_VECTOR configs = classes != NULL ? classes[class_id]->PermConfigs : AllConfigsOn; + int_result.unichar_id = class_id; im_.Match(ClassForClassId(templates, class_id), protos, configs, num_features, features, @@ -1120,7 +1131,7 @@ void Classify::MasterMatcher(INT_TEMPLATES templates, results[c].Rating, final_results->BlobLength, matcher_multiplier, norm_factors, - int_result, final_results); + &int_result, final_results); } } @@ -1133,65 +1144,76 @@ void Classify::ExpandShapesAndApplyCorrections( ADAPT_CLASS* classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8* cn_factors, - INT_RESULT_STRUCT& int_result, ADAPT_RESULTS* final_results) { - // Compute the fontinfo_ids. - int fontinfo_id = kBlankFontinfoId; - int fontinfo_id2 = kBlankFontinfoId; + UnicharRating* int_result, ADAPT_RESULTS* final_results) { if (classes != NULL) { - // Adapted result. - fontinfo_id = GetFontinfoId(classes[class_id], int_result.Config); - fontinfo_id2 = GetFontinfoId(classes[class_id], int_result.Config2); + // Adapted result. Convert configs to fontinfo_ids. + int_result->adapted = true; + for (int f = 0; f < int_result->fonts.size(); ++f) { + int_result->fonts[f].fontinfo_id = + GetFontinfoId(classes[class_id], int_result->fonts[f].fontinfo_id); + } } else { - // Pre-trained result. - fontinfo_id = ClassAndConfigIDToFontOrShapeID(class_id, int_result.Config); - fontinfo_id2 = ClassAndConfigIDToFontOrShapeID(class_id, - int_result.Config2); + // Pre-trained result. Map fonts using font_sets_. 
+ int_result->adapted = false; + for (int f = 0; f < int_result->fonts.size(); ++f) { + int_result->fonts[f].fontinfo_id = + ClassAndConfigIDToFontOrShapeID(class_id, + int_result->fonts[f].fontinfo_id); + } if (shape_table_ != NULL) { - // Actually fontinfo_id is an index into the shape_table_ and it - // contains a list of unchar_id/font_id pairs. - int shape_id = fontinfo_id; - const Shape& shape = shape_table_->GetShape(fontinfo_id); - double min_rating = 0.0; - for (int c = 0; c < shape.size(); ++c) { - int unichar_id = shape[c].unichar_id; - fontinfo_id = shape[c].font_ids[0]; - if (shape[c].font_ids.size() > 1) - fontinfo_id2 = shape[c].font_ids[1]; - else if (fontinfo_id2 != kBlankFontinfoId) - fontinfo_id2 = shape_table_->GetShape(fontinfo_id2)[0].font_ids[0]; - double rating = ComputeCorrectedRating(debug, unichar_id, cp_rating, - int_result.Rating, - int_result.FeatureMisses, - bottom, top, blob_length, - matcher_multiplier, cn_factors); - if (c == 0 || rating < min_rating) - min_rating = rating; - if (unicharset.get_enabled(unichar_id)) { - AddNewResult(final_results, unichar_id, shape_id, rating, - classes != NULL, int_result.Config, - fontinfo_id, fontinfo_id2); + // Two possible cases: + // 1. Flat shapetable. All unichar-ids of the shapes referenced by + // int_result->fonts are the same. In this case build a new vector of + // mapped fonts and replace the fonts in int_result. + // 2. Multi-unichar shapetable. Variable unichars in the shapes referenced + // by int_result. In this case, build a vector of UnicharRating to + // gather together different font-ids for each unichar. Also covers case1. 
+ GenericVector mapped_results; + for (int f = 0; f < int_result->fonts.size(); ++f) { + int shape_id = int_result->fonts[f].fontinfo_id; + const Shape& shape = shape_table_->GetShape(shape_id); + for (int c = 0; c < shape.size(); ++c) { + int unichar_id = shape[c].unichar_id; + if (!unicharset.get_enabled(unichar_id)) continue; + // Find the mapped_result for unichar_id. + int r = 0; + for (r = 0; r < mapped_results.size() && + mapped_results[r].unichar_id != unichar_id; ++r) {} + if (r == mapped_results.size()) { + mapped_results.push_back(*int_result); + mapped_results[r].unichar_id = unichar_id; + mapped_results[r].fonts.truncate(0); + } + for (int i = 0; i < shape[c].font_ids.size(); ++i) { + mapped_results[r].fonts.push_back( + ScoredFont(shape[c].font_ids[i], int_result->fonts[f].score)); + } } } - int_result.Rating = min_rating; + for (int m = 0; m < mapped_results.size(); ++m) { + mapped_results[m].rating = + ComputeCorrectedRating(debug, mapped_results[m].unichar_id, + cp_rating, int_result->rating, + int_result->feature_misses, bottom, top, + blob_length, matcher_multiplier, cn_factors); + AddNewResult(mapped_results[m], final_results); + } return; } } - double rating = ComputeCorrectedRating(debug, class_id, cp_rating, - int_result.Rating, - int_result.FeatureMisses, - bottom, top, blob_length, - matcher_multiplier, cn_factors); if (unicharset.get_enabled(class_id)) { - AddNewResult(final_results, class_id, -1, rating, - classes != NULL, int_result.Config, - fontinfo_id, fontinfo_id2); + int_result->rating = ComputeCorrectedRating(debug, class_id, cp_rating, + int_result->rating, + int_result->feature_misses, + bottom, top, blob_length, + matcher_multiplier, cn_factors); + AddNewResult(*int_result, final_results); } - int_result.Rating = rating; } -// Applies a set of corrections to the distance im_rating, +// Applies a set of corrections to the confidence im_rating, // including the cn_correction, miss penalty and additional penalty -// for non-alnums 
being vertical misfits. Returns the corrected distance. +// for non-alnums being vertical misfits. Returns the corrected confidence. double Classify::ComputeCorrectedRating(bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, @@ -1199,7 +1221,7 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id, int blob_length, int matcher_multiplier, const uinT8* cn_factors) { // Compute class feature corrections. - double cn_corrected = im_.ApplyCNCorrection(im_rating, blob_length, + double cn_corrected = im_.ApplyCNCorrection(1.0 - im_rating, blob_length, cn_factors[unichar_id], matcher_multiplier); double miss_penalty = tessedit_class_miss_scale * feature_misses; @@ -1220,16 +1242,16 @@ double Classify::ComputeCorrectedRating(bool debug, int unichar_id, vertical_penalty = classify_misfit_junk_penalty; } } - double result =cn_corrected + miss_penalty + vertical_penalty; - if (result > WORST_POSSIBLE_RATING) + double result = 1.0 - (cn_corrected + miss_penalty + vertical_penalty); + if (result < WORST_POSSIBLE_RATING) result = WORST_POSSIBLE_RATING; if (debug) { - tprintf("%s: %2.1f(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n", + tprintf("%s: %2.1f%%(CP%2.1f, IM%2.1f + CN%.2f(%d) + MP%2.1f + VP%2.1f)\n", unicharset.id_to_unichar(unichar_id), result * 100.0, cp_rating * 100.0, - im_rating * 100.0, - (cn_corrected - im_rating) * 100.0, + (1.0 - im_rating) * 100.0, + (cn_corrected - (1.0 - im_rating)) * 100.0, cn_factors[unichar_id], miss_penalty * 100.0, vertical_penalty * 100.0); @@ -1268,7 +1290,7 @@ UNICHAR_ID *Classify::BaselineClassifier( CharNormArray, BaselineCutoffs, &Results->CPResults); if (matcher_debug_level >= 2 || classify_debug_level > 1) - cprintf ("BL Matches = "); + tprintf("BL Matches = "); MasterMatcher(Templates->Templates, int_features.size(), &int_features[0], CharNormArray, @@ -1276,13 +1298,12 @@ UNICHAR_ID *Classify::BaselineClassifier( Blob->bounding_box(), Results->CPResults, Results); delete [] 
CharNormArray; - CLASS_ID ClassId = Results->best_match.unichar_id; - if (ClassId == NO_CLASS) - return (NULL); - /* this is a bug - maybe should return "" */ + CLASS_ID ClassId = Results->best_unichar_id; + if (ClassId == INVALID_UNICHAR_ID || Results->best_match_index < 0) + return NULL; return Templates->Class[ClassId]-> - Config[Results->best_match.config].Perm->Ambigs; + Config[Results->match[Results->best_match_index].config].Perm->Ambigs; } /* BaselineClassifier */ @@ -1316,14 +1337,7 @@ int Classify::CharNormClassifier(TBLOB *blob, -1, &unichar_results); // Convert results to the format used internally by AdaptiveClassifier. for (int r = 0; r < unichar_results.size(); ++r) { - int unichar_id = unichar_results[r].unichar_id; - // Fonts are listed in order of preference. - int font1 = unichar_results[r].fonts.size() >= 1 - ? unichar_results[r].fonts[0] : kBlankFontinfoId; - int font2 = unichar_results[r].fonts.size() >= 2 - ? unichar_results[r].fonts[1] : kBlankFontinfoId; - float rating = 1.0f - unichar_results[r].rating; - AddNewResult(adapt_results, unichar_id, -1, rating, false, 0, font1, font2); + AddNewResult(unichar_results[r], adapt_results); } return sample.num_features(); } /* CharNormClassifier */ @@ -1378,14 +1392,7 @@ int Classify::CharNormTrainingSample(bool pruner_only, blob_box, adapt_results->CPResults, adapt_results); // Convert master matcher results to output format. 
for (int i = 0; i < adapt_results->match.size(); i++) { - ScoredClass next = adapt_results->match[i]; - UnicharRating rating(next.unichar_id, 1.0f - next.rating); - if (next.fontinfo_id >= 0) { - rating.fonts.push_back(next.fontinfo_id); - if (next.fontinfo_id2 >= 0) - rating.fonts.push_back(next.fontinfo_id2); - } - results->push_back(rating); + results->push_back(adapt_results->match[i]); } results->sort(&UnicharRating::SortDescendingRating); } @@ -1410,60 +1417,14 @@ int Classify::CharNormTrainingSample(bool pruner_only, * @note Exceptions: none * @note History: Tue Mar 12 18:36:52 1991, DSJ, Created. */ -void Classify::ClassifyAsNoise(ADAPT_RESULTS *Results) { - register FLOAT32 Rating; +void Classify::ClassifyAsNoise(ADAPT_RESULTS *results) { + float rating = results->BlobLength / matcher_avg_noise_size; + rating *= rating; + rating /= 1.0 + rating; - Rating = Results->BlobLength / matcher_avg_noise_size; - Rating *= Rating; - Rating /= 1.0 + Rating; - - AddNewResult(Results, NO_CLASS, -1, Rating, false, -1, - kBlankFontinfoId, kBlankFontinfoId); + AddNewResult(UnicharRating(UNICHAR_SPACE, 1.0f - rating), results); } /* ClassifyAsNoise */ -} // namespace tesseract - - -/*---------------------------------------------------------------------------*/ -// Return a pointer to the scored unichar in results, or NULL if not present. -ScoredClass *FindScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) { - for (int i = 0; i < results->match.size(); i++) { - if (results->match[i].unichar_id == id) - return &results->match[i]; - } - return NULL; -} - -// Retrieve the current rating for a unichar id if we have rated it, defaulting -// to WORST_POSSIBLE_RATING. -ScoredClass ScoredUnichar(ADAPT_RESULTS *results, UNICHAR_ID id) { - ScoredClass poor_result = - {id, -1, WORST_POSSIBLE_RATING, false, -1, - kBlankFontinfoId, kBlankFontinfoId}; - ScoredClass *entry = FindScoredUnichar(results, id); - return (entry == NULL) ? 
poor_result : *entry; -} - -// Compare character classes by rating as for qsort(3). -// For repeatability, use character class id as a tie-breaker. -int CompareByRating(const void *arg1, // ScoredClass *class1 - const void *arg2) { // ScoredClass *class2 - const ScoredClass *class1 = (const ScoredClass *)arg1; - const ScoredClass *class2 = (const ScoredClass *)arg2; - - if (class1->rating < class2->rating) - return -1; - else if (class1->rating > class2->rating) - return 1; - - if (class1->unichar_id < class2->unichar_id) - return -1; - else if (class1->unichar_id > class2->unichar_id) - return 1; - return 0; -} -/*---------------------------------------------------------------------------*/ -namespace tesseract { /// The function converts the given match ratings to the list of blob /// choices with ratings and certainties (used by the context checkers). /// If character fragments are present in the results, this function also makes @@ -1494,11 +1455,9 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, float best_certainty = -MAX_FLOAT32; for (int i = 0; i < Results->match.size(); i++) { - ScoredClass next = Results->match[i]; - int fontinfo_id = next.fontinfo_id; - int fontinfo_id2 = next.fontinfo_id2; - bool adapted = next.adapted; - bool current_is_frag = (unicharset.get_fragment(next.unichar_id) != NULL); + const UnicharRating& result = Results->match[i]; + bool adapted = result.adapted; + bool current_is_frag = (unicharset.get_fragment(result.unichar_id) != NULL); if (temp_it.length()+1 == max_matches && !contains_nonfrag && current_is_frag) { continue; // look for a non-fragmented character to fill the @@ -1512,7 +1471,7 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, Certainty = -20; Rating = 100; // should be -certainty * real_blob_length } else { - Rating = Certainty = next.rating; + Rating = Certainty = (1.0f - result.rating); Rating *= rating_scale * Results->BlobLength; Certainty *= 
-(getDict().certainty_scale); } @@ -1529,14 +1488,16 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, } float min_xheight, max_xheight, yshift; - denorm.XHeightRange(next.unichar_id, unicharset, box, + denorm.XHeightRange(result.unichar_id, unicharset, box, &min_xheight, &max_xheight, &yshift); - temp_it.add_to_end(new BLOB_CHOICE(next.unichar_id, Rating, Certainty, - fontinfo_id, fontinfo_id2, - unicharset.get_script(next.unichar_id), - min_xheight, max_xheight, yshift, - adapted ? BCC_ADAPTED_CLASSIFIER - : BCC_STATIC_CLASSIFIER)); + BLOB_CHOICE* choice = + new BLOB_CHOICE(result.unichar_id, Rating, Certainty, + unicharset.get_script(result.unichar_id), + min_xheight, max_xheight, yshift, + adapted ? BCC_ADAPTED_CLASSIFIER + : BCC_STATIC_CLASSIFIER); + choice->set_fonts(result.fonts); + temp_it.add_to_end(choice); contains_nonfrag |= !current_is_frag; // update contains_nonfrag choices_length++; if (choices_length >= max_matches) break; @@ -1560,17 +1521,13 @@ void Classify::ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, void Classify::DebugAdaptiveClassifier(TBLOB *blob, ADAPT_RESULTS *Results) { if (static_classifier_ == NULL) return; - for (int i = 0; i < Results->match.size(); i++) { - if (i == 0 || Results->match[i].rating < Results->best_match.rating) - Results->best_match = Results->match[i]; - } INT_FX_RESULT_STRUCT fx_info; GenericVector bl_features; TrainingSample* sample = BlobToTrainingSample(*blob, false, &fx_info, &bl_features); if (sample == NULL) return; static_classifier_->DebugDisplay(*sample, blob->denorm().pix(), - Results->best_match.unichar_id); + Results->best_unichar_id); } /* DebugAdaptiveClassifier */ #endif @@ -1613,7 +1570,8 @@ void Classify::DoAdaptiveMatch(TBLOB *Blob, ADAPT_RESULTS *Results) { } else { Ambiguities = BaselineClassifier(Blob, bl_features, fx_info, AdaptedTemplates, Results); - if ((!Results->match.empty() && MarginalMatch(Results->best_match.rating) && + if 
((!Results->match.empty() && + MarginalMatch(Results->best_rating, matcher_great_threshold) && !tess_bn_matching) || Results->match.empty()) { CharNormClassifier(Blob, *sample, Results); @@ -1672,7 +1630,7 @@ UNICHAR_ID *Classify::GetAmbiguities(TBLOB *Blob, CharNormClassifier(Blob, *sample, Results); delete sample; RemoveBadMatches(Results); - Results->match.sort(CompareByRating); + Results->match.sort(&UnicharRating::SortDescendingRating); /* copy the class id's into an string of ambiguities - don't copy if the correct class is the only class id matched */ @@ -2092,14 +2050,11 @@ namespace tesseract { * @note Exceptions: none * @note History: Mon Mar 18 09:24:53 1991, DSJ, Created. */ -void Classify::PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results) { - for (int i = 0; i < Results->match.size(); ++i) { - tprintf("%s(%d), shape %d, %.2f ", - unicharset.debug_str(Results->match[i].unichar_id).string(), - Results->match[i].unichar_id, Results->match[i].shape_id, - Results->match[i].rating * 100.0); +void Classify::PrintAdaptiveMatchResults(const ADAPT_RESULTS& results) { + for (int i = 0; i < results.match.size(); ++i) { + tprintf("%s ", unicharset.debug_str(results.match[i].unichar_id).string()); + results.match[i].Print(); } - tprintf("\n"); } /* PrintAdaptiveMatchResults */ /*---------------------------------------------------------------------------*/ @@ -2122,40 +2077,49 @@ void Classify::RemoveBadMatches(ADAPT_RESULTS *Results) { int Next, NextGood; FLOAT32 BadMatchThreshold; static const char* romans = "i v x I V X"; - BadMatchThreshold = Results->best_match.rating + matcher_bad_match_pad; + BadMatchThreshold = Results->best_rating - matcher_bad_match_pad; if (classify_bln_numeric_mode) { UNICHAR_ID unichar_id_one = unicharset.contains_unichar("1") ? unicharset.unichar_to_id("1") : -1; UNICHAR_ID unichar_id_zero = unicharset.contains_unichar("0") ? 
unicharset.unichar_to_id("0") : -1; - ScoredClass scored_one = ScoredUnichar(Results, unichar_id_one); - ScoredClass scored_zero = ScoredUnichar(Results, unichar_id_zero); + float scored_one = ScoredUnichar(unichar_id_one, *Results); + float scored_zero = ScoredUnichar(unichar_id_zero, *Results); for (Next = NextGood = 0; Next < Results->match.size(); Next++) { - if (Results->match[Next].rating <= BadMatchThreshold) { - ScoredClass match = Results->match[Next]; + const UnicharRating& match = Results->match[Next]; + if (match.rating >= BadMatchThreshold) { if (!unicharset.get_isalpha(match.unichar_id) || strstr(romans, unicharset.id_to_unichar(match.unichar_id)) != NULL) { - Results->match[NextGood++] = Results->match[Next]; } else if (unicharset.eq(match.unichar_id, "l") && - scored_one.rating >= BadMatchThreshold) { - Results->match[NextGood] = scored_one; - Results->match[NextGood].rating = match.rating; - NextGood++; + scored_one < BadMatchThreshold) { + Results->match[Next].unichar_id = unichar_id_one; } else if (unicharset.eq(match.unichar_id, "O") && - scored_zero.rating >= BadMatchThreshold) { - Results->match[NextGood] = scored_zero; - Results->match[NextGood].rating = match.rating; - NextGood++; + scored_zero < BadMatchThreshold) { + Results->match[Next].unichar_id = unichar_id_zero; + } else { + Results->match[Next].unichar_id = INVALID_UNICHAR_ID; // Don't copy. 
+ } + if (Results->match[Next].unichar_id != INVALID_UNICHAR_ID) { + if (NextGood == Next) { + ++NextGood; + } else { + Results->match[NextGood++] = Results->match[Next]; + } } } } } else { for (Next = NextGood = 0; Next < Results->match.size(); Next++) { - if (Results->match[Next].rating <= BadMatchThreshold) - Results->match[NextGood++] = Results->match[Next]; + if (Results->match[Next].rating >= BadMatchThreshold) { + if (NextGood == Next) { + ++NextGood; + } else { + Results->match[NextGood++] = Results->match[Next]; + } + } } } Results->match.truncate(NextGood); @@ -2182,18 +2146,24 @@ void Classify::RemoveExtraPuncs(ADAPT_RESULTS *Results) { punc_count = 0; digit_count = 0; for (Next = NextGood = 0; Next < Results->match.size(); Next++) { - ScoredClass match = Results->match[Next]; + const UnicharRating& match = Results->match[Next]; + bool keep = true; if (strstr(punc_chars, unicharset.id_to_unichar(match.unichar_id)) != NULL) { - if (punc_count < 2) - Results->match[NextGood++] = match; + if (punc_count >= 2) + keep = false; punc_count++; } else { if (strstr(digit_chars, unicharset.id_to_unichar(match.unichar_id)) != NULL) { - if (digit_count < 1) - Results->match[NextGood++] = match; + if (digit_count >= 1) + keep = false; digit_count++; + } + } + if (keep) { + if (NextGood == Next) { + ++NextGood; } else { Results->match[NextGood++] = match; } @@ -2250,7 +2220,7 @@ void Classify::ShowBestMatchFor(int shape_id, tprintf("Illegal blob (char norm features)!\n"); return; } - INT_RESULT_STRUCT cn_result; + UnicharRating cn_result; classify_norm_method.set_value(character); im_.Match(ClassForClassId(PreTrainedTemplates, shape_id), AllProtosOn, AllConfigsOn, @@ -2258,7 +2228,7 @@ void Classify::ShowBestMatchFor(int shape_id, classify_adapt_feature_threshold, NO_DEBUG, matcher_debug_separate_windows); tprintf("\n"); - config_mask = 1 << cn_result.Config; + config_mask = 1 << cn_result.config; tprintf("Static Shape ID: %d\n", shape_id); ShowMatchDisplay(); diff 
--git a/classify/classify.cpp b/classify/classify.cpp index 53c1b589e1..22b59b4055 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -217,7 +217,7 @@ void Classify::AddLargeSpeckleTo(int blob_length, BLOB_CHOICE_LIST *choices) { (rating_scale * blob_length); } BLOB_CHOICE* blob_choice = new BLOB_CHOICE(UNICHAR_SPACE, rating, certainty, - -1, -1, 0, 0, MAX_FLOAT32, 0, + -1, 0.0f, MAX_FLOAT32, 0, BCC_SPECKLE_CLASSIFIER); bc_it.add_to_end(blob_choice); } diff --git a/classify/classify.h b/classify/classify.h index f105fc60d0..895642aa21 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -175,7 +175,7 @@ class Classify : public CCStruct { int blob_length, int matcher_multiplier, const uinT8* cn_factors, - INT_RESULT_STRUCT& int_result, + UnicharRating* int_result, ADAPT_RESULTS* final_results); // Applies a set of corrections to the distance im_rating, // including the cn_correction, miss penalty and additional penalty @@ -188,14 +188,7 @@ class Classify : public CCStruct { void ConvertMatchesToChoices(const DENORM& denorm, const TBOX& box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices); - void AddNewResult(ADAPT_RESULTS *results, - CLASS_ID class_id, - int shape_id, - FLOAT32 rating, - bool adapted, - int config, - int fontinfo_id, - int fontinfo_id2); + void AddNewResult(const UnicharRating& new_result, ADAPT_RESULTS *results); int GetAdaptiveFeatures(TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures); @@ -220,7 +213,7 @@ class Classify : public CCStruct { CLASS_ID ClassId, int ConfigId, TBLOB *Blob); - void PrintAdaptiveMatchResults(FILE *File, ADAPT_RESULTS *Results); + void PrintAdaptiveMatchResults(const ADAPT_RESULTS& results); void RemoveExtraPuncs(ADAPT_RESULTS *Results); void RemoveBadMatches(ADAPT_RESULTS *Results); void SetAdaptiveThreshold(FLOAT32 Threshold); diff --git a/classify/intmatcher.cpp b/classify/intmatcher.cpp index b9e65a8db2..cb4b2eb331 100644 --- a/classify/intmatcher.cpp +++ 
b/classify/intmatcher.cpp @@ -26,6 +26,8 @@ Include Files and Type Defines ----------------------------------------------------------------------------*/ #include "intmatcher.h" + +#include "fontinfo.h" #include "intproto.h" #include "callcpp.h" #include "scrollview.h" @@ -36,6 +38,9 @@ #include "shapetable.h" #include +using tesseract::ScoredFont; +using tesseract::UnicharRating; + /*---------------------------------------------------------------------------- Global Data Definitions and Declarations ----------------------------------------------------------------------------*/ @@ -464,7 +469,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate, BIT_VECTOR ConfigMask, inT16 NumFeatures, const INT_FEATURE_STRUCT* Features, - INT_RESULT Result, + UnicharRating* Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows) { @@ -477,7 +482,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate, ** NormalizationFactor Fudge factor from blob ** normalization process ** Result Class rating & configuration: - ** (0.0 -> 1.0), 0=good, 1=bad + ** (0.0 -> 1.0), 0=bad, 1=good ** Debug Debugger flag: 1=debugger on ** Globals: ** local_matcher_multiplier_ Normalization factor multiplier @@ -498,7 +503,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate, cprintf ("Integer Matcher -------------------------------------------\n"); tables->Clear(ClassTemplate); - Result->FeatureMisses = 0; + Result->feature_misses = 0; for (Feature = 0; Feature < NumFeatures; Feature++) { int csum = UpdateTablesForFeature(ClassTemplate, ProtoMask, ConfigMask, @@ -506,7 +511,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate, tables, Debug); // Count features that were missed over all configs. 
if (csum == 0) - Result->FeatureMisses++; + ++Result->feature_misses; } #ifndef GRAPHICS_DISABLED @@ -534,7 +539,7 @@ void IntegerMatcher::Match(INT_CLASS ClassTemplate, #ifndef GRAPHICS_DISABLED if (PrintMatchSummaryOn(Debug)) - DebugBestMatch(BestMatch, Result); + Result->Print(); if (MatchDebuggingOn(Debug)) cprintf("Match Complete --------------------------------------------\n"); @@ -1222,9 +1227,9 @@ void ScratchEvidence::NormalizeSums( /*---------------------------------------------------------------------------*/ int IntegerMatcher::FindBestMatch( - INT_CLASS ClassTemplate, + INT_CLASS class_template, const ScratchEvidence &tables, - INT_RESULT Result) { + UnicharRating* result) { /* ** Parameters: ** Globals: @@ -1236,35 +1241,27 @@ int IntegerMatcher::FindBestMatch( ** Exceptions: none ** History: Wed Feb 27 14:12:28 MST 1991, RWM, Created. */ - int BestMatch = 0; - int Best2Match = 0; - Result->Config = 0; - Result->Config2 = 0; + int best_match = 0; + result->config = 0; + result->fonts.truncate(0); + result->fonts.reserve(class_template->NumConfigs); /* Find best match */ - for (int ConfigNum = 0; ConfigNum < ClassTemplate->NumConfigs; ConfigNum++) { - int rating = tables.sum_feature_evidence_[ConfigNum]; + for (int c = 0; c < class_template->NumConfigs; ++c) { + int rating = tables.sum_feature_evidence_[c]; if (*classify_debug_level_ > 2) - cprintf("Config %d, rating=%d\n", ConfigNum, rating); - if (rating > BestMatch) { - if (BestMatch > 0) { - Result->Config2 = Result->Config; - Best2Match = BestMatch; - } else { - Result->Config2 = ConfigNum; - } - Result->Config = ConfigNum; - BestMatch = rating; - } else if (rating > Best2Match) { - Result->Config2 = ConfigNum; - Best2Match = rating; + tprintf("Config %d, rating=%d\n", c, rating); + if (rating > best_match) { + result->config = c; + best_match = rating; } + result->fonts.push_back(ScoredFont(c, rating)); } - /* Compute Certainty Rating */ - Result->Rating = (65536.0 - BestMatch) / 65536.0; + // 
Compute confidence on a Probability scale. + result->rating = best_match / 65536.0f; - return BestMatch; + return best_match; } // Applies the CN normalization factor to the given rating and returns diff --git a/classify/intmatcher.h b/classify/intmatcher.h index 8df6d6fdb6..c5bcb027a0 100644 --- a/classify/intmatcher.h +++ b/classify/intmatcher.h @@ -38,25 +38,14 @@ extern INT_VAR_H(classify_integer_matcher_multiplier, 10, #include "intproto.h" #include "cutoffs.h" -struct INT_RESULT_STRUCT { - INT_RESULT_STRUCT() : Rating(0.0f), Config(0), Config2(0), FeatureMisses(0) {} - - FLOAT32 Rating; - // TODO(rays) It might be desirable for these to be able to represent a - // null config. - uinT8 Config; - uinT8 Config2; - uinT16 FeatureMisses; -}; - -typedef INT_RESULT_STRUCT *INT_RESULT; - +namespace tesseract { +class UnicharRating; +} struct CP_RESULT_STRUCT { CP_RESULT_STRUCT() : Rating(0.0f), Class(0) {} FLOAT32 Rating; - INT_RESULT_STRUCT IMResult; CLASS_ID Class; }; @@ -113,7 +102,7 @@ class IntegerMatcher { BIT_VECTOR ConfigMask, inT16 NumFeatures, const INT_FEATURE_STRUCT* Features, - INT_RESULT Result, + tesseract::UnicharRating* Result, int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows); @@ -155,7 +144,7 @@ class IntegerMatcher { int FindBestMatch(INT_CLASS ClassTemplate, const ScratchEvidence &tables, - INT_RESULT Result); + tesseract::UnicharRating* Result); #ifndef GRAPHICS_DISABLED void DebugFeatureProtoError( @@ -182,8 +171,6 @@ class IntegerMatcher { int AdaptFeatureThreshold, int Debug, bool SeparateDebugWindows); - - void DebugBestMatch(int BestMatch, INT_RESULT Result); #endif diff --git a/classify/shapetable.cpp b/classify/shapetable.cpp index 325a0e2836..0800860b94 100644 --- a/classify/shapetable.cpp +++ b/classify/shapetable.cpp @@ -710,7 +710,11 @@ void ShapeTable::AddShapeToResults(const ShapeRating& shape_rating, int result_index = AddUnicharToResults(shape[u].unichar_id, shape_rating.rating, unichar_map, results); - 
(*results)[result_index].fonts += shape[u].font_ids; + for (int f = 0; f < shape[u].font_ids.size(); ++f) { + (*results)[result_index].fonts.push_back( + ScoredFont(shape[u].font_ids[f], + IntCastRounded(shape_rating.rating * MAX_INT16))); + } } } diff --git a/classify/shapetable.h b/classify/shapetable.h index 87f4245fdc..d8faae8817 100644 --- a/classify/shapetable.h +++ b/classify/shapetable.h @@ -24,6 +24,7 @@ #define TESSERACT_CLASSIFY_SHAPETABLE_H_ #include "bitvector.h" +#include "fontinfo.h" #include "genericheap.h" #include "genericvector.h" #include "intmatcher.h" @@ -33,16 +34,23 @@ class UNICHARSET; namespace tesseract { -struct FontInfo; -class FontInfoTable; class ShapeTable; // Simple struct to hold a single classifier unichar selection, a corresponding // rating, and a list of appropriate fonts. struct UnicharRating { - UnicharRating() : unichar_id(0), rating(0.0f) {} + UnicharRating() + : unichar_id(0), rating(0.0f), adapted(false), config(0), + feature_misses(0) {} UnicharRating(int u, float r) - : unichar_id(u), rating(r) {} + : unichar_id(u), rating(r), adapted(false), config(0), feature_misses(0) {} + + // Print debug info. + void Print() const { + tprintf("Unichar-id=%d, rating=%g, adapted=%d, config=%d, misses=%d," + " %d fonts\n", unichar_id, rating, adapted, config, feature_misses, + fonts.size()); + } // Sort function to sort ratings appropriately by descending rating. static int SortDescendingRating(const void* t1, const void* t2) { @@ -68,9 +76,16 @@ struct UnicharRating { // Rating from classifier with 1.0 perfect and 0.0 impossible. // Call it a probability if you must. float rating; - // Set of fonts for this shape in order of decreasing preference. - // (There is no mechanism for storing scores for fonts as yet.) - GenericVector fonts; + // True if this result is from the adaptive classifier. + bool adapted; + // Index of best matching font configuration of result. 
+ uinT8 config; + // Number of features that were total misses - were liked by no classes. + uinT16 feature_misses; + // Unsorted collection of fontinfo ids and scores. Note that a raw result + // from the IntegerMatch will contain config ids, that require transforming + // to fontinfo ids via fontsets and (possibly) shapetable. + GenericVector fonts; }; // Classifier result from a low-level classification is an index into some diff --git a/dict/stopper.cpp b/dict/stopper.cpp index 690f5fa387..e042500a6a 100644 --- a/dict/stopper.cpp +++ b/dict/stopper.cpp @@ -192,8 +192,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, BLOB_CHOICE_IT lst_it(lst); // TODO(rays/antonova) Put real xheights and y shifts here. lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i), - 0.0, 0.0, -1, -1, -1, 0, 1, 0, - BCC_AMBIG)); + 0.0, 0.0, -1, 0, 1, 0, BCC_AMBIG)); ambig_blob_choices.push_back(lst); } } @@ -278,7 +277,7 @@ bool Dict::NoDangerousAmbig(WERD_CHOICE *best_choice, BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]); bc_it.add_to_end(new BLOB_CHOICE( ambig_spec->correct_fragments[tmp_index], -1.0, 0.0, - -1, -1, -1, 0, 1, 0, BCC_AMBIG)); + -1, 0, 1, 0, BCC_AMBIG)); } } spec_it.forward(); diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index f920534051..1818478c66 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -40,6 +40,8 @@ #include "config_auto.h" #endif +using tesseract::ScoredFont; + /*---------------------------------------------------------------------- F u n c t i o n s ----------------------------------------------------------------------*/ @@ -194,8 +196,8 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, if (same_unichar) { // Add the merged character to the result UNICHAR_ID merged_unichar_id = first_unichar_id; - inT16 merged_fontinfo_id = choice_lists_it[0].data()->fontinfo_id(); - inT16 merged_fontinfo_id2 = choice_lists_it[0].data()->fontinfo_id2(); + GenericVector merged_fonts = + 
choice_lists_it[0].data()->fonts(); float merged_min_xheight = choice_lists_it[0].data()->min_xheight(); float merged_max_xheight = choice_lists_it[0].data()->max_xheight(); float positive_yshift = 0, negative_yshift = 0; @@ -220,21 +222,36 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, float yshift = choice_lists_it[i].data()->yshift(); if (yshift > positive_yshift) positive_yshift = yshift; if (yshift < negative_yshift) negative_yshift = yshift; + // Use the min font rating over the parts. + // TODO(rays) font lists are unsorted. Need to be faster? + const GenericVector& frag_fonts = + choice_lists_it[i].data()->fonts(); + for (int f = 0; f < frag_fonts.size(); ++f) { + int merged_f = 0; + for (merged_f = 0; merged_f < merged_fonts.size() && + merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id; + ++merged_f) {} + if (merged_f == merged_fonts.size()) { + merged_fonts.push_back(frag_fonts[f]); + } else if (merged_fonts[merged_f].score > frag_fonts[f].score) { + merged_fonts[merged_f].score = frag_fonts[f].score; + } + } } float merged_yshift = positive_yshift != 0 ? (negative_yshift != 0 ? 
0 : positive_yshift) : negative_yshift; - merged_choice_it.add_to_end(new BLOB_CHOICE(merged_unichar_id, - merged_rating, - merged_certainty, - merged_fontinfo_id, - merged_fontinfo_id2, - merged_script_id, - merged_min_xheight, - merged_max_xheight, - merged_yshift, - classifier)); + BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id, + merged_rating, + merged_certainty, + merged_script_id, + merged_min_xheight, + merged_max_xheight, + merged_yshift, + classifier); + choice->set_fonts(merged_fonts); + merged_choice_it.add_to_end(choice); } } From 2924d3ae152a5e39e6fa0275a208e93f20a8c53b Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 17:28:56 -0700 Subject: [PATCH 09/15] Changes missed from diacritic fix edit --- ccstruct/pageres.cpp | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index 00467225bc..ad835a2fdd 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -364,6 +364,7 @@ void WERD_RES::SetupFake(const UNICHARSET& unicharset_in) { LogNewCookedChoice(1, false, word); } tess_failed = true; + done = true; } void WERD_RES::SetupWordScript(const UNICHARSET& uch) { @@ -1314,6 +1315,10 @@ static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs, // replaced with real blobs from the current word as much as possible. void PAGE_RES_IT::ReplaceCurrentWord( tesseract::PointerVector* words) { + if (words->empty()) { + DeleteCurrentWord(); + return; + } WERD_RES* input_word = word(); // Set the BOL/EOL flags on the words from the input word. if (input_word->word->flag(W_BOL)) { @@ -1528,12 +1533,13 @@ void PAGE_RES_IT::ResetWordIterator() { // Reset the member iterator so it can move forward and detect the // cycled_list state correctly. 
word_res_it.move_to_first(); - word_res_it.mark_cycle_pt(); - while (!word_res_it.cycled_list() && word_res_it.data() != next_word_res) { - if (prev_row_res == row_res) - prev_word_res = word_res; - word_res = word_res_it.data(); - word_res_it.forward(); + for (word_res_it.mark_cycle_pt(); + !word_res_it.cycled_list() && word_res_it.data() != next_word_res; + word_res_it.forward()) { + if (!word_res_it.data()->part_of_combo) { + if (prev_row_res == row_res) prev_word_res = word_res; + word_res = word_res_it.data(); + } } ASSERT_HOST(!word_res_it.cycled_list()); word_res_it.forward(); @@ -1541,9 +1547,10 @@ void PAGE_RES_IT::ResetWordIterator() { // word_res_it is OK, but reset word_res and prev_word_res if needed. WERD_RES_IT wr_it(&row_res->word_res_list); for (wr_it.mark_cycle_pt(); !wr_it.cycled_list(); wr_it.forward()) { - if (prev_row_res == row_res) - prev_word_res = word_res; - word_res = wr_it.data(); + if (!wr_it.data()->part_of_combo) { + if (prev_row_res == row_res) prev_word_res = word_res; + word_res = wr_it.data(); + } } } } From 6b634170c157673015dcde33b00019a28417709a Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 17:33:18 -0700 Subject: [PATCH 10/15] Significant change to invisible font system to improve correctness and compatibility with external programs, particularly ghostscript. We will start mapping everything to a single glyph, rather than allowing characters to run off the end of the font. A more detailed design discussion is embedded into pdfrenderer.cpp comments. The font, source code that produces the font, and the design comments were contributed by Ken Sharp from Artifex Software. 
--- api/pdfrenderer.cpp | 198 ++++- tessdata/pdf.ttf | Bin 3628 -> 568 bytes tessdata/pdf.ttx | 1747 -------------------------------------- training/GlyphLessFont.c | 631 ++++++++++++++ training/GlyphLessFont.h | 228 +++++ 5 files changed, 1039 insertions(+), 1765 deletions(-) delete mode 100644 tessdata/pdf.ttx create mode 100644 training/GlyphLessFont.c create mode 100644 training/GlyphLessFont.h diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 55232515c5..6cdeae8682 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -14,6 +14,139 @@ #include "mathfix.h" #endif +/* + +Design notes from Ken Sharp, with light editing. + +We think one solution is a font with a single glyph (.notdef) and a +CIDToGIDMap which maps all the CIDs to 0. That map would then be +stored as a stream in the PDF file, and when flate compressed should +be pretty small. The font, of course, will be approximately the same +size as the one you currently use. + +I'm working on such a font now, the CIDToGIDMap is trivial, you just +create a stream object which contains 128k bytes (2 bytes per possible +CID and your CIDs range from 0 to 65535) and where you currently have +"/CIDToGIDMap /Identity" you would have "/CIDToGIDMap 0 R". + +Note that if, in future, you were to use a different (ie not 2 byte) +CMap for character codes you could trivially extend the CIDToGIDMap. + +The following is an explanation of how some of the font stuff works, +this may be too simple for you in which case please accept my +apologies, its hard to know how much knowledge someone has. You can +skip all this anyway, its just for information. + +The font embedded in a PDF file is usually intended just to be +rendered, but extensions allow for at least some ability to locate (or +copy) text from a document. This isn't something which was an original +goal of the PDF format, but its been retro-fitted, presumably due to +popular demand. 
+ +To do this reliably the PDF file must contain a ToUnicode CMap, a +device for mapping character codes to Unicode code points. If one of +these is present, then this will be used to convert the character +codes into Unicode values. If its not present then the reader will +fall back through a series of heuristics to try and guess the +result. This is, as you would expect, prone to failure. + +This doesn't concern you of course, since you always write a ToUnicode +CMap, so because you are writing the text in text rendering mode 3 it +would seem that you don't really need to worry about this, but in the +PDF spec you cannot have an isolated ToUnicode CMap, it has to be +attached to a font, so in order to get even copy/paste to work you +need to define a font. + +This is what leads to problems, tools like pdfwrite assume that they +are going to be able to (or even have to) modify the font entries, so +they require that the font being embedded be valid, and to be honest +the font Tesseract embeds isn't valid (for this purpose). + + +To see why lets look at how text is specified in a PDF file: + +(Test) Tj + +Now that looks like text but actually it isn't. Each of those bytes is +a 'character code'. When it comes to rendering the text a complex +sequence of events takes place, which converts the character code into +'something' which the font understands. Its entirely possible via +character mappings to have that text render as 'Sftu' + +For simple fonts (PostScript type 1), we use the character code as the +index into an Encoding array (256 elements), each element of which is +a glyph name, so this gives us a glyph name. We then consult the +CharStrings dictionary in the font, that's a complex object which +contains pairs of keys and values, you can use the key to retrieve a +given value. So we have a glyph name, we then use that as the key to +the dictionary and retrieve the associated value. 
For a type 1 font, +the value is a glyph program that describes how to draw the glyph. + +For CIDFonts, its a little more complicated. Because CIDFonts can be +large, using a glyph name as the key is unreasonable (it would also +lead to unfeasibly large Encoding arrays), so instead we use a 'CID' +as the key. CIDs are just numbers. + +But.... We don't use the character code as the CID. What we do is use +a CMap to convert the character code into a CID. We then use the CID +to key the CharStrings dictionary and proceed as before. So the 'CMap' +is the equivalent of the Encoding array, but its a more compact and +flexible representation. + +Note that you have to use the CMap just to find out how many bytes +constitute a character code, and it can be variable. For example you +can say if the first byte is 0x00->0x7f then its just one byte, if its +0x80->0xf0 then its 2 bytes and if its 0xf0->0xff then its 3 bytes. I +have seen CMaps defining character codes up to 5 bytes wide. + +Now that's fine for 'PostScript' CIDFonts, but its not sufficient for +TrueType CIDFonts. The thing is that TrueType fonts are accessed using +a Glyph ID (GID) (and the LOCA table) which may well not be anything +like the CID. So for this case PDF includes a CIDToGIDMap. That maps +the CIDs to GIDs, and we can then use the GID to get the glyph +description from the GLYF table of the font. + +So for a TrueType CIDFont, character-code->CID->GID->glyf-program. + +Looking at the PDF file I was supplied with we see that it contains +text like : + +<0x0075> Tj + +So we start by taking the character code (117) and look it up in the +CMap. Well you don't supply a CMap, you just use the Identity-H one +which is predefined. So character code 117 maps to CID 117. Then we +use the CIDToGIDMap, again you don't supply one, you just use the +predefined 'Identity' map. So CID 117 maps to GID 117. But the font we +were supplied with only contains 116 glyphs. 
+ +Now for Latin that's not a huge problem, you can just supply a bigger +font. But for more complex languages that *is* going to be more of a +problem. Either you need to supply a font which contains glyphs for +all the possible CID->GID mappings, or we need to think laterally. + +Our solution using a TrueType CIDFont is to intervene at the +CIDToGIDMap stage and convert all the CIDs to GID 0. Then we have a +font with just one glyph, the .notdef glyph at GID 0. This is what I'm +looking into now. + +It would also be possible to have a 'PostScript' (ie type 1 outlines) +CIDFont which contained 1 glyph, and a CMap which mapped all character +codes to CID 0. The effect would be the same. + +Its possible (I haven't checked) that the PostScript CIDFont and +associated CMap would be smaller than the TrueType font and associated +CIDToGIDMap. + +--- in a followup --- + +OK there is a small problem there, if I use GID 0 then Acrobat gets +upset about it and complains it cannot extract the font. If I set the +CIDToGIDMap so that all the entries are 1 instead, its happy. Totally +mad...... + +*/ + namespace tesseract { // Use for PDF object fragments. 
Must be large enough @@ -334,7 +467,8 @@ bool TessPDFRenderer::BeginDocumentHandler() { " /Type /Catalog\n" " /Pages %ld 0 R\n" ">>\n" - "endobj\n", 2L); + "endobj\n", + 2L); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); @@ -355,8 +489,8 @@ bool TessPDFRenderer::BeginDocumentHandler() { " /Type /Font\n" ">>\n" "endobj\n", - 4L, // CIDFontType2 font - 5L // ToUnicode + 4L, // CIDFontType2 font + 6L // ToUnicode ); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); @@ -366,7 +500,7 @@ bool TessPDFRenderer::BeginDocumentHandler() { "4 0 obj\n" "<<\n" " /BaseFont /GlyphLessFont\n" - " /CIDToGIDMap /Identity\n" + " /CIDToGIDMap %ld 0 R\n" " /CIDSystemInfo\n" " <<\n" " /Ordering (Identity)\n" @@ -379,11 +513,44 @@ bool TessPDFRenderer::BeginDocumentHandler() { " /DW %d\n" ">>\n" "endobj\n", - 6L, // Font descriptor + 5L, // CIDToGIDMap + 7L, // Font descriptor 1000 / kCharWidth); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); + // CIDTOGIDMAP + const int kCIDToGIDMapSize = 2 * (1 << 16); + unsigned char *cidtogidmap = new unsigned char[kCIDToGIDMapSize]; + for (int i = 0; i < kCIDToGIDMapSize; i++) { + cidtogidmap[i] = (i % 2) ? 
1 : 0; + } + size_t len; + unsigned char *comp = + zlibCompress(cidtogidmap, kCIDToGIDMapSize, &len); + delete[] cidtogidmap; + n = snprintf(buf, sizeof(buf), + "5 0 obj\n" + "<<\n" + " /Length %ld /Filter /FlateDecode\n" + ">>\n" + "stream\n", len); + if (n >= sizeof(buf)) { + lept_free(comp); + return false; + } + AppendString(buf); + long objsize = strlen(buf); + AppendData(reinterpret_cast(comp), len); + objsize += len; + lept_free(comp); + const char *endstream_endobj = + "endstream\n" + "endobj\n"; + AppendString(endstream_endobj); + objsize += strlen(endstream_endobj); + AppendPDFObjectDIY(objsize); + const char *stream = "/CIDInit /ProcSet findresource begin\n" "12 dict begin\n" @@ -409,7 +576,7 @@ bool TessPDFRenderer::BeginDocumentHandler() { // TOUNICODE n = snprintf(buf, sizeof(buf), - "5 0 obj\n" + "6 0 obj\n" "<< /Length %lu >>\n" "stream\n" "%s" @@ -421,7 +588,7 @@ bool TessPDFRenderer::BeginDocumentHandler() { // FONT DESCRIPTOR const int kCharHeight = 2; // Effect: highlights are half height n = snprintf(buf, sizeof(buf), - "6 0 obj\n" + "7 0 obj\n" "<<\n" " /Ascent %d\n" " /CapHeight %d\n" @@ -439,7 +606,7 @@ bool TessPDFRenderer::BeginDocumentHandler() { 1000 / kCharHeight, 1000 / kCharWidth, 1000 / kCharHeight, - 7L // Font data + 8L // Font data ); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); @@ -461,7 +628,7 @@ bool TessPDFRenderer::BeginDocumentHandler() { fclose(fp); // FONTFILE2 n = snprintf(buf, sizeof(buf), - "7 0 obj\n" + "8 0 obj\n" "<<\n" " /Length %ld\n" " /Length1 %ld\n" @@ -469,15 +636,12 @@ bool TessPDFRenderer::BeginDocumentHandler() { "stream\n", size, size); if (n >= sizeof(buf)) return false; AppendString(buf); - size_t objsize = strlen(buf); + objsize = strlen(buf); AppendData(buffer, size); delete[] buffer; objsize += size; - const char *b2 = - "endstream\n" - "endobj\n"; - AppendString(b2); - objsize += strlen(b2); + AppendString(endstream_endobj); + objsize += strlen(endstream_endobj); 
AppendPDFObjectDIY(objsize); return true; } @@ -679,9 +843,7 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { unsigned char *pdftext_casted = reinterpret_cast(pdftext); size_t len; unsigned char *comp_pdftext = - zlibCompress(pdftext_casted, - pdftext_len, - &len); + zlibCompress(pdftext_casted, pdftext_len, &len); long comp_pdftext_len = len; n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" diff --git a/tessdata/pdf.ttf b/tessdata/pdf.ttf index 08fd97ae89bd15b24d4ef6ab7268d0eabefbf44c..e7148c95c072e7627ac4f1f1d8b5886f64a2b4ba 100644 GIT binary patch literal 568 zcmZWlJxjw-6g@9(YW)C3L>x0Xh)8W6ba8Mif|!DkRYQ%Y(0tHzs9Q$I4o>dk;wCsb zI&||FxQK&)pqo(7eQDLy3-{e~?&sk_0hmLI0xbJvd&hYj+yH8W*|TQg##qKNd7pgi zvVC>INK-HJ_?~-iG@jmxALzS26Wn7Ef6&+cpxa}lscSyvcGPq=g?eTz1#T~9KNYAl zxfQyBclS9wBp;AB;wb5|%DS;8bwT3ydyhw_i~Gakj3j{9+pq99rUIp~q?JsV8jS>w zEgPIL9?%n_JhSOKs8fEyoHR^FlMDV2Nmt#r>p0NYGYxD^^W*u!Obf6gH)$41SkOrN z{$ic{c?#GhmKhVPm>7Vw<^aW@c{NFt-_Aw;5*Uew+Ku%x$mDyx0xH=DH+Fk|ZI7BuNM% zNlVg_W-Vz+NRqT9p{3>Z{N5M&7vy(-opYY^JU!1j&$sXMI|l`Xl7YsPU+gVYo~Ra} znx&_&Dyl53{&DaX*bHdd=v(P6@Mp%K6OV1uQ+%R`I1^eg@(t;UzT)zV&^nQzN&IzD zX}5`}~jcZ+xOLPWmd25EUK( zqRLqceg;bmLWCn>v?QEXw5APhi4a8F5k-5V=|Bt}iKP>r=|We!5l1`;bf*VBNu(FO z=|f-okwkxz8NfgW;bAaC7|Jk)lfnp68ObO{lg1d*87rI|$9N_%kx5Ku3R9WJbY_sr zOlC2gIm~4q^T}cX*(@Z7MdY%WB`jqb%gMvb3i2snC55cQ$7s{ zNUUA#=Ln~{D7A5q8tQq$I|=YzS=Cm>svas?rKk)wU1h0d%BR+=ZEBA?sE(_1>ax16 z?yJY@nR=~0s4wb=4%1P(tM08mdX%21XXzZhLKo>xx=QcUhxI9aL0{8%^+R2!pX;~! 
zqi!|T4wp0MZbReQ%iu(h_qzOjw=n?Mk#uRw*0*`U!^$S)u>{ diff --git a/tessdata/pdf.ttx b/tessdata/pdf.ttx deleted file mode 100644 index 66ac4dfbd2..0000000000 --- a/tessdata/pdf.ttx +++ /dev/null @@ -1,1747 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/training/GlyphLessFont.c b/training/GlyphLessFont.c new file mode 100644 index 0000000000..5321f7def9 --- /dev/null +++ b/training/GlyphLessFont.c @@ -0,0 +1,631 @@ +/* I don't expect anyone to run this program, ever again. 
It is + * included primarily as documentation for how the GlyphLessFont was + * created. + */ + +#include +#include +#include +#include "GlyphLessFont.h" + +#define LITTLE_ENDIAN + +Offset_Table Offsets = { +#ifdef LITTLE_ENDIAN + 0x00000100, /* sfnt_version */ + 0x0A00, /* numTables (10) */ + 0x8000, /* searchRange = Max power of 2 <= numTables*16 (128) */ + 0x0300, /* entrySelector Log2(searchRange) (3) */ + 0x2000, /* rangeShift = numTables*16 - searchRange (160 - 128 = 32) */ +#else + 0x00010000, /* sfnt_version */ + 0x000A, /* numTables (10) */ + 0x0080, /* searchRange = Max power of 2 <= numTables*16 (128) */ + 0x0003, /* entrySelector Log2(searchRange) (3) */ + 0x0020, /* rangeShift = numTables*16 - searchRange (160 - 128 = 32) */ +#endif +}; + +head_table head = { +#ifdef LITTLE_ENDIAN + 0x00000100, /* sfnt_version */ + 0x00000100, /* font_version */ + 0, /* checksum adjustment */ + 0xF53C0F5F, /* Magic number */ + 0x0704, /* flags: + * Bit 0 - 1 - baseline of font at y = 0 + * Bit 1 - 1 - Left sidebearing at x = 0 + * Bit 2 - 0 - instructions not dependent on font size + * Bit 3 - 1 - force integer ppem + * Bit 4 - 0 - instructions may not alter advance width + * Bit 5 - 0 - Not laid out vertically + * Bit 6 - 0 - required to be 0 + * Bit 7 - 0 - Does not require layout for rendering + * Bit 8 - 0 - Not an AAT font with metamorphosis + * Bit 9 - 0 - Not strongly right to left + * Bit 10 - 0 - Does not require indic-style rearrangements + * Bit 11 - 0 - Font data is not 'lossless' + * Bit 12 - 0 - Font not 'covnerted' + * Bit 13 - 0 - Not optimised for ClearType + * Bit 14 - 1 - This is a 'last resort' font + * Bit 15 - 0 - Reserved, must be 0 + */ + 0x0001, /* 16 units per em */ + 0x0,0x6EFC9ACF,/* Creation time */ + 0x0,0x6EFC9ACF,/* Modified time */ + 0, /* xMin */ + 0x0080, /* yMin */ + 0, /* xMax */ + 0x0100, /* yMax */ + 0x0000, /* macStyle (none) */ + 0x1000, /* Lowest readable size (16 pixels) */ + 0x0200, /* font direction (deprecated, should be 2) 
*/ + 0, /* index to LOCA format (shorts) */ + 0 /* glyph data format (must be 0) */ +#else + 0x00010000, /* afnt version */ + 0x00010000, /* font version */ + 0, /* checksum adjustment */ + 0x5F0F3CF5, /* Magic number */ + 0x0407, /* flags: + * Bit 0 - 1 - baseline of font at y = 0 + * Bit 1 - 1 - Left sidebearing at x = 0 + * Bit 2 - 0 - instructions not dependent on font size + * Bit 3 - 1 - force integer ppem + * Bit 4 - 0 - instructions may not alter advance width + * Bit 5 - 0 - Not laid out vertically + * Bit 6 - 0 - required to be 0 + * Bit 7 - 0 - Does not require layout for rendering + * Bit 8 - 0 - Not an AAT font with metamorphosis + * Bit 9 - 0 - Not strongly right to left + * Bit 10 - 0 - Does not require indic-style rearrangements + * Bit 11 - 0 - Font data is not 'lossless' + * Bit 12 - 0 - Font not 'covnerted' + * Bit 13 - 0 - Not optimised for ClearType + * Bit 14 - 1 - This is a 'last resort' font + * Bit 15 - 0 - Reserved, must be 0 + */ + 0x0100, /* 16 units per em */ + 0x0,0xCF9AFC6E,/* Creation time */ + 0x0,0xCF9AFC6E,/* Modified time */ + 0, /* xMin */ + 0xFFFF, /* yMin */ + 0, /* xMax */ + 0x001, /* yMax */ + 0, /* macStyle (none) */ + 0x0010, /* Lowest readable size (16 pixels) */ + 0x0002, /* font direction (deprecated, should be 2) */ + 0, /* index to LOCA format (shorts) */ + 0 /* glyph data format (must be 0) */ +#endif +}; + +hhea_table hhea = { +#ifdef LITTLE_ENDIAN + 0x00000100, /* table version */ + 0x0100, /* Ascender */ +#else + 0x00001000, /* table version */ + 0x0001, /* Ascender */ +#endif + 0xFFFF, /* Descender */ + 0x0000, /* LineGap */ + 0x0000, /* AdvanceWidthMax */ + 0x0000, /* MinLeftSideBearing */ + 0x0000, /* MinRightSideBearing */ + 0x0000, /* xMaxExtent */ +#ifdef LITTLE_ENDIAN + 0x0100, /* caretSlopeRise (1 = vertical) */ +#else + 0x0001, /* caretSlopeRise (1 = vertical) */ +#endif + 0x0000, /* caretslopeRun (0 = vertical) */ + 0x0000, /* caretOffset */ + 0x0000, /* Reserved1 */ + 0x0000, /* Reserved2 */ + 0x0000, 
/* Reserved3 */ + 0x0000, /* Reserved4 */ + 0x0000, /* merticDataFormat (must be 0) */ +#ifdef LITTLE_ENDIAN + 0x0200, /* number of hMetric entries in hmtx */ +#else + 0x0002, /* number of hMetric entries in hmtx */ +#endif +}; + +maxp_table maxp = { +#ifdef LITTLE_ENDIAN + 0x00000100, /* table version */ + 0x0200, /* numGlyphs */ + 0x00000000, /* maxPoints */ + 0x00000000, /* maxContours */ + 0x00000000, /* maxCompositePoints */ + 0x00000000, /* maxCompositeContours */ + 0x00000100, /* maxZones */ + 0x00000000, /* maxTwilightPoints */ + 0x00000000, /* maxStorage */ + 0x00000000, /* maxFunctionDefs */ + 0x00000000, /* maxInstructionDefs */ + 0x00000000, /* maxStackElements */ + 0x00000000, /* maxSizeOfInstructions */ + 0x00000000, /* maxComponentElements */ + 0x00000000, /* maxComponentDepth */ +#else + 0x00001000, /* table version */ + 0x0002, /* numGlyphs */ + 0x00000000, /* maxPoints */ + 0x00000000, /* maxContours */ + 0x00000000, /* maxCompositePoints */ + 0x00000000, /* maxCompositeContours */ + 0x00000001, /* maxZones */ + 0x00000000, /* maxTwilightPoints */ + 0x00000000, /* maxStorage */ + 0x00000000, /* maxFunctionDefs */ + 0x00000000, /* maxInstructionDefs */ + 0x00000000, /* maxStackElements */ + 0x00000000, /* maxSizeOfInstructions */ + 0x00000000, /* maxComponentElements */ + 0x00000000, /* maxComponentDepth */ +#endif +}; + +OS2_table OS2 = { +#ifdef LITTLE_ENDIAN + 0x0300, /* table version */ + 0x0000, /* xAvgCharWidth */ + 0x9001, /* usWeight Class (400 = FW_NORMAL) */ + 0x0500, /* usWidthClass (5 = FWIDTH_NORMAL) */ + 0x0000, /* fsType (0 = no embedding restrictions) */ + 0x0000, /* ySubscriptXSize */ + 0x0000, /* ySubscriptYSize */ + 0x0000, /* ySubscriptXOffset */ + 0x0000, /* ySubscriptYOffset */ + 0x0000, /* ySuperscriptXSize */ + 0x0000, /* ySuperscriptYSize */ + 0x0000, /* ySuperscriptXOffset */ + 0x0000, /* ySuperscriptYOffset */ + 0x0000, /* yStikeoutPosition */ + 0x0000, /* sFamilyClass (0 = no classification) */ + 0,5,0,1,0,1,0,0,0,0,0, 
/* PANOSE */ + 0x00000000, /* ulUnicodeRanges1 */ + 0x00000000, /* ulUnicodeRanges2 */ + 0x00000000, /* ulUnicodeRanges3 */ + 0x00000000, /* ulUnicodeRanges4 */ + 'G', 'O', 'O', 'G', /* achVendID (GOOG = Google) */ + 0x4000, /* fsSelection (bit 6 set = regular font) */ + 0xFFFF, /* fsFirstCharIndex */ + 0x0000, /* fsLastCharIndex */ + 0x0100, /* sTypoAscender */ + 0xFFFF, /* StypoDescender */ + 0x0000, /* STypoLineGap */ + 0x0100, /* usWinAscent */ + 0x0100, /* usWinDescent */ + 0x00000080,/* ulCodePageRange1 */ + 0x00000000,/* ulCodePageRange2 */ + 0x0000, /* sxHeight */ + 0x0000, /* sCapHeight */ + 0x0000, /* usDefaultChar */ + 0x0100, /* usBreakChar */ + 0x0000, /* usMaxContent */ +#else + 0x0003, /* table version */ + 0x0000, /* xAvgCharWidth */ + 0x0190, /* usWeight Class (400 = FW_NORMAL) */ + 0x0005, /* usWidthClass (5 = FWIDTH_NORMAL) */ + 0x0000, /* fsType (0 = no embedding restrictions) */ + 0x0000, /* ySubscriptXSize */ + 0x0000, /* ySubscriptYSize */ + 0x0000, /* ySubscriptXOffset */ + 0x0000, /* ySubscriptYOffset */ + 0x0000, /* ySuperscriptXSize */ + 0x0000, /* ySuperscriptYSize */ + 0x0000, /* ySuperscriptXOffset */ + 0x0000, /* ySuperscriptYOffset */ + 0x0000, /* yStikeoutPosition */ + 0x0000, /* sFamilyClass (0 = no classification) */ + 0,5,0,1,0,1,0,0,0,0,0, /* PANOSE */ + 0x00000000,/* ulUnicodeRanges1 */ + 0x00000000,/* ulUnicodeRanges2 */ + 0x00000000,/* ulUnicodeRanges3 */ + 0x00000000,/* ulUnicodeRanges4 */ + 'G', 'O', 'O', 'G', /* achVendID (GOOG = Google) */ + 0x0040, /* fsSelection (bit 6 set = regular font) */ + 0xFFFF, /* fsFirstCharIndex */ + 0x0000, /* fsLastCharIndex */ + 0x0001, /* sTypoAscender */ + 0xFFFF, /* StypoDescender */ + 0x0000, /* STypoLineGap */ + 0x0001, /* usWinAscent */ + 0x0001, /* usWinDescent */ + 0x80000000,/* ulCodePageRange1 */ + 0x00000000,/* ulCodePageRange2 */ + 0x0000, /* sxHeight */ + 0x0000, /* sCapHeight */ + 0x0000, /* usDefaultChar */ + 0x0001, /* usBreakChar */ + 0x0000, /* usMaxContent */ +#endif +}; + 
+hmtx_table hmtx = { +0x0000, 0x0000, +0x0000, 0x0000 +}; + +cmap_table cmap = { + 0x0000, /* Cmap version (0) */ +#ifdef LITTLE_ENDIAN + 0x0200, /* numTables (2) */ + 0x0100, /* Start of first subtable record, platformID = 1 */ + 0x0000, /* encodingID = 0 */ + 0x14000000, /* Offset of data */ + 0x0300, /* Start of second subtable record, platformID = 3 */ + 0x0000, /* encodingID = 0 */ + 0x20000000, /* Offset of data */ + 0x0600, /* STart of Apple table (format 6) */ + 0x0C00, /* lenght of table (12) */ + 0x0000, /* Language must be 0 for non-Apple or + non-specific language */ + 0x0000, /* firstCode = 0 */ + 0x0100, /* number of codes is 1 */ + 0x0000, /* GID is 0 */ + 0x0600, /* Start of MS Table (format 4) */ + 0x0C00, /* lenght of table (12) */ + 0x0000, /* Language must be 0 for non-Apple or + non-specific language */ + 0x0000, /* firstCode = 0 */ + 0x0100, /* number of codes is 1 */ + 0x0000, /* GID is 0 */ +#else + 0x0002, /* numTables (2) */ + 0x0001, + 0x0000, + 0x00000014, + 0x0003, + 0x0000, + 0x00000020, + 0x0006, + 0x000C, + 0x0000, + 0x0000, + 0x0001, + 0x0000, + 0x0006, + 0x000C, + 0x0000, + 0x0000, + 0x0001, + 0x0000, +#endif +}; + +/* Changing these strings requires you to change the offset and lengths + in the name table below */ +char Macnamestring[] = {'V', 'e', 'r', 's', 'i', 'o', 'n', ' ', '1', '.', '0'}; +char Unamestring[] = {0x00, 'V', 0x00, 'e', 0x00, 'r', 0x00, 's', 0x00, 'i', + 0x00, 'o', 0x00, 'n', 0x00, ' ', 0x00, '1', 0x00, '.', + 0x00, '0', 0x00, 0x00, 0x00}; +name_table name = { + 0x0000, /* format 0 */ +#ifdef LITTLE_ENDIAN + 0x0300, /* 3 records */ + 0x2A00, /* Offset of string storage */ + + 0x0000, /* Start of 1st name record, platform = 0 (Unicode) */ + 0x0300, /* Platform-specific ID = 0 */ + 0x0000, /* Language ID (0 = none) */ + 0x0500, /* name ID (5 = version string) */ + 0x1600, /* String length */ + 0x0B00, /* Offset from start of storage */ + + 0x0100, /* Start of 2nd name record, platform = 1 (Mac) */ + 0x0000, + 
0x0000, + 0x0500, /* name ID (5 = version string) */ + 0x0B00, /* String length */ + 0x0000, /* Offset from start of storage */ + + 0x0300, /* Start of 3rd name record, platform = 3 */ + 0x0100, /* Platform-specific ID = 1 */ + 0x0904, /* Language ID (0x409 = US English) */ + 0x0500, /* name ID (5 = version string) */ + 0x1600, /* String length */ + 0x0B00, /* Offset from start of storage */ +#else + 0x0003, /* 3 record2 */ + 0x002A, /* Offset of string storage */ + + 0x0000, /* Start of 1st name record, platform = 0 (Unicode) */ + 0x0003, /* Platform-specific ID = 0 */ + 0x0000, /* Language ID (0 = none) */ + 0x0005, /* name ID (5 = version string) */ + 0x0016, /* String length */ + 0x000B, /* Offset from start of storage */ + + 0x0001, /* Start of 2nd name record, platform = 1 (Mac) */ + 0x0000, + 0x0000, + 0x0500, /* name ID (5 = version string) */ + 0x000B, /* String length */ + 0x0000, /* Offset from start of storage */ + + 0x0003, /* Start of 3rd name record, platform = 3 */ + 0x0001, /* Platform-specific ID = 0 */ + 0x0409, /* Language ID (0 = none) */ + 0x0005, /* name ID (5 = version string) */ + 0x0016, /* String length */ + 0x000B, /* Offset from start of storage */ +#endif +}; + +post_table post = { +#ifdef LITTLE_ENDIAN + 0x0100, /* Version (2) */ +#else + 0x0001, /* Version (2) */ +#endif + 0x00000000, /* italicAngle */ + 0x0000, /* underlinePosition */ + 0x0000, /* underlineThickness */ +#ifdef LITTLE_ENDIAN + 0x01000000, /* isFixedPitch */ +#else + 0x00000001, /* isFixedPitch */ +#endif + 0x00000000, /* minMemType42 */ + 0x00000000, /* maxMemType42 */ + 0x00000000, /* minMemType1 */ + 0x00000000, /* maxMemType1 */ +}; + +int main (int argc, char **argv) +{ + FILE *OutputFile; + TableRecord Table[10]; + unsigned long offset = + sizeof(Offset_Table) + (sizeof(TableRecord) * 10), + length = 0, checksum = 0, HeadTableOffset, Working; + short fword = -1; + short loca = 0; + long glyf = 0; + unsigned int NameLength, i, FileLength; + + printf("Ken's 
Glyph-free font creator\n"); + if (argc != 2) { + fprintf (stderr, "Usage: GlyphLessFont \n"); + exit (1); + } + + OutputFile = fopen (argv[1], "wb+"); + if (OutputFile == 0) { + fprintf (stderr, "Couldn't open file %s for writing\n", argv[1]); + exit (1); + } + + fwrite (&Offsets, sizeof(Offset_Table), 1, OutputFile); + memset(&Table, 0x00, sizeof(TableRecord) + 10); + fwrite (&Table, sizeof (TableRecord), 10, OutputFile); + offset = ftell(OutputFile); + Table[3].offset = HeadTableOffset = offset; + + /* The whole business of writing a TrueType file is complicated by + * the way its laid out Firstly there is the fact that it wants + * the tables to be laid out in alphabetical order, but it wants + * the actual table data (which the table record points to) to be + * in quite a different order. Then there's the requirement to + * have all the table offsets be a multiple of 4 bytes. Finally + * we have to calculate a checksum for each table as well, which + * we cna't realistically do until we have written the table data, + * but which gets stored in the table record at the start of the + * file. + * + * So we start by writing a dumm set of table records, we'll fill + * in the array as we go and once we've written all the data and + * worked out the offsets and checksums of all the tables, we'll + * come back and write the table records into the area we left + * reserved. 
+ */ + fwrite (&head, sizeof(head_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[4].offset = offset; + + fwrite (&hhea, sizeof(hhea_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[7].offset = offset; + + fwrite (&maxp, sizeof(maxp_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[0].offset = offset; + + fwrite (&OS2, sizeof(OS2_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[5].offset = offset; + + fwrite (&hmtx, sizeof(hmtx_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[1].offset = offset; + + fwrite (&cmap, sizeof(cmap_table), 1, OutputFile); + offset = ftell(OutputFile); + Table[6].offset = offset; + + fwrite (&loca, sizeof(short), 1, OutputFile); + fwrite (&loca, sizeof(short), 1, OutputFile); + fwrite (&loca, sizeof(short), 1, OutputFile); + fwrite (&loca, sizeof(short), 1, OutputFile); + offset = ftell(OutputFile); + Table[2].offset = offset; + + fwrite (&glyf, sizeof(long), 1, OutputFile); + offset = ftell(OutputFile); + Table[8].offset = offset; + + length = (sizeof(name_table) + sizeof(Macnamestring) + + sizeof(Unamestring) + 3) / 4; + length *= 4; + NameLength = length; + fwrite (&name, sizeof(name_table), 1, OutputFile); + fwrite (&Macnamestring, sizeof(Macnamestring), 1, OutputFile); + fwrite (&Unamestring, NameLength - + (sizeof(name_table) + sizeof(Macnamestring)), 1, OutputFile); + offset = ftell(OutputFile); + Table[9].offset = offset; + + fwrite (&post, sizeof(post_table), 1, OutputFile); + FileLength = ftell(OutputFile); + + Table[3].tag[0] = 'h'; + Table[3].tag[1] = 'e'; + Table[3].tag[2] = 'a'; + Table[3].tag[3] = 'd'; + Table[3].checkSum = 0; + Table[3].length = sizeof(head_table) - 2; /* Don't count size + of padding bytes in table */ + + Table[4].tag[0] = 'h'; + Table[4].tag[1] = 'h'; + Table[4].tag[2] = 'e'; + Table[4].tag[3] = 'a'; + Table[4].checkSum = 0; + Table[4].length = sizeof(hhea_table); + + Table[7].tag[0] = 'm'; + Table[7].tag[1] = 'a'; + Table[7].tag[2] = 'x'; 
+ Table[7].tag[3] = 'p'; + Table[7].checkSum = 0; + Table[7].length = sizeof(maxp_table); + + Table[0].tag[0] = 'O'; + Table[0].tag[1] = 'S'; + Table[0].tag[2] = '/'; + Table[0].tag[3] = '2'; + Table[0].checkSum = 0; + Table[0].length = sizeof(OS2_table); + + Table[5].tag[0] = 'h'; + Table[5].tag[1] = 'm'; + Table[5].tag[2] = 't'; + Table[5].tag[3] = 'x'; + Table[5].checkSum = 0; + Table[5].length = sizeof(hmtx_table); + + Table[1].tag[0] = 'c'; + Table[1].tag[1] = 'm'; + Table[1].tag[2] = 'a'; + Table[1].tag[3] = 'p'; + Table[1].checkSum = 0; + Table[1].length = sizeof(cmap_table); + + Table[6].tag[0] = 'l'; + Table[6].tag[1] = 'o'; + Table[6].tag[2] = 'c'; + Table[6].tag[3] = 'a'; + Table[6].checkSum = 0; + Table[6].length = sizeof(USHORT) * 3; + + Table[2].tag[0] = 'g'; + Table[2].tag[1] = 'l'; + Table[2].tag[2] = 'y'; + Table[2].tag[3] = 'f'; + Table[2].checkSum = 0; + Table[2].length = 1; + + Table[8].tag[0] = 'n'; + Table[8].tag[1] = 'a'; + Table[8].tag[2] = 'm'; + Table[8].tag[3] = 'e'; + Table[8].checkSum = 0; + Table[8].length = (sizeof(name_table) + + sizeof(Macnamestring) + + sizeof(Unamestring) + 3) / 4; + Table[8].length *= 4; + NameLength = Table[8].length; + + Table[9].tag[0] = 'p'; + Table[9].tag[1] = 'o'; + Table[9].tag[2] = 's'; + Table[9].tag[3] = 't'; + Table[9].checkSum = 0; + Table[9].length = sizeof(post_table); + + for (i=0;i<10;i++) { + ULONG LENGTH, Sum = 0L; + ULONG *EndPtr, *Data, *Current; + + offset = Table[i].offset; + length = Table[i].length; + LENGTH = (length + 3 & ~3); + Data = (ULONG *)malloc(LENGTH); + memset(Data, 0x00, LENGTH); + fseek(OutputFile, offset, SEEK_SET); + fread(Data, length, 1, OutputFile); + + Current = Data; + EndPtr = Data + (LENGTH / sizeof(ULONG)); + while(Current < EndPtr){ +#ifdef LITTLE_ENDIAN + Working = *Current++; + Sum += ((Working & 0xff) << 24) + + ((Working & 0xff00) << 8) + + ((Working & 0xff0000) >> 8) + + (Working >> 24); +#else + Sum += *Current++; +#endif + } + free(Data); + +#ifdef 
LITTLE_ENDIAN + Table[i].offset = + ((offset & 0xff) << 24) + + ((offset & 0xff00) << 8) + + ((offset & 0xff0000) >> 8) + + (offset >> 24); + Table[i].length = + ((length & 0xff) << 24) + + ((length & 0xff00) << 8) + + ((length & 0xff0000) >> 8) + + (length >> 24); + Table[i].checkSum = + ((Sum & 0xff) << 24) + + ((Sum & 0xff00) << 8) + + ((Sum & 0xff0000) >> 8) + + (Sum >> 24); +#else + Table[i].checkSum = Sum; +#endif + } + + fseek(OutputFile, sizeof(Offset_Table), SEEK_SET); + fwrite (&Table, sizeof(TableRecord), 10, OutputFile); + + fseek(OutputFile, 0, SEEK_SET); + + for (i=0;i < FileLength / sizeof(long);i++) { + fread(&Working, sizeof(long), 1, OutputFile); +#ifdef LITTLE_ENDIAN + checksum += ((Working & 0xff) << 24) + + ((Working & 0xff00) << 8) + + ((Working & 0xff0000) >> 8) + + (Working >> 24); +#else + checksum += Working; +#endif + } + checksum = 0xB1B0AFBA - checksum; +#ifdef LITTLE_ENDIAN + head.checkSumAdjustment = + ((checksum & 0xff) << 24) + + ((checksum & 0xff00) << 8) + + ((checksum & 0xff0000) >> 8) + + (checksum >> 24); +#else + head.checkSumAdjustment = checksum; +#endif + fseek(OutputFile, HeadTableOffset, SEEK_SET); + fwrite (&head, sizeof(head_table), 1, OutputFile); + fclose(OutputFile); + + return 0; +} diff --git a/training/GlyphLessFont.h b/training/GlyphLessFont.h new file mode 100644 index 0000000000..97856a71c6 --- /dev/null +++ b/training/GlyphLessFont.h @@ -0,0 +1,228 @@ +/* I don't expect anyone to run this program, ever again. It is + * included primarily as documentation for how the GlyphLessFont was + * created. + */ + +/* The OpenType data types, we'll duplicate the definitions so that + * the code shall be (as far as possible) self-documenting simply by + * referencing the OpenType specification. Note that the specification + * is soemwhat inconsistent with regards to usage, naming and capitalisation + * of the names for these data types. 
+ */ +typedef char BYTE; +typedef char CHAR; +typedef unsigned short USHORT; +typedef short SHORT; +typedef struct _uint24 {char top8;unsigned short bottom16;} UINT24; +typedef unsigned long ULONG; +typedef long LONG; +typedef unsigned long Fixed; +typedef SHORT FWORD; +typedef USHORT UFWORD; +typedef unsigned short F2DOT14; +typedef struct _datetime {long upper;long lower;} LONGDATETIME; +typedef char Tag[4]; +typedef USHORT GlyphId; +typedef USHORT Offset; +typedef struct _longHorMetric {USHORT advanceWidth;SHORT lsb;} longHorMetric; + +/* And now definitions for each of the OpenType tables we will wish to use */ + +typedef struct { + Fixed sfnt_version; + USHORT numTables; + USHORT searchRange; + USHORT entrySelector; + USHORT rangeShift; +} Offset_Table; + +typedef struct { + Tag tag; /* The spec defines this as a ULONG, + but also as a 'Tag' in its own right */ + ULONG checkSum; + ULONG offset; + ULONG length; +} TableRecord; + +typedef struct { + USHORT version; + USHORT numTables; +} cmap_header; + +typedef struct { + USHORT platformID; + USHORT encodingID; + ULONG Offset; +} cmap_record; + +typedef struct { + USHORT format; + USHORT length; + USHORT language; + BYTE glyphIDArray[256]; +} format0_cmap_table; + +/* This structure only works for single segment format 4 tables, + for multiple segments it must be constructed */ +typedef struct { + USHORT format; + USHORT length; + USHORT language; + USHORT segCountx2; + USHORT searchRange; + USHORT entrySelector; + USHORT rangeShift; + USHORT endcount; + USHORT reservedPad; + USHORT startCount; + SHORT idDelta; + USHORT idRangeOffset; + USHORT glyphIdArray[2]; +} format4_cmap_table; + +typedef struct { + USHORT format; + USHORT length; + USHORT language; + USHORT firstCode; + USHORT entryCount; + USHORT glyphIDArray; +} format6_cmap_table; + +typedef struct { + cmap_header header; + cmap_record records[2]; + format6_cmap_table AppleTable; + format6_cmap_table MSTable; +} cmap_table; + +typedef struct { + Fixed 
version; + Fixed FontRevision; + ULONG checkSumAdjustment; + ULONG MagicNumber; + USHORT Flags; + USHORT unitsPerEm; + LONGDATETIME created; + LONGDATETIME modified; + SHORT xMin; + SHORT yMin; + SHORT xMax; + SHORT yMax; + USHORT macStyle; + USHORT lowestRecPPEM; + SHORT FontDirectionHint; + SHORT indexToLocFormat; + SHORT glyphDataFormat; + SHORT PAD; +} head_table; + +typedef struct { + Fixed version; + FWORD Ascender; + FWORD Descender; + FWORD LineGap; + UFWORD advanceWidthMax; + FWORD minLeftSideBearing; + FWORD minRightSideBearing; + FWORD xMaxExtent; + SHORT caretSlopeRise; + SHORT caretSlopeRun; + SHORT caretOffset; + SHORT reserved1; + SHORT reserved2; + SHORT reserved3; + SHORT reserved4; + SHORT metricDataFormat; + USHORT numberOfHMetrics; +} hhea_table; + +typedef struct { + longHorMetric hMetrics[2]; +} hmtx_table; + +typedef struct { + Fixed version; + USHORT numGlyphs; + USHORT maxPoints; + USHORT maxContours; + USHORT maxCompositePoints; + USHORT maxCompositeContours; + USHORT maxZones; + USHORT maxTwilightPoints; + USHORT maxStorage; + USHORT maxFunctionDefs; + USHORT maxInstructionDefs; + USHORT maxStackElements; + USHORT maxSizeOfInstructions; + USHORT maxComponentElements; + USHORT maxComponentDepth; +} maxp_table; + +typedef struct { + USHORT platformID; + USHORT encodingID; + USHORT languageID; + USHORT nameID; + USHORT length; + USHORT offset; +} NameRecord; + +typedef struct { + USHORT format; + USHORT count; + USHORT stringOffset; + NameRecord nameRecord[3]; +} name_table; + +typedef struct { + USHORT version; + SHORT xAvgCharWidth; + USHORT usWeightClass; + USHORT usWidthClass; + USHORT fsType; + SHORT ySubscriptXSize; + SHORT ySubscriptYSize; + SHORT ySubscriptXOffset; + SHORT ySubscriptYOffset; + SHORT ySuperscriptXSize; + SHORT ySuperscriptYSize; + SHORT ySuperscriptXOffset; + SHORT ySuperscriptYOffset; + SHORT yStrikeoutSize; + SHORT yStrikeoutPosition; + SHORT sFamilyClass; + BYTE panose[10]; + ULONG ulUnicodeRange1; + ULONG 
ulUnicodeRange2; + ULONG ulUnicodeRange3; + ULONG ulUnicodeRange4; + CHAR achVendID[4]; + USHORT fsSelection; + USHORT usFirstCharIndex; + USHORT usLastCharIndex; + SHORT sTypoAscender; + SHORT sTypoDescender; + SHORT sTypoLineGap; + USHORT usWinAscent; + USHORT usWinDescent; + ULONG ulCodePageRange1; + ULONG ulCodePageRange2; + SHORT sxHeight; + SHORT sCapHeight; + USHORT usDefaultChar; + USHORT usBreakChar; + USHORT usMaxContent; +} OS2_table; + +typedef struct { + Fixed version; + Fixed italicAngle; + FWORD underlinePosition; + FWORD underlineThickness; + ULONG isFixedPitch; + ULONG minMemType42; + ULONG maxMemType42; + ULONG minMemType1; + ULONG maxMemType1; +} post_table; From 164897210a71f048ed0ebd74fe1a3c67755fd961 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 17:51:03 -0700 Subject: [PATCH 11/15] Improved newlines and spaces in a box file so it works better with RTL languages. --- ccutil/unichar.cpp | 12 +- ccutil/unichar.h | 4 +- training/boxchar.cpp | 285 ++++++++++++++++++++++++++++-------- training/boxchar.h | 35 +++++ training/stringrenderer.cpp | 3 +- training/stringrenderer.h | 2 +- 6 files changed, 279 insertions(+), 62 deletions(-) diff --git a/ccutil/unichar.cpp b/ccutil/unichar.cpp index b75dff6d1c..0ceced13f0 100644 --- a/ccutil/unichar.cpp +++ b/ccutil/unichar.cpp @@ -206,12 +206,20 @@ UNICHAR::const_iterator UNICHAR::end(const char* utf8_str, const int len) { } // Converts a utf-8 string to a vector of unicodes. -void UNICHAR::UTF8ToUnicode(const char* utf8_str, +// Returns false if the input contains invalid UTF-8, and replaces +// the rest of the string with a single space. 
+bool UNICHAR::UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes) { const int utf8_length = strlen(utf8_str); const_iterator end_it(end(utf8_str, utf8_length)); for (const_iterator it(begin(utf8_str, utf8_length)); it != end_it; ++it) { - unicodes->push_back(*it); + if (it.is_legal()) { + unicodes->push_back(*it); + } else { + unicodes->push_back(' '); + return false; + } } + return true; } diff --git a/ccutil/unichar.h b/ccutil/unichar.h index 7e5cd9fb89..b2a1e013f9 100644 --- a/ccutil/unichar.h +++ b/ccutil/unichar.h @@ -151,7 +151,9 @@ class UNICHAR { static const_iterator end(const char* utf8_str, const int byte_length); // Converts a utf-8 string to a vector of unicodes. - static void UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes); + // Returns false if the input contains invalid UTF-8, and replaces + // the rest of the string with a single space. + static bool UTF8ToUnicode(const char* utf8_str, GenericVector* unicodes); private: // A UTF-8 representation of 1 or more Unicode characters. diff --git a/training/boxchar.cpp b/training/boxchar.cpp index b3b7173259..276d8af6d2 100644 --- a/training/boxchar.cpp +++ b/training/boxchar.cpp @@ -23,9 +23,18 @@ #include "boxchar.h" #include +#include #include "fileio.h" +#include "genericvector.h" #include "ndminx.h" +#include "normstrngs.h" +#include "tprintf.h" +#include "unicharset.h" +#include "unicode/uchar.h" // from libicu + +// Absolute Ratio of dx:dy or dy:dx to be a newline. 
+const int kMinNewlineRatio = 5; namespace tesseract { @@ -33,17 +42,14 @@ BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) { box_ = NULL; } -BoxChar::~BoxChar() { - boxDestroy(&box_); -} +BoxChar::~BoxChar() { boxDestroy(&box_); } void BoxChar::AddBox(int x, int y, int width, int height) { box_ = boxCreate(x, y, width, height); } /* static */ -void BoxChar::TranslateBoxes(int xshift, int yshift, - vector* boxes) { +void BoxChar::TranslateBoxes(int xshift, int yshift, vector* boxes) { for (int i = 0; i < boxes->size(); ++i) { BOX* box = (*boxes)[i]->box_; if (box != NULL) { @@ -53,15 +59,218 @@ void BoxChar::TranslateBoxes(int xshift, int yshift, } } +// Prepares for writing the boxes to a file by inserting newlines, spaces, +// and re-ordering so the boxes are strictly left-to-right. +/* static */ +void BoxChar::PrepareToWrite(vector* boxes) { + bool rtl_rules = ContainsMostlyRTL(*boxes); + bool vertical_rules = MostlyVertical(*boxes); + InsertNewlines(rtl_rules, vertical_rules, boxes); + InsertSpaces(rtl_rules, vertical_rules, boxes); + for (int i = 0; i < boxes->size(); ++i) { + if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i); + } + if (rtl_rules) { + ReorderRTLText(boxes); + } + tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules); +} + +// Inserts newline (tab) characters into the vector at newline positions. +/* static */ +void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, + vector* boxes) { + int prev_i = -1; + int max_shift = 0; + for (int i = 0; i < boxes->size(); ++i) { + Box* box = (*boxes)[i]->box_; + if (box == NULL) { + if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) { + // Erase null boxes at the start of a line and after another null box. 
+ do { + delete (*boxes)[i]; + boxes->erase(boxes->begin() + i); + --i; + } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL); + } + continue; + } + if (prev_i >= 0) { + Box* prev_box = (*boxes)[prev_i]->box_; + int shift = box->x - prev_box->x; + if (vertical_rules) { + shift = box->y - prev_box->y; + } else if (rtl_rules) { + shift = -shift; + } + if (-shift > max_shift) { + // This is a newline. + int width = prev_box->w; + int height = prev_box->h; + int x = prev_box->x + width; + int y = prev_box->y; + if (vertical_rules) { + x = prev_box->x; + y = prev_box->y + height; + } else if (rtl_rules) { + x = prev_box->x - width; + if (x < 0) { + tprintf("prev x = %d, width=%d\n", prev_box->x, width); + x = 0; + } + } + if (prev_i == i - 1) { + // New character needed. + BoxChar* new_box = new BoxChar("\t", 1); + new_box->AddBox(x, y, width, height); + new_box->page_ = (*boxes)[i]->page_; + boxes->insert(boxes->begin() + i, new_box); + ++i; + } else { + (*boxes)[i - 1]->AddBox(x, y, width, height); + (*boxes)[i - 1]->ch_ = "\t"; + } + max_shift = 0; + } else if (shift > max_shift) { + max_shift = shift; + } + } + prev_i = i; + } +} + +// Converts NULL boxes to space characters, with appropriate bounding boxes. +/* static */ +void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, + vector* boxes) { + // After InsertNewlines, any remaining null boxes are not newlines, and are + // singletons, so add a box to each remaining null box. 
+ for (int i = 1; i + 1 < boxes->size(); ++i) { + Box* box = (*boxes)[i]->box_; + if (box == NULL) { + Box* prev = (*boxes)[i - 1]->box_; + Box* next = (*boxes)[i + 1]->box_; + ASSERT_HOST(prev != NULL && next != NULL); + int top = MIN(prev->y, next->y); + int bottom = MAX(prev->y + prev->h, next->y + next->h); + int left = prev->x + prev->w; + int right = next->x; + if (vertical_rules) { + top = prev->y + prev->h; + bottom = next->y; + left = MIN(prev->x, next->x); + right = MAX(prev->x + prev->w, next->x + next->w); + } else if (rtl_rules) { + // With RTL we have to account for BiDi. + // Right becomes the min left of all prior boxes back to the first + // space or newline. + right = prev->x; + left = next->x + next->w; + for (int j = i - 2; + j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; + --j) { + prev = (*boxes)[j]->box_; + ASSERT_HOST(prev != NULL); + if (prev->x < right) { + right = prev->x; + } + } + // Left becomes the max right of all next boxes foward to the first + // space or newline. + for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL && + (*boxes)[j]->ch_ != "\t"; + ++j) { + next = (*boxes)[j]->box_; + if (next->x + next->w > left) { + left = next->x + next->w; + } + } + } + // Italic and stylized characters can produce negative spaces, which + // Leptonica doesn't like, so clip to a positive size. + if (right <= left) right = left + 1; + if (bottom <= top) bottom = top + 1; + (*boxes)[i]->AddBox(left, top, right - left, bottom - top); + (*boxes)[i]->ch_ = " "; + } + } +} + +// Reorders text in a right-to-left script in left-to-right order. +/* static */ +void BoxChar::ReorderRTLText(vector* boxes) { + // After adding newlines and spaces, this task is simply a matter of sorting + // by left each group of boxes between newlines. 
+ BoxCharPtrSort sorter; + int end = 0; + for (int start = 0; start < boxes->size(); start = end + 1) { + end = start + 1; + while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end; + sort(boxes->begin() + start, boxes->begin() + end, sorter); + } +} + +// Returns true if the vector contains mostly RTL characters. +/* static */ +bool BoxChar::ContainsMostlyRTL(const vector& boxes) { + int num_rtl = 0, num_ltr = 0; + for (int i = 0; i < boxes.size(); ++i) { + // Convert the unichar to UTF32 representation + GenericVector uni_vector; + if (!UNICHAR::UTF8ToUnicode(boxes[i]->ch_.c_str(), &uni_vector)) { + tprintf("Illegal utf8 in boxchar %d string:%s = ", i, + boxes[i]->ch_.c_str()); + for (int c = 0; c < boxes[i]->ch_.size(); ++c) { + tprintf(" 0x%x", boxes[i]->ch_[c]); + } + tprintf("\n"); + continue; + } + for (int j = 0; j < uni_vector.size(); ++j) { + UCharDirection dir = u_charDirection(uni_vector[j]); + if (dir == U_RIGHT_TO_LEFT || dir == U_RIGHT_TO_LEFT_ARABIC || + dir == U_ARABIC_NUMBER) { + ++num_rtl; + } else { + ++num_ltr; + } + } + } + return num_rtl > num_ltr; +} + +// Returns true if the text is mostly laid out vertically. +/* static */ +bool BoxChar::MostlyVertical(const vector& boxes) { + inT64 total_dx = 0, total_dy = 0; + for (int i = 1; i < boxes.size(); ++i) { + if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL && + boxes[i - 1]->page_ == boxes[i]->page_) { + int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; + int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; + if (abs(dx) > abs(dy) * kMinNewlineRatio || + abs(dy) > abs(dx) * kMinNewlineRatio) { + total_dx += dx * dx; + total_dy += dy * dy; + } + } + } + return total_dy > total_dx; +} + +// Returns the total length of all the strings in the boxes. 
+/* static */ +int BoxChar::TotalByteLength(const vector& boxes) { + int total_length = 0; + for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size(); + return total_length; +} + // Rotate the boxes in [start_box, end_box) by the given rotation. // The rotation is in radians clockwise about the given center. /* static */ -void BoxChar::RotateBoxes(float rotation, - int xcenter, - int ycenter, - int start_box, - int end_box, - vector* boxes) { +void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, + int start_box, int end_box, vector* boxes) { Boxa* orig = boxaCreate(0); for (int i = start_box; i < end_box; ++i) { BOX* box = (*boxes)[i]->box_; @@ -79,16 +288,6 @@ void BoxChar::RotateBoxes(float rotation, } const int kMaxLineLength = 1024; -// Helper appends a tab box to the string to indicate a newline. We can't use -// an actual newline as the file format is line-based text. -static void AppendTabBox(const Box* box, int height, int page, string* output) { - char buffer[kMaxLineLength]; - int nbytes = snprintf(buffer, kMaxLineLength, "\t %d %d %d %d %d\n", - box->x + box->w, height - box->y - box->h, - box->x + box->w + 10, height - box->y, page); - output->append(buffer, nbytes); -} - /* static */ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, const vector& boxes) { @@ -96,43 +295,15 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, char buffer[kMaxLineLength]; for (int i = 0; i < boxes.size(); ++i) { const Box* box = boxes[i]->box_; - if (box != NULL) { - if (i > 0 && boxes[i - 1]->box_ != NULL && - boxes[i - 1]->page_ == boxes[i]->page_ && - box->x + box->w < boxes[i - 1]->box_->x) { - // We are on a newline. Output a tab character to indicate the newline. 
- AppendTabBox(boxes[i - 1]->box_, height, boxes[i]->page_, &output); - } - int nbytes = snprintf(buffer, kMaxLineLength, - "%s %d %d %d %d %d\n", - boxes[i]->ch_.c_str(), - box->x, height - box->y - box->h, - box->x + box->w, height - box->y, - boxes[i]->page_); - output.append(buffer, nbytes); - } else if (i > 0 && boxes[i - 1]->box_ != NULL) { - int j = i + 1; - // Find the next non-null box, as there may be multiple spaces. - while (j < boxes.size() && boxes[j]->box_ == NULL) ++j; - if (j < boxes.size() && boxes[i - 1]->page_ == boxes[j]->page_) { - const Box* prev = boxes[i - 1]->box_; - const Box* next = boxes[j]->box_; - if (next->x + next->w < prev->x) { - // We are on a newline. Output a tab character to indicate it. - AppendTabBox(prev, height, boxes[j]->page_, &output); - } else { - // Space between words. - int nbytes = snprintf(buffer, kMaxLineLength, - " %d %d %d %d %d\n", - prev->x + prev->w, - height - MAX(prev->y + prev->h, - next->y + next->h), - next->x, height - MIN(prev->y, next->y), - boxes[i - 1]->page_); - output.append(buffer, nbytes); - } - } + if (box == NULL) { + tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); + return; } + int nbytes = + snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", + boxes[i]->ch_.c_str(), box->x, height - box->y - box->h, + box->x + box->w, height - box->y, boxes[i]->page_); + output.append(buffer, nbytes); } File::WriteStringToFileOrDie(output, filename); } diff --git a/training/boxchar.h b/training/boxchar.h index bcb102ced5..27b568a143 100644 --- a/training/boxchar.h +++ b/training/boxchar.h @@ -57,9 +57,36 @@ class BoxChar { string* mutable_ch() { return &ch_; } Box* mutable_box() { return box_; } + // Sort function for sorting by left edge of box. Note that this will not + // work properly until after InsertNewlines and InsertSpaces. 
+ bool operator<(const BoxChar& other) const { + if (box_ == NULL) return true; + if (other.box_ == NULL) return false; + return box_->x < other.box_->x; + } + static void TranslateBoxes(int xshift, int yshift, vector* boxes); + // Prepares for writing the boxes to a file by inserting newlines, spaces, + // and re-ordering so the boxes are strictly left-to-right. + static void PrepareToWrite(vector* boxes); + // Inserts newline (tab) characters into the vector at newline positions. + static void InsertNewlines(bool rtl_rules, bool vertical_rules, + vector* boxes); + // Converts NULL boxes to space characters, with appropriate bounding boxes. + static void InsertSpaces(bool rtl_rules, bool vertical_rules, + vector* boxes); + // Reorders text in a right-to-left script in left-to-right order. + static void ReorderRTLText(vector* boxes); + // Returns true if the vector contains mostly RTL characters. + static bool ContainsMostlyRTL(const vector& boxes); + // Returns true if the text is mostly laid out vertically. + static bool MostlyVertical(const vector& boxes); + + // Returns the total length of all the strings in the boxes. + static int TotalByteLength(const vector& boxes); + // Rotate the vector of boxes between start and end by the given rotation. // The rotation is in radians clockwise about the given center. static void RotateBoxes(float rotation, @@ -79,6 +106,14 @@ class BoxChar { Box* box_; int page_; }; + +// Sort predicate to sort a vector of BoxChar*. 
+struct BoxCharPtrSort { + bool operator()(const BoxChar* box1, const BoxChar* box2) const { + return *box1 < *box2; + } +}; + } // namespace tesseract #endif // TESSERACT_TRAINING_BOXCHAR_H_ diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index a448b92e51..8bef6699f7 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -330,7 +330,8 @@ void StringRenderer::ClearBoxes() { boxaDestroy(&page_boxes_); } -void StringRenderer::WriteAllBoxes(const string& filename) const { +void StringRenderer::WriteAllBoxes(const string& filename) { + BoxChar::PrepareToWrite(&boxchars_); BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); } diff --git a/training/stringrenderer.h b/training/stringrenderer.h index 1fd62d4d97..d96e572e17 100644 --- a/training/stringrenderer.h +++ b/training/stringrenderer.h @@ -148,7 +148,7 @@ class StringRenderer { void RotatePageBoxes(float rotation); // Delete all boxes. void ClearBoxes(); - void WriteAllBoxes(const string& filename) const; + void WriteAllBoxes(const string& filename); // Removes space-delimited words from the string that are not renderable by // the current font and returns the count of such words. 
int StripUnrenderableWords(string* utf8_text) const; From 21805e63a4980ae58717f23824c14f02ef31e4be Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 17:56:04 -0700 Subject: [PATCH 12/15] Improved performance with PIC compilation option --- classify/intmatcher.cpp | 108 +++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 63 deletions(-) diff --git a/classify/intmatcher.cpp b/classify/intmatcher.cpp index cb4b2eb331..b876435cd3 100644 --- a/classify/intmatcher.cpp +++ b/classify/intmatcher.cpp @@ -50,58 +50,51 @@ using tesseract::UnicharRating; const float IntegerMatcher::kSEExponentialMultiplier = 0.0; const float IntegerMatcher::kSimilarityCenter = 0.0075; -static const uinT8 offset_table[256] = { - 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, - 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0 -}; - -static const uinT8 next_table[256] = { - 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, - 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, 0x18, - 0x1c, 0x1c, 0x1e, - 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, 0x20, 0x28, 0x28, 0x2a, 0x28, - 0x2c, 0x2c, 0x2e, - 0x20, 0x30, 0x30, 0x32, 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, - 0x38, 0x3c, 0x3c, 0x3e, - 0, 0x40, 0x40, 0x42, 0x40, 0x44, 
0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, 0x48, - 0x4c, 0x4c, 0x4e, - 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, 0x50, 0x58, 0x58, 0x5a, - 0x58, 0x5c, 0x5c, 0x5e, - 0x40, 0x60, 0x60, 0x62, 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, - 0x68, 0x6c, 0x6c, 0x6e, - 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, - 0x78, 0x7c, 0x7c, 0x7e, - 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, 0x80, 0x88, 0x88, 0x8a, 0x88, - 0x8c, 0x8c, 0x8e, - 0x80, 0x90, 0x90, 0x92, 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, - 0x98, 0x9c, 0x9c, 0x9e, - 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, - 0xa8, 0xac, 0xac, 0xae, - 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, 0xb0, 0xb8, 0xb8, 0xba, - 0xb8, 0xbc, 0xbc, 0xbe, - 0x80, 0xc0, 0xc0, 0xc2, 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, - 0xc8, 0xcc, 0xcc, 0xce, - 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, - 0xd8, 0xdc, 0xdc, 0xde, - 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, 0xe0, 0xe8, 0xe8, 0xea, - 0xe8, 0xec, 0xec, 0xee, - 0xe0, 0xf0, 0xf0, 0xf2, 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, - 0xf8, 0xfc, 0xfc, 0xfe -}; +#define offset_table_entries \ + 255, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \ + 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, 0, 1, 0, 2, 0, 1, 0, 3, \ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, \ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \ + 0, 1, 0, 2, 0, 1, 0, 7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, \ + 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 6, \ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, 0, 1, 0, 2, 0, 1, 0, 3, \ + 0, 1, 0, 2, 0, 1, 0, 5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 4, \ + 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 
1, 0 + +#define INTMATCHER_OFFSET_TABLE_SIZE 256 + +#define next_table_entries \ + 0, 0, 0, 0x2, 0, 0x4, 0x4, 0x6, 0, 0x8, 0x8, 0x0a, 0x08, 0x0c, 0x0c, 0x0e, \ + 0, 0x10, 0x10, 0x12, 0x10, 0x14, 0x14, 0x16, 0x10, 0x18, 0x18, 0x1a, \ + 0x18, 0x1c, 0x1c, 0x1e, 0, 0x20, 0x20, 0x22, 0x20, 0x24, 0x24, 0x26, \ + 0x20, 0x28, 0x28, 0x2a, 0x28, 0x2c, 0x2c, 0x2e, 0x20, 0x30, 0x30, 0x32, \ + 0x30, 0x34, 0x34, 0x36, 0x30, 0x38, 0x38, 0x3a, 0x38, 0x3c, 0x3c, 0x3e, \ + 0, 0x40, 0x40, 0x42, 0x40, 0x44, 0x44, 0x46, 0x40, 0x48, 0x48, 0x4a, \ + 0x48, 0x4c, 0x4c, 0x4e, 0x40, 0x50, 0x50, 0x52, 0x50, 0x54, 0x54, 0x56, \ + 0x50, 0x58, 0x58, 0x5a, 0x58, 0x5c, 0x5c, 0x5e, 0x40, 0x60, 0x60, 0x62, \ + 0x60, 0x64, 0x64, 0x66, 0x60, 0x68, 0x68, 0x6a, 0x68, 0x6c, 0x6c, 0x6e, \ + 0x60, 0x70, 0x70, 0x72, 0x70, 0x74, 0x74, 0x76, 0x70, 0x78, 0x78, 0x7a, \ + 0x78, 0x7c, 0x7c, 0x7e, 0, 0x80, 0x80, 0x82, 0x80, 0x84, 0x84, 0x86, \ + 0x80, 0x88, 0x88, 0x8a, 0x88, 0x8c, 0x8c, 0x8e, 0x80, 0x90, 0x90, 0x92, \ + 0x90, 0x94, 0x94, 0x96, 0x90, 0x98, 0x98, 0x9a, 0x98, 0x9c, 0x9c, 0x9e, \ + 0x80, 0xa0, 0xa0, 0xa2, 0xa0, 0xa4, 0xa4, 0xa6, 0xa0, 0xa8, 0xa8, 0xaa, \ + 0xa8, 0xac, 0xac, 0xae, 0xa0, 0xb0, 0xb0, 0xb2, 0xb0, 0xb4, 0xb4, 0xb6, \ + 0xb0, 0xb8, 0xb8, 0xba, 0xb8, 0xbc, 0xbc, 0xbe, 0x80, 0xc0, 0xc0, 0xc2, \ + 0xc0, 0xc4, 0xc4, 0xc6, 0xc0, 0xc8, 0xc8, 0xca, 0xc8, 0xcc, 0xcc, 0xce, \ + 0xc0, 0xd0, 0xd0, 0xd2, 0xd0, 0xd4, 0xd4, 0xd6, 0xd0, 0xd8, 0xd8, 0xda, \ + 0xd8, 0xdc, 0xdc, 0xde, 0xc0, 0xe0, 0xe0, 0xe2, 0xe0, 0xe4, 0xe4, 0xe6, \ + 0xe0, 0xe8, 0xe8, 0xea, 0xe8, 0xec, 0xec, 0xee, 0xe0, 0xf0, 0xf0, 0xf2, \ + 0xf0, 0xf4, 0xf4, 0xf6, 0xf0, 0xf8, 0xf8, 0xfa, 0xf8, 0xfc, 0xfc, 0xfe + +// See http://b/19318793 (#6) for a complete discussion. Merging arrays +// offset_table and next_table helps improve performance of PIE code. 
+static const uinT8 data_table[512] = {offset_table_entries, next_table_entries}; + +static const uinT8* const offset_table = &data_table[0]; +static const uinT8* const next_table = + &data_table[INTMATCHER_OFFSET_TABLE_SIZE]; namespace tesseract { @@ -1274,17 +1267,6 @@ float IntegerMatcher::ApplyCNCorrection(float rating, int blob_length, (blob_length + matcher_multiplier); } -/*---------------------------------------------------------------------------*/ -#ifndef GRAPHICS_DISABLED -// Print debug information about the best match for the current class. -void IntegerMatcher::DebugBestMatch( - int BestMatch, INT_RESULT Result) { - tprintf("Rating = %5.1f%% Best Config = %3d, Distance = %5.1f\n", - 100.0 * Result->Rating, Result->Config, - 100.0 * (65536.0 - BestMatch) / 65536.0); -} -#endif - /*---------------------------------------------------------------------------*/ void HeapSort (int n, register int ra[], register int rb[]) { From 6be25156f75ed7572b2d65e8fe6b9679adfd5599 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 18:04:31 -0700 Subject: [PATCH 13/15] Major updates to training system as a result of extensive testing on 100 languages --- training/language-specific.sh | 1131 ++++++++++++++++++++++++ training/ligature_table.cpp | 2 +- training/pango_font_info.cpp | 147 ++- training/pango_font_info.h | 23 +- training/set_unicharset_properties.cpp | 161 +--- training/stringrenderer.cpp | 4 +- training/tesstrain.sh | 536 +---------- training/tesstrain_utils.sh | 578 ++++++++++++ training/text2image.cpp | 11 +- training/unicharset_training_utils.cpp | 193 ++++ training/unicharset_training_utils.h | 50 ++ 11 files changed, 2104 insertions(+), 732 deletions(-) create mode 100644 training/language-specific.sh create mode 100755 training/tesstrain_utils.sh create mode 100644 training/unicharset_training_utils.cpp create mode 100644 training/unicharset_training_utils.h diff --git a/training/language-specific.sh b/training/language-specific.sh new file 
mode 100644 index 0000000000..6514d23de7 --- /dev/null +++ b/training/language-specific.sh @@ -0,0 +1,1131 @@ +# +# Set some language specific variables. Works in conjunction with +# tesstrain.sh +# + +#============================================================================= +# Language specific info +#============================================================================= + +# Array of all valid language codes. +VALID_LANGUAGE_CODES="afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat + ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo + ell eng enm epo est eus fas fil fin fra frk frm gle glg + grc guj hat heb hin hrv hun hye iku ind isl ita ita_old + jav jpn kan kat kat_old kaz khm kir kor kur lao lat + lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori + pan pol por pus ron rus san sin slk slv snd spa spa_old + sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur + uig ukr urd uzb uzb_cyrl vie yid" + +# Codes for which we have webtext but no fonts: +# armenian, dhivehi, mongolian (we support mongolian cyrillic as in the webtext, +# but not mongolian script with vertical writing direction), sindhi (for which +# we have persian script webtext, but real sindhi text can be in persian OR +# devanagari script) +UNUSABLE_LANGUAGE_CODES="hye div mon snd" + +FRAKTUR_FONTS=( + "CaslonishFraxx Medium" \ + "Cloister Black, Light" \ + "Proclamate Light" \ + "UnifrakturMaguntia" \ + "Walbaum-Fraktur" \ +) + +# List of fonts to train on +LATIN_FONTS=( + "Arial Bold" \ + "Arial Bold Italic" \ + "Arial Italic" \ + "Arial" \ + "Courier New Bold" \ + "Courier New Bold Italic" \ + "Courier New Italic" \ + "Courier New" \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Times New Roman," \ + "Georgia Bold" \ + "Georgia Italic" \ + "Georgia" \ + "Georgia Bold Italic" \ + "Trebuchet MS Bold" \ + "Trebuchet MS Bold Italic" \ + "Trebuchet MS Italic" \ + "Trebuchet MS" \ + "Verdana Bold" \ + "Verdana Italic" \ 
+ "Verdana" \ + "Verdana Bold Italic" \ + "URW Bookman L Bold" \ + "URW Bookman L Italic" \ + "URW Bookman L Bold Italic" \ + "Century Schoolbook L Bold" \ + "Century Schoolbook L Italic" \ + "Century Schoolbook L Bold Italic" \ + "Century Schoolbook L Medium" \ + "DejaVu Sans Ultra-Light" \ +) + +EARLY_LATIN_FONTS=( + "${FRAKTUR_FONTS[@]}" \ + "${LATIN_FONTS[@]}" \ + # The Wyld font family renders early modern ligatures encoded in the private + # unicode area. + "Wyld" \ + "Wyld Italic" \ + # Fonts that render the Yogh symbol (U+021C, U+021D) found in Old English. + "GentiumAlt" \ +) + +VIETNAMESE_FONTS=( \ + "Arial Unicode MS Bold" \ + "Arial Bold Italic" \ + "Arial Italic" \ + "Arial Unicode MS" \ + "FreeMono Bold" \ + "Courier New Bold Italic" \ + "FreeMono Italic" \ + "FreeMono" \ + "GentiumAlt Italic" \ + "GentiumAlt" \ + "Palatino Linotype Bold" \ + "Palatino Linotype Bold Italic" \ + "Palatino Linotype Italic" \ + "Palatino Linotype" \ + "Really No 2 LT W2G Light" \ + "Really No 2 LT W2G Light Italic" \ + "Really No 2 LT W2G Medium" \ + "Really No 2 LT W2G Medium Italic" \ + "Really No 2 LT W2G Semi-Bold" \ + "Really No 2 LT W2G Semi-Bold Italic" \ + "Really No 2 LT W2G Ultra-Bold" \ + "Really No 2 LT W2G Ultra-Bold Italic" \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Times New Roman," \ + "Verdana Bold" \ + "Verdana Italic" \ + "Verdana" \ + "Verdana Bold Italic" \ + "VL Gothic" \ + "VL PGothic" \ + ) + +DEVANAGARI_FONTS=( \ + "FreeSans" \ + "Chandas" \ + "Kalimati" \ + "Uttara" \ + "Lucida Sans" \ + "gargi Medium" \ + "Lohit Devanagari" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Noto Sans Devanagari Bold" \ + "Noto Sans Devanagari" \ + "Samyak Devanagari Medium" \ + "Sarai" \ + "Saral LT Bold" \ + "Saral LT Light" \ + "Nakula" \ + "Sahadeva" \ + "Samanata" \ + "Santipur OT Medium" \ + ) + +KANNADA_FONTS=( \ + "Kedage Bold" \ + "Kedage Italic" \ + "Kedage" \ + "Kedage Bold Italic" \ + "Mallige 
Bold" \ + "Mallige Italic" \ + "Mallige" \ + "Mallige Bold Italic" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "cheluvi Medium" \ + "Noto Sans Kannada Bold" \ + "Noto Sans Kannada" \ + "Lohit Kannada" \ + "Tunga" \ + "Tunga Bold" \ + ) + +TELUGU_FONTS=( \ + "Pothana2000" \ + "Vemana2000" \ + "Lohit Telugu" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Dhurjati" \ + "Gautami Bold" \ + "Gidugu" \ + "Gurajada" \ + "Lakki Reddy" \ + "Mallanna" \ + "Mandali" \ + "NATS" \ + "NTR" \ + "Noto Sans Telugu Bold" \ + "Noto Sans Telugu" \ + "Peddana" \ + "Ponnala" \ + "Ramabhadra" \ + "Ravi Prakash" \ + "Sree Krushnadevaraya" \ + "Suranna" \ + "Suravaram" \ + "Tenali Ramakrishna" \ + "Gautami" \ + ) + +TAMIL_FONTS=( \ + "TAMu_Kadambri" \ + "TAMu_Kalyani" \ + "TAMu_Maduram" \ + "TSCu_Paranar" \ + "TSCu_Times" \ + "TSCu_Paranar Bold" \ + "FreeSans" \ + "FreeSerif" \ + "Lohit Tamil" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Droid Sans Tamil Bold" \ + "Droid Sans Tamil" \ + "Karla Tamil Inclined Bold Italic" \ + "Karla Tamil Inclined Italic" \ + "Karla Tamil Upright Bold" \ + "Karla Tamil Upright" \ + "Noto Sans Tamil Bold" \ + "Noto Sans Tamil" \ + "Noto Sans Tamil UI Bold" \ + "Noto Sans Tamil UI" \ + "TSCu_Comic Normal" \ + "Lohit Tamil Classical" \ + ) + +THAI_FONTS=( \ + "FreeSerif" \ + "FreeSerif Italic" \ + "Garuda" \ + "Norasi" \ + "Lucida Sans Typewriter" \ + "Lucida Sans" \ + "Garuda Oblique" \ + "Norasi Oblique" \ + "Norasi Italic" \ + "Garuda Bold" \ + "Norasi Bold" \ + "Lucida Sans Typewriter Bold" \ + "Lucida Sans Semi-Bold" \ + "Garuda Bold Oblique" \ + "Norasi Bold Italic" \ + "Norasi Bold Oblique" \ + "AnuParp LT Thai" \ + "Arial Unicode MS Bold" \ + "Arial Unicode MS" \ + "Ascender Uni" \ + "Loma" \ + "Noto Serif Thai Bold" \ + "Noto Serif Thai" \ + "Purisa Light" \ + "Sirichana LT Bold" \ + "Sirichana LT" \ + "Sukothai LT Bold" \ + "Sukothai LT" \ + "UtSaHaGumm LT Thai" \ + "Tahoma" \ + ) + +KOREAN_FONTS=( \ + "Arial 
Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Baekmuk Batang Patched" \ + "Baekmuk Batang" \ + "Baekmuk Dotum" \ + "Baekmuk Gulim" \ + "Baekmuk Headline" \ + "Bandal Medium" \ + "Bangwool Medium" \ + "Dotum" \ + "Eunjin Medium" \ + "EunjinNakseo Medium" \ + "FBHanGothicDB" \ + "Guseul Medium" \ + "JejuGothic" \ + "JejuHallasan" \ + "JejuMyeongjo" \ + "KoPub Batang Bold" \ + "KoPub Batang Light" \ + "KoPub Batang" \ + "Nanum Brush Script" \ + "NanumGothic Bold" \ + "NanumGothic Ultra-Bold" \ + "NanumGothic" \ + "NanumMyeongjo Bold" \ + "NanumMyeongjo Semi-Bold" \ + "NanumMyeongjo" \ + "Nanum Pen" \ + "WenQuanYi Zen Hei Medium" \ + ) + +CHI_SIM_FONTS=( \ + "AR PL UKai CN" \ + "AR PL UMing Patched Light" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "CFangSongPRC" \ + "CGuLi PRC" \ + "CGuYin PRC" \ + "CHei2 PRC" \ + "CHei3 PRC" \ + "CNganKai PRC" \ + "CPo3 PRC" \ + "CPo PRC" \ + "CSong3 PRC" \ + "CWeiBei PRC" \ + "CXLi PRC" \ + "CXYao PRC" \ + "CXing PRC" \ + "CYuen2 PRC" \ + "MComic PRC" \ + "MCute PRC" \ + "MElle PRC" \ + "MGentle PRC" \ + "MJNgai PRC" \ + "MKai PRC" \ + "MMarker PRC" \ + "MRocky PRC" \ + "MSung PRC" \ + "MWindy PRC" \ + "MYoung PRC" \ + "MYuen PRC" \ + "MYuppy PRC" \ + "WenQuanYi Zen Hei Medium" \ + ) + +# The PRC fonts don't cover all the character set for chi_tra, but they +# provide a broader view of the fonts for the characters they do cover. 
+CHI_TRA_FONTS=( \ + "AR PL UKai Patched" \ + "AR PL UMing Patched Light" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "CFangSongPRC" \ + "CGuLi PRC" \ + "CGuYin PRC" \ + "CHei2 PRC" \ + "CHei3 PRC" \ + "CNganKai PRC" \ + "CPo3 PRC" \ + "CPo PRC" \ + "CSong3 PRC" \ + "CWeiBei PRC" \ + "CXLi PRC" \ + "CXYao PRC" \ + "CXing PRC" \ + "CYuen2 PRC" \ + "MComic PRC" \ + "MCute PRC" \ + "MElle PRC" \ + "MGentle PRC" \ + "MJNgai PRC" \ + "MKai PRC" \ + "MMarker PRC" \ + "MRocky PRC" \ + "MSung PRC" \ + "MWindy PRC" \ + "MYoung PRC" \ + "MYuen PRC" \ + "MYuppy PRC" \ + "WenQuanYi Zen Hei Medium" \ + ) + +JPN_FONTS=( \ + "TakaoExGothic" \ + "TakaoExMincho" \ + "TakaoGothic" \ + "TakaoMincho" \ + "TakaoPGothic" \ + "TakaoPMincho" \ + "VL Gothic" \ + "VL PGothic" \ + "Noto Sans Japanese Black" \ + "Noto Sans Japanese Bold" \ + "Noto Sans Japanese Light" \ + "Noto Sans Japanese Medium" \ + "Noto Sans Japanese" \ + "Noto Sans Japanese Thin" \ + "IPAGothic" \ + "IPAPGothic" \ + "IPAUIGothic" \ + "IPAMincho" \ + "IPAPMincho" \ + "Kochi Gothic" \ + "Kochi Mincho" \ + "Monapo" \ + "UmePlus Gothic" \ + "UmePlus P Gothic" \ + "WenQuanYi Zen Hei Medium" \ + ) + +RUSSIAN_FONTS=( \ + "Arial Bold" \ + "Arial Bold Italic" \ + "Arial Italic" \ + "Arial" \ + "Courier New Bold" \ + "Courier New Bold Italic" \ + "Courier New Italic" \ + "Courier New" \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Times New Roman," \ + "Georgia Bold" \ + "Georgia Italic" \ + "Georgia" \ + "Georgia Bold Italic" \ + "Trebuchet MS Bold" \ + "Trebuchet MS Bold Italic" \ + "Trebuchet MS Italic" \ + "Trebuchet MS" \ + "Verdana Bold" \ + "Verdana Italic" \ + "Verdana" \ + "Verdana Bold Italic" \ + "DejaVu Serif" \ + "DejaVu Serif Oblique" \ + "DejaVu Serif Bold" \ + "DejaVu Serif Bold Oblique" \ + "Lucida Bright" \ + "FreeSerif Bold" \ + "FreeSerif Bold Italic" \ + "DejaVu Sans Ultra-Light" \ + ) + +GREEK_FONTS=( \ + "Arial Unicode MS" \ + "Arial Unicode MS 
Bold" \ + "DejaVu Sans Mono" \ + "DejaVu Sans Mono Oblique" \ + "DejaVu Sans Mono Bold" \ + "DejaVu Sans Mono Bold Oblique" \ + "DejaVu Serif" \ + "DejaVu Serif Semi-Condensed" \ + "DejaVu Serif Oblique" \ + "DejaVu Serif Bold" \ + "DejaVu Serif Bold Oblique" \ + "DejaVu Serif Bold Semi-Condensed" \ + "FreeSerif Bold" \ + "FreeSerif Bold Italic" \ + "FreeSerif Italic" \ + "FreeSerif" \ + "GentiumAlt" \ + "GentiumAlt Italic" \ + "Linux Biolinum O Bold" \ + "Linux Biolinum O" \ + "Linux Libertine O Bold" \ + "Linux Libertine O" \ + "Linux Libertine O Bold Italic" \ + "Linux Libertine O Italic" \ + "Palatino Linotype Bold" \ + "Palatino Linotype Bold Italic" \ + "Palatino Linotype Italic" \ + "Palatino Linotype" \ + "UmePlus P Gothic" \ + "VL PGothic" \ + ) + +ARABIC_FONTS=( \ + "Arabic Transparent Bold" \ + "Arabic Transparent" \ + "Arab" \ + "Arial Unicode MS Bold" \ + "Arial Unicode MS" \ + "ASVCodar LT Bold" \ + "ASVCodar LT Light" \ + "Badiya LT Bold" \ + "Badiya LT" \ + "Badr LT Bold" \ + "Badr LT" \ + "Dimnah" \ + "Frutiger LT Arabic Bold" \ + "Frutiger LT Arabic" \ + "Furat" \ + "Hassan LT Bold" \ + "Hassan LT Light" \ + "Jalal LT Bold" \ + "Jalal LT Light" \ + "Midan Bold" \ + "Midan" \ + "Mitra LT Bold" \ + "Mitra LT Light" \ + "Palatino LT Arabic" \ + "Palatino Sans Arabic Bold" \ + "Palatino Sans Arabic" \ + "Simplified Arabic Bold" \ + "Simplified Arabic" \ + "Times New Roman, Bold" \ + "Times New Roman," \ + "Traditional Arabic Bold" \ + "Traditional Arabic" \ + ) + +HEBREW_FONTS=( \ + "Arial Bold" \ + "Arial Bold Italic" \ + "Arial Italic" \ + "Arial" \ + "Courier New Bold" \ + "Courier New Bold Italic" \ + "Courier New Italic" \ + "Courier New" \ + "Ergo Hebrew Semi-Bold" \ + "Ergo Hebrew Semi-Bold Italic" \ + "Ergo Hebrew" \ + "Ergo Hebrew Italic" \ + "Really No 2 LT W2G Light" \ + "Really No 2 LT W2G Light Italic" \ + "Really No 2 LT W2G Medium" \ + "Really No 2 LT W2G Medium Italic" \ + "Really No 2 LT W2G Semi-Bold" \ + "Really No 2 LT W2G 
Semi-Bold Italic" \ + "Really No 2 LT W2G Ultra-Bold" \ + "Really No 2 LT W2G Ultra-Bold Italic" \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Times New Roman," \ + "Lucida Sans" \ + "Tahoma" \ + ) + +BENGALI_FONTS=( \ + "Bangla Medium" \ + "Lohit Bengali" \ + "Mukti Narrow" \ + "Mukti Narrow Bold" \ + "Jamrul Medium Semi-Expanded" \ + "Likhan Medium" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "FreeSans" \ + "FreeSans Oblique" \ + "FreeSerif" \ + "FreeSerif Italic" \ + "Noto Sans Bengali Bold" \ + "Noto Sans Bengali" \ + "Ani" \ + "Lohit Assamese" \ + "Lohit Bengali" \ + "Mitra Mono" \ + ) + +KYRGYZ_FONTS=( \ + "Arial" \ + "Arial Bold" \ + "Arial Italic" \ + "Arial Bold Italic" \ + "Courier New" \ + "Courier New Bold" \ + "Courier New Italic" \ + "Courier New Bold Italic" \ + "Times New Roman," \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "DejaVu Serif" \ + "DejaVu Serif Oblique" \ + "DejaVu Serif Bold" \ + "DejaVu Serif Bold Oblique" \ + "Lucida Bright" \ + "FreeSerif Bold" \ + "FreeSerif Bold Italic" \ + ) + +PERSIAN_FONTS=( \ + "Amiri Bold Italic" \ + "Amiri Bold" \ + "Amiri Italic" \ + "Amiri" \ + "Andale Sans Arabic Farsi" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Lateef" \ + "Lucida Bright" \ + "Lucida Sans Oblique" \ + "Lucida Sans Semi-Bold" \ + "Lucida Sans" \ + "Lucida Sans Typewriter Bold" \ + "Lucida Sans Typewriter Oblique" \ + "Lucida Sans Typewriter" \ + "Scheherazade" \ + "Tahoma" \ + "Times New Roman," \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Yakout Linotype Bold" \ + "Yakout Linotype" \ + ) + +AMHARIC_FONTS=( \ + "Abyssinica SIL" + "Droid Sans Ethiopic Bold" \ + "Droid Sans Ethiopic" \ + "FreeSerif" \ + "Noto Sans Ethiopic Bold" \ + "Noto Sans Ethiopic" \ + ) + +ARMENIAN_FONTS=( \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "FreeMono" \ + 
"FreeMono Italic" \ + "FreeSans" \ + "FreeSans Bold" \ + "FreeSans Oblique" \ + ) + +BURMESE_FONTS=( \ + "Myanmar Sans Pro" \ + "Noto Sans Myanmar Bold" \ + "Noto Sans Myanmar" \ + "Padauk Bold" \ + "Padauk" \ + "TharLon" \ + ) + +NORTH_AMERICAN_ABORIGINAL_FONTS=( \ + "Aboriginal Sans" \ + "Aboriginal Sans Bold Italic" \ + "Aboriginal Sans Italic" \ + "Aboriginal Sans Bold" \ + "Aboriginal Serif Bold" \ + "Aboriginal Serif Bold Italic" \ + "Aboriginal Serif Italic" \ + "Aboriginal Serif" \ + ) + +GEORGIAN_FONTS=( \ + "Arial Unicode MS Bold" \ + "Arial Unicode MS" \ + "BPG Algeti GPL&GNU" \ + "BPG Chveulebrivi GPL&GNU" \ + "BPG Courier GPL&GNU" \ + "BPG Courier S GPL&GNU" \ + "BPG DejaVu Sans 2011 GNU-GPL" \ + "BPG Elite GPL&GNU" \ + "BPG Excelsior GPL&GNU" \ + "BPG Glaho GPL&GNU" \ + "BPG Gorda GPL&GNU" \ + "BPG Ingiri GPL&GNU" \ + "BPG Mrgvlovani Caps GNU&GPL" \ + "BPG Mrgvlovani GPL&GNU" \ + "BPG Nateli Caps GPL&GNU Light" \ + "BPG Nateli Condenced GPL&GNU Light" \ + "BPG Nateli GPL&GNU Light" \ + "BPG Nino Medium Cond GPL&GNU" \ + "BPG Nino Medium GPL&GNU Medium" \ + "BPG Sans GPL&GNU" \ + "BPG Sans Medium GPL&GNU" \ + "BPG Sans Modern GPL&GNU" \ + "BPG Sans Regular GPL&GNU" \ + "BPG Serif GPL&GNU" \ + "BPG Serif Modern GPL&GNU" \ + "FreeMono" \ + "FreeMono Bold Italic" \ + "FreeSans" \ + "FreeSerif" \ + "FreeSerif Bold" \ + "FreeSerif Bold Italic" \ + "FreeSerif Italic" \ + ) + +OLD_GEORGIAN_FONTS=( \ + "Arial Unicode MS Bold" \ + "Arial Unicode MS" \ + "BPG Algeti GPL&GNU" \ + "BPG Courier S GPL&GNU" \ + "BPG DejaVu Sans 2011 GNU-GPL" \ + "BPG Elite GPL&GNU" \ + "BPG Excelsior GPL&GNU" \ + "BPG Glaho GPL&GNU" \ + "BPG Ingiri GPL&GNU" \ + "BPG Mrgvlovani Caps GNU&GPL" \ + "BPG Mrgvlovani GPL&GNU" \ + "BPG Nateli Caps GPL&GNU Light" \ + "BPG Nateli Condenced GPL&GNU Light" \ + "BPG Nateli GPL&GNU Light" \ + "BPG Nino Medium Cond GPL&GNU" \ + "BPG Nino Medium GPL&GNU Medium" \ + "BPG Sans GPL&GNU" \ + "BPG Sans Medium GPL&GNU" \ + "BPG Sans Modern GPL&GNU" \ + 
"BPG Sans Regular GPL&GNU" \ + "BPG Serif GPL&GNU" \ + "BPG Serif Modern GPL&GNU" \ + "FreeSans" \ + "FreeSerif" \ + "FreeSerif Bold" \ + "FreeSerif Bold Italic" \ + "FreeSerif Italic" \ + ) + +KHMER_FONTS=( \ + "Khmer OS" \ + "Khmer OS System" \ + "Khmer OS Battambang" \ + "Khmer OS Bokor" \ + "Khmer OS Content" \ + "Khmer OS Fasthand" \ + "Khmer OS Freehand" \ + "Khmer OS Metal Chrieng" \ + "Khmer OS Muol Light" \ + "Khmer OS Muol Pali" \ + "Khmer OS Muol" \ + "Khmer OS Siemreap" \ + "Noto Sans Bold" \ + "Noto Sans" \ + "Noto Serif Khmer Bold" \ + "Noto Serif Khmer Light" \ + ) + +KURDISH_FONTS=( \ + "Amiri Bold Italic" \ + "Amiri Bold" \ + "Amiri Italic" \ + "Amiri" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Lateef" \ + "Lucida Bright" \ + "Lucida Sans Oblique" \ + "Lucida Sans Semi-Bold" \ + "Lucida Sans" \ + "Lucida Sans Typewriter Bold" \ + "Lucida Sans Typewriter Oblique" \ + "Lucida Sans Typewriter" \ + "Scheherazade" \ + "Tahoma" \ + "Times New Roman," \ + "Times New Roman, Bold" \ + "Times New Roman, Bold Italic" \ + "Times New Roman, Italic" \ + "Unikurd Web" \ + "Yakout Linotype Bold" \ + "Yakout Linotype" \ + ) + +LAOTHIAN_FONTS=( \ + "Phetsarath OT" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Dhyana Bold" \ + "Dhyana" \ + "Lao Muang Don" \ + "Lao Muang Khong" \ + "Lao Sans Pro" \ + "Noto Sans Lao Bold" \ + "Noto Sans Lao" \ + "Noto Sans Lao UI Bold" \ + "Noto Sans Lao UI" \ + "Noto Serif Lao Bold" \ + "Noto Serif Lao" \ + "Phetsarath Bold" \ + "Phetsarath" \ + "Souliyo Unicode" \ +) + +GUJARATI_FONTS=( \ + "Lohit Gujarati" \ + "Rekha Medium" \ + "Samyak Gujarati Medium" \ + "aakar Medium" \ + "padmaa Bold" \ + "padmaa Medium" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "FreeSans" \ + "Noto Sans Gujarati Bold" \ + "Noto Sans Gujarati" \ + "Shruti" \ + "Shruti Bold" \ + ) + +MALAYALAM_FONTS=( \ + "AnjaliOldLipi" \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" 
\ + "Dyuthi" \ + "FreeSerif" \ + "Kalyani" \ + "Kartika" \ + "Kartika Bold" \ + "Lohit Malayalam" \ + "Meera" \ + "Noto Sans Malayalam Bold" \ + "Noto Sans Malayalam" \ + "Rachana" \ + "Rachana_w01" \ + "RaghuMalayalam" \ + "suruma" \ + ) + +ORIYA_FONTS=( \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "ori1Uni Medium" \ + "Samyak Oriya Medium" \ + "Lohit Oriya" \ + ) + +PUNJABI_FONTS=( \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "Saab" \ + "Lohit Punjabi" \ + "Noto Sans Gurmukhi" \ + "Noto Sans Gurmukhi Bold" \ + "FreeSans" \ + "FreeSans Bold" \ + "FreeSerif" \ + ) + +SINHALA_FONTS=( \ + "Noto Sans Sinhala Bold" \ + "Noto Sans Sinhala" \ + "OCRUnicode" \ + "Yagpo" \ + "LKLUG" \ + "FreeSerif" \ + ) + +SYRIAC_FONTS=( \ + "East Syriac Adiabene" \ + "East Syriac Ctesiphon" \ + "Estrangelo Antioch" \ + "Estrangelo Edessa" \ + "Estrangelo Midyat" \ + "Estrangelo Nisibin" \ + "Estrangelo Quenneshrin" \ + "Estrangelo Talada" \ + "Estrangelo TurAbdin" \ + "Serto Batnan Bold" \ + "Serto Batnan" \ + "Serto Jerusalem Bold" \ + "Serto Jerusalem Italic" \ + "Serto Jerusalem" \ + "Serto Kharput" \ + "Serto Malankara" \ + "Serto Mardin Bold" \ + "Serto Mardin" \ + "Serto Urhoy Bold" \ + "Serto Urhoy" \ + "FreeSans" \ + ) + +THAANA_FONTS=( \ + "FreeSerif" \ + ) + +TIBETAN_FONTS=( \ + "Arial Unicode MS" \ + "Arial Unicode MS Bold" \ + "Ascender Uni" \ + "DDC Uchen" \ + "Jomolhari" \ + "Kailasa" \ + "Kokonor" \ + "Tibetan Machine Uni" \ + "TibetanTsugRing" \ + "Yagpo" \ + ) + +# The following fonts will be rendered vertically in phase I. 
+VERTICAL_FONTS=( \ + "TakaoExGothic" \ # for jpn + "TakaoExMincho" \ # for jpn + "AR PL UKai Patched" \ # for chi_tra + "AR PL UMing Patched Light" \ # for chi_tra + "Baekmuk Batang Patched" \ # for kor + ) + + # Set language-specific values for several global variables, including + # ${TEXT_CORPUS} + # holds the text corpus file for the language, used in phase F + # ${FONTS[@]} + # holds a sequence of applicable fonts for the language, used in + # phase F & I + # ${TRAINING_DATA_ARGUMENTS} + # non-default arguments to the training_data program used in phase T + # ${FILTER_ARGUMENTS} - + # character-code-specific filtering to distinguish between scripts + # (eg. CJK) used by filter_forbidden_characters in phase F + # ${WORDLIST2DAWG_ARGUMENTS} + # specify fixed length dawg generation for non-space-delimited lang + # TODO(dsl): We can refactor these into functions that assign FONTS, + # TEXT_CORPUS, etc. separately. +set_lang_specific_parameters() { + local lang=$1 + # The default text location is now given directly from the language code. + TEXT_CORPUS="${FLAGS_webtext_prefix}/${lang}.corpus.txt" + FONTS=( "${LATIN_FONTS[@]}" ) + FILTER_ARGUMENTS="" + WORDLIST2DAWG_ARGUMENTS="" + WORD_DAWG_SIZE=100000 + TRAINING_DATA_ARGUMENTS="" + FRAGMENTS_DISABLED="y" + RUN_SHAPE_CLUSTERING=0 + AMBIGS_FILTER_DENOMINATOR="100000" + LEADING="32" + MEAN_COUNT="40" # Default for latin script. + + case ${lang} in + # Latin languages. + enm ) TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported + FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + frm ) TEXT_CORPUS="${FLAGS_webtext_prefix}/fra.corpus.txt" + # Make long-s substitutions for Middle French text + FILTER_ARGUMENTS="--make_early_language_variant=fra" + TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. 
+ FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + frk ) TEXT_CORPUS="${FLAGS_webtext_prefix}/deu.corpus.txt" + FONTS=( "${FRAKTUR_FONTS[@]}" );; + ita_old ) + TEXT_CORPUS="${FLAGS_webtext_prefix}/ita.corpus.txt" + # Make long-s substitutions for Early Italian text + FILTER_ARGUMENTS="--make_early_language_variant=ita" + TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. + FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + spa_old ) + TEXT_CORPUS="${FLAGS_webtext_prefix}/spa.corpus.txt" + # Make long-s substitutions for Early Spanish text + FILTER_ARGUMENTS="--make_early_language_variant=spa" + TEXT2IMAGE_EXTRA_ARGS=" --ligatures" # Add ligatures when supported. + FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + srp_latn ) + TEXT_CORPUS=${FLAGS_webtext_prefix}/srp.corpus.txt ;; + vie ) TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + FONTS=( "${VIETNAMESE_FONTS[@]}" ) ;; + # Highly inflective languages get a bigger dawg size. + # TODO(rays) Add more here! + hun ) WORD_DAWG_SIZE=1000000 ;; + pol ) WORD_DAWG_SIZE=1000000 ;; + + # Latin with default treatment. + afr ) ;; + aze ) ;; + bos ) ;; + cat ) ;; + ceb ) ;; + ces ) ;; + cym ) ;; + dan ) ;; + deu ) ;; + eng ) ;; + epo ) ;; + est ) ;; + eus ) ;; + fil ) ;; + fin ) ;; + fra ) ;; + gle ) ;; + glg ) ;; + hat ) ;; + hrv ) ;; + ind ) ;; + isl ) ;; + ita ) ;; + jav ) ;; + lat ) ;; + lav ) ;; + lit ) ;; + mlt ) ;; + msa ) ;; + nld ) ;; + nor ) ;; + por ) ;; + ron ) ;; + slk ) ;; + slv ) ;; + spa ) ;; + sqi ) ;; + swa ) ;; + swe ) ;; + tgl ) ;; + tur ) ;; + uzb ) ;; + zlm ) ;; + + # Special code for performing language-id that is trained on + # EFIGS+Latin+Vietnamese text with regular + fraktur fonts. + lat_lid ) + TEXT_CORPUS=${FLAGS_webtext_prefix}/lat_lid.corpus.txt + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + GENERATE_WORD_BIGRAMS=0 + # Strip unrenderable words as not all fonts will render the extended + # latin symbols found in Vietnamese text. 
+ WORD_DAWG_SIZE=1000000 + FONTS=( "${EARLY_LATIN_FONTS[@]}" );; + + # Cyrillic script-based languages. + rus ) FONTS=( "${RUSSIAN_FONTS[@]}" ) + WORD_DAWG_SIZE=1000000 ;; + aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) + FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; + + # Special code for performing Cyrillic language-id that is trained on + # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian + # text with the list of Russian fonts. + cyr_lid ) + TEXT_CORPUS=${FLAGS_webtext_prefix}/cyr_lid.corpus.txt + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + GENERATE_WORD_BIGRAMS=0 + FRAGMENTS_DISABLED="y" + WORD_DAWG_SIZE=1000000 + FONTS=( "${RUSSIAN_FONTS[@]}" );; + + # South Asian scripts mostly have a lot of different graphemes, so trim + # down the MEAN_COUNT so as not to get a huge amount of text. + asm | ben ) + MEAN_COUNT="15" + FONTS=( "${BENGALI_FONTS[@]}" ) ;; + bih | hin | mar | nep | san ) + MEAN_COUNT="15" + FONTS=( "${DEVANAGARI_FONTS[@]}" ) ;; + bod ) MEAN_COUNT="15" + FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + dzo ) FONTS=( "${TIBETAN_FONTS[@]}" ) ;; + guj ) MEAN_COUNT="15" + FONTS=( "${GUJARATI_FONTS[@]}" ) ;; + kan ) MEAN_COUNT="15" + TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" + TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" + FONTS=( "${KANNADA_FONTS[@]}" ) ;; + mal ) MEAN_COUNT="15" + TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" + TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" + FONTS=( "${MALAYALAM_FONTS[@]}" ) ;; + ori ) FONTS=( "${ORIYA_FONTS[@]}" ) ;; + pan ) MEAN_COUNT="15" + FONTS=( "${PUNJABI_FONTS[@]}" ) ;; + sin ) MEAN_COUNT="15" + FONTS=( "${SINHALA_FONTS[@]}" ) ;; + tam ) MEAN_COUNT="30" + TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" + TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" + FONTS=( "${TAMIL_FONTS[@]}" ) ;; + tel ) MEAN_COUNT="15" + TRAINING_DATA_ARGUMENTS+=" --no_newline_in_output" + TEXT2IMAGE_EXTRA_ARGS=" --char_spacing=0.5" + FONTS=( "${TELUGU_FONTS[@]}" ) ;; + + # SouthEast Asian scripts. 
+ khm ) MEAN_COUNT="15" + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + FONTS=( "${KHMER_FONTS[@]}" ) ;; + lao ) MEAN_COUNT="15" + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + FONTS=( "${LAOTHIAN_FONTS[@]}" ) ;; + mya ) MEAN_COUNT="12" + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + FONTS=( "${BURMESE_FONTS[@]}" ) ;; + tha ) MEAN_COUNT="30" + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + FILTER_ARGUMENTS="--segmenter_lang=tha" + TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" + AMBIGS_FILTER_DENOMINATOR="1000" + LEADING=48 + FONTS=( "${THAI_FONTS[@]}" ) ;; + + # CJK + chi_sim ) + MEAN_COUNT="15" + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" + FILTER_ARGUMENTS="--charset_filter=chi_sim --segmenter_lang=chi_sim" + FRAGMENTS_DISABLED="y" + GENERATE_DAWGS=0 + FONTS=( "${CHI_SIM_FONTS[@]}" ) ;; + chi_tra ) + MEAN_COUNT="15" + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" + FILTER_ARGUMENTS="--charset_filter=chi_tra --segmenter_lang=chi_tra" + FRAGMENTS_DISABLED="y" + GENERATE_DAWGS=0 + FONTS=( "${CHI_TRA_FONTS[@]}" ) ;; + jpn ) MEAN_COUNT="15" + GENERATE_WORD_BIGRAMS=0 + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + TRAINING_DATA_ARGUMENTS+=" --no_space_in_output --desired_bigrams=" + FILTER_ARGUMENTS="--charset_filter=jpn --segmenter_lang=jpn" + FRAGMENTS_DISABLED="y" + GENERATE_DAWGS=0 + FONTS=( "${JPN_FONTS[@]}" ) ;; + kor ) MEAN_COUNT="20" + TRAINING_DATA_ARGUMENTS+=" --infrequent_ratio=10000" + TRAINING_DATA_ARGUMENTS+=" --desired_bigrams=" + GENERATE_WORD_BIGRAMS=0 + FILTER_ARGUMENTS="--charset_filter=kor --segmenter_lang=kor" + FRAGMENTS_DISABLED="y" + FONTS=( "${KOREAN_FONTS[@]}" ) ;; + + # Middle-Eastern scripts. 
+ ara ) FONTS=( "${ARABIC_FONTS[@]}" ) ;; + div ) FONTS=( "${THAANA_FONTS[@]}" ) ;; + fas | pus | snd | uig | urd ) + FONTS=( "${PERSIAN_FONTS[@]}" ) ;; + heb | yid ) + FONTS=( "${HEBREW_FONTS[@]}" ) ;; + syr ) FONTS=( "${SYRIAC_FONTS[@]}" ) ;; + + # Other scripts. + amh | tir) + FONTS=( "${AMHARIC_FONTS[@]}" ) ;; + chr ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" \ + "Noto Sans Cherokee" \ + ) ;; + ell | grc ) + FONTS=( "${GREEK_FONTS[@]}" ) ;; + hye ) FONTS=( "${ARMENIAN_FONTS[@]}" ) ;; + iku ) FONTS=( "${NORTH_AMERICAN_ABORIGINAL_FONTS[@]}" ) ;; + kat) FONTS=( "${GEORGIAN_FONTS[@]}" ) ;; + kat_old) + TEXT_CORPUS="${FLAGS_webtext_prefix}/kat.corpus.txt" + FONTS=( "${OLD_GEORGIAN_FONTS[@]}" ) ;; + kir ) FONTS=( "${KYRGYZ_FONTS[@]}" ) + TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; + kur ) FONTS=( "${KURDISH_FONTS[@]}" ) ;; + + *) err "Error: ${lang} is not a valid language code" + esac + if [[ ${FLAGS_mean_count} -gt 0 ]]; then + TRAINING_DATA_ARGUMENTS+=" --mean_count=${FLAGS_mean_count}" + elif [[ ! -z ${MEAN_COUNT} ]]; then + TRAINING_DATA_ARGUMENTS+=" --mean_count=${MEAN_COUNT}" + fi +} + +#============================================================================= +# END of Language specific info +#============================================================================= diff --git a/training/ligature_table.cpp b/training/ligature_table.cpp index 54fbabdfb5..fabed602f6 100644 --- a/training/ligature_table.cpp +++ b/training/ligature_table.cpp @@ -43,7 +43,7 @@ static string EncodeAsUTF8(const char32 ch32) { // from. Note that this range does not contain the custom ligatures that we // encode in the private use area. const int kMinLigature = 0xfb00; -const int kMaxLigature = 0xfb4f; +const int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in. 
/* static */ SmartPtr LigatureTable::instance_; diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index 6a463028de..2def976d33 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -51,6 +51,12 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, "Does a one-time deletion of cache files from the " "fontconfig_tmpdir before initializing fontconfig."); +BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true, + "Does a one-time reset of the fontconfig config file to point" + " to fonts_dir before initializing fontconfig. Set to true" + " if fontconfig_refresh_cache is true. Set it to false to use" + " multiple instances in separate processes without having to" + " rescan the fonts_dir, using a previously setup font cache"); #ifndef USE_STD_NAMESPACE #include "ocr/trainingdata/typesetting/legacy_fonts.h" @@ -67,6 +73,8 @@ namespace tesseract { // in pixels. const int kDefaultResolution = 300; +bool PangoFontInfo::fontconfig_initialized_ = false; + PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { Clear(); } @@ -103,34 +111,35 @@ string PangoFontInfo::DescriptionName() const { // Initializes Fontconfig for use by writing a fake fonts.conf file into the // FLAGS_fontconfigs_tmpdir directory, that points to the supplied -// FLAGS_fonts_dir, and then overrides the FONTCONFIG_PATH environment variable -// to point to this fonts.conf file. -static void InitFontconfig() { - static bool init_fontconfig = false; - if (init_fontconfig || FLAGS_fonts_dir.empty()) { - init_fontconfig = true; +// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable +// to point to this fonts.conf file. If force_clear, the cache is refreshed +// even if it has already been initialized. 
+void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) { + if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) { + fontconfig_initialized_ = true; return; } - if (FLAGS_fontconfig_refresh_cache) { - tprintf("Deleting cache files from %s\n", FLAGS_fontconfig_tmpdir.c_str()); + if (FLAGS_fontconfig_refresh_cache || force_clear) { File::DeleteMatchingFiles(File::JoinPath( - FLAGS_fontconfig_tmpdir.c_str(), "*cache-2").c_str()); - } - tprintf("Initializing fontconfig\n"); - const int MAX_FONTCONF_FILESIZE = 1024; - char fonts_conf_template[MAX_FONTCONF_FILESIZE]; - snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, - "\n" - "\n" - "\n" - "%s\n" - "%s\n" - "\n" - "", FLAGS_fonts_dir.c_str(), - FLAGS_fontconfig_tmpdir.c_str()); - string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), - "fonts.conf"); - File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); + FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str()); + } + if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache || + force_clear) { + const int MAX_FONTCONF_FILESIZE = 1024; + char fonts_conf_template[MAX_FONTCONF_FILESIZE]; + snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, + "\n" + "\n" + "\n" + "%s\n" + "%s\n" + "\n" + "", fonts_dir.c_str(), + FLAGS_fontconfig_tmpdir.c_str()); + string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), + "fonts.conf"); + File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); + } #ifdef _WIN32 std::string env("FONTCONFIG_PATH="); env.append(FLAGS_fontconfig_tmpdir.c_str()); @@ -141,12 +150,18 @@ static void InitFontconfig() { // Fix the locale so that the reported font names are consistent. 
setenv("LANG", "en_US.utf8", true); #endif // _WIN32 - init_fontconfig = true; + if (!fontconfig_initialized_ || force_clear) { + if (FcInitReinitialize() != FcTrue) { + tprintf("FcInitiReinitialize failed!!\n"); + } + } + fontconfig_initialized_ = true; + FontUtils::ReInit(); } static void ListFontFamilies(PangoFontFamily*** families, int* n_families) { - InitFontconfig(); + PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir); PangoFontMap* font_map = pango_cairo_font_map_get_default(); DISABLE_HEAP_LEAK_CHECK; pango_font_map_list_families(font_map, families, n_families); @@ -220,7 +235,7 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) { // in the font map. Note that if the font is wholly missing, this could // correspond to a completely different font family and face. PangoFont* PangoFontInfo::ToPangoFont() const { - InitFontconfig(); + InitFontConfig(false, FLAGS_fonts_dir); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_cairo_context_set_resolution(context, resolution_); @@ -253,6 +268,28 @@ bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const return true; } +// This variant of strncpy permits src and dest to overlap. It will copy the +// first byte first. +static char* my_strnmove(char* dest, const char* src, size_t n) { + char* ret = dest; + + // Copy characters until n reaches zero or the src byte is a nul. + do { + *dest = *src; + --n; + ++dest; + ++src; + } while (n && src[0]); + + // If we reached a nul byte and there are more 'n' left, zero them out. 
+ while (n) { + *dest = '\0'; + --n; + ++dest; + } + return ret; +} + int PangoFontInfo::DropUncoveredChars(string* utf8_text) const { PangoFont* font = ToPangoFont(); PangoCoverage* coverage = pango_font_get_coverage(font, NULL); @@ -265,23 +302,30 @@ int PangoFontInfo::DropUncoveredChars(string* utf8_text) const { UNICHAR::begin(utf8_text->c_str(), utf8_text->length()); const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_text->c_str(), utf8_text->length()); - for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) { + for (UNICHAR::const_iterator it = it_begin; it != it_end;) { // Skip bad utf-8. - if (!it.is_legal()) - continue; // One suitable error message will still be issued. - if (!IsWhitespace(*it) && !pango_is_zero_width(*it) && - pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) { + if (!it.is_legal()) { + ++it; // One suitable error message will still be issued. + continue; + } + int unicode = *it; + int utf8_len = it.utf8_len(); + const char* utf8_char = it.utf8_data(); + // Move it forward before the data gets modified. 
+ ++it; + if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) && + pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) { if (TLOG_IS_ON(2)) { - char tmp[5]; - int len = it.get_utf8(tmp); - tmp[len] = '\0'; - tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it); + UNICHAR unichar(unicode); + char* str = unichar.utf8_str(); + tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode); + delete[] str; } ++num_dropped_chars; continue; } - strncpy(out, it.utf8_data(), it.utf8_len()); - out += it.utf8_len(); + my_strnmove(out, utf8_char, utf8_len); + out += utf8_len; } utf8_text->resize(out - utf8_text->c_str()); return num_dropped_chars; @@ -438,6 +482,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, // ------------------------ FontUtils ------------------------------------ +vector FontUtils::available_fonts_; // cache list // Returns whether the specified font description is available in the fonts // directory. @@ -449,7 +494,8 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, // from the font_map, and then check what we loaded to see if it has the // description we expected. If it is not, then the font is deemed unavailable. /* static */ -bool FontUtils::IsAvailableFont(const char* input_query_desc) { +bool FontUtils::IsAvailableFont(const char* input_query_desc, + string* best_match) { string query_desc(input_query_desc); if (PANGO_VERSION <= 12005) { // Strip commas and any ' Medium' substring in the name. 
@@ -466,7 +512,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) { query_desc.c_str()); PangoFont* selected_font = NULL; { - InitFontconfig(); + PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_context_set_font_map(context, font_map); @@ -490,7 +536,16 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc) { char* selected_desc_str = pango_font_description_to_string(selected_desc); tlog(2, "query_desc: '%s' Selected: 's'\n", query_desc.c_str(), selected_desc_str); - + if (!equal && best_match != NULL) { + *best_match = selected_desc_str; + // Clip the ending ' 0' if there is one. It seems that, if there is no + // point size on the end of the fontname, then Pango always appends ' 0'. + int len = best_match->size(); + if (len > 2 && best_match->at(len - 1) == '0' && + best_match->at(len - 2) == ' ') { + *best_match = best_match->substr(0, len - 2); + } + } g_free(selected_desc_str); pango_font_description_free(selected_desc); g_object_unref(selected_font); @@ -512,7 +567,6 @@ static bool ShouldIgnoreFontFamilyName(const char* query) { // Outputs description names of available fonts. /* static */ const vector& FontUtils::ListAvailableFonts() { - static vector available_fonts_; // cache list if (available_fonts_.size()) { return available_fonts_; } @@ -536,8 +590,9 @@ const vector& FontUtils::ListAvailableFonts() { for (int i = 0; i < n_families; ++i) { const char* family_name = pango_font_family_get_name(families[i]); tlog(2, "Listing family %s\n", family_name); - if (ShouldIgnoreFontFamilyName(family_name)) + if (ShouldIgnoreFontFamilyName(family_name)) { continue; + } int n_faces; PangoFontFace** faces = NULL; @@ -733,4 +788,8 @@ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, return false; } +// PangoFontInfo is reinitialized, so clear the static list of fonts. 
+/* static */ +void FontUtils::ReInit() { available_fonts_.clear(); } + } // namespace tesseract diff --git a/training/pango_font_info.h b/training/pango_font_info.h index 56aae46fae..07d153255c 100644 --- a/training/pango_font_info.h +++ b/training/pango_font_info.h @@ -83,6 +83,11 @@ class PangoFontInfo { bool GetSpacingProperties(const string& utf8_char, int* x_bearing, int* x_advance) const; + // Initializes FontConfig by setting its environment variable and creating + // a fonts.conf file that points to the given fonts_dir. Once initialized, + // it is not re-initialized unless force_clear is true. + static void InitFontConfig(bool force_clear, const string& fonts_dir); + // Accessors string DescriptionName() const; // Font Family name eg. "Arial" @@ -123,6 +128,10 @@ class PangoFontInfo { // Default output resolution to assume for GetSpacingProperties() and any // other methods that returns pixel values. int resolution_; + // Fontconfig operates through an environment variable, so it intrinsically + // cannot be thread-friendly, but you can serialize multiple independent + // font configurations by calling InitFontConfig(true, path). + static bool fontconfig_initialized_; private: PangoFontInfo(const PangoFontInfo&); @@ -135,7 +144,13 @@ class FontUtils { public: // Returns true if the font of the given description name is available in the // target directory specified by --fonts_dir - static bool IsAvailableFont(const char* font_desc); + static bool IsAvailableFont(const char* font_desc) { + return IsAvailableFont(font_desc, NULL); + } + // Returns true if the font of the given description name is available in the + // target directory specified by --fonts_dir. If false is returned, and + // best_match is not NULL, the closest matching font is returned there. + static bool IsAvailableFont(const char* font_desc, string* best_match); // Outputs description names of available fonts. 
static const vector& ListAvailableFonts(); @@ -181,6 +196,12 @@ class FontUtils { static int FontScore(const unordered_map& ch_map, const string& fontname, int* raw_score, vector* ch_flags); + + // PangoFontInfo is reinitialized, so clear the static list of fonts. + static void ReInit(); + + private: + static vector available_fonts_; // cache list }; } // namespace tesseract diff --git a/training/set_unicharset_properties.cpp b/training/set_unicharset_properties.cpp index d8ab2c4767..00844ecb56 100644 --- a/training/set_unicharset_properties.cpp +++ b/training/set_unicharset_properties.cpp @@ -7,14 +7,8 @@ #include #include "commandlineflags.h" -#include "fileio.h" -#include "genericvector.h" -#include "icuerrorcode.h" -#include "normstrngs.h" -#include "strngs.h" -#include "unicharset.h" -#include "unicode/uchar.h" // from libicu -#include "unicode/uscript.h" // from libicu +#include "tprintf.h" +#include "unicharset_training_utils.h" // The directory that is searched for universal script unicharsets. STRING_PARAM_FLAG(script_dir, "", @@ -25,157 +19,6 @@ DECLARE_STRING_PARAM_FLAG(U); DECLARE_STRING_PARAM_FLAG(O); DECLARE_STRING_PARAM_FLAG(X); -namespace tesseract { - -// Helper sets the character attribute properties and sets up the script table. -// Does not set tops and bottoms. -static void SetupBasicProperties(UNICHARSET* unicharset) { - for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { - // Convert any custom ligatures. - const char* unichar_str = unicharset->id_to_unichar(unichar_id); - for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { - if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { - unichar_str = UNICHARSET::kCustomLigatures[i][0]; - break; - } - } - - // Convert the unichar to UTF32 representation - GenericVector uni_vector; - tesseract::UTF8ToUTF32(unichar_str, &uni_vector); - - // Assume that if the property is true for any character in the string, - // then it holds for the whole "character". 
- bool unichar_isalpha = false; - bool unichar_islower = false; - bool unichar_isupper = false; - bool unichar_isdigit = false; - bool unichar_ispunct = false; - - for (int i = 0; i < uni_vector.size(); ++i) { - if (u_isalpha(uni_vector[i])) - unichar_isalpha = true; - if (u_islower(uni_vector[i])) - unichar_islower = true; - if (u_isupper(uni_vector[i])) - unichar_isupper = true; - if (u_isdigit(uni_vector[i])) - unichar_isdigit = true; - if (u_ispunct(uni_vector[i])) - unichar_ispunct = true; - } - - unicharset->set_isalpha(unichar_id, unichar_isalpha); - unicharset->set_islower(unichar_id, unichar_islower); - unicharset->set_isupper(unichar_id, unichar_isupper); - unicharset->set_isdigit(unichar_id, unichar_isdigit); - unicharset->set_ispunctuation(unichar_id, unichar_ispunct); - - tesseract::IcuErrorCode err; - unicharset->set_script(unichar_id, uscript_getName( - uscript_getScript(uni_vector[0], err))); - - const int num_code_points = uni_vector.size(); - // Obtain the lower/upper case if needed and record it in the properties. - unicharset->set_other_case(unichar_id, unichar_id); - if (unichar_islower || unichar_isupper) { - GenericVector other_case(num_code_points, 0); - for (int i = 0; i < num_code_points; ++i) { - // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. - // However since they deal with UChars (so need a conversion function - // from char32 or UTF8string) and require a meaningful locale string, - // for now u_tolower()/u_toupper() are used. - other_case[i] = unichar_islower ? 
u_toupper(uni_vector[i]) : - u_tolower(uni_vector[i]); - } - STRING other_case_uch; - tesseract::UTF32ToUTF8(other_case, &other_case_uch); - UNICHAR_ID other_case_id = - unicharset->unichar_to_id(other_case_uch.c_str()); - if (other_case_id != INVALID_UNICHAR_ID) { - unicharset->set_other_case(unichar_id, other_case_id); - } else { - tprintf("Other case %s of %s is not in unicharset\n", - other_case_uch.c_str(), unichar_str); - } - } - - // Set RTL property and obtain mirror unichar ID from ICU. - GenericVector mirrors(num_code_points, 0); - for (int i = 0; i < num_code_points; ++i) { - mirrors[i] = u_charMirror(uni_vector[i]); - if (i == 0) { // set directionality to that of the 1st code point - unicharset->set_direction(unichar_id, - static_cast( - u_charDirection(uni_vector[i]))); - } - } - STRING mirror_uch; - tesseract::UTF32ToUTF8(mirrors, &mirror_uch); - UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); - if (mirror_uch_id != INVALID_UNICHAR_ID) { - unicharset->set_mirror(unichar_id, mirror_uch_id); - } else { - tprintf("Mirror %s of %s is not in unicharset\n", - mirror_uch.c_str(), unichar_str); - } - - // Record normalized version of this unichar. - STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); - if (unichar_id != 0 && normed_str.length() > 0) { - unicharset->set_normed(unichar_id, normed_str.c_str()); - } else { - unicharset->set_normed(unichar_id, unichar_str); - } - } - unicharset->post_load_setup(); -} - -// Helper to set the properties for an input unicharset file, writes to the -// output file. If an appropriate script unicharset can be found in the -// script_dir directory, then the tops and bottoms are expanded using the -// script unicharset. -// If non-empty, xheight data for the fonts are written to the xheights_file. 
-static void SetPropertiesForInputFile(const string& script_dir, - const string& input_unicharset_file, - const string& output_unicharset_file, - const string& output_xheights_file) { - UNICHARSET unicharset; - - // Load the input unicharset - unicharset.load_from_file(input_unicharset_file.c_str()); - tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), - input_unicharset_file.c_str()); - - // Set unichar properties - tprintf("Setting unichar properties\n"); - SetupBasicProperties(&unicharset); - string xheights_str; - for (int s = 0; s < unicharset.get_script_table_size(); ++s) { - // Load the unicharset for the script if available. - string filename = script_dir + "/" + - unicharset.get_script_from_script_id(s) + ".unicharset"; - UNICHARSET script_set; - if (script_set.load_from_file(filename.c_str())) { - unicharset.SetPropertiesFromOther(script_set); - } - // Load the xheights for the script if available. - filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + - ".xheights"; - string script_heights; - if (File::ReadFileToString(filename, &script_heights)) - xheights_str += script_heights; - } - if (!output_xheights_file.empty()) - File::WriteStringToFileOrDie(xheights_str, output_xheights_file); - - // Write the output unicharset - tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); - unicharset.save_to_file(output_unicharset_file.c_str()); -} -} // namespace tesseract - - int main(int argc, char** argv) { tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index 8bef6699f7..43c3d7efae 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -819,6 +819,7 @@ int StringRenderer::RenderToImage(const char* text, int text_length, int StringRenderer::RenderAllFontsToImage(double min_coverage, const char* text, int text_length, string* font_used, Pix** image) { + *image = NULL; // Select a suitable font 
to render the title with. const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%"; string title_font; @@ -882,10 +883,9 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage, all_fonts[i].c_str(), ok_chars, 100.0 * ok_chars / total_chars_); } } - *image = NULL; font_index_ = 0; char_map_.clear(); - return last_offset_; + return last_offset_ == 0 ? -1 : last_offset_; } } // namespace tesseract diff --git a/training/tesstrain.sh b/training/tesstrain.sh index eb3d562ff2..fbb4e48e0d 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -44,516 +44,7 @@ # appropriate --fonts_dir path. -FONTS=( - "Arial" \ - "Times New Roman," \ -) -FONTS_DIR="/usr/share/fonts/truetype/" -OUTPUT_DIR="/tmp/tesstrain/tessdata" -OVERWRITE=0 -RUN_SHAPE_CLUSTERING=0 -EXTRACT_FONT_PROPERTIES=1 -WORKSPACE_DIR="/tmp/tesstrain" - - -# Logging helper functions. -tlog() { - echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE} -} - -err() { - echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE} - exit 1 -} - -# Helper function to run a command and append its output to a log. Aborts early -# if the program file is not found. -# Usage: run_cmd CMD ARG1 ARG2... -run_cmd() { - local cmd=$1 - shift - if [[ ! -x ${cmd} ]]; then - err "File ${cmd} not found" - fi - tlog "[$(date)] ${cmd} $@" - ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE} - # check completion status - if [[ $? -gt 0 ]]; then - err "Program $(basename ${cmd}) failed. Abort." - fi -} - -# Check if all the given files exist, or exit otherwise. -# Used to check required input files and produced output files in each phase. -# Usage: check_file_readable FILE1 FILE2... -check_file_readable() { - for file in $@; do - if [[ ! -r ${file} ]]; then - err "${file} does not exist or is not readable" - fi - done -} - - -# Write a file (with name specified in $2) with records that account for -# n% (specified in $3) of the total weights of records in the input file -# (input file name specified in $1). 
The input file should have one record -# per line along with its weight separated by \t. The records should be -# sorted in non-ascending order of frequency. -# If $4 is true the first record is skipped. -# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE -discard_tail() { - local infile=$1 - local outfile=$2 - local pct=$3 - local skip_first=$4 - - local more_arg="1"; - if [[ ${skip_first} ]]; then - more_arg="2" - fi - local sum=$(tail -n +${more_arg} ${infile} \ - | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}') - if [[ ${sum} == "" ]]; then sum=0 - fi - local limit=$((${sum}*${pct}/100)) - tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"} - {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \ - >> ${outfile} -} - - -# Set global path variables that are based on parsed flags. -set_prog_paths() { - if [[ -z ${BINDIR} ]]; then - err "Need to specify location of program files" - fi - CN_TRAINING_EXE=${BINDIR}/cntraining - COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata - MF_TRAINING_EXE=${BINDIR}/mftraining - SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties - SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering - TESSERACT_EXE=${BINDIR}/tesseract - TEXT2IMAGE_EXE=${BINDIR}/text2image - UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor - WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg -} - -# Sets the named variable to given value. Aborts if the value is missing or -# if it looks like a flag. -# Usage: parse_value VAR_NAME VALUE -parse_value() { - local val="$2" - if [[ -z $val ]]; then - err "Missing value for variable $1" - exit - fi - if [[ ${val:0:2} == "--" ]]; then - err "Invalid value $val passed for variable $1" - exit - fi - eval $1=\"$val\" -} - -# Does simple command-line parsing and initialization. 
-parse_flags() { - local i=0 - while test $i -lt ${#ARGV[@]}; do - local j=$((i+1)) - case ${ARGV[$i]} in - --) - break;; - --bin_dir) - parse_value "BINDIR" ${ARGV[$j]} - i=$j ;; - --fontlist) # Expect a plus-separated list of names - if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then - err "Invalid value passed to --fontlist" - fi - local ofs=$IFS - IFS='+' - FONTS=( ${ARGV[$j]} ) - IFS=$ofs - i=$j ;; - --fonts_dir) - parse_value "FONTS_DIR" ${ARGV[$j]} - i=$j ;; - --lang) - parse_value "LANG_CODE" ${ARGV[$j]} - i=$j ;; - --langdata_dir) - parse_value "LANGDATA_ROOT" ${ARGV[$j]} - i=$j ;; - --output_dir) - parse_value "OUTPUT_DIR" ${ARGV[$j]} - i=$j ;; - --overwrite) - OVERWRITE=1 ;; - --extract_font_properties) - EXTRACT_FONT_PROPERTIES=1 ;; - --noextract_font_properties) - EXTRACT_FONT_PROPERTIES=0 ;; - --run_shape_clustering) - RUN_SHAPE_CLUSTERING=1 ;; - --tessdata_dir) - parse_value "TESSDATA_DIR" ${ARGV[$j]} - i=$j ;; - --training_text) - parse_value "TRAINING_TEXT" "${ARGV[$j]}" - i=$j ;; - --wordlist) - parse_value "WORDLIST_FILE" ${ARGV[$j]} - i=$j ;; - *) - err "Unrecognized argument ${ARGV[$i]}" ;; - esac - i=$((i+1)) - done - if [[ -z ${LANG_CODE} ]]; then - err "Need to specify a language --lang" - fi - if [[ -z ${BINDIR} ]]; then - err "Need to specify path to built binaries --bin_dir" - fi - if [[ -z ${LANGDATA_ROOT} ]]; then - err "Need to specify path to language files --langdata_dir" - fi - if [[ -z ${TESSDATA_DIR} ]]; then - if [[ -z ${TESSDATA_PREFIX} ]]; then - err "Need to specify a --tessdata_dir or have a "\ - "TESSDATA_PREFIX variable defined in your environment" - else - TESSDATA_DIR="${TESSDATA_PREFIX}" - fi - fi - - set_prog_paths - - # Location where intermediate files will be created. - TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE} - # Location of log file for the whole run. - LOG_FILE=${TRAINING_DIR}/tesstrain.log - - # Take training text and wordlist from the langdata directory if not - # specified in the commend-line. 
- if [[ -z ${TRAINING_TEXT} ]]; then - TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text - fi - if [[ -z ${WORDLIST_FILE} ]]; then - WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean - fi - WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean - NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers - PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc - BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs - UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs - TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams -} - -# Phase I : Generate (I)mages from training text for each font. -phaseI_generate_image() { - tlog "\n=== Phase I: Generating training images ===" - if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then - err "Could not find training text file ${TRAINING_TEXT}" - fi - BOX_PADDING="0" - CHAR_SPACING="0.0" - EXPOSURE="0" - LEADING="32" - NGRAM_CHAR_SPACING="0.0" - - if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS} ]]; then - # Parse .bigram_freqs file and compose a .train_ngrams file with text - # for tesseract to recognize during training. Take only the ngrams whose - # combined weight accounts for 95% of all the bigrams in the language. 
- TMP_FILE="${TRAINING_DIR}/_tmp" - cat ${BIGRAM_FREQS_FILE} > ${TMP_FILE} - NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ - | awk '{s=s+$2}; END {print (s/100)*p}' p=99) - cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ - | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ - x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} - check_file_readable ${TRAIN_NGRAMS_FILE} - fi - - for font in "${FONTS[@]}"; do - tlog "Rendering using ${font}" - fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') - outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} - - common_args="--leading=${LEADING} --fonts_dir=${FONTS_DIR} " - common_args+=" --box_padding=${BOX_PADDING} --strip_unrenderable_words" - - run_cmd ${TEXT2IMAGE_EXE} ${common_args} \ - --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE} \ - --font="${font}" --outputbase=${outbase} --text=${TRAINING_TEXT} - check_file_readable ${outbase}.box ${outbase}.tif - - if (( ${EXTRACT_FONT_PROPERTIES} )) && - [[ -r ${TRAIN_NGRAMS_FILE} ]]; then - tlog "Rendering ngrams using ${font}" - outbase=${TRAINING_DIR}/ngrams/${LANG_CODE}.ngrams.${fontname}.exp${EXPOSURE} - run_cmd ${TEXT2IMAGE_EXE} ${common_args} \ - --char_spacing=${NGRAM_CHAR_SPACING} --exposure=${EXPOSURE} \ - --font="${font}" --outputbase=${outbase} \ - --box_padding=${BOX_PADDING} --render_ngrams=1 \ - --text=${TRAIN_NGRAMS_FILE} - check_file_readable ${outbase}.box ${outbase}.tif - fi - done -} - - -# Phase UP : Generate (U)nicharset and (P)roperties file. 
-phaseUP_generate_unicharset() { - tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" - - box_files=$(ls ${TRAINING_DIR}/*.box) - run_cmd ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files} - outfile=${TRAINING_DIR}/unicharset - UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" - check_file_readable ${outfile} - mv ${outfile} ${UNICHARSET_FILE} - - XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" - check_file_readable ${UNICHARSET_FILE} - run_cmd ${SET_UNICHARSET_PROPERTIES_EXE} \ - -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ - --script_dir=${LANGDATA_ROOT} - check_file_readable ${XHEIGHTS_FILE} -} - -# Phase D : Generate (D)awg files from unicharset file and wordlist files -phaseD_generate_dawg() { - tlog "\n=== Phase D: Generating Dawg files ===" - # Output files - WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg - FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg - PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg - NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg - BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg - - # Word DAWG - local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq - if [[ -r ${WORDLIST_FILE} ]]; then - tlog "Generating word Dawg" - check_file_readable ${UNICHARSET_FILE} - run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ - ${UNICHARSET_FILE} - check_file_readable ${WORD_DAWG} - - FREQ_DAWG_SIZE=100 - head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} - fi - - # Freq-word DAWG - if [[ -r ${freq_wordlist_file} ]]; then - check_file_readable ${UNICHARSET_FILE} - tlog "Generating frequent-word Dawg" - run_cmd ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \ - ${UNICHARSET_FILE} - check_file_readable ${FREQ_DAWG} - fi - - # Punctuation DAWG - local punc_clean="${LANGDATA_ROOT}/common.punc" - if [[ -r ${PUNC_FILE} ]]; then - local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top - head -n 1 
${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \ - > ${top_punc_file} - discard_tail ${PUNC_FILE} ${top_punc_file} 99 1 - punc_clean="${top_punc_file}" - fi - # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy - # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). - # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, - # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, - # 2/RRP_FORCE_REVERSE for the punctuation DAWG. - local punc_reverse_policy=0; - if [[ ${LANG_CODE} == "heb" || ${LANG_CODE} == "ara" ]]; then - punc_reverse_policy=2 - fi - if [[ -r ${punc_clean} ]]; then - run_cmd ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \ - ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE} - check_file_readable ${PUNC_DAWG} - fi - - # Numbers DAWG - if [[ -r ${NUMBERS_FILE} ]]; then - local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top - head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \ - > ${top_num_file} - discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1 - run_cmd ${WORDLIST2DAWG_EXE} -r 0 \ - ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE} - check_file_readable ${NUMBER_DAWG} - fi - - # Bigram dawg - if [[ -r ${WORD_BIGRAMS_FILE} ]]; then - run_cmd ${WORDLIST2DAWG_EXE} -r 1 \ - ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} - check_file_readable ${BIGRAM_DAWG} - fi -} - -# Phase E : (E)xtract .tr feature files from .tif/.box files -phaseE_extract_features() { - tlog "\n=== Phase E: Extracting features ===" - local box_config="box.train" - TRAIN_EXPOSURES='0' - - for exposure in ${TRAIN_EXPOSURES}; do - img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) - done - - # Use any available language-specific configs. 
- local config="" - if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then - config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config - fi - - OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX} - export TESSDATA_PREFIX=${TESSDATA_DIR} - tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" - for img_file in ${img_files}; do - run_cmd ${TESSERACT_EXE} ${img_file} ${img_file%.*} \ - ${box_config} ${config} - done - export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX} -} - -# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) -# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto -phaseC_cluster_prototypes() { - tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" - local out_normproto=${TRAINING_DIR}/${LANG_CODE}.normproto - - run_cmd ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \ - $(ls ${TRAINING_DIR}/*.tr) - - check_file_readable ${TRAINING_DIR}/normproto - mv ${TRAINING_DIR}/normproto ${out_normproto} -} - -# Phase S : (S)hape clustering -phaseS_cluster_shapes() { - if (( ! 
${RUN_SHAPE_CLUSTERING} )); then - return - fi - check_file_readable ${LANGDATA_ROOT}/font_properties - local font_props=${LANGDATA_ROOT}/font_properties - if [[ -r ${font_props} ]]; then - font_props="-F ${font_props}" - else - font_props="" - fi - if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ - [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then - font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" - fi - - run_cmd ${SHAPE_TRAINING_EXE} \ - -D "${TRAINING_DIR}/" \ - -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ - -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ - ${font_props} \ - $(ls ${TRAINING_DIR}/*.tr) - check_file_readable ${TRAINING_DIR}/shapetable \ - ${TRAINING_DIR}/${LANG_CODE}.mfunicharset -} - -# Phase M : Clustering microfeatures (mfTraining) -phaseM_cluster_microfeatures() { - tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" - - font_props=${LANGDATA_ROOT}/font_properties - if [[ -r ${font_props} ]]; then - font_props="-F ${font_props}" - else - font_props="" - fi - if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ - [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then - font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" - fi - - run_cmd ${MF_TRAINING_EXE} \ - -D "${TRAINING_DIR}/" \ - -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ - -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ - ${font_props} \ - $(ls ${TRAINING_DIR}/*.tr) - check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ - ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset - mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp - mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable - mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable - mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset -} - -phaseB_generate_ambiguities() { - tlog "\n=== Phase B : ambiguities training ===" - - # Check for manually created ambiguities 
data. - if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then - tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs" - cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ - ${TRAINING_DIR}/${LANG_CODE}.unicharambigs - # Make it writable, as it may be read-only in the client. - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs - return - else - tlog "No unicharambigs file found!" - fi - - # TODO: Add support for generating ambiguities automatically. -} - - -make_traineddata() { - tlog "\n=== Making final traineddata file ===" - local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE} - - # Combine available files for this language from the langdata dir. - if [[ -r ${lang_prefix}.config ]]; then - tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" - cp ${lang_prefix}.config ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config - fi - if [[ -r ${lang_prefix}.cube-unicharset ]]; then - tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}" - cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset - fi - if [[ -r ${lang_prefix}.cube-word-dawg ]]; then - tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}" - cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg - fi - if [[ -r ${lang_prefix}.params-model ]]; then - tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" - cp ${lang_prefix}.params-model ${TRAINING_DIR} - chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model - fi - - # Compose the traineddata file. - run_cmd ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}. - - # Copy it to the output dir, overwriting only if allowed by the cmdline flag. - if [[ ! -d ${OUTPUT_DIR} ]]; then - tlog "Creating new directory ${OUTPUT_DIR}" - mkdir -p ${OUTPUT_DIR} - fi - local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; - if [[ -f ${destfile} ]] && (( ! 
${OVERWRITE} )); then - err "File ${destfile} exists and no --overwrite specified"; - fi - tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" - cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} -} - +source `dirname $0`/tesstrain_utils.sh ARGV=("$@") parse_flags @@ -564,14 +55,21 @@ tlog "Cleaning workspace directory ${TRAINING_DIR}..." mkdir -p ${TRAINING_DIR} rm -fr ${TRAINING_DIR}/* -phaseI_generate_image -phaseUP_generate_unicharset -phaseD_generate_dawg -phaseE_extract_features -phaseC_cluster_prototypes -phaseS_cluster_shapes -phaseM_cluster_microfeatures -phaseB_generate_ambiguities -make_traineddata +source `dirname $0`/language-specific.sh +set_lang_specific_parameters ${LANG_CODE} + +initialize_fontconfig + +phase_I_generate_image 8 +phase_UP_generate_unicharset +phase_D_generate_dawg +phase_E_extract_features "box.train" 8 +phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto" +if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then + phase_S_cluster_shapes +fi +phase_M_cluster_microfeatures +phase_B_generate_ambiguities +make__traineddata tlog "\nCompleted training for language '${LANG_CODE}'\n" diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh new file mode 100755 index 0000000000..0c2c3a9fb1 --- /dev/null +++ b/training/tesstrain_utils.sh @@ -0,0 +1,578 @@ +#!/bin/bash +# (C) Copyright 2014, Google Inc. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# This script defines functions that are used by tesstrain.sh +# For a detailed description of the phases, see +# https://code.google.com/p/tesseract-ocr/wiki/TrainingTesseract3 +# +# USAGE: source tesstrain_utils.sh + +FONTS=( + "Arial" \ + "Times New Roman," \ +) +FONTS_DIR="/usr/share/fonts/truetype/" +OUTPUT_DIR="/tmp/tesstrain/tessdata" +OVERWRITE=0 +RUN_SHAPE_CLUSTERING=0 +EXTRACT_FONT_PROPERTIES=1 +WORKSPACE_DIR="/tmp/tesstrain" + +# Logging helper functions. +tlog() { + echo -e $* 2>&1 1>&2 | tee -a ${LOG_FILE} +} + +err_exit() { + echo -e "ERROR: "$* 2>&1 1>&2 | tee -a ${LOG_FILE} + exit 1 +} + +# Helper function to run a command and append its output to a log. Aborts early +# if the program file is not found. +# Usage: run_command CMD ARG1 ARG2... +run_command() { + local cmd=$1 + shift + if [[ ! -x ${cmd} ]]; then + err_exit "File ${cmd} not found" + fi + tlog "[$(date)] ${cmd} $@" + ${cmd} "$@" 2>&1 1>&2 | tee -a ${LOG_FILE} + # check completion status + if [[ $? -gt 0 ]]; then + err_exit "Program $(basename ${cmd}) failed. Abort." + fi +} + +# Check if all the given files exist, or exit otherwise. +# Used to check required input files and produced output files in each phase. +# Usage: check_file_readable FILE1 FILE2... +check_file_readable() { + for file in $@; do + if [[ ! -r ${file} ]]; then + err_exit "${file} does not exist or is not readable" + fi + done +} + +# Write a file (with name specified in $2) with records that account for +# n% (specified in $3) of the total weights of records in the input file +# (input file name specified in $1). The input file should have one record +# per line along with its weight separated by \t. The records should be +# sorted in non-ascending order of frequency. +# If $4 is true the first record is skipped. 
+# USAGE: discard_tail INPUT_FILE OUTPUT_FILE PERCENTAGE +discard_tail() { + local infile=$1 + local outfile=$2 + local pct=$3 + local skip_first=$4 + + local more_arg="1"; + if [[ ${skip_first} ]]; then + more_arg="2" + fi + local sum=$(tail -n +${more_arg} ${infile} \ + | awk 'BEGIN {FS = "\t"} {if ($1 != " ") {s=s+$2}}; END {print s}') + if [[ ${sum} == "" ]]; then sum=0 + fi + local limit=$((${sum}*${pct}/100)) + tail -n +${more_arg} ${infile} | awk 'BEGIN {FS = "\t"} + {if (s > 0) {print $1; if ($1 != " ") {s=s-$2;}}}' s=${limit} \ + >> ${outfile} +} + +# Set global path variables that are based on parsed flags. +set_prog_paths() { + if [[ -z ${BINDIR} ]]; then + err_exit "Need to specify location of program files" + fi + CN_TRAINING_EXE=${BINDIR}/cntraining + COMBINE_TESSDATA_EXE=${BINDIR}/combine_tessdata + MF_TRAINING_EXE=${BINDIR}/mftraining + SET_UNICHARSET_PROPERTIES_EXE=${BINDIR}/set_unicharset_properties + SHAPE_TRAINING_EXE=${BINDIR}/shapeclustering + TESSERACT_EXE=${BINDIR}/tesseract + TEXT2IMAGE_EXE=${BINDIR}/text2image + UNICHARSET_EXTRACTOR_EXE=${BINDIR}/unicharset_extractor + WORDLIST2DAWG_EXE=${BINDIR}/wordlist2dawg +} + +# Sets the named variable to given value. Aborts if the value is missing or +# if it looks like a flag. +# Usage: parse_value VAR_NAME VALUE +parse_value() { + local val="$2" + if [[ -z $val ]]; then + err_exit "Missing value for variable $1" + exit + fi + if [[ ${val:0:2} == "--" ]]; then + err_exit "Invalid value $val passed for variable $1" + exit + fi + eval $1=\"$val\" +} + +# Does simple command-line parsing and initialization. 
+parse_flags() { + local i=0 + while test $i -lt ${#ARGV[@]}; do + local j=$((i+1)) + case ${ARGV[$i]} in + --) + break;; + --bin_dir) + parse_value "BINDIR" ${ARGV[$j]} + i=$j ;; + --fontlist) # Expect a plus-separated list of names + if [[ -z ${ARGV[$j]} ]] || [[ ${ARGV[$j]:0:2} == "--" ]]; then + err_exit "Invalid value passed to --fontlist" + fi + local ofs=$IFS + IFS='+' + FONTS=( ${ARGV[$j]} ) + IFS=$ofs + i=$j ;; + --fonts_dir) + parse_value "FONTS_DIR" ${ARGV[$j]} + i=$j ;; + --lang) + parse_value "LANG_CODE" ${ARGV[$j]} + i=$j ;; + --langdata_dir) + parse_value "LANGDATA_ROOT" ${ARGV[$j]} + i=$j ;; + --output_dir) + parse_value "OUTPUT_DIR" ${ARGV[$j]} + i=$j ;; + --overwrite) + OVERWRITE=1 ;; + --extract_font_properties) + EXTRACT_FONT_PROPERTIES=1 ;; + --noextract_font_properties) + EXTRACT_FONT_PROPERTIES=0 ;; + --tessdata_dir) + parse_value "TESSDATA_DIR" ${ARGV[$j]} + i=$j ;; + --training_text) + parse_value "TRAINING_TEXT" "${ARGV[$j]}" + i=$j ;; + --wordlist) + parse_value "WORDLIST_FILE" ${ARGV[$j]} + i=$j ;; + *) + err_exit "Unrecognized argument ${ARGV[$i]}" ;; + esac + i=$((i+1)) + done + if [[ -z ${LANG_CODE} ]]; then + err_exit "Need to specify a language --lang" + fi + if [[ -z ${BINDIR} ]]; then + err_exit "Need to specify path to built binaries --bin_dir" + fi + if [[ -z ${LANGDATA_ROOT} ]]; then + err_exit "Need to specify path to language files --langdata_dir" + fi + if [[ -z ${TESSDATA_DIR} ]]; then + if [[ -z ${TESSDATA_PREFIX} ]]; then + err_exit "Need to specify a --tessdata_dir or have a "\ + "TESSDATA_PREFIX variable defined in your environment" + else + TESSDATA_DIR="${TESSDATA_PREFIX}" + fi + fi + + set_prog_paths + + # Location where intermediate files will be created. + TRAINING_DIR=${WORKSPACE_DIR}/${LANG_CODE} + # Location of log file for the whole run. + LOG_FILE=${TRAINING_DIR}/tesstrain.log + + # Take training text and wordlist from the langdata directory if not + # specified in the commend-line. 
+ if [[ -z ${TRAINING_TEXT} ]]; then + TRAINING_TEXT=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.training_text + fi + if [[ -z ${WORDLIST_FILE} ]]; then + WORDLIST_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.wordlist.clean + fi + WORD_BIGRAMS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.word.bigrams.clean + NUMBERS_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.numbers + PUNC_FILE=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.punc + BIGRAM_FREQS_FILE=${TRAINING_TEXT}.bigram_freqs + UNIGRAM_FREQS_FILE=${TRAINING_TEXT}.unigram_freqs + TRAIN_NGRAMS_FILE=${TRAINING_TEXT}.train_ngrams + GENERATE_DAWGS=1 +} + +# Function initializes font config with a unique font cache dir. +initialize_fontconfig() { + export FONT_CONFIG_CACHE=$(mktemp -d --tmpdir font_tmp.XXXXXXXXXX) + local sample_path=${FONT_CONFIG_CACHE}/sample_text.txt + echo "Text" >${sample_path} + run_command ${TEXT2IMAGE_EXE} --fonts_dir=${FONTS_DIR} \ + --font="Arial" --outputbase=${sample_path} --text=${sample_path} \ + --fontconfig_tmpdir=${FONT_CONFIG_CACHE} +} + +# Helper function for phaseI_generate_image. Generates the image for a single +# language/font combination in a way that can be run in parallel. +generate_font_image() { + local font="$1" + tlog "Rendering using ${font}" + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + + local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}" + common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" + common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}" + common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" + common_args+=" --outputbase=${outbase}" + + # add --writing_mode=vertical-upright to common_args if the font is + # specified to be rendered vertically. 
+ for vfont in "${VERTICAL_FONTS[@]}"; do + if [[ "${font}" == "${vfont}" ]]; then + common_args+=" --writing_mode=vertical-upright " + break + fi + done + + run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS} + check_file_readable ${outbase}.box ${outbase}.tif + + if (( ${EXTRACT_FONT_PROPERTIES} )) && + [[ -r ${TRAIN_NGRAMS_FILE} ]]; then + tlog "Extracting font properties of ${font}" + run_command ${TEXT2IMAGE_EXE} ${common_args} --font="${font}" \ + --ligatures=false --text=${TRAIN_NGRAMS_FILE} \ + --only_extract_font_properties --ptsize=32 + check_file_readable ${outbase}.fontinfo + fi +} + + +# Phase I : Generate (I)mages from training text for each font. +phase_I_generate_image() { + local par_factor=$1 + if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then + par_factor=1 + fi + tlog "\n=== Phase I: Generating training images ===" + if [[ -z ${TRAINING_TEXT} ]] || [[ ! -r ${TRAINING_TEXT} ]]; then + err_exit "Could not find training text file ${TRAINING_TEXT}" + fi + CHAR_SPACING="0.0" + EXPOSURE="0" + + if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + # Parse .bigram_freqs file and compose a .train_ngrams file with text + # for tesseract to recognize during training. Take only the ngrams whose + # combined weight accounts for 95% of all the bigrams in the language. + NGRAM_FRAC=$(cat ${BIGRAM_FREQS_FILE} \ + | awk '{s=s+$2}; END {print (s/100)*p}' p=99) + cat ${BIGRAM_FREQS_FILE} | sort -rnk2 \ + | awk '{s=s+$2; if (s <= x) {printf "%s ", $1; } }' \ + x=${NGRAM_FRAC} > ${TRAIN_NGRAMS_FILE} + check_file_readable ${TRAIN_NGRAMS_FILE} + fi + + local counter=0 + for font in "${FONTS[@]}"; do + generate_font_image "${font}" & + let counter=counter+1 + let rem=counter%par_factor + if [[ "${rem}" -eq 0 ]]; then + wait + fi + done + wait + # Check that each process was successful. 
+ for font in "${FONTS[@]}"; do + local fontname=$(echo ${font} | tr ' ' '_' | sed 's/,//g') + local outbase=${TRAINING_DIR}/${LANG_CODE}.${fontname}.exp${EXPOSURE} + check_file_readable ${outbase}.box ${outbase}.tif + done +} + +# Phase UP : Generate (U)nicharset and (P)roperties file. +phase_UP_generate_unicharset() { + tlog "\n=== Phase UP: Generating unicharset and unichar properties files ===" + + local box_files=$(ls ${TRAINING_DIR}/*.box) + run_command ${UNICHARSET_EXTRACTOR_EXE} -D "${TRAINING_DIR}/" ${box_files} + local outfile=${TRAINING_DIR}/unicharset + UNICHARSET_FILE="${TRAINING_DIR}/${LANG_CODE}.unicharset" + check_file_readable ${outfile} + mv ${outfile} ${UNICHARSET_FILE} + + XHEIGHTS_FILE="${TRAINING_DIR}/${LANG_CODE}.xheights" + check_file_readable ${UNICHARSET_FILE} + run_command ${SET_UNICHARSET_PROPERTIES_EXE} \ + -U ${UNICHARSET_FILE} -O ${UNICHARSET_FILE} -X ${XHEIGHTS_FILE} \ + --script_dir=${LANGDATA_ROOT} + check_file_readable ${XHEIGHTS_FILE} +} + +# Phase D : Generate (D)awg files from unicharset file and wordlist files +phase_D_generate_dawg() { + tlog "\n=== Phase D: Generating Dawg files ===" + + # Skip if requested + if [[ ${GENERATE_DAWGS} -eq 0 ]]; then + tlog "Skipping ${phase_name}" + return + fi + + # Output files + WORD_DAWG=${TRAINING_DIR}/${LANG_CODE}.word-dawg + FREQ_DAWG=${TRAINING_DIR}/${LANG_CODE}.freq-dawg + PUNC_DAWG=${TRAINING_DIR}/${LANG_CODE}.punc-dawg + NUMBER_DAWG=${TRAINING_DIR}/${LANG_CODE}.number-dawg + BIGRAM_DAWG=${TRAINING_DIR}/${LANG_CODE}.bigram-dawg + + # Word DAWG + local freq_wordlist_file=${TRAINING_DIR}/${LANG_CODE}.wordlist.clean.freq + if [[ -r ${WORDLIST_FILE} ]]; then + tlog "Generating word Dawg" + check_file_readable ${UNICHARSET_FILE} + run_command ${WORDLIST2DAWG_EXE} -r 1 ${WORDLIST_FILE} ${WORD_DAWG} \ + ${UNICHARSET_FILE} + check_file_readable ${WORD_DAWG} + + FREQ_DAWG_SIZE=100 + head -n ${FREQ_DAWG_SIZE} ${WORDLIST_FILE} > ${freq_wordlist_file} + fi + + # Freq-word DAWG + if [[ -r 
${freq_wordlist_file} ]]; then + check_file_readable ${UNICHARSET_FILE} + tlog "Generating frequent-word Dawg" + run_command ${WORDLIST2DAWG_EXE} -r 1 ${freq_wordlist_file} ${FREQ_DAWG} \ + ${UNICHARSET_FILE} + check_file_readable ${FREQ_DAWG} + fi + + # Punctuation DAWG + local punc_clean="${LANGDATA_ROOT}/common.punc" + if [[ -r ${PUNC_FILE} ]]; then + local top_punc_file=${TRAINING_DIR}/${LANG_CODE}.punc.top + head -n 1 ${PUNC_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \ + > ${top_punc_file} + discard_tail ${PUNC_FILE} ${top_punc_file} 99 1 + punc_clean="${top_punc_file}" + fi + # -r arguments to WORDLIST2DAWG_EXE denote RTL reverse policy + # (see Trie::RTLReversePolicy enum in third_party/tesseract/dict/trie.h). + # We specify 0/RRP_DO_NO_REVERSE when generating number DAWG, + # 1/RRP_REVERSE_IF_HAS_RTL for freq and word DAWGS, + # 2/RRP_FORCE_REVERSE for the punctuation DAWG. + local punc_reverse_policy=0; + case ${LANG_CODE} in + ara | div| fas | pus | snd | syr | uig | urd | heb | yid ) + punc_reverse_policy=2 ;; + * ) ;; + esac + if [[ -r ${punc_clean} ]]; then + run_command ${WORDLIST2DAWG_EXE} -r ${punc_reverse_policy} \ + ${punc_clean} ${PUNC_DAWG} ${UNICHARSET_FILE} + check_file_readable ${PUNC_DAWG} + fi + + # Numbers DAWG + if [[ -r ${NUMBERS_FILE} ]]; then + local top_num_file=${TRAINING_DIR}/${LANG_CODE}.numbers.top + head -n 1 ${NUMBERS_FILE} | awk 'BEGIN {FS = "\t"} {print $1}' \ + > ${top_num_file} + discard_tail ${NUMBERS_FILE} ${top_num_file} 85 1 + run_command ${WORDLIST2DAWG_EXE} -r 0 \ + ${top_num_file} ${NUMBER_DAWG} ${UNICHARSET_FILE} + check_file_readable ${NUMBER_DAWG} + fi + + # Bigram dawg + if [[ -r ${WORD_BIGRAMS_FILE} ]]; then + run_command ${WORDLIST2DAWG_EXE} -r 1 \ + ${WORD_BIGRAMS_FILE} ${BIGRAM_DAWG} ${UNICHARSET_FILE} + check_file_readable ${BIGRAM_DAWG} + fi +} + +# Phase E : (E)xtract .tr feature files from .tif/.box files +phase_E_extract_features() { + local box_config=$1 + local par_factor=$2 + if [[ -z ${par_factor} || 
${par_factor} -le 0 ]]; then + par_factor=1 + fi + tlog "\n=== Phase E: Extracting features ===" + TRAIN_EXPOSURES='0' + + local img_files="" + for exposure in ${TRAIN_EXPOSURES}; do + img_files=${img_files}' '$(ls ${TRAINING_DIR}/*.exp${exposure}.tif) + done + + # Use any available language-specific configs. + local config="" + if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config ]]; then + config=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.config + fi + + OLD_TESSDATA_PREFIX=${TESSDATA_PREFIX} + export TESSDATA_PREFIX=${TESSDATA_DIR} + tlog "Using TESSDATA_PREFIX=${TESSDATA_PREFIX}" + local counter=0 + for img_file in ${img_files}; do + run_command ${TESSERACT_EXE} ${img_file} ${img_file%.*} \ + ${box_config} ${config} & + let counter=counter+1 + let rem=counter%par_factor + if [[ "${rem}" -eq 0 ]]; then + wait + fi + done + wait + export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX} + # Check that all the output files were produced. + for img_file in ${img_files}; do + check_file_readable ${img_file%.*}.tr + done +} + +# Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining) +# phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto +phase_C_cluster_prototypes() { + tlog "\n=== Phase C: Clustering feature prototypes (cnTraining) ===" + local out_normproto=$1 + + run_command ${CN_TRAINING_EXE} -D "${TRAINING_DIR}/" \ + $(ls ${TRAINING_DIR}/*.tr) + + check_file_readable ${TRAINING_DIR}/normproto + mv ${TRAINING_DIR}/normproto ${out_normproto} +} + +# Phase S : (S)hape clustering +phase_S_cluster_shapes() { + if (( ! 
${RUN_SHAPE_CLUSTERING} )); then + tlog "\n=== Shape Clustering disabled ===" + return + fi + check_file_readable ${LANGDATA_ROOT}/font_properties + local font_props="-F ${LANGDATA_ROOT}/font_properties" + if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] &&\ + [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then + font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" + fi + + run_command ${SHAPE_TRAINING_EXE} \ + -D "${TRAINING_DIR}/" \ + -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ + -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ + ${font_props} \ + $(ls ${TRAINING_DIR}/*.tr) + check_file_readable ${TRAINING_DIR}/shapetable \ + ${TRAINING_DIR}/${LANG_CODE}.mfunicharset +} + +# Phase M : Clustering microfeatures (mfTraining) +phase_M_cluster_microfeatures() { + tlog "\n=== Phase M : Clustering microfeatures (mfTraining) ===" + + check_file_readable ${LANGDATA_ROOT}/font_properties + font_props="-F ${LANGDATA_ROOT}/font_properties" + if [[ -r ${TRAINING_DIR}/${LANG_CODE}.xheights ]] && \ + [[ -s ${TRAINING_DIR}/${LANG_CODE}.xheights ]]; then + font_props=${font_props}" -X ${TRAINING_DIR}/${LANG_CODE}.xheights" + fi + + run_command ${MF_TRAINING_EXE} \ + -D "${TRAINING_DIR}/" \ + -U ${TRAINING_DIR}/${LANG_CODE}.unicharset \ + -O ${TRAINING_DIR}/${LANG_CODE}.mfunicharset \ + ${font_props} \ + $(ls ${TRAINING_DIR}/*.tr) + check_file_readable ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/shapetable \ + ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.mfunicharset + mv ${TRAINING_DIR}/inttemp ${TRAINING_DIR}/${LANG_CODE}.inttemp + mv ${TRAINING_DIR}/shapetable ${TRAINING_DIR}/${LANG_CODE}.shapetable + mv ${TRAINING_DIR}/pffmtable ${TRAINING_DIR}/${LANG_CODE}.pffmtable + mv ${TRAINING_DIR}/${LANG_CODE}.mfunicharset ${TRAINING_DIR}/${LANG_CODE}.unicharset +} + +phase_B_generate_ambiguities() { + tlog "\n=== Phase B : ambiguities training ===" + + # Check for manually created ambiguities data. 
+ if [[ -r ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs ]]; then + tlog "Found file ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs" + cp ${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE}.unicharambigs \ + ${TRAINING_DIR}/${LANG_CODE}.unicharambigs + # Make it writable, as it may be read-only in the client. + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.unicharambigs + return + else + tlog "No unicharambigs file found!" + fi + + # TODO: Add support for generating ambiguities automatically. +} + + +make__traineddata() { + tlog "\n=== Making final traineddata file ===" + local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE} + + # Combine available files for this language from the langdata dir. + if [[ -r ${lang_prefix}.config ]]; then + tlog "Copying ${lang_prefix}.config to ${TRAINING_DIR}" + cp ${lang_prefix}.config ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.config + fi + if [[ -r ${lang_prefix}.cube-unicharset ]]; then + tlog "Copying ${lang_prefix}.cube-unicharset to ${TRAINING_DIR}" + cp ${lang_prefix}.cube-unicharset ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-unicharset + fi + if [[ -r ${lang_prefix}.cube-word-dawg ]]; then + tlog "Copying ${lang_prefix}.cube-word-dawg to ${TRAINING_DIR}" + cp ${lang_prefix}.cube-word-dawg ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.cube-word-dawg + fi + if [[ -r ${lang_prefix}.params-model ]]; then + tlog "Copying ${lang_prefix}.params-model to ${TRAINING_DIR}" + cp ${lang_prefix}.params-model ${TRAINING_DIR} + chmod u+w ${TRAINING_DIR}/${LANG_CODE}.params-model + fi + + # Compose the traineddata file. + run_command ${COMBINE_TESSDATA_EXE} ${TRAINING_DIR}/${LANG_CODE}. + + # Copy it to the output dir, overwriting only if allowed by the cmdline flag. + if [[ ! -d ${OUTPUT_DIR} ]]; then + tlog "Creating new directory ${OUTPUT_DIR}" + mkdir -p ${OUTPUT_DIR} + fi + local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; + if [[ -f ${destfile} ]] && (( ! 
${OVERWRITE} )); then + err_exit "File ${destfile} exists and no --overwrite specified"; + fi + tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" + cp -f ${TRAINING_DIR}/${LANG_CODE}.traineddata ${destfile} +} + diff --git a/training/text2image.cpp b/training/text2image.cpp index 8685bcf3c8..d3b2c712f7 100644 --- a/training/text2image.cpp +++ b/training/text2image.cpp @@ -115,7 +115,7 @@ STRING_PARAM_FLAG(writing_mode, "horizontal", INT_PARAM_FLAG(box_padding, 0, "Padding around produced bounding boxes"); -BOOL_PARAM_FLAG(strip_unrenderable_words, false, +BOOL_PARAM_FLAG(strip_unrenderable_words, true, "Remove unrenderable words from source text"); // Font name. @@ -618,9 +618,9 @@ int main(int argc, char** argv) { } pixDestroy(&binary); } - if (FLAGS_find_fonts && !FLAGS_render_per_font && !font_names.empty()) { - // We just want a list of names, so we don't need to render any more - // of the text. + if (FLAGS_find_fonts && offset != 0) { + // We just want a list of names, or some sample images so we don't need + // to render more than the first page of the text. break; } } @@ -630,8 +630,7 @@ int main(int argc, char** argv) { box_name += ".box"; render.WriteAllBoxes(box_name); } else if (!FLAGS_render_per_font && !font_names.empty()) { - string filename = FLAGS_outputbase.c_str(); - filename += ".fontlist.txt"; + string filename = FLAGS_outputbase + ".fontlist.txt"; FILE* fp = fopen(filename.c_str(), "wb"); if (fp == NULL) { tprintf("Failed to create output font list %s\n", filename.c_str()); diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp new file mode 100644 index 0000000000..10aaf0e6c3 --- /dev/null +++ b/training/unicharset_training_utils.cpp @@ -0,0 +1,193 @@ +/////////////////////////////////////////////////////////////////////// +// File: unicharset_training_utils.cpp +// Description: Training utilities for UNICHARSET. 
+// Author: Ray Smith +// Created: Fri Oct 17 17:09:01 PDT 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "unicharset_training_utils.h" + +#include +#include +#include + +#include "fileio.h" +#include "genericvector.h" +#include "icuerrorcode.h" +#include "normstrngs.h" +#include "statistc.h" +#include "strngs.h" +#include "unicharset.h" +#include "unicode/uchar.h" // from libicu +#include "unicode/uscript.h" // from libicu + +namespace tesseract { + +// Helper sets the character attribute properties and sets up the script table. +// Does not set tops and bottoms. +void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { + for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { + // Convert any custom ligatures. + const char* unichar_str = unicharset->id_to_unichar(unichar_id); + for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { + if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { + unichar_str = UNICHARSET::kCustomLigatures[i][0]; + break; + } + } + + // Convert the unichar to UTF32 representation + GenericVector uni_vector; + tesseract::UTF8ToUTF32(unichar_str, &uni_vector); + + // Assume that if the property is true for any character in the string, + // then it holds for the whole "character". 
+ bool unichar_isalpha = false; + bool unichar_islower = false; + bool unichar_isupper = false; + bool unichar_isdigit = false; + bool unichar_ispunct = false; + + for (int i = 0; i < uni_vector.size(); ++i) { + if (u_isalpha(uni_vector[i])) + unichar_isalpha = true; + if (u_islower(uni_vector[i])) + unichar_islower = true; + if (u_isupper(uni_vector[i])) + unichar_isupper = true; + if (u_isdigit(uni_vector[i])) + unichar_isdigit = true; + if (u_ispunct(uni_vector[i])) + unichar_ispunct = true; + } + + unicharset->set_isalpha(unichar_id, unichar_isalpha); + unicharset->set_islower(unichar_id, unichar_islower); + unicharset->set_isupper(unichar_id, unichar_isupper); + unicharset->set_isdigit(unichar_id, unichar_isdigit); + unicharset->set_ispunctuation(unichar_id, unichar_ispunct); + + tesseract::IcuErrorCode err; + unicharset->set_script(unichar_id, uscript_getName( + uscript_getScript(uni_vector[0], err))); + + const int num_code_points = uni_vector.size(); + // Obtain the lower/upper case if needed and record it in the properties. + unicharset->set_other_case(unichar_id, unichar_id); + if (unichar_islower || unichar_isupper) { + GenericVector other_case(num_code_points, 0); + for (int i = 0; i < num_code_points; ++i) { + // TODO(daria): Ideally u_strToLower()/ustrToUpper() should be used. + // However since they deal with UChars (so need a conversion function + // from char32 or UTF8string) and require a meaningful locale string, + // for now u_tolower()/u_toupper() are used. + other_case[i] = unichar_islower ? 
u_toupper(uni_vector[i]) : + u_tolower(uni_vector[i]); + } + STRING other_case_uch; + tesseract::UTF32ToUTF8(other_case, &other_case_uch); + UNICHAR_ID other_case_id = + unicharset->unichar_to_id(other_case_uch.c_str()); + if (other_case_id != INVALID_UNICHAR_ID) { + unicharset->set_other_case(unichar_id, other_case_id); + } else if (unichar_id >= SPECIAL_UNICHAR_CODES_COUNT && report_errors) { + tprintf("Other case %s of %s is not in unicharset\n", + other_case_uch.c_str(), unichar_str); + } + } + + // Set RTL property and obtain mirror unichar ID from ICU. + GenericVector mirrors(num_code_points, 0); + for (int i = 0; i < num_code_points; ++i) { + mirrors[i] = u_charMirror(uni_vector[i]); + if (i == 0) { // set directionality to that of the 1st code point + unicharset->set_direction(unichar_id, + static_cast( + u_charDirection(uni_vector[i]))); + } + } + STRING mirror_uch; + tesseract::UTF32ToUTF8(mirrors, &mirror_uch); + UNICHAR_ID mirror_uch_id = unicharset->unichar_to_id(mirror_uch.c_str()); + if (mirror_uch_id != INVALID_UNICHAR_ID) { + unicharset->set_mirror(unichar_id, mirror_uch_id); + } else if (report_errors) { + tprintf("Mirror %s of %s is not in unicharset\n", + mirror_uch.c_str(), unichar_str); + } + + // Record normalized version of this unichar. + STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); + if (unichar_id != 0 && normed_str.length() > 0) { + unicharset->set_normed(unichar_id, normed_str.c_str()); + } else { + unicharset->set_normed(unichar_id, unichar_str); + } + ASSERT_HOST(unicharset->get_other_case(unichar_id) < unicharset->size()); + } + unicharset->post_load_setup(); +} + +// Helper to set the properties for an input unicharset file, writes to the +// output file. If an appropriate script unicharset can be found in the +// script_dir directory, then the tops and bottoms are expanded using the +// script unicharset. +// If non-empty, xheight data for the fonts are written to the xheights_file. 
+void SetPropertiesForInputFile(const string& script_dir, + const string& input_unicharset_file, + const string& output_unicharset_file, + const string& output_xheights_file) { + UNICHARSET unicharset; + + // Load the input unicharset + unicharset.load_from_file(input_unicharset_file.c_str()); + tprintf("Loaded unicharset of size %d from file %s\n", unicharset.size(), + input_unicharset_file.c_str()); + + // Set unichar properties + tprintf("Setting unichar properties\n"); + SetupBasicProperties(true, &unicharset); + string xheights_str; + for (int s = 0; s < unicharset.get_script_table_size(); ++s) { + // Load the unicharset for the script if available. + string filename = script_dir + "/" + + unicharset.get_script_from_script_id(s) + ".unicharset"; + UNICHARSET script_set; + if (script_set.load_from_file(filename.c_str())) { + unicharset.SetPropertiesFromOther(script_set); + } + // Load the xheights for the script if available. + filename = script_dir + "/" + unicharset.get_script_from_script_id(s) + + ".xheights"; + string script_heights; + if (File::ReadFileToString(filename, &script_heights)) + xheights_str += script_heights; + } + if (!output_xheights_file.empty()) + File::WriteStringToFileOrDie(xheights_str, output_xheights_file); + for (int c = SPECIAL_UNICHAR_CODES_COUNT; c < unicharset.size(); ++c) { + if (unicharset.PropertiesIncomplete(c)) { + tprintf("Warning: properties incomplete for index %d = %s\n", + c, unicharset.id_to_unichar(c)); + } + } + + // Write the output unicharset + tprintf("Writing unicharset to file %s\n", output_unicharset_file.c_str()); + unicharset.save_to_file(output_unicharset_file.c_str()); +} + +} // namespace tesseract + diff --git a/training/unicharset_training_utils.h b/training/unicharset_training_utils.h new file mode 100644 index 0000000000..ff2262875d --- /dev/null +++ b/training/unicharset_training_utils.h @@ -0,0 +1,50 @@ +/////////////////////////////////////////////////////////////////////// +// File: 
unicharset_training_utils.h +// Description: Training utilities for UNICHARSET. +// Author: Ray Smith +// Created: Fri Oct 17 17:14:01 PDT 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_ +#define TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_ + +#include + +#ifdef USE_STD_NAMESPACE +using std::string; +#endif + +class STATS; +class UNICHARSET; + +namespace tesseract { + +// Helper sets the character attribute properties and sets up the script table. +// Does not set tops and bottoms. +void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset); + +// Helper to set the properties for an input unicharset file, writes to the +// output file. If an appropriate script unicharset can be found in the +// script_dir directory, then the tops and bottoms are expanded using the +// script unicharset. +// If non-empty, xheight data for the fonts are written to the xheights_file. +void SetPropertiesForInputFile(const string& script_dir, + const string& input_unicharset_file, + const string& output_unicharset_file, + const string& output_xheights_file); + +} // namespace tesseract. 
+ +#endif // TESSERACT_TRAINING_UNICHARSET_TRAINING_UTILS_H_ From b2a3924585ec41a1d4274dfec55f1fc9d4047b83 Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 18:08:39 -0700 Subject: [PATCH 14/15] Major updates to training system as a result of extensive testing on 100 languages - makefile.am --- training/Makefile.am | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/training/Makefile.am b/training/Makefile.am index 00a81b3318..e6e2641dca 100644 --- a/training/Makefile.am +++ b/training/Makefile.am @@ -8,7 +8,7 @@ AM_CPPFLAGS += \ -I$(top_srcdir)/classify -I$(top_srcdir)/display \ -I$(top_srcdir)/wordrec -I$(top_srcdir)/cutil -EXTRA_DIST = tesstrain.sh +EXTRA_DIST = language-specific.sh tesstrain.sh tesstrain_utils.sh if T_WIN # try static build @@ -28,7 +28,7 @@ noinst_HEADERS = \ boxchar.h commandlineflags.h commontraining.h degradeimage.h \ fileio.h icuerrorcode.h ligature_table.h normstrngs.h \ mergenf.h pango_font_info.h stringrenderer.h \ - tessopt.h tlog.h util.h + tessopt.h tlog.h unicharset_training_utils.h util.h noinst_LTLIBRARIES = libtesseract_training.la libtesseract_tessopt.la @@ -39,7 +39,7 @@ libtesseract_training_la_LIBADD = \ libtesseract_training_la_SOURCES = \ boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \ fileio.cpp ligature_table.cpp normstrngs.cpp pango_font_info.cpp \ - stringrenderer.cpp tlog.cpp + stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp libtesseract_tessopt_la_SOURCES = \ tessopt.cpp From 03f3c9dc8874e2bb350b237d4737c3a457bfea7c Mon Sep 17 00:00:00 2001 From: Ray Smith Date: Tue, 12 May 2015 18:13:15 -0700 Subject: [PATCH 15/15] Misc fixes missed from previous commits --- api/renderer.cpp | 2 +- ccmain/resultiterator.cpp | 10 +++++----- ccstruct/boxread.cpp | 2 +- classify/adaptmatch.cpp | 3 --- classify/ocrfeatures.h | 6 ++---- 5 files changed, 9 insertions(+), 14 deletions(-) diff --git a/api/renderer.cpp b/api/renderer.cpp index 2664a9b7bd..83b4f53efd 100644 --- 
a/api/renderer.cpp +++ b/api/renderer.cpp @@ -117,7 +117,7 @@ bool TessTextRenderer::AddImageHandler(TessBaseAPI* api) { bool pageBreak = false; api->GetBoolVariable("include_page_breaks", &pageBreak); const char* pageSeparator = api->GetStringVariable("page_separator"); - if(pageBreak) { + if (pageBreak) { AppendString(pageSeparator); } diff --git a/ccmain/resultiterator.cpp b/ccmain/resultiterator.cpp index b3f5321d18..77514a6a59 100644 --- a/ccmain/resultiterator.cpp +++ b/ccmain/resultiterator.cpp @@ -37,8 +37,8 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit) preserve_interword_spaces_ = false; BoolParam *p = ParamUtils::FindParam( - "preserve_interword_spaces", GlobalParams()->bool_params, - tesseract_->params()->bool_params); + "preserve_interword_spaces", GlobalParams()->bool_params, + tesseract_->params()->bool_params); if (p != NULL) preserve_interword_spaces_ = (bool)(*p); current_paragraph_is_ltr_ = CurrentParagraphIsLtr(); @@ -636,9 +636,9 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) { int words_appended = 0; do { - int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : - (words_appended > 0); - for(int i = 0 ; i < numSpaces ; ++i) { + int numSpaces = preserve_interword_spaces_ ? 
it_->word()->word->space() + : (words_appended > 0); + for (int i = 0; i < numSpaces; ++i) { *text += " "; } AppendUTF8WordText(text); diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp index a91261391e..947fcc02fe 100644 --- a/ccstruct/boxread.cpp +++ b/ccstruct/boxread.cpp @@ -78,7 +78,7 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data, if (!ParseBoxFileStr(lines[i].string(), &page, &utf8_str, &box)) { continue; } - if (skip_blanks && utf8_str == " ") continue; + if (skip_blanks && (utf8_str == " " || utf8_str == "\t")) continue; if (target_page >= 0 && page != target_page) continue; if (boxes != NULL) boxes->push_back(box); if (texts != NULL) texts->push_back(utf8_str); diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index e9ff913b77..aebf64893f 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -206,9 +206,6 @@ void Classify::AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices) { PrintAdaptiveMatchResults(*Results); } - if (LargeSpeckle(*Blob) || Choices->length() == 0) - AddLargeSpeckleTo(Results->BlobLength, Choices); - #ifndef GRAPHICS_DISABLED if (classify_enable_adaptive_debugger) DebugAdaptiveClassifier(Blob, Results); diff --git a/classify/ocrfeatures.h b/classify/ocrfeatures.h index 7d6ba95dab..31a4794ca6 100644 --- a/classify/ocrfeatures.h +++ b/classify/ocrfeatures.h @@ -118,10 +118,8 @@ FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc); FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc); -void WriteFeature(FILE *File, FEATURE Feature); +void WriteFeature(FEATURE Feature, STRING* str); -void WriteFeatureSet(FILE *File, FEATURE_SET FeatureSet); - -void WriteOldParamDesc(FILE *File, const FEATURE_DESC_STRUCT *FeatureDesc); +void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str); #endif