From 4a28d33c5851dbeee65f71db73e0c34e8c659fff Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sat, 26 Dec 2020 23:37:58 +0100 Subject: [PATCH] Replace GenericVector by std::vector in strngs.h and more places Signed-off-by: Stefan Weil --- include/tesseract/strngs.h | 6 ++---- src/api/baseapi.cpp | 2 +- src/ccmain/applybox.cpp | 2 +- src/ccstruct/boxread.cpp | 25 +++++++++++++------------ src/ccstruct/boxread.h | 19 ++++++++----------- src/ccutil/ambigs.cpp | 2 +- src/ccutil/strngs.cpp | 3 +-- src/ccutil/unicharcompress.cpp | 4 ++-- src/classify/adaptmatch.cpp | 2 +- src/dict/trie.cpp | 14 ++++++-------- src/dict/trie.h | 4 ++-- src/training/combine_lang_model.cpp | 2 +- src/training/fileio.h | 2 +- src/training/lang_model_helpers.cpp | 14 +++++++------- src/training/lang_model_helpers.h | 6 +++--- src/training/lstmtrainer.cpp | 2 +- src/training/unicharset_extractor.cpp | 6 +++--- 17 files changed, 54 insertions(+), 61 deletions(-) diff --git a/include/tesseract/strngs.h b/include/tesseract/strngs.h index cd0991ce97..8ec22190f1 100644 --- a/include/tesseract/strngs.h +++ b/include/tesseract/strngs.h @@ -26,14 +26,12 @@ #include // for FILE #include // for strncpy #include +#include namespace tesseract { class TFile; -template -class GenericVector; - class STRING : public std::string { public: using std::string::string; @@ -76,7 +74,7 @@ class STRING : public std::string { } TESS_API - void split(char c, GenericVector* splited); + void split(char c, std::vector* splited); TESS_API void truncate_at(int32_t index); diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp index 3c10b3a110..2a56b120d5 100644 --- a/src/api/baseapi.cpp +++ b/src/api/baseapi.cpp @@ -999,7 +999,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist, int page = (tessedit_page_number >= 0) ? 
tessedit_page_number : 0; char pagename[MAX_PATH]; - GenericVector lines; + std::vector lines; if (!flist) { buf->split('\n', &lines); if (lines.empty()) return false; diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp index c8bd96701c..a972c02809 100644 --- a/src/ccmain/applybox.cpp +++ b/src/ccmain/applybox.cpp @@ -791,7 +791,7 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) { for (int i = 0; i < word_res->correct_text.size(); ++i) { // The part before the first space is the real ground truth, and the // rest is the bounding box location and page number. - GenericVector tokens; + std::vector tokens; word_res->correct_text[i].split(' ', &tokens); UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str()); choice->append_unichar_id_space_allocated(char_id, diff --git a/src/ccstruct/boxread.cpp b/src/ccstruct/boxread.cpp index 1d4df736a0..7e0ebfff24 100644 --- a/src/ccstruct/boxread.cpp +++ b/src/ccstruct/boxread.cpp @@ -23,12 +23,12 @@ #include "rect.h" // for TBOX #include "tprintf.h" // for tprintf -#include // for GenericVector #include // for chomp_string #include // for STRING #include // for UNICHAR #include // for strchr, strcmp +#include // for std::ifstream #include // for std::locale::classic #include // for std::stringstream #include // for std::string @@ -74,12 +74,13 @@ FILE* OpenBoxFile(const char* fname) { // Each of the output vectors is optional (may be nullptr). // Returns false if no boxes are found. 
bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename, - GenericVector* boxes, - GenericVector* texts, - GenericVector* box_texts, - GenericVector* pages) { - GenericVector box_data; - if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data)) + std::vector* boxes, + std::vector* texts, + std::vector* box_texts, + std::vector* pages) { + std::ifstream input(BoxFileName(filename).c_str(), std::ios::in | std::ios::binary); + std::vector box_data(std::istreambuf_iterator(input), {}); + if (box_data.empty()) return false; // Convert the array of bytes to a string, so it can be used by the parser. box_data.push_back('\0'); @@ -91,12 +92,12 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename, // Reads all boxes from the string. Otherwise, as ReadAllBoxes. bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data, bool continue_on_failure, - GenericVector* boxes, - GenericVector* texts, - GenericVector* box_texts, - GenericVector* pages) { + std::vector* boxes, + std::vector* texts, + std::vector* box_texts, + std::vector* pages) { STRING box_str(box_data); - GenericVector lines; + std::vector lines; box_str.split('\n', &lines); if (lines.empty()) return false; int num_boxes = 0; diff --git a/src/ccstruct/boxread.h b/src/ccstruct/boxread.h index 14135d9fec..8c7057cf10 100644 --- a/src/ccstruct/boxread.h +++ b/src/ccstruct/boxread.h @@ -27,9 +27,6 @@ namespace tesseract { class TBOX; -template class GenericVector; -template class GenericVector; - // Size of buffer used to read a line from a box file. const int kBoxReadBufSize = 1024; @@ -45,10 +42,10 @@ FILE* OpenBoxFile(const char* filename); // Each of the output vectors is optional (may be nullptr). // Returns false if no boxes are found. 
bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename, - GenericVector* boxes, - GenericVector* texts, - GenericVector* box_texts, - GenericVector* pages); + std::vector* boxes, + std::vector* texts, + std::vector* box_texts, + std::vector* pages); // Reads all boxes from the string. Otherwise, as ReadAllBoxes. // continue_on_failure allows reading to continue even if an invalid box is @@ -56,10 +53,10 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename, // It otherwise gives up and returns false on encountering an invalid box. bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data, bool continue_on_failure, - GenericVector* boxes, - GenericVector* texts, - GenericVector* box_texts, - GenericVector* pages); + std::vector* boxes, + std::vector* texts, + std::vector* box_texts, + std::vector* pages); // ReadNextBox factors out the code to interpret a line of a box // file so that applybox and unicharset_extractor interpret the same way. diff --git a/src/ccutil/ambigs.cpp b/src/ccutil/ambigs.cpp index 991e954b9f..908a4b72f6 100644 --- a/src/ccutil/ambigs.cpp +++ b/src/ccutil/ambigs.cpp @@ -228,7 +228,7 @@ bool UnicharAmbigs::ParseAmbiguityLine( if (version > 1) { // Simpler format is just wrong-string correct-string type\n. 
STRING input(buffer); - GenericVector fields; + std::vector fields; input.split(' ', &fields); if (fields.size() != 3) { if (debug_level) tprintf(kIllegalMsg, line_num); diff --git a/src/ccutil/strngs.cpp b/src/ccutil/strngs.cpp index 8f93fe8916..68729a0396 100644 --- a/src/ccutil/strngs.cpp +++ b/src/ccutil/strngs.cpp @@ -20,7 +20,6 @@ #include "errcode.h" // for ASSERT_HOST -#include // for GenericVector #include // for ReverseN #include // for TFile @@ -87,7 +86,7 @@ void STRING::truncate_at(int32_t index) { resize(index); } -void STRING::split(const char c, GenericVector *splited) { +void STRING::split(const char c, std::vector *splited) { int start_index = 0; const int len = length(); for (int i = 0; i < len; i++) { diff --git a/src/ccutil/unicharcompress.cpp b/src/ccutil/unicharcompress.cpp index 3abaa9b9a0..d5fed9c72a 100644 --- a/src/ccutil/unicharcompress.cpp +++ b/src/ccutil/unicharcompress.cpp @@ -50,7 +50,7 @@ using RSCounts = std::unordered_map; static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) { if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#') return true; - GenericVector entries; + std::vector entries; radical_data_line->split(' ', &entries); if (entries.size() < 2) return false; char* end = nullptr; @@ -71,7 +71,7 @@ static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) { // The radical_stroke_table is non-const because it gets split and the caller // is unlikely to want to use it again. 
static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) { - GenericVector lines; + std::vector lines; radical_data->split('\n', &lines); for (int i = 0; i < lines.size(); ++i) { if (!DecodeRadicalLine(&lines[i], radical_map)) { diff --git a/src/classify/adaptmatch.cpp b/src/classify/adaptmatch.cpp index c0cafa6122..f428dd4681 100644 --- a/src/classify/adaptmatch.cpp +++ b/src/classify/adaptmatch.cpp @@ -310,7 +310,7 @@ void Classify::LearnWord(const char* fontname, WERD_RES* word) { word->best_state[ch]); if (pieces_all_natural || !prioritize_division) { for (frag = 0; frag < word->best_state[ch]; ++frag) { - GenericVector tokens; + std::vector tokens; word->correct_text[ch].split(' ', &tokens); tokens[0] = CHAR_FRAGMENT::to_string( diff --git a/src/dict/trie.cpp b/src/dict/trie.cpp index ffe54493c7..129a9f1389 100644 --- a/src/dict/trie.cpp +++ b/src/dict/trie.cpp @@ -270,23 +270,21 @@ NODE_REF Trie::new_dawg_node() { } // Sort function to sort words by decreasing order of length. 
-static int sort_strings_by_dec_length(const void* v1, const void* v2) {
-  const auto *s1 = static_cast<const STRING*>(v1);
-  const auto *s2 = static_cast<const STRING*>(v2);
-  return s2->length() - s1->length();
+static bool sort_strings_by_dec_length(const STRING& s1, const STRING& s2) {
+  return s1.length() > s2.length();  // strict weak ordering: longer first
 }
 
 bool Trie::read_and_add_word_list(const char *filename,
                                   const UNICHARSET &unicharset,
                                   Trie::RTLReversePolicy reverse_policy) {
-  GenericVector<STRING> word_list;
+  std::vector<STRING> word_list;
   if (!read_word_list(filename, &word_list)) return false;
-  word_list.sort(sort_strings_by_dec_length);
+  std::sort(word_list.begin(), word_list.end(), sort_strings_by_dec_length);
   return add_word_list(word_list, unicharset, reverse_policy);
 }
 
 bool Trie::read_word_list(const char *filename,
-                          GenericVector<STRING>* words) {
+                          std::vector<STRING>* words) {
   FILE *word_file;
   char line_str[CHARS_PER_LINE];
   int word_count = 0;
@@ -308,7 +306,7 @@ bool Trie::read_word_list(const char *filename,
   return true;
 }
 
-bool Trie::add_word_list(const GenericVector<STRING> &words,
+bool Trie::add_word_list(const std::vector<STRING> &words,
                          const UNICHARSET &unicharset,
                          Trie::RTLReversePolicy reverse_policy) {
   for (int i = 0; i < words.size(); ++i) {
diff --git a/src/dict/trie.h b/src/dict/trie.h
index b2950d28f3..644926ccc7 100644
--- a/src/dict/trie.h
+++ b/src/dict/trie.h
@@ -174,11 +174,11 @@ class Trie : public Dawg {
   // Reads a list of words from the given file.
   // Returns false on error.
   bool read_word_list(const char *filename,
-                      GenericVector<STRING>* words);
+                      std::vector<STRING>* words);
   // Adds a list of words previously read using read_word_list to the trie
   // using the given unicharset and reverse_policy to convert to unichar-ids.
   // Returns false on error.
- bool add_word_list(const GenericVector &words, + bool add_word_list(const std::vector &words, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy); diff --git a/src/training/combine_lang_model.cpp b/src/training/combine_lang_model.cpp index 2c5a350c7c..440088c206 100644 --- a/src/training/combine_lang_model.cpp +++ b/src/training/combine_lang_model.cpp @@ -53,7 +53,7 @@ int main(int argc, char** argv) { tesseract::CheckSharedLibraryVersion(); tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); - GenericVector words, puncs, numbers; + std::vector words, puncs, numbers; // If these reads fail, we get a warning message and an empty list of words. tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words); tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs); diff --git a/src/training/fileio.h b/src/training/fileio.h index d783ae96c0..8aa39c1966 100644 --- a/src/training/fileio.h +++ b/src/training/fileio.h @@ -29,7 +29,7 @@ namespace tesseract { // Reads a file as a vector of STRING. // TODO: Use std::vector and std::string for LoadFileLinesToStrings. inline bool LoadFileLinesToStrings(const char* filename, - GenericVector* lines) { + std::vector* lines) { GenericVector data; if (!LoadDataFromFile(filename, &data)) { return false; diff --git a/src/training/lang_model_helpers.cpp b/src/training/lang_model_helpers.cpp index f6216d8a52..dd1059d9bd 100644 --- a/src/training/lang_model_helpers.cpp +++ b/src/training/lang_model_helpers.cpp @@ -123,7 +123,7 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through, // Helper builds a dawg from the given words, using the unicharset as coding, // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata. 
-static bool WriteDawg(const GenericVector& words, +static bool WriteDawg(const std::vector& words, const UNICHARSET& unicharset, Trie::RTLReversePolicy reverse_policy, TessdataType file_type, TessdataManager* traineddata) { @@ -144,9 +144,9 @@ static bool WriteDawg(const GenericVector& words, // Builds and writes the dawgs, given a set of words, punctuation // patterns, number patterns, to the traineddata. Encoding uses the given // unicharset, and the punc dawgs is reversed if lang_is_rtl. -static bool WriteDawgs(const GenericVector& words, - const GenericVector& puncs, - const GenericVector& numbers, bool lang_is_rtl, +static bool WriteDawgs(const std::vector& words, + const std::vector& puncs, + const std::vector& numbers, bool lang_is_rtl, const UNICHARSET& unicharset, TessdataManager* traineddata) { if (puncs.empty()) { @@ -185,9 +185,9 @@ static bool WriteDawgs(const GenericVector& words, int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir, const std::string& version_str, const std::string& output_dir, const std::string& lang, bool pass_through_recoder, - const GenericVector& words, - const GenericVector& puncs, - const GenericVector& numbers, bool lang_is_rtl, + const std::vector& words, + const std::vector& puncs, + const std::vector& numbers, bool lang_is_rtl, FileReader reader, FileWriter writer) { // Build the traineddata file. 
TessdataManager traineddata; diff --git a/src/training/lang_model_helpers.h b/src/training/lang_model_helpers.h index 7538d0a1ae..de2b25c02d 100644 --- a/src/training/lang_model_helpers.h +++ b/src/training/lang_model_helpers.h @@ -74,9 +74,9 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through, int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir, const std::string& version_str, const std::string& output_dir, const std::string& lang, bool pass_through_recoder, - const GenericVector& words, - const GenericVector& puncs, - const GenericVector& numbers, bool lang_is_rtl, + const std::vector& words, + const std::vector& puncs, + const std::vector& numbers, bool lang_is_rtl, FileReader reader, FileWriter writer); } // namespace tesseract diff --git a/src/training/lstmtrainer.cpp b/src/training/lstmtrainer.cpp index 57e6481bfb..03170ff2ab 100644 --- a/src/training/lstmtrainer.cpp +++ b/src/training/lstmtrainer.cpp @@ -1196,7 +1196,7 @@ double LSTMTrainer::ComputeCharError(const GenericVector& truth_str, // NOTE that this is destructive on both input strings. double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) { using StrMap = std::unordered_map>; - GenericVector truth_words, ocr_words; + std::vector truth_words, ocr_words; truth_str->split(' ', &truth_words); if (truth_words.empty()) return 0.0; ocr_str->split(' ', &ocr_words); diff --git a/src/training/unicharset_extractor.cpp b/src/training/unicharset_extractor.cpp index 54f62f6644..3aae0a072c 100644 --- a/src/training/unicharset_extractor.cpp +++ b/src/training/unicharset_extractor.cpp @@ -42,7 +42,7 @@ namespace tesseract { // Helper normalizes and segments the given strings according to norm_mode, and // adds the segmented parts to unicharset. 
-static void AddStringsToUnicharset(const GenericVector& strings, +static void AddStringsToUnicharset(const std::vector& strings, int norm_mode, UNICHARSET* unicharset) { for (int i = 0; i < strings.size(); ++i) { std::vector normalized; @@ -68,14 +68,14 @@ static int Main(int argc, char** argv) { for (int arg = 1; arg < argc; ++arg) { STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr); if (file_data.length() == 0) continue; - GenericVector texts; + std::vector texts; if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0], /*continue_on_failure*/ false, /*boxes*/ nullptr, &texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) { tprintf("Extracting unicharset from box file %s\n", argv[arg]); } else { tprintf("Extracting unicharset from plain text file %s\n", argv[arg]); - texts.truncate(0); + texts.resize(0); file_data.split('\n', &texts); } AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);