From 4a28d33c5851dbeee65f71db73e0c34e8c659fff Mon Sep 17 00:00:00 2001
From: Stefan Weil <sw@weil.de>
Date: Sat, 26 Dec 2020 23:37:58 +0100
Subject: [PATCH] Replace GenericVector by std::vector in strngs.h and more
 places

Signed-off-by: Stefan Weil <sw@weil.de>
---
 include/tesseract/strngs.h            |  6 ++----
 src/api/baseapi.cpp                   |  2 +-
 src/ccmain/applybox.cpp               |  2 +-
 src/ccstruct/boxread.cpp              | 25 +++++++++++++------------
 src/ccstruct/boxread.h                | 19 ++++++++-----------
 src/ccutil/ambigs.cpp                 |  2 +-
 src/ccutil/strngs.cpp                 |  3 +--
 src/ccutil/unicharcompress.cpp        |  4 ++--
 src/classify/adaptmatch.cpp           |  2 +-
 src/dict/trie.cpp                     | 14 ++++++--------
 src/dict/trie.h                       |  4 ++--
 src/training/combine_lang_model.cpp   |  2 +-
 src/training/fileio.h                 |  2 +-
 src/training/lang_model_helpers.cpp   | 14 +++++++-------
 src/training/lang_model_helpers.h     |  6 +++---
 src/training/lstmtrainer.cpp          |  2 +-
 src/training/unicharset_extractor.cpp |  6 +++---
 17 files changed, 54 insertions(+), 61 deletions(-)
diff --git a/include/tesseract/strngs.h b/include/tesseract/strngs.h
index cd0991ce97..8ec22190f1 100644
--- a/include/tesseract/strngs.h
+++ b/include/tesseract/strngs.h
@@ -26,14 +26,12 @@
 #include <cstdio>   // for FILE
 #include <cstring>  // for strncpy
 #include <string>
+#include <vector>
 
 namespace tesseract {
 
 class TFile;
 
-template <typename T>
-class GenericVector;
-
 class STRING : public std::string {
  public:
   using std::string::string;
@@ -76,7 +74,7 @@ class STRING : public std::string {
   }
 
   TESS_API
-  void split(char c, GenericVector<STRING>* splited);
+  void split(char c, std::vector<STRING>* splited);
   TESS_API
   void truncate_at(int32_t index);
 
diff --git a/src/api/baseapi.cpp b/src/api/baseapi.cpp
index 3c10b3a110..2a56b120d5 100644
--- a/src/api/baseapi.cpp
+++ b/src/api/baseapi.cpp
@@ -999,7 +999,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
   int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
   char pagename[MAX_PATH];
 
-  GenericVector<STRING> lines;
+  std::vector<STRING> lines;
   if (!flist) {
     buf->split('\n', &lines);
     if (lines.empty()) return false;
diff --git a/src/ccmain/applybox.cpp b/src/ccmain/applybox.cpp
index c8bd96701c..a972c02809 100644
--- a/src/ccmain/applybox.cpp
+++ b/src/ccmain/applybox.cpp
@@ -791,7 +791,7 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
     for (int i = 0; i < word_res->correct_text.size(); ++i) {
       // The part before the first space is the real ground truth, and the
       // rest is the bounding box location and page number.
-      GenericVector<STRING> tokens;
+      std::vector<STRING> tokens;
       word_res->correct_text[i].split(' ', &tokens);
       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
       choice->append_unichar_id_space_allocated(char_id,
diff --git a/src/ccstruct/boxread.cpp b/src/ccstruct/boxread.cpp
index 1d4df736a0..7e0ebfff24 100644
--- a/src/ccstruct/boxread.cpp
+++ b/src/ccstruct/boxread.cpp
@@ -23,12 +23,12 @@
 #include "rect.h"           // for TBOX
 #include "tprintf.h"        // for tprintf
 
-#include <tesseract/genericvector.h>  // for GenericVector
 #include <tesseract/helpers.h>        // for chomp_string
 #include <tesseract/strngs.h>         // for STRING
 #include <tesseract/unichar.h>        // for UNICHAR
 
 #include <cstring>          // for strchr, strcmp
+#include <fstream>          // for std::ifstream
 #include <locale>           // for std::locale::classic
 #include <sstream>          // for std::stringstream
 #include <string>           // for std::string
@@ -74,12 +74,13 @@ FILE* OpenBoxFile(const char* fname) {
 // Each of the output vectors is optional (may be nullptr).
 // Returns false if no boxes are found.
 bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
-                  GenericVector<TBOX>* boxes,
-                  GenericVector<STRING>* texts,
-                  GenericVector<STRING>* box_texts,
-                  GenericVector<int>* pages) {
-  GenericVector<char> box_data;
-  if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data))
+                  std::vector<TBOX>* boxes,
+                  std::vector<STRING>* texts,
+                  std::vector<STRING>* box_texts,
+                  std::vector<int>* pages) {
+  std::ifstream input(BoxFileName(filename).c_str(), std::ios::in | std::ios::binary);
+  std::vector<char> box_data(std::istreambuf_iterator<char>(input), {});
+  if (box_data.empty())
     return false;
   // Convert the array of bytes to a string, so it can be used by the parser.
   box_data.push_back('\0');
@@ -91,12 +92,12 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
 // Reads all boxes from the string. Otherwise, as ReadAllBoxes.
 bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
                   bool continue_on_failure,
-                  GenericVector<TBOX>* boxes,
-                  GenericVector<STRING>* texts,
-                  GenericVector<STRING>* box_texts,
-                  GenericVector<int>* pages) {
+                  std::vector<TBOX>* boxes,
+                  std::vector<STRING>* texts,
+                  std::vector<STRING>* box_texts,
+                  std::vector<int>* pages) {
   STRING box_str(box_data);
-  GenericVector<STRING> lines;
+  std::vector<STRING> lines;
   box_str.split('\n', &lines);
   if (lines.empty()) return false;
   int num_boxes = 0;
diff --git a/src/ccstruct/boxread.h b/src/ccstruct/boxread.h
index 14135d9fec..8c7057cf10 100644
--- a/src/ccstruct/boxread.h
+++ b/src/ccstruct/boxread.h
@@ -27,9 +27,6 @@ namespace tesseract {
 
 class TBOX;
 
-template <typename T> class GenericVector;
-template <typename T> class GenericVector;
-
 // Size of buffer used to read a line from a box file.
 const int kBoxReadBufSize = 1024;
 
@@ -45,10 +42,10 @@ FILE* OpenBoxFile(const char* filename);
 // Each of the output vectors is optional (may be nullptr).
 // Returns false if no boxes are found.
 bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
-                  GenericVector<TBOX>* boxes,
-                  GenericVector<STRING>* texts,
-                  GenericVector<STRING>* box_texts,
-                  GenericVector<int>* pages);
+                  std::vector<TBOX>* boxes,
+                  std::vector<STRING>* texts,
+                  std::vector<STRING>* box_texts,
+                  std::vector<int>* pages);
 
 // Reads all boxes from the string. Otherwise, as ReadAllBoxes.
 // continue_on_failure allows reading to continue even if an invalid box is
@@ -56,10 +53,10 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
 // It otherwise gives up and returns false on encountering an invalid box.
 bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
                   bool continue_on_failure,
-                  GenericVector<TBOX>* boxes,
-                  GenericVector<STRING>* texts,
-                  GenericVector<STRING>* box_texts,
-                  GenericVector<int>* pages);
+                  std::vector<TBOX>* boxes,
+                  std::vector<STRING>* texts,
+                  std::vector<STRING>* box_texts,
+                  std::vector<int>* pages);
 
 // ReadNextBox factors out the code to interpret a line of a box
 // file so that applybox and unicharset_extractor interpret the same way.
diff --git a/src/ccutil/ambigs.cpp b/src/ccutil/ambigs.cpp
index 991e954b9f..908a4b72f6 100644
--- a/src/ccutil/ambigs.cpp
+++ b/src/ccutil/ambigs.cpp
@@ -228,7 +228,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(
   if (version > 1) {
     // Simpler format is just wrong-string correct-string type\n.
     STRING input(buffer);
-    GenericVector<STRING> fields;
+    std::vector<STRING> fields;
     input.split(' ', &fields);
     if (fields.size() != 3) {
       if (debug_level) tprintf(kIllegalMsg, line_num);
diff --git a/src/ccutil/strngs.cpp b/src/ccutil/strngs.cpp
index 8f93fe8916..68729a0396 100644
--- a/src/ccutil/strngs.cpp
+++ b/src/ccutil/strngs.cpp
@@ -20,7 +20,6 @@
 
 #include "errcode.h"        // for ASSERT_HOST
 
-#include <tesseract/genericvector.h>  // for GenericVector
 #include <tesseract/helpers.h>        // for ReverseN
 #include <tesseract/serialis.h>       // for TFile
 
@@ -87,7 +86,7 @@ void STRING::truncate_at(int32_t index) {
   resize(index);
 }
 
-void STRING::split(const char c, GenericVector<STRING> *splited) {
+void STRING::split(const char c, std::vector<STRING> *splited) {
   int start_index = 0;
   const int len = length();
   for (int i = 0; i < len; i++) {
diff --git a/src/ccutil/unicharcompress.cpp b/src/ccutil/unicharcompress.cpp
index 3abaa9b9a0..d5fed9c72a 100644
--- a/src/ccutil/unicharcompress.cpp
+++ b/src/ccutil/unicharcompress.cpp
@@ -50,7 +50,7 @@ using RSCounts = std::unordered_map<int, int>;
 static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
   if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
     return true;
-  GenericVector<STRING> entries;
+  std::vector<STRING> entries;
   radical_data_line->split(' ', &entries);
   if (entries.size() < 2) return false;
   char* end = nullptr;
@@ -71,7 +71,7 @@ static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
 // The radical_stroke_table is non-const because it gets split and the caller
 // is unlikely to want to use it again.
 static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
-  GenericVector<STRING> lines;
+  std::vector<STRING> lines;
   radical_data->split('\n', &lines);
   for (int i = 0; i < lines.size(); ++i) {
     if (!DecodeRadicalLine(&lines[i], radical_map)) {
diff --git a/src/classify/adaptmatch.cpp b/src/classify/adaptmatch.cpp
index c0cafa6122..f428dd4681 100644
--- a/src/classify/adaptmatch.cpp
+++ b/src/classify/adaptmatch.cpp
@@ -310,7 +310,7 @@ void Classify::LearnWord(const char* fontname, WERD_RES* word) {
               word->best_state[ch]);
           if (pieces_all_natural || !prioritize_division) {
             for (frag = 0; frag < word->best_state[ch]; ++frag) {
-              GenericVector<STRING> tokens;
+              std::vector<STRING> tokens;
               word->correct_text[ch].split(' ', &tokens);
 
               tokens[0] = CHAR_FRAGMENT::to_string(
diff --git a/src/dict/trie.cpp b/src/dict/trie.cpp
index ffe54493c7..129a9f1389 100644
--- a/src/dict/trie.cpp
+++ b/src/dict/trie.cpp
@@ -270,23 +270,21 @@ NODE_REF Trie::new_dawg_node() {
 }
 
 // Sort function to sort words by decreasing order of length.
-static int sort_strings_by_dec_length(const void* v1, const void* v2) {
-  const auto *s1 = static_cast<const STRING *>(v1);
-  const auto *s2 = static_cast<const STRING *>(v2);
-  return s2->length() - s1->length();
+static bool sort_strings_by_dec_length(const STRING& s1, const STRING& s2) {
+  return s1.length() > s2.length();  // strict weak ordering for std::sort
 }
 
 bool Trie::read_and_add_word_list(const char *filename,
                                   const UNICHARSET &unicharset,
                                   Trie::RTLReversePolicy reverse_policy) {
-  GenericVector<STRING> word_list;
+  std::vector<STRING> word_list;  // filled by read_word_list, then sorted
   if (!read_word_list(filename, &word_list)) return false;
-  word_list.sort(sort_strings_by_dec_length);
+  std::sort(word_list.begin(), word_list.end(), sort_strings_by_dec_length);
   return add_word_list(word_list, unicharset, reverse_policy);
 }
 
 bool Trie::read_word_list(const char *filename,
-                          GenericVector<STRING>* words) {
+                          std::vector<STRING>* words) {
   FILE *word_file;
   char line_str[CHARS_PER_LINE];
   int  word_count = 0;
@@ -308,7 +306,7 @@ bool Trie::read_word_list(const char *filename,
   return true;
 }
 
-bool Trie::add_word_list(const GenericVector<STRING> &words,
+bool Trie::add_word_list(const std::vector<STRING> &words,
                          const UNICHARSET &unicharset,
                          Trie::RTLReversePolicy reverse_policy) {
   for (int i = 0; i < words.size(); ++i) {
diff --git a/src/dict/trie.h b/src/dict/trie.h
index b2950d28f3..644926ccc7 100644
--- a/src/dict/trie.h
+++ b/src/dict/trie.h
@@ -174,11 +174,11 @@ class Trie : public Dawg {
   // Reads a list of words from the given file.
   // Returns false on error.
   bool read_word_list(const char *filename,
-                      GenericVector<STRING>* words);
+                      std::vector<STRING>* words);
   // Adds a list of words previously read using read_word_list to the trie
   // using the given unicharset and reverse_policy to convert to unichar-ids.
   // Returns false on error.
-  bool add_word_list(const GenericVector<STRING> &words,
+  bool add_word_list(const std::vector<STRING> &words,
                      const UNICHARSET &unicharset,
                      Trie::RTLReversePolicy reverse_policy);
 
diff --git a/src/training/combine_lang_model.cpp b/src/training/combine_lang_model.cpp
index 2c5a350c7c..440088c206 100644
--- a/src/training/combine_lang_model.cpp
+++ b/src/training/combine_lang_model.cpp
@@ -53,7 +53,7 @@ int main(int argc, char** argv) {
   tesseract::CheckSharedLibraryVersion();
   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
 
-  GenericVector<STRING> words, puncs, numbers;
+  std::vector<STRING> words, puncs, numbers;
   // If these reads fail, we get a warning message and an empty list of words.
   tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
   tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
diff --git a/src/training/fileio.h b/src/training/fileio.h
index d783ae96c0..8aa39c1966 100644
--- a/src/training/fileio.h
+++ b/src/training/fileio.h
@@ -29,7 +29,7 @@ namespace tesseract {
 // Reads a file as a vector of STRING.
 // TODO: Use std::vector and std::string for LoadFileLinesToStrings.
 inline bool LoadFileLinesToStrings(const char* filename,
-                                   GenericVector<STRING>* lines) {
+                                   std::vector<STRING>* lines) {
   GenericVector<char> data;
   if (!LoadDataFromFile(filename, &data)) {
     return false;
diff --git a/src/training/lang_model_helpers.cpp b/src/training/lang_model_helpers.cpp
index f6216d8a52..dd1059d9bd 100644
--- a/src/training/lang_model_helpers.cpp
+++ b/src/training/lang_model_helpers.cpp
@@ -123,7 +123,7 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
 
 // Helper builds a dawg from the given words, using the unicharset as coding,
 // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
-static bool WriteDawg(const GenericVector<STRING>& words,
+static bool WriteDawg(const std::vector<STRING>& words,
                       const UNICHARSET& unicharset,
                       Trie::RTLReversePolicy reverse_policy,
                       TessdataType file_type, TessdataManager* traineddata) {
@@ -144,9 +144,9 @@ static bool WriteDawg(const GenericVector<STRING>& words,
 // Builds and writes the dawgs, given a set of words, punctuation
 // patterns, number patterns, to the traineddata. Encoding uses the given
 // unicharset, and the punc dawgs is reversed if lang_is_rtl.
-static bool WriteDawgs(const GenericVector<STRING>& words,
-                       const GenericVector<STRING>& puncs,
-                       const GenericVector<STRING>& numbers, bool lang_is_rtl,
+static bool WriteDawgs(const std::vector<STRING>& words,
+                       const std::vector<STRING>& puncs,
+                       const std::vector<STRING>& numbers, bool lang_is_rtl,
                        const UNICHARSET& unicharset,
                        TessdataManager* traineddata) {
   if (puncs.empty()) {
@@ -185,9 +185,9 @@ static bool WriteDawgs(const GenericVector<STRING>& words,
 int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
                      const std::string& version_str, const std::string& output_dir,
                      const std::string& lang, bool pass_through_recoder,
-                     const GenericVector<STRING>& words,
-                     const GenericVector<STRING>& puncs,
-                     const GenericVector<STRING>& numbers, bool lang_is_rtl,
+                     const std::vector<STRING>& words,
+                     const std::vector<STRING>& puncs,
+                     const std::vector<STRING>& numbers, bool lang_is_rtl,
                      FileReader reader, FileWriter writer) {
   // Build the traineddata file.
   TessdataManager traineddata;
diff --git a/src/training/lang_model_helpers.h b/src/training/lang_model_helpers.h
index 7538d0a1ae..de2b25c02d 100644
--- a/src/training/lang_model_helpers.h
+++ b/src/training/lang_model_helpers.h
@@ -74,9 +74,9 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
 int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
                      const std::string& version_str, const std::string& output_dir,
                      const std::string& lang, bool pass_through_recoder,
-                     const GenericVector<STRING>& words,
-                     const GenericVector<STRING>& puncs,
-                     const GenericVector<STRING>& numbers, bool lang_is_rtl,
+                     const std::vector<STRING>& words,
+                     const std::vector<STRING>& puncs,
+                     const std::vector<STRING>& numbers, bool lang_is_rtl,
                      FileReader reader, FileWriter writer);
 
 }  // namespace tesseract
diff --git a/src/training/lstmtrainer.cpp b/src/training/lstmtrainer.cpp
index 57e6481bfb..03170ff2ab 100644
--- a/src/training/lstmtrainer.cpp
+++ b/src/training/lstmtrainer.cpp
@@ -1196,7 +1196,7 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
 // NOTE that this is destructive on both input strings.
 double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
   using StrMap = std::unordered_map<std::string, int, std::hash<std::string>>;
-  GenericVector<STRING> truth_words, ocr_words;
+  std::vector<STRING> truth_words, ocr_words;
   truth_str->split(' ', &truth_words);
   if (truth_words.empty()) return 0.0;
   ocr_str->split(' ', &ocr_words);
diff --git a/src/training/unicharset_extractor.cpp b/src/training/unicharset_extractor.cpp
index 54f62f6644..3aae0a072c 100644
--- a/src/training/unicharset_extractor.cpp
+++ b/src/training/unicharset_extractor.cpp
@@ -42,7 +42,7 @@ namespace tesseract {
 
 // Helper normalizes and segments the given strings according to norm_mode, and
 // adds the segmented parts to unicharset.
-static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
+static void AddStringsToUnicharset(const std::vector<STRING>& strings,
                                    int norm_mode, UNICHARSET* unicharset) {
   for (int i = 0; i < strings.size(); ++i) {
     std::vector<std::string> normalized;
@@ -68,14 +68,14 @@ static int Main(int argc, char** argv) {
   for (int arg = 1; arg < argc; ++arg) {
     STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
     if (file_data.length() == 0) continue;
-    GenericVector<STRING> texts;
+    std::vector<STRING> texts;
     if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
                      /*continue_on_failure*/ false, /*boxes*/ nullptr,
                      &texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
       tprintf("Extracting unicharset from box file %s\n", argv[arg]);
     } else {
       tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
-      texts.truncate(0);
+      texts.resize(0);
       file_data.split('\n', &texts);
     }
     AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);