Skip to content

Commit

Permalink
Replace GenericVector by std::vector in strngs.h and more places
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <sw@weil.de>
  • Loading branch information
Stefan Weil committed Dec 28, 2020
1 parent 3ddc88c commit 4a28d33
Show file tree
Hide file tree
Showing 17 changed files with 54 additions and 61 deletions.
6 changes: 2 additions & 4 deletions include/tesseract/strngs.h
Original file line number Diff line number Diff line change
Expand Up @@ -26,14 +26,12 @@
#include <cstdio> // for FILE
#include <cstring> // for strncpy
#include <string>
#include <vector>

namespace tesseract {

class TFile;

template <typename T>
class GenericVector;

class STRING : public std::string {
public:
using std::string::string;
Expand Down Expand Up @@ -76,7 +74,7 @@ class STRING : public std::string {
}

TESS_API
void split(char c, GenericVector<STRING>* splited);
void split(char c, std::vector<STRING>* splited);
TESS_API
void truncate_at(int32_t index);

Expand Down
2 changes: 1 addition & 1 deletion src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -999,7 +999,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
char pagename[MAX_PATH];

GenericVector<STRING> lines;
std::vector<STRING> lines;
if (!flist) {
buf->split('\n', &lines);
if (lines.empty()) return false;
Expand Down
2 changes: 1 addition & 1 deletion src/ccmain/applybox.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -791,7 +791,7 @@ void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
for (int i = 0; i < word_res->correct_text.size(); ++i) {
// The part before the first space is the real ground truth, and the
// rest is the bounding box location and page number.
GenericVector<STRING> tokens;
std::vector<STRING> tokens;
word_res->correct_text[i].split(' ', &tokens);
UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].c_str());
choice->append_unichar_id_space_allocated(char_id,
Expand Down
25 changes: 13 additions & 12 deletions src/ccstruct/boxread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@
#include "rect.h" // for TBOX
#include "tprintf.h" // for tprintf

#include <tesseract/genericvector.h> // for GenericVector
#include <tesseract/helpers.h> // for chomp_string
#include <tesseract/strngs.h> // for STRING
#include <tesseract/unichar.h> // for UNICHAR

#include <cstring> // for strchr, strcmp
#include <fstream> // for std::ifstream
#include <locale> // for std::locale::classic
#include <sstream> // for std::stringstream
#include <string> // for std::string
Expand Down Expand Up @@ -74,12 +74,13 @@ FILE* OpenBoxFile(const char* fname) {
// Each of the output vectors is optional (may be nullptr).
// Returns false if no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
GenericVector<char> box_data;
if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data))
std::vector<TBOX>* boxes,
std::vector<STRING>* texts,
std::vector<STRING>* box_texts,
std::vector<int>* pages) {
std::ifstream input(BoxFileName(filename).c_str(), std::ios::in | std::ios::binary);
std::vector<char> box_data(std::istreambuf_iterator<char>(input), {});
if (box_data.empty())
return false;
// Convert the array of bytes to a string, so it can be used by the parser.
box_data.push_back('\0');
Expand All @@ -91,12 +92,12 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
bool continue_on_failure,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
std::vector<TBOX>* boxes,
std::vector<STRING>* texts,
std::vector<STRING>* box_texts,
std::vector<int>* pages) {
STRING box_str(box_data);
GenericVector<STRING> lines;
std::vector<STRING> lines;
box_str.split('\n', &lines);
if (lines.empty()) return false;
int num_boxes = 0;
Expand Down
19 changes: 8 additions & 11 deletions src/ccstruct/boxread.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@ namespace tesseract {

class TBOX;

template <typename T> class GenericVector;
template <typename T> class GenericVector;

// Size of buffer used to read a line from a box file.
const int kBoxReadBufSize = 1024;

Expand All @@ -45,21 +42,21 @@ FILE* OpenBoxFile(const char* filename);
// Each of the output vectors is optional (may be nullptr).
// Returns false if no boxes are found.
bool ReadAllBoxes(int target_page, bool skip_blanks, const char* filename,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
std::vector<TBOX>* boxes,
std::vector<STRING>* texts,
std::vector<STRING>* box_texts,
std::vector<int>* pages);

// Reads all boxes from the string. Otherwise, as ReadAllBoxes.
// continue_on_failure allows reading to continue even if an invalid box is
// encountered and will return true if it succeeds in reading some boxes.
// It otherwise gives up and returns false on encountering an invalid box.
bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
bool continue_on_failure,
GenericVector<TBOX>* boxes,
GenericVector<STRING>* texts,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
std::vector<TBOX>* boxes,
std::vector<STRING>* texts,
std::vector<STRING>* box_texts,
std::vector<int>* pages);

// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
Expand Down
2 changes: 1 addition & 1 deletion src/ccutil/ambigs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(
if (version > 1) {
// Simpler format is just wrong-string correct-string type\n.
STRING input(buffer);
GenericVector<STRING> fields;
std::vector<STRING> fields;
input.split(' ', &fields);
if (fields.size() != 3) {
if (debug_level) tprintf(kIllegalMsg, line_num);
Expand Down
3 changes: 1 addition & 2 deletions src/ccutil/strngs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

#include "errcode.h" // for ASSERT_HOST

#include <tesseract/genericvector.h> // for GenericVector
#include <tesseract/helpers.h> // for ReverseN
#include <tesseract/serialis.h> // for TFile

Expand Down Expand Up @@ -87,7 +86,7 @@ void STRING::truncate_at(int32_t index) {
resize(index);
}

void STRING::split(const char c, GenericVector<STRING> *splited) {
void STRING::split(const char c, std::vector<STRING> *splited) {
int start_index = 0;
const int len = length();
for (int i = 0; i < len; i++) {
Expand Down
4 changes: 2 additions & 2 deletions src/ccutil/unicharcompress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ using RSCounts = std::unordered_map<int, int>;
static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
return true;
GenericVector<STRING> entries;
std::vector<STRING> entries;
radical_data_line->split(' ', &entries);
if (entries.size() < 2) return false;
char* end = nullptr;
Expand All @@ -71,7 +71,7 @@ static bool DecodeRadicalLine(STRING* radical_data_line, RSMap* radical_map) {
// radical_data is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalTable(STRING* radical_data, RSMap* radical_map) {
GenericVector<STRING> lines;
std::vector<STRING> lines;
radical_data->split('\n', &lines);
for (int i = 0; i < lines.size(); ++i) {
if (!DecodeRadicalLine(&lines[i], radical_map)) {
Expand Down
2 changes: 1 addition & 1 deletion src/classify/adaptmatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ void Classify::LearnWord(const char* fontname, WERD_RES* word) {
word->best_state[ch]);
if (pieces_all_natural || !prioritize_division) {
for (frag = 0; frag < word->best_state[ch]; ++frag) {
GenericVector<STRING> tokens;
std::vector<STRING> tokens;
word->correct_text[ch].split(' ', &tokens);

tokens[0] = CHAR_FRAGMENT::to_string(
Expand Down
14 changes: 6 additions & 8 deletions src/dict/trie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -270,23 +270,21 @@ NODE_REF Trie::new_dawg_node() {
}

// Sort function to sort words by decreasing order of length.
static int sort_strings_by_dec_length(const void* v1, const void* v2) {
const auto *s1 = static_cast<const STRING *>(v1);
const auto *s2 = static_cast<const STRING *>(v2);
return s2->length() - s1->length();
static int sort_strings_by_dec_length(STRING& s1, STRING& s2) {
return s2.length() - s1.length();
}

bool Trie::read_and_add_word_list(const char *filename,
const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy) {
GenericVector<STRING> word_list;
std::vector<STRING> word_list;
if (!read_word_list(filename, &word_list)) return false;
word_list.sort(sort_strings_by_dec_length);
std::sort(word_list.begin(), word_list.end(), sort_strings_by_dec_length);
return add_word_list(word_list, unicharset, reverse_policy);
}

bool Trie::read_word_list(const char *filename,
GenericVector<STRING>* words) {
std::vector<STRING>* words) {
FILE *word_file;
char line_str[CHARS_PER_LINE];
int word_count = 0;
Expand All @@ -308,7 +306,7 @@ bool Trie::read_word_list(const char *filename,
return true;
}

bool Trie::add_word_list(const GenericVector<STRING> &words,
bool Trie::add_word_list(const std::vector<STRING> &words,
const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy) {
for (int i = 0; i < words.size(); ++i) {
Expand Down
4 changes: 2 additions & 2 deletions src/dict/trie.h
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ class Trie : public Dawg {
// Reads a list of words from the given file.
// Returns false on error.
bool read_word_list(const char *filename,
GenericVector<STRING>* words);
std::vector<STRING>* words);
// Adds a list of words previously read using read_word_list to the trie
// using the given unicharset and reverse_policy to convert to unichar-ids.
// Returns false on error.
bool add_word_list(const GenericVector<STRING> &words,
bool add_word_list(const std::vector<STRING> &words,
const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy);

Expand Down
2 changes: 1 addition & 1 deletion src/training/combine_lang_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ int main(int argc, char** argv) {
tesseract::CheckSharedLibraryVersion();
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);

GenericVector<STRING> words, puncs, numbers;
std::vector<STRING> words, puncs, numbers;
// If these reads fail, we get a warning message and an empty list of words.
tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
Expand Down
2 changes: 1 addition & 1 deletion src/training/fileio.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ namespace tesseract {
// Reads a file as a vector of STRING.
// TODO: Use std::string instead of STRING and std::vector<char> instead of
// GenericVector<char> in LoadFileLinesToStrings.
inline bool LoadFileLinesToStrings(const char* filename,
GenericVector<STRING>* lines) {
std::vector<STRING>* lines) {
GenericVector<char> data;
if (!LoadDataFromFile(filename, &data)) {
return false;
Expand Down
14 changes: 7 additions & 7 deletions src/training/lang_model_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,

// Helper builds a dawg from the given words, using the unicharset as coding,
// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
static bool WriteDawg(const GenericVector<STRING>& words,
static bool WriteDawg(const std::vector<STRING>& words,
const UNICHARSET& unicharset,
Trie::RTLReversePolicy reverse_policy,
TessdataType file_type, TessdataManager* traineddata) {
Expand All @@ -144,9 +144,9 @@ static bool WriteDawg(const GenericVector<STRING>& words,
// Builds and writes the dawgs, given a set of words, punctuation
// patterns, number patterns, to the traineddata. Encoding uses the given
// unicharset, and the punc dawgs is reversed if lang_is_rtl.
static bool WriteDawgs(const GenericVector<STRING>& words,
const GenericVector<STRING>& puncs,
const GenericVector<STRING>& numbers, bool lang_is_rtl,
static bool WriteDawgs(const std::vector<STRING>& words,
const std::vector<STRING>& puncs,
const std::vector<STRING>& numbers, bool lang_is_rtl,
const UNICHARSET& unicharset,
TessdataManager* traineddata) {
if (puncs.empty()) {
Expand Down Expand Up @@ -185,9 +185,9 @@ static bool WriteDawgs(const GenericVector<STRING>& words,
int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
const std::string& version_str, const std::string& output_dir,
const std::string& lang, bool pass_through_recoder,
const GenericVector<STRING>& words,
const GenericVector<STRING>& puncs,
const GenericVector<STRING>& numbers, bool lang_is_rtl,
const std::vector<STRING>& words,
const std::vector<STRING>& puncs,
const std::vector<STRING>& numbers, bool lang_is_rtl,
FileReader reader, FileWriter writer) {
// Build the traineddata file.
TessdataManager traineddata;
Expand Down
6 changes: 3 additions & 3 deletions src/training/lang_model_helpers.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ bool WriteRecoder(const UNICHARSET& unicharset, bool pass_through,
int CombineLangModel(const UNICHARSET& unicharset, const std::string& script_dir,
const std::string& version_str, const std::string& output_dir,
const std::string& lang, bool pass_through_recoder,
const GenericVector<STRING>& words,
const GenericVector<STRING>& puncs,
const GenericVector<STRING>& numbers, bool lang_is_rtl,
const std::vector<STRING>& words,
const std::vector<STRING>& puncs,
const std::vector<STRING>& numbers, bool lang_is_rtl,
FileReader reader, FileWriter writer);

} // namespace tesseract
Expand Down
2 changes: 1 addition & 1 deletion src/training/lstmtrainer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1196,7 +1196,7 @@ double LSTMTrainer::ComputeCharError(const GenericVector<int>& truth_str,
// NOTE that this is destructive on both input strings.
double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) {
using StrMap = std::unordered_map<std::string, int, std::hash<std::string>>;
GenericVector<STRING> truth_words, ocr_words;
std::vector<STRING> truth_words, ocr_words;
truth_str->split(' ', &truth_words);
if (truth_words.empty()) return 0.0;
ocr_str->split(' ', &ocr_words);
Expand Down
6 changes: 3 additions & 3 deletions src/training/unicharset_extractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ namespace tesseract {

// Helper normalizes and segments the given strings according to norm_mode, and
// adds the segmented parts to unicharset.
static void AddStringsToUnicharset(const GenericVector<STRING>& strings,
static void AddStringsToUnicharset(const std::vector<STRING>& strings,
int norm_mode, UNICHARSET* unicharset) {
for (int i = 0; i < strings.size(); ++i) {
std::vector<std::string> normalized;
Expand All @@ -68,14 +68,14 @@ static int Main(int argc, char** argv) {
for (int arg = 1; arg < argc; ++arg) {
STRING file_data = tesseract::ReadFile(argv[arg], /*reader*/ nullptr);
if (file_data.length() == 0) continue;
GenericVector<STRING> texts;
std::vector<STRING> texts;
if (ReadMemBoxes(-1, /*skip_blanks*/ true, &file_data[0],
/*continue_on_failure*/ false, /*boxes*/ nullptr,
&texts, /*box_texts*/ nullptr, /*pages*/ nullptr)) {
tprintf("Extracting unicharset from box file %s\n", argv[arg]);
} else {
tprintf("Extracting unicharset from plain text file %s\n", argv[arg]);
texts.truncate(0);
texts.resize(0);
file_data.split('\n', &texts);
}
AddStringsToUnicharset(texts, FLAGS_norm_mode, &unicharset);
Expand Down

0 comments on commit 4a28d33

Please sign in to comment.