Skip to content

Commit

Permalink
Updated tessdatamanager/combine_tessdata to give more functionality
Browse files Browse the repository at this point in the history
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20
  • Loading branch information
theraysmith committed May 20, 2010
1 parent a5b4570 commit 45aacc0
Show file tree
Hide file tree
Showing 3 changed files with 339 additions and 123 deletions.
259 changes: 155 additions & 104 deletions ccutil/tessdatamanager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,139 +65,190 @@ void TessdataManager::Init(const char *data_file_name) {
}
}

FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
const char *file_suffix, bool required_file,
bool text_file) {
STRING file_name = language_data_path_prefix;
file_name += file_suffix;
FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
if (required_file && (file_ptr == NULL)) {
tprintf("Error openning required file %s\n", file_name.string());
exit(1);
}
return file_ptr;
}

void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
bool newline_end) {
bool newline_end, inT64 num_bytes_to_copy) {
if (num_bytes_to_copy == 0) return;
int buffer_size = 1024;
if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
buffer_size = num_bytes_to_copy;
}
inT64 num_bytes_copied = 0;
char *chunk = new char[buffer_size];
int bytes_read;
char last_char = 0x0;
while ((bytes_read = fread(chunk, sizeof(char),
buffer_size, input_file))) {
fwrite(chunk, sizeof(char), bytes_read, output_file);
last_char = chunk[bytes_read-1];
if (num_bytes_to_copy > 0) {
num_bytes_copied += bytes_read;
if (num_bytes_copied == num_bytes_to_copy) break;
if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
buffer_size = num_bytes_to_copy - num_bytes_copied;
}
}
}
if (newline_end) ASSERT_HOST(last_char == '\n');
delete[] chunk;
}

void TessdataManager::CombineDataFiles(
void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) {
fseek(output_file, 0, SEEK_SET);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
fwrite(&num_entries, sizeof(inT32), 1, output_file);
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
fclose(output_file);

tprintf("TessdataManager combined tesseract data files.\n");
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
}
}

bool TessdataManager::CombineDataFiles(
const char *language_data_path_prefix,
const char *output_filename) {
FILE *file_ptr;
STRING file_name;
int i;
inT64 offset_table[TESSDATA_NUM_ENTRIES];
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
FILE *output_file = fopen(output_filename, "wb");
if (output_file == NULL) {
tprintf("Error opening %s for writing\n", output_filename);
return false;
}
// Leave some space for recording the offset_table.
fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);

// Record language-specific tesseract config file.
file_ptr = GetFilePtr(language_data_path_prefix,
kLangConfigFileSuffix, false, true);
if (file_ptr != NULL) {
offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}

// Record unicharset.
file_ptr = GetFilePtr(language_data_path_prefix,
kUnicharsetFileSuffix, true, true);
offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);

// Record ambiguities.
file_ptr = GetFilePtr(language_data_path_prefix,
kAmbigsFileSuffix, false, true);
if (file_ptr != NULL) {
offset_table[TESSDATA_AMBIGS] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}

// Record inttemp.
file_ptr =
GetFilePtr(language_data_path_prefix,
kBuiltInTemplatesFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_INTTEMP] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);

// Record pffmtable.
file_ptr = GetFilePtr(language_data_path_prefix,
kBuiltInCutoffsFileSuffix, true, true);
offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);

// Record normproto.
file_ptr = GetFilePtr(language_data_path_prefix,
kNormProtoFileSuffix, true, true);
offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
CopyFile(file_ptr, output_file, true);
fclose(file_ptr);
}

// Record dawgs.
file_ptr = GetFilePtr(language_data_path_prefix,
kPuncDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}

file_ptr = GetFilePtr(language_data_path_prefix,
kSystemDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}

file_ptr = GetFilePtr(language_data_path_prefix,
kNumberDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
}

file_ptr = GetFilePtr(language_data_path_prefix,
kFreqDawgFileSuffix, false, false);
if (file_ptr != NULL) {
offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
CopyFile(file_ptr, output_file, false);
fclose(file_ptr);
TessdataType type;
bool text_file;
FILE *file_ptr[TESSDATA_NUM_ENTRIES];

// Load individual tessdata components from files.
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
ASSERT_HOST(TessdataTypeFromFileSuffix(
kTessdataFileSuffixes[i], &type, &text_file));
STRING filename = language_data_path_prefix;
filename += kTessdataFileSuffixes[i];
file_ptr[i] = fopen(filename.string(), text_file ? "r" : "rb");
if (file_ptr[i] != NULL) {
offset_table[type] = ftell(output_file);
CopyFile(file_ptr[i], output_file, text_file, -1);
fclose(file_ptr[i]);
}
}

fseek(output_file, 0, SEEK_SET);
inT32 num_entries = TESSDATA_NUM_ENTRIES;
fwrite(&num_entries, sizeof(inT32), 1, output_file);
fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, output_file);
fclose(output_file);
// Make sure that the required components are present.
if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
tprintf("Error opening unicharset file\n");
fclose(output_file);
return false;
}
if (file_ptr[TESSDATA_INTTEMP] != NULL &&
(file_ptr[TESSDATA_PFFMTABLE] == NULL ||
file_ptr[TESSDATA_NORMPROTO] == NULL)) {
tprintf("Error opening pffmtable and/or normproto files"
" while inttemp file was present\n");
fclose(output_file);
return false;
}

tprintf("TessdataManager combined tesseract data files.\n");
WriteMetadata(offset_table, output_file);
return true;
}

bool TessdataManager::OverwriteComponents(
const char *new_traineddata_filename,
char **component_filenames,
int num_new_components) {
int i;
inT64 offset_table[TESSDATA_NUM_ENTRIES];
TessdataType type;
bool text_file;
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
offset_table[i] = -1;
file_ptr[i] = NULL;
}
FILE *output_file = fopen(new_traineddata_filename, "wb");
if (output_file == NULL) {
tprintf("Error opening %s for writing\n", new_traineddata_filename);
return false;
}

// Leave some space for recording the offset_table.
fseek(output_file,
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);

// Open the files with the new components.
for (i = 0; i < num_new_components; ++i) {
TessdataTypeFromFileName(component_filenames[i], &type, &text_file);
file_ptr[type] = fopen(component_filenames[i], text_file ? "r" : "rb");
}

// Write updated data to the output traineddata file.
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
if (file_ptr[i] != NULL) {
// Get the data from the opened component file.
offset_table[i] = ftell(output_file);
CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
fclose(file_ptr[i]);
} else {
// Get this data component from the loaded data file.
if (SeekToStart(static_cast<TessdataType>(i))) {
offset_table[i] = ftell(output_file);
CopyFile(data_file_, output_file, kTessdataFileIsText[i],
GetEndOffset(static_cast<TessdataType>(i)) -
ftell(data_file_) + 1);
}
}
}

WriteMetadata(offset_table, output_file);
return true;
}

// Looks up suffix in kTessdataFileSuffixes. On an exact match at index i,
// stores i (as a TessdataType) in *type and kTessdataFileIsText[i] in
// *text_file and returns true. If no entry matches, prints a message and
// returns false, leaving *type and *text_file untouched.
bool TessdataManager::TessdataTypeFromFileSuffix(
    const char *suffix, TessdataType *type, bool *text_file) {
  // Advance to the first table entry equal to suffix (or past the end).
  int index = 0;
  while (index < TESSDATA_NUM_ENTRIES &&
         strcmp(kTessdataFileSuffixes[index], suffix) != 0) {
    ++index;
  }

  if (index == TESSDATA_NUM_ENTRIES) {
    printf("TessdataManager can't determine which tessdata"
           " component is represented by %s\n", suffix);
    return false;
  }

  *type = static_cast<TessdataType>(index);
  *text_file = kTessdataFileIsText[index];
  return true;
}

// Determines the tessdata component type from the extension of filename,
// i.e. the text following the last '.' in filename. Returns false if
// filename contains no '.' or nothing follows the last '.'; otherwise
// returns the result of TessdataTypeFromFileSuffix() for that extension.
bool TessdataManager::TessdataTypeFromFileName(
    const char *filename, TessdataType *type, bool *text_file) {
  const char *last_dot = strrchr(filename, '.');
  if (last_dot == NULL) return false;

  // The suffix is whatever follows the final dot; it must not be empty.
  const char *extension = last_dot + 1;
  if (*extension == '\0') return false;

  return TessdataTypeFromFileSuffix(extension, type, text_file);
}

bool TessdataManager::ExtractToFile(const char *filename) {
TessdataType type;
bool text_file;
ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
filename, &type, &text_file));
if (!SeekToStart(type)) return false;

FILE *output_file = fopen(filename, "wb");
if (output_file == NULL) {
printf("Error openning %s\n", filename);
exit(1);
}
inT64 begin_offset = ftell(GetDataFilePtr());
inT64 end_offset = GetEndOffset(type);
tesseract::TessdataManager::CopyFile(
GetDataFilePtr(), output_file, text_file,
end_offset - begin_offset + 1);
fclose(output_file);
return true;
}

} // namespace tesseract
Loading

0 comments on commit 45aacc0

Please sign in to comment.