Skip to content

Commit

Permalink
Merge branch 'master' of github.com:egorpugin/tesseract
Browse files Browse the repository at this point in the history
  • Loading branch information
egorpugin committed Oct 5, 2015
2 parents 93d1c66 + f369585 commit a614edb
Show file tree
Hide file tree
Showing 83 changed files with 462 additions and 356 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ training/wordlist2dawg
*.o
*.Plo
*.a
*.class
*.jar

# tessdata
*.cube.*
Expand Down
45 changes: 45 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
# Travis CI configuration: builds Leptonica from source, then configures and
# builds Tesseract with CMake on Linux and OS X workers.
language: cpp

notifications:
  email: false

sudo: required

os:
  - linux
  - osx

branches:
  only:
    - master

# GCC 4.8 packages for the Linux workers; they are selected through CXX/CC at
# the end of the install phase.
addons:
  apt:
    sources:
      - ubuntu-toolchain-r-test
    packages:
      - gcc-4.8
      - g++-4.8

before_install:
  # Per-OS shell flags used by the later steps.
  - if [[ $TRAVIS_OS_NAME == linux ]]; then LINUX=true; fi
  - if [[ $TRAVIS_OS_NAME == osx ]]; then OSX=true; fi

  - if [[ $OSX ]]; then brew update; fi

install:
  - if [[ $OSX ]]; then brew install icu4c pango; brew link --force gettext; fi
  - if [[ $OSX ]]; then export ICU_ROOT=/usr/local/opt/icu4c ; fi
  # The self-extracting CMake installer contains Linux x86_64 binaries, so it
  # is only fetched and run on Linux workers. It is downloaded over HTTPS
  # because it is executed with sudo.
  - if [[ $LINUX ]]; then wget https://cmake.org/files/v3.3/cmake-3.3.1-Linux-x86_64.sh; fi
  - if [[ $LINUX ]]; then sudo sh cmake-3.3.1-Linux-x86_64.sh --skip-license --prefix=/usr; fi
  # Build Leptonica in-tree; its build directory is handed to Tesseract below.
  - wget -O leptonica.zip https://github.com/egorpugin/leptonica/archive/master.zip
  - unzip leptonica.zip -d .
  - cmake -Hleptonica-master -Bleptonica-master/build
  - make -C leptonica-master/build
  - if [[ $LINUX && "$CXX" = "g++" ]]; then export CXX="g++-4.8" CC="gcc-4.8"; fi

script:
  - mkdir build
  - cd build
  # The working directory is now build/, so the Leptonica build tree is one
  # level up. CMake resolves a relative Leptonica_DIR against the current
  # working directory.
  - cmake .. -DLeptonica_DIR=../leptonica-master/build
  - make
19 changes: 9 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
#
###############################################################################

cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 2.8.11)

# In-source builds are disabled.
if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
Expand Down Expand Up @@ -47,15 +47,8 @@ set(VERSION_PLAIN ${VERSION_MAJOR}.${VERSION_MINOR})
find_package(Leptonica 1.71 REQUIRED)

find_package(ICU COMPONENTS uc i18n)
find_package(PkgConfig QUIET)
pkg_check_modules(Pango pango)
pkg_check_modules(Cairo cairo)
pkg_check_modules(PangoFt2 pangoft2)
pkg_check_modules(PangoCairo pangocairo)
pkg_check_modules(FontConfig fontconfig)

include_directories(${Pango_INCLUDE_DIRS})
include_directories(${Cairo_INCLUDE_DIRS})
find_package(OpenCL QUIET)
find_package(PkgConfig)

###############################################################################
#
Expand All @@ -80,6 +73,10 @@ if (WIN32)
set(LIB_Ws2_32 Ws2_32)
endif()

if (CYGWIN)
add_definitions(-D__CYGWIN__)
endif()

if (UNIX)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --std=c++11")

Expand Down Expand Up @@ -190,7 +187,9 @@ set(tesseract_src ${tesseract_src}
)

add_library (tesseract ${LIBRARY_TYPE} ${tesseract_src} ${tesseract_hdr})
if (NOT STATIC)
target_compile_definitions (tesseract PUBLIC -DTESS_EXPORTS)
endif()
target_link_libraries (tesseract ${Leptonica_LIBRARIES} ${LIB_Ws2_32} ${LIB_pthread})
set_target_properties (tesseract PROPERTIES OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR})
set_target_properties (tesseract PROPERTIES DEBUG_OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}d)
Expand Down
2 changes: 1 addition & 1 deletion COPYING
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
This package contains the Tesseract Open Source OCR Engine.
Orignally developed at Hewlett Packard Laboratories Bristol and
Originally developed at Hewlett Packard Laboratories Bristol and
at Hewlett Packard Co, Greeley Colorado, all the code
in this distribution is now licensed under the Apache License:

Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
[![Build Status](https://travis-ci.org/egorpugin/tesseract.svg?branch=master)](https://travis-ci.org/egorpugin/tesseract)
[![Build status](https://ci.appveyor.com/api/projects/status/34s8gu4md3i9s93k?svg=true)](https://ci.appveyor.com/project/egorpugin/tesseract)

Note that this is possibly an out-of-date version of the wiki ReadMe,
which is located at:

Expand Down Expand Up @@ -97,7 +100,7 @@ find its data directory. You must either:
./autogen.sh
./configure
make
make install
sudo make install
sudo ldconfig

to move the data files to the standard place, or:
Expand Down
2 changes: 1 addition & 1 deletion api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1660,7 +1660,7 @@ char* TessBaseAPI::GetUNLVText() {
word->word->space() > 0 &&
!word->word->flag(W_FUZZY_NON) &&
!word->word->flag(W_FUZZY_SP)) {
/* Write a space to separate from preceeding good text */
/* Write a space to separate from preceding good text */
*ptr++ = ' ';
last_char_was_tilde = false;
}
Expand Down
2 changes: 1 addition & 1 deletion api/pdfrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ void TessPDFRenderer::AppendPDFObject(const char *data) {
AppendString((const char *)data);
}

// Helper function to prevent us from accidentaly writing
// Helper function to prevent us from accidentally writing
// scientific notation to an HOCR or PDF file. Besides, three
// decimal points are all you really need.
double prec(double x) {
Expand Down
2 changes: 1 addition & 1 deletion api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ int main(int argc, char **argv) {
}

// We have 2 possible sources of pagesegmode: a config file and
// the command line. For backwards compatability reasons, the
// the command line. For backwards compatibility reasons, the
// default in tesseract is tesseract::PSM_SINGLE_BLOCK, but the
// default for this program is tesseract::PSM_AUTO. We will let
// the config file take priority, so the command-line default
Expand Down
24 changes: 24 additions & 0 deletions appveyor.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# AppVeyor configuration: builds Leptonica, then a static Tesseract, with the
# Visual Studio 14 generators for 32-bit and 64-bit targets.
os: Visual Studio 2015

platform:
  - Win32
  - Win64

configuration:
  - Debug

before_build:
  # Map the matrix platform onto the CMake generator name and the MSBuild
  # platform name.
  - if %platform%==Win32 set generator=Visual Studio 14
  - if %platform%==Win64 set generator=Visual Studio 14 Win64
  - if %platform%==Win32 set vcplatform=Win32
  - if %platform%==Win64 set vcplatform=x64
  # Fetch and build Leptonica in-tree; its build directory is handed to
  # Tesseract below.
  - ps: Start-FileDownload 'https://github.com/egorpugin/leptonica/archive/master.zip' -FileName leptonica.zip
  - 7z x leptonica.zip
  - cmake -Hleptonica-master -Bleptonica-master/build -G "%generator%"
  - msbuild leptonica-master/build/leptonica.sln /p:Platform=%vcplatform% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"

build_script:
  - mkdir build
  - cd build
  # The working directory is now build/, so the Leptonica build tree is one
  # level up. CMake resolves a relative Leptonica_DIR against the current
  # working directory.
  - cmake .. -G "%generator%" -DLeptonica_DIR=../leptonica-master/build -DSTATIC=1
  - msbuild tesseract.sln /p:Platform=%vcplatform% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll"
4 changes: 2 additions & 2 deletions ccmain/control.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1556,7 +1556,7 @@ void Tesseract::match_word_pass_n(int pass_n, WERD_RES *word,
word->fix_quotes();
if (tessedit_fix_hyphens)
word->fix_hyphens();
/* Dont trust fix_quotes! - though I think I've fixed the bug */
/* Don't trust fix_quotes! - though I think I've fixed the bug */
if (word->best_choice->length() != word->box_word->length()) {
tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
" #Blobs=%d\n",
Expand Down Expand Up @@ -1694,7 +1694,7 @@ ACCEPTABLE_WERD_TYPE Tesseract::acceptable_word_string(
goto not_a_word;
/*
Allow a single hyphen in a lower case word
- dont trust upper case - I've seen several cases of "H" -> "I-I"
- don't trust upper case - I've seen several cases of "H" -> "I-I"
*/
if (lengths[i] == 1 && s[offset] == '-') {
hyphen_pos = i;
Expand Down
6 changes: 3 additions & 3 deletions ccmain/docqual.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
int expected_outline_count;

if (STRING (outlines_odd).contains (c))
return 0; //Dont use this char
return 0; //Don't use this char
else if (STRING (outlines_2).contains (c))
expected_outline_count = 2;
else
Expand Down Expand Up @@ -157,7 +157,7 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
* - Word segmentation is the same as the original image
* - All characters have the expected number of outlines
* NOTE - the rejection counts are recalculated after unrejection
* - CANT do it in a single pass without a bit of fiddling
* - CAN'T do it in a single pass without a bit of fiddling
* - keep it simple but inefficient
*************************************************************************/
void Tesseract::unrej_good_quality_words( //unreject potential
Expand Down Expand Up @@ -403,7 +403,7 @@ void Tesseract::doc_and_block_rejection( //reject big chunks

/*************************************************************************
* reject_whole_page()
* Dont believe any of it - set the reject map to 00..00 in all words
* Don't believe any of it - set the reject map to 00..00 in all words
*
*************************************************************************/

Expand Down
10 changes: 5 additions & 5 deletions ccmain/fixspace.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ void Tesseract::fix_fuzzy_spaces(ETEXT_DESC *monitor,
WERD_RES *word_res;
WERD_RES_LIST fuzzy_space_words;
inT16 new_length;
BOOL8 prevent_null_wd_fixsp; // DONT process blobless wds
BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
inT32 word_index; // current word

block_res_it.set_to_list(&page_res->block_res_list);
Expand Down Expand Up @@ -222,7 +222,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
* fuzzy spaces. The problem with the basic measure is that "561 63" would score
* the same as "56163", though given our knowledge that the space is fuzzy, and
* that there is a "1" next to the fuzzy space, we need to ensure that "56163"
* is prefered.
* is preferred.
*
* The solution is to NOT COUNT the score of any word which has a digit at one
* end and a "1Il" as the character the other side of the space.
Expand Down Expand Up @@ -272,8 +272,8 @@ inT16 Tesseract::eval_word_spacing(WERD_RES_LIST &word_res_list) {
} else {
/*
Can we add the prev word score and potentially count this word?
Yes IF it didnt end in a 1 when the first char of this word is a digit
AND it didnt end in a digit when the first char of this word is a 1
Yes IF it didn't end in a 1 when the first char of this word is a digit
AND it didn't end in a digit when the first char of this word is a 1
*/
word_len = word->reject_map.length();
current_word_ok_so_far = FALSE;
Expand Down Expand Up @@ -507,7 +507,7 @@ BOOL8 Tesseract::fixspace_thinks_word_done(WERD_RES *word) {

/*
Use all the standard pass 2 conditions for mode 5 in set_done() in
reject.c BUT DONT REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DONT
reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
CARE WHETHER WE HAVE of/at on/an etc.
*/
if (fixsp_done_mode > 0 &&
Expand Down
4 changes: 2 additions & 2 deletions ccmain/output.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -297,7 +297,7 @@ UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
/*************************************************************************
* SUSPECT LEVELS
*
* 0 - dont reject ANYTHING
* 0 - don't reject ANYTHING
* 1,2 - partial rejection
* 3 - BEST
*
Expand Down Expand Up @@ -337,7 +337,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) {
rating_per_ch = word.rating() / word_res->reject_map.length();

if (rating_per_ch >= suspect_rating_per_ch)
return; //Dont touch bad ratings
return; //Don't touch bad ratings

if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
/* Unreject any Tess Acceptable word - but NOT tess reject chs*/
Expand Down
4 changes: 2 additions & 2 deletions ccmain/paramsd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -329,13 +329,13 @@ void ParamsEditor::WriteParams(char *filename,
fclose(fp);
sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename);
int a = sv_window_->ShowYesNoDialog(msg_str);
if (a == 'n') { return; } // dont write
if (a == 'n') { return; } // don't write
}


fp = fopen (filename, "wb"); // can we write to it?
if (fp == NULL) {
sv_window_->AddMessage("Cant write to file " "%s" "", filename);
sv_window_->AddMessage("Can't write to file " "%s" "", filename);
return;
}

Expand Down
4 changes: 2 additions & 2 deletions ccmain/reject.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -521,7 +521,7 @@ BOOL8 Tesseract::word_contains_non_1_digit(const char *word,

/*************************************************************************
* dont_allow_1Il()
* Dont unreject LONE accepted 1Il conflict set chars
* Don't unreject LONE accepted 1Il conflict set chars
*************************************************************************/
void Tesseract::dont_allow_1Il(WERD_RES *word) {
int i = 0;
Expand Down Expand Up @@ -633,7 +633,7 @@ void Tesseract::flip_hyphens(WERD_RES *word_res) {
next_left = 9999;
else
next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
// Dont touch small or touching blobs - it is too dangerous.
// Don't touch small or touching blobs - it is too dangerous.
if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
(out_box.left() > prev_right) && (out_box.right() < next_left)) {
aspect_ratio = out_box.width() / (float) out_box.height();
Expand Down
20 changes: 10 additions & 10 deletions ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ Tesseract::Tesseract()
BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
"Try to improve fuzzy spaces", this->params()),
BOOL_MEMBER(tessedit_unrej_any_wd, false,
"Dont bother with word plausibility", this->params()),
"Don't bother with word plausibility", this->params()),
BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
this->params()),
BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
Expand Down Expand Up @@ -310,19 +310,19 @@ Tesseract::Tesseract()
this->params()),
INT_MEMBER(crunch_pot_indicators, 1,
"How many potential indicators needed", this->params()),
BOOL_MEMBER(crunch_leave_ok_strings, true, "Dont touch sensible strings",
BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
this->params()),
BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
this->params()),
BOOL_MEMBER(crunch_leave_accept_strings, false,
"Dont pot crunch sensible strings", this->params()),
"Don't pot crunch sensible strings", this->params()),
BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
this->params()),
INT_MEMBER(crunch_leave_lc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_leave_uc_strings, 4,
"Dont crunch words with long lower case strings",
"Don't crunch words with long lower case strings",
this->params()),
INT_MEMBER(crunch_long_repetitions, 3,
"Crunch words with long repetitions", this->params()),
Expand Down Expand Up @@ -393,21 +393,21 @@ Tesseract::Tesseract()
INT_MEMBER(suspect_space_level, 100,
"Min suspect level for rejecting spaces", this->params()),
INT_MEMBER(suspect_short_words, 2,
"Dont Suspect dict wds longer than this", this->params()),
"Don't suspect dict wds longer than this", this->params()),
BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
this->params()),
double_MEMBER(suspect_rating_per_ch, 999.9, "Dont touch bad rating limit",
double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit",
this->params()),
double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
this->params()),
BOOL_MEMBER(tessedit_minimal_rejection, false,
"Only reject tess failures", this->params()),
BOOL_MEMBER(tessedit_zero_rejection, false, "Dont reject ANYTHING",
BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
this->params()),
BOOL_MEMBER(tessedit_word_for_word, false,
"Make output have exactly one word per WERD", this->params()),
BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
"Dont reject ANYTHING AT ALL", this->params()),
"Don't reject ANYTHING AT ALL", this->params()),
BOOL_MEMBER(tessedit_consistent_reps, true,
"Force all rep chars the same", this->params()),
INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
Expand All @@ -424,7 +424,7 @@ Tesseract::Tesseract()
"Use DOC dawg in 11l conf. detector", this->params()),
BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
this->params()),
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Dont double check",
BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
this->params()),
BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
this->params()),
Expand Down
Loading

0 comments on commit a614edb

Please sign in to comment.