Skip to content

Commit

Permalink
unittest: Fix and enable validate_grapheme_test
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Jan 23, 2019
1 parent a702f2d commit d97f67d
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 37 deletions.
4 changes: 4 additions & 0 deletions unittest/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ check_PROGRAMS += commandlineflags_test
check_PROGRAMS += unichar_test
check_PROGRAMS += unicharset_test
check_PROGRAMS += unicharcompress_test
check_PROGRAMS += validate_grapheme_test
check_PROGRAMS += validator_test
endif

Expand Down Expand Up @@ -262,6 +263,9 @@ unicharcompress_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TES
unicharset_test_SOURCES = unicharset_test.cc
unicharset_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

validate_grapheme_test_SOURCES = validate_grapheme_test.cc
validate_grapheme_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_I18N_LIBS) $(ICU_UC_LIBS)

validator_test_SOURCES = validator_test.cc
validator_test_LDADD = $(GTEST_LIBS) $(TRAINING_LIBS) $(TESS_LIBS) $(ICU_UC_LIBS)

Expand Down
85 changes: 48 additions & 37 deletions unittest/validate_grapheme_test.cc
Original file line number Diff line number Diff line change
@@ -1,27 +1,38 @@
#include "tesseract/training/normstrngs.h"
// (C) Copyright 2017, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "tesseract/unittest/normstrngs_test.h"
#include "include_gunit.h"
#include "normstrngs.h"
#include "normstrngs_test.h"

namespace tesseract {
namespace {

TEST(ValidateGraphemeTest, MultipleSyllablesAreNotASingleGrapheme) {
string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
std::vector<string> glyphs;
std::string str = "\u0c15\u0c3f\u0c15\u0c0e"; // KA - dep I - KA - ind E.
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
// It made 3 graphemes.
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[0], string("\u0c15\u0c3f"));
EXPECT_EQ(glyphs[1], string("\u0c15"));
EXPECT_EQ(glyphs[2], string("\u0c0e"));
EXPECT_EQ(glyphs[0], std::string("\u0c15\u0c3f"));
EXPECT_EQ(glyphs[1], std::string("\u0c15"));
EXPECT_EQ(glyphs[2], std::string("\u0c0e"));
}

TEST(ValidateGraphemeTest, SingleConsonantOK) {
string str = "\u0cb9"; // HA
std::vector<string> glyphs;
std::string str = "\u0cb9"; // HA
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -31,8 +42,8 @@ TEST(ValidateGraphemeTest, SingleConsonantOK) {
}

TEST(ValidateGraphemeTest, SimpleCV) {
string str = "\u0cb9\u0cbf"; // HA I
std::vector<string> glyphs;
std::string str = "\u0cb9\u0cbf"; // HA I
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -42,8 +53,8 @@ TEST(ValidateGraphemeTest, SimpleCV) {
}

TEST(ValidateGraphemeTest, SubscriptConjunct) {
string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
std::vector<string> glyphs;
std::string str = "\u0cb9\u0ccd\u0c95\u0cbf"; // HA Virama KA I
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -55,12 +66,12 @@ TEST(ValidateGraphemeTest, SubscriptConjunct) {
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[1], string("\u0ccd\u0c95"));
EXPECT_EQ(glyphs[1], std::string("\u0ccd\u0c95"));
}

TEST(ValidateGraphemeTest, HalfFormJoiner) {
string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
std::vector<string> glyphs;
std::string str = "\u0d15\u0d4d\u200d\u0d24"; // KA Virama ZWJ Ta
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -72,12 +83,12 @@ TEST(ValidateGraphemeTest, HalfFormJoiner) {
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 2) << PrintStringVectorWithUnicodes(glyphs);
EXPECT_EQ(glyphs[0], string("\u0d15\u0d4d\u200d"));
EXPECT_EQ(glyphs[0], std::string("\u0d15\u0d4d\u200d"));
}

TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWJ Virama Ta
std::vector<string> glyphs;
std::string str = "\u0d15\u200d\u0d4d\u0d24"; // KA ZWJ Virama Ta
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -89,12 +100,12 @@ TEST(ValidateGraphemeTest, TraditionalConjunctJoiner) {
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[1], string("\u200d\u0d4d"));
EXPECT_EQ(glyphs[1], std::string("\u200d\u0d4d"));
}

TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
std::vector<string> glyphs;
std::string str = "\u0d15\u200c\u0d4d\u0d24"; // KA ZWNJ Virama Ta
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -106,7 +117,7 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[1], string("\u200c\u0d4d"));
EXPECT_EQ(glyphs[1], std::string("\u200c\u0d4d"));
// Malayalam only, so not allowed in Telugu.
str = "\u0c15\u200c\u0c4d\u0c24"; // KA ZWNJ Virama Ta
EXPECT_FALSE(NormalizeCleanAndSegmentUTF8(
Expand All @@ -116,26 +127,26 @@ TEST(ValidateGraphemeTest, OpenConjunctNonJoiner) {
}

TEST(ValidateGraphemeTest, ExplicitViramaNonJoiner) {
string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
std::vector<string> glyphs;
std::string str = "\u0d15\u0d4d\u200c\u0d24"; // KA Virama ZWNJ Ta
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 2);
EXPECT_EQ(glyphs[1], string("\u0d24"));
EXPECT_EQ(glyphs[1], std::string("\u0d24"));
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kGlyphSplit,
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[1], string("\u0d4d\u200c"));
EXPECT_EQ(glyphs[1], std::string("\u0d4d\u200c"));
}

TEST(ValidateGraphemeTest, ThaiGraphemes) {
// This is a single grapheme unless in glyph split mode
string str = "\u0e14\u0e38\u0e4a";
std::vector<string> glyphs;
std::string str = "\u0e14\u0e38\u0e4a";
std::vector<std::string> glyphs;
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
Expand All @@ -147,23 +158,23 @@ TEST(ValidateGraphemeTest, ThaiGraphemes) {
true, str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 3);
EXPECT_EQ(glyphs[0], string("\u0e14"));
EXPECT_EQ(glyphs[0], std::string("\u0e14"));
}

TEST(ValidateGraphemeTest, NoLonelyJoinersQuote) {
string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
std::vector<string> glyphs;
std::string str = "'\u0d24\u0d23\u0d32\u0d4d'\u200d";
std::vector<std::string> glyphs;
// Returns true, but the joiner is gone.
EXPECT_TRUE(NormalizeCleanAndSegmentUTF8(
UnicodeNormMode::kNFC, OCRNorm::kNone, GraphemeNormMode::kCombined, true,
str.c_str(), &glyphs))
<< PrintString32WithUnicodes(str);
EXPECT_EQ(glyphs.size(), 5);
EXPECT_EQ(glyphs[0], string("'"));
EXPECT_EQ(glyphs[1], string("\u0d24"));
EXPECT_EQ(glyphs[2], string("\u0d23"));
EXPECT_EQ(glyphs[3], string("\u0d32\u0d4d\u200c"));
EXPECT_EQ(glyphs[4], string("'"));
EXPECT_EQ(glyphs[0], std::string("'"));
EXPECT_EQ(glyphs[1], std::string("\u0d24"));
EXPECT_EQ(glyphs[2], std::string("\u0d23"));
EXPECT_EQ(glyphs[3], std::string("\u0d32\u0d4d\u200c"));
EXPECT_EQ(glyphs[4], std::string("'"));
}

} // namespace
Expand Down

0 comments on commit d97f67d

Please sign in to comment.