Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(UI): korean and english fuzzy search #3757

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/catacharset.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,7 @@ std::wstring utf8_to_wstr( const std::string &str )
#else
std::size_t sz = std::mbstowcs( nullptr, str.c_str(), 0 ) + 1;
std::wstring wstr( sz, '\0' );
std::mbstowcs( &wstr[0], str.c_str(), sz );
( void )std::mbstowcs( wstr.data(), str.c_str(), sz );
strip_trailing_nulls( wstr );
return wstr;
#endif
Expand All @@ -408,7 +408,7 @@ std::string wstr_to_utf8( const std::wstring &wstr )
#else
std::size_t sz = std::wcstombs( nullptr, wstr.c_str(), 0 ) + 1;
std::string str( sz, '\0' );
std::wcstombs( &str[0], wstr.c_str(), sz );
( void )std::wcstombs( str.data(), wstr.c_str(), sz );
strip_trailing_nulls( str );
return str;
#endif
Expand Down
37 changes: 11 additions & 26 deletions src/string_utils.cpp
Original file line number Diff line number Diff line change
@@ -1,36 +1,21 @@
#include "string_utils.h"

#include <algorithm>
#include <locale>
#include <regex>
#include <sstream>

#include "catacharset.h"
#include "color.h"
#include "name.h"
#include "translations.h"
#include "string_utils_fuzzy.h"

#include <algorithm>
#include <locale>
#include <sstream>

bool lcmatch( const std::string &str, const std::string &qry )
auto lcmatch( const std::string &str, const std::string &qry ) -> bool
{
auto temp_locale = std::locale{};
if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
std::wstring wneedle = utf8_to_wstr( qry );
std::wstring whaystack = utf8_to_wstr( str );

f.tolower( &whaystack[0], &whaystack[0] + whaystack.size() );
f.tolower( &wneedle[0], &wneedle[0] + wneedle.size() );

return whaystack.find( wneedle ) != std::wstring::npos;
}
std::string needle;
needle.reserve( qry.size() );
std::transform( qry.begin(), qry.end(), std::back_inserter( needle ), tolower );

std::string haystack;
haystack.reserve( str.size() );
std::transform( str.begin(), str.end(), std::back_inserter( haystack ), tolower );
const auto matcher = fuzzy_search( utf8_to_wstr( qry ) ).regex;

return haystack.find( needle ) != std::string::npos;
return std::regex_search( utf8_to_wstr( str ), matcher );
}

bool lcmatch( const translation &str, const std::string &qry )
Expand Down Expand Up @@ -230,7 +215,7 @@ std::string to_upper_case( const std::string &s )
if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
const auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
std::wstring wstr = utf8_to_wstr( s );
f.toupper( &wstr[0], &wstr[0] + wstr.size() );
f.toupper( wstr.data(), wstr.data() + wstr.size() );
return wstr_to_utf8( wstr );
}
std::string res;
Expand All @@ -246,7 +231,7 @@ std::string to_lower_case( const std::string &s )
if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
const auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
std::wstring wstr = utf8_to_wstr( s );
f.tolower( &wstr[0], &wstr[0] + wstr.size() );
f.tolower( wstr.data(), wstr.data() + wstr.size() );
return wstr_to_utf8( wstr );
}
std::string res;
Expand Down
143 changes: 143 additions & 0 deletions src/string_utils_fuzzy.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,143 @@
#include "string_utils_fuzzy.h"

/// @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_syllables_block
///
/// given a character '감'...
/// ㄱ = choseong (초성, initial consonant)
/// ㅏ = jungseong (중성, medial vowel)
/// ㅁ = jongseong (종성, final consonant)
///
/// index = (choseong * choseong_offset + jungseong * jungseong_offset + jongseong) + unicode_offset
/// 감 = (ㄱ * 588 + ㅏ * 28 + ㅁ) + 44032

namespace
{

// algorithm translated from https://taegon.kim/archives/9919
namespace korean
{

constexpr int offset = L'가';
constexpr int choseong_offset = 588;
constexpr int jungseong_offset = 28;

const auto con2syl = std::map<wchar_t, int>
{
{L'ㄱ', L'가'},
{L'ㄲ', L'까'},
{L'ㄴ', L'나'},
{L'ㄷ', L'다'},
{L'ㄸ', L'따'},
{L'ㄹ', L'라'},
{L'ㅁ', L'마'},
{L'ㅂ', L'바'},
{L'ㅃ', L'빠'},
{L'ㅅ', L'사'}
};

/// check if the character is a korean syllable in unicode.
auto is_syllable( const wchar_t c ) -> bool
{
return L'가' <= c && c <= L'힣';
}

/// check if the character is a korean consonant(자음) in unicode.
auto is_consonant( const wchar_t c ) -> bool
{
return L'ㄱ' <= c && c <= L'ㅎ';
}

/// check if the character is a korean jongseon(종성, final consonant) in unicode.
///
/// @param ch_code the character code with offset (`가`) subtracted.
auto is_jongseong( const wchar_t ch_code ) -> bool
{
return ch_code % jungseong_offset > 0;
};

auto syllable_pattern( const wchar_t c ) -> std::wstring
{
const int ch_code = c - offset;

if( is_jongseong( ch_code ) ) {
return std::wstring( 1, c );
}

/// the first character that has same choseong and jungseong with the given character.
const int begin = ( ch_code / jungseong_offset ) * jungseong_offset + offset;
/// the last character that has same choseong and jungseong with the given character.
const int end = begin + ( jungseong_offset - 1 );

return L"[" + std::wstring( 1, begin ) + L"-" + std::wstring( 1, end ) + L"]";
}

/// finds all characters that has same choseong with the given character.
auto consonant_pattern( const wchar_t c ) -> std::wstring
{
const int begin = con2syl.count( c ) ? con2syl.at( c )
: ( L'사' + ( c - L'ㅅ' ) * choseong_offset );

const int end = begin + ( choseong_offset - 1 );

return L"["
+ std::wstring( 1, c ) + std::wstring( 1, begin )
+ L"-"
+ std::wstring( 1, end )
+ L"]";
}

} // namespace korean

// algorithm translated from https://github.com/lodash/lodash/blob/main/src/escapeRegExp.ts
auto escape_regex( const wchar_t c ) -> std::wstring
{
constexpr auto escape = std::wstring_view{LR"(\^$.|?*+()[]{})"};

if( escape.find( c ) == std::wstring::npos ) {
return std::wstring( 1, c );
}
return LR"(\)" + std::wstring( 1, c );
}

auto into_pattern( const wchar_t c ) -> std::wstring
{
if( korean::is_syllable( c ) ) {
return korean::syllable_pattern( c );
} else if( korean::is_consonant( c ) ) {
return korean::consonant_pattern( c );
} else {
return escape_regex( c );
}
}

auto fuzzy_pattern( const std::wstring_view s ) -> std::wstring
{
auto xs = std::vector<std::wstring> {};
for( const auto &c : s ) {
xs.emplace_back( L"(" + into_pattern( c ) + L")" );
}

return wjoin( xs, LR"(.*?)" );
}

} // namespace

auto wjoin( const std::vector<std::wstring> &xs, const std::wstring_view sep ) -> std::wstring
{
std::wstringstream result;

for( auto it = xs.begin(); it != prev( xs.end() ); ++it ) {
result << *it << sep;
}
result << xs.back();

return result.str();
}

auto fuzzy_search( const std::wstring_view s ) -> cata::WRegExpMatcher
{
const auto pattern = fuzzy_pattern( s );
const auto regex = std::wregex( pattern, std::regex_constants::icase );

return { pattern, regex };
}
29 changes: 29 additions & 0 deletions src/string_utils_fuzzy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once
#ifndef CATA_SRC_STRING_UTILS_FUZZY_H
#define CATA_SRC_STRING_UTILS_FUZZY_H

#include <regex>
#include <string_view>

/// join a vector of `std::wstring_view`s into a single string with a delimiter/joiner.
auto wjoin( const std::vector<std::wstring> &xs, const std::wstring_view sep ) -> std::wstring;

namespace cata
{

struct WRegExpMatcher {
/// text representation of the pattern
std::wstring pattern;

/// case-insensitive regex matcher
std::wregex regex;
};

} // namespace cata

/// creates a fuzzy search regex pattern for a given string.
/// supports english and korean.
auto fuzzy_search( const std::wstring_view s ) -> cata::WRegExpMatcher;

#endif // CATA_SRC_STRING_UTILS_FUZZY_H

49 changes: 49 additions & 0 deletions tests/string_fuzzy_search_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
#include "catch/catch.hpp"

#include <catacharset.h>
#include <clocale>
#include <string_utils_fuzzy.h>
#include <regex>

namespace
{

struct Case {
std::wstring pattern;
std::wstring target;
bool expected;
};

} //namespace


TEST_CASE( "fuzzy_search" )
{
SECTION( "korean" ) {
( void )std::setlocale( LC_ALL, "ko_KR.UTF-8" );

const auto cases = std::vector<Case> {
{L"ㅋㅁㅅ", L"크리스마스", true},
{L"ㅋㅁㅅ", L"크리스", false},
{L"고라", L"골라", true},
{L"고라", L"가라", false},
{L"군ㄱㅁ", L"군고구마", true},
{L"군ㄱㅁ", L"궁고구마", false},

{L"ㅋㅌㅋ", L"카타클리즘", true},
{L"ㅋㅌㅋ", L"타클리즘", false},
{L"바밤", L"밝밤", true},
{L"바밤", L"붉밤", false},
{L"좀ㅂ", L"좀비", true},
{L"좀ㅂ", L"존비", false},
};
for( const auto &c : cases ) {
const auto matcher = fuzzy_search( c.pattern );

INFO( "pattern: " << wstr_to_utf8( c.pattern )
<< ", target: " << wstr_to_utf8( c.target )
<< ", regex: /" << wstr_to_utf8( matcher.pattern ) << "/" );
CHECK( std::regex_search( c.target, matcher.regex ) == c.expected );
}
}
}
Loading