cataclysmbnteam · scarf005 · Nov 26, 2023 · Oct 21, 2023 · Oct 21, 2023 · Nov 26, 2023
diff --git a/src/catacharset.cpp b/src/catacharset.cpp
@@ -391,7 +391,7 @@ std::wstring utf8_to_wstr( const std::string &str )
 #else
     std::size_t sz = std::mbstowcs( nullptr, str.c_str(), 0 ) + 1;
     std::wstring wstr( sz, '\0' );
-    std::mbstowcs( &wstr[0], str.c_str(), sz );
+    ( void )std::mbstowcs( wstr.data(), str.c_str(), sz );
     strip_trailing_nulls( wstr );
     return wstr;
 #endif
@@ -408,7 +408,7 @@ std::string wstr_to_utf8( const std::wstring &wstr )
 #else
     std::size_t sz = std::wcstombs( nullptr, wstr.c_str(), 0 ) + 1;
     std::string str( sz, '\0' );
-    std::wcstombs( &str[0], wstr.c_str(), sz );
+    ( void )std::wcstombs( str.data(), wstr.c_str(), sz );
     strip_trailing_nulls( str );
     return str;
 #endif

diff --git a/src/string_utils.cpp b/src/string_utils.cpp
@@ -1,36 +1,21 @@
 #include "string_utils.h"
 
+#include <algorithm>
+#include <locale>
+#include <regex>
+#include <sstream>
+
 #include "catacharset.h"
 #include "color.h"
 #include "name.h"
 #include "translations.h"
+#include "string_utils_fuzzy.h"
 
-#include <algorithm>
-#include <locale>
-#include <sstream>
-
-bool lcmatch( const std::string &str, const std::string &qry )
+auto lcmatch( const std::string &str, const std::string &qry ) -> bool
 {
-    auto temp_locale = std::locale{};
-    if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
-        auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
-        std::wstring wneedle = utf8_to_wstr( qry );
-        std::wstring whaystack = utf8_to_wstr( str );
-
-        f.tolower( &whaystack[0], &whaystack[0] + whaystack.size() );
-        f.tolower( &wneedle[0], &wneedle[0] + wneedle.size() );
-
-        return whaystack.find( wneedle ) != std::wstring::npos;
-    }
-    std::string needle;
-    needle.reserve( qry.size() );
-    std::transform( qry.begin(), qry.end(), std::back_inserter( needle ), tolower );
-
-    std::string haystack;
-    haystack.reserve( str.size() );
-    std::transform( str.begin(), str.end(), std::back_inserter( haystack ), tolower );
+    const auto matcher = fuzzy_search( utf8_to_wstr( qry ) ).regex;
 
-    return haystack.find( needle ) != std::string::npos;
+    return std::regex_search( utf8_to_wstr( str ), matcher );
 }
 
 bool lcmatch( const translation &str, const std::string &qry )
@@ -230,7 +215,7 @@ std::string to_upper_case( const std::string &s )
     if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
         const auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
         std::wstring wstr = utf8_to_wstr( s );
-        f.toupper( &wstr[0], &wstr[0] + wstr.size() );
+        f.toupper( wstr.data(), wstr.data() + wstr.size() );
         return wstr_to_utf8( wstr );
     }
     std::string res;
@@ -246,7 +231,7 @@ std::string to_lower_case( const std::string &s )
     if( temp_locale.name() != "en_US.UTF-8" && temp_locale.name() != "C" ) {
         const auto &f = std::use_facet<std::ctype<wchar_t>>( temp_locale );
         std::wstring wstr = utf8_to_wstr( s );
-        f.tolower( &wstr[0], &wstr[0] + wstr.size() );
+        f.tolower( wstr.data(), wstr.data() + wstr.size() );
         return wstr_to_utf8( wstr );
     }
     std::string res;

diff --git a/src/string_utils_fuzzy.cpp b/src/string_utils_fuzzy.cpp
@@ -0,0 +1,143 @@
+#include "string_utils_fuzzy.h"
+
+/// @see https://en.wikipedia.org/wiki/Korean_language_and_computers#Hangul_syllables_block
+///
+/// given a character '감'...
+/// ㄱ = choseong (초성, initial consonant)
+/// ㅏ = jungseong (중성, medial vowel)
+/// ㅁ = jongseong (종성, final consonant)
+///
+/// index = (choseong * choseong_offset + jungseong * jungseong_offset + jongseong) + unicode_offset
+/// 감 = (ㄱ * 588 + ㅏ * 28 + ㅁ) + 44032
+
+namespace
+{
+
+// algorithm translated from https://taegon.kim/archives/9919
+namespace korean
+{
+
+constexpr int offset = L'가';
+constexpr int choseong_offset = 588;
+constexpr int jungseong_offset = 28;
+
+const auto con2syl = std::map<wchar_t, int>
+{
+    {L'ㄱ', L'가'},
+    {L'ㄲ', L'까'},
+    {L'ㄴ', L'나'},
+    {L'ㄷ', L'다'},
+    {L'ㄸ', L'따'},
+    {L'ㄹ', L'라'},
+    {L'ㅁ', L'마'},
+    {L'ㅂ', L'바'},
+    {L'ㅃ', L'빠'},
+    {L'ㅅ', L'사'}
+};
+
+/// check if the character is a korean syllable in unicode.
+auto is_syllable( const wchar_t c ) -> bool
+{
+    return L'가' <= c && c <= L'힣';
+}
+
+/// check if the character is a korean consonant(자음) in unicode.
+auto is_consonant( const wchar_t c ) -> bool
+{
+    return L'ㄱ' <= c && c <= L'ㅎ';
+}
+
+/// check if the character is a korean jongseon(종성, final consonant) in unicode.
+///
+/// @param ch_code the character code with offset (`가`) subtracted.
+auto is_jongseong( const wchar_t ch_code ) -> bool
+{
+    return ch_code % jungseong_offset > 0;
+};
+
+auto syllable_pattern( const wchar_t c ) -> std::wstring
+{
+    const int ch_code = c - offset;
+
+    if( is_jongseong( ch_code ) ) {
+        return std::wstring( 1, c );
+    }
+
+    /// the first character that has same choseong and jungseong with the given character.
+    const int begin = ( ch_code / jungseong_offset ) * jungseong_offset + offset;
+    /// the last character that has same choseong and jungseong with the given character.
+    const int end = begin + ( jungseong_offset - 1 );
+
+    return L"[" + std::wstring( 1, begin ) + L"-" + std::wstring( 1, end ) + L"]";
+}
+
+/// finds all characters that has same choseong with the given character.
+auto consonant_pattern( const wchar_t c ) -> std::wstring
+{
+    const int begin = con2syl.count( c ) ? con2syl.at( c )
+                      : ( L'사' + ( c - L'ㅅ' ) * choseong_offset );
+
+    const int end = begin + ( choseong_offset - 1 );
+
+    return L"["
+           + std::wstring( 1, c ) + std::wstring( 1, begin )
+           + L"-"
+           + std::wstring( 1, end )
+           + L"]";
+}
+
+} // namespace korean
+
+// algorithm translated from https://github.com/lodash/lodash/blob/main/src/escapeRegExp.ts
+auto escape_regex( const wchar_t c ) -> std::wstring
+{
+    constexpr auto escape = std::wstring_view{LR"(\^$.|?*+()[]{})"};
+
+    if( escape.find( c ) == std::wstring::npos ) {
+        return std::wstring( 1, c );
+    }
+    return LR"(\)" + std::wstring( 1, c );
+}
+
+auto into_pattern( const wchar_t c ) -> std::wstring
+{
+    if( korean::is_syllable( c ) ) {
+        return korean::syllable_pattern( c );
+    } else if( korean::is_consonant( c ) ) {
+        return korean::consonant_pattern( c );
+    } else {
+        return escape_regex( c );
+    }
+}
+
+auto fuzzy_pattern( const std::wstring_view s ) -> std::wstring
+{
+    auto xs = std::vector<std::wstring> {};
+    for( const auto &c : s ) {
+        xs.emplace_back( L"(" + into_pattern( c ) + L")" );
+    }
+
+    return wjoin( xs, LR"(.*?)" );
+}
+
+} // namespace
+
+auto wjoin( const std::vector<std::wstring> &xs, const std::wstring_view sep ) -> std::wstring
+{
+    std::wstringstream result;
+
+    for( auto it = xs.begin(); it != prev( xs.end() ); ++it ) {
+        result << *it << sep;
+    }
+    result << xs.back();
+
+    return result.str();
+}
+
+auto fuzzy_search( const std::wstring_view s ) -> cata::WRegExpMatcher
+{
+    const auto pattern = fuzzy_pattern( s );
+    const auto regex = std::wregex( pattern, std::regex_constants::icase );
+
+    return { pattern, regex };
+}
diff --git a/src/string_utils_fuzzy.h b/src/string_utils_fuzzy.h
@@ -0,0 +1,29 @@
+#pragma once
+#ifndef CATA_SRC_STRING_UTILS_FUZZY_H
+#define CATA_SRC_STRING_UTILS_FUZZY_H
+
+#include <regex>
+#include <string_view>
+
+/// join a vector of `std::wstring_view`s into a single string with a delimiter/joiner.
+auto wjoin( const std::vector<std::wstring> &xs, const std::wstring_view sep ) -> std::wstring;
+
+namespace cata
+{
+
+struct WRegExpMatcher {
+    /// text representation of the pattern
+    std::wstring pattern;
+
+    /// case-insensitive regex matcher
+    std::wregex regex;
+};
+
+} // namespace cata
+
+/// creates a fuzzy search regex pattern for a given string.
+/// supports english and korean.
+auto fuzzy_search( const std::wstring_view s ) -> cata::WRegExpMatcher;
+
+#endif // CATA_SRC_STRING_UTILS_FUZZY_H
+
diff --git a/tests/string_fuzzy_search_test.cpp b/tests/string_fuzzy_search_test.cpp
@@ -0,0 +1,49 @@
+#include "catch/catch.hpp"
+
+#include <catacharset.h>
+#include <clocale>
+#include <string_utils_fuzzy.h>
+#include <regex>
+
+namespace
+{
+
+struct Case {
+    std::wstring pattern;
+    std::wstring target;
+    bool expected;
+};
+
+} //namespace
+
+
+TEST_CASE( "fuzzy_search" )
+{
+    SECTION( "korean" ) {
+        ( void )std::setlocale( LC_ALL, "ko_KR.UTF-8" );
+
+        const auto cases = std::vector<Case> {
+            {L"ㅋㅁㅅ", L"크리스마스", true},
+            {L"ㅋㅁㅅ", L"크리스", false},
+            {L"고라", L"골라", true},
+            {L"고라", L"가라", false},
+            {L"군ㄱㅁ", L"군고구마", true},
+            {L"군ㄱㅁ", L"궁고구마", false},
+
+            {L"ㅋㅌㅋ", L"카타클리즘", true},
+            {L"ㅋㅌㅋ", L"타클리즘", false},
+            {L"바밤", L"밝밤", true},
+            {L"바밤", L"붉밤", false},
+            {L"좀ㅂ", L"좀비", true},
+            {L"좀ㅂ", L"존비", false},
+        };
+        for( const auto &c : cases ) {
+            const auto matcher = fuzzy_search( c.pattern );
+
+            INFO( "pattern: " << wstr_to_utf8( c.pattern )
+                  << ", target: " << wstr_to_utf8( c.target )
+                  << ", regex: /" << wstr_to_utf8( matcher.pattern ) << "/" );
+            CHECK( std::regex_search( c.target, matcher.regex ) == c.expected );
+        }
+    }
+}