From a211d71344e5a3647e9f1730514bb522f33b76b4 Mon Sep 17 00:00:00 2001 From: Sergey Slepov Date: Mon, 8 Jan 2024 23:15:22 +0800 Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?= =?UTF-8?q?=D0=BD=D1=8B=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BD=D0=BE=D1=81=D1=8B?= =?UTF-8?q?=20=D1=81=D0=BB=D0=BE=D0=B2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- HtmlProcessor/HtmlProcessor.csproj | 4 ++ HtmlProcessor/Tokenizer.cs | 77 ++++++++++++++++++++++++++++++ gramdictru/Startup.cs | 14 +++++- gramdictru/gramdictru.csproj | 1 + 4 files changed, 95 insertions(+), 1 deletion(-) create mode 100644 HtmlProcessor/Tokenizer.cs diff --git a/HtmlProcessor/HtmlProcessor.csproj b/HtmlProcessor/HtmlProcessor.csproj index c7f89b4..15174a6 100644 --- a/HtmlProcessor/HtmlProcessor.csproj +++ b/HtmlProcessor/HtmlProcessor.csproj @@ -7,4 +7,8 @@ 10 + + + + diff --git a/HtmlProcessor/Tokenizer.cs b/HtmlProcessor/Tokenizer.cs new file mode 100644 index 0000000..56b3c72 --- /dev/null +++ b/HtmlProcessor/Tokenizer.cs @@ -0,0 +1,77 @@ +ï»¿using System.Collections.Generic; +using System.Globalization; + +namespace gramdictru +{ + public static class Tokenizer + { + ///

+ /// Разбивает текст на части - токены. + ///

+        /// <param name="text">The text to split into tokens.</param>
+        /// <param name="stressMark">Stress mark; its first character never starts a new token. Null or empty means there is no stress mark.</param>
+        /// <returns>The sequence of tokens.</returns>
+        /// <remarks>
+        /// Tokens are of two kinds: words and separators.
+        /// The first token is always a separator, even if it is empty.
+        /// After that, words and separators alternate,
+        /// so there is always one more separator than there are words.
+        /// If two adjacent words are of different kinds (for example, a Russian word
+        /// immediately followed by a Chinese one, or letters followed by digits),
+        /// an empty separator is inserted between them.
+        /// See the tests.
+        /// </remarks>
+        public static IEnumerable<string> Tokenize(string text, string stressMark = "\x301")
+        {
+            const BroadType whitespaceType = BroadType.Whitespace;
+
+            // Indexing stressMark[0] on a null or empty string would throw, so check once up front.
+            bool hasStressMark = !string.IsNullOrEmpty(stressMark);
+
+            int tokenStart = 0;
+            BroadType type = whitespaceType;
+
+            for (int i = 0; i < text.Length; i++)
+            {
+                char c = text[i];
+
+                // The stress mark is not classified: it stays inside whatever token is current.
+                if (hasStressMark && c == stressMark[0])
+                    continue;
+
+                BroadType newType = CharUnicodeInfo.GetUnicodeCategory(c) switch
+                {
+                    UnicodeCategory.LowercaseLetter => BroadType.Letter,
+                    UnicodeCategory.UppercaseLetter => BroadType.Letter,
+                    UnicodeCategory.SpaceSeparator => BroadType.Whitespace,
+                    UnicodeCategory.DecimalDigitNumber => BroadType.Number,
+                    UnicodeCategory.OtherLetter => BroadType.OtherLetter,
+                    _ => BroadType.Whitespace // every other category also counts as separator text
+                };
+
+                if (newType == type) continue;
+
+                yield return text[tokenStart..i];
+                // Two words of different kinds touch: an empty separator keeps the alternation.
+                if (type != whitespaceType && newType != whitespaceType)
+                    yield return "";
+
+                tokenStart = i;
+                type = newType;
+            }
+
+            yield return text[tokenStart..];
+            // The text ended with a word, so the trailing separator is empty.
+            if (type != whitespaceType)
+                yield return "";
+        }
+
+        enum BroadType
+        {
+            Whitespace,
+            Letter,
+            OtherLetter, 
// Chinese characters and stuff
+            Number
+        }
+    }
+}
diff --git a/gramdictru/Startup.cs b/gramdictru/Startup.cs
index c7f55a2..918ccb1 100644
--- a/gramdictru/Startup.cs
+++ b/gramdictru/Startup.cs
@@ -84,13 +84,25 @@ private static async Task PostProcessHtml(HttpContext context, Func next)
             memoryStream.Seek(0, SeekOrigin.Begin);
             string body = await new StreamReader(memoryStream).ReadToEndAsync();
-            string newBody = PageHtmlPostProcessor.AddNoWrap(body);
+            string hyphenated = Hyphenate(body);
+            string newBody = PageHtmlPostProcessor.AddNoWrap(hyphenated);
             byte[] bytes = Encoding.UTF8.GetBytes(newBody);
             response.ContentLength = bytes.Length;
             await oldStream.WriteAsync(bytes, 0, bytes.Length);
             response.Body = oldStream;
         }
+        /// <summary>Passes every non-empty token of <paramref name="body"/> to BasicHyphenator with U+00AD (soft hyphen) and concatenates the results.</summary>
+        private static string Hyphenate(string body)
+        {
+            const char softHyphen = '\u00AD';
+            var hyphenatedTokens =
+                from token in Tokenizer.Tokenize(body)
+                where !string.IsNullOrEmpty(token)
+                select Morpher.Russian.BasicHyphenator.Hyphenate(token, softHyphen);
+            return string.Concat(hyphenatedTokens);
+        }
+
         static readonly HttpClient ApiClient = new HttpClient()
         {
             BaseAddress = new Uri("https://api.gramdict.ru")
diff --git a/gramdictru/gramdictru.csproj b/gramdictru/gramdictru.csproj
index 1feea9c..74eb5d9 100644
--- a/gramdictru/gramdictru.csproj
+++ b/gramdictru/gramdictru.csproj
@@ -16,6 +16,7 @@
+