From a211d71344e5a3647e9f1730514bb522f33b76b4 Mon Sep 17 00:00:00 2001
From: Sergey Slepov
Date: Mon, 8 Jan 2024 23:15:22 +0800
Subject: [PATCH] =?UTF-8?q?=D0=94=D0=BE=D0=B1=D0=B0=D0=B2=D0=BB=D0=B5?=
=?UTF-8?q?=D0=BD=D1=8B=20=D0=BF=D0=B5=D1=80=D0=B5=D0=BD=D0=BE=D1=81=D1=8B?=
=?UTF-8?q?=20=D1=81=D0=BB=D0=BE=D0=B2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
HtmlProcessor/HtmlProcessor.csproj | 4 ++
HtmlProcessor/Tokenizer.cs | 77 ++++++++++++++++++++++++++++++
gramdictru/Startup.cs | 14 +++++-
gramdictru/gramdictru.csproj | 1 +
4 files changed, 95 insertions(+), 1 deletion(-)
create mode 100644 HtmlProcessor/Tokenizer.cs
diff --git a/HtmlProcessor/HtmlProcessor.csproj b/HtmlProcessor/HtmlProcessor.csproj
index c7f89b4..15174a6 100644
--- a/HtmlProcessor/HtmlProcessor.csproj
+++ b/HtmlProcessor/HtmlProcessor.csproj
@@ -7,4 +7,8 @@
10
+
+
+
+
diff --git a/HtmlProcessor/Tokenizer.cs b/HtmlProcessor/Tokenizer.cs
new file mode 100644
index 0000000..56b3c72
--- /dev/null
+++ b/HtmlProcessor/Tokenizer.cs
@@ -0,0 +1,77 @@
+using System.Collections.Generic;
+using System.Globalization;
+
+namespace gramdictru
+{
+    public static class Tokenizer
+    {
+        /// <summary>
+        /// Splits text into tokens: words and the separators between them.
+        /// </summary>
+        /// <param name="text">The text to split; must not be null.</param>
+        /// <param name="stressMark">
+        /// Only its first character is used: that character never starts a new token
+        /// and stays inside whichever token is current. Null or empty disables this.
+        /// </param>
+        /// <returns>The sequence of tokens.</returns>
+        /// <remarks>
+        /// There are two kinds of tokens: words and separators.
+        /// The first token is always a separator, even if it is empty.
+        /// After that, words and separators alternate,
+        /// so there is always one more separator than there are words.
+        /// When two adjacent words differ in kind (for example, a Russian word
+        /// immediately followed by a Chinese one, or letters followed by digits),
+        /// an empty separator is inserted between them.
+        /// Concatenating all tokens reproduces the original text. See the tests.
+        /// </remarks>
+        public static IEnumerable<string> Tokenize(string text, string stressMark = "\u0301")
+        {
+            // stressMark[0] would throw on a null or empty mark; treat that as "no stress mark".
+            bool hasStressMark = !string.IsNullOrEmpty(stressMark);
+            int tokenStart = 0;
+            BroadType type = BroadType.Whitespace; // makes the first token a separator
+
+            for (int i = 0; i < text.Length; i++)
+            {
+                char c = text[i];
+                if (hasStressMark && c == stressMark[0])
+                    continue; // the stress mark stays inside the current token
+
+                BroadType newType = Classify(c);
+                if (newType == type)
+                    continue;
+
+                yield return text[tokenStart..i];
+                // Two words of different kinds touch: keep the alternation with an empty separator.
+                if (type != BroadType.Whitespace && newType != BroadType.Whitespace)
+                    yield return "";
+                tokenStart = i;
+                type = newType;
+            }
+
+            yield return text[tokenStart..];
+
+            // The text ended on a word: close the sequence with an empty separator.
+            if (type != BroadType.Whitespace)
+                yield return "";
+        }
+
+        /// <summary>Maps a character to the broad class used to find token boundaries.</summary>
+        private static BroadType Classify(char c) => CharUnicodeInfo.GetUnicodeCategory(c) switch
+        {
+            UnicodeCategory.LowercaseLetter => BroadType.Letter,
+            UnicodeCategory.UppercaseLetter => BroadType.Letter,
+            UnicodeCategory.DecimalDigitNumber => BroadType.Number,
+            UnicodeCategory.OtherLetter => BroadType.OtherLetter,
+            _ => BroadType.Whitespace // spaces, punctuation and every other category separate words
+        };
+
+        private enum BroadType
+        {
+            Whitespace, // any character that is not part of a word, not only spaces
+            Letter, // upper- and lowercase letters
+            OtherLetter, // Chinese characters and stuff
+            Number // decimal digits
+        }
+    }
+}
diff --git a/gramdictru/Startup.cs b/gramdictru/Startup.cs
index c7f55a2..918ccb1 100644
--- a/gramdictru/Startup.cs
+++ b/gramdictru/Startup.cs
@@ -84,13 +84,25 @@ private static async Task PostProcessHtml(HttpContext context, Func next)
memoryStream.Seek(0, SeekOrigin.Begin);
string body = await new StreamReader(memoryStream).ReadToEndAsync();
- string newBody = PageHtmlPostProcessor.AddNoWrap(body);
+ string hyphenated = Hyphenate(body);
+ string newBody = PageHtmlPostProcessor.AddNoWrap(hyphenated);
byte[] bytes = Encoding.UTF8.GetBytes(newBody);
response.ContentLength = bytes.Length;
await oldStream.WriteAsync(bytes, 0, bytes.Length);
response.Body = oldStream;
}
+        // Passes every non-empty token of the body to the Russian hyphenator, with U+00AD
+        // (soft hyphen) as its second argument, and concatenates the results in order.
+        private static string Hyphenate(string body)
+        {
+            var hyphenatedTokens =
+                from token in Tokenizer.Tokenize(body)
+                where !string.IsNullOrEmpty(token)
+                select Morpher.Russian.BasicHyphenator.Hyphenate(token, '\u00AD');
+            return string.Concat(hyphenatedTokens);
+        }
+
static readonly HttpClient ApiClient = new HttpClient()
{
BaseAddress = new Uri("https://api.gramdict.ru")
diff --git a/gramdictru/gramdictru.csproj b/gramdictru/gramdictru.csproj
index 1feea9c..74eb5d9 100644
--- a/gramdictru/gramdictru.csproj
+++ b/gramdictru/gramdictru.csproj
@@ -16,6 +16,7 @@
+