Skip to content

Commit

Permalink
Добавлены переносы слов
Browse files Browse the repository at this point in the history
  • Loading branch information
bzaar committed Jan 8, 2024
1 parent 8fe0e97 commit a211d71
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 1 deletion.
4 changes: 4 additions & 0 deletions HtmlProcessor/HtmlProcessor.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,8 @@
<LangVersion>10</LangVersion>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="IndexRange" Version="1.0.3" />
</ItemGroup>

</Project>
77 changes: 77 additions & 0 deletions HtmlProcessor/Tokenizer.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
using System.Collections.Generic;
using System.Globalization;

namespace gramdictru
{
/// <summary>
/// Splits text into an alternating sequence of separators and words.
/// </summary>
public static class Tokenizer
{
    /// <summary>
    /// Splits the text into parts (tokens).
    /// </summary>
    /// <param name="text">The text to split.</param>
    /// <param name="stressMark">
    /// The stress mark. Only its first character is examined: occurrences of that character
    /// never start a new token and stay inside the token currently being scanned.
    /// When null or empty, no character is treated as a stress mark.
    /// </param>
    /// <returns>
    /// The sequence of tokens.
    /// </returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="text"/> is null.</exception>
    /// <remarks>
    /// Tokens come in two kinds: words and separators.
    /// The first token is always a separator, even if it is empty.
    /// After that words and separators alternate,
    /// so there is always exactly one more separator than there are words.
    /// A word is a run of characters of one class: cased letters, other letters
    /// (e.g. Chinese characters) or decimal digits. When two words of different classes
    /// are adjacent (for example, a Russian word immediately followed by a Chinese one),
    /// an empty separator is inserted between them.
    /// See the tests.
    /// </remarks>
    public static IEnumerable<string> Tokenize(string text, string stressMark = "\u0301")
    {
        // Validate here rather than in the iterator so the exception is thrown
        // at the call site and not on the first MoveNext().
        if (text is null)
        {
            throw new System.ArgumentNullException(nameof(text));
        }

        return TokenizeIterator(text, stressMark);
    }

    /// <summary>
    /// Lazily produces the tokens for an already validated <paramref name="text"/>.
    /// </summary>
    private static IEnumerable<string> TokenizeIterator(string text, string stressMark)
    {
        // An empty or null stress mark must not crash on stressMark[0].
        bool hasStressMark = !string.IsNullOrEmpty(stressMark);
        char stressChar = hasStressMark ? stressMark[0] : '\0';

        int tokenStart = 0;

        // Start in the separator state so that the first token is always a separator.
        BroadType type = BroadType.Whitespace;

        for (int i = 0; i < text.Length; i++)
        {
            char c = text[i];

            // The stress mark belongs to whatever token is being scanned.
            if (hasStressMark && c == stressChar)
            {
                continue;
            }

            BroadType newType = Classify(c);

            if (newType == type)
            {
                continue;
            }

            yield return text[tokenStart..i];

            if (type != BroadType.Whitespace && newType != BroadType.Whitespace)
            {
                // Two words of different classes touch each other:
                // keep the word/separator alternation with an empty separator.
                yield return "";
            }

            tokenStart = i;
            type = newType;
        }

        yield return text[tokenStart..];

        // The sequence must end with a separator, so add an empty one after a trailing word.
        if (type != BroadType.Whitespace)
        {
            yield return "";
        }
    }

    /// <summary>
    /// Maps a character to its broad class; every category not listed counts as a separator.
    /// </summary>
    /// <remarks>
    /// NOTE(review): combining marks other than the stress mark, title-case/modifier letters
    /// and surrogate halves all fall into the separator class - confirm this is intended.
    /// </remarks>
    private static BroadType Classify(char c) => CharUnicodeInfo.GetUnicodeCategory(c) switch
    {
        UnicodeCategory.LowercaseLetter => BroadType.Letter,
        UnicodeCategory.UppercaseLetter => BroadType.Letter,
        UnicodeCategory.SpaceSeparator => BroadType.Whitespace,
        UnicodeCategory.DecimalDigitNumber => BroadType.Number,
        UnicodeCategory.OtherLetter => BroadType.OtherLetter,
        _ => BroadType.Whitespace
    };

    /// <summary>
    /// Coarse character classes used to find token boundaries.
    /// </summary>
    private enum BroadType
    {
        Whitespace,  // separators: spaces, punctuation and everything unclassified
        Letter,      // upper- and lowercase letters
        OtherLetter, // Chinese characters and stuff
        Number       // decimal digits
    }
}
}
14 changes: 13 additions & 1 deletion gramdictru/Startup.cs
Original file line number Diff line number Diff line change
Expand Up @@ -84,13 +84,25 @@ private static async Task PostProcessHtml(HttpContext context, Func<Task> next)

memoryStream.Seek(0, SeekOrigin.Begin);
string body = await new StreamReader(memoryStream).ReadToEndAsync();
string newBody = PageHtmlPostProcessor.AddNoWrap(body);
string hyphenated = Hyphenate(body);
string newBody = PageHtmlPostProcessor.AddNoWrap(hyphenated);
byte[] bytes = Encoding.UTF8.GetBytes(newBody);
response.ContentLength = bytes.Length;
await oldStream.WriteAsync(bytes, 0, bytes.Length);
response.Body = oldStream;
}

/// <summary>
/// Tokenizes <paramref name="body"/>, passes every non-empty token through
/// <c>Morpher.Russian.BasicHyphenator.Hyphenate</c> with U+00AD (soft hyphen)
/// and concatenates the results in their original order.
/// </summary>
/// <param name="body">The text to process; the caller passes the full response body.</param>
/// <returns>The concatenation of the processed tokens.</returns>
private static string Hyphenate(string body)
{
    const char softHyphen = '\u00AD';

    // Separator tokens go through the hyphenator too; only empty tokens are dropped.
    // NOTE(review): presumably the hyphenator leaves non-Russian text untouched - confirm.
    var hyphenatedTokens =
        from token in Tokenizer.Tokenize(body)
        where !string.IsNullOrEmpty(token)
        select Morpher.Russian.BasicHyphenator.Hyphenate(token, softHyphen);

    return string.Concat(hyphenatedTokens);
}

static readonly HttpClient ApiClient = new HttpClient()
{
BaseAddress = new Uri("https://api.gramdict.ru")
Expand Down
1 change: 1 addition & 0 deletions gramdictru/gramdictru.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
<PackageReference Include="Microsoft.AspNetCore.StaticFiles" Version="2.0.1" />
<PackageReference Include="Microsoft.VisualStudio.Web.BrowserLink" Version="2.0.1" />
<PackageReference Include="Microsoft.VisualStudio.Web.CodeGeneration.Design" Version="2.0.4" />
<PackageReference Include="Morpher.Russian.BasicHyphenator" Version="1.0.0" />
</ItemGroup>
<ItemGroup>
<DotNetCliToolReference Include="Microsoft.VisualStudio.Web.CodeGeneration.Tools" Version="2.0.1" />
Expand Down

0 comments on commit a211d71

Please sign in to comment.