Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix Unicode 15.0 sentence segmentation #4213

Merged
merged 32 commits into from
Oct 26, 2023
Merged
Changes from 1 commit
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
e6ac4ee
Broken monkeys and better errors
eggrobin Sep 13, 2023
a28c814
log the guts
eggrobin Sep 14, 2023
1c86263
part of a fix
eggrobin Sep 15, 2023
1df249f
more logging for good measure
eggrobin Sep 19, 2023
99ef1ad
Many fixes to the tables
eggrobin Sep 25, 2023
ac8bb20
A code change probably needed
eggrobin Sep 25, 2023
9055bdc
traces etc.
eggrobin Sep 25, 2023
60cc05a
revert tools/make/data.toml
eggrobin Oct 24, 2023
e00088b
Merge remote-tracking branch 'la-vache/main' into import-monkeys
eggrobin Oct 24, 2023
b518661
More tests, more breakages.
eggrobin Oct 24, 2023
9403d59
Progress! on to the next failure.
eggrobin Oct 24, 2023
926d11d
Il faut imaginer Sisyphe heureux.
eggrobin Oct 24, 2023
66dd136
I sure hope this is wrong, but I am no monkey.
eggrobin Oct 24, 2023
b385f18
Somehow it is not failing anymore
eggrobin Oct 24, 2023
24cf9af
An eggsplanation.
eggrobin Oct 24, 2023
e795a2a
Looks like this is all we need
eggrobin Oct 24, 2023
c97b87f
Remove traces
eggrobin Oct 24, 2023
9187a76
Keep it at 100 test cases
eggrobin Oct 24, 2023
d65fecf
Just note it
eggrobin Oct 24, 2023
fcfc430
cargo make testdata
eggrobin Oct 24, 2023
1453cca
cargo fmt
eggrobin Oct 24, 2023
cf2198a
appease clippy
eggrobin Oct 24, 2023
4482f6d
Try to appease clippy while giving myself a place to restore the traces
eggrobin Oct 24, 2023
02dda40
Bit twiddling
eggrobin Oct 24, 2023
bb69e2b
properties data
eggrobin Oct 24, 2023
925dc3b
properties data
eggrobin Oct 24, 2023
4aeef2b
?????
eggrobin Oct 24, 2023
bd34306
Merge branch 'import-monkeys' of https://github.com/eggrobin/icu4x in…
eggrobin Oct 24, 2023
ad6727a
No need for the conditional unless logging
eggrobin Oct 24, 2023
21aca87
Bring back the traces
eggrobin Oct 24, 2023
785bf07
Revert "Bring back the traces"
eggrobin Oct 24, 2023
824ae44
Merge remote-tracking branch 'la-vache/main' into import-monkeys
eggrobin Oct 25, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Many fixes to the tables
  • Loading branch information
eggrobin committed Sep 25, 2023
commit 99ef1adfa1b5b00314ae1eee7849db61b5186b41
64 changes: 61 additions & 3 deletions provider/datagen/src/transform/segmenter/rules/sentence.toml
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,18 @@ name = "Lower_ATerm"
left = "Lower_ATerm"
right = "Format"

[[tables]]
# SB5
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Extend"

[[tables]]
# SB5
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Format"

[[tables]]
# SB7
name = "Upper_ATerm"
Expand Down Expand Up @@ -385,6 +397,27 @@ name = "STerm_Close_Sp_ParaSep"
left = "STerm_Close_Sp"
right = "LF"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close"
right = "Numeric"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
Expand All @@ -399,6 +432,13 @@ left = "ATerm_Close_Sp"
right = "Numeric"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp"
right = "Unknown"
interm_break_state = true

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
Expand All @@ -417,6 +457,24 @@ name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Numeric"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Sp"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "Unknown"

[[tables]]
# SB8
name = "ATerm_Close_Sp_SB8"
left = "ATerm_Close_Sp_SB8"
right = "SContinue"

[[rules]]
# SB1
left = [ "sot" ]
Expand Down Expand Up @@ -561,19 +619,19 @@ left = [
"STerm_Close_Sp_ParaSep",
"STerm_Close_Sp_CR"
]
-right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm" ]
+right = [ "ATerm", "Lower", "OLetter", "Upper", "Numeric", "STerm", "CR" ]
break_state = true

[[rules]]
# SB11
left = [ "ATerm_Close_Sp", "STerm_Close_Sp" ]
-right = [ "Numeric", "Upper", "Close", "Unknown", "OLetter" ]
+right = [ "Numeric", "Upper", "Lower", "Close", "Unknown", "OLetter" ]
break_state = true

[[rules]]
# SB11
left = [ "ATerm_Close", "STerm", "STerm_Close" ]
-right = [ "Numeric", "Upper", "OLetter" ]
+right = [ "Numeric", "Upper", "Lower", "OLetter", "Unknown" ]
break_state = true

[[rules]]
Expand Down