Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move LB8a and LB9 out of the table #5001

Merged
merged 10 commits into from
Jun 6, 2024
84 changes: 72 additions & 12 deletions components/segmenter/src/line.rs
Original file line number Diff line number Diff line change
Expand Up @@ -887,16 +887,43 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
}
}

// The state prior to a sequence of CM and ZWJ affected by rule LB9.
let mut lb9_left: Option<u8> = None;
// Whether LB9 was applied to a ZWJ, so that breaks at the current
// position must be suppressed.
let mut lb8a_after_lb9 = false;

'a: loop {
debug_assert!(!self.is_eof());
let left_codepoint = self.get_current_codepoint()?;
let mut left_prop = self.get_linebreak_property(left_codepoint);
let mut left_prop =
lb9_left.unwrap_or_else(|| self.get_linebreak_property(left_codepoint));
let after_zwj = lb8a_after_lb9 || (lb9_left.is_none() && left_prop == ZWJ);
self.advance_iter();

let Some(right_codepoint) = self.get_current_codepoint() else {
return Some(self.len);
};
let right_prop = self.get_linebreak_property(right_codepoint);
// NOTE(egg): The special-casing of `LineBreakStrictness::Anywhere` allows us to pass
// a test, but eventually that option should just be simplified to call the extended
// grapheme cluster segmenter.
if (right_prop == CM
|| (right_prop == ZWJ && self.options.strictness != LineBreakStrictness::Anywhere))
&& left_prop != BK
&& left_prop != CR
&& left_prop != LF
&& left_prop != NL
&& left_prop != SP
&& left_prop != ZW
{
lb9_left = Some(left_prop);
lb8a_after_lb9 = right_prop == ZWJ;
continue;
} else {
lb9_left = None;
lb8a_after_lb9 = false;
}

// CSS word-break property handling
match (self.options.word_option, left_prop, right_prop) {
Expand All @@ -917,7 +944,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
// CSS line-break property handling
match self.options.strictness {
LineBreakStrictness::Normal => {
if self.is_break_by_normal(right_codepoint) {
if self.is_break_by_normal(right_codepoint) && !after_zwj {
return self.get_current_position();
}
}
Expand All @@ -928,13 +955,16 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
right_prop,
self.options.ja_zh,
) {
if breakable {
if breakable && !after_zwj {
return self.get_current_position();
}
continue;
}
}
LineBreakStrictness::Anywhere => {
// TODO(egg): My reading of the CSS standard is that this
// should break around extended grapheme clusters, not at
// arbitrary code points, so this seems wrong.
return self.get_current_position();
}
_ => (),
Expand All @@ -958,15 +988,27 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
// Line break uses more than 64 states, so they spill over into the intermediate range,
// and we cannot change that at the moment
BreakState::Intermediate(index) => index + 64,
BreakState::Break | BreakState::NoMatch => return self.get_current_position(),
BreakState::Break | BreakState::NoMatch => {
if after_zwj {
continue;
} else {
return self.get_current_position();
}
}
BreakState::Keep => continue,
};

let mut previous_iter = self.iter.clone();
let mut previous_pos_data = self.current_pos_data;

// Since we are building up a state in this inner loop, we do not
// need an analogue of lb9_left; continuing the inner loop preserves
// `index` which is the current state, and thus implements the
// “treat as” rule.
let mut left_prop_pre_lb9 = right_prop;
loop {
self.advance_iter();
let after_zwj = left_prop_pre_lb9 == ZWJ;

let Some(prop) = self.get_current_linebreak_property() else {
// Reached EOF. But we are analyzing multiple characters now, so the next break may be at the previous point.
Expand All @@ -982,14 +1024,36 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
return Some(self.len);
};

if (prop == CM || prop == ZWJ)
&& left_prop_pre_lb9 != BK
&& left_prop_pre_lb9 != CR
&& left_prop_pre_lb9 != LF
&& left_prop_pre_lb9 != NL
&& left_prop_pre_lb9 != SP
&& left_prop_pre_lb9 != ZW
{
left_prop_pre_lb9 = prop;
continue;
}

match self.data.get_break_state_from_table(index, prop) {
BreakState::Keep => continue 'a,
BreakState::NoMatch => {
self.iter = previous_iter;
self.current_pos_data = previous_pos_data;
return self.get_current_position();
if after_zwj {
continue 'a;
} else {
return self.get_current_position();
}
}
BreakState::Break => {
if after_zwj {
continue 'a;
} else {
return self.get_current_position();
}
}
BreakState::Break => return self.get_current_position(),
BreakState::Index(i) => {
index = i;
previous_iter = self.iter.clone();
Expand All @@ -1001,6 +1065,7 @@ impl<'l, 's, Y: LineBreakType<'l, 's>> Iterator for LineBreakIterator<'l, 's, Y>
previous_pos_data = self.current_pos_data;
}
}
left_prop_pre_lb9 = prop;
}
}
}
Expand Down Expand Up @@ -1374,12 +1439,7 @@ mod tests {
assert_eq!(is_break(AL, SP), false);
assert_eq!(is_break(AL, ZW), false);
// LB8
// LB8a
assert_eq!(is_break(ZWJ, AL), false);
// LB9
assert_eq!(is_break(AL, ZWJ), false);
assert_eq!(is_break(AL, CM), false);
assert_eq!(is_break(ID, ZWJ), false);
// LB8a and LB9 omitted: These are handled outside of the state table.
// LB10
assert_eq!(is_break(ZWJ, SP), false);
assert_eq!(is_break(SP, CM), true);
Expand Down

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion provider/baked/segmenter/fingerprints.csv
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ segmenter/dictionary/wl_ext@1, und-x-khmerdict, 798377B, e1da910a05c02674
segmenter/dictionary/wl_ext@1, und-x-laodict, 292463B, 13c58c252d049c90
segmenter/dictionary/wl_ext@1, und-x-thaidict, 224981B, f2d574736bb1a754
segmenter/grapheme@1, und, 9730B, 922c04bad19f0d2e
segmenter/line@1, und, 24183B, 817be8a85b29478d
segmenter/line@1, und, 18103B, f922c8477455abdd
segmenter/lstm/wl_auto@1, und-x-Burmese_codepoints_exclusive_model4_heavy, 91074B, eec2f7a1f6819f91
segmenter/lstm/wl_auto@1, und-x-Khmer_codepoints_exclusive_model4_heavy, 74372B, b25f5219c4b970f2
segmenter/lstm/wl_auto@1, und-x-Lao_codepoints_exclusive_model4_heavy, 71867B, 7e0c3ea7801791bd
Expand Down
Loading
Loading