Skip to content

Commit

Permalink
ICU-22984 Optimize old monkeys
Browse files Browse the repository at this point in the history
  • Loading branch information
eggrobin committed Dec 4, 2024
1 parent 5519b85 commit 3f95935
Showing 1 changed file with 85 additions and 29 deletions.
114 changes: 85 additions & 29 deletions icu4c/source/test/intltest/rbbitst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2619,8 +2619,7 @@ class SegmentationRule {
const SegmentationRule *appliedRule = nullptr;
};

SegmentationRule(std::u16string_view name)
: name_(UnicodeString(name).toUTF8String(std::string{})) {}
SegmentationRule(std::u16string_view name) { UnicodeString(name).toUTF8String(name_); }
virtual ~SegmentationRule() = default;

virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const = 0;
Expand All @@ -2630,7 +2629,7 @@ class SegmentationRule {
std::chrono::steady_clock::duration timeSpent() const { return time_spent_; }

private:
const std::string name_;
std::string name_;
protected:
mutable std::chrono::steady_clock::duration time_spent_{};
};
Expand All @@ -2651,15 +2650,15 @@ class RemapRule : public SegmentationRule {
auto const start = std::chrono::steady_clock::now();
UErrorCode status = U_ZERO_ERROR;
UnicodeString result;
int i = 0;
int offset = 0;
std::size_t i = 0;
std::ptrdiff_t offset = 0;
std::unique_ptr<RegexMatcher> matcher(pattern_->matcher(remapped, status));
while (matcher->find()) {
for (;; ++i) {
if (!resolved[i].indexInRemapped.has_value()) {
continue;
}
if (*resolved[i].indexInRemapped > matcher->start(status)) {
if (*resolved[i].indexInRemapped > static_cast<std::size_t>(matcher->start64(status))) {
break;
}
*resolved[i].indexInRemapped += offset;
Expand All @@ -2668,13 +2667,13 @@ class RemapRule : public SegmentationRule {
if (!resolved[i].indexInRemapped.has_value()) {
continue;
}
if (*resolved[i].indexInRemapped == matcher->end(status)) {
if (*resolved[i].indexInRemapped == static_cast<std::size_t>(matcher->end64(status))) {
break;
}
if (resolved[i].appliedRule != nullptr &&
resolved[i].appliedRule->resolution() == BREAK) {
printf("Replacement rule at remapped indices %d sqq. spans a break: %s",
matcher->start(status), remapped.toUTF8String(std::string{}).c_str());
printf("Replacement rule at remapped indices %d sqq. spans a break",
matcher->start(status));
std::terminate();
}
resolved[i].appliedRule = this;
Expand All @@ -2696,9 +2695,9 @@ class RemapRule : public SegmentationRule {
indices += r.indexInRemapped.has_value() ? std::to_string(*r.indexInRemapped) : "null";
indices += ",";
}
printf(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
result.toUTF8String(std::string{}).c_str())
.c_str());
std::string s;
puts(("Inconsistent indexInRemapped " + indices + " for new remapped string " +
result.toUTF8String(s)).c_str());
std::terminate();
}
remapped = result;
Expand All @@ -2721,36 +2720,89 @@ class RegexRule : public SegmentationRule {
: SegmentationRule(name), resolution_(static_cast<Resolution>(resolution)) {
UParseError parseError;
UErrorCode status = U_ZERO_ERROR;
before_.reset(RegexPattern::compile(u".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL,
parseError, status));
before_.reset(
RegexPattern::compile(before, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
endsWithBefore_.reset(RegexPattern::compile(
".*(" + before + ")", UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
after_.reset(RegexPattern::compile(after, UREGEX_COMMENTS | UREGEX_DOTALL, parseError, status));
U_ASSERT(U_SUCCESS(status));
}

virtual void apply(UnicodeString &remapped, std::vector<BreakContext> &resolved) const override {
auto const start = std::chrono::steady_clock::now();
UErrorCode status = U_ZERO_ERROR;
for (auto &[indexInRemapped, appliedRule] : resolved) {
if (appliedRule != nullptr) {
continue;
}
if (std::unique_ptr<RegexMatcher>(before_->matcher(remapped, status))
->region(0, *indexInRemapped, status)
.matches(status) &&
std::unique_ptr<RegexMatcher>(after_->matcher(remapped, status))
->region(*indexInRemapped, remapped.length(), status)
.lookingAt(status)) {
appliedRule = this;
// The unicodetools implementation simply tries, for each index, to
// match the string up to the index against /.*(before)/ (with
// `matches`) and the beginning of the string after the index against
// /after/ (with `lookingAt`), but that is very slow, especially for
// nonempty /before/. While the old monkeys are not a production
// implementation, we still do not want them to be too slow, since we
// need to test millions of sample strings. Instead we search for
// /before/ and /after/, and check resulting candidates. This speeds
// things up by a factor of ~40.
// We need to be careful about greedy matching: The first position where
// the rule matches may be before the end of the first /before/ match.
// However, it is both:
// 1. within a /before/ match or at its bounds,
// 2. at the beginning of an /after/ match.
// Further, the /before/ context of the rule matches within the
// aforementioned /before/ match. Note that we need to look for
// overlapping matches, thus calls to `find` are always preceded by a
// reset via `region`.
std::unique_ptr<RegexMatcher> beforeSearch(before_->matcher(remapped, status));
std::unique_ptr<RegexMatcher> afterSearch(after_->matcher(remapped, status));
beforeSearch->useAnchoringBounds(false);
afterSearch->useAnchoringBounds(false);
U_ASSERT(U_SUCCESS(status));
if (beforeSearch->find() && afterSearch->find()) {
for (;;) {
if (afterSearch->start(status) < beforeSearch->start(status)) {
afterSearch->region(beforeSearch->start(status), remapped.length(), status);
if (!afterSearch->find()) {
break;
}
} else if (afterSearch->start(status) > beforeSearch->end(status)) {
if (beforeSearch->start(status) == remapped.length()) {
break;
}
beforeSearch->region(remapped.moveIndex32(beforeSearch->start(status), 1),
remapped.length(), status);
if (!beforeSearch->find()) {
break;
}
} else {
auto const it = std::find_if(resolved.begin(), resolved.end(), [&](auto r) {
return r.indexInRemapped == afterSearch->start(status);
});
U_ASSERT(it != resolved.end());
U_ASSERT(U_SUCCESS(status));
if (it->appliedRule == nullptr &&
std::unique_ptr<RegexMatcher>(endsWithBefore_->matcher(remapped, status))
->useAnchoringBounds(false)
.region(beforeSearch->start(status), afterSearch->start(status), status)
.matches(status)) {
it->appliedRule = this;
}
if (afterSearch->start(status) == remapped.length()) {
break;
}
afterSearch->region(remapped.moveIndex32(afterSearch->start(status), 1),
remapped.length(), status);
if (!afterSearch->find()) {
break;
}
}
U_ASSERT(U_SUCCESS(status));
}
}
U_ASSERT(U_SUCCESS(status));
time_spent_ += std::chrono::steady_clock::now() - start;
}

virtual Resolution resolution() const override { return resolution_; }

private:
std::unique_ptr<RegexPattern> before_;
std::unique_ptr<RegexPattern> endsWithBefore_;
std::unique_ptr<RegexPattern> after_;
const Resolution resolution_;
};
Expand Down Expand Up @@ -3005,7 +3057,7 @@ void RBBILineMonkey::setText(const UnicodeString &s) {
UnicodeString remapped = s;
resolved.clear();
resolved.reserve(s.length() + 1);
for (std::size_t i = 0; i < s.length() + 1; ++i) {
for (int i = 0; i < s.length() + 1; ++i) {
resolved.emplace_back(i);
}
for (const auto& rule : rules) {
Expand Down Expand Up @@ -3038,8 +3090,12 @@ const std::vector<UnicodeSet>& RBBILineMonkey::charClasses() {


RBBILineMonkey::~RBBILineMonkey() {
for (auto const& rule : rules) {
printf("%s : %lld ms\n", rule->name().c_str(), rule->timeSpent() / std::chrono::milliseconds(1));
constexpr bool debuggingOldMonkeyPerformance = false;
if (debuggingOldMonkeyPerformance) {
for (auto const &rule : rules) {
puts((rule->name() + " : " + std::to_string(rule->timeSpent() / std::chrono::milliseconds(1)) +
" ms").c_str());
}
}

delete fCharBI;
Expand Down

0 comments on commit 3f95935

Please sign in to comment.