Skip to content

Commit

Permalink
Merge pull request #134 from VeriFIT/regex-loop
Browse files Browse the repository at this point in the history
Regex construction: optimizations
vhavlena authored Apr 26, 2024
2 parents eb1a496 + b0899c4 commit b90edac
Showing 4 changed files with 103 additions and 20 deletions.
15 changes: 15 additions & 0 deletions src/ast/rewriter/seq_rewriter.cpp
Original file line number Diff line number Diff line change
@@ -4944,6 +4944,9 @@ br_status seq_rewriter::mk_re_loop(func_decl* f, unsigned num_args, expr* const*
rational n1, n2;
unsigned lo, hi, lo2, hi2, np;
expr* a = nullptr;
expr* comp = nullptr, *tore = nullptr;
zstring zstr;

switch (num_args) {
case 1:
np = f->get_num_parameters();
@@ -4982,6 +4985,18 @@ br_status seq_rewriter::mk_re_loop(func_decl* f, unsigned num_args, expr* const*
result = re().mk_star(args[0]);
return BR_DONE;
}

if(lo2 > 0 && re().is_complement(args[0], comp) && re().is_to_re(comp, tore) && str().is_string(tore, zstr) ) {
// (loop (compl (to_re str)) n m) = compl (to_re str) if |str| == 1
// (loop (compl (to_re str)) n m) = re.* allchar if |str| > 1
if(zstr.length() == 1) {
result = args[0];
return BR_DONE;
} else if (zstr.length() > 1) {
result = re().mk_full_seq(a->get_sort());
return BR_DONE;
}
}
break;
case 2:
if (m_autil.is_numeral(args[1], n1) && n1.is_unsigned()) {
56 changes: 41 additions & 15 deletions src/smt/theory_str_noodler/regex.cpp
Original file line number Diff line number Diff line change
@@ -14,7 +14,7 @@ namespace {
namespace smt::noodler::regex {

[[nodiscard]] Nfa conv_to_nfa(const app *expression, const seq_util& m_util_s, const ast_manager& m,
const std::set<uint32_t>& alphabet, bool determinize, bool make_complement) {
const Alphabet& alphabet, bool determinize, bool make_complement) {
Nfa nfa{};

if (m_util_s.re.is_to_re(expression)) { // Handle conversion of to regex function call.
@@ -48,7 +48,7 @@ namespace smt::noodler::regex {
} else if (m_util_s.re.is_dot_plus(expression)) { // Handle dot plus.
nfa.initial.insert(0);
nfa.final.insert(1);
for (const auto& symbol : alphabet) {
for (const auto& symbol : alphabet.get_alphabet()) {
nfa.delta.add(0, symbol, 1);
nfa.delta.add(1, symbol, 1);
}
@@ -59,13 +59,13 @@ namespace smt::noodler::regex {
} else if (m_util_s.re.is_full_char(expression)) { // Handle full char (single occurrence of any string symbol, '.').
nfa.initial.insert(0);
nfa.final.insert(1);
for (const auto& symbol : alphabet) {
for (const auto& symbol : alphabet.get_alphabet()) {
nfa.delta.add(0, symbol, 1);
}
} else if (m_util_s.re.is_full_seq(expression)) {
nfa.initial.insert(0);
nfa.final.insert(0);
for (const auto& symbol : alphabet) {
for (const auto& symbol : alphabet.get_alphabet()) {
nfa.delta.add(0, symbol, 0);
}
} else if (m_util_s.re.is_intersection(expression)) { // Handle intersection.
@@ -97,22 +97,29 @@ namespace smt::noodler::regex {
// ... or empty language
nfa = std::move(body_nfa);
}
} else if(body_nfa.is_universal(alphabet.get_mata_alphabet())) {
nfa = std::move(body_nfa);
} else {
body_nfa.unify_final();
body_nfa.unify_initial();

body_nfa = mata::nfa::reduce(body_nfa);
nfa = mata::nfa::builder::create_empty_string_nfa();
// we need to repeat body_nfa at least low times
for (unsigned i = 0; i < low; ++i) {
nfa.concatenate(body_nfa);
nfa.trim();

if(low >= LOOP_BOUND) {
nfa = create_large_concat(body_nfa, low);
} else {
// we need to repeat body_nfa at least low times
for (unsigned i = 0; i < low; ++i) {
nfa.concatenate(body_nfa);
nfa.trim();
}
}

// we will now either repeat body_nfa high-low times (if is_high_set) or
// unlimited times (if it is not set), but we have to accept after each loop,
// so we add an empty word into body_nfa
mata::nfa::State new_state = nfa.add_state();
mata::nfa::State new_state = body_nfa.add_state();
body_nfa.initial.insert(new_state);
body_nfa.final.insert(new_state);

@@ -217,7 +224,7 @@ namespace smt::noodler::regex {

// intermediate automata reduction
// if the automaton is too big --> skip it. The computation of the simulation would be too expensive.
if(nfa.num_of_states() < 1000) {
if(nfa.num_of_states() < RED_BOUND) {
nfa = mata::nfa::reduce(nfa);
}
if(determinize) {
@@ -233,11 +240,7 @@ namespace smt::noodler::regex {
// Warning: is_complement assumes we do the following, so if you to change this, go check is_complement first
if (make_complement) {
STRACE("str-create_nfa", tout << "Complemented NFA:" << std::endl;);
mata::OnTheFlyAlphabet mata_alphabet{};
for (const auto& symbol : alphabet) {
mata_alphabet.add_new_symbol(std::to_string(symbol), symbol);
}
nfa = mata::nfa::complement(nfa, mata_alphabet, {
nfa = mata::nfa::complement(nfa, alphabet.get_mata_alphabet(), {
{"algorithm", "classical"},
//{"minimize", "true"} // it seems that minimizing during complement causes more TOs in benchmarks
});
@@ -446,4 +449,27 @@ namespace smt::noodler::regex {
}
return RegexInfo{.min_length = 0, .universal = l_undef, .empty = l_undef};
}

mata::nfa::Nfa create_large_concat(const mata::nfa::Nfa& body_nfa, unsigned count) {
mata::nfa::Nfa nfa_part = mata::nfa::builder::create_empty_string_nfa();
mata::nfa::Nfa nfa = mata::nfa::builder::create_empty_string_nfa();
const unsigned CONCAT = 100;

for(unsigned i = 0; i < CONCAT; i++) {
nfa_part.concatenate(body_nfa);
nfa_part.trim();
}
unsigned cnt = 0;
for(unsigned i = 0; i < count / CONCAT; i++) {
nfa.concatenate(nfa_part);
nfa.trim();
cnt += CONCAT;
}
for(; cnt <= count; cnt++) {
nfa.concatenate(body_nfa);
nfa.trim();
}

return nfa;
}
}
41 changes: 40 additions & 1 deletion src/smt/theory_str_noodler/regex.h
Original file line number Diff line number Diff line change
@@ -34,6 +34,11 @@ namespace smt::noodler::regex {
using expr_pair = std::pair<expr_ref, expr_ref>;
using expr_pair_flag = std::tuple<expr_ref, expr_ref, bool>;

// bound for loop (above this number an optimized construction is used)
const unsigned LOOP_BOUND = 5000;
// simulation reduction bound in states (bigger automata are not reduced)
const unsigned RED_BOUND = 1000;

/**
* @brief Info gathered about a regex.
* - min_length: length of shortest words in the regex. In fact it expresses that in the regex there is no
@@ -48,6 +53,31 @@ namespace smt::noodler::regex {
lbool empty;
};

/**
* @brief Alphabet wrapper for Z3 alphabet represented by
* std::set<uint32_t> and a Mata alphabet.
*/
struct Alphabet {

private:
std::set<uint32_t> alphabet;
mata::OnTheFlyAlphabet mata_alphabet;
public:
Alphabet(const std::set<uint32_t>& alph) : alphabet(alph) {
for (const auto& symbol : alph) {
this->mata_alphabet.add_new_symbol(std::to_string(symbol), symbol);
}
}

const std::set<uint32_t>& get_alphabet() const {
return this->alphabet;
}

const mata::OnTheFlyAlphabet& get_mata_alphabet() const {
return this->mata_alphabet;
}
};

/**
* Convert expression @p expr to NFA.
* @param[in] expression Expression to be converted to NFA.
@@ -59,7 +89,7 @@ namespace smt::noodler::regex {
* @return The resulting regex.
*/
[[nodiscard]] mata::nfa::Nfa conv_to_nfa(const app *expression, const seq_util& m_util_s, const ast_manager& m,
const std::set<uint32_t>& alphabet, bool determinize = false, bool make_complement = false);
const Alphabet& alphabet, bool determinize = false, bool make_complement = false);

/**
* @brief Get basic information about the regular expression in the form of RegexInfo (see the description above).
@@ -71,6 +101,15 @@ namespace smt::noodler::regex {
* @return RegexInfo
*/
RegexInfo get_regex_info(const app *expression, const seq_util& m_util_s, const ast_manager& m);

/**
* @brief Create bounded iteration of a given automaton.
*
* @param body_nfa Core NFA
* @param count Number of concatenations
* @return mata::nfa::Nfa NFA
*/
mata::nfa::Nfa create_large_concat(const mata::nfa::Nfa& body_nfa, unsigned count);
}

#endif
11 changes: 7 additions & 4 deletions src/smt/theory_str_noodler/theory_str_noodler_final_check.cpp
Original file line number Diff line number Diff line change
@@ -88,6 +88,7 @@ namespace smt::noodler {
) {
AutAssignment aut_assignment{};
aut_assignment.set_alphabet(noodler_alphabet);
regex::Alphabet alph(noodler_alphabet);
for (const auto &word_equation: m_membership_todo_rel) {
const expr_ref& var_expr{ std::get<0>(word_equation) };
assert(is_app(var_expr));
@@ -102,7 +103,7 @@ namespace smt::noodler {
}
// If the regular constraint is in a negative form, create a complement of the regular expression instead.
const bool make_complement{ !std::get<2>(word_equation) };
mata::nfa::Nfa nfa{ regex::conv_to_nfa(to_app(std::get<1>(word_equation)), m_util_s, m, noodler_alphabet, make_complement, make_complement) };
mata::nfa::Nfa nfa{ regex::conv_to_nfa(to_app(std::get<1>(word_equation)), m_util_s, m, alph, make_complement, make_complement) };
auto aut_ass_it{ aut_assignment.find(term) };
if (aut_ass_it != aut_assignment.end()) {
// This variable already has some regular constraints. Hence, we create an intersection of the new one
@@ -200,10 +201,11 @@ namespace smt::noodler {
std::set<uint32_t> alphabet;
extract_symbols(left_side, alphabet);
extract_symbols(right_side, alphabet);
regex::Alphabet alph(alphabet);

// construct NFAs for both sides
mata::nfa::Nfa nfa1 = regex::conv_to_nfa(to_app(left_side), m_util_s, m, alphabet, false );
mata::nfa::Nfa nfa2 = regex::conv_to_nfa(to_app(right_side), m_util_s, m, alphabet, false );
mata::nfa::Nfa nfa1 = regex::conv_to_nfa(to_app(left_side), m_util_s, m, alph, false );
mata::nfa::Nfa nfa2 = regex::conv_to_nfa(to_app(right_side), m_util_s, m, alph, false );

// check if NFAs are equivalent (if we have equation) or not (if we have disequation)
bool are_equiv = mata::nfa::are_equivalent(nfa1, nfa2);
@@ -513,8 +515,9 @@ namespace smt::noodler {
// start with minterm representing symbols not ocurring in the regex
std::set<mata::Symbol> symbols_in_regex{get_dummy_symbol()};
extract_symbols(std::get<1>(reg_data), symbols_in_regex);
regex::Alphabet reg_alph(symbols_in_regex);

mata::nfa::Nfa nfa{ regex::conv_to_nfa(to_app(std::get<1>(reg_data)), m_util_s, m, symbols_in_regex, false, false) };
mata::nfa::Nfa nfa{ regex::conv_to_nfa(to_app(std::get<1>(reg_data)), m_util_s, m, reg_alph, false, false) };

mata::EnumAlphabet alph(symbols_in_regex.begin(), symbols_in_regex.end());
mata::nfa::Nfa sigma_star = mata::nfa::builder::create_sigma_star_nfa(&alph);

0 comments on commit b90edac

Please sign in to comment.