VeriFIT · jurajsic · May 1, 2024 · May 1, 2024 · May 1, 2024 · May 1, 2024
diff --git a/src/smt/theory_str_noodler/regex.cpp b/src/smt/theory_str_noodler/regex.cpp
@@ -225,9 +225,17 @@ namespace smt::noodler::regex {
         // intermediate automata reduction
         // if the automaton is too big --> skip it. The computation of the simulation would be too expensive.
         if(nfa.num_of_states() < RED_BOUND) {
+            STRACE("str-create_nfa-reduce", 
+                tout << "--------------" << "NFA for: " << mk_pp(const_cast<app*>(expression), const_cast<ast_manager&>(m)) << " that is going to be reduced" << "---------------" << std::endl;
+                nfa.print_to_DOT(tout);
+            );
             nfa = mata::nfa::reduce(nfa);
         }
         if(determinize) {
+            STRACE("str-create_nfa-reduce", 
+                tout << "--------------" << "NFA for: " << mk_pp(const_cast<app*>(expression), const_cast<ast_manager&>(m)) << " that is going to be minimized" << "---------------" << std::endl;
+                nfa.print_to_DOT(tout);
+            );
             nfa = mata::nfa::minimize(nfa);
         }
 
@@ -472,4 +480,25 @@ namespace smt::noodler::regex {
 
         return nfa;
     }
+
+    unsigned get_loop_sum(const app* reg, const seq_util& m_util_s) {
+        expr* body;
+        unsigned lo, hi;
+        if (m_util_s.re.is_loop(reg, body, lo, hi)) {
+            unsigned body_loop = get_loop_sum(to_app(body), m_util_s);
+            if (body_loop == 0) {
+                return hi;
+            } else {
+                return hi*body_loop;
+            }
+        } else if (m_util_s.str.is_string(reg)) {
+            return 0;
+        } else {
+            unsigned sum = 0;
+            for (unsigned arg_num = 0; arg_num < reg->get_num_args(); ++arg_num) {
+                sum += get_loop_sum(to_app(reg->get_arg(arg_num)), m_util_s);
+            }
+            return sum;
+        }
+    }
 }
diff --git a/src/smt/theory_str_noodler/regex.h b/src/smt/theory_str_noodler/regex.h
@@ -110,6 +110,15 @@ namespace smt::noodler::regex {
      * @return mata::nfa::Nfa NFA
      */
     mata::nfa::Nfa create_large_concat(const mata::nfa::Nfa& body_nfa, unsigned count);
+
+    /**
+     * @brief Get the sum of loops of a regex (loop inside a loop is multiplied)
+     * 
+     * @param reg some regular expression predicate (can be also string literal/var)
+     * @param m_util_s string ast util
+     * @return sum of loops inside @p regex, with nested loops multiplied 
+     */
+    unsigned get_loop_sum(const app* reg, const seq_util& m_util_s);
 }
 
 #endif
diff --git a/src/smt/theory_str_noodler/theory_str_noodler.cpp b/src/smt/theory_str_noodler/theory_str_noodler.cpp
@@ -775,10 +775,6 @@ namespace smt::noodler {
             }
         );
 
-        bool contains_word_equations = !this->m_word_eq_todo_rel.empty();
-        bool contains_word_disequations = !this->m_word_diseq_todo_rel.empty();
-        bool contains_conversions = !this->m_conversion_todo.empty();
-
         // Solve Language (dis)equations
         if (!solve_lang_eqs_diseqs()) {
             // one of the (dis)equations is unsat
@@ -799,6 +795,10 @@ namespace smt::noodler {
             }
         }
 
+        bool contains_word_equations = !this->m_word_eq_todo_rel.empty();
+        bool contains_word_disequations = !this->m_word_diseq_todo_rel.empty();
+        bool contains_conversions = !this->m_conversion_todo.empty();
+
         // As a heuristic, for the case we have exactly one constraint, which is of type 'x notin RE', we use universality
         // checking instead of constructing the automaton for complement of RE. The complement can sometimes blow up, so
         // universality checking should be faster.
@@ -811,6 +811,15 @@ namespace smt::noodler {
             }
         }
 
+        if (is_mult_membership_suitable()) {
+            lbool result = run_mult_membership_heur();
+            if(result == l_true) {
+                return FC_DONE;
+            } else if(result == l_false) {
+                return FC_CONTINUE;
+            }
+        }
+
         // Gather relevant word (dis)equations to noodler formula
         Formula instance = get_word_formula_from_relevant();
         STRACE("str",

diff --git a/src/smt/theory_str_noodler/theory_str_noodler.h b/src/smt/theory_str_noodler/theory_str_noodler.h
@@ -368,6 +368,16 @@ namespace smt::noodler {
          */
         bool is_underapprox_suitable(const Formula& instance, const AutAssignment& aut_ass, const std::vector<TermConversion>& convs) const;
 
+        /**
+         * @brief Checks if the relevant predicates are suitable for multiple membership heuristic
+         * 
+         * Multiple membership heuristic is used when we have formula containing only memberships.
+         * We then only need to compute intersection from the regular languages and check if it is not empty.
+         * The heuristics sorts the regexes by expected complexity of computing nfa, and iteratively computes
+         * the intersection, so that if the formula is unsat, we do not need to build all automata.
+         */
+        bool is_mult_membership_suitable();
+
         /**
          * @brief Wrapper for running the Nielsen transformation.
          * 
@@ -385,6 +395,15 @@ namespace smt::noodler {
          */
         lbool run_membership_heur();
 
+        /**
+         * @brief Wrapper for running the mulitple membership query heuristics.
+         * 
+         * Check is_mult_membership_suitable() for explanation.
+         * 
+         * @return lbool Outcome of the heuristic procedure.
+         */
+        lbool run_mult_membership_heur();
+
         /**
          * @brief Wrapper for running the loop protection.
          * 

diff --git a/src/smt/theory_str_noodler/theory_str_noodler_final_check.cpp b/src/smt/theory_str_noodler/theory_str_noodler_final_check.cpp
@@ -535,6 +535,112 @@ namespace smt::noodler {
         return l_undef;
     }
 
+    bool theory_str_noodler::is_mult_membership_suitable() {
+        if (!this->m_conversion_todo.empty() || !this->m_not_contains_todo_rel.empty()) {
+            return false;
+        }
+
+        // TODO handle (string_literal in RE) and also length vars, instead of giving up
+        for (const auto& membership: m_membership_todo_rel) {
+            if(m_util_s.str.is_string(to_app(std::get<0>(membership))) || len_vars.contains(std::get<0>(membership))) {
+                return false;
+            }
+        }
+
+        // we now check if we only have "trivial" (dis)equations, i.e. they can only be of form
+        //    x == str_literal or x != str_literal
+        // i.e., one variable on the left, a string literal on the right
+
+        for (const auto& diseq : m_word_diseq_todo_rel) {
+            if (!(util::is_str_variable(diseq.first, m_util_s) && m_util_s.str.is_string(diseq.second))
+                || len_vars.contains(diseq.first)) // TODO handle length vars in run_mult_membership_heur() by using the lengths from the final intersection automaton
+            {
+                return false;
+            }
+        }
+
+        for (const auto& eq : m_word_eq_todo_rel) {
+            if (!(util::is_str_variable(eq.first, m_util_s) && m_util_s.str.is_string(eq.second))
+                || len_vars.contains(eq.first)) // TODO handle length vars in run_mult_membership_heur() by using the lengths from the final intersection automaton
+            {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    lbool theory_str_noodler::run_mult_membership_heur() {
+        STRACE("str", tout << "trying multiple regex membership heuristic" << std::endl;);
+
+        regex::Alphabet alph(get_symbols_from_relevant());
+        // to each var x we map all the regexes RE where we have (x in RE) + bool that is true if it is (x not in RE)
+        std::map<BasicTerm, std::vector<std::pair<bool,app*>>> var_to_list_of_regexes_and_complement_flag;
+
+        // collect from relevant memberships
+        for (const auto &membership: m_membership_todo_rel) {
+            BasicTerm var(BasicTermType::Variable, to_app(std::get<0>(membership))->get_decl()->get_name().str());
+            app* reg = to_app(std::get<1>(membership));
+            var_to_list_of_regexes_and_complement_flag[var].push_back(std::make_pair(!std::get<2>(membership), reg));
+        }
+
+        // we assume that is_mult_membership_heur was run before, therefore we have only disequations
+        //   x != str_literal
+        // i.e., one var on left and some string literal on right, we can replace this with (x not in {str_literal})
+        for (const auto& diseq : m_word_diseq_todo_rel) {
+            BasicTerm var(BasicTermType::Variable, to_app(diseq.first)->get_decl()->get_name().str());
+            app* reg = to_app(diseq.second);
+            var_to_list_of_regexes_and_complement_flag[var].push_back(std::make_pair(true, reg));
+        }
+
+        // we assume that is_mult_membership_heur was run before, therefore we have only equations
+        //   x == str_literal
+        // i.e., one var on left and some string literal on right, we can replace this with (x in {str_literal})
+        for (const auto& eq : m_word_eq_todo_rel) {
+            BasicTerm var(BasicTermType::Variable, to_app(eq.first)->get_decl()->get_name().str());
+            app* reg = to_app(eq.second);
+            var_to_list_of_regexes_and_complement_flag[var].push_back(std::make_pair(false, reg));
+        }
+
+        for (auto& [var, list_of_regexes] : var_to_list_of_regexes_and_complement_flag) {
+            // sort the regexes using get_loop_sum, where those regexes that needs to be complemented should all be at the end
+            std::sort(list_of_regexes.begin(), list_of_regexes.end(), [this](const std::pair<bool,app*>& l, const std::pair<bool,app*>& r) {
+                return ((!l.first && r.first) | (regex::get_loop_sum(l.second, m_util_s) < regex::get_loop_sum(r.second, m_util_s)));
+            });
+            STRACE("str-mult-memb-heur",
+                tout << "Sorted NFAs for var " << var << std::endl;
+                unsigned i = 0;
+                for (const auto & [is_complement, nfa] : list_of_regexes) {
+                    tout << i << " (" << (is_complement ? "" : "not ") <<"complemented):" << std::endl;
+                    tout << nfa << std::endl;
+                }
+            );
+
+            std::shared_ptr<mata::nfa::Nfa> intersection = nullptr; // we save the intersected automata here
+            for (auto& [is_complement, reg] : list_of_regexes) {
+                STRACE("str", tout << "building intersection for var " << var << " and regex " << mk_pp(reg, m) << (is_complement ? " that needs to be first complemented" : " that does not need to be first complemented") << std::endl;);
+
+                std::shared_ptr<mata::nfa::Nfa> nfa = std::make_shared<mata::nfa::Nfa>(regex::conv_to_nfa(reg, m_util_s, m, alph, is_complement, is_complement));
+
+                if (intersection == nullptr) {
+                    intersection = nfa; // this is first nfa
+                } else {
+                    intersection = std::make_shared<mata::nfa::Nfa>(mata::nfa::reduce(mata::nfa::intersection(*nfa, *intersection)));
+                }
+                nfa = nullptr;
+
+                if (intersection->is_lang_empty()) {
+                    STRACE("str", tout << "intersection is empty => UNSAT" << std::endl;);
+                    block_curr_len(expr_ref(this->m.mk_false(), this->m));
+                    return l_false;
+                }
+            }
+        }
+
+        STRACE("str", tout << "intersection is not empty => SAT" << std::endl;);
+        return l_true;
+    }
+
     lbool theory_str_noodler::run_loop_protection() {
         expr_ref refine = construct_refinement();
         if (refine != nullptr) {