Skip to content

Commit

Permalink
add basic regex functions
Browse files Browse the repository at this point in the history
Signed-off-by: Nikolaj Bjorner <nbjorner@microsoft.com>
  • Loading branch information
NikolajBjorner committed Nov 26, 2024
1 parent b143a95 commit fce377e
Show file tree
Hide file tree
Showing 2 changed files with 167 additions and 26 deletions.
180 changes: 155 additions & 25 deletions src/ast/sls/sls_seq_plugin.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,11 +74,16 @@ Revert bias on long strings:
- bake in bias for shorter strings into equation solving?
Equality solving using stochastic Nelson.
- When solving for an equality w = v, first convert them into two vectors by removing concatenations.
The updates are then performed on the arguments to concatenations and not the concatenations themselves.
This saves some amount of spurious work when pushing assignments down over concatenations, which is
what the current first version of the solver does.
- Given equality where current assignment does not satisfy it:
- Xw = v:
- let X' range over prefixes of X that matches v.
- non-deterministic set X <- strval0(X')
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a' matches prefix of strval0(v), and X' is longest prefix of X that matches v.
- non-deterministic set X <- strval0(X') + 'a' where strval0(X') + 'a'
matches prefix of strval0(v), and X' is longest prefix of X that matches v.
- If X fully matches a prefix of v, then, in addition to the rules above:
- consume constant character from strval0(X)w = v
- reveal the next variable to solve for.
Expand All @@ -90,6 +95,8 @@ Equality solving using stochastic Nelson.
#include "ast/sls/sls_seq_plugin.h"
#include "ast/sls/sls_context.h"
#include "ast/ast_pp.h"
#include "ast/rewriter/seq_rewriter.h"
#include "ast/rewriter/th_rewriter.h"


namespace sls {
Expand Down Expand Up @@ -258,7 +265,6 @@ namespace sls {
VERIFY(seq.str.is_contains(e, a, b));
if (seq.is_string(a->get_sort()))
return strval0(a).contains(strval0(b));

NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_PREFIX:
Expand All @@ -274,6 +280,11 @@ namespace sls {
NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_IN_RE:
VERIFY(seq.str.is_in_re(e, a, b));
if (seq.is_string(a->get_sort()))
return is_in_re(strval0(a), b);
NOT_IMPLEMENTED_YET();
break;
case OP_SEQ_NTH:
case OP_SEQ_NTH_I:
case OP_SEQ_NTH_U:
Expand Down Expand Up @@ -420,35 +431,24 @@ namespace sls {
}

void seq_plugin::repair_up(app* e) {

if (m.is_bool(e))
return;

if (seq.str.is_itos(e)) {
repair_up_str_itos(e);
if (is_value(e))
return;
}
if (seq.str.is_stoi(e)) {
if (seq.str.is_itos(e))
repair_up_str_itos(e);
else if (seq.str.is_stoi(e))
repair_up_str_stoi(e);
return;
}
if (seq.str.is_length(e)) {
else if (seq.str.is_length(e))
repair_up_str_length(e);
return;
}
if (seq.str.is_index(e)) {
else if (seq.str.is_index(e))
repair_up_str_indexof(e);
return;
}
if (seq.is_string(e->get_sort())) {
if (is_value(e))
return;
else if (seq.is_string(e->get_sort())) {
strval0(e) = strval1(e);
ctx.new_value_eh(e);
return;
}

verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
else
verbose_stream() << "repair up nyi: " << mk_bounded_pp(e, m) << "\n";
}

bool seq_plugin::repair_down(app* e) {
Expand All @@ -461,6 +461,7 @@ namespace sls {
if (m.is_eq(e))
return repair_down_eq(e);


NOT_IMPLEMENTED_YET();
return false;
}
Expand Down Expand Up @@ -621,6 +622,10 @@ namespace sls {
return repair_down_str_itos(e);
case OP_STRING_STOI:
return repair_down_str_stoi(e);
case OP_SEQ_IN_RE:
if (seq.is_string(to_app(e)->get_arg(0)->get_sort()))
return repair_down_in_re(e);
break;
case OP_STRING_UBVTOS:
case OP_STRING_SBVTOS:
case OP_STRING_TO_CODE:
Expand All @@ -639,8 +644,6 @@ namespace sls {
case OP_SEQ_FOLDLI:

case OP_SEQ_TO_RE:
case OP_SEQ_IN_RE:

case OP_RE_PLUS:
case OP_RE_STAR:
case OP_RE_OPTION:
Expand Down Expand Up @@ -679,7 +682,6 @@ namespace sls {
m_int_updates.push_back({ x, r, 1 });
else
m_int_updates.push_back({ x, rational(-1 - ctx.rand(10)), 1 });

return apply_update();
}

Expand Down Expand Up @@ -1137,4 +1139,132 @@ namespace sls {
return get_eval(e).is_value;
return m.is_value(e);
}

// Regular expressions

bool seq_plugin::is_in_re(zstring const& s, expr* r) {
expr_ref sval(seq.str.mk_string(s), m);
th_rewriter rw(m);
expr_ref in_re(seq.re.mk_in_re(sval, r), m);
rw(in_re);
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
return m.is_true(in_re);
}

bool seq_plugin::repair_down_in_re(app* e) {
expr* x, * y;
VERIFY(seq.str.is_in_re(e, x, y));
auto info = seq.re.get_info(y);
if (!info.interpreted)
return false;
auto s = strval0(x);
expr_ref xval(seq.str.mk_string(s), m);
expr_ref in_re(seq.re.mk_in_re(xval, y), m);
th_rewriter rw(m);
rw(in_re);
SASSERT(m.limit().is_canceled() || m.is_true(in_re) || m.is_false(in_re));
if (m.is_true(in_re) == ctx.is_true(e))
return true;

if (is_value(x))
return false;

vector<zstring> conts;
expr_ref d_r(y, m);
seq_rewriter seqrw(m);
for (unsigned i = 0; i < s.length(); ++i) {
verbose_stream() << "Derivative " << s.extract(0, i) << ": " << d_r << "\n";
if (seq.re.is_empty(d_r))
break;
zstring prefix = s.extract(0, i);
choose(d_r, 2, prefix, conts);
expr_ref ch(seq.str.mk_char(s[i]), m);
d_r = seqrw.mk_derivative(ch, d_r);
}
if (!seq.re.is_empty(d_r))
choose(d_r, 2, s, conts);

verbose_stream() << "repair in_re " << mk_pp(e, m) << " " << s << "\n";
for (auto& str : conts)
verbose_stream() << "prefix " << str << "\n";

// TODO: do some length analysis to prune out short candidates when there are longer ones.
// TODO: when matching .*"bcd" with string ab, the extension abc is more interesting than aba.
if (ctx.is_true(e)) {
for (auto& str : conts)
m_str_updates.push_back({ x, str, 1 });
}
else {
for (auto& str : conts)
m_str_updates.push_back({ x, str + m_chars[ctx.rand(m_chars.size())], 1});
}
return apply_update();
}

void seq_plugin::next_char(expr* r, unsigned_vector& chars) {
SASSERT(seq.is_re(r));
expr* x, * y;
zstring s;
if (seq.re.is_concat(r, x, y)) {
auto info = seq.re.get_info(x);
next_char(x, chars);
if (info.nullable == l_true)
next_char(y, chars);
}
else if (seq.re.is_to_re(r, x)) {
if (seq.str.is_string(x, s) && !s.empty())
chars.push_back(s[0]);
}
else if (seq.re.is_union(r, x, y)) {
next_char(x, chars);
next_char(y, chars);
}
else if (seq.re.is_range(r, x, y)) {
zstring s1, s2;
seq.str.is_string(x, s1);
seq.str.is_string(y, s2);
if (s1.length() == 1 && s2.length() == 1 && s1[0] <= s2[0]) {
chars.push_back(s1[0] + ctx.rand(s2[0] - s1[0] + 1));
chars.push_back(s1[0]);
chars.push_back(s2[0]);
}
}
else if (seq.re.is_star(r, x) || seq.re.is_plus(r, x)) {
next_char(x, chars);
}
else if (seq.re.is_empty(r)) {
;
}
else if (seq.re.is_full_seq(r)) {
if (!m_chars.empty())
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
}
else if (seq.re.is_full_char(r)) {
if (!m_chars.empty())
chars.push_back(m_chars[ctx.rand(m_chars.size())]);
}
else {
verbose_stream() << "regex nyi " << mk_bounded_pp(r, m) << "\n";
NOT_IMPLEMENTED_YET();
}
}

void seq_plugin::choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result) {
auto info = seq.re.get_info(r);
result.push_back(prefix);
if (k == 0)
return;
unsigned_vector chars;
next_char(r, chars);
std::stable_sort(chars.begin(), chars.end());
auto it = std::unique(chars.begin(), chars.end());
chars.shrink((unsigned)(it - chars.begin()));
for (auto ch : chars) {
expr_ref c(seq.str.mk_char(ch), m);
seq_rewriter rw(m);
expr_ref r2 = rw.mk_derivative(c, r);
zstring prefix2 = prefix + zstring(ch);
choose(r2, k - 1, prefix2, result);
}
}
}
13 changes: 12 additions & 1 deletion src/ast/sls/sls_seq_plugin.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ namespace sls {
arith_util a;
scoped_ptr_vector<eval> m_values;
indexed_uint_set m_chars;
bool m_initialized = false;
bool m_initialized = false;

struct str_update {
expr* e;
Expand Down Expand Up @@ -81,13 +81,24 @@ namespace sls {
bool repair_down_str_suffixof(app* e);
bool repair_down_str_itos(app* e);
bool repair_down_str_stoi(app* e);
bool repair_down_in_re(app* e);

void repair_up_str_length(app* e);
void repair_up_str_indexof(app* e);
void repair_up_str_itos(app* e);
void repair_up_str_stoi(app* e);

// regex functionality

// enumerate set of strings that can match a prefix of regex r.
void choose(expr* r, unsigned k, zstring& prefix, vector<zstring>& result);

// enumerate set of possible next chars, including possibly sampling from m_chars for whild-cards.
void next_char(expr* r, unsigned_vector& chars);

bool is_in_re(zstring const& s, expr* r);

// access evaluation
bool is_seq_predicate(expr* e);

eval& get_eval(expr* e);
Expand Down

0 comments on commit fce377e

Please sign in to comment.