Skip to content

Commit

Permalink
find(str, regex, vector), match(str, regex), fix #1550
Browse files Browse the repository at this point in the history
  • Loading branch information
sc1f committed Sep 24, 2021
1 parent 0448699 commit c4cbb0e
Show file tree
Hide file tree
Showing 10 changed files with 473 additions and 11 deletions.
10 changes: 10 additions & 0 deletions cpp/perspective/src/cpp/computed_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,12 @@ computed_function::to_string
t_computed_expression_parser::TO_STRING_VALIDATOR_FN
= computed_function::to_string(nullptr);

// Single shared instance of the match() expression function. It is registered
// with the symbol tables under the name "match" via sym_table.add_function.
computed_function::match t_computed_expression_parser::MATCH_FN
= computed_function::match();

// Single shared instance of the find() expression function. It is registered
// with the symbol tables under the name "find" via sym_table.add_function.
computed_function::find t_computed_expression_parser::FIND_FN
= computed_function::find();

// Boolean scalar holding `true`, built once with mktscalar(true).
t_tscalar
t_computed_expression_parser::TRUE_SCALAR = mktscalar(true);

Expand Down Expand Up @@ -132,6 +138,8 @@ t_computed_expression_parser::FALSE_SCALAR = mktscalar(false);
sym_table.add_function("upper", upper_fn); \
sym_table.add_function("lower", lower_fn); \
sym_table.add_function("length", length_fn); \
sym_table.add_function("match", t_computed_expression_parser::MATCH_FN); \
sym_table.add_function("find", t_computed_expression_parser::FIND_FN); \
sym_table.add_function("string", to_string_fn); \
sym_table.add_reserved_function( \
"inrange", t_computed_expression_parser::INRANGE_FN); \
Expand Down Expand Up @@ -178,6 +186,8 @@ t_computed_expression_parser::FALSE_SCALAR = mktscalar(false);
"lower", t_computed_expression_parser::LOWER_VALIDATOR_FN); \
sym_table.add_function( \
"length", t_computed_expression_parser::LENGTH_VALIDATOR_FN); \
sym_table.add_function("match", t_computed_expression_parser::MATCH_FN); \
sym_table.add_function("find", t_computed_expression_parser::FIND_FN); \
sym_table.add_reserved_function( \
"inrange", t_computed_expression_parser::INRANGE_FN); \
sym_table.add_reserved_function( \
Expand Down
89 changes: 89 additions & 0 deletions cpp/perspective/src/cpp/computed_function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -407,6 +407,95 @@ namespace computed_function {
return rval;
}

// Constructs the match() function object. "TT" is the exprtk parameter
// sequence string passed to igeneric_function; operator() reads both
// parameters as scalars (the input string and the regex pattern).
match::match()
: exprtk::igeneric_function<t_tscalar>("TT") {}

// Nothing to release: the function object holds no state of its own.
match::~match() {}

/**
 * @brief Evaluate `match(string, regex)` for a single set of parameters.
 *
 * The result is true only when the whole input string matches the pattern
 * (`boost::regex_match` semantics), not merely a substring of it.
 *
 * @param parameters parameters[0] is the string to test and parameters[1]
 * is the regex pattern; both are read as scalars.
 * @return A scalar of type DTYPE_BOOL:
 *  - with status STATUS_CLEAR when either argument is not a string or is
 *    itself STATUS_CLEAR (type check failed);
 *  - the cleared boolean, unchanged, when either argument is not valid, or
 *    when the pattern cannot be compiled or matching is aborted by the
 *    regex engine;
 *  - true/false otherwise.
 */
t_tscalar match::operator()(t_parameter_list parameters) {
    t_tscalar rval;
    rval.clear();
    rval.m_type = DTYPE_BOOL;

    // Parameters already validated
    t_scalar_view str_view(parameters[0]);
    t_scalar_view pattern_view(parameters[1]);

    t_tscalar str = str_view();
    t_tscalar pattern_scalar = pattern_view();

    // Type check - only string arguments are accepted.
    if (
        (str.get_dtype() != DTYPE_STR || str.m_status == STATUS_CLEAR) ||
        (pattern_scalar.get_dtype() != DTYPE_STR
            || pattern_scalar.m_status == STATUS_CLEAR)) {
        rval.m_status = STATUS_CLEAR;
        return rval;
    }

    // Inside actual execution, a non-valid (null) input yields the cleared
    // boolean.
    if (!str.is_valid() || !pattern_scalar.is_valid()) return rval;

    const std::string search_string = str.to_string();

    try {
        // The pattern text is supplied by the expression author, so it may
        // be malformed: constructing boost::regex then throws
        // boost::regex_error (a std::runtime_error), and regex_match itself
        // throws std::runtime_error when matching becomes too complex.
        const boost::regex pattern(pattern_scalar.to_string());
        rval.set(boost::regex_match(search_string, pattern));
    } catch (const std::runtime_error&) {
        // Treat an unusable pattern like a null input instead of letting
        // the exception escape expression evaluation; rval is still the
        // cleared boolean because set() was never reached.
    }

    return rval;
}

// Constructs the find() function object. "TTV" is the exprtk parameter
// sequence string passed to igeneric_function; operator() reads the first
// two parameters as scalars (input string, regex pattern) and the third as
// a vector that receives the match indices.
find::find()
: exprtk::igeneric_function<t_tscalar>("TTV") {}

// Nothing to release: the function object holds no state of its own.
find::~find() {}

/**
 * @brief Evaluate `find(string, regex, vector)` for a single set of
 * parameters.
 *
 * Searches the input string for a substring matching the pattern
 * (`boost::regex_search` semantics) and writes the location of the match
 * into the caller-supplied vector.
 *
 * @param parameters parameters[0] is the string to search, parameters[1]
 * the regex pattern (both read as scalars), and parameters[2] the output
 * vector, which must hold at least two elements.
 * @return A scalar of type DTYPE_BOOL:
 *  - with status STATUS_CLEAR when either scalar argument is not a string
 *    or is itself STATUS_CLEAR, or when the vector has fewer than two
 *    elements (type check failed);
 *  - the cleared boolean, unchanged, when either scalar argument is not
 *    valid, or when the pattern cannot be compiled or searching is aborted
 *    by the regex engine;
 *  - true when a match was found, false otherwise.
 *
 * On a successful match output_vector[0] receives the start index and
 * output_vector[1] the inclusive end index of the match. When nothing is
 * found, or the pattern is unusable, both elements are set to mknone().
 */
t_tscalar find::operator()(t_parameter_list parameters) {
    t_tscalar rval;
    rval.clear();
    rval.m_type = DTYPE_BOOL;

    // Parameters already validated for number and type
    t_scalar_view str_view(parameters[0]);
    t_scalar_view pattern_view(parameters[1]);
    t_vector_view output_vector(parameters[2]);

    t_tscalar str_scalar = str_view();
    t_tscalar pattern_scalar = pattern_view();

    // Type check - only allow strings and an output vector of size >= 2
    if (
        (str_scalar.get_dtype() != DTYPE_STR
            || str_scalar.m_status == STATUS_CLEAR) ||
        (pattern_scalar.get_dtype() != DTYPE_STR
            || pattern_scalar.m_status == STATUS_CLEAR) ||
        output_vector.size() < 2) {
        rval.m_status = STATUS_CLEAR;
        return rval;
    }

    // Inside actual execution, break if the value is null
    if (!str_scalar.is_valid() || !pattern_scalar.is_valid()) return rval;

    // `results` stores iterators into `str`, so `str` must outlive it.
    const std::string str = str_scalar.to_string();
    boost::match_results<std::string::const_iterator> results;
    bool found = false;

    try {
        // The pattern text is supplied by the expression author, so it may
        // be malformed: constructing boost::regex then throws
        // boost::regex_error (a std::runtime_error), and regex_search
        // itself throws std::runtime_error when matching becomes too
        // complex.
        const boost::regex pattern(pattern_scalar.to_string());
        found = boost::regex_search(
            str, results, pattern, boost::match_default);
    } catch (const std::runtime_error&) {
        // Treat an unusable pattern like a null input instead of letting
        // the exception escape expression evaluation, and make sure the
        // output vector does not carry indices for a match that never
        // happened.
        output_vector[0] = mknone();
        output_vector[1] = mknone();
        return rval;
    }

    rval.set(found);

    if (!found || results.empty()) {
        output_vector[0] = mknone();
        output_vector[1] = mknone();
        return rval;
    }

    double start = static_cast<double>(results.position());
    double end = static_cast<double>(start + results.length() - 1);

    // A zero-length match gives end == start - 1. Clamp to `start` so the
    // inclusive end index never precedes the start index; this also covers
    // the position-0 case, where end would otherwise be negative.
    if (end < start) end = start;

    output_vector[0] = mktscalar(start);
    output_vector[1] = mktscalar(end);

    return rval;
}

// Constructs the hour_of_day() function object, passing the exprtk
// parameter sequence string "T" to igeneric_function.
// NOTE(review): presumably a single scalar (date/datetime) parameter -
// operator() is not visible here, confirm against it.
hour_of_day::hour_of_day()
: exprtk::igeneric_function<t_tscalar>("T") {}

Expand Down
10 changes: 9 additions & 1 deletion cpp/perspective/src/include/perspective/computed_expression.h
Original file line number Diff line number Diff line change
Expand Up @@ -129,23 +129,31 @@ class PERSPECTIVE_EXPORT t_computed_expression_parser {
// Applied to the parser
static std::size_t PARSER_COMPILE_OPTIONS;

// Instances of Exprtk functions
// Datetime functions
static computed_function::bucket BUCKET_FN;
static computed_function::hour_of_day HOUR_OF_DAY_FN;
static computed_function::day_of_week DAY_OF_WEEK_VALIDATOR_FN;
static computed_function::month_of_year MONTH_OF_YEAR_VALIDATOR_FN;

// String functions
static computed_function::intern INTERN_VALIDATOR_FN;
static computed_function::concat CONCAT_VALIDATOR_FN;
static computed_function::order ORDER_VALIDATOR_FN;
static computed_function::upper UPPER_VALIDATOR_FN;
static computed_function::lower LOWER_VALIDATOR_FN;
static computed_function::length LENGTH_VALIDATOR_FN;
static computed_function::match MATCH_FN;
static computed_function::find FIND_FN;

// Numeric functions
static computed_function::percent_of PERCENT_OF_FN;
static computed_function::inrange_fn INRANGE_FN;
static computed_function::min_fn MIN_FN;
static computed_function::max_fn MAX_FN;
static computed_function::is_null IS_NULL_FN;
static computed_function::is_not_null IS_NOT_NULL_FN;

// Type conversion functions
static computed_function::to_string TO_STRING_VALIDATOR_FN;
static computed_function::to_integer TO_INTEGER_FN;
static computed_function::to_float TO_FLOAT_FN;
Expand Down
38 changes: 32 additions & 6 deletions cpp/perspective/src/include/perspective/computed_function.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <perspective/data_table.h>
#include <perspective/exprtk.h>
#include <boost/algorithm/string.hpp>
#include <boost/regex.hpp>
#include <stdexcept>
#include <type_traits>
#include <date/date.h>
#include <tsl/hopscotch_set.h>
Expand All @@ -31,6 +32,7 @@ namespace computed_function {
// Aliases for the parameter type exprtk hands to generic functions and for
// the typed views (scalar / vector / string) used to read each parameter.
using t_generic_type =
    typename exprtk::igeneric_function<t_tscalar>::generic_type;
using t_scalar_view = typename t_generic_type::scalar_view;
using t_vector_view = typename t_generic_type::vector_view;
using t_string_view = typename t_generic_type::string_view;

#define STRING_FUNCTION_HEADER(NAME) \
Expand Down Expand Up @@ -58,6 +60,17 @@ namespace computed_function {
// Length of the string
STRING_FUNCTION_HEADER(length)

STRING_FUNCTION_HEADER(search)

// split(string, substring, output_vector) - writes results into output_vector
// which can be accessed in the expression, but not returned into the column.
// calling split() returns a boolean stating whether the operation succeeded.
STRING_FUNCTION_HEADER(split)

// substr(string, start_idx, end_idx)
STRING_FUNCTION_HEADER(substr)


/**
* @brief Given a string column and 1...n string parameters, generate a
* numeric column that will act as a custom sort order for the string
Expand All @@ -83,19 +96,32 @@ namespace computed_function {
t_tscalar m_sentinel;
};

/**
* @brief Given a string column and a non-regex string literal, check
* whether each row in the string column contains the string literal.
*/
STRING_FUNCTION_HEADER(contains)

// Declares a struct NAME deriving from exprtk::igeneric_function<t_tscalar>
// with a default constructor, a destructor and the call operator that exprtk
// invokes with the parameter list. The members are only declared here; their
// definitions are provided separately.
#define FUNCTION_HEADER(NAME) \
struct NAME : public exprtk::igeneric_function<t_tscalar> { \
NAME(); \
~NAME(); \
t_tscalar operator()(t_parameter_list parameters); \
};

/**
 * @brief match(string, regex) => True if the entirety of the string
 * matches regex, and False otherwise. Does not need a vocab as it
 * does not write a string to the output column.
 *
 * Both arguments must be strings; any other type fails the type check.
 * Matching is performed with boost::regex_match, so a pattern that only
 * matches part of the string yields False.
 */
FUNCTION_HEADER(match)

/**
 * @brief find(string, regex, vector) => True if any substring of string
 * matches regex, False otherwise. A vector of size 2 or greater MUST be
 * passed into the function in order to store the results.
 *
 * The search is performed with boost::regex_search. When a match is found,
 * vector[0] is set to the start index of the match and vector[1] to its
 * inclusive end index. When no match is found, both elements are set to
 * none.
 *
 * Usage:
 *
 * var x[2]; // vector to hold results
 * find("column", "abc", x) ? x[0] : null;
 */
FUNCTION_HEADER(find)

/**
* @brief Return the hour of the day the date/datetime belongs to.
*/
Expand Down
7 changes: 5 additions & 2 deletions packages/perspective/src/js/perspective.js
Original file line number Diff line number Diff line change
Expand Up @@ -1510,7 +1510,7 @@ export default function (Module) {
// parameter and does not work if that param is interned. TODO:
// this is clumsy and we should have a better way of handling it.
parsed_expression_string = parsed_expression_string.replace(
/bucket\(.*?, (intern\(\'([smhDWMY])\'\))\)/g,
/bucket\(.*?, *(intern\(\'([smhDWMY])\'\))\)/g,
(match, full, value) => {
return `${match.substr(0, match.indexOf(full))}'${value}')`;
}
Expand Down Expand Up @@ -1559,7 +1559,10 @@ export default function (Module) {
* // {'"Sales" + "Profit"': "float"}
* console.log(results.expression_schema);
*
* // {"invalid": "unknown token!", "1 + 'string'": "TypeError"}
* // {
* // "invalid": {column: 0, line: 0, error_message: "unknown token!"},
* // "1 + 'string'": {column: 0, line: 0, error_message: "Type Error"}
* // }
* console.log(results.errors);
*/
table.prototype.validate_expressions = function (
Expand Down
8 changes: 8 additions & 0 deletions packages/perspective/test/js/expressions/functionality.js
Original file line number Diff line number Diff line change
Expand Up @@ -2784,6 +2784,10 @@ module.exports = (perspective) => {
'upper("c")',
`bucket("b", 'M')`,
`bucket("b", 's')`,
`bucket("b",'M')`,
`bucket("b",'s')`,
`bucket("b", 'M')`,
`bucket("b", 's')`,
],
});
const schema = await view.expression_schema();
Expand All @@ -2796,6 +2800,10 @@ module.exports = (perspective) => {
'upper("c")': "string",
"bucket(\"b\", 'M')": "date",
"bucket(\"b\", 's')": "datetime",
"bucket(\"b\",'M')": "date",
"bucket(\"b\",'s')": "datetime",
"bucket(\"b\", 'M')": "date",
"bucket(\"b\", 's')": "datetime",
});
view.delete();
table.delete();
Expand Down
Loading

0 comments on commit c4cbb0e

Please sign in to comment.