Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix corrupt string results in expressions #1589

Merged
merged 7 commits into from
Oct 24, 2021
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
new expression vocab with multiple pages
  • Loading branch information
sc1f authored and texodus committed Oct 21, 2021
commit 322ed150b8148a65de03e12dc36239cffbe687c0
1 change: 1 addition & 0 deletions cpp/perspective/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,7 @@ set (SOURCE_FILES
${PSP_CPP_SRC}/src/cpp/dense_tree.cpp
${PSP_CPP_SRC}/src/cpp/dependency.cpp
${PSP_CPP_SRC}/src/cpp/expression_tables.cpp
${PSP_CPP_SRC}/src/cpp/expression_vocab.cpp
${PSP_CPP_SRC}/src/cpp/extract_aggregate.cpp
${PSP_CPP_SRC}/src/cpp/filter.cpp
${PSP_CPP_SRC}/src/cpp/flat_traversal.cpp
Expand Down
12 changes: 6 additions & 6 deletions cpp/perspective/src/cpp/computed_expression.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ t_computed_expression::t_computed_expression(

void
t_computed_expression::compute(std::shared_ptr<t_data_table> source_table,
std::shared_ptr<t_data_table> destination_table, t_vocab& vocab) const {
std::shared_ptr<t_data_table> destination_table, t_expression_vocab& vocab) const {
// TODO: share symtables across pre/re/compute
exprtk::symbol_table<t_tscalar> sym_table;

Expand Down Expand Up @@ -209,7 +209,7 @@ t_computed_expression_parser::precompute(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
std::shared_ptr<t_schema> schema, t_vocab& vocab) {
std::shared_ptr<t_schema> schema, t_expression_vocab& vocab) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

Expand Down Expand Up @@ -237,7 +237,7 @@ t_computed_expression_parser::precompute(const std::string& expression_alias,
// nullptr as a string - use the empty string that we interned
// in the gnode's expression vocab in t_gnode::init() at idx 0
if (rval.m_type == DTYPE_STR) {
rval.set(vocab.unintern_c(0));
rval.set(vocab.get_empty_string());
rval.m_status = STATUS_INVALID;
}

Expand Down Expand Up @@ -271,7 +271,7 @@ t_computed_expression_parser::get_dtype(const std::string& expression_alias,
const std::string& expression_string,
const std::string& parsed_expression_string,
const std::vector<std::pair<std::string, std::string>>& column_ids,
const t_schema& schema, t_expression_error& error, t_vocab& vocab) {
const t_schema& schema, t_expression_error& error, t_expression_vocab& vocab) {
exprtk::symbol_table<t_tscalar> sym_table;
sym_table.add_constants();

Expand Down Expand Up @@ -311,7 +311,7 @@ t_computed_expression_parser::get_dtype(const std::string& expression_alias,
// nullptr as a string - use the empty string that we interned
// in the gnode's expression vocab in t_gnode::init() at idx 0
if (rval.m_type == DTYPE_STR) {
rval.set(vocab.unintern_c(0));
rval.set(vocab.get_empty_string());
rval.m_status = STATUS_INVALID;
}

Expand Down Expand Up @@ -419,7 +419,7 @@ t_validated_expression_map::get_expression_errors() const {
}

t_computed_function_store::t_computed_function_store(
t_vocab& vocab, bool is_type_validator)
t_expression_vocab& vocab, bool is_type_validator)
: m_day_of_week_fn(computed_function::day_of_week(vocab, is_type_validator))
, m_month_of_year_fn(
computed_function::month_of_year(vocab, is_type_validator))
Expand Down
56 changes: 23 additions & 33 deletions cpp/perspective/src/cpp/computed_function.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ namespace computed_function {
using float32 = float;
using float64 = double;

intern::intern(t_vocab& expression_vocab, bool is_type_validator)
intern::intern(t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("S")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
Expand All @@ -34,7 +34,7 @@ namespace computed_function {
// errors in strcmp().
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;
m_sentinel = sentinel;
}
Expand Down Expand Up @@ -63,19 +63,17 @@ namespace computed_function {
return m_sentinel;
}

// Intern the string into the vocabulary, and return the index of the
// string inside the vocabulary.
t_uindex interned = m_expression_vocab.get_interned(temp_str);
rval.set(m_expression_vocab.unintern_c(interned));
// Intern the string into the vocabulary.
rval.set(m_expression_vocab.intern(temp_str));
return rval;
}

concat::concat(t_vocab& expression_vocab, bool is_type_validator)
concat::concat(t_expression_vocab& expression_vocab, bool is_type_validator)
: m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;

m_sentinel = sentinel;
Expand Down Expand Up @@ -132,19 +130,17 @@ namespace computed_function {
return m_sentinel;
}

t_uindex interned = m_expression_vocab.get_interned(result);
rval.set(m_expression_vocab.unintern_c(interned));

rval.set(m_expression_vocab.intern(result));
return rval;
}

upper::upper(t_vocab& expression_vocab, bool is_type_validator)
upper::upper(t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("T")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;

m_sentinel = sentinel;
Expand Down Expand Up @@ -188,18 +184,18 @@ namespace computed_function {

boost::to_upper(temp_str);

t_uindex interned = m_expression_vocab.get_interned(temp_str);
rval.set(m_expression_vocab.unintern_c(interned));

rval.set(m_expression_vocab.intern(temp_str));
return rval;
}

lower::lower(t_vocab& expression_vocab, bool is_type_validator)
lower::lower(t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("T")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;

m_sentinel = sentinel;
Expand Down Expand Up @@ -242,8 +238,7 @@ namespace computed_function {

boost::to_lower(temp_str);

t_uindex interned = m_expression_vocab.get_interned(temp_str);
rval.set(m_expression_vocab.unintern_c(interned));
rval.set(m_expression_vocab.intern(temp_str));
return rval;
}

Expand Down Expand Up @@ -478,13 +473,13 @@ namespace computed_function {
"03 March", "04 April", "05 May", "06 June", "07 July", "08 August",
"09 September", "10 October", "11 November", "12 December"};

day_of_week::day_of_week(t_vocab& expression_vocab, bool is_type_validator)
day_of_week::day_of_week(t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("T")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;

m_sentinel = sentinel;
Expand Down Expand Up @@ -555,21 +550,18 @@ namespace computed_function {
result = days_of_week[(weekday - date::Sunday).count()];
}

// Intern the string pointer so it does not fall out of reference and
// cause a memory error.
t_uindex interned = m_expression_vocab.get_interned(result);
rval.set(m_expression_vocab.unintern_c(interned));
rval.set(m_expression_vocab.intern(result));
return rval;
}

month_of_year::month_of_year(
t_vocab& expression_vocab, bool is_type_validator)
t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("T")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;

m_sentinel = sentinel;
Expand Down Expand Up @@ -634,8 +626,7 @@ namespace computed_function {

// Intern the string pointer so it does not fall out of reference and
// cause a memory error.
t_uindex interned = m_expression_vocab.get_interned(result);
rval.set(m_expression_vocab.unintern_c(interned));
rval.set(m_expression_vocab.intern(result));
return rval;
}

Expand Down Expand Up @@ -1271,13 +1262,13 @@ namespace computed_function {
return rval;
}

to_string::to_string(t_vocab& expression_vocab, bool is_type_validator)
to_string::to_string(t_expression_vocab& expression_vocab, bool is_type_validator)
: exprtk::igeneric_function<t_tscalar>("T")
, m_expression_vocab(expression_vocab)
, m_is_type_validator(is_type_validator) {
t_tscalar sentinel;
sentinel.clear();
sentinel.set(m_expression_vocab.unintern_c(0));
sentinel.set(m_expression_vocab.get_empty_string());
sentinel.m_status = STATUS_INVALID;
m_sentinel = sentinel;
}
Expand Down Expand Up @@ -1309,8 +1300,7 @@ namespace computed_function {
return m_sentinel;
}

t_uindex interned = m_expression_vocab.get_interned(temp_str);
rval.set(m_expression_vocab.unintern_c(interned));
rval.set(m_expression_vocab.intern(temp_str));
return rval;
}

Expand Down
4 changes: 2 additions & 2 deletions cpp/perspective/src/cpp/context_grouped_pkey.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -688,7 +688,7 @@ t_ctx_grouped_pkey::get_column_dtype(t_uindex idx) const {

void
t_ctx_grouped_pkey::compute_expressions(
std::shared_ptr<t_data_table> flattened_masked, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> flattened_masked, t_expression_vocab& expression_vocab) {
// Clear the transitional expression tables on the context so they are
// ready for the next update.
m_expression_tables->clear_transitional_tables();
Expand All @@ -714,7 +714,7 @@ t_ctx_grouped_pkey::compute_expressions(std::shared_ptr<t_data_table> master,
std::shared_ptr<t_data_table> delta, std::shared_ptr<t_data_table> prev,
std::shared_ptr<t_data_table> current,
std::shared_ptr<t_data_table> transitions,
std::shared_ptr<t_data_table> existed, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> existed, t_expression_vocab& expression_vocab) {
// Clear the tables so they are ready for this round of updates
m_expression_tables->clear_transitional_tables();

Expand Down
4 changes: 2 additions & 2 deletions cpp/perspective/src/cpp/context_one.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -609,7 +609,7 @@ t_ctx1::get_trav_depth(t_index idx) const {

void
t_ctx1::compute_expressions(
std::shared_ptr<t_data_table> flattened_masked, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> flattened_masked, t_expression_vocab& expression_vocab) {
// Clear the transitional expression tables on the context so they are
// ready for the next update.
m_expression_tables->clear_transitional_tables();
Expand All @@ -636,7 +636,7 @@ t_ctx1::compute_expressions(std::shared_ptr<t_data_table> master,
std::shared_ptr<t_data_table> delta, std::shared_ptr<t_data_table> prev,
std::shared_ptr<t_data_table> current,
std::shared_ptr<t_data_table> transitions,
std::shared_ptr<t_data_table> existed, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> existed, t_expression_vocab& expression_vocab) {
// Clear the tables so they are ready for this round of updates
m_expression_tables->clear_transitional_tables();

Expand Down
4 changes: 2 additions & 2 deletions cpp/perspective/src/cpp/context_two.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1051,7 +1051,7 @@ t_ctx2::get_column_dtype(t_uindex idx) const {

void
t_ctx2::compute_expressions(
std::shared_ptr<t_data_table> flattened_masked, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> flattened_masked, t_expression_vocab& expression_vocab) {
// Clear the transitional expression tables on the context so they are
// ready for the next update.
m_expression_tables->clear_transitional_tables();
Expand All @@ -1078,7 +1078,7 @@ t_ctx2::compute_expressions(std::shared_ptr<t_data_table> master,
std::shared_ptr<t_data_table> delta, std::shared_ptr<t_data_table> prev,
std::shared_ptr<t_data_table> current,
std::shared_ptr<t_data_table> transitions,
std::shared_ptr<t_data_table> existed, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> existed, t_expression_vocab& expression_vocab) {
// Clear the tables so they are ready for this round of updates
m_expression_tables->clear_transitional_tables();

Expand Down
4 changes: 2 additions & 2 deletions cpp/perspective/src/cpp/context_zero.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -617,7 +617,7 @@ t_ctx0::get_step_delta(t_index bidx, t_index eidx) {

void
t_ctx0::compute_expressions(
std::shared_ptr<t_data_table> flattened_masked, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> flattened_masked, t_expression_vocab& expression_vocab) {
// Clear the transitional expression tables on the context so they are
// ready for the next update.
m_expression_tables->clear_transitional_tables();
Expand Down Expand Up @@ -645,7 +645,7 @@ t_ctx0::compute_expressions(std::shared_ptr<t_data_table> master,
std::shared_ptr<t_data_table> delta, std::shared_ptr<t_data_table> prev,
std::shared_ptr<t_data_table> current,
std::shared_ptr<t_data_table> transitions,
std::shared_ptr<t_data_table> existed, t_vocab& expression_vocab) {
std::shared_ptr<t_data_table> existed, t_expression_vocab& expression_vocab) {
// Clear the tables so they are ready for this round of updates
m_expression_tables->clear_transitional_tables();

Expand Down
2 changes: 1 addition & 1 deletion cpp/perspective/src/cpp/emscripten.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1470,7 +1470,7 @@ namespace binding {
std::vector<std::shared_ptr<t_computed_expression>> expressions;
expressions.reserve(js_expressions.size());

t_vocab& expression_vocab = *(gnode.get_expression_vocab());
t_expression_vocab& expression_vocab = *(gnode.get_expression_vocab());

// Will either abort() or succeed completely, and this isn't a public
// API so we can directly index for speed.
Expand Down
73 changes: 73 additions & 0 deletions cpp/perspective/src/cpp/expression_vocab.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
/******************************************************************************
*
* Copyright (c) 2019, the Perspective Authors.
*
* This file is part of the Perspective library, distributed under the terms of
* the Apache License 2.0. The full license can be found in the LICENSE file.
*
*/

#include <perspective/expression_vocab.h>

namespace perspective {

t_expression_vocab::t_expression_vocab()
: m_empty_string("") {
// at an assumed 64 bytes per string, we will either hit 1000 unique strings
// or 6400 bytes per vocab, whichever comes first.
m_max_vocab_num_strings = 1000;
m_max_vocab_size = m_max_vocab_num_strings * 64;

// Always start with one vocab
allocate_new_vocab();
}

const char*
t_expression_vocab::intern(const char* str) {
std::size_t bytelength = strlen(str);

if (m_current_vocab_size + bytelength + 1 > m_max_vocab_size
|| m_current_vocab_num_strings + 1 > m_max_vocab_num_strings) {
allocate_new_vocab();
}

t_vocab& current_vocab = *(m_vocabs[0]);
t_uindex interned_idx = current_vocab.get_interned(str);
return current_vocab.unintern_c(interned_idx);
}

const char*
t_expression_vocab::intern(const std::string& str) {
return intern(str.c_str());
}

void
t_expression_vocab::clear() {
m_vocabs.clear();
}

const char*
t_expression_vocab::get_empty_string() const {
return m_empty_string.c_str();
}

void
t_expression_vocab::pprint() const {
for (const std::shared_ptr<t_vocab>& vocab : m_vocabs) {
vocab->pprint_vocabulary();
}
}

void
t_expression_vocab::allocate_new_vocab() {
std::shared_ptr<t_vocab> vocab = std::make_shared<t_vocab>();
vocab->init(false);
vocab->reserve(m_max_vocab_size, m_max_vocab_num_strings);
m_vocabs.insert(m_vocabs.begin(), vocab);

// set the size counters back to 0
m_current_vocab_size = 0;
m_current_vocab_num_strings = 0;
}

} // end namespace perspective
Loading