Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Improvements to Arrow updates and indexed columns #837

Merged
merged 13 commits into from
Dec 9, 2019
Prev Previous commit
Next Next commit
fix dictionary array reading in python
  • Loading branch information
sc1f committed Dec 7, 2019
commit 5c7f84d8741b29d62f3f11ce3b1f5d9c9067a03e
22 changes: 2 additions & 20 deletions cpp/perspective/src/cpp/vocab.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,13 +93,7 @@ t_vocab::get_interned(const char* s) {
} else {
idx = iter->second;
}
#ifndef PSP_ENABLE_WASM
#ifdef PSP_COLUMN_VERIFY
if (std::string(s) == "") {
PSP_VERBOSE_ASSERT(idx == 0, "Expected empty string to map to 0");
}
#endif
#endif

return idx;
}

Expand All @@ -115,9 +109,6 @@ t_vocab::init(bool from_recipe) {
if (from_recipe) {
rebuild_map();
}
#ifndef PSP_ENABLE_WASM
get_interned("");
#endif // PSP_ENABLE_WASM
}

t_uindex
Expand All @@ -133,16 +124,7 @@ t_vocab::verify() const {
rlookup[kv.second] = kv.first;
}

#ifndef PSP_ENABLE_WASM
auto zero = rlookup.find(t_uindex(0));
PSP_VERBOSE_ASSERT(zero, != rlookup.end(), "0 Not found");
PSP_VERBOSE_ASSERT(std::string(zero->second), == "", "0 mapped to unknown");
#endif

tsl::hopscotch_set<std::string> seen;
#ifndef PSP_ENABLE_WASM
seen.insert(std::string(""));
#endif

for (t_uindex idx = 1; idx < m_vlenidx; ++idx) {
std::stringstream ss;
Expand Down Expand Up @@ -193,7 +175,7 @@ void
t_vocab::pprint_vocabulary() const {
std::cout << "vocabulary =========\n";
for (t_uindex idx = 0; idx < m_vlenidx; ++idx) {
std::cout << "\t" << idx << " => " << unintern_c(idx) << std::endl;
std::cout << "\t" << idx << " => '" << unintern_c(idx) << "'" << std::endl;
}

std::cout << "end vocabulary =========\n";
Expand Down
2 changes: 1 addition & 1 deletion python/perspective/perspective/src/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,10 +159,10 @@ std::shared_ptr<Table> make_table_py(t_val table, t_data_accessor accessor, t_va
t_data_table data_table(output_schema);
data_table.init();
std::uint32_t row_count;

if (is_arrow) {
row_count = arrow_loader.row_count();
data_table.extend(arrow_loader.row_count());

arrow_loader.fill_table(data_table, index, offset, limit, is_update);
} else if (is_numpy) {
row_count = numpy_loader.row_count();
Expand Down
16 changes: 12 additions & 4 deletions python/perspective/perspective/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,14 +69,16 @@ def make_arrow(names, data, types=None, legacy=False):
return stream.getvalue().to_pybytes()

@staticmethod
def make_dictionary_arrow(names, data, legacy=False):
def make_dictionary_arrow(names, data, types=None, legacy=False):
"""Create an arrow binary that can be loaded and manipulated from memory, with
each column being a dictionary array of `str` values and `int` indices.

Args:
names (list): a list of str column names
data (list:tuple): a list of tuples, the first value being a list of indices,
and the second value being a list of values.
types (list:list:pyarrow.func): a list of lists, containing the indices type and
dictionary value type for each array.
legacy (bool): if True, use legacy IPC format (pre-pyarrow 0.15). Defaults to False.

Returns:
Expand All @@ -86,9 +88,15 @@ def make_dictionary_arrow(names, data, legacy=False):

arrays = []
for idx, column in enumerate(data):
# only apply types if array is present
indices = pa.array(column[0], type=pa.int64())
values = pa.array(column[1], type=pa.string())
indice_type = pa.int64()
value_type = pa.string()

if types is not None:
indice_type = types[idx][0]
value_type = types[idx][1]

indices = pa.array(column[0], type=indice_type)
values = pa.array(column[1], type=value_type)
parray = pa.DictionaryArray.from_arrays(indices, values)
arrays.append(parray)

Expand Down
100 changes: 97 additions & 3 deletions python/perspective/perspective/tests/table/test_table_arrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,6 @@ def test_table_arrow_loads_date64_file(self):
}
assert tbl.size() == 29

@mark.skip
def test_table_arrow_loads_dict_file(self):
with open(DICT_ARROW, mode='rb') as file: # b is important -> binary
tbl = Table(file.read())
Expand Down Expand Up @@ -205,8 +204,70 @@ def test_table_arrow_loads_string_stream(self, util):
"a": data[0]
}

@mark.skip
def test_table_arrow_loads_dictionary_stream(self, util):
def test_table_arrow_loads_dictionary_stream_int8(self, util):
data = [
([0, 1, 1, None], ["abc", "def"]),
([0, 1, None, 2], ["xx", "yy", "zz"])
]
types = [[pa.int8(), pa.string()]] * 2
arrow_data = util.make_dictionary_arrow(["a", "b"],
data,
types=types)
tbl = Table(arrow_data)

assert tbl.size() == 4
assert tbl.schema() == {
"a": str,
"b": str
}
assert tbl.view().to_dict() == {
"a": ["abc", "def", "def", None],
"b": ["xx", "yy", None, "zz"]
}

def test_table_arrow_loads_dictionary_stream_int16(self, util):
data = [
([0, 1, 1, None], ["abc", "def"]),
([0, 1, None, 2], ["xx", "yy", "zz"])
]
types = [[pa.int16(), pa.string()]] * 2
arrow_data = util.make_dictionary_arrow(["a", "b"],
data,
types=types)
tbl = Table(arrow_data)

assert tbl.size() == 4
assert tbl.schema() == {
"a": str,
"b": str
}
assert tbl.view().to_dict() == {
"a": ["abc", "def", "def", None],
"b": ["xx", "yy", None, "zz"]
}

def test_table_arrow_loads_dictionary_stream_int32(self, util):
data = [
([0, 1, 1, None], ["abc", "def"]),
([0, 1, None, 2], ["xx", "yy", "zz"])
]
types = [[pa.int32(), pa.string()]] * 2
arrow_data = util.make_dictionary_arrow(["a", "b"],
data,
types=types)
tbl = Table(arrow_data)

assert tbl.size() == 4
assert tbl.schema() == {
"a": str,
"b": str
}
assert tbl.view().to_dict() == {
"a": ["abc", "def", "def", None],
"b": ["xx", "yy", None, "zz"]
}

def test_table_arrow_loads_dictionary_stream_int64(self, util):
data = [
([0, 1, 1, None], ["abc", "def"]),
([0, 1, None, 2], ["xx", "yy", "zz"])
Expand All @@ -224,6 +285,39 @@ def test_table_arrow_loads_dictionary_stream(self, util):
"b": ["xx", "yy", None, "zz"]
}

def test_table_arrow_loads_dictionary_stream_nones(self, util):
data = [
([None, 0, 1, 2], ["", "abc", "def"])
]
arrow_data = util.make_dictionary_arrow(["a"], data)
tbl = Table(arrow_data)

assert tbl.size() == 4
assert tbl.schema() == {
"a": str
}
assert tbl.view().to_dict() == {
"a": [None, "", "abc", "def"]
}

@mark.skip
def test_table_arrow_loads_dictionary_stream_nones_indexed(self, util):
data = [
([1, None, 0, 2], ["", "abc", "def"]),
([2, 1, 0, None], ["", "hij", "klm"])
]
arrow_data = util.make_dictionary_arrow(["a", "b"], data)
tbl = Table(arrow_data, index="a")

assert tbl.schema() == {
"a": str,
"b": str
}
assert tbl.view().to_dict() == {
"a": [None, "", "abc", "def"],
"b": ["klm", "hij", "", None]
}

# legacy

def test_table_arrow_loads_int_legacy(self, util):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#
import numpy as np
from datetime import date, datetime
from pytest import mark
from perspective.table import Table


Expand Down Expand Up @@ -251,6 +252,7 @@ def test_update_np_nonseq_partial(self):
"b": ["a", "b", "c", "d"]
}

@mark.skip
def test_update_np_with_none_partial(self):
tbl = Table({
"a": [1, np.nan, 3],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import numpy as np
import pandas as pd
from datetime import date, datetime
from pytest import mark
from perspective.table import Table


Expand Down Expand Up @@ -213,6 +214,7 @@ def test_update_df_nonseq_partial(self):
"b": ["a", "b", "c", "d"]
}

@mark.skip
def test_update_df_with_none_partial(self):
tbl = Table({
"a": [1, np.nan, 3],
Expand Down