Fix column ordering in Python, null handling for computed columns #907

Merged: 4 commits, Feb 8, 2020
117 changes: 89 additions & 28 deletions cpp/perspective/src/cpp/computed_function.cpp

Large diffs are not rendered by default.

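The computed_function.cpp diff is too large to render here, but the "null handling for computed columns" half of this PR can be read as one rule, judging from the title and the accompanying computed.js tests: a computed cell is null whenever any of its inputs is null, rather than raising or producing garbage. A minimal Python sketch of that rule, assuming those semantics (the helper compute_column is hypothetical, not Perspective's API):

    def compute_column(func, *input_columns):
        # Hypothetical sketch: emit None for a row if any input cell is None,
        # otherwise apply the computation to the row's values.
        return [
            None if any(v is None for v in row) else func(*row)
            for row in zip(*input_columns)
        ]

    # Addition with nulls propagated instead of raising TypeError:
    assert compute_column(lambda x, y: x + y, [1, None, 3], [4, 5, None]) == [5, None, None]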
@@ -23,7 +23,7 @@ var Borders = cellRenderersRegistry.BaseClass.extend("Borders", {
        var color;

        gc.save();
-       gc.translate(-0.5, 0.5); // paint "sharp" lines on pixels instead of "blury" lines between pixels
+       gc.translate(-0.5, 0.5); // paint "sharp" lines on pixels instead of "blurry" lines between pixels
        gc.cache.lineWidth = 1;

        color = config.borderTop;
988 changes: 905 additions & 83 deletions packages/perspective/test/js/computed.js

Large diffs are not rendered by default.

25 changes: 20 additions & 5 deletions python/perspective/perspective/src/numpy.cpp
@@ -600,17 +600,32 @@ namespace numpy {
     */
    std::vector<std::string>
    NumpyLoader::make_names() {
-       auto names = py::list(m_accessor.attr("data")().attr("keys")());
-       return names.cast<std::vector<std::string>>();
+       auto data = m_accessor.attr("data")();
+       auto py_names = m_accessor.attr("names")().cast<std::vector<std::string>>();
+
+       // Match names to the dataset - only keep names that are present in the
+       // dataset. The `m_names` variable is used internally to access the numpy
+       // arrays containing each column. On first-time load, `m_names` contains
+       // every name in the dataset. On update, `m_names` is recalculated to
+       // include only the columns present in the update dataset.
+       std::vector<std::string> names;
+       for (const auto& name : py_names) {
+           if (data.contains(py::str(name))) {
+               names.push_back(name);
+           }
+       }
+
+       return names;
    }
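In Python terms, the new make_names is equivalent to the sketch below: keep the accessor's declared column order, but retain only the names present in the (possibly partial) dataset. This is an illustrative re-expression, not the bound C++ function:

    def make_names(accessor_names, data):
        # `accessor_names` is the full, ordered list of column names;
        # `data` maps column name -> numpy array for this load or update.
        return [name for name in accessor_names if name in data]

    # First-time load: every column is present. Partial update: only a subset.
    assert make_names(["a", "b", "c"], {"a": [1], "c": [3]}) == ["a", "c"]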

    std::vector<t_dtype>
    NumpyLoader::make_types() {
        std::vector<t_dtype> rval;

-       py::list arrays = m_accessor.attr("data")().attr("values")();
-       for (const auto& a : arrays) {
-           py::array array = py::array::ensure(a);
+       auto data = m_accessor.attr("data")();
+       for (const auto& name : m_names) {
+           // Access each array by name to guarantee ordered access.
+           py::array array = py::array::ensure(data[py::str(name)]);

            if (!array) {
                PSP_COMPLAIN_AND_ABORT("Perspective does not support the mixing of ndarrays and lists.");
14 changes: 13 additions & 1 deletion python/perspective/perspective/src/table.cpp
@@ -113,8 +113,14 @@ std::shared_ptr<Table> make_table_py(t_val table, t_data_accessor accessor, t_va
         * not created from a DataFrame, the "index" column would not exist.
         */
        if (is_numpy) {
+           // `numpy_loader`'s `m_names` and `m_types` contain only the
+           // column names and data types present in the update dataset,
+           // not the names/types of the entire `Table`.
            numpy_loader.init();
        }

+       // `column_names` and `data_types` contain every column in the
+       // dataset, as well as `__INDEX__` if it exists.
        column_names = accessor.attr("names")().cast<std::vector<std::string>>();
        data_types = accessor.attr("types")().cast<std::vector<t_dtype>>();
    } else if (is_numpy) {
@@ -123,9 +123,15 @@ std::shared_ptr<Table> make_table_py(t_val table, t_data_accessor accessor, t_va
         * Perspective. Using `get_data_types` allows us to know the type of an array with `dtype=object`.
         */
        numpy_loader.init();

+       // This will contain every column in the dataset, as the first-time
+       // data load path does not mutate the `names` property of `accessor`.
        column_names = numpy_loader.names();

-       // composite array and inferred `data_types` for the Table
+       // Infer a data type for each column, then use a composite of the numpy
+       // dtype, the inferred `t_dtype`, and the stringified numpy dtype to get
+       // the final, canonical data type mapping.
        std::vector<t_dtype> inferred_types = get_data_types(accessor.attr("data")(), 1, column_names, accessor.attr("date_validator")().cast<t_val>());
        data_types = numpy_loader.reconcile_dtypes(inferred_types);
    } else {
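A note on why the first-time numpy path needs two typing steps: numpy reports dtype=object for mixed or string columns, so a concrete type must be inferred from the values and then reconciled with the raw numpy dtype. The sketch below illustrates only the inference half; infer_type is an assumed stand-in, not the real get_data_types:

    import numpy as np

    def infer_type(array):
        # Assumed stand-in for type inference on a single column.
        if array.dtype != np.dtype("object"):
            return array.dtype.name              # concrete dtype: trust numpy
        for value in array:
            if value is not None:
                return type(value).__name__      # first non-null value decides
        return "str"                             # all-null object column: assumed fallback

    assert infer_type(np.array([1, 2, 3], dtype="int64")) == "int64"
    assert infer_type(np.array(["a", None], dtype=object)) == "str"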
18 changes: 10 additions & 8 deletions python/perspective/perspective/src/view.cpp
@@ -120,16 +120,18 @@ make_view_config(const t_schema& schema, t_val date_parser, t_val config) {

    // to preserve order, do not cast to std::map - use keys and python 3.7's guarantee that dicts respect insertion order
    auto p_aggregates = py::dict(config.attr("get_aggregates")());
-   auto aggregate_keys = py::list(p_aggregates.attr("keys")());
    tsl::ordered_map<std::string, std::vector<std::string>> aggregates;

-   for (auto& key : aggregate_keys) {
-       const std::string key_str = key.cast<std::string>();
-       if (py::isinstance<py::str>(p_aggregates[key])) {
-           std::vector<std::string> agg{p_aggregates[key].cast<std::string>()};
-           aggregates[key_str] = agg;
-       } else {
-           aggregates[key_str] = p_aggregates[key].cast<std::vector<std::string>>();
+   for (auto& column : columns) {
+       py::str py_column_name = py::str(column);
+       if (p_aggregates.contains(py_column_name)) {
+           if (py::isinstance<py::str>(p_aggregates[py_column_name])) {
+               std::vector<std::string> agg{
+                   p_aggregates[py_column_name].cast<std::string>()};
+               aggregates[column] = agg;
+           } else {
+               aggregates[column] = p_aggregates[py_column_name].cast<std::vector<std::string>>();
+           }
        }
    };

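The ordering fix reads naturally as a Python sketch: iterate the view's columns, which carry the user-specified order, instead of the aggregates dict's own keys, and normalize scalar aggregates to one-element lists. make_aggregates below is illustrative, not the pybind11 binding (the aggregate values are example data):

    def make_aggregates(columns, p_aggregates):
        aggregates = {}                  # dicts preserve insertion order (3.7+)
        for column in columns:           # column order drives output order
            if column in p_aggregates:
                agg = p_aggregates[column]
                aggregates[column] = [agg] if isinstance(agg, str) else list(agg)
        return aggregates

    result = make_aggregates(["b", "a"], {"a": "sum", "b": ["weighted mean", "x"]})
    assert list(result) == ["b", "a"]    # output follows column order, not dict order
    assert result["a"] == ["sum"]        # scalar aggregate normalized to a list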
23 changes: 11 additions & 12 deletions python/perspective/perspective/table/_accessor.py
@@ -43,41 +43,44 @@ def _type_to_format(data_or_schema):
        - 0: records (:obj:`list` of :obj:`dict`)
        - 1: columns (:obj:`dict` of :obj:`str` to :obj:`list`)
        - 2: schema (dict[str]/dict[type])
+       :obj:`list`: column names
        (:obj:`list`/:obj:`dict`): processed data
    '''
    if isinstance(data_or_schema, list):
        # records
-       return False, 0, data_or_schema
+       names = list(data_or_schema[0].keys()) if len(data_or_schema) > 0 else []
+       return False, 0, names, data_or_schema
    elif isinstance(data_or_schema, dict):
        # schema or columns
        for v in data_or_schema.values():
            if isinstance(v, type) or isinstance(v, str):
                # schema maps name -> type
-               return False, 2, data_or_schema
+               return False, 2, list(data_or_schema.keys()), data_or_schema
            elif isinstance(v, list):
                # a dict of iterables = type 1
-               return False, 1, data_or_schema
+               return False, 1, list(data_or_schema.keys()), data_or_schema
            else:
                # see if iterable
                try:
                    iter(v)
                except TypeError:
                    raise NotImplementedError("Cannot load dataset of non-iterable type: Data passed in through a dict must be of type `list` or `numpy.ndarray`.")
                else:
-                   return isinstance(v, numpy.ndarray), 1, data_or_schema
+                   return isinstance(v, numpy.ndarray), 1, list(data_or_schema.keys()), data_or_schema
    elif isinstance(data_or_schema, numpy.ndarray):
        # structured or record array
        if not isinstance(data_or_schema.dtype.names, tuple):
            raise NotImplementedError("Data should be dict of numpy.ndarray or a structured array.")
-       return True, 1, _flatten_structure(data_or_schema)
+       flattened = _flatten_structure(data_or_schema)
+       return True, 1, list(flattened.keys()), flattened
    else:
        if not (isinstance(data_or_schema, pandas.DataFrame) or isinstance(data_or_schema, pandas.Series)):
            # pandas is not installed, or data is not a dataframe or series
            raise NotImplementedError("Data must be dataframe, dict, list, numpy.recarray, or a numpy structured array.")
        else:
            # flatten column/index multiindex
            df, _ = deconstruct_pandas(data_or_schema)
-           return True, 1, {c: df[c].values for c in df.columns}
+           return True, 1, df.columns.tolist(), {c: df[c].values for c in df.columns}


class _PerspectiveAccessor(object):
@@ -88,18 +91,13 @@ class _PerspectiveAccessor(object):
    INTEGER_TYPES = six.integer_types + (numpy.integer,)

    def __init__(self, data_or_schema):
-       self._is_numpy, self._format, self._data_or_schema = _type_to_format(data_or_schema)
+       self._is_numpy, self._format, self._names, self._data_or_schema = _type_to_format(data_or_schema)
        self._date_validator = _PerspectiveDateValidator()
        self._row_count = \
            len(self._data_or_schema) if self._format == 0 else \
            len(max(self._data_or_schema.values(), key=len)) if self._format == 1 else \
            0

-       if isinstance(self._data_or_schema, list):
-           self._names = list(self._data_or_schema[0].keys()) if len(self._data_or_schema) > 0 else []
-       elif isinstance(self._data_or_schema, dict):
-           self._names = list(self._data_or_schema.keys())
-
        self._types = []

        # Verify that column names are strings, and that numpy arrays are of
@@ -115,6 +113,7 @@ def __init__(self, data_or_schema):
                raise PerspectiveError("Mixed datasets of numpy.ndarray and lists are not supported.")

            dtype = array.dtype
+
            if name == "index" and isinstance(data_or_schema.index, pandas.DatetimeIndex):
                # use the index of the original, unflattened dataframe
                dtype = _parse_datetime_index(data_or_schema.index)
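With these changes, _type_to_format returns a 4-tuple of (is_numpy, format, names, data), so column names are computed once, at load time, in the order the dataset declares them. A quick illustrative check of the dict-of-lists path:

    is_numpy, fmt, names, data = _type_to_format({"a": [1, 2], "b": [3, 4]})
    assert (is_numpy, fmt, names) == (False, 1, ["a", "b"])   # dict of lists is format 1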
50 changes: 50 additions & 0 deletions python/perspective/perspective/tests/table/test_table_numpy.py
@@ -28,6 +28,26 @@ def test_table_int(self):
            "b": [4, 5, 6]
        }

+    def test_table_int_lots_of_columns(self):
+        data = {
+            "a": np.array([1, 2, 3]),
+            "b": np.array([4, 5, 6]),
+            "c": np.array([4, 5, 6]),
+            "d": np.array([4, 5, 6]),
+            "e": np.array([4, 5, 6]),
+            "f": np.array([4, 5, 6]),
+        }
+        tbl = Table(data)
+        assert tbl.size() == 3
+        assert tbl.view().to_dict() == {
+            "a": [1, 2, 3],
+            "b": [4, 5, 6],
+            "c": [4, 5, 6],
+            "d": [4, 5, 6],
+            "e": [4, 5, 6],
+            "f": [4, 5, 6]
+        }
+
    def test_table_int_with_None(self):
        data = {"a": np.array([1, 2, 3, None, None]), "b": np.array([4, 5, 6, None, None])}
        tbl = Table(data)
@@ -738,6 +758,36 @@ def test_table_numpy_from_schema_str(self):
        table.update(df)
        assert table.view().to_dict()["a"] == data

+    # partial update
+
+    def test_table_numpy_partial_update(self):
+        data = ["a", None, "b", None, "c"]
+        df = {"a": np.array([1, 2, 3, 4, 5]), "b": np.array(data), "c": np.array(data)}
+        table = Table(df, index="a")
+        table.update({
+            "a": np.array([2, 4, 5]),
+            "b": np.array(["x", "y", "z"])
+        })
+        assert table.view().to_dict() == {
+            "a": [1, 2, 3, 4, 5],
+            "b": ["a", "x", "b", "y", "z"],
+            "c": ["a", None, "b", None, "c"]
+        }
+
+    def test_table_numpy_partial_update_implicit(self):
+        data = ["a", None, "b", None, "c"]
+        df = {"a": np.array([1, 2, 3, 4, 5]), "b": np.array(data), "c": np.array(data)}
+        table = Table(df)
+        table.update({
+            "__INDEX__": np.array([1, 3, 4]),
+            "b": np.array(["x", "y", "z"])
+        })
+        assert table.view().to_dict() == {
+            "a": [1, 2, 3, 4, 5],
+            "b": ["a", "x", "b", "y", "z"],
+            "c": ["a", None, "b", None, "c"]
+        }
+
    # structured array

    def test_table_structured_array(self):
14 changes: 14 additions & 0 deletions python/perspective/perspective/tests/table/test_table_pandas.py
@@ -35,6 +35,20 @@ def test_table_dataframe(self):
            {"a": 3, "b": 4, "index": 1}
        ]

+    def test_table_dataframe_column_order(self):
+        d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
+        data = pd.DataFrame(d, columns=["b", "c", "a", "d"])
+        tbl = Table(data)
+        assert tbl.size() == 2
+        assert tbl.columns() == ["index", "b", "c", "a", "d"]
+
+    def test_table_dataframe_selective_column_order(self):
+        d = [{"a": 1, "b": 2, "c": 3, "d": 4}, {"a": 3, "b": 4, "c": 5, "d": 6}]
+        data = pd.DataFrame(d, columns=["b", "c", "a"])
+        tbl = Table(data)
+        assert tbl.size() == 2
+        assert tbl.columns() == ["index", "b", "c", "a"]
+
    def test_table_dataframe_does_not_mutate(self):
        # make sure we don't mutate the dataframe that a user passes in
        data = pd.DataFrame({