Merge pull request #821 from finos/py2

Default to int64 in Python3, add `long` and `unicode` to schema and type inference
finos · Nov 22, 2019 · 01e24b9 · 01e24b9
2 parents f1bfb55 + 572bc35
commit 01e24b9
Show file tree

Hide file tree

Showing 9 changed files with 190 additions and 29 deletions.
diff --git a/python/perspective/perspective/src/accessor.cpp b/python/perspective/perspective/src/accessor.cpp
@@ -74,10 +74,16 @@ infer_type(t_val x, t_val date_validator) {
     } else if (py::isinstance<py::bool_>(x) || type_string == "bool") {
         // booleans are both instances of bool_ and int_ -  check for bool first
         t = t_dtype::DTYPE_BOOL;
+    } else if (type_string == "long") {
+        t = t_dtype::DTYPE_INT64;
     } else if (py::isinstance<py::float_>(x)) {
         t = t_dtype::DTYPE_FLOAT64;
     } else if (py::isinstance<py::int_>(x)) {
-        t = t_dtype::DTYPE_INT32;
+        if (PY_MAJOR_VERSION < 3) {
+            t = t_dtype::DTYPE_INT32;
+        } else {
+            t = t_dtype::DTYPE_INT64;
+        }
     } else if (py::isinstance<py::str>(x) || type_string == "str") {
         t_dtype parsed_type = date_validator.attr("format")(x).cast<t_dtype>();
         if (parsed_type == t_dtype::DTYPE_DATE || parsed_type == t_dtype::DTYPE_TIME) {

diff --git a/python/perspective/perspective/src/fill.cpp b/python/perspective/perspective/src/fill.cpp
@@ -236,13 +236,13 @@ _fill_col_numeric(t_data_accessor accessor, t_data_table& tbl,
                 // inference checked the entire column/we could reset parsing.
                 double fval = item.cast<double>();
                 if (!is_update && (fval > 2147483647 || fval < -2147483648)) {
-                    WARN("Promoting %s to float from int32", name);
+                    WARN("Promoting column `%s` to float from int32", name);
                     tbl.promote_column(name, DTYPE_FLOAT64, i, true);
                     col = tbl.get_column(name);
                     type = DTYPE_FLOAT64;
                     col->set_nth(i, fval);
                 } else if (!is_update && isnan(fval)) {
-                    WARN("Promoting column %s to string from int32", name);
+                    WARN("Promoting column `%s` to string from int32", name);
                     tbl.promote_column(name, DTYPE_STR, i, false);
                     col = tbl.get_column(name);
                     _fill_col_string(
@@ -255,7 +255,7 @@ _fill_col_numeric(t_data_accessor accessor, t_data_table& tbl,
             case DTYPE_INT64: {
                 double fval = item.cast<double>();
                 if (!is_update && isnan(fval)) {
-                    WARN("Promoting %s to string from int64", name);
+                    WARN("Promoting column `%s` to string from int64", name);
                     tbl.promote_column(name, DTYPE_STR, i, false);
                     col = tbl.get_column(name);
                     _fill_col_string(
@@ -272,7 +272,7 @@ _fill_col_numeric(t_data_accessor accessor, t_data_table& tbl,
                 bool is_float = py::isinstance<py::float_>(item);
                 bool is_numpy_nan = is_float && npy_isnan(item.cast<double>());
                 if (!is_update && (!is_float || is_numpy_nan)) {
-                    WARN("Promoting column %s to string from float64", name);
+                    WARN("Promoting column `%s` to string from float64", name);
                     tbl.promote_column(name, DTYPE_STR, i, false);
                     col = tbl.get_column(name);
                     _fill_col_string(

diff --git a/python/perspective/perspective/src/numpy.cpp b/python/perspective/perspective/src/numpy.cpp
@@ -239,13 +239,13 @@ namespace numpy {
 
             double fval = item.cast<double>();
             if (fval > 2147483647 || fval < -2147483648) {
-                binding::WARN("Promoting %s to float from int32", name);
+                binding::WARN("Promoting column `%s` to float from int32", name);
                 tbl.promote_column(name, DTYPE_FLOAT64, i, true);
                 col = tbl.get_column(name);
                 type = DTYPE_FLOAT64;
                 col->set_nth(i, fval);
             } else if (isnan(fval)) {
-                binding::WARN("Promoting column %s to string from int32", name);
+                binding::WARN("Promoting column `%s` to string from int32", name);
                 tbl.promote_column(name, DTYPE_STR, i, false);
                 col = tbl.get_column(name);
                 fill_object_iter<std::string>(
@@ -276,7 +276,7 @@ namespace numpy {
 
             double fval = item.cast<double>();
             if (isnan(fval)) {
-                binding::WARN("Promoting %s to string from int64", name);
+                binding::WARN("Promoting column `%s` to string from int64", name);
                 tbl.promote_column(name, DTYPE_STR, i, false);
                 col = tbl.get_column(name);
                 fill_object_iter<std::string>(
@@ -308,7 +308,7 @@ namespace numpy {
             bool is_float = py::isinstance<py::float_>(item);
             bool is_numpy_nan = is_float && npy_isnan(item.cast<double>());
             if (!is_float || is_numpy_nan) {
-                binding::WARN("Promoting column %s to string from float64", name);
+                binding::WARN("Promoting column `%s` to string from float64", name);
                 tbl.promote_column(name, DTYPE_STR, i, false);
                 col = tbl.get_column(name);
                 fill_object_iter<std::string>(

diff --git a/python/perspective/perspective/src/utils.cpp b/python/perspective/perspective/src/utils.cpp
@@ -20,8 +20,8 @@ t_dtype type_string_to_t_dtype(std::string value, std::string name){
     auto type = t_dtype::DTYPE_STR;
 
     // TODO consider refactor
-    if (value == "int" || value == "integer") {
-        // Python int
+    if (value == "int" || value == "integer" || value == "int64" || value == "long") {
+        // Python int, long, and Numpy int64
         type = t_dtype::DTYPE_INT64;
     } else if (value == "int8") {
         // Numpy int8
@@ -32,9 +32,6 @@ t_dtype type_string_to_t_dtype(std::string value, std::string name){
     } else if (value == "int32") {
         // Numpy int32
         type = t_dtype::DTYPE_INT32;
-    } else if (value == "int64") {
-        // Numpy int64
-        type = t_dtype::DTYPE_INT64;
     } else if (value == "float") {
         // Python float
         type = t_dtype::DTYPE_FLOAT64;
@@ -53,7 +50,7 @@ t_dtype type_string_to_t_dtype(std::string value, std::string name){
         // TODO
         // Numpy float128
         type = t_dtype::DTYPE_FLOAT64;
-    } else if (value == "str" || value == "string") {
+    } else if (value == "str" || value == "string" || value == "unicode") {
         // Python unicode str
         type = t_dtype::DTYPE_STR;
     } else if (value == "bool" || value == "boolean") {

diff --git a/python/perspective/perspective/table/_accessor.py b/python/perspective/perspective/table/_accessor.py
@@ -55,14 +55,17 @@ def _type_to_format(data_or_schema):
             if isinstance(v, type) or isinstance(v, str):
                 # schema maps name -> type
                 return False, 2, data_or_schema
-            elif isinstance(v, list) or iter(v):
-                # if columns entries are iterable, type 1
-                return isinstance(v, numpy.ndarray), 1, data_or_schema
+            elif isinstance(v, list):
+                # a dict of iterables = type 1
+                return False, 1, data_or_schema
             else:
-                # Can't process
-                raise NotImplementedError("Dict values must be list or type!")
-        # Can't process
-        raise NotImplementedError("Dict values must be list or type!")
+                # Accept any other iterable (e.g. numpy.ndarray); reject non-iterable values
+                try:
+                    iter(v)
+                except TypeError:
+                    raise NotImplementedError("Cannot load dataset of non-iterable type: Data passed in through a dict must be of type `list` or `numpy.ndarray`.")
+                else:
+                    return isinstance(v, numpy.ndarray), 1, data_or_schema
     elif isinstance(data_or_schema, numpy.ndarray):
         # structured or record array
         if not isinstance(data_or_schema.dtype.names, tuple):
@@ -81,6 +84,8 @@ def _type_to_format(data_or_schema):
 class _PerspectiveAccessor(object):
     '''A uniform accessor that wraps data/schemas of varying formats with a common `marshal` function.'''
 
+    INTEGER_TYPES = six.integer_types + (numpy.integer,)
+
     def __init__(self, data_or_schema):
         self._is_numpy, self._format, self._data_or_schema = _type_to_format(data_or_schema)
         self._date_validator = _PerspectiveDateValidator()
@@ -187,7 +192,7 @@ def marshal(self, cidx, ridx, dtype):
                 # should be able to update int columns with either ints or floats
                 val = int(val)
         elif dtype == t_dtype.DTYPE_FLOAT32 or dtype == t_dtype.DTYPE_FLOAT64:
-            if not isinstance(val, bool) and isinstance(val, (int, numpy.integer)):
+            if not isinstance(val, bool) and isinstance(val, _PerspectiveAccessor.INTEGER_TYPES):
                 # should be able to update float columns with either ints or floats
                 val = float(val)
         elif dtype == t_dtype.DTYPE_DATE:
@@ -213,6 +218,7 @@ def marshal(self, cidx, ridx, dtype):
                     val = unicode(val)  # noqa: F821
                 else:
                     val = str(val)
+
         return val
 
     def _get_numpy_column(self, name):

diff --git a/python/perspective/perspective/tests/table/test_table.py b/python/perspective/perspective/tests/table/test_table.py
@@ -26,6 +26,13 @@ def test_empty_table(self):
         tbl = Table([])
         assert tbl.size() == 0
 
+    def test_table_not_iterable(self):
+        data = {
+            "a": 1
+        }
+        with raises(NotImplementedError):
+            Table(data)
+
     def test_table_int(self):
         data = [{"a": 1, "b": 2}, {"a": 3, "b": 4}]
         tbl = Table(data)
@@ -46,12 +53,68 @@ def test_table_int_column_names(self):
     def test_table_int_overflow(self):
         if six.PY2:
             maxint = sys.maxint + 1
+            # overflows into float
             data = {"a": [i for i in range(100)] + [maxint, maxint, maxint]}
             tbl = Table(data)
             # two promotions later
             assert tbl.schema() == {
-                "a": str
+                "a": float
+            }
+
+    def test_table_long(self):
+        if six.PY2:
+            # don't overflow in this test
+            data = [long(100), long(200), long(300)]  # noqa: F821
+            tbl = Table({
+                "a": data
+            })
+            assert tbl.schema() == {
+                "a": int
+            }
+            assert tbl.view().to_dict()["a"] == [int(d) for d in data]
+
+    def test_table_long_overflow(self):
+        if six.PY2:
+            maxint = sys.maxint
+            # values above sys.maxint overflow int64, so the column is promoted to float
+            data = [maxint, maxint + 1, maxint + 2]
+            tbl = Table({
+                "a": data
+            })
+            assert tbl.schema() == {
+                "a": float
+            }
+            assert tbl.view().to_dict()["a"] == [float(d) for d in data]
+
+    def test_table_int_to_long(self):
+        if six.PY2:
+            # don't overflow in this test
+            data = [int(100), int(200), int(300)]
+            tbl = Table({
+                "a": long  # noqa: F821
+            })
+            assert tbl.schema() == {
+                "a": int
+            }
+            tbl.update({
+                "a": data
+            })
+            assert tbl.view().to_dict()["a"] == data
+
+    def test_table_float_to_long(self):
+        if six.PY2:
+            # floats written to a `long` (int) column are truncated to integers
+            data = [1.5, 2.5, 3.5]  # noqa: F821
+            tbl = Table({
+                "a": long  # noqa: F821
+            })
+            assert tbl.schema() == {
+                "a": int
             }
+            tbl.update({
+                "a": data
+            })
+            assert tbl.view().to_dict()["a"] == [1, 2, 3]
 
     def test_table_nones(self):
         none_data = [{"a": 1, "b": None}, {"a": None, "b": 2}]
@@ -299,6 +362,30 @@ def test_table_symmetric_string_schema(self):
 
         assert tbl2.schema(True) == schema
 
+    def test_table_long_schema(self):
+        if six.PY2:
+            schema = {
+                "a": long,  # noqa: F821
+                "b": int
+            }
+            tbl = Table(schema)
+            assert tbl.schema() == {
+                "a": int,
+                "b": int
+            }
+
+    def test_table_unicode_schema(self):
+        if six.PY2:
+            schema = {
+                "a": unicode,  # noqa: F821
+                "b": int
+            }
+            tbl = Table(schema)
+            assert tbl.schema() == {
+                "a": str,
+                "b": int
+            }
+
     # is_valid_filter
 
     def test_table_is_valid_filter_str(self):

diff --git a/python/perspective/perspective/tests/table/test_table_infer.py b/python/perspective/perspective/tests/table/test_table_infer.py
@@ -5,7 +5,7 @@
 # This file is part of the Perspective library, distributed under the terms of
 # the Apache License 2.0.  The full license can be found in the LICENSE file.
 #
-
+import six
 from perspective.table import Table
 from datetime import date, datetime
 
@@ -23,10 +23,18 @@ def test_table_infer_float(self):
         assert tbl.schema() == {"a": float}
 
     def test_table_promote_float(self):
-        data = {"a": [1, 2, 3, 4, 2147483648]}
-        tbl = Table(data)
-        assert tbl.schema() == {"a": float}
-        assert tbl.view().to_dict() == {"a": [1.0, 2.0, 3.0, 4.0, 2147483648.0]}
+        if six.PY2:
+            data = {"a": [1.5, 2.5, 3.5, 4.5, "abc"]}
+            tbl = Table(data)
+            assert tbl.schema() == {"a": str}
+            assert tbl.view().to_dict() == {"a": ["1.5", "2.5", "3.5", "4.5", "abc"]}
+
+    def test_table_promote_float_py2(self):
+        if six.PY2:
+            data = {"a": [1, 2, 3, 4, 2147483648]}
+            tbl = Table(data)
+            assert tbl.schema() == {"a": float}
+            assert tbl.view().to_dict() == {"a": [1.0, 2.0, 3.0, 4.0, 2147483648.0]}
 
     def test_table_infer_bool(self):
         data = {"a": [None, None, None, None, True, True, True]}

diff --git a/python/perspective/perspective/tests/table/test_table_numpy.py b/python/perspective/perspective/tests/table/test_table_numpy.py
@@ -74,6 +74,18 @@ def test_table_int64(self):
             "b": [4, 5, 6]
         }
 
+    def test_table_long_numpy(self):
+        if six.PY2:
+            data = {"a": np.array([1, 2, 3], dtype=long)}  # noqa: F821
+            tbl = Table(data)
+            assert tbl.schema() == {
+                "a": int
+            }
+            assert tbl.size() == 3
+            assert tbl.view().to_dict() == {
+                "a": [1, 2, 3]
+            }
+
     def test_table_float(self):
         data = {"a": np.array([1.1, 2.2]), "b": np.array([3.3, 4.4])}
         tbl = Table(data)
@@ -572,6 +584,39 @@ def test_table_numpy_from_schema_int(self):
         table.update(df)
         assert table.view().to_dict()["a"] == [1, None, 2, None, 3, 4]
 
+    def test_table_numpy_from_schema_long(self):
+        if six.PY2:
+            df = {
+                "a": np.array([1, None, 2, None, 3, 4])
+            }
+            table = Table({
+                "a": long  # noqa: F821
+            })
+            table.update(df)
+            assert table.view().to_dict()["a"] == [1, None, 2, None, 3, 4]
+
+    def test_table_numpy_from_schema_int_to_long(self):
+        if six.PY2:
+            df = {
+                "a": np.array([1, 2, 3, 4], dtype="int64")
+            }
+            table = Table({
+                "a": long  # noqa: F821
+            })
+            table.update(df)
+            assert table.view().to_dict()["a"] == [1, 2, 3, 4]
+
+    def test_table_numpy_from_schema_float_to_long(self):
+        if six.PY2:
+            df = {
+                "a": np.array([1, None, 2, None, 3, 4], dtype="float64")
+            }
+            table = Table({
+                "a": long  # noqa: F821
+            })
+            table.update(df)
+            assert table.view().to_dict()["a"] == [1, None, 2, None, 3, 4]
+
     def test_table_numpy_from_schema_bool(self):
         data = [True, False, True, False]
         df = {

diff --git a/python/perspective/perspective/tests/table/test_table_pandas.py b/python/perspective/perspective/tests/table/test_table_pandas.py
@@ -297,6 +297,18 @@ def test_table_pandas_from_schema_int(self):
         table.update(df)
         assert table.view().to_dict()["a"] == data
 
+    def test_table_pandas_from_schema_long(self):
+        if six.PY2:
+            data = [None, 1, None, 2, None, 3, 4]
+            df = pd.DataFrame({
+                "a": data
+            })
+            table = Table({
+                "a": long  # noqa: F821
+            })
+            table.update(df)
+            assert table.view().to_dict()["a"] == data
+
     def test_table_pandas_from_schema_bool(self):
         data = [True, False, True, False]
         df = pd.DataFrame({