Skip to content

Commit

Permalink
[MRG+1] Fix scikit-learn#10229: check_array should fail if array has …
Browse files Browse the repository at this point in the history
…strings (scikit-learn#10495)

* Add deprecation warning to check_array for flexible array w/ dtype=numeric
  • Loading branch information
rtlee9 authored and jnothman committed Feb 22, 2018
1 parent d9a52cf commit 7108d17
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 0 deletions.
6 changes: 6 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -335,6 +335,12 @@ Feature Extraction
(words or n-grams). :issue:`9147` by :user:`Claes-Fredrik Mannby <mannby>`
and `Roman Yurchak`_.

Utils

- :func:`utils.validation.check_array` now yields a ``FutureWarning`` indicating
that arrays of bytes/strings will be interpreted as decimal numbers
beginning in version 0.22. :issue:`10229` by :user:`Ryan Lee <rtlee9>`.

Preprocessing

- Fixed bugs in :class:`preprocessing.LabelEncoder` which would sometimes throw
Expand Down
36 changes: 36 additions & 0 deletions sklearn/utils/tests/test_validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,42 @@ def test_check_array():
result = check_array(X_no_array)
assert_true(isinstance(result, np.ndarray))

# deprecation warning if string-like array with dtype="numeric"
X_str = [['a', 'b'], ['c', 'd']]
assert_warns_message(
FutureWarning,
"arrays of strings will be interpreted as decimal numbers if "
"parameter 'dtype' is 'numeric'. It is recommended that you convert "
"the array to type np.float64 before passing it to check_array.",
check_array, X_str, "numeric")
assert_warns_message(
FutureWarning,
"arrays of strings will be interpreted as decimal numbers if "
"parameter 'dtype' is 'numeric'. It is recommended that you convert "
"the array to type np.float64 before passing it to check_array.",
check_array, np.array(X_str, dtype='U'), "numeric")
assert_warns_message(
FutureWarning,
"arrays of strings will be interpreted as decimal numbers if "
"parameter 'dtype' is 'numeric'. It is recommended that you convert "
"the array to type np.float64 before passing it to check_array.",
check_array, np.array(X_str, dtype='S'), "numeric")

# deprecation warning if byte-like array with dtype="numeric"
X_bytes = [[b'a', b'b'], [b'c', b'd']]
assert_warns_message(
FutureWarning,
"arrays of strings will be interpreted as decimal numbers if "
"parameter 'dtype' is 'numeric'. It is recommended that you convert "
"the array to type np.float64 before passing it to check_array.",
check_array, X_bytes, "numeric")
assert_warns_message(
FutureWarning,
"arrays of strings will be interpreted as decimal numbers if "
"parameter 'dtype' is 'numeric'. It is recommended that you convert "
"the array to type np.float64 before passing it to check_array.",
check_array, np.array(X_bytes, dtype='V1'), "numeric")


def test_check_array_pandas_dtype_object_conversion():
# test that data-frame like objects with dtype object
Expand Down
9 changes: 9 additions & 0 deletions sklearn/utils/validation.py
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,15 @@ def check_array(array, accept_sparse=False, dtype="numeric", order=None,
# To ensure that array flags are maintained
array = np.array(array, dtype=dtype, order=order, copy=copy)

# in the future np.flexible dtypes will be handled like object dtypes
if dtype_numeric and np.issubdtype(array.dtype, np.flexible):
warnings.warn(
"Beginning in version 0.22, arrays of strings will be "
"interpreted as decimal numbers if parameter 'dtype' is "
"'numeric'. It is recommended that you convert the array to "
"type np.float64 before passing it to check_array.",
FutureWarning)

# make sure we actually converted to numeric:
if dtype_numeric and array.dtype.kind == "O":
array = array.astype(np.float64)
Expand Down

0 comments on commit 7108d17

Please sign in to comment.