ENH/MAINT Updates to check_array for SimpleImputer + other minor chan…
sergeyf authored and jnothman committed Feb 27, 2018
1 parent 55da79f commit 080e22d
Showing 5 changed files with 44 additions and 63 deletions.
3 changes: 3 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -50,6 +50,9 @@ Classifiers and regressors
via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071`
by `Raghav RV`_

- :class:`dummy.DummyRegressor` now has a ``return_std`` option in its
``predict`` method. The returned standard deviations will be zeros.

- Added :class:`naive_bayes.ComplementNB`, which implements the Complement
Naive Bayes classifier described in Rennie et al. (2003).
:issue:`8190` by :user:`Michael A. Alcorn <airalcorn2>`.
15 changes: 12 additions & 3 deletions sklearn/dummy.py
@@ -445,7 +445,7 @@ def fit(self, X, y, sample_weight=None):
self.constant_ = np.reshape(self.constant_, (1, -1))
return self

def predict(self, X):
def predict(self, X, return_std=False):
"""
Perform prediction on test vectors X.
@@ -454,17 +454,26 @@ def predict(self, X):
X : {array-like, object with finite length or shape}
Training data, requires length = n_samples
return_std : boolean, optional
Whether to return the standard deviation of posterior prediction.
All zeros in this case.

Returns
-------
y : array, shape = [n_samples] or [n_samples, n_outputs]
Predicted target values for X.
y_std : array, shape = [n_samples] or [n_samples, n_outputs]
Standard deviation of predictive distribution of query points.
"""
check_is_fitted(self, "constant_")
n_samples = _num_samples(X)

y = np.ones((n_samples, 1)) * self.constant_
y = np.ones((n_samples, self.n_outputs_)) * self.constant_
y_std = np.zeros((n_samples, self.n_outputs_))

if self.n_outputs_ == 1 and not self.output_2d_:
y = np.ravel(y)
y_std = np.ravel(y_std)

return y
return (y, y_std) if return_std else y
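
A minimal usage sketch of the new ``return_std`` option (assuming a scikit-learn build that includes this change); as documented above, the returned standard deviations are always zeros:

import numpy as np
from sklearn.dummy import DummyRegressor

X = np.array([[1.0], [2.0], [3.0]])
y = np.array([10.0, 20.0, 30.0])

reg = DummyRegressor(strategy='mean').fit(X, y)
y_pred, y_std = reg.predict(X, return_std=True)
print(y_pred)  # [20. 20. 20.] -- constant mean prediction
print(y_std)   # [0. 0. 0.]    -- zero standard deviation for every sample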
22 changes: 16 additions & 6 deletions sklearn/impute.py
@@ -151,8 +151,10 @@ def fit(self, X, y=None):
# transform(X), the imputation data will be computed in transform()
# when the imputation is done per sample (i.e., when axis=1).
if self.axis == 0:
X = check_array(X, accept_sparse='csc', dtype=np.float64,
force_all_finite=False)
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)

if sparse.issparse(X):
self.statistics_ = self._sparse_fit(X,
@@ -249,7 +251,9 @@ def _sparse_fit(self, X, strategy, missing_values, axis):

def _dense_fit(self, X, strategy, missing_values, axis):
"""Fit the transformer on dense data."""
X = check_array(X, force_all_finite=False)
X = check_array(X, force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)
mask = _get_mask(X, missing_values)
masked_X = ma.masked_array(X, mask=mask)

@@ -303,7 +307,10 @@ def transform(self, X):
if self.axis == 0:
check_is_fitted(self, 'statistics_')
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite=False, copy=self.copy)
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)
statistics = self.statistics_
if X.shape[1] != statistics.shape[0]:
raise ValueError("X has %d features per sample, expected %d"
@@ -314,7 +321,10 @@ def transform(self, X):
# when the imputation is done per sample
else:
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
force_all_finite=False, copy=self.copy)
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)

if sparse.issparse(X):
statistics = self._sparse_fit(X,
@@ -332,7 +342,7 @@ def transform(self, X):
invalid_mask = np.isnan(statistics)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.where(valid_mask)[0]
valid_statistics_indexes = np.flatnonzero(valid_mask)
missing = np.arange(X.shape[not self.axis])[invalid_mask]

if self.axis == 0 and invalid_mask.any():
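
The same ``check_array`` pattern is applied in ``fit``, ``_dense_fit`` and both ``transform`` branches: NaN values are only allowed through when the configured missing value is itself NaN; otherwise the usual finiteness check still applies. A minimal sketch of that behaviour (assuming a scikit-learn version whose ``check_array`` accepts ``force_all_finite='allow-nan'``):

import numpy as np
from sklearn.utils import check_array

X = np.array([[1.0, np.nan], [3.0, 4.0]])

# Allowed: the imputer treats NaN as the missing-value marker.
X_checked = check_array(X, force_all_finite='allow-nan')

# Rejected: with a finite missing-value placeholder the strict check is kept,
# mirroring the conditional added above.
try:
    check_array(X, force_all_finite=True)
except ValueError as exc:
    print(exc)  # e.g. "Input contains NaN ..." (exact message varies by version)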
13 changes: 13 additions & 0 deletions sklearn/tests/test_dummy.py
@@ -620,3 +620,16 @@ def test_dummy_classifier_on_3D_array():
y_pred_proba = cls.predict_proba(X)
assert_array_equal(y_pred, y_expected)
assert_array_equal(y_pred_proba, y_proba_expected)


def test_dummy_regressor_return_std():
X = [[0]] * 3 # ignored
y = np.array([2, 2, 2])
y_std_expected = np.array([0, 0, 0])
cls = DummyRegressor()
cls.fit(X, y)
y_pred_list = cls.predict(X, return_std=True)
# there should be two elements when return_std is True
assert_equal(len(y_pred_list), 2)
# the second element should be all zeros
assert_array_equal(y_pred_list[1], y_std_expected)
54 changes: 0 additions & 54 deletions sklearn/tests/test_impute.py
@@ -92,40 +92,6 @@ def test_imputation_shape():
assert_equal(X_imputed.shape, (10, 2))


def test_imputation_mean_median_only_zero():
# Test imputation using the mean and median strategies, when
# missing_values == 0.
X = np.array([
[np.nan, 0, 0, 0, 5],
[np.nan, 1, 0, np.nan, 3],
[np.nan, 2, 0, 0, 0],
[np.nan, 6, 0, 5, 13],
])

X_imputed_mean = np.array([
[3, 5],
[1, 3],
[2, 7],
[6, 13],
])
statistics_mean = [np.nan, 3, np.nan, np.nan, 7]

# Behaviour of median with NaN is undefined, e.g. different results in
# np.median and np.ma.median
X_for_median = X[:, [0, 1, 2, 4]]
X_imputed_median = np.array([
[2, 5],
[1, 3],
[2, 5],
[6, 13],
])
statistics_median = [np.nan, 2, np.nan, 5]

_check_statistics(X, X_imputed_mean, "mean", statistics_mean, 0)
_check_statistics(X_for_median, X_imputed_median, "median",
statistics_median, 0)


def safe_median(arr, *args, **kwargs):
# np.median([]) raises a TypeError for numpy >= 1.10.1
length = arr.size if hasattr(arr, 'size') else len(arr)
@@ -276,26 +242,6 @@ def test_imputation_pipeline_grid_search():
gs.fit(X, Y)


def test_imputation_pickle():
# Test for pickling imputers.
import pickle

X = sparse_random_matrix(100, 100, density=0.10)

for strategy in ["mean", "median", "most_frequent"]:
imputer = SimpleImputer(missing_values=0, strategy=strategy)
imputer.fit(X)

imputer_pickled = pickle.loads(pickle.dumps(imputer))

assert_array_almost_equal(
imputer.transform(X.copy()),
imputer_pickled.transform(X.copy()),
err_msg="Fail to transform the data after pickling "
"(strategy = %s)" % (strategy)
)


def test_imputation_copy():
# Test imputation with copy
X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)
