From 080e22dfde0ee6eb086be6feb2f086e33e56dc46 Mon Sep 17 00:00:00 2001 From: Sergey Feldman Date: Tue, 27 Feb 2018 14:15:07 -0800 Subject: [PATCH] ENH/MAINT Updates to check_array for SimpleImputer + other minor changes (#10640) --- doc/whats_new/v0.20.rst | 3 ++ sklearn/dummy.py | 15 ++++++++-- sklearn/impute.py | 22 +++++++++++---- sklearn/tests/test_dummy.py | 13 +++++++++ sklearn/tests/test_impute.py | 54 ------------------------------------ 5 files changed, 44 insertions(+), 63 deletions(-) diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst index 193204002664a..998a78ecf8f36 100644 --- a/doc/whats_new/v0.20.rst +++ b/doc/whats_new/v0.20.rst @@ -50,6 +50,9 @@ Classifiers and regressors via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071` by `Raghav RV`_ +- :class:`dummy.DummyRegressor` now has a ``return_std`` option in its + ``predict`` method. The returned standard deviations will be zeros. + - Added :class:`naive_bayes.ComplementNB`, which implements the Complement Naive Bayes classifier described in Rennie et al. (2003). :issue:`8190` by :user:`Michael A. Alcorn `. diff --git a/sklearn/dummy.py b/sklearn/dummy.py index 8b0ce6713774e..f9a4762806f1a 100644 --- a/sklearn/dummy.py +++ b/sklearn/dummy.py @@ -445,7 +445,7 @@ def fit(self, X, y, sample_weight=None): self.constant_ = np.reshape(self.constant_, (1, -1)) return self - def predict(self, X): + def predict(self, X, return_std=False): """ Perform classification on test vectors X. @@ -454,17 +454,26 @@ def predict(self, X): X : {array-like, object with finite length or shape} Training data, requires length = n_samples + return_std : boolean, optional + Whether to return the standard deviation of posterior prediction. + All zeros in this case. + Returns ------- y : array, shape = [n_samples] or [n_samples, n_outputs] Predicted target values for X. + + y_std : array, shape = [n_samples] or [n_samples, n_outputs] + Standard deviation of predictive distribution of query points. """ check_is_fitted(self, "constant_") n_samples = _num_samples(X) - y = np.ones((n_samples, 1)) * self.constant_ + y = np.ones((n_samples, self.n_outputs_)) * self.constant_ + y_std = np.zeros((n_samples, self.n_outputs_)) if self.n_outputs_ == 1 and not self.output_2d_: y = np.ravel(y) + y_std = np.ravel(y_std) - return y + return (y, y_std) if return_std else y diff --git a/sklearn/impute.py b/sklearn/impute.py index 8f6fe21d5ebcb..69fba61f3d8ff 100644 --- a/sklearn/impute.py +++ b/sklearn/impute.py @@ -151,8 +151,10 @@ def fit(self, X, y=None): # transform(X), the imputation data will be computed in transform() # when the imputation is done per sample (i.e., when axis=1). if self.axis == 0: - X = check_array(X, accept_sparse='csc', dtype=np.float64, - force_all_finite=False) + X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, + force_all_finite='allow-nan' + if self.missing_values == 'NaN' + or np.isnan(self.missing_values) else True) if sparse.issparse(X): self.statistics_ = self._sparse_fit(X, @@ -249,7 +251,9 @@ def _sparse_fit(self, X, strategy, missing_values, axis): def _dense_fit(self, X, strategy, missing_values, axis): """Fit the transformer on dense data.""" - X = check_array(X, force_all_finite=False) + X = check_array(X, force_all_finite='allow-nan' + if self.missing_values == 'NaN' + or np.isnan(self.missing_values) else True) mask = _get_mask(X, missing_values) masked_X = ma.masked_array(X, mask=mask) @@ -303,7 +307,10 @@ def transform(self, X): if self.axis == 0: check_is_fitted(self, 'statistics_') X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES, - force_all_finite=False, copy=self.copy) + force_all_finite='allow-nan' + if self.missing_values == 'NaN' + or np.isnan(self.missing_values) else True, + copy=self.copy) statistics = self.statistics_ if X.shape[1] != statistics.shape[0]: raise ValueError("X has %d features per sample, expected %d" @@ -314,7 +321,10 @@ def transform(self, X): # when the imputation is done per sample else: X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES, - force_all_finite=False, copy=self.copy) + force_all_finite='allow-nan' + if self.missing_values == 'NaN' + or np.isnan(self.missing_values) else True, + copy=self.copy) if sparse.issparse(X): statistics = self._sparse_fit(X, @@ -332,7 +342,7 @@ def transform(self, X): invalid_mask = np.isnan(statistics) valid_mask = np.logical_not(invalid_mask) valid_statistics = statistics[valid_mask] - valid_statistics_indexes = np.where(valid_mask)[0] + valid_statistics_indexes = np.flatnonzero(valid_mask) missing = np.arange(X.shape[not self.axis])[invalid_mask] if self.axis == 0 and invalid_mask.any(): diff --git a/sklearn/tests/test_dummy.py b/sklearn/tests/test_dummy.py index 4ad877146c306..5d955f51017a1 100644 --- a/sklearn/tests/test_dummy.py +++ b/sklearn/tests/test_dummy.py @@ -620,3 +620,16 @@ def test_dummy_classifier_on_3D_array(): y_pred_proba = cls.predict_proba(X) assert_array_equal(y_pred, y_expected) assert_array_equal(y_pred_proba, y_proba_expected) + + +def test_dummy_regressor_return_std(): + X = [[0]] * 3 # ignored + y = np.array([2, 2, 2]) + y_std_expected = np.array([0, 0, 0]) + cls = DummyRegressor() + cls.fit(X, y) + y_pred_list = cls.predict(X, return_std=True) + # there should be two elements when return_std is True + assert_equal(len(y_pred_list), 2) + # the second element should be all zeros + assert_array_equal(y_pred_list[1], y_std_expected) diff --git a/sklearn/tests/test_impute.py b/sklearn/tests/test_impute.py index 9c73770062cc6..802ab82e406eb 100644 --- a/sklearn/tests/test_impute.py +++ b/sklearn/tests/test_impute.py @@ -92,40 +92,6 @@ def test_imputation_shape(): assert_equal(X_imputed.shape, (10, 2)) -def test_imputation_mean_median_only_zero(): - # Test imputation using the mean and median strategies, when - # missing_values == 0. - X = np.array([ - [np.nan, 0, 0, 0, 5], - [np.nan, 1, 0, np.nan, 3], - [np.nan, 2, 0, 0, 0], - [np.nan, 6, 0, 5, 13], - ]) - - X_imputed_mean = np.array([ - [3, 5], - [1, 3], - [2, 7], - [6, 13], - ]) - statistics_mean = [np.nan, 3, np.nan, np.nan, 7] - - # Behaviour of median with NaN is undefined, e.g. different results in - # np.median and np.ma.median - X_for_median = X[:, [0, 1, 2, 4]] - X_imputed_median = np.array([ - [2, 5], - [1, 3], - [2, 5], - [6, 13], - ]) - statistics_median = [np.nan, 2, np.nan, 5] - - _check_statistics(X, X_imputed_mean, "mean", statistics_mean, 0) - _check_statistics(X_for_median, X_imputed_median, "median", - statistics_median, 0) - - def safe_median(arr, *args, **kwargs): # np.median([]) raises a TypeError for numpy >= 1.10.1 length = arr.size if hasattr(arr, 'size') else len(arr) @@ -276,26 +242,6 @@ def test_imputation_pipeline_grid_search(): gs.fit(X, Y) -def test_imputation_pickle(): - # Test for pickling imputers. - import pickle - - X = sparse_random_matrix(100, 100, density=0.10) - - for strategy in ["mean", "median", "most_frequent"]: - imputer = SimpleImputer(missing_values=0, strategy=strategy) - imputer.fit(X) - - imputer_pickled = pickle.loads(pickle.dumps(imputer)) - - assert_array_almost_equal( - imputer.transform(X.copy()), - imputer_pickled.transform(X.copy()), - err_msg="Fail to transform the data after pickling " - "(strategy = %s)" % (strategy) - ) - - def test_imputation_copy(): # Test imputation with copy X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)