ENH/MAINT Updates to check_array for SimpleImputer + other minor chan…
sergeyf authored and jnothman committed Feb 27, 2018
1 parent 55da79f commit 080e22d
Showing 5 changed files with 44 additions and 63 deletions.
3 changes: 3 additions & 0 deletions doc/whats_new/v0.20.rst
@@ -50,6 +50,9 @@ Classifiers and regressors
via ``n_iter_no_change``, ``validation_fraction`` and ``tol``. :issue:`7071`
by `Raghav RV`_

- :class:`dummy.DummyRegressor` now has a ``return_std`` option in its
``predict`` method. The returned standard deviations will be zeros.

- Added :class:`naive_bayes.ComplementNB`, which implements the Complement
Naive Bayes classifier described in Rennie et al. (2003).
:issue:`8190` by :user:`Michael A. Alcorn <airalcorn2>`.
15 changes: 12 additions & 3 deletions sklearn/dummy.py
@@ -445,7 +445,7 @@ def fit(self, X, y, sample_weight=None):
self.constant_ = np.reshape(self.constant_, (1, -1))
return self

def predict(self, X):
def predict(self, X, return_std=False):
"""
Perform prediction on test vectors X.
@@ -454,17 +454,26 @@ def predict(self, X):
X : {array-like, object with finite length or shape}
Training data, requires length = n_samples
return_std : boolean, optional
Whether to return the standard deviation of posterior prediction.
All zeros in this case.

Returns
-------
y : array, shape = [n_samples] or [n_samples, n_outputs]
Predicted target values for X.
y_std : array, shape = [n_samples] or [n_samples, n_outputs]
Standard deviation of predictive distribution of query points.
"""
check_is_fitted(self, "constant_")
n_samples = _num_samples(X)

y = np.ones((n_samples, 1)) * self.constant_
y = np.ones((n_samples, self.n_outputs_)) * self.constant_
y_std = np.zeros((n_samples, self.n_outputs_))

if self.n_outputs_ == 1 and not self.output_2d_:
y = np.ravel(y)
y_std = np.ravel(y_std)

return y
return (y, y_std) if return_std else y
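
A minimal usage sketch of the new ``return_std`` option (assuming a scikit-learn build that includes this change); as documented above, the returned standard deviations are always zeros:

import numpy as np
from sklearn.dummy import DummyRegressor

X = np.array([[1.0], [2.0], [3.0]])
y = np.array([10.0, 20.0, 30.0])

reg = DummyRegressor(strategy='mean').fit(X, y)
y_pred, y_std = reg.predict(X, return_std=True)
print(y_pred)  # [20. 20. 20.] -- constant mean prediction
print(y_std)   # [0. 0. 0.]    -- zero standard deviation for every sample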
22 changes: 16 additions & 6 deletions sklearn/impute.py
@@ -151,8 +151,10 @@ def fit(self, X, y=None):
# transform(X), the imputation data will be computed in transform()
# when the imputation is done per sample (i.e., when axis=1).
if self.axis == 0:
X = check_array(X, accept_sparse='csc', dtype=np.float64,
force_all_finite=False)
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)

if sparse.issparse(X):
self.statistics_ = self._sparse_fit(X,
@@ -249,7 +251,9 @@ def _sparse_fit(self, X, strategy, missing_values, axis):

def _dense_fit(self, X, strategy, missing_values, axis):
"""Fit the transformer on dense data."""
X = check_array(X, force_all_finite=False)
X = check_array(X, force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True)
mask = _get_mask(X, missing_values)
masked_X = ma.masked_array(X, mask=mask)

@@ -303,7 +307,10 @@ def transform(self, X):
if self.axis == 0:
check_is_fitted(self, 'statistics_')
X = check_array(X, accept_sparse='csc', dtype=FLOAT_DTYPES,
force_all_finite=False, copy=self.copy)
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)
statistics = self.statistics_
if X.shape[1] != statistics.shape[0]:
raise ValueError("X has %d features per sample, expected %d"
@@ -314,7 +321,10 @@ def transform(self, X):
# when the imputation is done per sample
else:
X = check_array(X, accept_sparse='csr', dtype=FLOAT_DTYPES,
force_all_finite=False, copy=self.copy)
force_all_finite='allow-nan'
if self.missing_values == 'NaN'
or np.isnan(self.missing_values) else True,
copy=self.copy)

if sparse.issparse(X):
statistics = self._sparse_fit(X,
@@ -332,7 +342,7 @@ def transform(self, X):
invalid_mask = np.isnan(statistics)
valid_mask = np.logical_not(invalid_mask)
valid_statistics = statistics[valid_mask]
valid_statistics_indexes = np.where(valid_mask)[0]
valid_statistics_indexes = np.flatnonzero(valid_mask)
missing = np.arange(X.shape[not self.axis])[invalid_mask]

if self.axis == 0 and invalid_mask.any():
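
The same ``check_array`` pattern is applied in ``fit``, ``_dense_fit`` and both ``transform`` branches: NaN values are only allowed through when the configured missing value is itself NaN; otherwise the usual finiteness check still applies. A minimal sketch of that behaviour (assuming a scikit-learn version whose ``check_array`` accepts ``force_all_finite='allow-nan'``):

import numpy as np
from sklearn.utils import check_array

X = np.array([[1.0, np.nan], [3.0, 4.0]])

# Allowed: the imputer treats NaN as the missing-value marker.
X_checked = check_array(X, force_all_finite='allow-nan')

# Rejected: with a finite missing-value placeholder the strict check is kept,
# mirroring the conditional added above.
try:
    check_array(X, force_all_finite=True)
except ValueError as exc:
    print(exc)  # e.g. "Input contains NaN ..." (exact message varies by version)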
13 changes: 13 additions & 0 deletions sklearn/tests/test_dummy.py
@@ -620,3 +620,16 @@ def test_dummy_classifier_on_3D_array():
y_pred_proba = cls.predict_proba(X)
assert_array_equal(y_pred, y_expected)
assert_array_equal(y_pred_proba, y_proba_expected)


def test_dummy_regressor_return_std():
X = [[0]] * 3 # ignored
y = np.array([2, 2, 2])
y_std_expected = np.array([0, 0, 0])
cls = DummyRegressor()
cls.fit(X, y)
y_pred_list = cls.predict(X, return_std=True)
# there should be two elements when return_std is True
assert_equal(len(y_pred_list), 2)
# the second element should be all zeros
assert_array_equal(y_pred_list[1], y_std_expected)
54 changes: 0 additions & 54 deletions sklearn/tests/test_impute.py
@@ -92,40 +92,6 @@ def test_imputation_shape():
assert_equal(X_imputed.shape, (10, 2))


def test_imputation_mean_median_only_zero():
# Test imputation using the mean and median strategies, when
# missing_values == 0.
X = np.array([
[np.nan, 0, 0, 0, 5],
[np.nan, 1, 0, np.nan, 3],
[np.nan, 2, 0, 0, 0],
[np.nan, 6, 0, 5, 13],
])

X_imputed_mean = np.array([
[3, 5],
[1, 3],
[2, 7],
[6, 13],
])
statistics_mean = [np.nan, 3, np.nan, np.nan, 7]

# Behaviour of median with NaN is undefined, e.g. different results in
# np.median and np.ma.median
X_for_median = X[:, [0, 1, 2, 4]]
X_imputed_median = np.array([
[2, 5],
[1, 3],
[2, 5],
[6, 13],
])
statistics_median = [np.nan, 2, np.nan, 5]

_check_statistics(X, X_imputed_mean, "mean", statistics_mean, 0)
_check_statistics(X_for_median, X_imputed_median, "median",
statistics_median, 0)


def safe_median(arr, *args, **kwargs):
# np.median([]) raises a TypeError for numpy >= 1.10.1
length = arr.size if hasattr(arr, 'size') else len(arr)
@@ -276,26 +242,6 @@ def test_imputation_pipeline_grid_search():
gs.fit(X, Y)


def test_imputation_pickle():
# Test for pickling imputers.
import pickle

X = sparse_random_matrix(100, 100, density=0.10)

for strategy in ["mean", "median", "most_frequent"]:
imputer = SimpleImputer(missing_values=0, strategy=strategy)
imputer.fit(X)

imputer_pickled = pickle.loads(pickle.dumps(imputer))

assert_array_almost_equal(
imputer.transform(X.copy()),
imputer_pickled.transform(X.copy()),
err_msg="Fail to transform the data after pickling "
"(strategy = %s)" % (strategy)
)


def test_imputation_copy():
# Test imputation with copy
X_orig = sparse_random_matrix(5, 5, density=0.75, random_state=0)
