diff --git a/README.md b/README.md
index 406fb5a..91dcdc1 100644
--- a/README.md
+++ b/README.md
@@ -228,7 +228,3 @@ Here is the corresponding Bibtex entry
 ### Convergence test
 
 * ["The graphical lasso: New Insights and alternatives"](https://web.stanford.edu/~hastie/Papers/glassoinsights.pdf) Mazumder and Hastie, 2012.
-
-### Repeated KFold cross-validation
-
-* ["Cross-validation pitfalls when selecting and assessing regression and classification models"](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3994246/) D. Krstajic, L. Buturovic, D. Leahy, and S. Thomas, 2014.
\ No newline at end of file
diff --git a/inverse_covariance/__init__.py b/inverse_covariance/__init__.py
index 7e9b48b..67e15ef 100644
--- a/inverse_covariance/__init__.py
+++ b/inverse_covariance/__init__.py
@@ -13,7 +13,6 @@
 from .rank_correlation import spearman_correlation, kendalltau_correlation
 from .model_average import ModelAverage
 from .adaptive_graph_lasso import AdaptiveGraphLasso, AdaptiveGraphicalLasso
-from .cross_validation import RepeatedKFold
 
 __all__ = [
     "InverseCovarianceEstimator",
@@ -33,5 +32,4 @@
     "ModelAverage",
     "AdaptiveGraphLasso",
     "AdaptiveGraphicalLasso",
-    "RepeatedKFold",
 ]
diff --git a/inverse_covariance/cross_validation.py b/inverse_covariance/cross_validation.py
deleted file mode 100644
index 5c7c446..0000000
--- a/inverse_covariance/cross_validation.py
+++ /dev/null
@@ -1,119 +0,0 @@
-import numpy as np
-from abc import ABCMeta, abstractmethod
-from sklearn.externals.six import with_metaclass
-from sklearn.cross_validation import _PartitionIterator
-from sklearn.utils import check_random_state
-
-
-# TODO: This may be easily deprecated in favor of
-# https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/model_selection/_split.py#L1091
-
-
-class _BaseRepeatedKFold(with_metaclass(ABCMeta, _PartitionIterator)):
-    """Base class to validate KFoldRepeated approaches"""
-
-    @abstractmethod
-    def __init__(self, n, n_folds, n_trials, random_state):
-        super(_BaseRepeatedKFold, self).__init__(n)
-
-        if abs(n_folds - int(n_folds)) >= np.finfo("f").eps:
-            raise ValueError("n_folds must be an integer")
-        self.n_folds = n_folds = int(n_folds)
-
-        if n_folds <= 1:
-            raise ValueError(
-                "repeated k-fold cross validation requires at least one"
-                " train / test split by setting n_folds=2 or more,"
-                " got n_folds={0}.".format(n_folds)
-            )
-        if n_folds > self.n:
-            raise ValueError(
-                (
-                    "Cannot have number of folds n_folds={0} greater"
-                    " than the number of samples: {1}."
-                ).format(n_folds, n)
-            )
-
-        if not isinstance(n_trials, int) or n_trials <= 0:
-            raise ValueError(
-                "n_trials must be int and greater than 0;" " got {0}".format(n_trials)
-            )
-
-        self.n_trials = n_trials
-        self.random_state = random_state
-
-
-class RepeatedKFold(_BaseRepeatedKFold):
-    """Repeated K-Folds cross validation iterator.
-
-    Provides train/test indices to split data in train test sets. We reshuffle
-    the data n_trials times and split dataset into k consecutive folds for each
-    trial.
-
-    Each fold is then used as a validation set once while the k - 1 remaining
-    fold(s) form the training set.
-
-    The iterator will generate n_folds * n_trials train/test splits.
-
-    Technique outlined in:
-    "Cross-validation pitfalls when selecting and assessing
-    regression and classification models"
-    D. Krstajic, L. Buturovic, D. Leahy, and S. Thomas
-    https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3994246/
-
-    Parameters
-    ----------
-    n : int
-        Total number of elements.
-
-    n_folds : int, default=3
-        Number of folds. Must be at least 2.
-
-    n_trials : int, default=3
-        Number of random index shuffles.
-        n_trials=1 is equivalent to KFold with shuffle=True.
-
-    random_state : None, int or RandomState
-        If None, use default numpy RNG for shuffling.
-
-    See also
-    --------
-    sklearn.cross_validation.KFold
-    sklearn.cross_validation.ShuffleSplit
-    """
-
-    def __init__(self, n, n_folds=3, n_trials=3, random_state=None):
-        super(RepeatedKFold, self).__init__(n, n_folds, n_trials, random_state)
-        rng = check_random_state(self.random_state)
-
-        self.idxs = []
-        for tt in range(self.n_trials):
-            idxs = np.arange(n)
-            rng.shuffle(idxs)
-            self.idxs.append(idxs)
-
-    def _iter_test_indices(self):
-        n = self.n
-        n_folds = self.n_folds
-        fold_sizes = (n // n_folds) * np.ones(n_folds, dtype=np.int)
-        fold_sizes[: n % n_folds] += 1
-
-        for idxs in self.idxs:
-            current = 0
-            for fold_size in fold_sizes:
-                start, stop = current, current + fold_size
-                yield idxs[start:stop]
-                current = stop
-
-    def __repr__(self):
-        return "%s.%s(n=%i, n_folds=%i, n_trials=%s, random_state=%s)" % (
-            self.__class__.__module__,
-            self.__class__.__name__,
-            self.n,
-            self.n_folds,
-            self.n_trials,
-            self.random_state,
-        )
-
-    def __len__(self):
-        return self.n_folds * self.n_trials
diff --git a/inverse_covariance/quic_graph_lasso.py b/inverse_covariance/quic_graph_lasso.py
index 5f95ca9..3b48282 100644
--- a/inverse_covariance/quic_graph_lasso.py
+++ b/inverse_covariance/quic_graph_lasso.py
@@ -10,9 +10,7 @@
 from sklearn.utils import check_array, as_float_array, deprecated
 from sklearn.utils.testing import assert_array_almost_equal
 from sklearn.externals.joblib import Parallel, delayed
-from sklearn.model_selection import cross_val_score  # NOQA >= 0.18
-
-# from sklearn.cross_validation import cross_val_score  # NOQA < 0.18
+from sklearn.model_selection import cross_val_score, RepeatedKFold
 
 from . import pyquic
 from .inverse_covariance import (
@@ -21,7 +19,6 @@
     _compute_error,
     _validate_path,
 )
-from .cross_validation import RepeatedKFold
 
 
 def quic(
@@ -625,7 +622,7 @@ def fit(self, X, y=None):
         elif isinstance(self.cv, tuple):
             cv = self.cv
 
-        cv = RepeatedKFold(X.shape[0], n_folds=cv[0], n_trials=cv[1])
+        cv = RepeatedKFold(n_splits=cv[0], n_repeats=cv[1])
 
         self.init_coefs(X)
 
@@ -662,11 +659,11 @@ def fit(self, X, y=None):
                     score_metric=self.score_metric,
                     init_method=self.init_method,
                 )
-                for train, test in cv
+                for train, test in cv.split(X)
             )
         else:
             # parallel via spark
-            train_test_grid = [(train, test) for (train, test) in cv]
+            train_test_grid = [(train, test) for (train, test) in cv.split(X)]
             indexed_param_grid = list(
                 zip(range(len(train_test_grid)), train_test_grid)
             )
diff --git a/inverse_covariance/tests/cross_validation_test.py b/inverse_covariance/tests/cross_validation_test.py
deleted file mode 100644
index fb93296..0000000
--- a/inverse_covariance/tests/cross_validation_test.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from sklearn.utils.testing import assert_raises
-from sklearn.tests.test_cross_validation import check_cv_coverage
-from inverse_covariance import RepeatedKFold
-
-
-def test_repeated_kfold_coverage():
-    n_samples = 300
-    n_folds = 3
-    n_trials = 3
-    kf = RepeatedKFold(n_samples, n_folds, n_trials)
-    check_cv_coverage(kf, expected_n_iter=n_folds * n_trials, n_samples=n_samples)
-
-    n_samples = 17
-    n_folds = 3
-    n_trials = 5
-    kf = RepeatedKFold(n_samples, n_folds, n_trials)
-    check_cv_coverage(kf, expected_n_iter=n_folds * n_trials, n_samples=n_samples)
-
-
-def test_repeated_kfold_values():
-    # Check that errors are raised if there is not enough samples
-    assert_raises(ValueError, RepeatedKFold, 3, 4)
-
-    # Error when number of folds is <= 1
-    assert_raises(ValueError, RepeatedKFold, 2, 0)
-    assert_raises(ValueError, RepeatedKFold, 2, 1)
-
-    # When n is not integer:
-    assert_raises(ValueError, RepeatedKFold, 2.5, 2)
-
-    # When n_folds is not integer:
-    assert_raises(ValueError, RepeatedKFold, 5, 1.5)
-
-    # When n_trials is not integer:
-    assert_raises(ValueError, RepeatedKFold, 5, 3, 1.5)
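Migration note: the diff above replaces this package's hand-rolled `RepeatedKFold` (built on the deprecated `sklearn.cross_validation._PartitionIterator`) with `sklearn.model_selection.RepeatedKFold`, available in scikit-learn 0.19 and later. The two APIs differ in three ways: the constructor takes `n_splits`/`n_repeats` instead of `n`/`n_folds`/`n_trials`, the sample count is no longer passed at construction time, and splits are consumed via `.split(X)` rather than by iterating the object itself. Below is a minimal sketch of equivalent usage before and after; the toy data and variable names are illustrative, not taken from this repo:

    import numpy as np
    from sklearn.model_selection import RepeatedKFold

    X = np.random.randn(100, 5)  # toy data: 100 samples, 5 features

    # Old (removed) iterator was constructed with the sample count and
    # iterated directly:
    #     cv = RepeatedKFold(X.shape[0], n_folds=3, n_trials=3)
    #     for train, test in cv: ...
    #
    # sklearn splitter: the sample count is inferred from X at split time.
    cv = RepeatedKFold(n_splits=3, n_repeats=3, random_state=0)
    for train_idx, test_idx in cv.split(X):
        X_train, X_test = X[train_idx], X[test_idx]
        # fit/score an estimator on each of the n_splits * n_repeats folds

Both versions yield n_splits * n_repeats (formerly n_folds * n_trials) train/test index pairs, so callers in quic_graph_lasso.py that pass `cv` as a `(n_folds, n_trials)` tuple see the same number of splits as before.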