[ENH] PCA on sparse data.

The PCA widget uses sklearn's TruncatedSVD when dealing with sparse data, since sklearn's PCA does not support sparse data. The old behaviour is preserved for non-sparse data (sklearn.decomposition.PCA). A radio button selects the decomposition (PCA, TruncatedSVD). On sparse data, TruncatedSVD is selected, and PCA and normalization are disabled. Changes: widgets/unsupervised/owpca.py: - Default method: PCA - When input data is sparse, the method changes to TruncatedSVD automatically and PCA is disabled. - In addition, the user can choose TruncatedSVD for non-sparse data. Due to sklearn's limitations, results for at most n_features-1 components can be shown. projection/pca.py: New SklProjector TruncatedSVD (wrapper returning a PCAModel). projection/base.py: supports_sparse class attribute. tests/test_pca.py: - Test for the TruncatedSVD projector - Remove the test that checks the sparse-data error message (not needed anymore).
biolab · May 17, 2017 · c7e9b43 · c7e9b43
1 parent 291642a
commit c7e9b43
Show file tree

Hide file tree

Showing 5 changed files with 123 additions and 23 deletions.
diff --git a/Orange/projection/base.py b/Orange/projection/base.py
@@ -82,6 +82,8 @@ class SklProjector(Projector, metaclass=WrapperMeta):
     __wraps__ = None
     _params = {}
     name = 'skl projection'
+    supports_sparse = True
+
     preprocessors = [Orange.preprocess.Continuize(),
                      Orange.preprocess.SklImpute()]
 

diff --git a/Orange/projection/pca.py b/Orange/projection/pca.py
@@ -18,7 +18,7 @@ def save_state(_):
 from Orange.projection import SklProjector, Projection
 from Orange.preprocess.score import LearnerScorer
 
-__all__ = ["PCA", "SparsePCA", "IncrementalPCA"]
+__all__ = ["PCA", "SparsePCA", "IncrementalPCA", "TruncatedSVD"]
 
 
 class _FeatureScorerMixin(LearnerScorer):
@@ -33,7 +33,8 @@ def score(self, data):
 
 class PCA(SklProjector, _FeatureScorerMixin):
     __wraps__ = skl_decomposition.PCA
-    name = 'pca'
+    name = 'PCA'
+    supports_sparse = False
 
     def __init__(self, n_components=None, copy=True, whiten=False,
                  svd_solver='auto', tol=0.0, iterated_power='auto',
@@ -60,7 +61,8 @@ def fit(self, X, Y=None):
 
 class SparsePCA(SklProjector):
     __wraps__ = skl_decomposition.SparsePCA
-    name = 'sparse pca'
+    name = 'Sparse PCA'
+    supports_sparse = False
 
     def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01,
                  max_iter=1000, tol=1e-8, method='lars', n_jobs=1, U_init=None,
@@ -124,7 +126,8 @@ def pca_variable(i):
 
 class IncrementalPCA(SklProjector):
     __wraps__ = skl_decomposition.IncrementalPCA
-    name = 'incremental pca'
+    name = 'Incremental PCA'
+    supports_sparse = False
 
     def __init__(self, n_components=None, whiten=False, copy=True,
                  batch_size=None, preprocessors=None):
@@ -152,6 +155,39 @@ def partial_fit(self, data):
         return self
 
 
+class TruncatedSVD(SklProjector, _FeatureScorerMixin):
+    """Projector wrapping scikit-learn's ``TruncatedSVD`` decomposition.
+
+    Fitting returns a ``PCAModel`` built around the fitted scikit-learn
+    estimator.
+    """
+    __wraps__ = skl_decomposition.TruncatedSVD
+    name = 'Truncated SVD'
+
+    def __init__(self, n_components=None, algorithm='randomized', n_iter=5,
+                 random_state=None, tol=0.0, preprocessors=None,
+                 max_components=None):
+        """Store the decomposition parameters.
+
+        ``n_components`` and ``max_components`` are mutually exclusive;
+        the remaining arguments are collected into ``self.params`` and
+        passed on to the wrapped estimator in ``fit``.
+
+        Raises:
+            ValueError: if both ``n_components`` and ``max_components``
+                are given.
+        """
+        super().__init__(preprocessors=preprocessors)
+        if n_components is not None and max_components is not None:
+            raise ValueError(
+                "n_components and max_components can not both be defined.")
+        # max_components limits the number of SVD components if the minimum
+        # shape of the X matrix (after preprocessing) is higher than
+        # max_components, so that sklearn does not always compute the full
+        # transform, which is faster and uses less memory for big data.
+        self.max_components = max_components
+        # NOTE(review): vars() captures every local, including ``self`` and
+        # ``max_components``; presumably the ``params`` setter keeps only the
+        # arguments the wrapped estimator accepts -- confirm in base.py.
+        self.params = vars()
+
+    def fit(self, X, Y=None):
+        """Fit the wrapped estimator on ``X`` and return a ``PCAModel``.
+
+        The number of components is taken from ``n_components``, else from
+        ``max_components``, and is capped at ``min(X.shape) - 1``. If
+        neither was given, that cap itself is used.
+        """
+        params = self.params.copy()
+        # strict requirement in scikit fit_transform:
+        # n_components must be < n_features
+        upper = min(X.shape) - 1
+        if params["n_components"] is None:
+            params["n_components"] = self.max_components
+
+        # With neither n_components nor max_components defined there is
+        # nothing to compare; use the upper bound instead of failing on a
+        # None/int comparison.
+        if params["n_components"] is None or params["n_components"] > upper:
+            params["n_components"] = upper
+
+        proj = self.__wraps__(**params)
+        proj = proj.fit(X, Y)
+        return PCAModel(proj, self.domain)
+
+
+
+
 class Projector(SharedComputeValue):
     """Transform into a given PCA component."""
 

diff --git a/Orange/tests/test_pca.py b/Orange/tests/test_pca.py
@@ -6,7 +6,7 @@
 import numpy as np
 
 from Orange.preprocess import Continuize, Normalize
-from Orange.projection import PCA, SparsePCA, IncrementalPCA
+from Orange.projection import PCA, SparsePCA, IncrementalPCA, TruncatedSVD
 from Orange.data import Table
 
 
@@ -84,6 +84,22 @@ def __ipca_test_helper(self, data, n_com, min_xpl_var):
         pc1_ipca = pca_model.partial_fit(data[1::2]).components_[0]
         self.assertAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 4)
 
+    def test_truncated_svd(self):
+        """Check TruncatedSVD on ionosphere for several component counts."""
+        data = self.ionosphere
+        # (number of components, minimal explained variance ratio)
+        cases = ((3, 0.5), (10, 0.7), (31, 0.99))
+        for n_components, min_variance in cases:
+            self.__truncated_svd_test_helper(
+                data, n_components=n_components, min_variance=min_variance)
+
+    def __truncated_svd_test_helper(self, data, n_components, min_variance):
+        """Fit TruncatedSVD on ``data`` and verify the resulting model.
+
+        Checks the total explained variance ratio against ``min_variance``,
+        the number and shape of the components, and that applying the model
+        equals projecting ``data.X`` onto the components.
+        """
+        model = TruncatedSVD(n_components=n_components)(data)
+        explained = np.sum(model.explained_variance_ratio_)
+        # small tolerance for floating point error in the summed ratios
+        self.assertGreaterEqual(explained + 1e-6, min_variance)
+        self.assertEqual(n_components, model.n_components)
+        expected_shape = (n_components, data.X.shape[1])
+        self.assertEqual(expected_shape, model.components_.shape)
+        projected = np.dot(data.X, model.components_.T)
+        np.testing.assert_almost_equal(model(data).X, projected)
+
     def test_compute_value(self):
         iris = self.iris
         pca = PCA(n_components=2)(iris)

diff --git a/Orange/widgets/unsupervised/owpca.py b/Orange/widgets/unsupervised/owpca.py
@@ -10,7 +10,7 @@
 from Orange.data import Table, Domain, StringVariable
 from Orange.data.sql.table import SqlTable, AUTO_DL_LIMIT
 from Orange.preprocess import Normalize
-from Orange.projection import PCA
+from Orange.projection import PCA, TruncatedSVD
 from Orange.widgets import widget, gui, settings
 
 try:
@@ -23,6 +23,10 @@
 # Maximum number of PCA components that we can set in the widget
 MAX_COMPONENTS = 100
 
+# Projector classes offered by the widget's "Decomposition" radio buttons,
+# in button order; ``decomposition_idx`` indexes into this list and its
+# default of 0 selects PCA.
+DECOMPOSITIONS = [
+    PCA,
+    TruncatedSVD
+]
 
 class OWPCA(widget.OWWidget):
     name = "PCA"
@@ -35,13 +39,16 @@ class OWPCA(widget.OWWidget):
                ("Components", Table),
                ("PCA", PCA)]
 
+    settingsHandler = settings.DomainContextHandler()
+
     ncomponents = settings.Setting(2)
     variance_covered = settings.Setting(100)
     batch_size = settings.Setting(100)
     address = settings.Setting('')
     auto_update = settings.Setting(True)
     auto_commit = settings.Setting(True)
-    normalize = settings.Setting(True)
+    normalize = settings.ContextSetting(True)
+    decomposition_idx = settings.ContextSetting(0)
     maxp = settings.Setting(20)
     axis_labels = settings.Setting(10)
 
@@ -66,10 +73,7 @@ def __init__(self):
         self._variance_ratio = None
         self._cumulative = None
         self._line = False
-        # max_components limit allows scikit-learn to select a faster method for big data
-        self._pca_projector = PCA(max_components=MAX_COMPONENTS)
-        self._pca_projector.component = self.ncomponents
-        self._pca_preprocessors = PCA.preprocessors
+        self._init_projector()
 
         # Components Selection
         box = gui.vBox(self.controlArea, "Components Selection")
@@ -121,10 +125,20 @@ def __init__(self):
 
         self.sampling_box.setVisible(remotely)
 
+        # Decomposition
+        self.decomposition_box = gui.radioButtons(
+            self.controlArea, self,
+            "decomposition_idx", [d.name for d in DECOMPOSITIONS],
+            box="Decomposition", callback=self._update_decomposition
+        )
+
         # Options
         self.options_box = gui.vBox(self.controlArea, "Options")
-        gui.checkBox(self.options_box, self, "normalize", "Normalize data",
-                     callback=self._update_normalize)
+        self.normalize_box = gui.checkBox(
+            self.options_box, self, "normalize",
+            "Normalize data", callback=self._update_normalize
+        )
+
         self.maxp_spin = gui.spin(
             self.options_box, self, "maxp", 1, MAX_COMPONENTS,
             label="Show only first", callback=self._setup_plot,
@@ -160,6 +174,23 @@ def update_model(self):
         else:
             self.__timer.stop()
 
+    def update_buttons_sparse(self):
+        """Adapt the decomposition radio buttons to sparse input data.
+
+        Disables the button of every decomposition that does not support
+        sparse data. If the currently selected decomposition is one of
+        them, the selection moves to the first decomposition that does.
+        """
+        needs_fallback = \
+            not DECOMPOSITIONS[self.decomposition_idx].supports_sparse
+
+        for i, cls in enumerate(DECOMPOSITIONS):
+            if not cls.supports_sparse:
+                self.decomposition_box.group.box.buttons[i].setEnabled(False)
+            elif needs_fallback:
+                # Fallback to first method that supports sparse
+                self.decomposition_idx = i
+                needs_fallback = False
+
+    def update_buttons_dense(self):
+        """Enable the radio button of every decomposition."""
+        buttons = self.decomposition_box.group.box.buttons
+        for index, _ in enumerate(DECOMPOSITIONS):
+            buttons[index].setEnabled(True)
+
     def start(self):
         if 'Abort' in self.start_button.text():
             self.rpca.abort()
@@ -175,6 +206,7 @@ def start(self):
             self.start_button.setText("Abort remote computation")
 
     def set_data(self, data):
+        self.closeContext()
         self.clear_messages()
         self.clear()
         self.start_button.setEnabled(False)
@@ -194,11 +226,21 @@ def set_data(self, data):
                 self.start_button.setEnabled(True)
         if not isinstance(data, SqlTable):
             self.sampling_box.setVisible(False)
+
+        self.openContext(data)
         if isinstance(data, Table):
             if data.is_sparse():
-                self.Error.sparse_data()
-                self.clear_outputs()
-                return
+                # PCA does not support sparse data
+                # Falling back to TruncatedSVD aka LSA
+                self.normalize = False
+                self.normalize_box.setEnabled(False)
+                self.update_buttons_sparse()
+            else:
+                self.normalize_box.setEnabled(True)
+                self.update_buttons_dense()
+
+            self._update_decomposition()
+
             if len(data.domain.attributes) == 0:
                 self.Error.no_features()
                 self.clear_outputs()
@@ -375,6 +417,17 @@ def _update_normalize(self):
         if self.data is None:
             self._invalidate_selection()
 
+    def _init_projector(self):
+        """Create the projector for the currently selected decomposition.
+
+        Sets ``_pca_projector`` to a new instance of the selected class and
+        ``_pca_preprocessors`` to that class's preprocessors.
+        """
+        decomposition = DECOMPOSITIONS[self.decomposition_idx]
+        # max_components limit allows scikit-learn to select a faster method
+        # for big data
+        projector = decomposition(max_components=MAX_COMPONENTS)
+        projector.component = self.ncomponents
+        self._pca_projector = projector
+        self._pca_preprocessors = decomposition.preprocessors
+
+    def _update_decomposition(self):
+        """React to a change of the selected decomposition.
+
+        Clears the widget messages, rebuilds the projector for the current
+        ``decomposition_idx`` and then runs ``_update_normalize``.
+        """
+        self.clear_messages()
+        self._init_projector()
+        self._update_normalize()
+
     def _nselected_components(self):
         """Return the number of selected components."""
         if self._pca is None:

diff --git a/Orange/widgets/unsupervised/tests/test_owpca.py b/Orange/widgets/unsupervised/tests/test_owpca.py
@@ -1,7 +1,6 @@
 # Test methods with long descriptive names can omit docstrings
 # pylint: disable=missing-docstring
 import numpy as np
-import scipy.sparse as sp
 
 from Orange.data import Table, Domain
 from Orange.widgets.tests.base import WidgetTest
@@ -70,9 +69,3 @@ def test_variance_shown(self):
         self.widget._update_selection_component_spin()
         var3 = self.widget.variance_covered
         self.assertGreater(var3, var2)
-
-    def test_error_on_sparse_data(self):
-        data = Table('iris')
-        data.X = sp.csr_matrix(data.X)
-        self.widget.set_data(data)
-        self.assertTrue(self.widget.Error.sparse_data.is_shown())