Skip to content

Commit

Permalink
[ENH] PCA on sparse data.
Browse files Browse the repository at this point in the history
PCA widget uses sklearn's TruncatedSVD when dealing with sparse data, since sklearn's PCA does not support sparse data. Preserve old behaviour for non-sparse data (sklearn.decomposition.PCA).

Use radio button to select decomposition (PCA, TruncatedSVD).
On sparse data, TruncatedSVD is selected. PCA and normalization are disabled.

Changes:
widgets/unsupervised/owpca.py:
   - Default method: PCA
   - When input data is sparse, changes to TruncatedSVD automatically, PCA is disabled.
   - In addition, the user can choose TruncatedSVD for non-sparse data. Due to sklearn's limitations, at most n_features-1 components can be shown.

projection/pca.py: New SklProjector TruncatedSVD (Wrapper for PCAModel).
projection/base.py: supports_sparse class attribute.

tests/test_pca.py:
  - Test for TruncatedSVD projector
  - Remove test that checks error message (not needed anymore).
  • Loading branch information
acopar committed May 17, 2017
1 parent 291642a commit c7e9b43
Show file tree
Hide file tree
Showing 5 changed files with 123 additions and 23 deletions.
2 changes: 2 additions & 0 deletions Orange/projection/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ class SklProjector(Projector, metaclass=WrapperMeta):
__wraps__ = None
_params = {}
name = 'skl projection'
supports_sparse = True

preprocessors = [Orange.preprocess.Continuize(),
Orange.preprocess.SklImpute()]

Expand Down
44 changes: 40 additions & 4 deletions Orange/projection/pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def save_state(_):
from Orange.projection import SklProjector, Projection
from Orange.preprocess.score import LearnerScorer

__all__ = ["PCA", "SparsePCA", "IncrementalPCA"]
__all__ = ["PCA", "SparsePCA", "IncrementalPCA", "TruncatedSVD"]


class _FeatureScorerMixin(LearnerScorer):
Expand All @@ -33,7 +33,8 @@ def score(self, data):

class PCA(SklProjector, _FeatureScorerMixin):
__wraps__ = skl_decomposition.PCA
name = 'pca'
name = 'PCA'
supports_sparse = False

def __init__(self, n_components=None, copy=True, whiten=False,
svd_solver='auto', tol=0.0, iterated_power='auto',
Expand All @@ -60,7 +61,8 @@ def fit(self, X, Y=None):

class SparsePCA(SklProjector):
__wraps__ = skl_decomposition.SparsePCA
name = 'sparse pca'
name = 'Sparse PCA'
supports_sparse = False

def __init__(self, n_components=None, alpha=1, ridge_alpha=0.01,
max_iter=1000, tol=1e-8, method='lars', n_jobs=1, U_init=None,
Expand Down Expand Up @@ -124,7 +126,8 @@ def pca_variable(i):

class IncrementalPCA(SklProjector):
__wraps__ = skl_decomposition.IncrementalPCA
name = 'incremental pca'
name = 'Incremental PCA'
supports_sparse = False

def __init__(self, n_components=None, whiten=False, copy=True,
batch_size=None, preprocessors=None):
Expand Down Expand Up @@ -152,6 +155,39 @@ def partial_fit(self, data):
return self


class TruncatedSVD(SklProjector, _FeatureScorerMixin):
    """Projector that wraps scikit-learn's ``TruncatedSVD``.

    Fitting returns a ``PCAModel`` built around the fitted scikit-learn
    estimator, so the result can be used wherever a PCA model is expected.

    Parameters
    ----------
    n_components : int, optional
        Number of components to compute. Mutually exclusive with
        ``max_components``.
    algorithm, n_iter, random_state, tol
        Stored in ``self.params`` and handed on to the wrapped estimator.
    preprocessors : optional
        Forwarded to the base projector's constructor.
    max_components : int, optional
        Upper limit used when ``n_components`` is not given.

    Raises
    ------
    ValueError
        If both ``n_components`` and ``max_components`` are given.
    """
    __wraps__ = skl_decomposition.TruncatedSVD
    name = 'Truncated SVD'

    def __init__(self, n_components=None, algorithm='randomized', n_iter=5,
                 random_state=None, tol=0.0, preprocessors=None,
                 max_components=None):
        super().__init__(preprocessors=preprocessors)
        if n_components is not None and max_components is not None:
            raise ValueError("n_components and max_components can not both be defined.")
        # max_components limits the number of SVD components if the minimum
        # shape of the X matrix (after preprocessing) is higher than
        # max_components, so that sklearn does not always compute the full
        # transform, which is faster and uses less memory for big data.
        self.max_components = max_components
        # NOTE: vars() snapshots the locals of this call; do not introduce
        # extra local variables above this line.
        self.params = vars()

    def fit(self, X, Y=None):
        """Fit the wrapped estimator on ``X`` and return a ``PCAModel``.

        The number of components is taken from ``n_components``, falling
        back to ``max_components``. If neither is set, or the requested
        number is too large for ``X``, the largest allowed value
        ``min(X.shape) - 1`` is used instead.
        """
        params = self.params.copy()

        # strict requirement in scikit fit_transform:
        # n_components must be < n_features
        upper_bound = min(X.shape) - 1

        n_components = params["n_components"]
        if n_components is None:
            n_components = self.max_components
        # With both n_components and max_components left at None there is
        # nothing to compare against X's shape (None >= int raises
        # TypeError), so fall back to the largest allowed value.
        if n_components is None or n_components > upper_bound:
            n_components = upper_bound
        params["n_components"] = n_components

        proj = self.__wraps__(**params)
        proj = proj.fit(X, Y)
        return PCAModel(proj, self.domain)




class Projector(SharedComputeValue):
"""Transform into a given PCA component."""

Expand Down
18 changes: 17 additions & 1 deletion Orange/tests/test_pca.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import numpy as np

from Orange.preprocess import Continuize, Normalize
from Orange.projection import PCA, SparsePCA, IncrementalPCA
from Orange.projection import PCA, SparsePCA, IncrementalPCA, TruncatedSVD
from Orange.data import Table


Expand Down Expand Up @@ -84,6 +84,22 @@ def __ipca_test_helper(self, data, n_com, min_xpl_var):
pc1_ipca = pca_model.partial_fit(data[1::2]).components_[0]
self.assertAlmostEqual(abs(pc1_ipca.dot(pc1_pca)), 1, 4)

def test_truncated_svd(self):
    """Run the TruncatedSVD checks on ionosphere for several settings."""
    dataset = self.ionosphere
    # (number of components, minimal explained variance ratio)
    cases = ((3, 0.5), (10, 0.7), (31, 0.99))
    for count, threshold in cases:
        self.__truncated_svd_test_helper(
            dataset, n_components=count, min_variance=threshold)

def __truncated_svd_test_helper(self, data, n_components, min_variance):
    """Fit TruncatedSVD on `data` and verify the resulting model.

    Checks the summed explained variance ratio against `min_variance`
    (with a 1e-6 tolerance), the number and shape of components, and that
    transforming `data` equals projecting `data.X` onto the components.
    """
    projector = TruncatedSVD(n_components=n_components)
    fitted = projector(data)

    explained = np.sum(fitted.explained_variance_ratio_)
    self.assertGreaterEqual(explained + 1e-6, min_variance)

    self.assertEqual(n_components, fitted.n_components)
    expected_shape = (n_components, data.X.shape[1])
    self.assertEqual(expected_shape, fitted.components_.shape)

    expected_projection = np.dot(data.X, fitted.components_.T)
    np.testing.assert_almost_equal(fitted(data).X, expected_projection)

def test_compute_value(self):
iris = self.iris
pca = PCA(n_components=2)(iris)
Expand Down
75 changes: 64 additions & 11 deletions Orange/widgets/unsupervised/owpca.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from Orange.data import Table, Domain, StringVariable
from Orange.data.sql.table import SqlTable, AUTO_DL_LIMIT
from Orange.preprocess import Normalize
from Orange.projection import PCA
from Orange.projection import PCA, TruncatedSVD
from Orange.widgets import widget, gui, settings

try:
Expand All @@ -23,6 +23,10 @@
# Maximum number of PCA components that we can set in the widget
MAX_COMPONENTS = 100

# Decomposition methods offered by the widget. The position in this list is
# the value stored in the widget's `decomposition_idx` setting and the order
# of the radio buttons, so reordering changes what saved settings select.
DECOMPOSITIONS = [PCA, TruncatedSVD]

class OWPCA(widget.OWWidget):
name = "PCA"
Expand All @@ -35,13 +39,16 @@ class OWPCA(widget.OWWidget):
("Components", Table),
("PCA", PCA)]

settingsHandler = settings.DomainContextHandler()

ncomponents = settings.Setting(2)
variance_covered = settings.Setting(100)
batch_size = settings.Setting(100)
address = settings.Setting('')
auto_update = settings.Setting(True)
auto_commit = settings.Setting(True)
normalize = settings.Setting(True)
normalize = settings.ContextSetting(True)
decomposition_idx = settings.ContextSetting(0)
maxp = settings.Setting(20)
axis_labels = settings.Setting(10)

Expand All @@ -66,10 +73,7 @@ def __init__(self):
self._variance_ratio = None
self._cumulative = None
self._line = False
# max_components limit allows scikit-learn to select a faster method for big data
self._pca_projector = PCA(max_components=MAX_COMPONENTS)
self._pca_projector.component = self.ncomponents
self._pca_preprocessors = PCA.preprocessors
self._init_projector()

# Components Selection
box = gui.vBox(self.controlArea, "Components Selection")
Expand Down Expand Up @@ -121,10 +125,20 @@ def __init__(self):

self.sampling_box.setVisible(remotely)

# Decomposition
self.decomposition_box = gui.radioButtons(
self.controlArea, self,
"decomposition_idx", [d.name for d in DECOMPOSITIONS],
box="Decomposition", callback=self._update_decomposition
)

# Options
self.options_box = gui.vBox(self.controlArea, "Options")
gui.checkBox(self.options_box, self, "normalize", "Normalize data",
callback=self._update_normalize)
self.normalize_box = gui.checkBox(
self.options_box, self, "normalize",
"Normalize data", callback=self._update_normalize
)

self.maxp_spin = gui.spin(
self.options_box, self, "maxp", 1, MAX_COMPONENTS,
label="Show only first", callback=self._setup_plot,
Expand Down Expand Up @@ -160,6 +174,23 @@ def update_model(self):
else:
self.__timer.stop()

def update_buttons_sparse(self):
    """Restrict the decomposition radio buttons for sparse input.

    Disables the button of every entry in DECOMPOSITIONS whose
    `supports_sparse` is false. If the currently selected decomposition is
    one of those, `decomposition_idx` is moved to the first entry that
    does support sparse data; if there is none, it is left unchanged.
    """
    selected = DECOMPOSITIONS[self.decomposition_idx]
    needs_fallback = not selected.supports_sparse

    for i, cls in enumerate(DECOMPOSITIONS):
        if not cls.supports_sparse:
            self.decomposition_box.group.box.buttons[i].setEnabled(False)
        elif needs_fallback:
            # Fall back to the first method that supports sparse data.
            self.decomposition_idx = i
            needs_fallback = False

def update_buttons_dense(self):
    """Enable the radio button of every entry in DECOMPOSITIONS."""
    for position, _ in enumerate(DECOMPOSITIONS):
        radio = self.decomposition_box.group.box.buttons[position]
        radio.setEnabled(True)

def start(self):
if 'Abort' in self.start_button.text():
self.rpca.abort()
Expand All @@ -175,6 +206,7 @@ def start(self):
self.start_button.setText("Abort remote computation")

def set_data(self, data):
self.closeContext()
self.clear_messages()
self.clear()
self.start_button.setEnabled(False)
Expand All @@ -194,11 +226,21 @@ def set_data(self, data):
self.start_button.setEnabled(True)
if not isinstance(data, SqlTable):
self.sampling_box.setVisible(False)

self.openContext(data)
if isinstance(data, Table):
if data.is_sparse():
self.Error.sparse_data()
self.clear_outputs()
return
# PCA does not support sparse data
# Falling back to TruncatedSVD aka LSA
self.normalize = False
self.normalize_box.setEnabled(False)
self.update_buttons_sparse()
else:
self.normalize_box.setEnabled(True)
self.update_buttons_dense()

self._update_decomposition()

if len(data.domain.attributes) == 0:
self.Error.no_features()
self.clear_outputs()
Expand Down Expand Up @@ -375,6 +417,17 @@ def _update_normalize(self):
if self.data is None:
self._invalidate_selection()

def _init_projector(self):
    """Create the projector for the currently selected decomposition.

    Instantiates the class at `decomposition_idx` in DECOMPOSITIONS with
    `max_components=MAX_COMPONENTS`, stores it in `_pca_projector`, copies
    `ncomponents` onto its `component` attribute and remembers the class's
    `preprocessors` in `_pca_preprocessors`.
    """
    selected = DECOMPOSITIONS[self.decomposition_idx]
    projector = selected(max_components=MAX_COMPONENTS)
    self._pca_projector = projector
    projector.component = self.ncomponents
    self._pca_preprocessors = selected.preprocessors

def _update_decomposition(self):
    """React to a change of the selected decomposition method.

    Clears the widget's messages, rebuilds the projector for the current
    `decomposition_idx` and then runs the normalization update.
    """
    self.clear_messages()
    # The projector must be rebuilt before the normalization update runs.
    self._init_projector()
    self._update_normalize()

def _nselected_components(self):
"""Return the number of selected components."""
if self._pca is None:
Expand Down
7 changes: 0 additions & 7 deletions Orange/widgets/unsupervised/tests/test_owpca.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Test methods with long descriptive names can omit docstrings
# pylint: disable=missing-docstring
import numpy as np
import scipy.sparse as sp

from Orange.data import Table, Domain
from Orange.widgets.tests.base import WidgetTest
Expand Down Expand Up @@ -70,9 +69,3 @@ def test_variance_shown(self):
self.widget._update_selection_component_spin()
var3 = self.widget.variance_covered
self.assertGreater(var3, var2)

def test_error_on_sparse_data(self):
    # Feed the widget iris with a CSR-sparse X and expect the sparse-data
    # error to be shown.
    sparse_iris = Table('iris')
    sparse_iris.X = sp.csr_matrix(sparse_iris.X)
    self.widget.set_data(sparse_iris)
    error_shown = self.widget.Error.sparse_data.is_shown()
    self.assertTrue(error_shown)

0 comments on commit c7e9b43

Please sign in to comment.