Skip to content

Commit

Permalink
Merge pull request #3657 from ajdapretnar/sparse-jaccard
Browse files Browse the repository at this point in the history
[ENH] Support sparse Jaccard
  • Loading branch information
janezd authored Mar 15, 2019
2 parents 7705335 + a30e688 commit 574f2c2
Show file tree
Hide file tree
Showing 5 changed files with 87 additions and 45 deletions.
21 changes: 14 additions & 7 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# TODO this *private* function is called from several widgets to prepare
# data for calling the below classes. After we (mostly) stopped relying
# on sklearn.metrics, this is (mostly) unnecessary

def _preprocess(table, impute=True):
"""Remove categorical attributes and impute missing values."""
if not len(table):
Expand Down Expand Up @@ -273,8 +274,9 @@ def __init__(self, attributes, axis=1, impute=False):
self.attributes = attributes

def __call__(self, e1, e2=None):
if e1.domain.attributes != self.attributes or \
e2 is not None and e2.domain.attributes != self.attributes:
if self.attributes is not None and (
e1.domain.attributes != self.attributes
or e2 is not None and e2.domain.attributes != self.attributes):
raise ValueError("mismatching domains")
return super().__call__(e1, e2)

Expand Down Expand Up @@ -348,12 +350,17 @@ def fit(self, data):
Prepare the data on attributes, call `fit_cols` or `fit_rows` and
return the resulting model.
"""
attributes = data.domain.attributes
x = _orange_to_numpy(data)
n_vals = np.fromiter(
(len(attr.values) if attr.is_discrete else 0
for attr in attributes),
dtype=np.int32, count=len(attributes))
if hasattr(data, "domain"):
attributes = data.domain.attributes
n_vals = np.fromiter(
(len(attr.values) if attr.is_discrete else 0
for attr in attributes),
dtype=np.int32, count=len(attributes))
else:
assert isinstance(x, np.ndarray)
attributes = None
n_vals = np.zeros(x.shape[1], dtype=np.int32)
return [self.fit_cols, self.fit_rows][self.axis](attributes, x, n_vals)

def fit_cols(self, attributes, x, n_vals):
Expand Down
41 changes: 35 additions & 6 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances
Expand Down Expand Up @@ -368,7 +369,7 @@ def prepare_data(x):
data1 = prepare_data(x1)
data2 = data1 if x2 is None else prepare_data(x2)
dist = safe_sparse_dot(data1, data2.T)
np.clip(dist, 0, 1, out=dist)
np.clip(dist, -1, 1, out=dist)
if x2 is None:
diag = np.diag_indices_from(dist)
dist[diag] = np.where(np.isnan(dist[diag]), np.nan, 1.0)
Expand All @@ -385,6 +386,12 @@ def __init__(self, attributes, axis, impute, ps):
self.ps = ps

def compute_distances(self, x1, x2):
if sp.issparse(x1):
return self._compute_sparse(x1, x2)
else:
return self._compute_dense(x1, x2)

def _compute_dense(self, x1, x2):
"""
The method uses a function implemented in Cython. Data (`x1` and `x2`)
is accompanied by two tables. One is a 2-d table in which elements of
Expand Down Expand Up @@ -414,21 +421,43 @@ def compute_distances(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def _compute_sparse(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = sp.csr_matrix(x1)
x1.eliminate_zeros()
x2 = sp.csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix


class Jaccard(FittedDistance):
supports_sparse = False
supports_sparse = True
supports_discrete = True
fallback = SklDistance('jaccard')
ModelType = JaccardModel

def fit_rows(self, attributes, x, n_vals):
"""
Return a model for computation of Jaccard values. The model stores
frequencies of non-zero values per each column.
"""
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
if sp.issparse(x):
ps = x.getnnz(axis=0)
else:
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
return JaccardModel(attributes, self.axis, self.impute, ps)

fit_cols = fit_rows
Expand Down
13 changes: 10 additions & 3 deletions Orange/distance/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,18 @@ def test_no_data(self):

def test_sparse(self):
"""Test sparse support in distances."""
sparse_iris = csr_matrix(Table('iris').X)
domain = Domain([ContinuousVariable(c) for c in "abc"])
dense_data = Table.from_list(
domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
sparse_data = Table(domain, csr_matrix(dense_data.X))

if not self.Distance.supports_sparse:
self.assertRaises(TypeError, self.Distance, sparse_iris)
self.assertRaises(TypeError, self.Distance, sparse_data)
else:
self.Distance(sparse_iris)
# check the result is the same for sparse and dense
dist_dense = self.Distance(dense_data)
dist_sparse = self.Distance(sparse_data)
np.testing.assert_allclose(dist_sparse, dist_dense)


class CommonFittedTests(CommonTests):
Expand Down
42 changes: 20 additions & 22 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix
from sklearn.exceptions import DataConversionWarning

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
Expand Down Expand Up @@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
[0., 0., 0.5]]))

def test_jaccard_distance_numpy(self):
with self.assertWarns(DataConversionWarning):
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))


# noinspection PyTypeChecker
Expand Down
15 changes: 8 additions & 7 deletions Orange/widgets/unsupervised/owdistances.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,11 @@ def _check_sparse():

def _fix_discrete():
nonlocal data
if data.domain.has_discrete_attributes() and (
issparse(data.X) and getattr(metric, "fallback", None)
or not metric.supports_discrete
or self.axis == 1 and metric is not distance.Jaccard):
if data.domain.has_discrete_attributes() \
and metric is not distance.Jaccard \
and (issparse(data.X) and getattr(metric, "fallback", None)
or not metric.supports_discrete
or self.axis == 1):
if not data.domain.has_continuous_attributes():
self.Error.no_continuous_features()
return False
Expand All @@ -131,7 +132,7 @@ def _fix_discrete():

def _fix_nonbinary():
nonlocal data
if metric is distance.Jaccard:
if metric is distance.Jaccard and not issparse(data.X):
nbinary = sum(a.is_discrete and len(a.values) == 2
for a in data.domain.attributes)
if not nbinary:
Expand All @@ -151,11 +152,11 @@ def _fix_missing():

self.clear_messages()
if data is None:
return
return None
for check in (_check_sparse,
_fix_discrete, _fix_missing, _fix_nonbinary):
if not check():
return
return None
try:
if metric.supports_normalization and self.normalized_dist:
return metric(data, axis=1 - self.axis, impute=True,
Expand Down

0 comments on commit 574f2c2

Please sign in to comment.