Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] Support sparse Jaccard #3657

Merged
merged 9 commits into from
Mar 15, 2019
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Jaccard distance: Move from a fallback to its own class
  • Loading branch information
janezd committed Mar 13, 2019
commit d962655dbb60c9941c1c16d2983741ea8b4d2adf
49 changes: 0 additions & 49 deletions Orange/distance/base.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import numpy as np
import sklearn.metrics as skl_metrics
from scipy.sparse import issparse, csr_matrix

from Orange.data import Table, Domain, Instance, RowInstance
from Orange.misc import DistMatrix
Expand Down Expand Up @@ -507,51 +506,3 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
else:
dist_matrix = DistMatrix(dist)
return dist_matrix

class SparseJaccard:
"""
Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is
sparse, data normalized with intersection/union. Sklearn's function can't
handle discrete or missing data and normalization.
"""

def __call__(self, e1, e2=None, axis=1, impute=False):
x1 = _orange_to_numpy(e1)
x2 = _orange_to_numpy(e2)
if axis == 0:
x1 = x1.T
if x2 is not None:
x2 = x2.T
if issparse(x1):
dist = self.sparse_jaccard(x1, x2)
else:
dist = skl_metrics.pairwise.pairwise_distances(x1,
x2,
metric="jaccard")
if impute and np.isnan(dist).any():
dist = np.nan_to_num(dist)
if isinstance(e1, (Table, RowInstance)):
dist_matrix = DistMatrix(dist, e1, e2, axis)
else:
dist_matrix = DistMatrix(dist)
return dist_matrix

def sparse_jaccard(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = csr_matrix(x1)
x1.eliminate_zeros()
x2 = csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix
36 changes: 31 additions & 5 deletions Orange/distance/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
from scipy import stats
from scipy import sparse as sp
import sklearn.metrics as skl_metrics
from sklearn.utils.extmath import row_norms, safe_sparse_dot
from sklearn.metrics import pairwise_distances
Expand All @@ -11,7 +12,7 @@
from Orange.statistics import util

from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
SklDistance, _orange_to_numpy, SparseJaccard)
SklDistance, _orange_to_numpy)

class EuclideanRowsModel(FittedDistanceModel):
"""
Expand Down Expand Up @@ -395,6 +396,9 @@ def compute_distances(self, x1, x2):
compute distances between rows without missing values, and a slower
loop for those with missing values.
"""
if sp.issparse(x1):
return self.sparse_jaccard(x1, x2)

nonzeros1 = np.not_equal(x1, 0).view(np.int8)
if self.axis == 1:
nans1 = _distance.any_nan_row(x1)
Expand All @@ -414,21 +418,43 @@ def compute_distances(self, x1, x2):
return _distance.jaccard_cols(
nonzeros1, x1, nans1, self.ps)

def sparse_jaccard(self, x1, x2=None):
symmetric = x2 is None
if symmetric:
x2 = x1
x1 = sp.csr_matrix(x1)
x1.eliminate_zeros()
x2 = sp.csr_matrix(x2)
x2.eliminate_zeros()
n, m = x1.shape[0], x2.shape[0]
matrix = np.zeros((n, m))
for i in range(n):
xi_ind = set(x1[i].indices)
for j in range(i if symmetric else m):
jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
/ len(set(x1[i].indices).union(x1[j].indices))
matrix[i, j] = jacc
if symmetric:
matrix[j, i] = jacc
return matrix


class Jaccard(FittedDistance):
supports_sparse = True
supports_discrete = True
fallback = SparseJaccard()
ModelType = JaccardModel

def fit_rows(self, attributes, x, n_vals):
"""
Return a model for computation of Jaccard values. The model stores
frequencies of non-zero values per each column.
"""
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
if sp.issparse(x):
ps = None # wrong!
else:
ps = np.fromiter(
(_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
dtype=np.double, count=len(n_vals))
return JaccardModel(attributes, self.axis, self.impute, ps)

fit_cols = fit_rows
Expand Down
27 changes: 10 additions & 17 deletions Orange/distance/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,13 +27,18 @@ def test_no_data(self):

def test_sparse(self):
"""Test sparse support in distances."""
domain = Domain([ContinuousVariable(c) for c in "abc"])
dense_data = Table.from_list(
domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
sparse_data = Table(domain, csr_matrix(dense_data.X))

if not self.Distance.supports_sparse:
self.assertRaises(TypeError, self.Distance, self.sparse_data)
self.assertRaises(TypeError, self.Distance, sparse_data)
else:
# check the result is the same as for dense
dist_numpy = self.Distance(self.dense_X)
dist_sparse = self.Distance(self.sparse_data)
np.testing.assert_allclose(dist_sparse, dist_numpy)
# check the result is the same for sparse and dense
dist_dense = self.Distance(dense_data)
dist_sparse = self.Distance(sparse_data)
np.testing.assert_allclose(dist_sparse, dist_dense)


class CommonFittedTests(CommonTests):
Expand Down Expand Up @@ -146,12 +151,6 @@ def setUp(self):
self.mixed_data = self.data = Table.from_numpy(
self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))

self.dense_X = np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]])
self.sparse_data = Table(csr_matrix(self.dense_X))



# Correct results in these tests were computed manually or with Excel;
Expand Down Expand Up @@ -846,12 +845,6 @@ def setUp(self):
[1, 0, 1],
[1, 0, 0]])

self.dense_X = np.array([[1, 0, 2],
[-1, 5, 0],
[0, 1, 1],
[7, 0, 0]])
self.sparse_data = Table(csr_matrix(self.dense_X))

def test_jaccard_rows(self):
assert_almost_equal = np.testing.assert_almost_equal

Expand Down
42 changes: 20 additions & 22 deletions Orange/tests/test_distances.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
import scipy.spatial
import scipy.stats
from scipy.sparse import csr_matrix
from sklearn.exceptions import DataConversionWarning

from Orange.data import (Table, Domain, ContinuousVariable,
DiscreteVariable, StringVariable, Instance)
Expand Down Expand Up @@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
[0., 0., 0.5]]))

def test_jaccard_distance_numpy(self):
with self.assertWarns(DataConversionWarning):
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
np.array([[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic.X),
np.array([[0., 0., 0.5, 0.5],
[0., 0., 0.5, 0.5],
[0.5, 0.5, 0., 0.],
[0.5, 0.5, 0., 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[2].x, self.titanic[:3].X),
np.array([[0.5, 0.5, 0.]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[3].x),
np.array([[0.5],
[0.5]]))
np.testing.assert_almost_equal(
self.dist(self.titanic[:2].X, self.titanic[:3].X),
np.array([[0., 0., 0.5],
[0., 0., 0.5]]))


# noinspection PyTypeChecker
Expand Down