Jaccard distance: Move from a fallback to its own class

biolab · janezd · Mar 15, 2019 · Mar 1, 2019 · Mar 4, 2019 · Mar 4, 2019
commit d962655dbb60c9941c1c16d2983741ea8b4d2adf
diff --git a/Orange/distance/base.py b/Orange/distance/base.py
@@ -1,6 +1,5 @@
 import numpy as np
 import sklearn.metrics as skl_metrics
-from scipy.sparse import issparse, csr_matrix
 
 from Orange.data import Table, Domain, Instance, RowInstance
 from Orange.misc import DistMatrix
@@ -507,51 +506,3 @@ def __call__(self, e1, e2=None, axis=1, impute=False):
         else:
             dist_matrix = DistMatrix(dist)
         return dist_matrix
-
-class SparseJaccard:
-    """
-    Fallback for `Jaccard` on sparse data or raw numpy arrays. If data is
-    sparse, data normalized with intersection/union. Sklearn's function can't
-    handle discrete or missing data and normalization.
-    """
-
-    def __call__(self, e1, e2=None, axis=1, impute=False):
-        x1 = _orange_to_numpy(e1)
-        x2 = _orange_to_numpy(e2)
-        if axis == 0:
-            x1 = x1.T
-            if x2 is not None:
-                x2 = x2.T
-        if issparse(x1):
-            dist = self.sparse_jaccard(x1, x2)
-        else:
-            dist = skl_metrics.pairwise.pairwise_distances(x1,
-                                                           x2,
-                                                           metric="jaccard")
-        if impute and np.isnan(dist).any():
-            dist = np.nan_to_num(dist)
-        if isinstance(e1, (Table, RowInstance)):
-            dist_matrix = DistMatrix(dist, e1, e2, axis)
-        else:
-            dist_matrix = DistMatrix(dist)
-        return dist_matrix
-
-    def sparse_jaccard(self, x1, x2=None):
-        symmetric = x2 is None
-        if symmetric:
-            x2 = x1
-        x1 = csr_matrix(x1)
-        x1.eliminate_zeros()
-        x2 = csr_matrix(x2)
-        x2.eliminate_zeros()
-        n, m = x1.shape[0], x2.shape[0]
-        matrix = np.zeros((n, m))
-        for i in range(n):
-            xi_ind = set(x1[i].indices)
-            for j in range(i if symmetric else m):
-                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
-                           / len(set(x1[i].indices).union(x1[j].indices))
-                matrix[i, j] = jacc
-                if symmetric:
-                    matrix[j, i] = jacc
-        return matrix
diff --git a/Orange/distance/distance.py b/Orange/distance/distance.py
@@ -3,6 +3,7 @@
 
 import numpy as np
 from scipy import stats
+from scipy import sparse as sp
 import sklearn.metrics as skl_metrics
 from sklearn.utils.extmath import row_norms, safe_sparse_dot
 from sklearn.metrics import pairwise_distances
@@ -11,7 +12,7 @@
 from Orange.statistics import util
 
 from .base import (Distance, DistanceModel, FittedDistance, FittedDistanceModel,
-                   SklDistance, _orange_to_numpy, SparseJaccard)
+                   SklDistance, _orange_to_numpy)
 
 class EuclideanRowsModel(FittedDistanceModel):
     """
@@ -395,6 +396,9 @@ def compute_distances(self, x1, x2):
         compute distances between rows without missing values, and a slower
         loop for those with missing values.
         """
+        if sp.issparse(x1):
+            return self.sparse_jaccard(x1, x2)
+
         nonzeros1 = np.not_equal(x1, 0).view(np.int8)
         if self.axis == 1:
             nans1 = _distance.any_nan_row(x1)
@@ -414,21 +418,43 @@ def compute_distances(self, x1, x2):
             return _distance.jaccard_cols(
                 nonzeros1, x1, nans1, self.ps)
 
+    def sparse_jaccard(self, x1, x2=None):
+        symmetric = x2 is None
+        if symmetric:
+            x2 = x1
+        x1 = sp.csr_matrix(x1)
+        x1.eliminate_zeros()
+        x2 = sp.csr_matrix(x2)
+        x2.eliminate_zeros()
+        n, m = x1.shape[0], x2.shape[0]
+        matrix = np.zeros((n, m))
+        for i in range(n):
+            xi_ind = set(x1[i].indices)
+            for j in range(i if symmetric else m):
+                jacc = 1 - len(xi_ind.intersection(x2[j].indices))\
+                           / len(set(x1[i].indices).union(x1[j].indices))
+                matrix[i, j] = jacc
+                if symmetric:
+                    matrix[j, i] = jacc
+        return matrix
+
 
 class Jaccard(FittedDistance):
     supports_sparse = True
     supports_discrete = True
-    fallback = SparseJaccard()
     ModelType = JaccardModel
 
     def fit_rows(self, attributes, x, n_vals):
         """
         Return a model for computation of Jaccard values. The model stores
         frequencies of non-zero values per each column.
         """
-        ps = np.fromiter(
-            (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
-            dtype=np.double, count=len(n_vals))
+        if sp.issparse(x):
+            ps = None  # wrong!
+        else:
+            ps = np.fromiter(
+                (_distance.p_nonzero(x[:, col]) for col in range(len(n_vals))),
+                dtype=np.double, count=len(n_vals))
         return JaccardModel(attributes, self.axis, self.impute, ps)
 
     fit_cols = fit_rows

diff --git a/Orange/distance/tests/test_distance.py b/Orange/distance/tests/test_distance.py
@@ -27,13 +27,18 @@ def test_no_data(self):
 
     def test_sparse(self):
         """Test sparse support in distances."""
+        domain = Domain([ContinuousVariable(c) for c in "abc"])
+        dense_data = Table.from_list(
+            domain, [[1, 0, 2], [-1, 5, 0], [0, 1, 1], [7, 0, 0]])
+        sparse_data = Table(domain, csr_matrix(dense_data.X))
+
         if not self.Distance.supports_sparse:
-            self.assertRaises(TypeError, self.Distance, self.sparse_data)
+            self.assertRaises(TypeError, self.Distance, sparse_data)
         else:
-            # check the result is the same as for dense
-            dist_numpy = self.Distance(self.dense_X)
-            dist_sparse = self.Distance(self.sparse_data)
-            np.testing.assert_allclose(dist_sparse, dist_numpy)
+            # check the result is the same for sparse and dense
+            dist_dense = self.Distance(dense_data)
+            dist_sparse = self.Distance(sparse_data)
+            np.testing.assert_allclose(dist_sparse, dist_dense)
 
 
 class CommonFittedTests(CommonTests):
@@ -146,12 +151,6 @@ def setUp(self):
         self.mixed_data = self.data = Table.from_numpy(
             self.domain, np.hstack((self.cont_data.X[:3], self.disc_data.X)))
 
-        self.dense_X = np.array([[1, 0, 2],
-                                 [-1, 5, 0],
-                                 [0, 1, 1],
-                                 [7, 0, 0]])
-        self.sparse_data = Table(csr_matrix(self.dense_X))
-
 
 
 # Correct results in these tests were computed manually or with Excel;
@@ -846,12 +845,6 @@ def setUp(self):
              [1, 0, 1],
              [1, 0, 0]])
 
-        self.dense_X = np.array([[1, 0, 2],
-                      [-1, 5, 0],
-                      [0, 1, 1],
-                      [7, 0, 0]])
-        self.sparse_data = Table(csr_matrix(self.dense_X))
-
     def test_jaccard_rows(self):
         assert_almost_equal = np.testing.assert_almost_equal
 

diff --git a/Orange/tests/test_distances.py b/Orange/tests/test_distances.py
@@ -9,7 +9,6 @@
 import scipy.spatial
 import scipy.stats
 from scipy.sparse import csr_matrix
-from sklearn.exceptions import DataConversionWarning
 
 from Orange.data import (Table, Domain, ContinuousVariable,
                          DiscreteVariable, StringVariable, Instance)
@@ -500,27 +499,26 @@ def test_jaccard_distance_many_examples(self):
                       [0., 0., 0.5]]))
 
     def test_jaccard_distance_numpy(self):
-        with self.assertWarns(DataConversionWarning):
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
-                np.array([[0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic.X),
-                np.array([[0., 0., 0.5, 0.5],
-                          [0., 0., 0.5, 0.5],
-                          [0.5, 0.5, 0., 0.],
-                          [0.5, 0.5, 0., 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[2].x, self.titanic[:3].X),
-                np.array([[0.5, 0.5, 0.]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[3].x),
-                np.array([[0.5],
-                          [0.5]]))
-            np.testing.assert_almost_equal(
-                self.dist(self.titanic[:2].X, self.titanic[:3].X),
-                np.array([[0., 0., 0.5],
-                          [0., 0., 0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[0].x, self.titanic[2].x, axis=1),
+            np.array([[0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic.X),
+            np.array([[0., 0., 0.5, 0.5],
+                      [0., 0., 0.5, 0.5],
+                      [0.5, 0.5, 0., 0.],
+                      [0.5, 0.5, 0., 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[2].x, self.titanic[:3].X),
+            np.array([[0.5, 0.5, 0.]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[3].x),
+            np.array([[0.5],
+                      [0.5]]))
+        np.testing.assert_almost_equal(
+            self.dist(self.titanic[:2].X, self.titanic[:3].X),
+            np.array([[0., 0., 0.5],
+                      [0., 0., 0.5]]))
 
 
 # noinspection PyTypeChecker