diff --git a/cleanlab/data_valuation.py b/cleanlab/data_valuation.py index 7a80babc82..fcc00b9276 100644 --- a/cleanlab/data_valuation.py +++ b/cleanlab/data_valuation.py @@ -19,14 +19,12 @@ """ -from typing import Callable, Optional, Union, cast +from typing import Callable, Optional, Union import numpy as np from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted + +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np.ndarray: @@ -45,29 +43,6 @@ def _knn_shapley_score(knn_graph: csr_matrix, labels: np.ndarray, k: int) -> np. return 0.5 * (np.mean(scores / k, axis=0) + 1) -def _process_knn_graph_from_features( - features: np.ndarray, metric: Optional[Union[str, Callable]], k: int = 10 -) -> csr_matrix: - """Calculate the knn graph from the features if it is not provided in the kwargs.""" - if k > len(features): # Ensure number of neighbors less than number of examples - raise ValueError( - f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)." - ) - if metric == None: - metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features) - knn_graph = knn.kneighbors_graph(mode="distance") - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - return knn_graph - - def data_shapley_knn( labels: np.ndarray, *, @@ -135,6 +110,7 @@ def data_shapley_knn( if knn_graph is None and features is None: raise ValueError("Either knn_graph or features must be provided.") + # Use provided knn_graph or compute it from features if knn_graph is None: - knn_graph = _process_knn_graph_from_features(cast(np.ndarray, features), metric, k) + knn_graph, _ = create_knn_graph_and_index(features, n_neighbors=k, metric=metric) return _knn_shapley_score(knn_graph, labels, k) diff --git a/cleanlab/datalab/internal/issue_manager/data_valuation.py b/cleanlab/datalab/internal/issue_manager/data_valuation.py index 759c9bc400..347a443189 100644 --- a/cleanlab/datalab/internal/issue_manager/data_valuation.py +++ b/cleanlab/datalab/internal/issue_manager/data_valuation.py @@ -25,19 +25,15 @@ Optional, Union, ) -import warnings import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.exceptions import NotFittedError -from sklearn.neighbors import NearestNeighbors -from sklearn.utils.validation import check_is_fitted from cleanlab.data_valuation import data_shapley_knn from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt @@ -139,33 +135,11 @@ def find_issues( ) raise TypeError(error_msg) if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." 
- ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric).fit(features) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." - ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - - knn_graph = knn.kneighbors_graph(mode="distance") - scores = data_shapley_knn(labels, knn_graph=knn_graph, k=self.k) self.issues = pd.DataFrame( diff --git a/cleanlab/datalab/internal/issue_manager/duplicate.py b/cleanlab/datalab/internal/issue_manager/duplicate.py index a446d111fc..fb1fd7f319 100644 --- a/cleanlab/datalab/internal/issue_manager/duplicate.py +++ b/cleanlab/datalab/internal/issue_manager/duplicate.py @@ -21,12 +21,10 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted + from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index from cleanlab.internal.constants import EPSILON if TYPE_CHECKING: # pragma: no cover @@ -76,32 +74,10 @@ def find_issues( metric_changes = self.metric and self.metric != old_knn_metric if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." - ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." 
- ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - - knn_graph = knn.kneighbors_graph(mode="distance") N = knn_graph.shape[0] nn_distances = knn_graph.data.reshape(N, -1)[:, 0] median_nn_distance = max(np.median(nn_distances), EPSILON) # avoid threshold = 0 diff --git a/cleanlab/datalab/internal/issue_manager/noniid.py b/cleanlab/datalab/internal/issue_manager/noniid.py index 12d42859b4..b28f8c3868 100644 --- a/cleanlab/datalab/internal/issue_manager/noniid.py +++ b/cleanlab/datalab/internal/issue_manager/noniid.py @@ -1,19 +1,16 @@ from __future__ import annotations from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, Optional, Union, cast -import warnings import itertools from scipy.stats import gaussian_kde import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index, features_to_knn if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt @@ -203,28 +200,8 @@ def _setup_knn( return None features_to_use = self._determine_features(features, pred_probs) - if self.metric is None: - self.metric = ( - "cosine" - if features_to_use.shape[1] > 3 - else "euclidean" if features_to_use.shape[0] > 100 else euclidean - ) - - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." - ) - self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features_to_use) - + knn = features_to_knn(features_to_use, n_neighbors=self.k, metric=self.metric) + self.metric = knn.metric # Update the metric to the one used in the KNN object. return knn def find_issues( @@ -305,7 +282,7 @@ def collect_info( } if knn_graph is None: assert knn is not None, "If knn_graph is None, knn must be provided." - knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr] + knn_graph = construct_knn_graph_from_index(knn) assert knn_graph is not None, "knn_graph must be provided or computed." 
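+        # The graph is either reused from caller-provided info or freshly built from the fitted index above.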
statistics_dict = self._build_statistics_dictionary(knn_graph=knn_graph) diff --git a/cleanlab/datalab/internal/issue_manager/outlier.py b/cleanlab/datalab/internal/issue_manager/outlier.py index 0ba01c0e00..93ca6b798e 100644 --- a/cleanlab/datalab/internal/issue_manager/outlier.py +++ b/cleanlab/datalab/internal/issue_manager/outlier.py @@ -23,6 +23,7 @@ import pandas as pd from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores if TYPE_CHECKING: # pragma: no cover @@ -219,7 +220,7 @@ def _process_knn_graph_from_features(self, kwargs: Dict) -> csr_matrix: # If the pre-existing knn graph has fewer neighbors than the knn object, # then we need to recompute the knn graph assert knn == self.ood.params["knn"] # type: ignore[union-attr] - knn_graph = knn.kneighbors_graph(mode="distance") # type: ignore[union-attr] + knn_graph = construct_knn_graph_from_index(knn) self._metric = knn.metric # type: ignore[union-attr] return knn_graph diff --git a/cleanlab/datalab/internal/issue_manager/underperforming_group.py b/cleanlab/datalab/internal/issue_manager/underperforming_group.py index 707d788fd1..6f52055af0 100644 --- a/cleanlab/datalab/internal/issue_manager/underperforming_group.py +++ b/cleanlab/datalab/internal/issue_manager/underperforming_group.py @@ -22,13 +22,10 @@ import numpy as np import pandas as pd from scipy.sparse import csr_matrix -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors -from sklearn.exceptions import NotFittedError -from sklearn.utils.validation import check_is_fitted from sklearn.cluster import DBSCAN from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index from cleanlab.rank import get_self_confidence_for_each_label if TYPE_CHECKING: # pragma: no cover @@ -153,31 +150,10 @@ def set_knn_graph( metric_changes = self.metric and self.metric != old_knn_metric if knn_graph is None or metric_changes: - if features is None: - raise ValueError( - "If a knn_graph is not provided, features must be provided to fit a new knn." - ) - if self.metric is None: - self.metric = ( - "cosine" - if features.shape[1] > 3 - else "euclidean" if features.shape[0] > 100 else euclidean - ) - knn = NearestNeighbors(n_neighbors=self.k, metric=self.metric) - - if self.metric and self.metric != knn.metric: - warnings.warn( - f"Metric {self.metric} does not match metric {knn.metric} used to fit knn. " - "Most likely an existing NearestNeighbors object was passed in, but a different " - "metric was specified." 
- ) + knn_graph, knn = create_knn_graph_and_index( + features, n_neighbors=self.k, metric=self.metric + ) self.metric = knn.metric - - try: - check_is_fitted(knn) - except NotFittedError: - knn.fit(features) - knn_graph = knn.kneighbors_graph(mode="distance") return knn_graph def perform_clustering(self, knn_graph: csr_matrix) -> npt.NDArray[np.int_]: diff --git a/cleanlab/internal/neighbor/__init__.py b/cleanlab/internal/neighbor/__init__.py new file mode 100644 index 0000000000..2575a85545 --- /dev/null +++ b/cleanlab/internal/neighbor/__init__.py @@ -0,0 +1 @@ +from .knn_graph import features_to_knn diff --git a/cleanlab/internal/neighbor/knn_graph.py b/cleanlab/internal/neighbor/knn_graph.py new file mode 100644 index 0000000000..fbbe96318d --- /dev/null +++ b/cleanlab/internal/neighbor/knn_graph.py @@ -0,0 +1,180 @@ +from __future__ import annotations +from typing import Optional, TYPE_CHECKING, Tuple + +import numpy as np +from scipy.sparse import csr_matrix +from sklearn.neighbors import NearestNeighbors + +if TYPE_CHECKING: + from cleanlab.typing import FeatureArray, Metric + +from cleanlab.internal.neighbor.metric import decide_default_metric +from cleanlab.internal.neighbor.search import construct_knn + + +DEFAULT_K = 10 +"""Default number of neighbors to consider in the k-nearest neighbors search, +unless the size of the feature array is too small or the user specifies a different value. + +This should be the largest desired value of k for all desired issue types that require a KNN graph. + +E.g. if near duplicates wants k=1 but outliers wants 10, then DEFAULT_K should be 10. This way, all issue types can rely on the same KNN graph. +""" + + +def features_to_knn( + features: Optional[FeatureArray], + *, + n_neighbors: Optional[int] = None, + metric: Optional[Metric] = None, + **sklearn_knn_kwargs, +) -> NearestNeighbors: + """Build and fit a k-nearest neighbors search object from an array of numerical features. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + n_neighbors : + The number of nearest neighbors to consider. If None, a default value is determined based on the feature array size. + metric : + The distance metric to use for computing distances between points. If None, the metric is determined based on the feature array shape. + **sklearn_knn_kwargs : + Additional keyword arguments to be passed to the search index constructor. + + Returns + ------- + knn : + A k-nearest neighbors search object fitted to the input feature array. + + Examples + -------- + + >>> import numpy as np + >>> from cleanlab.internal.neighbor import features_to_knn + >>> features = np.random.rand(100, 10) + >>> knn = features_to_knn(features) + >>> knn + NearestNeighbors(metric='cosine', n_neighbors=10) + """ + if features is None: + raise ValueError("Both knn and features arguments cannot be None at the same time.") + # Use provided metric if available, otherwise decide based on the features. + metric = metric or decide_default_metric(features) + + # Decide the number of neighbors to use in the KNN search. + n_neighbors = _configure_num_neighbors(features, n_neighbors) + + knn = construct_knn(n_neighbors, metric, **sklearn_knn_kwargs) + return knn.fit(features) + + +def construct_knn_graph_from_index(knn: NearestNeighbors) -> csr_matrix: + """Construct a sparse distance matrix representation of KNN graph out of a fitted NearestNeighbors search object. 
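+
+    Row i of the returned matrix stores the distances from point i to its k nearest
+    neighbors, as found by the fitted search object; all other entries are implicit zeros.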
+ + Parameters + ---------- + knn : + A NearestNeighbors object that has been fitted to a feature array. + The knn graph is constructed based on the distances and indices of each feature row's nearest neighbors. + + Returns + ------- + knn_graph : + A sparse, weighted adjacency matrix representing the KNN graph of the feature array. + + Note + ---- + This is *not* intended to construct a KNN graph of test data. It is only used to construct a KNN graph of the data used to fit the NearestNeighbors object. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.internal.neighbor.knn_graph import features_to_knn, construct_knn_graph_from_index + >>> features = np.array([ + ... [0.701, 0.701], + ... [0.900, 0.436], + ... [0.000, 1.000], + ... ]) + >>> knn = features_to_knn(features, n_neighbors=1) + >>> knn_graph = construct_knn_graph_from_index(knn) + >>> knn_graph.toarray() # For demonstration purposes only. It is generally a bad idea to transform to dense matrix for large graphs. + array([[0. , 0.33140006, 0. ], + [0.33140006, 0. , 0. ], + [0.76210367, 0. , 0. ]]) + """ + + distances, indices = knn.kneighbors(return_distance=True) + + N, K = distances.shape + + # Pointers to the row elements distances[indptr[i]:indptr[i+1]], + # and their corresponding column indices indices[indptr[i]:indptr[i+1]]. + indptr = np.arange(0, N * K + 1, K) + + return csr_matrix((distances.reshape(-1), indices.reshape(-1), indptr), shape=(N, N)) + + +def create_knn_graph_and_index( + features: Optional[FeatureArray], + *, + n_neighbors: Optional[int] = None, + metric: Optional[Metric] = None, + **sklearn_knn_kwargs, +) -> Tuple[csr_matrix, NearestNeighbors]: + """Calculate the KNN graph from the features if it is not provided in the kwargs. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + n_neighbors : + The number of nearest neighbors to consider. If None, a default value is determined based on the feature array size. + metric : + The distance metric to use for computing distances between points. If None, the metric is determined based on the feature array shape. + **sklearn_knn_kwargs : + Additional keyword arguments to be passed to the search index constructor. + + Returns + ------- + knn_graph : + A sparse, weighted adjacency matrix representing the KNN graph of the feature array. + knn : + A k-nearest neighbors search object fitted to the input feature array. This object can be used to query the nearest neighbors of new data points. + + Examples + -------- + >>> import numpy as np + >>> from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index + >>> features = np.array([ + ... [0.701, 0.701], + ... [0.900, 0.436], + ... [0.000, 1.000], + ... ]) + >>> knn_graph, knn = create_knn_graph_and_index(features, n_neighbors=1) + >>> knn_graph.toarray() # For demonstration purposes only. It is generally a bad idea to transform to dense matrix for large graphs. + array([[0. , 0.33140006, 0. ], + [0.33140006, 0. , 0. ], + [0.76210367, 0. , 0. ]]) + >>> knn + NearestNeighbors(metric=, n_neighbors=1) # For demonstration purposes only. The actual metric may vary. 
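+
+    A sketch of reusing the returned search index to query the neighbors of a new point
+    (same toy `features` as above; the exact distances depend on the chosen metric):
+
+    >>> query = np.array([[0.8, 0.6]])
+    >>> distances, indices = knn.kneighbors(query, n_neighbors=1)
+    >>> indices.shape
+    (1, 1)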
+    """
+    # Construct NearestNeighbors object
+    knn = features_to_knn(features, n_neighbors=n_neighbors, metric=metric, **sklearn_knn_kwargs)
+    # Build graph from NearestNeighbors object
+    knn_graph = construct_knn_graph_from_index(knn)
+    return knn_graph, knn
+
+
+def _configure_num_neighbors(features: FeatureArray, k: Optional[int]) -> int:
+    # Error if the provided value is greater than or equal to the number of examples.
+    N = features.shape[0]
+    k_larger_than_dataset = k is not None and k >= N
+    if k_larger_than_dataset:
+        raise ValueError(
+            f"Number of nearest neighbors k={k} must be less than the number of examples N={N} passed into the estimator (knn)."
+        )
+
+    # Either use the provided value or select a default value based on the feature array size.
+    k = k or min(DEFAULT_K, N - 1)
+    return k
diff --git a/cleanlab/internal/neighbor/metric.py b/cleanlab/internal/neighbor/metric.py
new file mode 100644
index 0000000000..321be90ecb
--- /dev/null
+++ b/cleanlab/internal/neighbor/metric.py
@@ -0,0 +1,107 @@
+from scipy.spatial.distance import euclidean
+
+from cleanlab.typing import FeatureArray, Metric
+
+HIGH_DIMENSION_CUTOFF: int = 3
+"""
+If the number of columns (M) in the `features` array is greater than this cutoff value,
+then by default, K-nearest-neighbors will use the "cosine" metric.
+The cosine metric is more suitable for high-dimensional data.
+Otherwise the "euclidean" distance will be used.
+"""
+ROW_COUNT_CUTOFF: int = 100
+"""
+Only affects settings where Euclidean metrics would be used by default.
+If the number of rows (N) in the `features` array is greater than this cutoff value,
+then by default, Euclidean distances are computed via the "euclidean" metric
+(implemented in sklearn for efficiency reasons).
+Otherwise, Euclidean distances are by default computed via
+the ``euclidean`` metric from scipy (slower but numerically more precise/accurate).
+"""
+
+
+# Metric decision functions
+def _euclidean_large_dataset() -> str:
+    return "euclidean"
+
+
+def _euclidean_small_dataset() -> Metric:
+    return euclidean
+
+
+def _cosine_metric() -> str:
+    return "cosine"
+
+
+def decide_euclidean_metric(features: FeatureArray) -> Metric:
+    """
+    Decide the appropriate Euclidean metric implementation based on the size of the dataset.
+
+    Parameters
+    ----------
+    features :
+        The input features array.
+
+    Returns
+    -------
+    metric :
+        A string or a callable representing a specific implementation of computing the euclidean distance.
+
+    Note
+    ----
+    A choice is made between two implementations of the euclidean metric based on
+    the number of rows in the feature array.
+    If the number of rows (N) in the feature array is greater than a predefined
+    cutoff value (ROW_COUNT_CUTOFF), the ``"euclidean"`` metric string is used,
+    because the sklearn implementation performs better on larger datasets.
+    Otherwise, the ``euclidean`` metric function from scipy is returned,
+    which is slower but numerically more precise.
+
+    See Also
+    --------
+    ROW_COUNT_CUTOFF: The cutoff value for the number of rows in the feature array.
+    sklearn.metrics.pairwise.euclidean_distances: The euclidean metric function from scikit-learn.
+    scipy.spatial.distance.euclidean: The euclidean metric function from scipy.
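+
+    Examples
+    --------
+    A small sketch of the size-based switch (ROW_COUNT_CUTOFF is 100 by default):
+
+    >>> import numpy as np
+    >>> from cleanlab.internal.neighbor.metric import decide_euclidean_metric
+    >>> decide_euclidean_metric(np.random.rand(101, 2))
+    'euclidean'
+    >>> callable(decide_euclidean_metric(np.random.rand(50, 2)))  # scipy callable for small N
+    True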
+ """ + num_rows = features.shape[0] + if num_rows > ROW_COUNT_CUTOFF: + return _euclidean_large_dataset() + else: + return _euclidean_small_dataset() + + +# Main function to decide the metric +def decide_default_metric(features: FeatureArray) -> Metric: + """ + Decide the KNN metric to be used based on the shape of the feature array. + + Parameters + ---------- + features : + The input feature array, with shape (N, M), where N is the number of samples and M is the number of features. + + Returns + ------- + metric : + The distance metric to be used for neighbor search. It can be either a string + representing the metric name ("cosine" or "euclidean") or a callable + representing the metric function from scipy (euclidean). + + Note + ---- + The decision of which metric to use is based on the shape of the feature array. + If the number of columns (M) in the feature array is greater than a predefined + cutoff value (HIGH_DIMENSION_CUTOFF), the "cosine" metric is used. This is because the cosine + metric is more suitable for high-dimensional data. + + Otherwise, a euclidean metric is used. + That is handled by the :py:meth:`~cleanlab.internal.neighbor.metric.decide_euclidean_metric` function. + + See Also + -------- + HIGH_DIMENSION_CUTOFF: The cutoff value for the number of columns in the feature array. + sklearn.metrics.pairwise.cosine_distances: The cosine metric function from scikit-learn + """ + if features.shape[1] > HIGH_DIMENSION_CUTOFF: + return _cosine_metric() + return decide_euclidean_metric(features) diff --git a/cleanlab/internal/neighbor/search.py b/cleanlab/internal/neighbor/search.py new file mode 100644 index 0000000000..cf4bb0ab57 --- /dev/null +++ b/cleanlab/internal/neighbor/search.py @@ -0,0 +1,75 @@ +from __future__ import annotations +from typing import TYPE_CHECKING + +from sklearn.neighbors import NearestNeighbors + + +if TYPE_CHECKING: + + from cleanlab.typing import Metric + + +def construct_knn(n_neighbors: int, metric: Metric, **knn_kwargs) -> NearestNeighbors: + """ + Constructs a k-nearest neighbors search object. You can implement a similar method to run cleanlab with your own approximate-KNN library. + + Parameters + ---------- + n_neighbors : + The number of nearest neighbors to consider. + metric : + The distance metric to use for computing distances between points. + See :py:mod:`~cleanlab.internal.neighbor.metric` for more information. + **knn_kwargs: + Additional keyword arguments to be passed to the search index constructor. + See https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.NearestNeighbors.html for more details on the available options. + + Returns + ------- + knn : + A k-nearest neighbors search object compatible with the scikit-learn NearestNeighbors class interface. + + Implements: + + - `fit` method: Accepts a feature array `X` to fit the model. + This enables subsequent neighbor searches on the data. + - `kneighbors` method: Finds the K-neighbors of a point, returning distances and indices of the k-nearest neighbors. Handles two scenarios: + 1. When a query array `features: np.ndarray` is provided, it returns the distances and indices for each point in the query array. + 2. When no query array is provided (`features = None`), it returns neighbors for each indexed point without considering the query point as its own neighbor. + Optionally, allows re-specification of the number of neighbors for each query point, defaulting to the constructor's value if not specified. 
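+
+    A minimal duck-typed stand-in satisfying the `fit`/`kneighbors` contract above could
+    look like the following (hypothetical sketch, for illustration only; a complete
+    implementation would also expose the attributes listed next)::
+
+        class ApproxKNN:
+            def __init__(self, n_neighbors, metric, metric_params=None):
+                self.n_neighbors, self.metric, self.metric_params = n_neighbors, metric, metric_params
+
+            def fit(self, X):
+                self._fit_X = X  # build the (approximate) search index over X here
+                return self
+
+            def kneighbors(self, X=None, n_neighbors=None):
+                ...  # return (distances, indices); exclude self-matches when X is None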
+
+    Attributes:
+
+    - `n_neighbors`: Number of neighbors to consider.
+    - `metric`: Distance metric used to compute distances between points.
+    - `metric_params`: Additional parameters for the distance metric function.
+
+    Optional:
+
+    - `kneighbors_graph` method: Not required but can be implemented for convenience.
+      Responsibility for graph construction has shifted to :py:func:`~cleanlab.internal.neighbor.knn_graph.construct_knn_graph_from_index`.
+
+    Fitted Attributes:
+
+    - `n_features_in_`: Number of features observed during fit.
+    - `effective_metric_params_`: Metric parameters used in distance computation.
+    - `effective_metric_`: Metric used for computing distances to neighbors.
+    - `n_samples_fit_`: Number of samples in the fitted data.
+
+    Additional:
+
+    - `__sklearn_is_fitted__`: Method returning a boolean indicating if the object is fitted,
+      useful for conducting an is_fitted validation, which verifies the presence of fitted attributes (typically named with a trailing underscore).
+
+    These specifications ensure compatibility and give developers a clear directive for
+    integrating alternative k-nearest neighbors implementations.
+
+    Note
+    ----
+    The `metric` argument may be either a string naming a metric supported by the search index
+    or a callable that takes two points and returns the distance between them.
+    The additional keyword arguments (`**knn_kwargs`) are passed directly to the underlying k-nearest neighbors search algorithm.
+
+    """
+    sklearn_knn = NearestNeighbors(n_neighbors=n_neighbors, metric=metric, **knn_kwargs)
+
+    return sklearn_knn
diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py
index b851f0d305..cb3432a0db 100644
--- a/cleanlab/outlier.py
+++ b/cleanlab/outlier.py
@@ -21,10 +21,9 @@
 """
 
 import warnings
-from typing import Callable, Dict, Optional, Tuple, Union
+from typing import Dict, Optional, Tuple, Union
 
 import numpy as np
-from scipy.spatial.distance import euclidean
 from sklearn.exceptions import NotFittedError
 from sklearn.neighbors import NearestNeighbors
 
@@ -33,6 +32,7 @@
     _subtract_confident_thresholds,
     get_normalized_entropy,
 )
+from cleanlab.internal.neighbor.knn_graph import features_to_knn
 from cleanlab.internal.numerics import softmax
 from cleanlab.internal.outlier import correct_precision_errors, transform_distances_to_scores
 from cleanlab.internal.validation import assert_valid_inputs, labels_to_array
@@ -424,30 +424,13 @@ def _get_ood_features_scores(
     distance_metric = None
     if knn is None:  # setup default KNN estimator
         # Make sure both knn and features are not None
-        if features is None:
-            raise ValueError(
-                "Both knn and features arguments cannot be None at the same time. Not enough information to compute outlier scores."
-            )
-        if k is None:
-            k = DEFAULT_K  # use default when knn and k are both None
-        N, M = features.shape
-        if k > N:  # Ensure number of neighbors less than number of examples
-            raise ValueError(
-                f"Number of nearest neighbors k={k} cannot exceed the number of examples N={len(features)} passed into the estimator (knn)."
- ) - - # strings are used for sklearn metrics, callables are scipy pairwise distance functions - metric: Union[str, Callable] - if M > 3: # use euclidean distance for lower dimensional spaces - metric = "cosine" - elif N > 100: # Use efficient implementation (numerically unstable in edge cases) - metric = "euclidean" - else: # Use scipy implementation for precise results - metric = euclidean - - knn = NearestNeighbors(n_neighbors=k, metric=metric).fit(features) + knn = features_to_knn(features, n_neighbors=k) features = None # features should be None in knn.kneighbors(features) to avoid counting duplicate data points - distance_metric = metric if isinstance(metric, str) else str(metric.__name__) + # Log knn metric as string to ensure compatibility for score correction + distance_metric = ( + metric if isinstance((metric := knn.metric), str) else str(metric.__name__) + ) + k = knn.n_neighbors elif k is None: k = knn.n_neighbors diff --git a/cleanlab/regression/rank.py b/cleanlab/regression/rank.py index 4badef6a13..439ddbb02a 100644 --- a/cleanlab/regression/rank.py +++ b/cleanlab/regression/rank.py @@ -27,9 +27,9 @@ from typing import Dict, Callable, Optional, Union import numpy as np from numpy.typing import ArrayLike -from scipy.spatial.distance import euclidean -from sklearn.neighbors import NearestNeighbors +from cleanlab.internal.neighbor.metric import decide_euclidean_metric +from cleanlab.internal.neighbor.knn_graph import features_to_knn from cleanlab.outlier import OutOfDistribution from cleanlab.internal.regression_utils import assert_valid_prediction_inputs @@ -142,7 +142,7 @@ def _get_outre_score_for_each_label( *, residual_scale: float = 5, frac_neighbors: float = 0.5, - neighbor_metric: Optional[Union[str, Callable]] = "euclidean", + neighbor_metric: Optional[Union[str, Callable]] = None, ) -> np.ndarray: """Returns OUTRE based label-quality scores. @@ -181,12 +181,9 @@ def _get_outre_score_for_each_label( features = np.array([labels, residual]).T neighbors = int(np.ceil(frac_neighbors * labels.shape[0])) - if neighbor_metric is None: - if features.shape[0] > 100: - neighbor_metric = "euclidean" - else: - neighbor_metric = euclidean - knn = NearestNeighbors(n_neighbors=neighbors, metric=neighbor_metric).fit(features) + # Use provided metric or select a decent implementation of the euclidean metric for knn search + neighbor_metric = neighbor_metric or decide_euclidean_metric(features) + knn = features_to_knn(features, n_neighbors=neighbors, metric=neighbor_metric) ood = OutOfDistribution(params={"knn": knn}) label_quality_scores = ood.score(features=features) diff --git a/cleanlab/typing.py b/cleanlab/typing.py index 995d20bdec..48aabe1a04 100644 --- a/cleanlab/typing.py +++ b/cleanlab/typing.py @@ -1,4 +1,4 @@ -from typing import Any, Union +from typing import Any, Callable, Union import numpy as np import pandas as pd @@ -8,3 +8,20 @@ DatasetLike = Any """Type for objects that behave like datasets.""" + +########################################################### +# Types aliases used in cleanlab/internal/neighbor/ modules +########################################################### + +FeatureArray = np.ndarray +"""A type alias for a 2D numpy array representing numerical features.""" +Metric = Union[str, Callable] +"""A type alias for the distance metric to be used for neighbor search. It can be either a string +representing the metric name ("cosine" or "euclidean") or a callable representing the metric function from scipy (euclidean). 
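+
+A short sketch of both accepted forms (illustrative only):
+
+>>> from scipy.spatial.distance import euclidean
+>>> from cleanlab.typing import Metric
+>>> metric_by_name: Metric = "cosine"        # string understood by sklearn
+>>> metric_by_callable: Metric = euclidean   # scipy callable, used for small datasets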
+ +Valid values for metric are mentioned in the scikit-learn documentation for the sklearn.metrics.pairwise_distances function. + +See Also +-------- +sklearn.metrics.pairwise_distances: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances.html#sklearn-metrics-pairwise-distances +""" diff --git a/docs/source/cleanlab/internal/index.rst b/docs/source/cleanlab/internal/index.rst index 6642479281..df59cbcbb5 100644 --- a/docs/source/cleanlab/internal/index.rst +++ b/docs/source/cleanlab/internal/index.rst @@ -17,6 +17,7 @@ internal label_quality_utils multilabel_utils multilabel_scorer + neighbor/index outlier token_classification_utils validation diff --git a/docs/source/cleanlab/internal/neighbor/index.rst b/docs/source/cleanlab/internal/neighbor/index.rst new file mode 100644 index 0000000000..27439ece96 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/index.rst @@ -0,0 +1,21 @@ +neighbor +======== + +The `neighbor` modules provide functionality for performing nearest neighbor search and pairwise distance calculations in those searches. + +This submodule consists of the following modules: + +- `neighbor.knn_graph`: Contains functions for setting up a nearest neighbor search index and constructing knn graphs. +- `neighbor.search`: Contains a helper function that wraps the default implementation of nearest neighbor searches. +- `neighbor.metric`: Contains functions for selecting distance metrics for nearest neighbor searches. + +.. automodule:: cleanlab.internal.neighbor + :autosummary: + :members: + :undoc-members: + :show-inheritance: + +.. toctree:: + knn_graph + metric + search diff --git a/docs/source/cleanlab/internal/neighbor/knn_graph.rst b/docs/source/cleanlab/internal/neighbor/knn_graph.rst new file mode 100644 index 0000000000..1486a76e16 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/knn_graph.rst @@ -0,0 +1,8 @@ +knn_graph +========= + +.. automodule:: cleanlab.internal.neighbor.knn_graph + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/internal/neighbor/metric.rst b/docs/source/cleanlab/internal/neighbor/metric.rst new file mode 100644 index 0000000000..f78f47cf50 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/metric.rst @@ -0,0 +1,8 @@ +metric +====== + +.. automodule:: cleanlab.internal.neighbor.metric + :autosummary: + :members: + :undoc-members: + :show-inheritance: \ No newline at end of file diff --git a/docs/source/cleanlab/internal/neighbor/search.rst b/docs/source/cleanlab/internal/neighbor/search.rst new file mode 100644 index 0000000000..056bfbc0a2 --- /dev/null +++ b/docs/source/cleanlab/internal/neighbor/search.rst @@ -0,0 +1,8 @@ +search +====== + +.. 
automodule:: cleanlab.internal.neighbor.search
   :autosummary:
   :members:
   :undoc-members:
   :show-inheritance:
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index 82b8ba2d76..fab96aa43f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -3,6 +3,7 @@ ignore =
 select = F401,F403
 per-file-ignores =
     cleanlab/__init__.py: F401
+    cleanlab/internal/neighbor/__init__.py: F401
     cleanlab/token_classification/__init__.py: F401
    cleanlab/benchmarking/__init__.py: F401
    cleanlab/regression/__init__.py: F401
diff --git a/tests/datalab/datalab/test_datalab.py b/tests/datalab/datalab/test_datalab.py
index f52115180c..9693c11544 100644
--- a/tests/datalab/datalab/test_datalab.py
+++ b/tests/datalab/datalab/test_datalab.py
@@ -938,6 +938,16 @@ def test_non_iid_issues_pred_probs_knn_graph_checks(self, lab, random_embeddings
         issues_2 = lab_2.get_issues("non_iid")
         pd.testing.assert_frame_equal(issues_1, issues_2)
 
+    def test_all_identical_dataset(self):
+        """Test that the non-IID issue finder correctly identifies an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"non_iid": {}})
+        assert lab.get_issues()["is_non_iid_issue"].sum() == 1
+        assert lab.get_issue_summary()["score"].values[0] < 0.05
+
 
 class TestDatalabFindLabelIssues:
     @pytest.fixture
@@ -1036,6 +1046,17 @@ def test_incremental_search(self, pred_probs, random_embeddings):
         outlier_summary = lab.get_issue_summary("outlier")
         assert outlier_summary["num_issues"].values[0] > 0
 
+    def test_all_identical_dataset(self):
+        """Test that the outlier issue finder correctly handles an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"outlier": {}})
+        outlier_issues = lab.get_issues("outlier")
+        assert outlier_issues["is_outlier_issue"].sum() == 0
+        np.testing.assert_allclose(outlier_issues["outlier_score"].to_numpy(), 1)
+
 
 class TestDatalabFindNearDuplicateIssues:
     @pytest.fixture
@@ -1148,6 +1169,17 @@ def test_fixed_embeddings_outputs(self, fixed_embeddings):
         unique_non_empty_sets = [tuple(s) for s in near_duplicate_sets if len(s) > 0]
         assert len(set(unique_non_empty_sets)) == 18
 
+    def test_all_identical_dataset(self):
+        """Test that the near duplicate issue finder correctly flags an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(features=features, issue_types={"near_duplicate": {}})
+        near_duplicate_issues = lab.get_issues("near_duplicate")
+        assert near_duplicate_issues["is_near_duplicate_issue"].sum() == N
+        np.testing.assert_allclose(near_duplicate_issues["near_duplicate_score"].to_numpy(), 0)
+
 
 class TestDatalabWithoutLabels:
     num_examples = 100
@@ -1402,6 +1434,22 @@ def test_no_cluster_ids(self, data):
         )
         assert len(lab.issue_summary["issue_type"].values) == 0
 
+    def test_all_identical_dataset(self):
+        """Test that the underperforming group issue finder correctly handles an all-identical dataset."""
+        N, M = 200, 10
+        data = {"labels": [0] * N}
+        features = np.full((N, M), fill_value=np.random.rand(M))
+        pred_probs = np.full((N, 2), fill_value=[1, 0])
+        lab = Datalab(data=data, label_name="labels")
+        lab.find_issues(
+            features=features, pred_probs=pred_probs,
issue_types={"underperforming_group": {}} + ) + underperforming_issues = lab.get_issues("underperforming_group") + assert underperforming_issues["is_underperforming_group_issue"].sum() == 0 + np.testing.assert_allclose( + underperforming_issues["underperforming_group_score"].to_numpy(), 1 + ) + class TestDataLabNullIssues: K = 3 @@ -1610,6 +1658,23 @@ def test_find_issues_with_different_metrics(self, dataset, knn_graph): lab.get_info("statistics")["weighted_knn_graph"].toarray(), ) + def test_all_identical_dataset(self): + """Test that the data_valuation issue finder correctly handles an all-identical dataset.""" + N, M = 11, 10 + data = {"labels": [0] * N} + features = np.full((N, M), fill_value=np.random.rand(M)) + lab = Datalab(data=data, label_name="labels") + lab.find_issues(features=features, issue_types={"data_valuation": {}}) + data_valuation_issues = lab.get_issues("data_valuation") + assert data_valuation_issues["is_data_valuation_issue"].sum() == 0 + + # For a full knn-graph, all data points have the same value. Here, they all contribute the same value. + # The score of 54/99 is a value that works for 11 identical data points. + # TODO: Find a reasonable test for larger dataset, with k much smaller than N. Hard to guarantee a score of 0.5. + np.testing.assert_allclose( + data_valuation_issues["data_valuation_score"].to_numpy(), 54 / 99 + ) + class TestIssueManagersReuseKnnGraph: """ diff --git a/tests/internal/neighbor/test_metric.py b/tests/internal/neighbor/test_metric.py new file mode 100644 index 0000000000..ae65a242c0 --- /dev/null +++ b/tests/internal/neighbor/test_metric.py @@ -0,0 +1,31 @@ +import numpy as np +import pytest + +from cleanlab.internal.neighbor.metric import decide_default_metric + + +@pytest.mark.parametrize( + "N", + [2, 10, 50, 100, 101], +) +def test_decide_default_metric_for_2d_and_3d_features(N): + # 2D and 3D features should always use the euclidean metric, disregarding the different implementations. + for M in [2, 3]: + X = np.random.rand(N, M) + metric = decide_default_metric(X) + if hasattr(metric, "__name__"): + error_msg = "The metric should be the string 'euclidean' for N > 100." + assert N <= 100, error_msg + metric = getattr(metric, "__name__") + assert metric == "euclidean" + + +@pytest.mark.parametrize( + "M", + [4, 5, 10, 50, 100], +) +def test_decide_default_metric_for_high_dimensional_features(M): + # High-dimensional features should always use the cosine metric. + X = np.random.rand(100, M) + metric = decide_default_metric(X) + assert metric == "cosine" diff --git a/tests/internal/neighbor/test_neighbor.py b/tests/internal/neighbor/test_neighbor.py new file mode 100644 index 0000000000..b3f6180a2d --- /dev/null +++ b/tests/internal/neighbor/test_neighbor.py @@ -0,0 +1,105 @@ +from typing import cast +import pytest +import numpy as np +from sklearn.neighbors import NearestNeighbors + +from cleanlab.internal.neighbor import features_to_knn +from cleanlab.internal.neighbor.knn_graph import construct_knn_graph_from_index + + +@pytest.mark.parametrize( + "N", + [2, 10, 100, 101], + ids=lambda x: f"N={x}", +) +@pytest.mark.parametrize( + "M", + [2, 3, 4, 5, 10, 50, 100], + ids=lambda x: f"M={x}", +) +def test_features_to_knn(N, M): + + features = np.random.rand(N, M) + if N >= 100: + features[-10:] = features[-11] # Make the last 11 entries all identical, as an edge-case. 
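+        # The duplicates let the checks below verify that identical points get (near-)zero neighbor distances.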
+    knn = features_to_knn(features)
+
+    assert isinstance(knn, NearestNeighbors)
+    knn = cast(NearestNeighbors, knn)
+    assert knn.n_neighbors == min(10, N - 1)
+    if M > 3:
+        metric = knn.metric
+        assert metric == "cosine"
+    else:
+        metric = knn.metric
+        if N <= 100:
+            assert hasattr(metric, "__name__")
+            metric = metric.__name__
+        assert metric == "euclidean"
+
+    if N >= 100:
+        distances, indices = knn.kneighbors(n_neighbors=10)
+        # Assert that the last 10 rows are identical to the 11th last row.
+        assert np.allclose(features[-10:], features[-11])
+        np.testing.assert_allclose(distances[-11:], 0, atol=1e-15)
+        # All the indices belong to the same example, so the set of indices should be the same.
+        # No guarantees about the order of the indices, but each point is not considered its own neighbor.
+        np.testing.assert_allclose(np.unique(indices[-11:]), np.arange(start=N - 11, stop=N))
+
+    # The knn object should be fitted to the features.
+    # TODO: This is not a good test, but it's the best we can do without exposing the internal state of the knn object.
+    assert knn._fit_X is features
+
+
+def test_knn_kwargs():
+    """Check that features_to_knn passes additional keyword arguments to the NearestNeighbors constructor correctly."""
+    N, M = 100, 10
+    features = np.random.rand(N, M)
+    V = features.var(axis=0)
+    knn = features_to_knn(
+        features,
+        n_neighbors=6,
+        metric="seuclidean",
+        metric_params={"V": V},
+    )
+
+    assert knn.n_neighbors == 6
+    assert knn.radius == 1.0
+    assert (alg := knn.algorithm) == "auto"
+    assert knn.leaf_size == 30
+    assert knn.metric == "seuclidean"
+    assert knn.metric_params == {"V": V}
+    assert knn.p == 2
+    assert knn._fit_X is features  # Not a public attribute, bad idea to rely on this attribute.
+
+    # Attributes estimated from fitted data
+    assert knn.n_features_in_ == M
+    assert knn.effective_metric_params_ == {"V": V}
+    assert knn.effective_metric_ == "seuclidean"
+    assert knn.n_samples_fit_ == N
+    assert knn._fit_method == (
+        "ball_tree" if alg == "auto" else alg
+    )  # Should be one of ["kd_tree", "ball_tree", "brute"], set with "algorithm"
+
+
+@pytest.mark.parametrize("metric", ["cosine", "euclidean"])
+def test_construct_knn_graph_from_index(metric):
+    N, k = 100, 10
+    knn = NearestNeighbors(n_neighbors=k, metric=metric)
+    X = np.random.rand(N, 10)
+    knn.fit(X)
+    knn_graph = construct_knn_graph_from_index(knn)
+
+    assert knn_graph.shape == (N, N)
+    assert knn_graph.nnz == N * k
+    assert knn_graph.dtype == np.float64
+    assert np.all(knn_graph.data >= 0)
+    assert np.all(knn_graph.indices >= 0)
+    assert np.all(knn_graph.indices < N)
+
+    distances = knn_graph.data.reshape(N, k)
+    indices = knn_graph.indices.reshape(N, k)
+
+    # Assert all rows in distances are sorted
+    assert np.all(np.diff(distances, axis=1) >= 0)
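
Putting the new modules together, a minimal end-to-end sketch of the intended usage
(illustrative only; concrete numbers depend on the random data):

    import numpy as np
    from cleanlab.internal.neighbor.knn_graph import create_knn_graph_and_index

    features = np.random.rand(500, 8)  # M=8 > HIGH_DIMENSION_CUTOFF, so "cosine" is chosen by default
    knn_graph, knn = create_knn_graph_and_index(features, n_neighbors=5)

    assert knn_graph.shape == (500, 500)  # sparse CSR graph with 5 stored distances per row
    assert knn_graph.nnz == 500 * 5
    distances, indices = knn.kneighbors(np.random.rand(2, 8))  # reuse the fitted index for new queries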