add predictions argument to find_issues method of RegressionLabelIssu…

…eManager Move feature-based and prediction-based strategies to different helper functions within the module. Add several tests for the issue_manager and the find_issues method.
cleanlab · elisno · Dec 7, 2023 · Nov 22, 2023 · Nov 22, 2023 · Nov 22, 2023
commit 48b776fe63a7fcba6d32f39b1c15392dc45a6daa
diff --git a/cleanlab/datalab/internal/issue_manager/regression/label.py b/cleanlab/datalab/internal/issue_manager/regression/label.py
@@ -17,13 +17,14 @@
 from __future__ import annotations
 
 from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional
+import numpy as np
+import pandas as pd
 
 from cleanlab.regression.learn import CleanLearning
 from cleanlab.datalab.internal.issue_manager import IssueManager
+from cleanlab.regression.rank import get_label_quality_scores
 
 if TYPE_CHECKING:  # pragma: no cover
-    import numpy as np
-    import pandas as pd
     from cleanlab.datalab.datalab import Datalab
 
 
@@ -63,45 +64,41 @@ def __init__(
     ):
         super().__init__(datalab)
         self.cl = CleanLearning(**(clean_learning_kwargs or {}))
-
-    @staticmethod
-    def _process_find_label_issues_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
-        """Searches for keyword arguments that are meant for the
-        CleanLearning.find_label_issues method call
-
-        Examples
-        --------
-        >>> from cleanlab.datalab.internal.issue_manager.regression.label import LabelIssueManager
-        >>> RegressionLabelIssueManager._process_find_label_issues_kwargs({'coarse_search_range': [0.1, 0.9]})
-        {'coarse_search_range': [0.1, 0.9]}
-        """
-        accepted_kwargs = [
-            "uncertainty",
-            "coarse_search_range",
-            "fine_search_size",
-            "save_space",
-            "model_kwargs",
-        ]
-        return {k: v for k, v in kwargs.items() if k in accepted_kwargs and v is not None}
+        # This is a field for prioritizing features only when using a custom model
+        self._uses_custom_model = "model" in (clean_learning_kwargs or {})
 
     def find_issues(
         self,
-        features: np.ndarray,
+        features: Optional[np.ndarray] = None,
+        predictions: Optional[np.ndarray] = None,
         **kwargs,
     ) -> None:
         """Find label issues in the datalab."""
-        if features is None:
+        if features is None and predictions is None:
             raise ValueError(
-                "Regression requires numerical `features` "
+                "Regression requires numerical `features` or `predictions` "
                 "to be passed in as an argument to `find_issues`."
             )
-
-        self.issues = self.cl.find_label_issues(
-            X=features,
-            y=self.datalab.labels,
-            **self._process_find_label_issues_kwargs(kwargs),
-        )
-        self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)
+        # If features are provided and either a custom model is used or no predictions are provided
+        use_features = features is not None and (self._uses_custom_model or predictions is None)
+        if use_features:
+            assert features is not None  # mypy won't narrow the type for some reason
+            self.issues = find_issues_with_features(
+                features=features,
+                y=self.datalab.labels,
+                cl=self.cl,
+                **kwargs,  # function sanitizes kwargs
+            )
+            self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)
+
+        # Otherwise, if predictions are provided, process them
+        else:
+            assert predictions is not None  # mypy won't narrow the type for some reason
+            self.issues = find_issues_with_predictions(
+                predictions=predictions,
+                y=self.datalab.labels,
+                **kwargs,  # function sanitizes kwargs
+            )
 
         # Get a summarized dataframe of the label issues
         self.summary = self.make_summary(score=self.issues[self.issue_score_key].mean())
@@ -131,3 +128,104 @@ def collect_info(self, issues: pd.DataFrame) -> dict:
         }
 
         return info_dict
+
+
+def find_issues_with_predictions(
+    predictions: np.ndarray,
+    y: np.ndarray,
+    threshold=0.1,
+    **kwargs,
+) -> pd.DataFrame:
+    """Find label issues in a regression dataset based on predictions.
+    This uses a threshold to determine if an example has a label issue
+    based on the quality score.
+
+    Parameters
+    ----------
+    predictions :
+        The predictions from a regression model.
+
+    y :
+        The given labels.
+
+    threshold :
+        The threshold to use to determine if an example has a label issue.
+        Default is 0.1.
+
+    **kwargs :
+        Various keyword arguments.
+
+    Returns
+    -------
+    issues :
+        A dataframe of the issues. It contains the following columns:
+        - is_label_issue : bool
+            True if the example has a label issue.
+        - label_score : float
+            The quality score of the label.
+        - given_label : float
+            The given label. It is the same as the y parameter.
+        - predicted_label : float
+            The predicted label. It is the same as the predictions parameter.
+    """
+    _accepted_kwargs = ["method"]
+    _kwargs = {k: kwargs.get(k) for k in _accepted_kwargs}
+    _kwargs = {k: v for k, v in _kwargs.items() if v is not None}
+    quality_scores = get_label_quality_scores(labels=y, predictions=predictions, **_kwargs)
+
+    median_score = np.median(quality_scores)
+    is_label_issue_mask = quality_scores < median_score * threshold
+
+    issues = pd.DataFrame(
+        {
+            "is_label_issue": is_label_issue_mask,
+            "label_score": quality_scores,
+            "given_label": y,
+            "predicted_label": predictions,
+        }
+    )
+    return issues
+
+
+def find_issues_with_features(
+    features: np.ndarray,
+    y: np.ndarray,
+    cl: CleanLearning,
+    **kwargs,
+) -> pd.DataFrame:
+    """Find label issues in a regression dataset based on features.
+    This delegates the work to the CleanLearning.find_label_issues method.
+
+    Parameters
+    ----------
+    features :
+        The numerical features from a regression dataset.
+
+    y :
+        The given labels.
+
+    **kwargs :
+        Various keyword arguments.
+
+    Returns
+    -------
+    issues :
+        A dataframe of the issues. It contains the following columns:
+        - is_label_issue : bool
+            True if the example has a label issue.
+        - label_quality : float
+            The quality score of the label.
+        - given_label : float
+            The given label. It is the same as the y parameter.
+        - predicted_label : float
+            The predicted label. It is determined by the CleanLearning.find_label_issues method.
+    """
+    _accepted_kwargs = [
+        "uncertainty",
+        "coarse_search_range",
+        "fine_search_size",
+        "save_space",
+        "model_kwargs",
+    ]
+    _kwargs = {k: v for k, v in kwargs.items() if k in _accepted_kwargs and v is not None}
+    return cl.find_label_issues(X=features, y=y, **_kwargs)
diff --git a/tests/datalab/issue_manager/regression/test_label.py b/tests/datalab/issue_manager/regression/test_label.py
@@ -0,0 +1,147 @@
+import numpy as np
+import pandas as pd
+import pytest
+
+from cleanlab import Datalab
+from cleanlab.datalab.internal.issue_manager.regression.label import RegressionLabelIssueManager
+
+
+def ground_truth_target_function(x):
+    return 10 * x + 1
+
+
+@pytest.mark.cleanvision
+class TestRegressionLabelIssueManager:
+    def test_manager_found_in_registry(self):
+        from cleanlab.datalab.internal.issue_manager_factory import REGISTRY
+
+        error_msg = (
+            "RegressionLabelIssueManager should be registered to the regression task as 'label'"
+        )
+        assert REGISTRY["regression"].get("label") == RegressionLabelIssueManager, error_msg
+
+    @pytest.fixture
+    def features(self):
+        # 1 feature, 7 points
+        return np.array([0.1, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5]).reshape(-1, 1)
+
+    @pytest.fixture
+    def regression_lab(self, features):
+        y = ground_truth_target_function(features)
+        # Flip the sign of the point x=0.4
+        y[features == 0.4] *= -1
+        y = y.ravel()
+        return Datalab({"y": y}, label_name="y", task="regression")
+
+    @pytest.fixture
+    def issue_manager(self, regression_lab):
+        return RegressionLabelIssueManager(datalab=regression_lab)
+
+    def test_find_issues_with_features(self, issue_manager, features):
+        issue_manager.find_issues(features=features)
+        issues = issue_manager.issues
+        assert isinstance(issues, pd.DataFrame), "Issues should be a dataframe"
+        expected_issue_mask = features.ravel() == 0.4
+        assert sum(expected_issue_mask) == 1, "There should be exactly one issue"
+
+        np.testing.assert_array_equal(issues["is_label_issue"].values, expected_issue_mask)
+        # Assert that the minimum score "label_score" is at the correct index
+        index_of_error = np.where(expected_issue_mask)[0][0]
+        assert issues["label_score"].values.argmin() == index_of_error
+
+    def test_init_with_model(self, issue_manager):
+        from sklearn.neighbors import KNeighborsRegressor
+
+        model = KNeighborsRegressor(n_neighbors=2)
+        assert issue_manager.cl.model != model
+
+        # Passing in a model to the constructor should set the cl.model field
+        clean_learning_kwargs = {"model": model}
+        lab = issue_manager.datalab
+        new_issue_manager = RegressionLabelIssueManager(
+            datalab=lab, clean_learning_kwargs=clean_learning_kwargs
+        )
+        assert new_issue_manager.cl.model == model
+
+    @pytest.fixture
+    def predictions(self, features):
+        y_ground_truth = ground_truth_target_function(features).ravel()
+        noise = 0.1 * np.random.randn(len(y_ground_truth))
+        return y_ground_truth + noise
+
+    def test_raises_find_issues_error_without_valid_inputs(self, issue_manager):
+        expected_error_msg = (
+            "Regression requires numerical `features` or `predictions` "
+            "to be passed in as an argument to `find_issues`."
+        )
+        with pytest.raises(ValueError) as e:
+            issue_manager.find_issues()  # neither features nor predictions given
+        assert expected_error_msg in str(e.value)  # outside the block so it actually runs
+
+    def test_find_issue_with_predictions(self, issue_manager, features, predictions):
+        issue_manager.find_issues(predictions=predictions)
+        issues = issue_manager.issues
+        assert isinstance(issues, pd.DataFrame), "Issues should be a dataframe"
+        expected_issue_mask = features.ravel() == 0.4
+        assert sum(expected_issue_mask) == 1, "There should be exactly one issue"
+
+        np.testing.assert_array_equal(issues["is_label_issue"].values, expected_issue_mask)
+        # Assert that the minimum score "label_score" is at the correct index
+        index_of_error = np.where(expected_issue_mask)[0][0]
+        assert issues["label_score"].values.argmin() == index_of_error
+
+
+class TestRegressionLabelIssueManagerIntegration:
+
+    """This class contains tests for the find_issues method with a CleanLearning
+    object that behaves deterministically. This is useful to run a "regression"-test on
+    the results computed by the find_issues method.
+    The test dataset is a random toy regression dataset with 5 features and 100 samples.
+    The ground truth is a linear function of the first feature plus a bias defined in the
+    class attribute BIAS.
+    The ground truth is used to emulate a perfect model and compute the expected score
+    for the label issue detection. The gaussian noise contributes to lower label quality
+    scores.
+    """
+    BIAS = 1.0
+
+    @pytest.fixture()
+    def regression_dataset(self):
+        """For integration tests, a simple regression dataset is simpler than
+        a tiny, hand-crafted one."""
+        from sklearn.datasets import make_regression
+
+        # Return coefficients as well for testing purposes,
+        # interpret as ground truth
+        X, y, coef = make_regression(
+            n_samples=100,
+            n_features=5,
+            n_informative=1,
+            n_targets=1,
+            bias=self.BIAS,
+            noise=0.1,
+            random_state=0,
+            coef=True,
+        )
+        return X, y, coef
+
+    @pytest.fixture()
+    def issue_manager(self, regression_dataset):
+
+        _, y, _  = regression_dataset
+        lab = Datalab({"y": y}, label_name="y", task="regression")
+        return RegressionLabelIssueManager(datalab=lab, clean_learning_kwargs={"seed": 0})
+
+    def test_find_issues_with_features(self, regression_dataset, issue_manager):
+        X, _, _ = regression_dataset
+        issue_manager.find_issues(features=X)
+        summary = issue_manager.summary
+        assert np.isclose(summary["score"], 0.262423, atol=1e-5)
+
+
+    def test_find_issues_with_predictions(self, regression_dataset, issue_manager):
+        X, _ , coef = regression_dataset
+        y_pred = X @ coef + self.BIAS
+        issue_manager.find_issues(predictions=y_pred)
+        summary = issue_manager.summary
+        assert np.isclose(summary["score"], 0.075765, atol=1e-5)