Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Predictions for regression in Datalab #902

Merged
merged 38 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
38 commits
Select commit Hold shift + click to select a range
48b776f
add predictions argument to find_issues method of RegressionLabelIssu…
elisno Nov 22, 2023
a754f6f
move Datalab + regression tests into separate test class
elisno Nov 22, 2023
b0bd9d8
apply black formatter
elisno Nov 22, 2023
c2d3049
add predictions as an argument to Datalab's find_issues method
elisno Nov 22, 2023
c3d004e
split tests up further
elisno Nov 22, 2023
7266f37
add score assertion to test on finding label issue based on features
elisno Nov 22, 2023
3d41559
add datalab tests for label issue checks with predictions for regression
elisno Nov 22, 2023
2511f24
add Datalab test for supplying another model for finding label issues…
elisno Nov 22, 2023
5db1554
apply black formatter
elisno Nov 22, 2023
75cf0c3
clarify settings tasks with register decorator in guide
elisno Nov 22, 2023
d03bfed
set threshold as a field in issue manager (for predictions)
elisno Nov 22, 2023
4fbfd15
add __init__.py for test-module discovery
elisno Dec 1, 2023
62ecc51
Introduce helper classes for validating model outputs
elisno Dec 1, 2023
404c607
remove predictions argument from top-level Datalab-methods.
elisno Dec 1, 2023
60231ac
apply suggestions from review
elisno Dec 1, 2023
77d2977
clarify module name of CleanLearning reference
elisno Dec 1, 2023
ff58245
update docstring notes on pred_probs for high-level classes (Datalab …
elisno Dec 1, 2023
3fe3728
fix formatting in issue_finder.py
elisno Dec 4, 2023
516645b
Merge branch 'master' into predictions-for-regression-in-datalab
elisno Dec 5, 2023
13325d3
Merge branch 'master' into predictions-for-regression-in-datalab
elisno Dec 5, 2023
6428708
reintroduce pred_probs to kwargs in IssueFinder.get_available_issue_t…
elisno Dec 5, 2023
c921934
add punctuation [skip ci]
elisno Dec 5, 2023
a70ff32
update tests on datalab with regression
elisno Dec 5, 2023
c351149
Apply black formatter to test_datalab.py
elisno Dec 5, 2023
7f99c68
Update cleanlab/datalab/datalab.py
elisno Dec 5, 2023
9956325
Update cleanlab/datalab/internal/issue_finder.py
elisno Dec 5, 2023
181e3b7
simplify generation of regression dataset for testing
elisno Dec 6, 2023
700c238
apply black formatter
elisno Dec 6, 2023
97b26b6
refactor test cases
elisno Dec 6, 2023
4c9fc08
apply black formatter
elisno Dec 6, 2023
41be070
update regression dataset generation for testing
elisno Dec 6, 2023
0974401
adjust tests for regression + datalab
elisno Dec 7, 2023
e201c43
update regression tests
elisno Dec 7, 2023
3dc227a
Apply suggestions from code review
elisno Dec 7, 2023
0f36966
suppress health check for unrelated test on empty near duplicate sets
elisno Dec 7, 2023
af11282
Comment on priority order for find_issues method
elisno Dec 7, 2023
5165eb4
add Datalab regressions issue managers to docs
elisno Dec 7, 2023
41caca5
Apply suggestions from code review
elisno Dec 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Next Next commit
add predictions argument to find_issues method of RegressionLabelIssu…
…eManager

Move feature-based and prediction-based strategies to different helper functions within the module.

Add several tests for the issue_manager and the find_issues method.
  • Loading branch information
elisno committed Nov 22, 2023
commit 48b776fe63a7fcba6d32f39b1c15392dc45a6daa
162 changes: 130 additions & 32 deletions cleanlab/datalab/internal/issue_manager/regression/label.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,14 @@
from __future__ import annotations

from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional
import numpy as np
import pandas as pd

from cleanlab.regression.learn import CleanLearning
from cleanlab.datalab.internal.issue_manager import IssueManager
from cleanlab.regression.rank import get_label_quality_scores

if TYPE_CHECKING: # pragma: no cover
import numpy as np
import pandas as pd
from cleanlab.datalab.datalab import Datalab


Expand Down Expand Up @@ -63,45 +64,41 @@ def __init__(
):
super().__init__(datalab)
self.cl = CleanLearning(**(clean_learning_kwargs or {}))

@staticmethod
def _process_find_label_issues_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
"""Searches for keyword arguments that are meant for the
CleanLearning.find_label_issues method call

Examples
--------
>>> from cleanlab.datalab.internal.issue_manager.regression.label import LabelIssueManager
>>> RegressionLabelIssueManager._process_find_label_issues_kwargs({'coarse_search_range': [0.1, 0.9]})
{'coarse_search_range': [0.1, 0.9]}
"""
accepted_kwargs = [
"uncertainty",
"coarse_search_range",
"fine_search_size",
"save_space",
"model_kwargs",
]
return {k: v for k, v in kwargs.items() if k in accepted_kwargs and v is not None}
# This is a field for prioritizing features only when using a custom model
self._uses_custom_model = "model" in (clean_learning_kwargs or {})

def find_issues(
self,
features: np.ndarray,
features: Optional[np.ndarray] = None,
predictions: Optional[np.ndarray] = None,
**kwargs,
) -> None:
"""Find label issues in the datalab."""
jwmueller marked this conversation as resolved.
Show resolved Hide resolved
if features is None:
if features is None and predictions is None:
raise ValueError(
"Regression requires numerical `features` "
"Regression requires numerical `features` or `predictions` "
"to be passed in as an argument to `find_issues`."
)

self.issues = self.cl.find_label_issues(
X=features,
y=self.datalab.labels,
**self._process_find_label_issues_kwargs(kwargs),
)
self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)
# If features are provided and either a custom model is used or no predictions are provided
use_features = features is not None and (self._uses_custom_model or predictions is None)
elisno marked this conversation as resolved.
Show resolved Hide resolved
if use_features:
assert features is not None # mypy won't narrow the type for some reason
self.issues = find_issues_with_features(
features=features,
y=self.datalab.labels,
cl=self.cl,
**kwargs, # function sanitizes kwargs
)
self.issues.rename(columns={"label_quality": self.issue_score_key}, inplace=True)

# Otherwise, if predictions are provided, process them
else:
assert predictions is not None # mypy won't narrow the type for some reason
self.issues = find_issues_with_predictions(
predictions=predictions,
y=self.datalab.labels,
**kwargs, # function sanitizes kwargs
)

# Get a summarized dataframe of the label issues
self.summary = self.make_summary(score=self.issues[self.issue_score_key].mean())
Expand Down Expand Up @@ -131,3 +128,104 @@ def collect_info(self, issues: pd.DataFrame) -> dict:
}

return info_dict


def find_issues_with_predictions(
    predictions: np.ndarray,
    y: np.ndarray,
    threshold=0.1,
    **kwargs,
) -> pd.DataFrame:
    """Flag label issues in a regression dataset from model predictions.

    Each example gets a label quality score computed from its given label
    and its prediction. An example is flagged when its score falls below
    ``threshold`` times the median score over all examples.

    Parameters
    ----------
    predictions :
        The predictions from a regression model.

    y :
        The given labels.

    threshold :
        Fraction of the median quality score below which an example is
        flagged as having a label issue. Default is 0.1.

    **kwargs :
        Various keyword arguments. Only ``method`` is used (and only when it
        is not None); it is forwarded to ``get_label_quality_scores``.
        Everything else is ignored.

    Returns
    -------
    issues :
        A dataframe of the issues. It contains the following columns:
        - is_label_issue : bool
            True if the example has a label issue.
        - label_score : float
            The quality score of the label.
        - given_label : float
            The given label. It is the same as the y parameter.
        - predicted_label : float
            The predicted label. It is the same as the predictions parameter.
    """
    # Forward `method` only when the caller actually supplied a value.
    scorer_options = {}
    method = kwargs.get("method")
    if method is not None:
        scorer_options["method"] = method

    scores = get_label_quality_scores(labels=y, predictions=predictions, **scorer_options)

    # The cutoff is relative to the median, so it adapts to the overall score level.
    cutoff = np.median(scores) * threshold
    flagged = scores < cutoff

    return pd.DataFrame(
        {
            "is_label_issue": flagged,
            "label_score": scores,
            "given_label": y,
            "predicted_label": predictions,
        }
    )


def find_issues_with_features(
    features: np.ndarray,
    y: np.ndarray,
    cl: CleanLearning,
    **kwargs,
) -> pd.DataFrame:
    """Find label issues in a regression dataset based on features.
    This delegates the work to the CleanLearning.find_label_issues method.

    Parameters
    ----------
    features :
        The numerical features from a regression dataset.

    y :
        The given labels.

    cl :
        The CleanLearning object whose ``find_label_issues`` method is called.

    **kwargs :
        Various keyword arguments. Only ``uncertainty``, ``coarse_search_range``,
        ``fine_search_size``, ``save_space`` and ``model_kwargs`` are forwarded,
        and only when their value is not None. Everything else is ignored.

    Returns
    -------
    issues :
        The dataframe returned by ``cl.find_label_issues``, unmodified.
        The label quality score column is expected to be named
        ``label_quality`` (the issue manager renames that column after
        calling this function), alongside ``is_label_issue``,
        ``given_label`` and ``predicted_label`` -- confirm against the
        CleanLearning documentation.
    """
    forwarded_names = (
        "uncertainty",
        "coarse_search_range",
        "fine_search_size",
        "save_space",
        "model_kwargs",
    )
    # Drop unknown options and explicit Nones so CleanLearning keeps its own defaults.
    forwarded = {
        name: value
        for name, value in kwargs.items()
        if name in forwarded_names and value is not None
    }
    return cl.find_label_issues(X=features, y=y, **forwarded)
147 changes: 147 additions & 0 deletions tests/datalab/issue_manager/regression/test_label.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import numpy as np
import pandas as pd
import pytest

from cleanlab import Datalab
from cleanlab.datalab.internal.issue_manager.regression.label import RegressionLabelIssueManager


def ground_truth_target_function(x):
    """Return the noise-free regression target ``10 * x + 1`` for ``x``."""
    slope, intercept = 10, 1
    return slope * x + intercept


# NOTE(review): the `cleanvision` marker looks unrelated to regression label
# checks -- confirm it is intended for this class.
@pytest.mark.cleanvision
class TestRegressionLabelIssueManager:
    """Unit tests for RegressionLabelIssueManager on a tiny hand-crafted dataset.

    The labels follow ``ground_truth_target_function`` except at ``x == 0.4``,
    where the sign of the label is flipped to plant exactly one label error.
    """

    def test_manager_found_in_registry(self):
        from cleanlab.datalab.internal.issue_manager_factory import REGISTRY

        error_msg = (
            "RegressionLabelIssueManager should be registered to the regression task as 'label'"
        )
        assert REGISTRY["regression"].get("label") == RegressionLabelIssueManager, error_msg

    @pytest.fixture
    def features(self):
        # 1 feature, 7 points
        return np.array([0.1, 0.2, 0.3, 0.35, 0.4, 0.45, 0.5]).reshape(-1, 1)

    @pytest.fixture
    def regression_lab(self, features):
        y = ground_truth_target_function(features)
        # Flip the sign of the point x=0.4
        y[features == 0.4] *= -1
        y = y.ravel()
        return Datalab({"y": y}, label_name="y", task="regression")

    @pytest.fixture
    def issue_manager(self, regression_lab):
        return RegressionLabelIssueManager(datalab=regression_lab)

    def test_find_issues_with_features(self, issue_manager, features):
        issue_manager.find_issues(features=features)
        issues = issue_manager.issues
        assert isinstance(issues, pd.DataFrame), "Issues should be a dataframe"
        expected_issue_mask = features.ravel() == 0.4
        assert sum(expected_issue_mask) == 1, "There should be exactly one issue"

        np.testing.assert_array_equal(issues["is_label_issue"].values, expected_issue_mask)
        # Assert that the minimum score "label_score" is at the correct index
        index_of_error = np.where(expected_issue_mask)[0][0]
        assert issues["label_score"].values.argmin() == index_of_error

    def test_init_with_model(self, issue_manager):
        from sklearn.neighbors import KNeighborsRegressor

        model = KNeighborsRegressor(n_neighbors=2)
        assert issue_manager.cl.model != model

        # Passing in a model to the constructor should set the cl.model field
        clean_learning_kwargs = {"model": model}
        lab = issue_manager.datalab
        new_issue_manager = RegressionLabelIssueManager(
            datalab=lab, clean_learning_kwargs=clean_learning_kwargs
        )
        assert new_issue_manager.cl.model == model

    @pytest.fixture
    def predictions(self, features):
        y_ground_truth = ground_truth_target_function(features).ravel()
        # Seeded generator keeps the fixture (and the tests using it) reproducible.
        rng = np.random.default_rng(seed=0)
        noise = 0.1 * rng.standard_normal(len(y_ground_truth))
        return y_ground_truth + noise

    def test_raises_find_issues_error_without_valid_inputs(self, issue_manager):
        expected_error_msg = (
            "Regression requires numerical `features` or `predictions` "
            "to be passed in as an argument to `find_issues`."
        )
        with pytest.raises(ValueError) as e:
            issue_manager.find_issues()
        # Check the message after the block exits: code following the raising call
        # inside the `with` body never runs, and `e.value` is the raised exception
        # while `e` itself is only pytest's ExceptionInfo wrapper.
        assert expected_error_msg in str(e.value)

    def test_find_issue_with_predictions(self, issue_manager, features, predictions):
        issue_manager.find_issues(predictions=predictions)
        issues = issue_manager.issues
        assert isinstance(issues, pd.DataFrame), "Issues should be a dataframe"
        expected_issue_mask = features.ravel() == 0.4
        assert sum(expected_issue_mask) == 1, "There should be exactly one issue"

        np.testing.assert_array_equal(issues["is_label_issue"].values, expected_issue_mask)
        # Assert that the minimum score "label_score" is at the correct index
        index_of_error = np.where(expected_issue_mask)[0][0]
        assert issues["label_score"].values.argmin() == index_of_error


class TestRegressionLabelIssueManagerIntegration:
    """Golden-value tests for find_issues with a deterministic CleanLearning object.

    The CleanLearning object is seeded, so the summary score computed by
    find_issues can be pinned to a fixed number (a "regression"-test on results).
    The dataset is a toy regression problem with 100 samples and 5 features,
    only one of which is informative; the target has an offset given by the
    class attribute BIAS plus a little gaussian noise.
    The ground-truth coefficients are used to emulate a perfect model when
    checking the prediction-based path.
    """

    BIAS = 1.0

    @pytest.fixture()
    def regression_dataset(self):
        """Return ``(X, y, coef)`` for a seeded toy regression dataset.

        The coefficients are returned as well so that tests can treat them
        as the ground truth.
        """
        from sklearn.datasets import make_regression

        dataset_options = dict(
            n_samples=100,
            n_features=5,
            n_informative=1,
            n_targets=1,
            bias=self.BIAS,
            noise=0.1,
            random_state=0,
            coef=True,
        )
        X, y, coef = make_regression(**dataset_options)
        return X, y, coef

    @pytest.fixture()
    def issue_manager(self, regression_dataset):
        """Build a seeded issue manager around the dataset's labels."""
        labels = regression_dataset[1]
        lab = Datalab({"y": labels}, label_name="y", task="regression")
        return RegressionLabelIssueManager(datalab=lab, clean_learning_kwargs={"seed": 0})

    def test_find_issues_with_features(self, regression_dataset, issue_manager):
        feature_matrix = regression_dataset[0]
        issue_manager.find_issues(features=feature_matrix)
        score = issue_manager.summary["score"]
        assert np.isclose(score, 0.262423, atol=1e-5)

    def test_find_issues_with_predictions(self, regression_dataset, issue_manager):
        X, _, coef = regression_dataset
        # Predictions of a "perfect" model: the noise-free linear ground truth.
        perfect_predictions = X @ coef + self.BIAS
        issue_manager.find_issues(predictions=perfect_predictions)
        score = issue_manager.summary["score"]
        assert np.isclose(score, 0.075765, atol=1e-5)