From baa5021d39cfa5535a0ad0aa73c45fb6a32f1a97 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Tue, 3 Oct 2023 23:51:06 +0530 Subject: [PATCH 01/30] changes for NullIssueManager --- .../datalab/internal/issue_manager/null.py | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 cleanlab/datalab/internal/issue_manager/null.py diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py new file mode 100644 index 0000000000..95b5d227d4 --- /dev/null +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -0,0 +1,68 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Tuple, Union, cast + +from scipy.sparse import csr_matrix +from scipy.stats import iqr +import numpy as np +import pandas as pd + +from cleanlab.datalab.internal.issue_manager import IssueManager +from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores + +if TYPE_CHECKING: # pragma: no cover + import numpy.typing as npt + from sklearn.neighbors import NearestNeighbors + from cleanlab.datalab.datalab import Datalab + + +class NullIssueManager(IssueManager): + """Manages issues related to null/missing values in rows of features.""" + description: ClassVar[ + str + ] = """Whether the dataset has any missing/null values + """ + issue_name: ClassVar[str] = "null" + verbosity_levels = { + 0: ["null_scores"], + 1: [], + 2: [], + } + + def __init__(self, datalab: Datalab): + super().__init__(datalab) + + def find_issues( + self, + features: Optional[npt.NDArray] = None, + **kwargs, + ) -> None: + rows = features.shape[0] + cols = features.shape[1] + scores = np.zeros_like((rows, 1)).astype(np.float) + is_null_issue = np.full((rows, 1), False) + null_tracker = np.isnan(features) + if null_tracker.any(): + for row in range(rows): + if null_tracker[row].any(): + is_null_issue[row] = True + null_row_count = np.count_nonzero(np.isnan(null_tracker[row])).astype(np.float) + scores[row] = null_row_count/cols + + self.issues = pd.DataFrame( + { + f"is_{self.issue_name}_issue": is_null_issue, + self.issue_score_key: scores, + }, + ) + + self.summary = self.make_summary(score=scores.mean()) + + def collect_info(self) -> dict: + issues_dict = { + "average_null_score": self.issues[self.issue_score_key].mean() + } + info_dict: Dict[str, Any] = { + **issues_dict + } + return info_dict From 10a8bfe82778f1df2bdd022a75fbf1284aae4ac5 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Tue, 3 Oct 2023 23:58:06 +0530 Subject: [PATCH 02/30] changing null_scores to average_null_score --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 95b5d227d4..6d660c0acb 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -24,7 +24,7 @@ class NullIssueManager(IssueManager): """ issue_name: ClassVar[str] = "null" verbosity_levels = { - 0: ["null_scores"], + 0: ["average_null_score"], 1: [], 2: [], } From 46b471d5dac92376acc2f03c5489f637a1c6ca4d Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Wed, 4 Oct 2023 00:01:47 +0530 Subject: [PATCH 03/30] adding comments for readability --- cleanlab/datalab/internal/issue_manager/null.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 6d660c0acb..edc3ab0c35 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -17,7 +17,12 @@ class NullIssueManager(IssueManager): - """Manages issues related to null/missing values in rows of features.""" + """Manages issues related to null/missing values in the rows of features. + Parameters + ---------- + datalab : + The Datalab instance that this issue manager searches for issues in. + """ description: ClassVar[ str ] = """Whether the dataset has any missing/null values From 65fe3f8e487372e04bffec4832e7d7b86ffdda70 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Wed, 4 Oct 2023 00:04:51 +0530 Subject: [PATCH 04/30] removing unnecessary dependencies --- cleanlab/datalab/internal/issue_manager/null.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index edc3ab0c35..2dc4d57b8c 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -1,18 +1,14 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, Tuple, Union, cast +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional -from scipy.sparse import csr_matrix -from scipy.stats import iqr import numpy as np import pandas as pd from cleanlab.datalab.internal.issue_manager import IssueManager -from cleanlab.outlier import OutOfDistribution, transform_distances_to_scores if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt - from sklearn.neighbors import NearestNeighbors from cleanlab.datalab.datalab import Datalab From c3a53f4f7686f365abb502035d86e484af6f0ea4 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Wed, 4 Oct 2023 00:36:00 +0530 Subject: [PATCH 05/30] autocorrected formatting with black --- cleanlab/datalab/internal/issue_manager/null.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 2dc4d57b8c..2ed1578038 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -19,6 +19,7 @@ class NullIssueManager(IssueManager): datalab : The Datalab instance that this issue manager searches for issues in. """ + description: ClassVar[ str ] = """Whether the dataset has any missing/null values @@ -48,7 +49,7 @@ def find_issues( if null_tracker[row].any(): is_null_issue[row] = True null_row_count = np.count_nonzero(np.isnan(null_tracker[row])).astype(np.float) - scores[row] = null_row_count/cols + scores[row] = null_row_count / cols self.issues = pd.DataFrame( { @@ -60,10 +61,6 @@ def find_issues( self.summary = self.make_summary(score=scores.mean()) def collect_info(self) -> dict: - issues_dict = { - "average_null_score": self.issues[self.issue_score_key].mean() - } - info_dict: Dict[str, Any] = { - **issues_dict - } + issues_dict = {"average_null_score": self.issues[self.issue_score_key].mean()} + info_dict: Dict[str, Any] = {**issues_dict} return info_dict From aef2c60379deea4be8cdd60f058b65cc796e3860 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Wed, 4 Oct 2023 00:41:46 +0530 Subject: [PATCH 06/30] correcting some datatypes --- cleanlab/datalab/internal/issue_manager/null.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 2ed1578038..9402c6b4bc 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -39,16 +39,20 @@ def find_issues( features: Optional[npt.NDArray] = None, **kwargs, ) -> None: + if features is None: + raise ValueError( + "features must be provided to check for null values." + ) rows = features.shape[0] cols = features.shape[1] - scores = np.zeros_like((rows, 1)).astype(np.float) + scores = np.zeros_like((rows, 1)).astype(np.float32) is_null_issue = np.full((rows, 1), False) null_tracker = np.isnan(features) if null_tracker.any(): for row in range(rows): if null_tracker[row].any(): is_null_issue[row] = True - null_row_count = np.count_nonzero(np.isnan(null_tracker[row])).astype(np.float) + null_row_count = np.count_nonzero(np.isnan(null_tracker[row])) scores[row] = null_row_count / cols self.issues = pd.DataFrame( From 143e28998de12cf559684cf81384fccb91b127cc Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Wed, 4 Oct 2023 00:43:04 +0530 Subject: [PATCH 07/30] autocorrecting the format with black-2 --- cleanlab/datalab/internal/issue_manager/null.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 9402c6b4bc..c6a9fedbe7 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -40,9 +40,7 @@ def find_issues( **kwargs, ) -> None: if features is None: - raise ValueError( - "features must be provided to check for null values." - ) + raise ValueError("features must be provided to check for null values.") rows = features.shape[0] cols = features.shape[1] scores = np.zeros_like((rows, 1)).astype(np.float32) From 2a8347adfda39f1ccf49470d1a86522431c95998 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sun, 8 Oct 2023 11:34:09 +0530 Subject: [PATCH 08/30] added test and some minor changes --- .../datalab/internal/issue_manager/null.py | 7 +- tests/datalab/issue_manager/test_null.py | 78 +++++++++++++++++++ 2 files changed, 82 insertions(+), 3 deletions(-) create mode 100644 tests/datalab/issue_manager/test_null.py diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index c6a9fedbe7..9908be9383 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -43,14 +43,14 @@ def find_issues( raise ValueError("features must be provided to check for null values.") rows = features.shape[0] cols = features.shape[1] - scores = np.zeros_like((rows, 1)).astype(np.float32) - is_null_issue = np.full((rows, 1), False) + scores = np.zeros(rows).astype(np.float32) + is_null_issue = np.full(rows, False) null_tracker = np.isnan(features) if null_tracker.any(): for row in range(rows): if null_tracker[row].any(): is_null_issue[row] = True - null_row_count = np.count_nonzero(np.isnan(null_tracker[row])) + null_row_count = np.count_nonzero(null_tracker[row]) scores[row] = null_row_count / cols self.issues = pd.DataFrame( @@ -61,6 +61,7 @@ def find_issues( ) self.summary = self.make_summary(score=scores.mean()) + self.info = self.collect_info() def collect_info(self) -> dict: issues_dict = {"average_null_score": self.issues[self.issue_score_key].mean()} diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py new file mode 100644 index 0000000000..b7fc99504f --- /dev/null +++ b/tests/datalab/issue_manager/test_null.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from cleanlab.datalab.internal.issue_manager.null import ( + NullIssueManager, +) + +SEED = 42 + + +class TestNullIssueManager: + @pytest.fixture + def embeddings(self, lab): + np.random.seed(SEED) + embeddings_array = np.random.random((4, 3)) + embeddings_array[0][0] = np.NaN + embeddings_array[2][1] = np.NaN + embeddings_array[2][2] = np.NaN + return embeddings_array + + @pytest.fixture + def issue_manager(self, lab): + return NullIssueManager(datalab=lab) + + def test_init(self, lab, issue_manager): + assert issue_manager.datalab == lab + + def test_find_issues(self, issue_manager, embeddings): + np.random.seed(SEED) + issue_manager.find_issues(features=embeddings) + issues_sort, summary_sort, info_sort = ( + issue_manager.issues, + issue_manager.summary, + issue_manager.info, + ) + expected_sorted_issue_mask = np.array([True, False, True, False]) + assert np.all( + issues_sort["is_null_issue"] == expected_sorted_issue_mask + ), "Issue mask should be correct" + assert summary_sort["issue_type"][0] == "null" + assert summary_sort["score"][0] == pytest.approx(expected=0.25, abs=1e-7) + assert ( + info_sort.get("average_null_score", None) is not None + ), "Should have average null score" + assert summary_sort["score"][0] == pytest.approx( + expected=info_sort["average_null_score"], abs=1e-7 + ) + + def test_report(self, issue_manager, embeddings): + np.random.seed(SEED) + issue_manager.find_issues(features=embeddings) + report = issue_manager.report( + issues=issue_manager.issues, + summary=issue_manager.summary, + info=issue_manager.info, + ) + + assert isinstance(report, str) + assert ( + "----------------------- null issues ------------------------\n\n" + "Number of examples with this issue:" + ) in report + + report = issue_manager.report( + issues=issue_manager.issues, + summary=issue_manager.summary, + info=issue_manager.info, + verbosity=3, + ) + assert "Additional Information: " in report + + def test_collect_info(self, issue_manager, embeddings): + """Test some values in the info dict.""" + + issue_manager.find_issues(features=embeddings) + info = issue_manager.info + + assert info["average_null_score"] == 0.25 From b028779b01c21f6dcc7e9fc097d4ab47c9ceea92 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sun, 8 Oct 2023 12:11:29 +0530 Subject: [PATCH 09/30] added test for both null and non-null values --- tests/datalab/issue_manager/test_null.py | 69 +++++++++++++++++------- 1 file changed, 49 insertions(+), 20 deletions(-) diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index b7fc99504f..ed85316e54 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -10,13 +10,15 @@ class TestNullIssueManager: @pytest.fixture - def embeddings(self, lab): + def embeddings(self, request): + no_null = request.param np.random.seed(SEED) embeddings_array = np.random.random((4, 3)) - embeddings_array[0][0] = np.NaN - embeddings_array[2][1] = np.NaN - embeddings_array[2][2] = np.NaN - return embeddings_array + if not no_null: + embeddings_array[0][0] = np.NaN + embeddings_array[2][1] = np.NaN + embeddings_array[2][2] = np.NaN + return embeddings_array, no_null @pytest.fixture def issue_manager(self, lab): @@ -25,29 +27,51 @@ def issue_manager(self, lab): def test_init(self, lab, issue_manager): assert issue_manager.datalab == lab + @pytest.mark.parametrize('embeddings', + [True, False], + indirect=['embeddings']) def test_find_issues(self, issue_manager, embeddings): np.random.seed(SEED) + embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) issues_sort, summary_sort, info_sort = ( issue_manager.issues, issue_manager.summary, issue_manager.info, ) - expected_sorted_issue_mask = np.array([True, False, True, False]) - assert np.all( - issues_sort["is_null_issue"] == expected_sorted_issue_mask - ), "Issue mask should be correct" - assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=0.25, abs=1e-7) - assert ( - info_sort.get("average_null_score", None) is not None - ), "Should have average null score" - assert summary_sort["score"][0] == pytest.approx( - expected=info_sort["average_null_score"], abs=1e-7 - ) + if no_null_flag: + expected_sorted_issue_mask = np.array([False, False, False, False]) + assert np.all( + issues_sort["is_null_issue"] == expected_sorted_issue_mask + ), "Issue mask should be correct" + assert summary_sort["issue_type"][0] == "null" + assert summary_sort["score"][0] == pytest.approx(expected=0.0, abs=1e-7) + assert ( + info_sort.get("average_null_score", None) is not None + ), "Should have average null score" + assert summary_sort["score"][0] == pytest.approx( + expected=info_sort["average_null_score"], abs=1e-7 + ) + else: + expected_sorted_issue_mask = np.array([True, False, True, False]) + assert np.all( + issues_sort["is_null_issue"] == expected_sorted_issue_mask + ), "Issue mask should be correct" + assert summary_sort["issue_type"][0] == "null" + assert summary_sort["score"][0] == pytest.approx(expected=0.25, abs=1e-7) + assert ( + info_sort.get("average_null_score", None) is not None + ), "Should have average null score" + assert summary_sort["score"][0] == pytest.approx( + expected=info_sort["average_null_score"], abs=1e-7 + ) + @pytest.mark.parametrize('embeddings', + [True, False], + indirect=['embeddings']) def test_report(self, issue_manager, embeddings): np.random.seed(SEED) + embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) report = issue_manager.report( issues=issue_manager.issues, @@ -69,10 +93,15 @@ def test_report(self, issue_manager, embeddings): ) assert "Additional Information: " in report + @pytest.mark.parametrize('embeddings', + [True, False], + indirect=['embeddings']) def test_collect_info(self, issue_manager, embeddings): """Test some values in the info dict.""" - + embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) info = issue_manager.info - - assert info["average_null_score"] == 0.25 + if no_null_flag: + assert info["average_null_score"] == 0.0 + else: + assert info["average_null_score"] == 0.25 From 0fa071081ea423daadb058b2683c34fb62c469c5 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sun, 8 Oct 2023 12:12:56 +0530 Subject: [PATCH 10/30] reformatted with black --- tests/datalab/issue_manager/test_null.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index ed85316e54..19e66b0e27 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -27,9 +27,7 @@ def issue_manager(self, lab): def test_init(self, lab, issue_manager): assert issue_manager.datalab == lab - @pytest.mark.parametrize('embeddings', - [True, False], - indirect=['embeddings']) + @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) def test_find_issues(self, issue_manager, embeddings): np.random.seed(SEED) embeddings, no_null_flag = embeddings @@ -66,9 +64,7 @@ def test_find_issues(self, issue_manager, embeddings): expected=info_sort["average_null_score"], abs=1e-7 ) - @pytest.mark.parametrize('embeddings', - [True, False], - indirect=['embeddings']) + @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) def test_report(self, issue_manager, embeddings): np.random.seed(SEED) embeddings, no_null_flag = embeddings @@ -93,9 +89,7 @@ def test_report(self, issue_manager, embeddings): ) assert "Additional Information: " in report - @pytest.mark.parametrize('embeddings', - [True, False], - indirect=['embeddings']) + @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) def test_collect_info(self, issue_manager, embeddings): """Test some values in the info dict.""" embeddings, no_null_flag = embeddings From afa34e8b7414276ea7d8f9f988604f7735b1da77 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:20:23 +0530 Subject: [PATCH 11/30] adding some more metrics to track and test them --- .../datalab/internal/issue_manager/null.py | 107 +++++++++++++-- tests/datalab/issue_manager/test_null.py | 126 ++++++++++++------ 2 files changed, 175 insertions(+), 58 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 9908be9383..46b3e39d27 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -1,9 +1,11 @@ from __future__ import annotations -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional +from collections import Counter +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List, Tuple import numpy as np import pandas as pd +from numpy import ndarray from cleanlab.datalab.internal.issue_manager import IssueManager @@ -27,20 +29,17 @@ class NullIssueManager(IssueManager): issue_name: ClassVar[str] = "null" verbosity_levels = { 0: ["average_null_score"], - 1: [], + 1: [ + "most_common_issue", + ], 2: [], } def __init__(self, datalab: Datalab): super().__init__(datalab) - def find_issues( - self, - features: Optional[npt.NDArray] = None, - **kwargs, - ) -> None: - if features is None: - raise ValueError("features must be provided to check for null values.") + @staticmethod + def _calculate_null_issues(features: npt.NDArray) -> tuple[ndarray, ndarray, Any]: rows = features.shape[0] cols = features.shape[1] scores = np.zeros(rows).astype(np.float32) @@ -49,9 +48,20 @@ def find_issues( if null_tracker.any(): for row in range(rows): if null_tracker[row].any(): - is_null_issue[row] = True - null_row_count = np.count_nonzero(null_tracker[row]) + null_row_count = np.count_nonzero(~null_tracker[row]) scores[row] = null_row_count / cols + if scores[row] == 0.00: + is_null_issue[row] = True + return is_null_issue, scores, null_tracker + + def find_issues( + self, + features: Optional[npt.NDArray] = None, + **kwargs, + ) -> None: + if features is None: + raise ValueError("features must be provided to check for null values.") + is_null_issue, scores, null_tracker = self._calculate_null_issues(features=features) self.issues = pd.DataFrame( { @@ -61,9 +71,78 @@ def find_issues( ) self.summary = self.make_summary(score=scores.mean()) - self.info = self.collect_info() + self.info = self.collect_info(null_tracker) + + @staticmethod + def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Dict[str, float]]: + """ + Identify and return the most common null value pattern across all rows + and count the number of rows with this pattern. + + Parameters + ------------ + null_tracker : np.ndarray + A boolean array of the same shape as features, where True indicates null/missing entries. + + Returns + -------- + Dict[str, Any] + A dictionary containing the most common issue pattern and the count of rows with this pattern. + """ + # Convert the boolean null_tracker matrix into a list of strings. + most_frequent_pattern = "no_null" + rows_affected = [] + occurrence_of_most_frequent_pattern = 0 + if null_tracker.any(): + null_patterns_as_strings = [ + "".join(map(str, row.astype(int).tolist())) for row in null_tracker if row.any() + ] + + # Use Counter to efficiently count occurrences and find the most common pattern. + pattern_counter = Counter(null_patterns_as_strings) + ( + most_frequent_pattern, + occurrence_of_most_frequent_pattern, + ) = pattern_counter.most_common(1)[0] + rows_affected = [] + for idx, row in enumerate(null_patterns_as_strings): + if row == most_frequent_pattern: + rows_affected.append(idx) + return { + "most_common_issue": { + "pattern": most_frequent_pattern, + "rows_affected": rows_affected, + "count": occurrence_of_most_frequent_pattern, + } + } + + @staticmethod + def column_impact(null_tracker: np.ndarray) -> Dict[str, List[float]]: + """ + Calculate and return the impact of null values per column, represented as the proportion + of rows having null values in each column. + + Parameters + ---------- + null_tracker : np.ndarray + A boolean array of the same shape as features, where True indicates null/missing entries. + + Returns + ------- + Dict[str, List[float]] + A dictionary containing the impact per column, with values being a list + where each element is the percentage of rows having null values in the corresponding column. + """ + # Calculate proportion of nulls in each column + proportion_of_nulls_per_column = null_tracker.mean(axis=0) + + # Return result as a dictionary containing a list of proportions + return {"column_impact": proportion_of_nulls_per_column.tolist()} - def collect_info(self) -> dict: - issues_dict = {"average_null_score": self.issues[self.issue_score_key].mean()} + def collect_info(self, null_tracker: np.ndarray) -> dict: + most_common_issue = self.most_common_issue(null_tracker=null_tracker) + column_impact = self.column_impact(null_tracker=null_tracker) + average_null_score = {"average_null_score": self.issues[self.issue_score_key].mean()} + issues_dict = {**average_null_score, **most_common_issue, **column_impact} info_dict: Dict[str, Any] = {**issues_dict} return info_dict diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 19e66b0e27..8ad8173edb 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -10,15 +10,18 @@ class TestNullIssueManager: @pytest.fixture - def embeddings(self, request): - no_null = request.param + def embeddings(self): np.random.seed(SEED) embeddings_array = np.random.random((4, 3)) - if not no_null: - embeddings_array[0][0] = np.NaN - embeddings_array[2][1] = np.NaN - embeddings_array[2][2] = np.NaN - return embeddings_array, no_null + return embeddings_array + + @pytest.fixture + def embeddings_with_null(self): + np.random.seed(SEED) + embeddings_array = np.random.random((4, 3)) + embeddings_array[0][0] = np.NaN + embeddings_array[1] = np.NaN + return embeddings_array @pytest.fixture def issue_manager(self, lab): @@ -27,47 +30,50 @@ def issue_manager(self, lab): def test_init(self, lab, issue_manager): assert issue_manager.datalab == lab - @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) def test_find_issues(self, issue_manager, embeddings): np.random.seed(SEED) - embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) issues_sort, summary_sort, info_sort = ( issue_manager.issues, issue_manager.summary, issue_manager.info, ) - if no_null_flag: - expected_sorted_issue_mask = np.array([False, False, False, False]) - assert np.all( - issues_sort["is_null_issue"] == expected_sorted_issue_mask - ), "Issue mask should be correct" - assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=0.0, abs=1e-7) - assert ( - info_sort.get("average_null_score", None) is not None - ), "Should have average null score" - assert summary_sort["score"][0] == pytest.approx( - expected=info_sort["average_null_score"], abs=1e-7 - ) - else: - expected_sorted_issue_mask = np.array([True, False, True, False]) - assert np.all( - issues_sort["is_null_issue"] == expected_sorted_issue_mask - ), "Issue mask should be correct" - assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=0.25, abs=1e-7) - assert ( - info_sort.get("average_null_score", None) is not None - ), "Should have average null score" - assert summary_sort["score"][0] == pytest.approx( - expected=info_sort["average_null_score"], abs=1e-7 - ) - - @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) + expected_sorted_issue_mask = np.array([False, False, False, False]) + assert np.all( + issues_sort["is_null_issue"] == expected_sorted_issue_mask + ), "Issue mask should be correct" + assert summary_sort["issue_type"][0] == "null" + assert summary_sort["score"][0] == pytest.approx(expected=0.0, abs=1e-7) + assert ( + info_sort.get("average_null_score", None) is not None + ), "Should have average null score" + assert summary_sort["score"][0] == pytest.approx( + expected=info_sort["average_null_score"], abs=1e-7 + ) + + def test_find_issues_with_null(self, issue_manager, embeddings_with_null): + np.random.seed(SEED) + issue_manager.find_issues(features=embeddings_with_null) + issues_sort, summary_sort, info_sort = ( + issue_manager.issues, + issue_manager.summary, + issue_manager.info, + ) + expected_sorted_issue_mask = np.array([False, True, False, False]) + assert np.all( + issues_sort["is_null_issue"] == expected_sorted_issue_mask + ), "Issue mask should be correct" + assert summary_sort["issue_type"][0] == "null" + assert summary_sort["score"][0] == pytest.approx(expected=0.16666667, abs=1e-7) + assert ( + info_sort.get("average_null_score", None) is not None + ), "Should have average null score" + assert summary_sort["score"][0] == pytest.approx( + expected=info_sort["average_null_score"], abs=1e-7 + ) + def test_report(self, issue_manager, embeddings): np.random.seed(SEED) - embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) report = issue_manager.report( issues=issue_manager.issues, @@ -89,13 +95,45 @@ def test_report(self, issue_manager, embeddings): ) assert "Additional Information: " in report - @pytest.mark.parametrize("embeddings", [True, False], indirect=["embeddings"]) + def test_report_with_null(self, issue_manager, embeddings_with_null): + np.random.seed(SEED) + issue_manager.find_issues(features=embeddings_with_null) + report = issue_manager.report( + issues=issue_manager.issues, + summary=issue_manager.summary, + info=issue_manager.info, + ) + + assert isinstance(report, str) + assert ( + "----------------------- null issues ------------------------\n\n" + "Number of examples with this issue:" + ) in report + + report = issue_manager.report( + issues=issue_manager.issues, + summary=issue_manager.summary, + info=issue_manager.info, + verbosity=3, + ) + assert "Additional Information: " in report + def test_collect_info(self, issue_manager, embeddings): """Test some values in the info dict.""" - embeddings, no_null_flag = embeddings issue_manager.find_issues(features=embeddings) info = issue_manager.info - if no_null_flag: - assert info["average_null_score"] == 0.0 - else: - assert info["average_null_score"] == 0.25 + assert info["average_null_score"] == 0.0 + assert info["most_common_issue"]["pattern"] == "no_null" + assert info["most_common_issue"]["count"] == 0 + assert info["most_common_issue"]["rows_affected"] == [] + assert info["column_impact"] == [0, 0, 0] + + def test_collect_info_with_nulls(self, issue_manager, embeddings_with_null): + """Test some values in the info dict.""" + issue_manager.find_issues(features=embeddings_with_null) + info = issue_manager.info + assert info["average_null_score"] == pytest.approx(expected=0.16666667, abs=1e-7) + assert info["most_common_issue"]["pattern"] == "100" + assert info["most_common_issue"]["count"] == 1 + assert info["most_common_issue"]["rows_affected"] == [0] + assert info["column_impact"] == [0.5, 0.25, 0.25] From 1c96a11e35c9bc70db97d9346208ed35f22126c0 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:21:49 +0530 Subject: [PATCH 12/30] removing unused imports --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 46b3e39d27..77d7d7ba28 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import Counter -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List, Tuple +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List import numpy as np import pandas as pd From 348bcad38a42caa993bfd7fa9d984a1f23f8016c Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:30:16 +0530 Subject: [PATCH 13/30] adding some type declarations --- cleanlab/datalab/internal/issue_manager/null.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 77d7d7ba28..376e9bd05f 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import Counter -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List, Union import numpy as np import pandas as pd @@ -74,7 +74,7 @@ def find_issues( self.info = self.collect_info(null_tracker) @staticmethod - def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Dict[str, float]]: + def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List]]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 4efc50c46bd0b7ad1943d109212cfb1dcb6b6653 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:36:48 +0530 Subject: [PATCH 14/30] adding some type declarations - 2 --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 376e9bd05f..421af5a256 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -74,7 +74,7 @@ def find_issues( self.info = self.collect_info(null_tracker) @staticmethod - def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List]]: + def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List[int]]]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 9791913052f0f80e1c90a52e86d06f54713b3aaf Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:38:09 +0530 Subject: [PATCH 15/30] adding some type declarations - 3 --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 421af5a256..efa46292c0 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -74,7 +74,7 @@ def find_issues( self.info = self.collect_info(null_tracker) @staticmethod - def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List[int]]]: + def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List[int]], int]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 964348351a0aedb98811d7b1071bffed550cdcb6 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:39:28 +0530 Subject: [PATCH 16/30] reformatted using black - 2 --- cleanlab/datalab/internal/issue_manager/null.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index efa46292c0..8db52116ca 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -74,7 +74,9 @@ def find_issues( self.info = self.collect_info(null_tracker) @staticmethod - def most_common_issue(null_tracker: np.ndarray) -> Dict[str, Union[Dict[str, float], List[int]], int]: + def most_common_issue( + null_tracker: np.ndarray, + ) -> Dict[str, Union[Dict[str, float], List[int]], int]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 931952dd48cc659d876ecafdc64fcdd4380a4d80 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:55:29 +0530 Subject: [PATCH 17/30] typing changes - 3 --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 8db52116ca..9933488214 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -76,7 +76,7 @@ def find_issues( @staticmethod def most_common_issue( null_tracker: np.ndarray, - ) -> Dict[str, Union[Dict[str, float], List[int]], int]: + ) -> Dict[str, List[int], int]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 20da838d4d649627341b210eb90650111fabcab6 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:57:14 +0530 Subject: [PATCH 18/30] removing unused imports - 2 --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 9933488214..8d06d34d21 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -1,7 +1,7 @@ from __future__ import annotations from collections import Counter -from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List, Union +from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, List import numpy as np import pandas as pd From 9622342041d41c07ae92d2cd2f4b51efe24bf556 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 19:58:37 +0530 Subject: [PATCH 19/30] typing changes - 4 --- cleanlab/datalab/internal/issue_manager/null.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 8d06d34d21..52c456e6e6 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -76,7 +76,7 @@ def find_issues( @staticmethod def most_common_issue( null_tracker: np.ndarray, - ) -> Dict[str, List[int], int]: + ) -> dict[str, dict[str, str | int | list[int]]]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. From 683ba7298a63b6738e1f5122a2dc9fe1fdb5a038 Mon Sep 17 00:00:00 2001 From: abhijitpal1247 Date: Sat, 14 Oct 2023 20:07:49 +0530 Subject: [PATCH 20/30] typing changes - 5 --- cleanlab/datalab/internal/issue_manager/null.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 52c456e6e6..536d0276e9 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -76,7 +76,7 @@ def find_issues( @staticmethod def most_common_issue( null_tracker: np.ndarray, - ) -> dict[str, dict[str, str | int | list[int]]]: + ) -> dict[str, dict[str, str | int | list[int] | list[int | None]]]: """ Identify and return the most common null value pattern across all rows and count the number of rows with this pattern. @@ -93,7 +93,7 @@ def most_common_issue( """ # Convert the boolean null_tracker matrix into a list of strings. most_frequent_pattern = "no_null" - rows_affected = [] + rows_affected: List[int] = [] occurrence_of_most_frequent_pattern = 0 if null_tracker.any(): null_patterns_as_strings = [ From c678f1c9391fbc2d2f2d4f8a7bfe2539f65beaf8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 26 Oct 2023 14:58:47 +0000 Subject: [PATCH 21/30] add numpy extra for hypothesis dev dependency --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index eb86e6968d..241a9e5ed8 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,7 +1,7 @@ # Python dependencies for development coverage != 6.3, != 6.3.* datasets -hypothesis +hypothesis[numpy] mypy pandas-stubs pre-commit From 2b10067ff2a073ea957b94fe506cacf3e395d432 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 26 Oct 2023 15:00:49 +0000 Subject: [PATCH 22/30] fix quality score for null issue manager As a quality score, high values mean there are proportionally fewer NaN values in a given row. --- cleanlab/datalab/internal/issue_manager/null.py | 6 +++--- tests/datalab/issue_manager/test_null.py | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 536d0276e9..70767dda0a 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -42,14 +42,14 @@ def __init__(self, datalab: Datalab): def _calculate_null_issues(features: npt.NDArray) -> tuple[ndarray, ndarray, Any]: rows = features.shape[0] cols = features.shape[1] - scores = np.zeros(rows).astype(np.float32) + scores = np.ones(rows).astype(np.float32) is_null_issue = np.full(rows, False) null_tracker = np.isnan(features) if null_tracker.any(): for row in range(rows): if null_tracker[row].any(): - null_row_count = np.count_nonzero(~null_tracker[row]) - scores[row] = null_row_count / cols + non_null_col_count = np.count_nonzero(~null_tracker[row]) + scores[row] = non_null_col_count / cols if scores[row] == 0.00: is_null_issue[row] = True return is_null_issue, scores, null_tracker diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 8ad8173edb..7b0c24452b 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -43,7 +43,7 @@ def test_find_issues(self, issue_manager, embeddings): issues_sort["is_null_issue"] == expected_sorted_issue_mask ), "Issue mask should be correct" assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=0.0, abs=1e-7) + assert summary_sort["score"][0] == pytest.approx(expected=1.0, abs=1e-7) assert ( info_sort.get("average_null_score", None) is not None ), "Should have average null score" @@ -64,7 +64,7 @@ def test_find_issues_with_null(self, issue_manager, embeddings_with_null): issues_sort["is_null_issue"] == expected_sorted_issue_mask ), "Issue mask should be correct" assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=0.16666667, abs=1e-7) + assert summary_sort["score"][0] == pytest.approx(expected=8/12, abs=1e-7) assert ( info_sort.get("average_null_score", None) is not None ), "Should have average null score" @@ -122,7 +122,7 @@ def test_collect_info(self, issue_manager, embeddings): """Test some values in the info dict.""" issue_manager.find_issues(features=embeddings) info = issue_manager.info - assert info["average_null_score"] == 0.0 + assert info["average_null_score"] == 1.0 assert info["most_common_issue"]["pattern"] == "no_null" assert info["most_common_issue"]["count"] == 0 assert info["most_common_issue"]["rows_affected"] == [] @@ -132,7 +132,7 @@ def test_collect_info_with_nulls(self, issue_manager, embeddings_with_null): """Test some values in the info dict.""" issue_manager.find_issues(features=embeddings_with_null) info = issue_manager.info - assert info["average_null_score"] == pytest.approx(expected=0.16666667, abs=1e-7) + assert info["average_null_score"] == pytest.approx(expected=8/12, abs=1e-7) assert info["most_common_issue"]["pattern"] == "100" assert info["most_common_issue"]["count"] == 1 assert info["most_common_issue"]["rows_affected"] == [0] From 29e155541fac3865116ae8d91f4fcf0380b47968 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 26 Oct 2023 15:01:31 +0000 Subject: [PATCH 23/30] format import --- tests/datalab/issue_manager/test_null.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 7b0c24452b..8c553559c2 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -1,9 +1,7 @@ import numpy as np import pytest -from cleanlab.datalab.internal.issue_manager.null import ( - NullIssueManager, -) +from cleanlab.datalab.internal.issue_manager.null import NullIssueManager SEED = 42 From 18de7a57fda5fbb362da15748e1ba93bbd8d2b83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 26 Oct 2023 15:03:35 +0000 Subject: [PATCH 24/30] remove useless super() delegation in __init__ method --- cleanlab/datalab/internal/issue_manager/null.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index 70767dda0a..a155e1bdfb 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -11,7 +11,6 @@ if TYPE_CHECKING: # pragma: no cover import numpy.typing as npt - from cleanlab.datalab.datalab import Datalab class NullIssueManager(IssueManager): @@ -35,9 +34,6 @@ class NullIssueManager(IssueManager): 2: [], } - def __init__(self, datalab: Datalab): - super().__init__(datalab) - @staticmethod def _calculate_null_issues(features: npt.NDArray) -> tuple[ndarray, ndarray, Any]: rows = features.shape[0] From 766ff5c0cdae7e1e2c087574b92ee242f2f69996 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Thu, 26 Oct 2023 15:05:45 +0000 Subject: [PATCH 25/30] add property based test for scoring null issues --- tests/datalab/issue_manager/test_null.py | 39 ++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 8c553559c2..85b169292e 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -1,5 +1,8 @@ import numpy as np import pytest +from hypothesis.extra.numpy import arrays, array_shapes +from hypothesis.strategies import floats, just +from hypothesis import HealthCheck, given, settings from cleanlab.datalab.internal.issue_manager.null import NullIssueManager @@ -135,3 +138,39 @@ def test_collect_info_with_nulls(self, issue_manager, embeddings_with_null): assert info["most_common_issue"]["count"] == 1 assert info["most_common_issue"]["rows_affected"] == [0] assert info["column_impact"] == [0.5, 0.25, 0.25] + + # Strategy for generating NaN values + nan_strategy = just(np.nan) + + # Strategy for generating regular float values, including NaNs + float_with_nan = floats(allow_nan=True) + + # Strategy for generating NumPy arrays with some NaN values + features_with_nan_strategy = arrays( + dtype=np.float64, + shape=array_shapes(min_dims=2, max_dims=2, min_side=1, max_side=5), + elements=float_with_nan, + fill=nan_strategy, + ) + + @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) # No need to reset state of issue_manager fixture + @given(embeddings=features_with_nan_strategy) + def test_quality_scores_and_full_null_row_identification(self, issue_manager, embeddings): + + # Run the find_issues method + issue_manager.find_issues(features=embeddings) + issues_sort, _, _ = ( + issue_manager.issues, + issue_manager.summary, + issue_manager.info, + ) + + # Check for the two main properties: + + # 1. The quality score for each row should be the fraction of features which are not null in that row. + non_null_fractions = [np.count_nonzero(~np.isnan(row)) / len(row) for row in embeddings] + assert np.allclose(issues_sort[issue_manager.issue_score_key], non_null_fractions, atol=1e-7) + + # 2. The rows that are marked as is_null_issue should ONLY be those rows which are 100% null values. + all_rows_are_null = np.all(np.isnan(embeddings), axis=1) + assert np.all(issues_sort["is_null_issue"] == all_rows_are_null) \ No newline at end of file From a87ac46b4873e0c1ea5543641dbaebeb2266ddd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 28 Oct 2023 05:23:54 +0000 Subject: [PATCH 26/30] turn info helper methods into internal methods --- cleanlab/datalab/internal/issue_manager/null.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index a155e1bdfb..ab0cdac66c 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -70,7 +70,7 @@ def find_issues( self.info = self.collect_info(null_tracker) @staticmethod - def most_common_issue( + def _most_common_issue( null_tracker: np.ndarray, ) -> dict[str, dict[str, str | int | list[int] | list[int | None]]]: """ @@ -115,7 +115,7 @@ def most_common_issue( } @staticmethod - def column_impact(null_tracker: np.ndarray) -> Dict[str, List[float]]: + def _column_impact(null_tracker: np.ndarray) -> Dict[str, List[float]]: """ Calculate and return the impact of null values per column, represented as the proportion of rows having null values in each column. @@ -138,8 +138,8 @@ def column_impact(null_tracker: np.ndarray) -> Dict[str, List[float]]: return {"column_impact": proportion_of_nulls_per_column.tolist()} def collect_info(self, null_tracker: np.ndarray) -> dict: - most_common_issue = self.most_common_issue(null_tracker=null_tracker) - column_impact = self.column_impact(null_tracker=null_tracker) + most_common_issue = self._most_common_issue(null_tracker=null_tracker) + column_impact = self._column_impact(null_tracker=null_tracker) average_null_score = {"average_null_score": self.issues[self.issue_score_key].mean()} issues_dict = {**average_null_score, **most_common_issue, **column_impact} info_dict: Dict[str, Any] = {**issues_dict} From f61765327cf803e53a5758972933a568bf9227c9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 28 Oct 2023 05:25:42 +0000 Subject: [PATCH 27/30] add basic docstring for method containing main logic of issue manager --- cleanlab/datalab/internal/issue_manager/null.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index ab0cdac66c..d94edb9531 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -36,6 +36,9 @@ class NullIssueManager(IssueManager): @staticmethod def _calculate_null_issues(features: npt.NDArray) -> tuple[ndarray, ndarray, Any]: + """Tracks the number of null values in each row of a feature array, + computes quality scores based on the fraction of null values in each row, + and returns a boolean array indicating whether each row only has null values.""" rows = features.shape[0] cols = features.shape[1] scores = np.ones(rows).astype(np.float32) From e701df0387ace859d4d2155f0b22d0eb07792480 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 28 Oct 2023 05:25:53 +0000 Subject: [PATCH 28/30] update formatting --- cleanlab/datalab/internal/issue_manager/null.py | 5 ++--- tests/datalab/issue_manager/test_null.py | 13 +++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/cleanlab/datalab/internal/issue_manager/null.py b/cleanlab/datalab/internal/issue_manager/null.py index d94edb9531..332879ff1f 100644 --- a/cleanlab/datalab/internal/issue_manager/null.py +++ b/cleanlab/datalab/internal/issue_manager/null.py @@ -15,6 +15,7 @@ class NullIssueManager(IssueManager): """Manages issues related to null/missing values in the rows of features. + Parameters ---------- datalab : @@ -28,9 +29,7 @@ class NullIssueManager(IssueManager): issue_name: ClassVar[str] = "null" verbosity_levels = { 0: ["average_null_score"], - 1: [ - "most_common_issue", - ], + 1: ["most_common_issue"], 2: [], } diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 85b169292e..94cb70c846 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -65,7 +65,7 @@ def test_find_issues_with_null(self, issue_manager, embeddings_with_null): issues_sort["is_null_issue"] == expected_sorted_issue_mask ), "Issue mask should be correct" assert summary_sort["issue_type"][0] == "null" - assert summary_sort["score"][0] == pytest.approx(expected=8/12, abs=1e-7) + assert summary_sort["score"][0] == pytest.approx(expected=8 / 12, abs=1e-7) assert ( info_sort.get("average_null_score", None) is not None ), "Should have average null score" @@ -133,7 +133,7 @@ def test_collect_info_with_nulls(self, issue_manager, embeddings_with_null): """Test some values in the info dict.""" issue_manager.find_issues(features=embeddings_with_null) info = issue_manager.info - assert info["average_null_score"] == pytest.approx(expected=8/12, abs=1e-7) + assert info["average_null_score"] == pytest.approx(expected=8 / 12, abs=1e-7) assert info["most_common_issue"]["pattern"] == "100" assert info["most_common_issue"]["count"] == 1 assert info["most_common_issue"]["rows_affected"] == [0] @@ -152,11 +152,12 @@ def test_collect_info_with_nulls(self, issue_manager, embeddings_with_null): elements=float_with_nan, fill=nan_strategy, ) - - @settings(suppress_health_check=[HealthCheck.function_scoped_fixture]) # No need to reset state of issue_manager fixture + + @settings( + suppress_health_check=[HealthCheck.function_scoped_fixture] + ) # No need to reset state of issue_manager fixture @given(embeddings=features_with_nan_strategy) def test_quality_scores_and_full_null_row_identification(self, issue_manager, embeddings): - # Run the find_issues method issue_manager.find_issues(features=embeddings) issues_sort, _, _ = ( @@ -173,4 +174,4 @@ def test_quality_scores_and_full_null_row_identification(self, issue_manager, em # 2. The rows that are marked as is_null_issue should ONLY be those rows which are 100% null values. all_rows_are_null = np.all(np.isnan(embeddings), axis=1) - assert np.all(issues_sort["is_null_issue"] == all_rows_are_null) \ No newline at end of file + assert np.all(issues_sort["is_null_issue"] == all_rows_are_null) From a48395250d46d7cec83e06541520d095a261800c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 28 Oct 2023 05:27:20 +0000 Subject: [PATCH 29/30] extract variable in test --- tests/datalab/issue_manager/test_null.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/datalab/issue_manager/test_null.py b/tests/datalab/issue_manager/test_null.py index 94cb70c846..0ca092c4c3 100644 --- a/tests/datalab/issue_manager/test_null.py +++ b/tests/datalab/issue_manager/test_null.py @@ -170,7 +170,8 @@ def test_quality_scores_and_full_null_row_identification(self, issue_manager, em # 1. The quality score for each row should be the fraction of features which are not null in that row. non_null_fractions = [np.count_nonzero(~np.isnan(row)) / len(row) for row in embeddings] - assert np.allclose(issues_sort[issue_manager.issue_score_key], non_null_fractions, atol=1e-7) + scores = issues_sort[issue_manager.issue_score_key] + assert np.allclose(scores, non_null_fractions, atol=1e-7) # 2. The rows that are marked as is_null_issue should ONLY be those rows which are 100% null values. all_rows_are_null = np.all(np.isnan(embeddings), axis=1) From cfa26aae24e7c54fb9789e7bd317bb0c5e25b40d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?El=C3=ADas=20Snorrason?= Date: Sat, 28 Oct 2023 05:42:22 +0000 Subject: [PATCH 30/30] add temporary docs page for null issue manager --- .../issue_manager/_notices/not_registered.rst | 5 +++++ .../datalab/internal/issue_manager/index.rst | 13 +++++++++++++ .../datalab/internal/issue_manager/null.rst | 10 ++++++++++ 3 files changed, 28 insertions(+) create mode 100644 docs/source/cleanlab/datalab/internal/issue_manager/_notices/not_registered.rst create mode 100644 docs/source/cleanlab/datalab/internal/issue_manager/null.rst diff --git a/docs/source/cleanlab/datalab/internal/issue_manager/_notices/not_registered.rst b/docs/source/cleanlab/datalab/internal/issue_manager/_notices/not_registered.rst new file mode 100644 index 0000000000..7d6dccf43d --- /dev/null +++ b/docs/source/cleanlab/datalab/internal/issue_manager/_notices/not_registered.rst @@ -0,0 +1,5 @@ +.. warning:: + + This issue manager isn't set up for direct Datalab use yet. + + Register it first using `~cleanlab.datalab.internal.issue_manager_factory.register`. diff --git a/docs/source/cleanlab/datalab/internal/issue_manager/index.rst b/docs/source/cleanlab/datalab/internal/issue_manager/index.rst index f314611a0d..1bc47728ab 100644 --- a/docs/source/cleanlab/datalab/internal/issue_manager/index.rst +++ b/docs/source/cleanlab/datalab/internal/issue_manager/index.rst @@ -5,6 +5,11 @@ issue_manager Methods in this ``issue_manager`` module are bleeding edge and may have sharp edges. They are not guaranteed to be stable between different ``cleanlab`` versions. +Registered issue managers +------------------------- + +These are the issue managers that Datalab has registered. + .. toctree:: Base issue_manager module label @@ -12,3 +17,11 @@ issue_manager duplicate noniid imbalance + +Unregistered issue managers +--------------------------- + +These are the issue managers that Datalab has not registered (yet). + +.. toctree:: + null diff --git a/docs/source/cleanlab/datalab/internal/issue_manager/null.rst b/docs/source/cleanlab/datalab/internal/issue_manager/null.rst new file mode 100644 index 0000000000..615faff5c0 --- /dev/null +++ b/docs/source/cleanlab/datalab/internal/issue_manager/null.rst @@ -0,0 +1,10 @@ +null +==== + +.. include:: _notices/not_registered.rst + +.. automodule:: cleanlab.datalab.internal.issue_manager.null + :autosummary: + :members: + :undoc-members: + :show-inheritance: