Merge branch 'master' into set_non_iid_num_issues

cleanlab · May 10, 2023 · e9aaac1 · e9aaac1
2 parents c92ca4d + a3ae30f
commit e9aaac1
Show file tree

Hide file tree

Showing 24 changed files with 201 additions and 79 deletions.
diff --git a/.github/get_min_dependencies.py b/.github/get_min_dependencies.py
@@ -0,0 +1,15 @@
+"""Write the minimum versions of cleanlab's dependencies to requirements-min.txt, read from the pipdeptree JSON in deps.json."""
+import json
+
+
+if __name__ == "__main__":
+    with open("./deps.json", "r") as f:
+        deps = json.load(f)
+
+    # Open the output once, in write mode, so re-running the script cannot append duplicate pins.
+    with open("requirements-min.txt", "w") as out:
+        for package in deps:
+            if package["package"]["package_name"] == "cleanlab":
+                for dep in package["dependencies"]:
+                    if dep["required_version"].startswith(">="):  # only lower bounds can be pinned
+                        out.write(f"{dep['package_name']}=={dep['required_version'][2:]}\n")
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -26,8 +26,6 @@ jobs:
           python-version: ${{ matrix.python }}
       - name: Install cleanlab
         run: pip install -e .
-      - name: Check cleanlab runs without optional dependencies
-        run: python3 -c "import cleanlab"
       - name: Install development dependencies
         run: pip install -r requirements-dev.txt
       - name: Install fasttext for non-Windows machines
@@ -44,6 +42,29 @@ jobs:
         env:
           TEST_FASTTEXT: true
       - uses: codecov/codecov-action@v3
+  test-without-extras-min-versions:
+    name: Test without optional dependencies and with minimum compatible versions of dependencies
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: '3.7'
+      - name: Install cleanlab
+        run: |
+          python -m  pip install --upgrade pip
+          pip install .
+      - name: Install test dependencies
+        run: |
+          pip install pytest pytest-lazy-fixture pipdeptree
+          pipdeptree -j > deps.json
+      - name: Install minimum versions
+        run: |
+          python ./.github/get_min_dependencies.py
+          pip install -r requirements-min.txt
+      - name: Run tests
+        run: |
+          pytest tests/test_multilabel_classification.py tests/test_multiannotator.py tests/test_filter_count.py
   typecheck:
     name: Type check
     runs-on: ubuntu-latest

diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md
@@ -113,13 +113,13 @@ The package can be installed with the optional dependency (here called `gpu`) vi
 1. PyPI installation
 
 ```shell
-pip install -r cleanlab[gpu]
+pip install "cleanlab[gpu]"
 ```
 
 2. Editable installation
 
 ```shell
-pip install -e .[gpu]
+pip install -e ".[gpu]"
 ```
 
 ## Testing
@@ -293,8 +293,15 @@ Try to adhere to this standardized terminology unless you have good reason not t
 
 Use relative linking to connect information between docs and jupyter notebooks, and make sure links will remain valid in the future as new cleanlab versions are released! Sphinx/html works with relative paths so try to specify relative paths if necessary. For specific situations:
 
-- Link another function from within a source code docstring: ``:py:func:`function_name <cleanlab.file.function_name>` ``
-- Link another class from within a source code docstring: ``:py:class:`class_name <cleanlab.file.class_name>` ``
+- Link another function or class from within a source code docstring: 
+  - If you just want to specify the function/class name (i.e., the function/class is unique throughout our library): `` `~cleanlab.file.function_or_class_name` ``.
+
+    This relies on [Sphinx's](https://www.sphinx-doc.org/en/master/usage/configuration.html#confval-default_role) `default_role = "py:obj"` setting; the leading tilde shortens the link text to display only `function_or_class_name`.
+  - If you want to additionally specify the module which the function belongs to: 
+      - `` :py:func:`file.function_name <cleanlab.file.function_name>` `` for functions 
+      - ``:py:class:`file.class_name <cleanlab.file.class_name>` `` for classes
+
+    Here you have more control over the displayed text, so it can include the module name. When referring to a function that is also defined in other modules, always use this option to be explicit about which module you are referencing.
 - Link a tutorial (rst file) from within a source code docstring or rst file: ``:ref:`tutorial_name <tutorial_name>` ``
 - Link a tutorial notebook (ipynb file) from within a source code docstring or rst file: `` `notebook_name <tutorials/notebook_name.ipynb>`_ `` . (If the notebook is not in the same folder as the source code, use a relative path)
 - Link a function from within a tutorial notebook: `[function_name](../cleanlab/file.rst#cleanlab.file.function_name)`

diff --git a/cleanlab/__init__.py b/cleanlab/__init__.py
@@ -34,7 +34,7 @@ def _datalab_import_factory():
     except ImportError:
         return DatalabUnavailable(
             "Datalab is not available due to missing dependencies. "
-            "To install Datalab, run `pip install cleanlab[datalab]`."
+            "To install Datalab, run `pip install 'cleanlab[datalab]'`."
         )
 
 
@@ -46,7 +46,7 @@ def _issue_manager_import_factory():
     except ImportError:
         return DatalabUnavailable(
             "IssueManager is not available due to missing dependencies for Datalab. "
-            "To install Datalab, run `pip install cleanlab[datalab]`."
+            "To install Datalab, run `pip install 'cleanlab[datalab]'`."
         )
 
 

diff --git a/cleanlab/datalab/data.py b/cleanlab/datalab/data.py
@@ -24,7 +24,7 @@
     raise ImportError(
         "Cannot import datasets package. "
         "Please install it and try again, or just install cleanlab with "
-        "all optional dependencies via: `pip install cleanlab[all]`"
+        "all optional dependencies via: `pip install 'cleanlab[all]'`"
     ) from error
 import numpy as np
 import pandas as pd

diff --git a/cleanlab/datalab/datalab.py b/cleanlab/datalab/datalab.py
@@ -273,6 +273,7 @@ def report(
         num_examples: int = 5,
         verbosity: Optional[int] = None,
         include_description: bool = True,
+        show_summary_score: bool = False,
     ) -> None:
         """Prints informative summary of all issues.
 
@@ -300,6 +301,7 @@ def report(
             data_issues=self.data_issues,
             verbosity=verbosity,
             include_description=include_description,
+            show_summary_score=show_summary_score,
         )
         reporter.report(num_examples=num_examples)
 

diff --git a/cleanlab/datalab/issue_manager/noniid.py b/cleanlab/datalab/issue_manager/noniid.py
@@ -103,6 +103,7 @@ def __init__(
         metric: Optional[str] = None,
         k: int = 10,
         num_permutations: int = 25,
+        seed: Optional[int] = 0,
         significance_threshold: float = 0.05,
         **_,
     ):
@@ -114,6 +115,7 @@ def __init__(
             "ks": simplified_kolmogorov_smirnov_test,
         }
         self.background_distribution = None
+        self.seed = seed
         self.significance_threshold = significance_threshold
 
     def find_issues(self, features: Optional[npt.NDArray] = None, **kwargs) -> None:
@@ -251,6 +253,8 @@ def _build_statistics_dictionary(self, knn_graph: csr_matrix) -> Dict[str, Dict[
     def _permutation_test(self, num_permutations) -> float:
         N = self.N
 
+        if self.seed is not None:
+            np.random.seed(self.seed)
         perms = np.fromiter(
             itertools.chain.from_iterable(
                 np.random.permutation(N) for i in range(num_permutations)

diff --git a/cleanlab/datalab/report.py b/cleanlab/datalab/report.py
@@ -54,11 +54,16 @@ class Reporter:
     """
 
     def __init__(
-        self, data_issues: "DataIssues", verbosity: int = 1, include_description: bool = True
+        self,
+        data_issues: "DataIssues",
+        verbosity: int = 1,
+        include_description: bool = True,
+        show_summary_score: bool = False,
     ):
         self.data_issues = data_issues
         self.verbosity = verbosity
         self.include_description = include_description
+        self.show_summary_score = show_summary_score
 
     def report(self, num_examples: int) -> None:
         """Prints a report about identified issues in the data.
@@ -121,10 +126,19 @@ def _write_summary(self, summary: pd.DataFrame) -> str:
         dataset_information = f"Dataset Information: num_examples: {num_examples}"
         if num_classes is not None:
             dataset_information += f", num_classes: {num_classes}"
+
+        if self.show_summary_score:
+            return (
+                "Here is a summary of the different kinds of issues found in the data:\n\n"
+                + summary.to_string(index=False)
+                + "\n\n"
+                + "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n"
+                + f"{dataset_information}\n\n\n"
+            )
+
         return (
             "Here is a summary of the different kinds of issues found in the data:\n\n"
-            + summary.to_string(index=False)
+            + summary.drop(columns=["score"]).to_string(index=False)
             + "\n\n"
-            + "(Note: A lower score indicates a more severe issue across all examples in the dataset.)\n\n"
             + f"{dataset_information}\n\n\n"
         )
diff --git a/cleanlab/outlier.py b/cleanlab/outlier.py
@@ -42,7 +42,7 @@ class OutOfDistribution:
     Each example's OOD score lies in [0,1] with smaller values indicating examples that are less typical under the data distribution.
     OOD scores may be estimated from either: numeric feature embeddings or predicted probabilities from a trained classifier.
 
-    To get indices of examples that are the most severe outliers, call :py:func:`find_top_issues <cleanlab.rank.find_top_issues>` function on the returned OOD scores.
+    To get indices of examples that are the most severe outliers, call `~cleanlab.rank.find_top_issues` function on the returned OOD scores.
 
     Parameters
     ----------
@@ -126,22 +126,21 @@ def fit_score(
         in to calculate scores.
 
         If `features` are passed in a ``NearestNeighbors`` object is fit. If `pred_probs` and 'labels' are passed in a
-        `confident_thresholds` ``np.ndarray`` is fit. For details see :py:func:`fit
-        <cleanlab.outlier.OutOfDistribution.fit>`.
+        `confident_thresholds` ``np.ndarray`` is fit. For details see `~cleanlab.outlier.OutOfDistribution.fit`.
 
         Parameters
         ----------
         features : np.ndarray, optional
           Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
-          For details, `features` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+          For details, `features` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
         pred_probs : np.ndarray, optional
           An array of shape ``(N, K)`` of predicted class probabilities output by a trained classifier.
-          For details, `pred_probs` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+          For details, `pred_probs` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
         labels : array_like, optional
           A discrete array of given class labels for the data of shape ``(N,)``.
-          For details, `labels` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+          For details, `labels` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
         verbose : bool, default = True
           Set to ``False`` to suppress all print statements.
@@ -151,7 +150,7 @@ def fit_score(
         scores : np.ndarray
           If `features` are passed in, `ood_features_scores` are returned.
           If `pred_probs` are passed in, `ood_predictions_scores` are returned.
-          For details see return of :py:func:`score <cleanlab.outlier.OutOfDistribution.scores>` function.
+          For details see return of `~cleanlab.outlier.OutOfDistribution.score` function.
 
         """
         scores = self._shared_fit(
@@ -181,7 +180,7 @@ def fit(
 
         If `features` are passed in, a ``NearestNeighbors`` object is fit.
         If `pred_probs` and 'labels' are passed in, a `confident_thresholds` ``np.ndarray`` is fit.
-        For details see :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>` documentation.
+        For details see `~cleanlab.outlier.OutOfDistribution` documentation.
 
         Parameters
         ----------
@@ -232,11 +231,11 @@ def score(
         ----------
         features : np.ndarray, optional
           Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
-          For details, see `features` in :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+          For details, see `features` in `~cleanlab.outlier.OutOfDistribution.fit` function.
 
         pred_probs : np.ndarray, optional
           An array of shape ``(N, K)``  of predicted class probabilities output by a trained classifier.
-          For details, see `pred_probs` in :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+          For details, see `pred_probs` in `~cleanlab.outlier.OutOfDistribution.fit` function.
 
         Returns
         -------
@@ -313,8 +312,8 @@ def _shared_fit(
         """
         Shared fit functionality between ``fit()`` and ``fit_score()``.
 
-        For details, refer to :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>`
-        or :py:func:`fit_score <cleanlab.outlier.OutOfDistribution.fit_score>`.
+        For details, refer to `~cleanlab.outlier.OutOfDistribution.fit`
+        or `~cleanlab.outlier.OutOfDistribution.fit_score`.
         """
         self._assert_valid_inputs(features, pred_probs)
         scores = None  # If none scores are returned, fit was skipped
@@ -378,18 +377,18 @@ def _get_ood_features_scores(
     ----------
     features : np.ndarray
       Feature array of shape ``(N, M)``, where N is the number of examples and M is the number of features used to represent each example.
-      For details, `features` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+      For details, `features` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
     knn : sklearn.neighbors.NearestNeighbors, default = None
-      For details, see key `knn` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details, see key `knn` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
     k : int, default=None
       Optional number of neighbors to use when calculating outlier score (average distance to neighbors).
-      For details, see key `k` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details, see key `k` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
     t : int, default=1
       Controls transformation of distances between examples into similarity scores that lie in [0,1].
-      For details, see key `t` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details, see key `t` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
     Returns
     -------
@@ -460,21 +459,21 @@ def _get_ood_predictions_scores(
     ----------
     pred_probs : np.ndarray
       An array of shape ``(N, K)`` of model-predicted probabilities,
-      `pred_probs` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+      `pred_probs` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
     confident_thresholds : np.ndarray, default = None
-      For details, see key `confident_thresholds` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details, see key `confident_thresholds` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
     labels : array_like, optional
-      `labels` in the same format expected by the :py:func:`fit <cleanlab.outlier.OutOfDistribution.fit>` function.
+      `labels` in the same format expected by the `~cleanlab.outlier.OutOfDistribution.fit` function.
 
     adjust_pred_probs : bool, True
       Account for class imbalance in the label-quality scoring.
-      For details, see key `adjust_pred_probs` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details, see key `adjust_pred_probs` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
     method : {"entropy", "least_confidence"}, default="entropy"
       OOD scoring method.
-      For details see key `method` in the params dict arg of :py:class:`OutOfDistribution <cleanlab.outlier.OutOfDistribution>`.
+      For details see key `method` in the params dict arg of `~cleanlab.outlier.OutOfDistribution`.
 
 
     Returns

diff --git a/docs/source/cleanlab/datalab/guide/issue_type_description.rst b/docs/source/cleanlab/datalab/guide/issue_type_description.rst
@@ -203,6 +203,7 @@ Non-IID Issue Parameters
     	"metric": # `metric` argument to constructor of `NonIIDIssueManager`. String for the distance metric used for nearest neighbors search if necessary. `metric` argument to constructor of `sklearn.neighbors.NearestNeighbors`,
     	"k": # `k` argument to constructor of `NonIIDIssueManager`. Integer representing the number of nearest neighbors for nearest neighbors search if necessary. `n_neighbors` argument to constructor of `sklearn.neighbors.NearestNeighbors`,
 		"num_permutations": # `num_permutations` argument to constructor of `NonIIDIssueManager`,
+        "seed": # seed for numpy's random number generator (used for permutation tests),
 		"significance_threshold": # `significance_threshold` argument to constructor of `NonIIDIssueManager`. Floating value between 0 and 1 that determines the overall significance of non-IID issues found in the dataset.
     }
 

diff --git a/docs/source/cleanlab/datalab/optional_dependencies.rst b/docs/source/cleanlab/datalab/optional_dependencies.rst
@@ -2,10 +2,10 @@ This package has additional dependencies that are not required for the core ``cl
 
 .. code-block:: console
 
-    $ pip install cleanlab[datalab]
+    $ pip install "cleanlab[datalab]"
 
 For the developmental version of the package, install from source:
 
 .. code-block:: console
 
-    $ pip install git+https://github.com/cleanlab/cleanlab.git#egg=cleanlab[datalab]
+    $ pip install "git+https://github.com/cleanlab/cleanlab.git#egg=cleanlab[datalab]"
diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -64,8 +64,11 @@
 
 autosummary_generate = True
 
+# set the default role of `name` to make cross references
+default_role = "py:obj"
+
 # -- Options for doctest extension ---------------------------------------------
-nbsphinx_allow_errors = True # to allow make doctest to run
+nbsphinx_allow_errors = True  # to allow make doctest to run
 
 # -- Options for apidoc extension ----------------------------------------------
 
@@ -161,7 +164,7 @@
         "v2.3.0",
         "v2.2.0",
         "v2.1.0",
-        "v2.0.0", 
+        "v2.0.0",
         "v1.0.1",
     ],
     # fmt: on
@@ -190,7 +193,7 @@
         .dataframe {
             background: #D7D7D7;
         }
-    
+
         th {
             color:black;
         }