Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[1044] Added end-to-end tests at the end of Quickstart editorial #1118

Merged
merged 8 commits into from
May 14, 2024
108 changes: 95 additions & 13 deletions docs/source/tutorials/datalab/datalab_quickstart.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {},
elisno marked this conversation as resolved.
Show resolved Hide resolved
"nbsphinx": "hidden"
},
"outputs": [],
Expand Down Expand Up @@ -108,7 +109,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"import numpy as np\n",
Expand Down Expand Up @@ -236,6 +239,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {},
"nbsphinx": "hidden"
},
"outputs": [],
Expand Down Expand Up @@ -335,7 +339,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"X_train, y_train_idx, noisy_labels, noisy_labels_idx, X_out, X_duplicate = create_data()"
Expand Down Expand Up @@ -421,6 +427,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {},
"nbsphinx": "hidden"
},
"outputs": [],
Expand Down Expand Up @@ -486,7 +493,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"plot_data(X_train, y_train_idx, noisy_labels_idx, X_out, X_duplicate)"
Expand All @@ -507,7 +516,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"data = {\"X\": X_train, \"y\": noisy_labels}"
Expand Down Expand Up @@ -543,7 +554,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"model = LogisticRegression()\n",
Expand Down Expand Up @@ -572,7 +585,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab = Datalab(data, label_name=\"y\")\n",
Expand All @@ -590,7 +605,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab.report()"
Expand Down Expand Up @@ -618,7 +635,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab.get_issue_summary()"
Expand All @@ -636,7 +655,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab.get_issue_summary(\"label\")"
Expand All @@ -652,7 +673,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab.get_issues().head()"
Expand All @@ -671,7 +694,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"examples_w_issue = (\n",
Expand All @@ -680,7 +705,7 @@
" .sort_values(\"label_score\")\n",
")\n",
"\n",
"examples_w_issue.head()"
"examples_w_issue.head(50)"
]
},
{
Expand All @@ -703,6 +728,7 @@
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {},
"scrolled": true
},
"outputs": [],
Expand Down Expand Up @@ -730,7 +756,9 @@
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"lab.get_issues(\"near_duplicate\").query(\"is_near_duplicate_issue\").sort_values(\"near_duplicate_score\")"
Expand All @@ -746,6 +774,60 @@
"\n",
"To learn more, check out this [example notebook](https://github.com/cleanlab/examples/blob/master/datalab_image_classification/datalab.ipynb) (demonstrates Datalab applied to a real dataset) and the [advanced Datalab tutorial](datalab_advanced.html) (demonstrates configuration and customization options to exert greater control)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {}
},
elisno marked this conversation as resolved.
Show resolved Hide resolved
"outputs": [],
"source": [
"issue_results = lab.get_issues(\"label\")\n",
"outlier_results = lab.get_issues(\"outlier\")\n",
"duplicate_results = lab.get_issues(\"near_duplicate\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"metadata": {}
},
"outputs": [],
"source": [
"# Note: This cell is only for docs.cleanlab.ai. If running on local Jupyter or Colab, please ignore it.\n",
"from sklearn.metrics import roc_auc_score\n",
"\n",
"def jaccard_similarity(l1, l2):\n",
" s1 = set(l1)\n",
" s2 = set(l2)\n",
" intersect_set = s1.intersection(s2)\n",
" union_set = s1.union(s2)\n",
" if len(intersect_set) == 0:\n",
" return 0\n",
" return len(intersect_set) / len(union_set)\n",
"\n",
"identified_label_issues_indices = issue_results[issue_results[\"is_label_issue\"] == True].index.tolist()\n",
"label_issue_indices = issue_results[issue_results[\"given_label\"] != issue_results[\"predicted_label\"]].index.tolist()\n",
elisno marked this conversation as resolved.
Show resolved Hide resolved
"\n",
"label_quality_scores = issue_results[\"label_score\"].tolist()\n",
"Z = (issue_results['given_label'] == issue_results['predicted_label']).astype(float).tolist()\n",
"\n",
"identified_outlier_issues_indices = outlier_results[outlier_results[\"is_outlier_issue\"] == True].index.to_list()\n",
"outlier_issue_indices = list(range(125, 130+1))\n",
"exact_duplicate_idx = [index for index, elem in enumerate(X_train) if (elem == X_duplicate).all()][0]\n",
"if exact_duplicate_idx >= 125: # if the randomly selected index used to create the duplicate is >= 125, then the last point is also an outlier\n",
" outlier_issue_indices.append(131)\n",
" \n",
"identified_duplicate_issues_indices = duplicate_results[duplicate_results[\"is_near_duplicate_issue\"] == True].index.tolist()\n",
"duplicate_issue_indices = [exact_duplicate_idx, 129, 130, 131]\n",
"\n",
"assert jaccard_similarity(identified_label_issues_indices, label_issue_indices) > 0.75\n",
"assert roc_auc_score(Z, label_quality_scores) > 0.9\n",
"assert jaccard_similarity(identified_outlier_issues_indices, outlier_issue_indices) > 0.9\n",
"assert jaccard_similarity(identified_duplicate_issues_indices, duplicate_issue_indices) > 0.9"
]
}
],
"metadata": {
Expand Down
Loading