add exploratory analysis for objectlab #969

Merged
merged 8 commits into from
Feb 8, 2024
168 changes: 157 additions & 11 deletions docs/source/tutorials/object_detection.ipynb
@@ -119,10 +119,14 @@
"outputs": [],
"source": [
"import pickle\n",
"\n",
"from cleanlab.object_detection.rank import get_label_quality_scores, issues_from_scores\n",
"from cleanlab.object_detection.filter import find_label_issues\n",
"from cleanlab.object_detection.summary import visualize"
"from cleanlab.object_detection.rank import (\n",
" _separate_label,\n",
" _separate_prediction,\n",
" get_label_quality_scores,\n",
" issues_from_scores,\n",
")\n",
"from cleanlab.object_detection.summary import visualize"
]
},
{
@@ -387,7 +391,7 @@
"metadata": {},
"outputs": [],
"source": [
"issue_to_visualize = issue_idx[6]\n",
"issue_to_visualize = issue_idx[9]\n",
"label = labels[issue_to_visualize]\n",
"prediction = predictions[issue_to_visualize]\n",
"score = scores[issue_to_visualize]\n",
@@ -498,6 +502,149 @@
"visualize(image_path)"
]
},
{
"cell_type": "markdown",
"id": "46d6282a-4601-4cc3-b8a8-187ea6d5f8bc",
"metadata": {},
"source": [
"## Exploratory data analysis\n",
"\n",
"This section focuses on techniques to uncover annotation irregularities through exploratory data analysis. Specifically, it aims to highlight anomalies in object sizes, detect images with unusual object counts, and examine the distribution of class labels.\n",
"\n",
"Let's first consider the number of objects per image, and inspect the images with the largest values (which might reveal something off in our dataset):"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5cacec81-2adf-46a8-82c5-7ec0185d4356",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from cleanlab.internal.object_detection_utils import calculate_bounding_box_areas\n",
"from cleanlab.object_detection.summary import (\n",
" bounding_box_size_distribution,\n",
" class_label_distribution,\n",
" object_counts_per_image,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3335b8a3-d0b4-415a-a97d-c203088a124e",
"metadata": {},
"outputs": [],
"source": [
"num_imgs_to_show = 3\n",
"lab_object_counts,pred_object_counts = object_counts_per_image(labels,predictions)\n",
"for image_to_visualize in np.argsort(lab_object_counts)[::-1][0:num_imgs_to_show]:\n",
" image_path = IMAGE_PATH + labels[image_to_visualize]['seg_map']\n",
" print(image_path, '| idx', image_to_visualize)\n",
" visualize(image_path, label=labels[image_to_visualize], class_names=class_names)"
]
},
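As a quick aside, the counting-and-ranking step above can be sketched standalone with plain NumPy. The toy `bboxes` field below is assumed purely for illustration (`object_counts_per_image` handles the tutorial's real label format):

```python
import numpy as np

# Toy stand-in for the tutorial's `labels` list: each entry holds a
# (num_objects, 4) array of bounding boxes. Format assumed for illustration.
toy_labels = [
    {"bboxes": np.zeros((5, 4))},
    {"bboxes": np.zeros((2, 4))},
    {"bboxes": np.zeros((9, 4))},
]

# Per-image object counts, mirroring what object_counts_per_image reports
counts = np.array([len(lab["bboxes"]) for lab in toy_labels])

# Indices of the images with the most annotated objects, largest first
top_indices = np.argsort(counts)[::-1]
print(top_indices[:2])  # images 2 and 0 have the most boxes
```

Images at the front of `top_indices` are the ones worth eyeballing first, since an implausibly high object count often signals duplicated or spurious annotations.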
{
"cell_type": "markdown",
"id": "e5ddd4fe-4477-4b68-ba79-e5cbb62822eb",
"metadata": {},
"source": [
"Next let's study the distribution of class labels in the overall annotations, comparing the distribution in the given annotations vs. in the model predictions. This can sometimes reveal that something's off in our dataset or model."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9d4b7677-6ebd-447d-b0a1-76e094686628",
"metadata": {},
"outputs": [],
"source": [
"label_norm,pred_norm = class_label_distribution(labels,predictions)\n",
"print(\"Frequency of each class amongst annotated | predicted bounding boxes in the dataset:\\n\")\n",
"for i in label_norm:\n",
" print(f\"{class_names[str(i)]} : {label_norm[i]} | {pred_norm[i]}\")"
]
},
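The comparison above boils down to computing normalized class frequencies on both sides. A minimal sketch on toy class ids (the helper below is an illustrative stand-in, not the actual `class_label_distribution` implementation):

```python
from collections import Counter

# Toy per-box class ids for annotations vs. predictions (assumed for illustration)
annotated = [0, 0, 1, 2, 2, 2]
predicted = [0, 1, 1, 2, 2, 2]

def normalized_distribution(class_ids):
    """Fraction of bounding boxes belonging to each class."""
    counts = Counter(class_ids)
    total = sum(counts.values())
    return {c: counts[c] / total for c in sorted(counts)}

label_norm = normalized_distribution(annotated)
pred_norm = normalized_distribution(predicted)
for c in label_norm:
    print(f"class {c}: {label_norm[c]:.2f} | {pred_norm.get(c, 0.0):.2f}")
```

A large gap between the two fractions for some class suggests either systematic annotation bias or a model that over/under-predicts that class.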
{
"cell_type": "markdown",
"id": "200cdebf-b24c-4c2b-8914-6a2fce218daf",
"metadata": {},
"source": [
"Finally, let's consider the distribution of bounding box sizes (aka object sizes) in the given annotations for each class label. The idea is to review any anomalies in bounding box areas for a given class (which might reveal problematic annotations or abnormal instances of this object class). The following code flags such anomalies by measuring how far each bounding box's area deviates from the mean area of bounding boxes with the same class label, in units of their standard deviation."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59d7ee39-3785-434b-8680-9133014851cd",
"metadata": {},
"outputs": [],
"source": [
"lab_area,pred_area = bounding_box_size_distribution(labels,predictions)\n",
"lab_area_mean = {i: np.mean(lab_area[i]) for i in lab_area.keys()}\n",
"lab_area_std = {i: np.std(lab_area[i]) for i in lab_area.keys()}\n",
"\n",
"max_deviation_values = []\n",
"max_deviation_classes = []\n",
"\n",
"for label in labels:\n",
" bounding_boxes, label_names = _separate_label(label)\n",
" areas = calculate_bounding_box_areas(bounding_boxes)\n",
" deviation_values = []\n",
" deviation_classes = []\n",
"\n",
" for class_name, mean_area, std_area in zip(lab_area_mean.keys(), lab_area_mean.values(), lab_area_std.values()):\n",
" class_areas = areas[label_names == class_name]\n",
" deviations_away = (class_areas - mean_area) / std_area\n",
" deviation_values.extend(list(deviations_away))\n",
" deviation_classes.extend([class_name] * len(class_areas))\n",
"\n",
"    if not deviation_values:\n",
" max_deviation_values.append(0.0)\n",
" max_deviation_classes.append(-1)\n",
" else:\n",
" max_deviation_index = np.argmax(np.abs(deviation_values))\n",
" max_deviation_values.append(deviation_values[max_deviation_index])\n",
" max_deviation_classes.append(deviation_classes[max_deviation_index])\n",
"\n",
"max_deviation_classes, max_deviation_values = np.array(max_deviation_classes), np.array(max_deviation_values)"
]
},
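The per-class deviation computed in the cell above reduces to a z-score. Here is a self-contained sketch on synthetic areas (toy numbers, assumed for illustration):

```python
import numpy as np

# Synthetic bounding-box areas for a single class; the last box is
# abnormally large relative to the rest
areas = np.array([100.0, 110.0, 95.0, 105.0, 400.0])

mean_area = areas.mean()
std_area = areas.std()

# How many standard deviations each box's area lies from the class mean
deviations = (areas - mean_area) / std_area

# The box with the largest absolute deviation is the anomaly candidate
anomaly_index = int(np.argmax(np.abs(deviations)))
print(anomaly_index)  # the 400.0-area box
```

In the tutorial this per-box z-score is computed against the class-wide `lab_area_mean` / `lab_area_std` statistics, and only each image's single largest-magnitude deviation is kept for ranking.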
{
"cell_type": "markdown",
"id": "b260142e-b760-490c-818e-c037fab5c6c8",
"metadata": {},
"source": [
"In our dataset here, this analysis reveals certain abnormally large bounding boxes that take up most of the image."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "47b6a8ff-7a58-4a1f-baee-e6cfe7a85a6d",
"metadata": {},
"outputs": [],
"source": [
"num_imgs_to_show_per_class = 3\n",
"\n",
"for c in class_names.keys():\n",
" class_num = int(c)\n",
" sorted_indices = np.argsort(max_deviation_values)[::-1]\n",
" count = 0\n",
"\n",
" for image_to_visualize in sorted_indices:\n",
"        if max_deviation_values[image_to_visualize] == 0 or max_deviation_classes[image_to_visualize] != class_num:\n",
" continue\n",
" image_path = IMAGE_PATH + labels[image_to_visualize]['seg_map']\n",
" print(image_path, '| idx', image_to_visualize, '| class', class_names[c])\n",
" visualize(image_path, label=labels[image_to_visualize], class_names=class_names)\n",
"\n",
" count += 1\n",
" if count == num_imgs_to_show_per_class:\n",
"            break  # Stop after visualizing the top num_imgs_to_show_per_class instances for this class"
]
},
{
"cell_type": "code",
"execution_count": null,
@@ -508,14 +655,13 @@
"outputs": [],
"source": [
"# Note: This cell is only for docs.cleanlab.ai, if running on local Jupyter or Colab, please ignore it.\n",
"import numpy as np\n",
"\n",
"assert 50 in issue_idx and issue_idx[0] == 50\n",
"assert 16 in issue_idx and issue_idx[1] == 16\n",
"assert 31 in issue_idx and issue_idx[2] == 31\n",
"# assert 62 in issue_idx and issue_idx[6] == 62 # This failing line is being updated in a separate PR. In merge conflict, this line should be removed.\n",
"expected_values = {0: 50, 1: 16, 2: 31, 9: 62}\n",
"\n",
"for idx, value in expected_values.items():\n",
"    assert value in issue_idx and issue_idx[idx] == value, f\"Assertion error at index {idx}: Expected {value}\"\n",
"\n",
"assert 2 not in issue_idx and 3 not in issue_idx and 0 not in issue_idx"
"assert all(i not in issue_idx for i in [0, 2, 3]), \"Unexpected values found in issue_idx\""
]
}
],
@@ -535,7 +681,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.12"
"version": "3.11.5"
}
},
"nbformat": 4,