Skip to content

Commit

Permalink
feat: Add secondary filter to weed out high scoring matches.
Browse files Browse the repository at this point in the history
  • Loading branch information
WillieMaddox committed Dec 17, 2019
1 parent d2950fc commit c6333a5
Showing 1 changed file with 73 additions and 2 deletions.
75 changes: 73 additions & 2 deletions notebooks/eda/image_hashes.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@
"matches_threshold = 0.9\n",
"\n",
"sdcic = SDCImageContainer()\n",
"sdcic.load_image_metrics(['md5', 'bmh32', 'bmh96'])\n",
"sdcic.load_image_metrics(['md5', 'bmh32', 'bmh96', 'avg'])\n",
"img_ids = os.listdir(train_image_dir)"
]
},
Expand Down Expand Up @@ -720,6 +720,7 @@
"metadata": {},
"outputs": [],
"source": [
"# Skip matches if any tile overlap is below the threshold.\n",
"matches = set()\n",
"for img1_id, img2_id, img1_overlap_tag in tqdm_notebook(test_matches):\n",
" bmh_scores = sdcic.overlap_scores_config[matches_metric]['func'](img1_id, img2_id, img1_overlap_tag)\n",
Expand All @@ -729,17 +730,87 @@
"len(matches)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If available, calculate the 'avg' overlap score for each match.\n",
"overlap_image_maps2 = {}\n",
"for img1_id, img2_id, img1_overlap_tag in tqdm_notebook(matches):\n",
" if img1_id not in overlap_image_maps2:\n",
" overlap_image_maps2[img1_id] = {}\n",
" if img1_overlap_tag not in overlap_image_maps2[img1_id]:\n",
" overlap_image_maps2[img1_id][img1_overlap_tag] = {}\n",
" if img2_id not in overlap_image_maps2[img1_id][img1_overlap_tag]:\n",
" avg_scores = sdcic.overlap_scores_config['avg']['func'](img1_id, img2_id, img1_overlap_tag)\n",
" overlap_image_maps2[img1_id][img1_overlap_tag][img2_id] = avg_scores[:len(overlap_tag_maps[img1_overlap_tag])]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check and verify.\n",
"ctr = Counter()\n",
"for img1_id, img1_matches in overlap_image_maps2.items():\n",
" for img1_overlap_tag, img2_ids in img1_matches.items():\n",
" if len(img2_ids) == 576:\n",
" print(img1_id)\n",
" for img2_id, score in sorted(img2_ids.items(), key=lambda x: np.max(x[1])):\n",
" print(img2_id, score)\n",
" ctr[len(img2_ids)] += 1"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sorted(ctr.items())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Filter high scoring matches.\n",
"new_matches = set()\n",
"for img1_id, img1_matches in tqdm_notebook(overlap_image_maps2.items()):\n",
" for img1_overlap_tag, img2_ids in img1_matches.items():\n",
" ii = 0\n",
" for img2_id, score in sorted(img2_ids.items(), key=lambda x: np.max(x[1])):\n",
" if ii >= 3 and np.max(score) > 20000:\n",
" break\n",
" ii += 1\n",
" new_matches.add((img1_id, img2_id, img1_overlap_tag))\n",
"len(new_matches)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"matches_file = os.path.join(interim_data_dir, f'matches_{matches_metric}_{matches_threshold}_offset.csv')\n",
"df = pd.DataFrame(sorted(matches))\n",
"df = pd.DataFrame(sorted(new_matches))\n",
"df.to_csv(matches_file, index=False)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Post processing extras (optional)"
]
},
{
"cell_type": "code",
"execution_count": null,
Expand Down

0 comments on commit c6333a5

Please sign in to comment.