From 2d67e1986cec4c2ac22b517c2490e0c362704fbb Mon Sep 17 00:00:00 2001
From: Priya Goyal
Date: Wed, 3 Mar 2021 13:07:40 -0800
Subject: [PATCH] Allow the option to replace the image path prefixes with a
new prefix for disk loading (#212)
Summary:
Pull Request resolved: https://github.com/facebookresearch/vissl/pull/212
This diff has 2 advantages:
1. Users can now use the same dataset via disk_filelist even when it is stored under different root paths.
2. We can use manifold memcache for faster loading of datasets that live in manifold.
Based off of sujitoc diff D26781752
Reviewed By: sujitoc
Differential Revision: D26785318
fbshipit-source-id: f3108a354e9f47f74f25faa00de989151593a6f2
---
vissl/config/defaults.yaml | 10 ++++++++++
vissl/data/disk_dataset.py | 12 ++++++++++++
2 files changed, 22 insertions(+)
diff --git a/vissl/config/defaults.yaml b/vissl/config/defaults.yaml
index ed7358906..b1e43ed62 100644
--- a/vissl/config/defaults.yaml
+++ b/vissl/config/defaults.yaml
@@ -154,6 +154,11 @@ config:
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
# whether to drop the last incomplete batch per process
DROP_LAST: False
+ # if users want to strip a certain prefix from the image paths and replace it with
+ # some other prefix, they can do so here.
+ REMOVE_IMG_PATH_PREFIX: ""
+ # the prefix to put in place of the removed prefix. Can be left empty too
+ NEW_IMG_PATH_PREFIX: ""
# name of the dataset. Meaningful and used to do lookup in the dataset_catalog.json
# it has the advantage that user needs to full the dataset_catalog.json once
# and then simply use the dataset name without having to specify data paths every time.
@@ -264,6 +269,11 @@ config:
TEST:
# if we want to resume the data sampler as well from a previous iteration
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
+ # if users want to strip a certain prefix from the image paths and replace it with
+ # some other prefix, they can do so here.
+ REMOVE_IMG_PATH_PREFIX: ""
+ # the prefix to put in place of the removed prefix. Can be left empty too
+ NEW_IMG_PATH_PREFIX: ""
DROP_LAST: False
DATA_SOURCES: []
DATA_PATHS: []
diff --git a/vissl/data/disk_dataset.py b/vissl/data/disk_dataset.py
index 023e9c34a..569f008be 100644
--- a/vissl/data/disk_dataset.py
+++ b/vissl/data/disk_dataset.py
@@ -63,6 +63,8 @@ def __init__(self, cfg, data_source, path, split, dataset_name):
self.is_initialized = False
self._load_data(path)
self._num_samples = len(self.image_dataset)
+ self._remove_prefix = cfg["DATA"][self.split]["REMOVE_IMG_PATH_PREFIX"]
+ self._new_prefix = cfg["DATA"][self.split]["NEW_IMG_PATH_PREFIX"]
if self.data_source == "disk_filelist":
# Set dataset to null so that workers dont need to pickle this file.
# This saves memory when disk_filelist is large, especially when memory mapping.
@@ -105,6 +107,11 @@ def get_image_paths(self):
self._load_data(self._path)
return self.image_dataset
+ def _replace_img_path_prefix(self, img_path, replace_prefix, new_prefix):
+     """
+     Swap a leading `replace_prefix` of `img_path` for `new_prefix`.
+
+     Args:
+         img_path (str): image path to rewrite.
+         replace_prefix (str): prefix to strip from the start of the path.
+             If empty, every path matches and `new_prefix` is prepended.
+         new_prefix (str): prefix to put in its place (may be empty).
+
+     Returns:
+         str: the rewritten path, or `img_path` unchanged when it does not
+             start with `replace_prefix`.
+     """
+     if not img_path.startswith(replace_prefix):
+         return img_path
+     # Slice rather than str.replace(): replace() rewrites *every* occurrence
+     # of the substring (and, for an empty prefix, inserts new_prefix between
+     # all characters), whereas only the leading prefix must change.
+     return new_prefix + img_path[len(replace_prefix):]
+
def __len__(self):
"""
Size of the dataset
@@ -131,6 +138,11 @@ def __getitem__(self, idx):
image_path = self.image_dataset[idx]
try:
if self.data_source == "disk_filelist":
+ image_path = self._replace_img_path_prefix(
+ image_path,
+ replace_prefix=self._remove_prefix,
+ new_prefix=self._new_prefix,
+ )
with PathManager.open(image_path, "rb") as fopen:
img = Image.open(fopen).convert("RGB")
elif self.data_source == "disk_folder":