From 2d67e1986cec4c2ac22b517c2490e0c362704fbb Mon Sep 17 00:00:00 2001 From: Priya Goyal Date: Wed, 3 Mar 2021 13:07:40 -0800 Subject: [PATCH] Allow the option to replace the image path prefixes with a new prefix for disk loading (#212) Summary: Pull Request resolved: https://github.com/facebookresearch/vissl/pull/212 This diff has 2 advantages: 1. users can now use the same dataset via disk_filelist even when its root path differs. 2. we can use manifold memcache for faster loading of datasets that live in manifold. Based on sujitoc's diff D26781752 Reviewed By: sujitoc Differential Revision: D26785318 fbshipit-source-id: f3108a354e9f47f74f25faa00de989151593a6f2 --- vissl/config/defaults.yaml | 10 ++++++++++ vissl/data/disk_dataset.py | 12 ++++++++++++ 2 files changed, 22 insertions(+) diff --git a/vissl/config/defaults.yaml b/vissl/config/defaults.yaml index ed7358906..b1e43ed62 100644 --- a/vissl/config/defaults.yaml +++ b/vissl/config/defaults.yaml @@ -154,6 +154,11 @@ config: USE_STATEFUL_DISTRIBUTED_SAMPLER: False # whether to drop the last incomplete batch per process DROP_LAST: False + # if users want to strip a certain prefix from the image paths and replace it with + # some other prefix, they can do so here. + REMOVE_IMG_PATH_PREFIX: "" + # what prefix to replace the old prefix with. Could stay empty too + NEW_IMG_PATH_PREFIX: "" # name of the dataset. Meaningful and used to do lookup in the dataset_catalog.json # it has the advantage that user needs to full the dataset_catalog.json once # and then simply use the dataset name without having to specify data paths every time. @@ -264,6 +269,11 @@ config: TEST: # if we want to resume the data sampler as well from a previous iteration USE_STATEFUL_DISTRIBUTED_SAMPLER: False + # if users want to strip a certain prefix from the image paths and replace it with + # some other prefix, they can do so here. + REMOVE_IMG_PATH_PREFIX: "" + # what prefix to replace the old prefix with.
Could stay empty too + NEW_IMG_PATH_PREFIX: "" DROP_LAST: False DATA_SOURCES: [] DATA_PATHS: [] diff --git a/vissl/data/disk_dataset.py b/vissl/data/disk_dataset.py index 023e9c34a..569f008be 100644 --- a/vissl/data/disk_dataset.py +++ b/vissl/data/disk_dataset.py @@ -63,6 +63,8 @@ def __init__(self, cfg, data_source, path, split, dataset_name): self.is_initialized = False self._load_data(path) self._num_samples = len(self.image_dataset) + self._remove_prefix = cfg["DATA"][self.split]["REMOVE_IMG_PATH_PREFIX"] + self._new_prefix = cfg["DATA"][self.split]["NEW_IMG_PATH_PREFIX"] if self.data_source == "disk_filelist": # Set dataset to null so that workers dont need to pickle this file. # This saves memory when disk_filelist is large, especially when memory mapping. @@ -105,6 +107,11 @@ def get_image_paths(self): self._load_data(self._path) return self.image_dataset + def _replace_img_path_prefix(self, img_path, replace_prefix, new_prefix): + # Swap only the leading prefix; str.replace would rewrite every occurrence. + matched = img_path.startswith(replace_prefix) + return new_prefix + img_path[len(replace_prefix) :] if matched else img_path + def __len__(self): """ Size of the dataset @@ -131,6 +138,11 @@ def __getitem__(self, idx): image_path = self.image_dataset[idx] try: if self.data_source == "disk_filelist": + image_path = self._replace_img_path_prefix( + image_path, + replace_prefix=self._remove_prefix, + new_prefix=self._new_prefix, + ) with PathManager.open(image_path, "rb") as fopen: img = Image.open(fopen).convert("RGB") elif self.data_source == "disk_folder":