Skip to content

Commit

Permalink
Allow the option to replace the image path prefixes with a new prefix…
Browse files Browse the repository at this point in the history
… for disk loading (facebookresearch#212)

Summary:
Pull Request resolved: facebookresearch#212

this diff has 2 advantages:
1. users can now use the same dataset via disk_filelist even when its files live under different root paths.
2. we can use manifold memcache for faster loading of datasets that live in manifold

Based off of sujitoc diff D26781752

Reviewed By: sujitoc

Differential Revision: D26785318

fbshipit-source-id: f3108a354e9f47f74f25faa00de989151593a6f2
  • Loading branch information
prigoyal authored and facebook-github-bot committed Mar 3, 2021
1 parent 4dc3db3 commit 2d67e19
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 0 deletions.
10 changes: 10 additions & 0 deletions vissl/config/defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,11 @@ config:
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
# whether to drop the last incomplete batch per process
DROP_LAST: False
# if users want to strip a certain prefix from the image paths and replace it with
# some other prefix, they can do so here.
REMOVE_IMG_PATH_PREFIX: ""
# what prefix to replace the old prefix with. Could stay empty too
NEW_IMG_PATH_PREFIX: ""
# name of the dataset. Meaningful and used to do lookup in the dataset_catalog.json
# it has the advantage that the user needs to fill in the dataset_catalog.json once
# and then simply use the dataset name without having to specify data paths every time.
Expand Down Expand Up @@ -264,6 +269,11 @@ config:
TEST:
# if we want to resume the data sampler as well from a previous iteration
USE_STATEFUL_DISTRIBUTED_SAMPLER: False
# if users want to strip a certain prefix from the image paths and replace it with
# some other prefix, they can do so here.
REMOVE_IMG_PATH_PREFIX: ""
# what prefix to replace the old prefix with. Could stay empty too
NEW_IMG_PATH_PREFIX: ""
DROP_LAST: False
DATA_SOURCES: []
DATA_PATHS: []
Expand Down
12 changes: 12 additions & 0 deletions vissl/data/disk_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,8 @@ def __init__(self, cfg, data_source, path, split, dataset_name):
self.is_initialized = False
self._load_data(path)
self._num_samples = len(self.image_dataset)
self._remove_prefix = cfg["DATA"][self.split]["REMOVE_IMG_PATH_PREFIX"]
self._new_prefix = cfg["DATA"][self.split]["NEW_IMG_PATH_PREFIX"]
if self.data_source == "disk_filelist":
# Set dataset to null so that workers dont need to pickle this file.
# This saves memory when disk_filelist is large, especially when memory mapping.
Expand Down Expand Up @@ -105,6 +107,11 @@ def get_image_paths(self):
self._load_data(self._path)
return self.image_dataset

def _replace_img_path_prefix(self, img_path, replace_prefix, new_prefix):
    """
    Swap the leading prefix of an image path for a different prefix.

    Only the prefix at the very start of the path is substituted; any later
    occurrence of the same text inside the path is left untouched.

    Args:
        img_path (str): the image path to rewrite
        replace_prefix (str): the prefix to strip from the start of img_path
        new_prefix (str): the prefix to put in its place. Can be empty.

    Returns:
        str: img_path with the leading replace_prefix replaced by new_prefix,
            or img_path unchanged if it does not start with replace_prefix.
            If replace_prefix is empty, new_prefix is simply prepended.
    """
    if not img_path.startswith(replace_prefix):
        return img_path
    # Slice instead of str.replace: replace() would also rewrite later
    # occurrences of the prefix text, and with an empty replace_prefix it
    # would insert new_prefix between every character of the path.
    return new_prefix + img_path[len(replace_prefix):]

def __len__(self):
"""
Size of the dataset
Expand All @@ -131,6 +138,11 @@ def __getitem__(self, idx):
image_path = self.image_dataset[idx]
try:
if self.data_source == "disk_filelist":
image_path = self._replace_img_path_prefix(
image_path,
replace_prefix=self._remove_prefix,
new_prefix=self._new_prefix,
)
with PathManager.open(image_path, "rb") as fopen:
img = Image.open(fopen).convert("RGB")
elif self.data_source == "disk_folder":
Expand Down

0 comments on commit 2d67e19

Please sign in to comment.