Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hugging Face Hub Upgrades #4231

Merged
merged 10 commits into from
Apr 5, 2024
Prev Previous commit
Next Next commit
only download necessary media files for FiftyOneDatasets
  • Loading branch information
jacobmarks committed Apr 4, 2024
commit b71e3fb58d265af784cb4a7020b1adedbffd5b6d
39 changes: 36 additions & 3 deletions fiftyone/utils/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,12 @@ def _resolve_dataset_name(config, **kwargs):
return name


def _get_files_to_download(dataset):
    """Returns the media filepaths of the dataset that are missing locally.

    Args:
        dataset: an object whose ``values("filepath")`` call returns an
            iterable of filepath strings

    Returns:
        a list of the distinct filepaths that do not exist on disk, in order
        of first appearance
    """
    # Multiple samples may point at the same media file; dedupe (preserving
    # order) so each path is checked on disk once and reported once
    unique_filepaths = dict.fromkeys(dataset.values("filepath"))
    return [fp for fp in unique_filepaths if not os.path.exists(fp)]


def _load_fiftyone_dataset_from_config(config, **kwargs):
logger.info("Loading dataset")

Expand All @@ -1086,12 +1092,27 @@ def _load_fiftyone_dataset_from_config(config, **kwargs):
splits = _parse_split_kwargs(**kwargs)

download_dir = _get_download_dir(config._repo_id, **kwargs)
hfh.snapshot_download(
repo_id=config._repo_id, repo_type="dataset", local_dir=download_dir
)

init_download_kwargs = {
"repo_id": config._repo_id,
"repo_type": "dataset",
"local_dir": download_dir,
}

dataset_type_name = config._format.strip()

if dataset_type_name == "FiftyOneDataset":
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to download the images separately only if max_samples was provided?

I'm wondering if this might be less efficient:

hfh.snapshot_download(**init_download_kwargs, allow_patterns=[very, long, list, of patterns])

than this:

hfh.snapshot_download(**init_download_kwargs)

# If the dataset is a FiftyOneDataset, we can be smart and download only the
# necessary files
hfh.snapshot_download(
**init_download_kwargs,
ignore_patterns="data/*",
)
else:
hfh.snapshot_download(
**init_download_kwargs,
)

dataset_type = getattr(
__import__("fiftyone.types", fromlist=[dataset_type_name]),
dataset_type_name,
Expand All @@ -1110,6 +1131,18 @@ def _load_fiftyone_dataset_from_config(config, **kwargs):
dataset_kwargs["name"] = name

dataset = fod.Dataset.from_dir(download_dir, **dataset_kwargs)

if dataset_type_name != "FiftyOneDataset":
return dataset

filepaths = _get_files_to_download(dataset)
if filepaths:
logger.info(f"Downloading {len(filepaths)} media files...")
filenames = [os.path.basename(fp) for fp in filepaths]
allowed_globs = ["data/" + fn for fn in filenames]
hfh.snapshot_download(
**init_download_kwargs, allow_patterns=allowed_globs
)
return dataset


Expand Down