FIX: Only traverse necessary directories (hoechenberger#81)
I have been trying to download https://openneuro.org/datasets/ds004024/,
and traversing the directory structure takes over a minute locally (!),
for example for:
```
openneuro-py download --dataset=ds004024 --target_dir=$PWD --include="sub-CON001/*.eeg" --include="sub-CON001/*.tsv"
```
or
```
openneuro-py download --dataset=ds004024 --target_dir=$PWD --include="sub-CON001"
```
With these changes, either of these calls takes less than 10 seconds to
traverse the dataset, because the traversal is now smart enough to ignore
all other `sub-*` directories.

I'm not 100% sure the logic here is correct, but I do think we want
something like this eventually to make `openneuro-py` more usable.
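
The gist of the change is a conservative pre-filter: before fetching a
directory's metadata, check whether any `--include` pattern could still match
files below that directory, and skip it otherwise. Below is a minimal sketch
of that idea in plain Python -- it is not the code added by this commit, and
`should_traverse` is a hypothetical helper name used only for illustration:

```
import fnmatch
from pathlib import PurePosixPath
from typing import Iterable


def should_traverse(directory: str, includes: Iterable[str]) -> bool:
    """Return True if files under `directory` could match any include pattern."""
    includes = list(includes)
    if not includes:
        return True  # no filter: traverse everything
    dir_parts = PurePosixPath(directory).parts
    for inc in includes:
        inc_parts = PurePosixPath(inc).parts
        # Compare component by component. A glob component is treated as
        # matching anything below it (conservative), and a pattern that is
        # shorter than the directory path stops constraining it.
        if all(fnmatch.fnmatch(part, pat) or '*' in pat
               for pat, part in zip(inc_parts, dir_parts)):
            return True
    return False


print(should_traverse('sub-CON001', ['sub-CON001/*.eeg']))         # True
print(should_traverse('sub-CON001/ses-01', ['sub-CON001/*.eeg']))  # True
print(should_traverse('sub-CON002', ['sub-CON001/*.eeg']))         # False
```

The actual implementation in `download.py` instead derives directory-level
glob patterns from the include list and reuses the same `fnmatch`-based
matching as the file filter, but the pruning idea is the same.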
larsoner authored Oct 18, 2022
1 parent 9450517 commit 04364cd
Showing 3 changed files with 81 additions and 20 deletions.
66 changes: 56 additions & 10 deletions openneuro/download.py
@@ -4,7 +4,7 @@
from difflib import get_close_matches
import hashlib
import asyncio
from pathlib import Path
from pathlib import Path, PosixPath
import string
import json
from typing import Optional, Union
@@ -544,7 +544,8 @@ def _iterate_filenames(
dataset_id: str,
tag: str,
max_retries: int,
root: str = ''
root: str = '',
include: Iterable[str] = tuple(),
) -> Generator[dict, None, None]:
"""Iterate over all files in a dataset, yielding filenames."""
directories = list()
@@ -557,6 +558,37 @@
yield entity

for directory in directories:
# Only bother with directories that are in the include list
if include:
# Take the example:
#
# --include="sub-CON001/*.eeg"
#
# or
#
# --include="sub-CON001"
#
# or
#
# --include="sub-CON001/*"
#
# All three of these should traverse `sub-CON001` and its
# subdirectories.
n_parts = len(PosixPath(root).parts)
dir_include = [PosixPath(inc) for inc in include]
dir_include = [ # for stuff like sub-CON001/*
'/'.join(inc.parts[:n_parts] + ('*',))
for inc in dir_include
if len(inc.parts) >= n_parts
] + [ # and stuff like sub-CON001/*.eeg
'/'.join(inc.parts[:n_parts - 1] + ('*',))
for inc in dir_include
if len(inc.parts) >= n_parts - 1 and len(inc.parts) > 1
] # we want to traverse sub-CON001 in both cases
matches_include, _ = _match_include_exclude(
directory['filename'], include=dir_include, exclude=[])
if dir_include and not any(matches_include):
continue
# Query filenames
this_dir = directory['filename']
metadata = _get_download_metadata(
@@ -572,11 +604,27 @@
tag=tag,
max_retries=max_retries,
root=this_dir,
include=include,
)
for path in dir_iterator:
yield path


def _match_include_exclude(
filename: str,
*,
include: Iterable[str],
exclude: Iterable[str],
) -> bool:
"""Check if a filename matches an include or exclude pattern."""
matches_keep = [filename.startswith(i) or fnmatch.fnmatch(filename, i)
for i in include]
matches_remove = [filename.startswith(e) or
fnmatch.fnmatch(filename, e)
for e in exclude]
return matches_keep, matches_remove


def download(*,
dataset: str,
tag: Optional[str] = None,
@@ -681,7 +729,8 @@ def download(*,

for file in tqdm(
_iterate_filenames(
these_files, dataset_id=dataset, tag=tag, max_retries=max_retries
these_files, dataset_id=dataset, tag=tag, max_retries=max_retries,
include=include,
),
desc=_unicode(
f'Traversing directories for {dataset}', end='', emoji='📁'
@@ -703,12 +752,9 @@ def download(*,
include_counts[include.index(filename)] += 1
continue

matches_keep = [filename.startswith(i) or fnmatch.fnmatch(filename, i)
for i in include]
matches_remove = [filename.startswith(e) or
fnmatch.fnmatch(filename, e)
for e in exclude]
if (not include or any(matches_keep)) and not any(matches_remove):
matches_keep, matches_exclude = _match_include_exclude(
filename, include=include, exclude=exclude)
if (not include or any(matches_keep)) and not any(matches_exclude):
files.append(file)
# Keep track of include matches.
if any(matches_keep):
@@ -727,7 +773,7 @@ def download(*,
else:
extra = (
'There were no similar filenames found in the '
'metadata.'
'metadata. '
)
raise RuntimeError(
f'Could not find path in the dataset:\n- {this}\n{extra}'
32 changes: 22 additions & 10 deletions openneuro/tests/test_download.py
@@ -8,23 +8,27 @@
dataset_id_aws = 'ds000246'
tag_aws = '1.0.0'
include_aws = 'sub-0001/anat'
exclude_aws = []

dataset_id_on = 'ds000117'
tag_on = None
include_on = 'sub-16/ses-meg'
exclude_on = '*.fif' # save GBs of downloads

invalid_tag = 'abcdefg'


@pytest.mark.parametrize(
('dataset_id', 'tag', 'include'),
('dataset_id', 'tag', 'include', 'exclude'),
[
(dataset_id_aws, tag_aws, include_aws),
(dataset_id_on, None, include_on)
(dataset_id_aws, tag_aws, include_aws, exclude_aws),
(dataset_id_on, tag_on, include_on, exclude_on),
]
)
def test_download(tmp_path: Path, dataset_id, tag, include):
def test_download(tmp_path: Path, dataset_id, tag, include, exclude):
"""Test downloading some files."""
download(dataset=dataset_id, tag=tag, target_dir=tmp_path, include=include)
download(dataset=dataset_id, tag=tag, target_dir=tmp_path, include=include,
exclude=exclude)


def test_download_invalid_tag(tmp_path: Path, dataset_id=dataset_id_aws,
@@ -49,14 +53,16 @@ def test_resume_download(tmp_path: Path):

# Download from a different revision / tag
new_tag = '00001'
include = ['CHANGES']
with pytest.raises(FileExistsError, match=f'revision {tag} exists'):
download(dataset=dataset, tag=new_tag, target_dir=tmp_path)
download(dataset=dataset, tag=new_tag, target_dir=tmp_path,
include=include)

# Try to "resume" from a different dataset
new_dataset = 'ds000117'
with pytest.raises(RuntimeError,
match='existing dataset.*appears to be different'):
download(dataset=new_dataset, target_dir=tmp_path)
download(dataset=new_dataset, target_dir=tmp_path, include=include)

# Remove "DatasetDOI" from JSON
json_path = tmp_path / 'dataset_description.json'
@@ -100,15 +106,21 @@ def test_doi_handling(tmp_path: Path):

# Now inject a `doi:` prefix into the DOI
dataset_description_path = tmp_path / 'dataset_description.json'
dataset_description = json.loads(
dataset_description_text = \
dataset_description_path.read_text(encoding='utf-8')
)
dataset_description = json.loads(dataset_description_text)
# Make sure we can dumps to get the same thing back (if they change their
# indent 4->8 for example, we might try to resume our download of the file
# and things will break in a challenging way)
dataset_description_rt = json.dumps(dataset_description, indent=4)
assert dataset_description_text == dataset_description_rt
# Ensure the dataset doesn't already have the problematic prefix, then add
assert not dataset_description['DatasetDOI'].startswith('doi:')
dataset_description['DatasetDOI'] = (
'doi:' + dataset_description['DatasetDOI']
)
dataset_description_path.write_text(
data=json.dumps(dataset_description, indent=2),
data=json.dumps(dataset_description, indent=4),
encoding='utf-8'
)

3 changes: 3 additions & 0 deletions pyproject.toml
@@ -46,3 +46,6 @@ build-backend = "setuptools.build_meta"

[tool.setuptools_scm]
# can be left blank

[tool.pytest.ini_options]
addopts = "-ra -vv --tb=short --durations=10"
