Skip to content

Commit

Permalink
Simplify use of pooch in data module, add to module docs
Browse files Browse the repository at this point in the history
Signed-off-by: Håkon Wiik Ånes <hwaanes@gmail.com>
  • Loading branch information
hakonanes committed Apr 10, 2022
1 parent 8533715 commit 3d31dd1
Show file tree
Hide file tree
Showing 4 changed files with 89 additions and 82 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
repos:
- repo: https://github.com/psf/black
rev: 21.7b0
rev: 22.3.0
hooks:
- id: black
22 changes: 12 additions & 10 deletions CONTRIBUTING.rst
Original file line number Diff line number Diff line change
Expand Up @@ -260,20 +260,22 @@ the `pooch <https://www.fatiando.org/pooch/latest/>`_ Python library. These are
in a file registry (`kikuchipy.data._registry.py`) with their file verification string
(hash, SHA256, obtain with e.g. `sha256sum <file>`) and location, the latter potentially
not within the package but from the `kikuchipy-data
<https://github.com/pyxem/kikuchipy-data>`_ repository, since some files are considered
too large to include in the package.
<https://github.com/pyxem/kikuchipy-data>`_ repository or elsewhere, since some files
are considered too large to include in the package.

If a required dataset isn't in the package, but is in the registry, it can be downloaded
from the repository when the user passes `allow_download=True` to e.g.
:func:`~kikuchipy.data.nickel_ebsd_large`. The dataset is then downloaded to a local
cache, e.g. `/home/user/.cache/kikuchipy/`. Pooch handles downloading, caching, version
control, file verification (against hash) etc. If we have updated the file hash, pooch
will re-download it. If the file is available in the cache, it can be loaded as the
other files in the data module.

The desired data cache directory used by pooch can be set with a global
`KIKUCHIPY_DATA_DIR` variable locally, e.g. by setting
`export KIKUCHIPY_DATA_DIR=~/kikuchipy_data` in `~/.bashrc`.
cache, in the location returned from `pooch.os_cache("kikuchipy")`. The location can be
set with the `KIKUCHIPY_DATA_DIR` environment variable, e.g. by adding
`export KIKUCHIPY_DATA_DIR=~/kikuchipy_data` to `~/.bashrc`. Pooch handles downloading,
caching, version control, file verification (against hash) etc. of files not included in
the package. If we have updated the file hash, pooch will re-download the file. If the
file is available in the cache, it can be loaded like the other files in the data module.

With every new version of kikuchipy, a new directory of datasets with the version name
is added to the cache directory. Any old directories are not deleted automatically, and
should then be deleted manually if desired.

Improving performance
=====================
Expand Down
145 changes: 75 additions & 70 deletions kikuchipy/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,18 +19,27 @@
Some datasets must be downloaded from the web. For more test datasets,
see :doc:`open datasets <open_datasets>`.
Some datasets must be downloaded from the web. Datasets are placed in a
local cache, in the location returned from `pooch.os_cache("kikuchipy")`
by default. The location can be overridden with the
`KIKUCHIPY_DATA_DIR` environment variable.
With every new version of kikuchipy, a new directory of datasets with
the version name is added to the cache directory. Old directories are
not deleted automatically and must be deleted manually if desired.
"""

import os
from pathlib import Path
from typing import Union

import pooch as ppooch
import pooch

from kikuchipy.signals import EBSD, EBSDMasterPattern
from kikuchipy import load
from kikuchipy.release import version
from kikuchipy.data._registry import registry, registry_urls
from kikuchipy.data._registry import registry_hashes, registry_urls


__all__ = [
Expand All @@ -43,56 +52,39 @@
]


fetcher = ppooch.create(
path=ppooch.os_cache("kikuchipy"),
_fetcher = pooch.create(
path=pooch.os_cache("kikuchipy"),
base_url="",
version=version.replace(".dev", "+"),
version_dev="develop",
env="KIKUCHIPY_DATA_DIR",
registry=registry,
registry=registry_hashes,
urls=registry_urls,
)
cache_data_path = fetcher.path.joinpath("data")
package_data_path = Path(os.path.abspath(os.path.dirname(__file__)))


def _has_hash(path, expected_hash):
"""Check if the provided path has the expected hash."""
if not os.path.exists(path):
return False
else:
return ppooch.file_hash(path) == expected_hash


def _cautious_downloader(url, output_file, pooch):
if pooch.allow_download:
delattr(pooch, "allow_download")
# HTTPDownloader() requires tqdm, a HyperSpy dependency, so
# adding it to our dependencies doesn't cost anything
download = ppooch.HTTPDownloader(progressbar=True)
download(url, output_file, pooch)
def _fetch(filename: str, allow_download: bool = False, progressbar=True) -> Path:
fname = "data/" + filename
expected_hash = registry_hashes[fname]
file_in_package = Path(os.path.dirname(__file__)) / ".." / fname
if file_in_package.exists() and pooch.file_hash(file_in_package) == expected_hash:
# Bypass pooch
file_path = file_in_package
else:
raise ValueError(
"The dataset must be (re)downloaded from the kikuchipy-data "
"repository on GitHub (https://github.com/pyxem/kikuchipy-data) to "
"your local cache with the pooch Python package. Pass "
"`allow_download=True` to allow this download."
)


def _fetch(filename: str, allow_download: bool = False):
resolved_path = os.path.join(package_data_path, "..", filename)
expected_hash = registry[filename]
if _has_hash(resolved_path, expected_hash): # File already in data module
return resolved_path
else: # Pooch must download the data to the local cache
fetcher.allow_download = allow_download # Extremely ugly
resolved_path = fetcher.fetch(filename, downloader=_cautious_downloader)
return resolved_path


def _load(filename: str, **kwargs) -> Union[EBSD, EBSDMasterPattern]:
allow_download = kwargs.pop("allow_download", False)
return load(_fetch(filename, allow_download=allow_download), **kwargs)
file_in_cache = Path(_fetcher.path) / fname
if file_in_cache.exists():
allow_download = True
if allow_download:
downloader = pooch.HTTPDownloader(progressbar=progressbar)
file_path = _fetcher.fetch(fname, downloader=downloader)
else:
raise ValueError(
f"Dataset {filename} must be (re)downloaded from the kikuchipy-data "
"repository on GitHub (https://github.com/pyxem/kikuchipy-data) to your"
" local cache with the pooch Python package. Pass `allow_download=True`"
" to allow this download."
)
return file_path


def nickel_ebsd_small(**kwargs) -> EBSD:
Expand All @@ -110,7 +102,8 @@ def nickel_ebsd_small(**kwargs) -> EBSD:
signal : EBSD
EBSD signal.
"""
return _load(filename="data/kikuchipy_h5ebsd/patterns.h5", **kwargs)
fname = _fetch("kikuchipy_h5ebsd/patterns.h5")
return load(fname, **kwargs)


def nickel_ebsd_master_pattern_small(**kwargs) -> EBSDMasterPattern:
Expand Down Expand Up @@ -140,11 +133,13 @@ def nickel_ebsd_master_pattern_small(**kwargs) -> EBSDMasterPattern:
keyword arguments `compression="gzip"` and `compression_opts=9`. All
other HDF5 groups and datasets are the same as in the original file.
"""
fname = "data/emsoft_ebsd_master_pattern/ni_mc_mp_20kv_uint8_gzip_opts9.h5"
return _load(fname, **kwargs)
fname = _fetch("emsoft_ebsd_master_pattern/ni_mc_mp_20kv_uint8_gzip_opts9.h5")
return load(fname, **kwargs)


def nickel_ebsd_large(allow_download: bool = False, **kwargs) -> EBSD:
def nickel_ebsd_large(
allow_download: bool = False, progressbar: bool = True, **kwargs
) -> EBSD:
"""4125 EBSD patterns in a (55, 75) navigation shape of (60, 60)
detector pixels from Nickel, acquired on a NORDIF UF-1100 detector
:cite:`aanes2019electron`.
Expand All @@ -155,6 +150,9 @@ def nickel_ebsd_large(allow_download: bool = False, **kwargs) -> EBSD:
Whether to allow downloading the dataset from the kikuchipy-data
GitHub repository (https://github.com/pyxem/kikuchipy-data) to
the local cache with the pooch Python package. Default is False.
progressbar
Whether to show a progressbar when downloading. Default is
True.
kwargs
Keyword arguments passed to :func:`~kikuchipy.io._io.load`.
Expand All @@ -163,14 +161,13 @@ def nickel_ebsd_large(allow_download: bool = False, **kwargs) -> EBSD:
signal : EBSD
EBSD signal.
"""
return _load(
filename="data/nickel_ebsd_large/patterns.h5",
allow_download=allow_download,
**kwargs,
)
fname = _fetch("nickel_ebsd_large/patterns.h5", allow_download, progressbar)
return load(fname, **kwargs)


def silicon_ebsd_moving_screen_in(allow_download: bool = False, **kwargs) -> EBSD:
def silicon_ebsd_moving_screen_in(
allow_download: bool = False, progressbar: bool = True, **kwargs
) -> EBSD:
"""One EBSD pattern of (480, 480) detector pixels from a single
crystal Silicon sample, acquired on a NORDIF UF-420 detector.
Expand All @@ -185,6 +182,9 @@ def silicon_ebsd_moving_screen_in(allow_download: bool = False, **kwargs) -> EBS
Whether to allow downloading the dataset from the kikuchipy-data
GitHub repository (https://github.com/pyxem/kikuchipy-data) to
the local cache with the pooch Python package. Default is False.
progressbar
Whether to show a progressbar when downloading. Default is
True.
kwargs
Keyword arguments passed to :func:`~kikuchipy.io._io.load`.
Expand All @@ -198,14 +198,13 @@ def silicon_ebsd_moving_screen_in(allow_download: bool = False, **kwargs) -> EBS
silicon_ebsd_moving_screen_out5mm
silicon_ebsd_moving_screen_out10mm
"""
return _load(
filename="data/silicon_ebsd_moving_screen/si_in.h5",
allow_download=allow_download,
**kwargs,
)
fname = _fetch("silicon_ebsd_moving_screen/si_in.h5", allow_download, progressbar)
return load(fname, **kwargs)


def silicon_ebsd_moving_screen_out5mm(allow_download: bool = False, **kwargs) -> EBSD:
def silicon_ebsd_moving_screen_out5mm(
allow_download: bool = False, progressbar: bool = True, **kwargs
) -> EBSD:
"""One EBSD pattern of (480, 480) detector pixels from a single
crystal Silicon sample, acquired on a NORDIF UF-420 detector.
Expand All @@ -222,6 +221,9 @@ def silicon_ebsd_moving_screen_out5mm(allow_download: bool = False, **kwargs) ->
Whether to allow downloading the dataset from the kikuchipy-data
GitHub repository (https://github.com/pyxem/kikuchipy-data) to
the local cache with the pooch Python package. Default is False.
progressbar
Whether to show a progressbar when downloading. Default is
True.
kwargs
Keyword arguments passed to :func:`~kikuchipy.io._io.load`.
Expand All @@ -235,14 +237,15 @@ def silicon_ebsd_moving_screen_out5mm(allow_download: bool = False, **kwargs) ->
silicon_ebsd_moving_screen_in
silicon_ebsd_moving_screen_out10mm
"""
return _load(
filename="data/silicon_ebsd_moving_screen/si_out5mm.h5",
allow_download=allow_download,
**kwargs,
fname = _fetch(
"silicon_ebsd_moving_screen/si_out5mm.h5", allow_download, progressbar
)
return load(fname, **kwargs)


def silicon_ebsd_moving_screen_out10mm(allow_download: bool = False, **kwargs) -> EBSD:
def silicon_ebsd_moving_screen_out10mm(
allow_download: bool = False, progressbar: bool = True, **kwargs
) -> EBSD:
"""One EBSD pattern of (480, 480) detector pixels from a single
crystal Silicon sample, acquired on a NORDIF UF-420 detector.
Expand All @@ -259,6 +262,9 @@ def silicon_ebsd_moving_screen_out10mm(allow_download: bool = False, **kwargs) -
Whether to allow downloading the dataset from the kikuchipy-data
GitHub repository (https://github.com/pyxem/kikuchipy-data) to
the local cache with the pooch Python package. Default is False.
progressbar
Whether to show a progressbar when downloading. Default is
True.
kwargs
Keyword arguments passed to :func:`~kikuchipy.io._io.load`.
Expand All @@ -272,8 +278,7 @@ def silicon_ebsd_moving_screen_out10mm(allow_download: bool = False, **kwargs) -
silicon_ebsd_moving_screen_in
silicon_ebsd_moving_screen_out5mm
"""
return _load(
filename="data/silicon_ebsd_moving_screen/si_out10mm.h5",
allow_download=allow_download,
**kwargs,
fname = _fetch(
"silicon_ebsd_moving_screen/si_out10mm.h5", allow_download, progressbar
)
return load(fname, **kwargs)
2 changes: 1 addition & 1 deletion kikuchipy/data/_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
# along with kikuchipy. If not, see <http://www.gnu.org/licenses/>.

# fmt: off
registry = {
registry_hashes = {
"data/kikuchipy_h5ebsd/patterns.h5": "7a99ce88174c725f5407b6fcc1eab0c4255694ca8e6029fde7f372f3ab40897f",
"data/emsoft_ebsd_master_pattern/ni_mc_mp_20kv_uint8_gzip_opts9.h5": "8a7c1fb471d9ce750f0332a154e87cf41eed7529be508548e0c0f51ec6f92bc2",
"data/nickel_ebsd_large/patterns.h5": "3ea6e729c3adfdea9dce461806f011c24bf70b011dcf4d90a23a6aa29f15872c",
Expand Down

0 comments on commit 3d31dd1

Please sign in to comment.