h5netcdf new API support (pydata#1915)
* Ignore dask scratch area

* Public API for HDF5 support

* Remove save_mfdataset_hdf5

* Replace to_hdf5 with to_netcdf(engine='h5netcdf-ng')

* h5netcdf-ng -> h5netcdf-new

* Trivial fixes

* Functional implementation

* stickler fixes

* Reimplement as extra params for h5netcdf

* Cosmetic tweaks

* Bugfixes

* More robust mixed-style encoding handling

* Crash on mismatched encoding if check_encoding=True

* Test check_encoding

* stickler fix

* Use parentheses instead of explicit continuation with \
crusaderky authored and shoyer committed May 8, 2018
1 parent 98373f0 commit c6977f1
Showing 8 changed files with 149 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -39,6 +39,7 @@ nosetests.xml
.tags*
.testmon*
.pytest_cache
dask-worker-space/

# asv environments
.asv
5 changes: 5 additions & 0 deletions doc/whats-new.rst
@@ -36,6 +36,11 @@ Enhancements

- Support writing lists of strings as netCDF attributes (:issue:`2044`).
By `Dan Nowacki <https://github.com/dnowacki-usgs>`_.
- :py:meth:`~xarray.Dataset.to_netcdf` with ``engine='h5netcdf'`` now accepts
  h5py encoding settings ``compression`` and ``compression_opts``, along with
  the NetCDF4-Python style settings ``zlib=True`` and ``complevel``.
  This allows using any compression plugin installed in HDF5, e.g. LZF
  (:issue:`1536`). By `Guido Imperiale <https://github.com/crusaderky>`_.
- :py:meth:`~xarray.dot` on dask-backed data will now call :func:`dask.array.einsum`.
This greatly boosts speed and allows chunking on the core dims.
The function now requires dask >= 0.17.3 to work on dask-backed data
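A minimal usage sketch of the options added above (file names are illustrative; LZF assumes h5py's bundled filter is available in the local HDF5 stack):

import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})

# h5py style: any compression filter registered with HDF5, e.g. LZF
ds.to_netcdf('lzf_example.nc', engine='h5netcdf',
             encoding={'x': {'compression': 'lzf'}})

# NetCDF4-Python style, set on the variable; converted to
# compression='gzip', compression_opts=9 on write
ds['x'].encoding.update({'zlib': True, 'complevel': 9})
ds.to_netcdf('gzip_example.nc', engine='h5netcdf')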
1 change: 1 addition & 0 deletions xarray/backends/api.py
@@ -741,6 +741,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
Engine to use when writing netCDF files. If not provided, the
default engine is chosen based on available dependencies, with a
preference for 'netcdf4' if writing to a file on disk.
See `Dataset.to_netcdf` for additional information.
Examples
--------
73 changes: 51 additions & 22 deletions xarray/backends/h5netcdf_.py
@@ -45,21 +45,21 @@ def _read_attributes(h5netcdf_var):
# to ensure conventions decoding works properly on Python 3, decode all
# bytes attributes to strings
attrs = OrderedDict()
-for k in h5netcdf_var.ncattrs():
-    v = h5netcdf_var.getncattr(k)
+for k, v in h5netcdf_var.attrs.items():
if k not in ['_FillValue', 'missing_value']:
v = maybe_decode_bytes(v)
attrs[k] = v
return attrs


-_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
-                                           lsd_okay=False, backend='h5netcdf')
+_extract_h5nc_encoding = functools.partial(
+    _extract_nc4_variable_encoding,
+    lsd_okay=False, h5py_okay=True, backend='h5netcdf')


def _open_h5netcdf_group(filename, mode, group):
-import h5netcdf.legacyapi
-ds = h5netcdf.legacyapi.Dataset(filename, mode=mode)
+import h5netcdf
+ds = h5netcdf.File(filename, mode=mode)
with close_on_error(ds):
return _nc4_group(ds, group, mode)
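For reference, a sketch of the two h5netcdf entry points this commit switches between (read mode, illustrative file name); the legacy API mimics netCDF4-python, while the new API exposes h5py-style options such as compression and compression_opts:

import h5netcdf
import h5netcdf.legacyapi

# Legacy API: netCDF4-python compatible wrapper (the previous backend)
ds_old = h5netcdf.legacyapi.Dataset('example.nc', mode='r')
ds_old.close()

# New API: what the backend opens after this commit
ds_new = h5netcdf.File('example.nc', mode='r')
ds_new.close()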

@@ -96,10 +96,19 @@ def open_store_variable(self, name, var):
attrs = _read_attributes(var)

# netCDF4 specific encoding
-encoding = dict(var.filters())
-chunking = var.chunking()
-encoding['chunksizes'] = chunking \
-    if chunking != 'contiguous' else None
+encoding = {
+    'chunksizes': var.chunks,
+    'fletcher32': var.fletcher32,
+    'shuffle': var.shuffle,
+}
+# Convert h5py-style compression options to NetCDF4-Python
+# style, if possible
+if var.compression == 'gzip':
+    encoding['zlib'] = True
+    encoding['complevel'] = var.compression_opts
+elif var.compression is not None:
+    encoding['compression'] = var.compression
+    encoding['compression_opts'] = var.compression_opts

# save source so __repr__ can detect if it's local or not
encoding['source'] = self._filename
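The compression branch above can be read as a small pure function; a hypothetical distillation (the helper name is made up for illustration):

def h5py_to_nc4_style(compression, compression_opts):
    # gzip has an exact NetCDF4-Python spelling; any other filter is
    # reported with the h5py-style keys unchanged
    if compression == 'gzip':
        return {'zlib': True, 'complevel': compression_opts}
    elif compression is not None:
        return {'compression': compression,
                'compression_opts': compression_opts}
    return {}

For example, h5py_to_nc4_style('gzip', 9) yields {'zlib': True, 'complevel': 9}, matching what open_store_variable reports after a round-trip.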
@@ -130,14 +139,14 @@ def get_encoding(self):
def set_dimension(self, name, length, is_unlimited=False):
with self.ensure_open(autoclose=False):
if is_unlimited:
-self.ds.createDimension(name, size=None)
+self.ds.dimensions[name] = None
+self.ds.resize_dimension(name, length)
else:
-self.ds.createDimension(name, size=length)
+self.ds.dimensions[name] = length

def set_attribute(self, key, value):
with self.ensure_open(autoclose=False):
-self.ds.setncattr(key, value)
+self.ds.attrs[key] = value

def encode_variable(self, variable):
return _encode_nc4_variable(variable)
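The new API has no createDimension; assigning None marks a dimension as unlimited and resize_dimension then grows it, as in this minimal sketch (file name illustrative):

import h5netcdf

with h5netcdf.File('unlimited_example.nc', mode='w') as f:
    f.dimensions['time'] = None      # None marks the dimension unlimited
    f.resize_dimension('time', 10)   # grow it to the current data length
    f.attrs['title'] = 'example'     # attributes are a plain mapping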
@@ -149,8 +158,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
attrs = variable.attrs.copy()
dtype = _get_datatype(variable)

-fill_value = attrs.pop('_FillValue', None)
-if dtype is str and fill_value is not None:
+fillvalue = attrs.pop('_FillValue', None)
+if dtype is str and fillvalue is not None:
raise NotImplementedError(
'h5netcdf does not yet support setting a fill value for '
'variable-length strings '
@@ -166,18 +175,38 @@ def prepare_variable(self, name, variable, check_encoding=False,
raise_on_invalid=check_encoding)
kwargs = {}

-for key in ['zlib', 'complevel', 'shuffle',
-            'chunksizes', 'fletcher32']:
+# Convert from NetCDF4-Python style compression settings to h5py style
+# If both styles are used together, h5py takes precedence
+# If check_encoding=True, raise ValueError in case of mismatch
+if encoding.pop('zlib', False):
+    if (check_encoding and encoding.get('compression')
+            not in (None, 'gzip')):
+        raise ValueError("'zlib' and 'compression' encodings mismatch")
+    encoding.setdefault('compression', 'gzip')
+
+if (check_encoding and encoding.get('complevel') not in
+        (None, encoding.get('compression_opts'))):
+    raise ValueError("'complevel' and 'compression_opts' encodings "
+                     "mismatch")
+complevel = encoding.pop('complevel', 0)
+if complevel != 0:
+    encoding.setdefault('compression_opts', complevel)
+
+encoding['chunks'] = encoding.pop('chunksizes', None)
+
+for key in ['compression', 'compression_opts', 'shuffle',
+            'chunks', 'fletcher32']:
    if key in encoding:
        kwargs[key] = encoding[key]
-if name not in self.ds.variables:
-    nc4_var = self.ds.createVariable(name, dtype, variable.dims,
-                                     fill_value=fill_value, **kwargs)
+if name not in self.ds:
+    nc4_var = self.ds.create_variable(
+        name, dtype=dtype, dimensions=variable.dims,
+        fillvalue=fillvalue, **kwargs)
else:
-    nc4_var = self.ds.variables[name]
+    nc4_var = self.ds[name]

for k, v in iteritems(attrs):
-    nc4_var.setncattr(k, v)
+    nc4_var.attrs[k] = v

target = H5NetCDFArrayWrapper(name, self)

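Taken together, the write-side rules in prepare_variable amount to the following hypothetical helper (a sketch mirroring the code above, not part of the backend):

def nc4_to_h5py_style(encoding, check_encoding=False):
    encoding = dict(encoding)  # don't mutate the caller's dict
    # zlib=True becomes compression='gzip'; an explicit 'compression'
    # takes precedence unless check_encoding flags the contradiction
    if encoding.pop('zlib', False):
        if (check_encoding and
                encoding.get('compression') not in (None, 'gzip')):
            raise ValueError("'zlib' and 'compression' encodings mismatch")
        encoding.setdefault('compression', 'gzip')
    # complevel maps onto compression_opts; again the h5py style wins
    if (check_encoding and encoding.get('complevel') not in
            (None, encoding.get('compression_opts'))):
        raise ValueError("'complevel' and 'compression_opts' encodings "
                         "mismatch")
    complevel = encoding.pop('complevel', 0)
    if complevel != 0:
        encoding.setdefault('compression_opts', complevel)
    # 'chunksizes' is spelled 'chunks' in h5py
    encoding['chunks'] = encoding.pop('chunksizes', None)
    return encoding

For example, nc4_to_h5py_style({'zlib': True, 'complevel': 9}) returns {'compression': 'gzip', 'compression_opts': 9, 'chunks': None}.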
7 changes: 5 additions & 2 deletions xarray/backends/netCDF4_.py
@@ -159,8 +159,8 @@ def _force_native_endianness(var):


def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
-                                   lsd_okay=True, backend='netCDF4',
-                                   unlimited_dims=None):
+                                   lsd_okay=True, h5py_okay=False,
+                                   backend='netCDF4', unlimited_dims=None):
if unlimited_dims is None:
unlimited_dims = ()

@@ -171,6 +171,9 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
'chunksizes', 'shuffle', '_FillValue'])
if lsd_okay:
valid_encodings.add('least_significant_digit')
+if h5py_okay:
+    valid_encodings.add('compression')
+    valid_encodings.add('compression_opts')

if not raise_on_invalid and encoding.get('chunksizes') is not None:
# It's possible to get encoded chunksizes larger than a dimension size
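With lsd_okay=False and h5py_okay=True (the h5netcdf configuration), the accepted set works out as below; the first four entries are assumed from the untruncated source above the hunk, which this excerpt does not show:

# Encodings accepted by _extract_h5nc_encoding after this change
valid_encodings = {'zlib', 'complevel', 'fletcher32', 'contiguous',
                   'chunksizes', 'shuffle', '_FillValue',
                   'compression', 'compression_opts'}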
3 changes: 1 addition & 2 deletions xarray/core/dataarray.py
@@ -1443,8 +1443,7 @@ def to_masked_array(self, copy=True):
return np.ma.MaskedArray(data=self.values, mask=isnull, copy=copy)

def to_netcdf(self, *args, **kwargs):
"""
Write DataArray contents to a netCDF file.
"""Write DataArray contents to a netCDF file.
Parameters
----------
7 changes: 7 additions & 0 deletions xarray/core/dataset.py
@@ -1123,6 +1123,13 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``
The `h5netcdf` engine supports both the NetCDF4-style compression
encoding parameters ``{'zlib': True, 'complevel': 9}`` and the h5py
ones ``{'compression': 'gzip', 'compression_opts': 9}``.
This allows using any compression plugin installed in the HDF5
library, e.g. LZF.
unlimited_dims : sequence of str, optional
Dimension(s) that should be serialized as unlimited dimensions.
By default, no dimensions are treated as unlimited dimensions.
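A sketch of the mixed-style case the docstring allows, grounded in the tests below (consistent duplicates pass; contradictions raise when the encoding is passed explicitly):

import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})

# Consistent duplicates across both styles are accepted
ds.to_netcdf('mixed_ok.nc', engine='h5netcdf',
             encoding={'x': {'compression': 'gzip', 'zlib': True,
                             'compression_opts': 6, 'complevel': 6}})

# Contradictory values raise ValueError
try:
    ds.to_netcdf('mixed_bad.nc', engine='h5netcdf',
                 encoding={'x': {'compression': 'lzf', 'zlib': True}})
except ValueError as e:
    print(e)  # 'zlib' and 'compression' encodings mismatch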
78 changes: 78 additions & 0 deletions xarray/tests/test_backends.py
@@ -1668,6 +1668,84 @@ def test_encoding_unlimited_dims(self):
self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
assert_equal(ds, actual)

def test_compression_encoding_h5py(self):
ENCODINGS = (
# h5py style compression with gzip codec will be converted to
# NetCDF4-Python style on round-trip
({'compression': 'gzip', 'compression_opts': 9},
{'zlib': True, 'complevel': 9}),
# What can't be expressed in NetCDF4-Python style is
# round-tripped unaltered
({'compression': 'lzf', 'compression_opts': None},
{'compression': 'lzf', 'compression_opts': None}),
# If both styles are used together, h5py format takes precedence
({'compression': 'lzf', 'compression_opts': None,
'zlib': True, 'complevel': 9},
{'compression': 'lzf', 'compression_opts': None}))

for compr_in, compr_out in ENCODINGS:
data = create_test_data()
compr_common = {
'chunksizes': (5, 5),
'fletcher32': True,
'shuffle': True,
'original_shape': data.var2.shape
}
data['var2'].encoding.update(compr_in)
data['var2'].encoding.update(compr_common)
compr_out.update(compr_common)
with self.roundtrip(data) as actual:
for k, v in compr_out.items():
self.assertEqual(v, actual['var2'].encoding[k])

def test_compression_check_encoding_h5py(self):
"""When mismatched h5py and NetCDF4-Python encodings are expressed
in to_netcdf(encoding=...), must raise ValueError
"""
data = Dataset({'x': ('y', np.arange(10.0))})
# Compatible encodings are gracefully supported
with create_tmp_file() as tmp_file:
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'gzip', 'zlib': True,
'compression_opts': 6, 'complevel': 6}})
with open_dataset(tmp_file, engine='h5netcdf') as actual:
assert actual.x.encoding['zlib'] is True
assert actual.x.encoding['complevel'] == 6

# Incompatible encodings cause a crash
with create_tmp_file() as tmp_file:
with raises_regex(ValueError,
"'zlib' and 'compression' encodings mismatch"):
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'lzf', 'zlib': True}})

with create_tmp_file() as tmp_file:
with raises_regex(
ValueError,
"'complevel' and 'compression_opts' encodings mismatch"):
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'gzip',
'compression_opts': 5, 'complevel': 6}})

def test_dump_encodings_h5py(self):
# regression test for #709
ds = Dataset({'x': ('y', np.arange(10.0))})

kwargs = {'encoding': {'x': {
'compression': 'gzip', 'compression_opts': 9}}}
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
self.assertEqual(actual.x.encoding['zlib'], True)
self.assertEqual(actual.x.encoding['complevel'], 9)

kwargs = {'encoding': {'x': {
'compression': 'lzf', 'compression_opts': None}}}
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
self.assertEqual(actual.x.encoding['compression'], 'lzf')
self.assertEqual(actual.x.encoding['compression_opts'], None)


# tests pending h5netcdf fix
@unittest.skip
