h5netcdf new API support (pydata#1915)
* Ignore dask scratch area

* Public API for HDF5 support

* Remove save_mfdataset_hdf5

* Replace to_hdf5 with to_netcdf(engine='h5netcdf-ng')

* h5netcdf-ng -> h5netcdf-new

* Trivial fixes

* Functional implementation

* stickler fixes

* Reimplement as extra params for h5netcdf

* Cosmetic tweaks

* Bugfixes

* More robust mixed-style encoding handling

* Crash on mismatched encoding if check_encoding=True

* Test check_encoding

* stickler fix

* Use parentheses instead of explicit continuation with \
crusaderky authored and shoyer committed May 8, 2018
1 parent 98373f0 commit c6977f1
Showing 8 changed files with 149 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
@@ -39,6 +39,7 @@ nosetests.xml
.tags*
.testmon*
.pytest_cache
dask-worker-space/

# asv environments
.asv
5 changes: 5 additions & 0 deletions doc/whats-new.rst
@@ -36,6 +36,11 @@ Enhancements

- Support writing lists of strings as netCDF attributes (:issue:`2044`).
By `Dan Nowacki <https://github.com/dnowacki-usgs>`_.
- :py:meth:`~xarray.Dataset.to_netcdf` with ``engine='h5netcdf'`` now accepts
  h5py encoding settings ``compression`` and ``compression_opts``, along with
  the NetCDF4-Python style settings ``zlib=True`` and ``complevel``.
  This allows using any compression plugin installed in HDF5, e.g. LZF
  (:issue:`1536`). By `Guido Imperiale <https://github.com/crusaderky>`_.
- :py:meth:`~xarray.dot` on dask-backed data will now call :func:`dask.array.einsum`.
This greatly boosts speed and allows chunking on the core dims.
The function now requires dask >= 0.17.3 to work on dask-backed data
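A minimal usage sketch of the options added above (file names are illustrative; LZF assumes h5py's bundled filter is available in the local HDF5 stack):

import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})

# h5py style: any compression filter registered with HDF5, e.g. LZF
ds.to_netcdf('lzf_example.nc', engine='h5netcdf',
             encoding={'x': {'compression': 'lzf'}})

# NetCDF4-Python style, set on the variable; converted to
# compression='gzip', compression_opts=9 on write
ds['x'].encoding.update({'zlib': True, 'complevel': 9})
ds.to_netcdf('gzip_example.nc', engine='h5netcdf')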
1 change: 1 addition & 0 deletions xarray/backends/api.py
@@ -741,6 +741,7 @@ def save_mfdataset(datasets, paths, mode='w', format=None, groups=None,
Engine to use when writing netCDF files. If not provided, the
default engine is chosen based on available dependencies, with a
preference for 'netcdf4' if writing to a file on disk.
See `Dataset.to_netcdf` for additional information.
Examples
--------
73 changes: 51 additions & 22 deletions xarray/backends/h5netcdf_.py
@@ -45,21 +45,21 @@ def _read_attributes(h5netcdf_var):
# to ensure conventions decoding works properly on Python 3, decode all
# bytes attributes to strings
attrs = OrderedDict()
-for k in h5netcdf_var.ncattrs():
-    v = h5netcdf_var.getncattr(k)
+for k, v in h5netcdf_var.attrs.items():
if k not in ['_FillValue', 'missing_value']:
v = maybe_decode_bytes(v)
attrs[k] = v
return attrs


-_extract_h5nc_encoding = functools.partial(_extract_nc4_variable_encoding,
-                                           lsd_okay=False, backend='h5netcdf')
+_extract_h5nc_encoding = functools.partial(
+    _extract_nc4_variable_encoding,
+    lsd_okay=False, h5py_okay=True, backend='h5netcdf')


def _open_h5netcdf_group(filename, mode, group):
-import h5netcdf.legacyapi
-ds = h5netcdf.legacyapi.Dataset(filename, mode=mode)
+import h5netcdf
+ds = h5netcdf.File(filename, mode=mode)
with close_on_error(ds):
return _nc4_group(ds, group, mode)
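For reference, a sketch of the two h5netcdf entry points this commit switches between (read mode, illustrative file name); the legacy API mimics netCDF4-python, while the new API exposes h5py-style options such as compression and compression_opts:

import h5netcdf
import h5netcdf.legacyapi

# Legacy API: netCDF4-python compatible wrapper (the previous backend)
ds_old = h5netcdf.legacyapi.Dataset('example.nc', mode='r')
ds_old.close()

# New API: what the backend opens after this commit
ds_new = h5netcdf.File('example.nc', mode='r')
ds_new.close()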

@@ -96,10 +96,19 @@ def open_store_variable(self, name, var):
attrs = _read_attributes(var)

# netCDF4 specific encoding
-encoding = dict(var.filters())
-chunking = var.chunking()
-encoding['chunksizes'] = chunking \
-    if chunking != 'contiguous' else None
+encoding = {
+    'chunksizes': var.chunks,
+    'fletcher32': var.fletcher32,
+    'shuffle': var.shuffle,
+}
+# Convert h5py-style compression options to NetCDF4-Python
+# style, if possible
+if var.compression == 'gzip':
+    encoding['zlib'] = True
+    encoding['complevel'] = var.compression_opts
+elif var.compression is not None:
+    encoding['compression'] = var.compression
+    encoding['compression_opts'] = var.compression_opts

# save source so __repr__ can detect if it's local or not
encoding['source'] = self._filename
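The compression branch above can be read as a small pure function; a hypothetical distillation (the helper name is made up for illustration):

def h5py_to_nc4_style(compression, compression_opts):
    # gzip has an exact NetCDF4-Python spelling; any other filter is
    # reported with the h5py-style keys unchanged
    if compression == 'gzip':
        return {'zlib': True, 'complevel': compression_opts}
    elif compression is not None:
        return {'compression': compression,
                'compression_opts': compression_opts}
    return {}

For example, h5py_to_nc4_style('gzip', 9) yields {'zlib': True, 'complevel': 9}, matching what open_store_variable reports after a round-trip.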
@@ -130,14 +139,14 @@ def get_encoding(self):
def set_dimension(self, name, length, is_unlimited=False):
with self.ensure_open(autoclose=False):
if is_unlimited:
-self.ds.createDimension(name, size=None)
+self.ds.dimensions[name] = None
+self.ds.resize_dimension(name, length)
else:
-self.ds.createDimension(name, size=length)
+self.ds.dimensions[name] = length

def set_attribute(self, key, value):
with self.ensure_open(autoclose=False):
-self.ds.setncattr(key, value)
+self.ds.attrs[key] = value

def encode_variable(self, variable):
return _encode_nc4_variable(variable)
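The new API has no createDimension; assigning None marks a dimension as unlimited and resize_dimension then grows it, as in this minimal sketch (file name illustrative):

import h5netcdf

with h5netcdf.File('unlimited_example.nc', mode='w') as f:
    f.dimensions['time'] = None      # None marks the dimension unlimited
    f.resize_dimension('time', 10)   # grow it to the current data length
    f.attrs['title'] = 'example'     # attributes are a plain mapping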
@@ -149,8 +158,8 @@ def prepare_variable(self, name, variable, check_encoding=False,
attrs = variable.attrs.copy()
dtype = _get_datatype(variable)

-fill_value = attrs.pop('_FillValue', None)
-if dtype is str and fill_value is not None:
+fillvalue = attrs.pop('_FillValue', None)
+if dtype is str and fillvalue is not None:
raise NotImplementedError(
'h5netcdf does not yet support setting a fill value for '
'variable-length strings '
@@ -166,18 +175,38 @@ def prepare_variable(self, name, variable, check_encoding=False,
raise_on_invalid=check_encoding)
kwargs = {}

-for key in ['zlib', 'complevel', 'shuffle',
-            'chunksizes', 'fletcher32']:
+# Convert from NetCDF4-Python style compression settings to h5py style
+# If both styles are used together, h5py takes precedence
+# If check_encoding=True, raise ValueError in case of mismatch
+if encoding.pop('zlib', False):
+    if (check_encoding and encoding.get('compression')
+            not in (None, 'gzip')):
+        raise ValueError("'zlib' and 'compression' encodings mismatch")
+    encoding.setdefault('compression', 'gzip')
+
+if (check_encoding and encoding.get('complevel') not in
+        (None, encoding.get('compression_opts'))):
+    raise ValueError("'complevel' and 'compression_opts' encodings "
+                     "mismatch")
+complevel = encoding.pop('complevel', 0)
+if complevel != 0:
+    encoding.setdefault('compression_opts', complevel)
+
+encoding['chunks'] = encoding.pop('chunksizes', None)
+
+for key in ['compression', 'compression_opts', 'shuffle',
+            'chunks', 'fletcher32']:
    if key in encoding:
        kwargs[key] = encoding[key]
-if name not in self.ds.variables:
-    nc4_var = self.ds.createVariable(name, dtype, variable.dims,
-                                     fill_value=fill_value, **kwargs)
+if name not in self.ds:
+    nc4_var = self.ds.create_variable(
+        name, dtype=dtype, dimensions=variable.dims,
+        fillvalue=fillvalue, **kwargs)
else:
-    nc4_var = self.ds.variables[name]
+    nc4_var = self.ds[name]

for k, v in iteritems(attrs):
-    nc4_var.setncattr(k, v)
+    nc4_var.attrs[k] = v

target = H5NetCDFArrayWrapper(name, self)

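Taken together, the write-side rules in prepare_variable amount to the following hypothetical helper (a sketch mirroring the code above, not part of the backend):

def nc4_to_h5py_style(encoding, check_encoding=False):
    encoding = dict(encoding)  # don't mutate the caller's dict
    # zlib=True becomes compression='gzip'; an explicit 'compression'
    # takes precedence unless check_encoding flags the contradiction
    if encoding.pop('zlib', False):
        if (check_encoding and
                encoding.get('compression') not in (None, 'gzip')):
            raise ValueError("'zlib' and 'compression' encodings mismatch")
        encoding.setdefault('compression', 'gzip')
    # complevel maps onto compression_opts; again the h5py style wins
    if (check_encoding and encoding.get('complevel') not in
            (None, encoding.get('compression_opts'))):
        raise ValueError("'complevel' and 'compression_opts' encodings "
                         "mismatch")
    complevel = encoding.pop('complevel', 0)
    if complevel != 0:
        encoding.setdefault('compression_opts', complevel)
    # 'chunksizes' is spelled 'chunks' in h5py
    encoding['chunks'] = encoding.pop('chunksizes', None)
    return encoding

For example, nc4_to_h5py_style({'zlib': True, 'complevel': 9}) returns {'compression': 'gzip', 'compression_opts': 9, 'chunks': None}.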
7 changes: 5 additions & 2 deletions xarray/backends/netCDF4_.py
@@ -159,8 +159,8 @@ def _force_native_endianness(var):


def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
-                                   lsd_okay=True, backend='netCDF4',
-                                   unlimited_dims=None):
+                                   lsd_okay=True, h5py_okay=False,
+                                   backend='netCDF4', unlimited_dims=None):
if unlimited_dims is None:
unlimited_dims = ()

@@ -171,6 +171,9 @@ def _extract_nc4_variable_encoding(variable, raise_on_invalid=False,
'chunksizes', 'shuffle', '_FillValue'])
if lsd_okay:
valid_encodings.add('least_significant_digit')
+if h5py_okay:
+    valid_encodings.add('compression')
+    valid_encodings.add('compression_opts')

if not raise_on_invalid and encoding.get('chunksizes') is not None:
# It's possible to get encoded chunksizes larger than a dimension size
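With lsd_okay=False and h5py_okay=True (the h5netcdf configuration), the accepted set works out as below; the first four entries are assumed from the untruncated source above the hunk, which this excerpt does not show:

# Encodings accepted by _extract_h5nc_encoding after this change
valid_encodings = {'zlib', 'complevel', 'fletcher32', 'contiguous',
                   'chunksizes', 'shuffle', '_FillValue',
                   'compression', 'compression_opts'}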
3 changes: 1 addition & 2 deletions xarray/core/dataarray.py
@@ -1443,8 +1443,7 @@ def to_masked_array(self, copy=True):
return np.ma.MaskedArray(data=self.values, mask=isnull, copy=copy)

def to_netcdf(self, *args, **kwargs):
"""
Write DataArray contents to a netCDF file.
"""Write DataArray contents to a netCDF file.
Parameters
----------
7 changes: 7 additions & 0 deletions xarray/core/dataset.py
@@ -1123,6 +1123,13 @@ def to_netcdf(self, path=None, mode='w', format=None, group=None,
variable specific encodings as values, e.g.,
``{'my_variable': {'dtype': 'int16', 'scale_factor': 0.1,
'zlib': True}, ...}``
The `h5netcdf` engine supports both the NetCDF4-style compression
encoding parameters ``{'zlib': True, 'complevel': 9}`` and the h5py
ones ``{'compression': 'gzip', 'compression_opts': 9}``.
This allows using any compression plugin installed in the HDF5
library, e.g. LZF.
unlimited_dims : sequence of str, optional
Dimension(s) that should be serialized as unlimited dimensions.
By default, no dimensions are treated as unlimited dimensions.
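A sketch of the mixed-style case the docstring allows, grounded in the tests below (consistent duplicates pass; contradictions raise when the encoding is passed explicitly):

import numpy as np
import xarray as xr

ds = xr.Dataset({'x': ('y', np.arange(10.0))})

# Consistent duplicates across both styles are accepted
ds.to_netcdf('mixed_ok.nc', engine='h5netcdf',
             encoding={'x': {'compression': 'gzip', 'zlib': True,
                             'compression_opts': 6, 'complevel': 6}})

# Contradictory values raise ValueError
try:
    ds.to_netcdf('mixed_bad.nc', engine='h5netcdf',
                 encoding={'x': {'compression': 'lzf', 'zlib': True}})
except ValueError as e:
    print(e)  # 'zlib' and 'compression' encodings mismatch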
78 changes: 78 additions & 0 deletions xarray/tests/test_backends.py
@@ -1668,6 +1668,84 @@ def test_encoding_unlimited_dims(self):
self.assertEqual(actual.encoding['unlimited_dims'], set('y'))
assert_equal(ds, actual)

def test_compression_encoding_h5py(self):
ENCODINGS = (
# h5py style compression with gzip codec will be converted to
# NetCDF4-Python style on round-trip
({'compression': 'gzip', 'compression_opts': 9},
{'zlib': True, 'complevel': 9}),
# What can't be expressed in NetCDF4-Python style is
# round-tripped unaltered
({'compression': 'lzf', 'compression_opts': None},
{'compression': 'lzf', 'compression_opts': None}),
# If both styles are used together, h5py format takes precedence
({'compression': 'lzf', 'compression_opts': None,
'zlib': True, 'complevel': 9},
{'compression': 'lzf', 'compression_opts': None}))

for compr_in, compr_out in ENCODINGS:
data = create_test_data()
compr_common = {
'chunksizes': (5, 5),
'fletcher32': True,
'shuffle': True,
'original_shape': data.var2.shape
}
data['var2'].encoding.update(compr_in)
data['var2'].encoding.update(compr_common)
compr_out.update(compr_common)
with self.roundtrip(data) as actual:
for k, v in compr_out.items():
self.assertEqual(v, actual['var2'].encoding[k])

def test_compression_check_encoding_h5py(self):
"""When mismatched h5py and NetCDF4-Python encodings are expressed
in to_netcdf(encoding=...), must raise ValueError
"""
data = Dataset({'x': ('y', np.arange(10.0))})
# Compatible encodings are gracefully supported
with create_tmp_file() as tmp_file:
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'gzip', 'zlib': True,
'compression_opts': 6, 'complevel': 6}})
with open_dataset(tmp_file, engine='h5netcdf') as actual:
assert actual.x.encoding['zlib'] is True
assert actual.x.encoding['complevel'] == 6

# Incompatible encodings cause a crash
with create_tmp_file() as tmp_file:
with raises_regex(ValueError,
"'zlib' and 'compression' encodings mismatch"):
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'lzf', 'zlib': True}})

with create_tmp_file() as tmp_file:
with raises_regex(
ValueError,
"'complevel' and 'compression_opts' encodings mismatch"):
data.to_netcdf(
tmp_file, engine='h5netcdf',
encoding={'x': {'compression': 'gzip',
'compression_opts': 5, 'complevel': 6}})

def test_dump_encodings_h5py(self):
# regression test for #709
ds = Dataset({'x': ('y', np.arange(10.0))})

kwargs = {'encoding': {'x': {
'compression': 'gzip', 'compression_opts': 9}}}
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
self.assertEqual(actual.x.encoding['zlib'], True)
self.assertEqual(actual.x.encoding['complevel'], 9)

kwargs = {'encoding': {'x': {
'compression': 'lzf', 'compression_opts': None}}}
with self.roundtrip(ds, save_kwargs=kwargs) as actual:
self.assertEqual(actual.x.encoding['compression'], 'lzf')
self.assertEqual(actual.x.encoding['compression_opts'], None)


# tests pending h5netcdf fix
@unittest.skip
