From b62b572e6a03ea020e658846cdf30daf16f5bd75 Mon Sep 17 00:00:00 2001 From: Jack Kelly Date: Tue, 15 Feb 2022 12:19:28 +0000 Subject: [PATCH] test script runs! --- .gitignore | 15 +++ satip/intermediate.py | 6 + satip/jpeg_xl_float_with_nans.py | 185 +++++++++++++++++++++++++++++++ satip/jpeg_xl_future.py | 88 --------------- satip/scale_to_zero_to_one.py | 58 +--------- satip/utils.py | 13 ++- scripts/generate_test_plots.py | 8 ++ 7 files changed, 229 insertions(+), 144 deletions(-) create mode 100644 satip/jpeg_xl_float_with_nans.py delete mode 100644 satip/jpeg_xl_future.py diff --git a/.gitignore b/.gitignore index aed00b64..9971006e 100644 --- a/.gitignore +++ b/.gitignore @@ -147,3 +147,18 @@ checklink/cookies.txt # .gitconfig is now autogenerated .gitconfig + +# Additions for satip +*.tif +*.xml +*.grb +log.txt +rss.zarr +cloud.zarr +hrv.zarr +hrv_RSS.png +hrv_UK.png +rss_RSS.png +rss_UK.png +cloud_mask_RSS.png +cloud_mask_UK.png diff --git a/satip/intermediate.py b/satip/intermediate.py index 32c66f7f..f2a44867 100644 --- a/satip/intermediate.py +++ b/satip/intermediate.py @@ -89,6 +89,7 @@ def split_per_month( save_dataset_to_zarr( dataset, zarr_path=month_zarr_path, + compressor_name="jpeg-xl", zarr_mode="w", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, @@ -97,6 +98,7 @@ def split_per_month( save_dataset_to_zarr( hrv_dataset, zarr_path=hrv_month_zarr_path, + compressor_name="jpeg-xl", zarr_mode="w", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, @@ -178,6 +180,7 @@ def cloudmask_split_per_month( save_dataset_to_zarr( dataset, zarr_path=month_zarr_path, + compressor_name="bz2", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, timesteps_per_chunk=temporal_chunk_size, @@ -253,6 +256,7 @@ def create_or_update_zarr_with_cloud_mask_files( save_dataset_to_zarr( dataset, zarr_path=zarr_path, + compressor_name="bz2", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, timesteps_per_chunk=temporal_chunk_size, @@ -316,6 +320,7 @@ def create_or_update_zarr_with_native_files( save_dataset_to_zarr( dataset, zarr_path=zarr_path, + compressor_name="jpeg-xl", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, timesteps_per_chunk=temporal_chunk_size, @@ -323,6 +328,7 @@ def create_or_update_zarr_with_native_files( save_dataset_to_zarr( hrv_dataset, zarr_path=hrv_zarr_path, + compressor_name="jpeg-xl", x_size_per_chunk=spatial_chunk_size, y_size_per_chunk=spatial_chunk_size, timesteps_per_chunk=temporal_chunk_size, diff --git a/satip/jpeg_xl_float_with_nans.py b/satip/jpeg_xl_float_with_nans.py new file mode 100644 index 00000000..54cec1e9 --- /dev/null +++ b/satip/jpeg_xl_float_with_nans.py @@ -0,0 +1,185 @@ +"""Thin wrapper around imagecodecs.JpegXl. + +This this wrapper is only required while the most recent version of imagecodecs is version +2021.11.20. +""" +from typing import Optional + +import numpy as np +from imagecodecs.numcodecs import JpegXl +from numcodecs.registry import register_codec + +# See the docs in `encode_nans` for an explanation of what these consts do. +LOWER_BOUND_FOR_REAL_PIXELS = 0.075 +NAN_THRESHOLD = 0.05 +NAN_VALUE = 0.025 + + +class JpegXlFloatWithNaNs(JpegXl): + """Thin wrapper around imagecodecs.JpegXl for floats with NaNs.""" + + codec_id = "imagecodecs_jpegxl_float_with_nans" + + def __init__( + self, + lossless: Optional[bool] = None, + distance: Optional[int] = None, + level: Optional[int] = None, + decodingspeed: Optional[float] = None, + *args, + **kwargs, + ): + """Initialise. + + This __init__ function is a simple hack to make the JpegXl compressor in the currently + released version of imagecodecs (version 2021.11.20) look like the version in development. + (We can't simply use the version in development because the imagecodecs author does not + develop on GitHub. The imagecodecs authors just uses GitHub as one of several mechanisms + to release imagecodecs.) + + See https://github.com/cgohlke/imagecodecs/issues/31#issuecomment-1026179413 + + Args: + lossless: Set to True to enable lossless compression. + distance: Lowest settings are 0.00 or 0.01. If 0.0 then also set lossless to True. + To quote the cjxl man page: + The preferred way to specify quality. It is specified in multiples of a + just-noticeable difference. That is, -d 0 is mathematically lossless, + -d 1 should be visually lossless, and higher distances yield denser and + denser files with lower and lower fidelity. + effort: To quote the cjxl man page: + Controls the amount of effort that goes into producing an “optimal” file in + terms of quality/size. That is to say, all other parameters being equal, + a higher effort should yield a file that is at least as dense and possibly + denser, and with at least as high and possibly higher quality. + 1 is fastest. 9 is slowest. + level: DON'T SET THIS WITH THIS JpegXlFuture wrapper! + In imagecodecs version 2021.11.20, level is mapped (incorrectly) to the decoding + speed tier. Minimum is 0 (highest quality), and maximum is 4 (lowest quality). + Default is 0. + decodingspeed: DON'T SET THIS WITH THIS JpegXlFuture wrapper! + """ + assert decodingspeed is None + if lossless is not None: + if lossless: + assert ( + level is None + ) # level must be None to enable lossless in imagecodecs 2021.11.20. + assert distance is None or distance == 0 + else: + # Enable lossy compression. + # level must be set to 0, 1, 2, 3, or 4 to enable lossy + # compression in imagecodecs 2021.11.20. + level = 0 + super().__init__(level=level, distance=distance, *args, **kwargs) + + def encode(self, buf: np.ndarray) -> None: + """Encode `buf` with JPEG-XL. + + Under the hood, NaNs will be encoded as NAN_VALUE, and all "real" + values will be in the range [LOWER_BOUND_FOR_REAL_PIXELS, 1]. But + this is all taken care of by encode and decode. + + Args: + buf: The input array to compress as JPEG-XL. + Expects buf to be of shape (n_timesteps, y, x, n_channels). + n_timesteps and n_channels must == 1. + buf can be float16 or float32. + All values must be in the range [0, 1]. + Use as much of the range [0, 1] as possible. 0 is black and 1 is white. + If all the information is squished into, say, the range [0, 0.1] then + JPEG-XL will interpret the image as very dark, and will agressively + compress the data because JPEG-XL assumes that human viewers do not + notice if detail is lost in the shaddows. + """ + assert buf.dtype in ( + np.float16, + np.float32, + ), f"dtype must be float16 or float32, not {buf.dtype}" + + # Even if the original DataArray doesn't have NaNs, + # when Zarr saves chunks at the edges of the image, the image data for that chunk + # might be smaller than the chunk. In that case, `buf` will be the same shape + # as the nominal chunk size, but will include NaNs. Those NaNs must be encoded + # with a floating point value in the range [0, 1] because JPEG-XL cannot + # handle NaN values. + buf = encode_nans(buf) + + # Sanity checks. + assert np.all(np.isfinite(buf)) + assert np.amin(buf) >= 0 + assert np.amax(buf) <= 1 + + # In the future, JPEG-XL will be able to encode multiple images per file. + # But, for now, it can only compress one image at a time. See: + # https://github.com/cgohlke/imagecodecs/issues/32 + n_timesteps = buf.shape[0] + n_channels = buf.shape[-1] + assert n_timesteps == 1 + assert n_channels == 1 + + return super().encode(buf) + + def decode(self, buf, out=None) -> np.ndarray: + """Decode JPEG-XL `buf`. + + Reconstruct the NaNs encoded by encode_nans. + """ + out = super().decode(buf, out) + out = decode_nans(out) + return out + + +def encode_nans(data: np.ndarray) -> np.ndarray: + """Encode NaNs as the value NAN_VALUE. + + Encode all other values in the range [LOWER_BOUND_FOR_REAL_PIXELS, 1]. + + JPEG-XL does not understand "NaN" values. JPEG-XL only understands floating + point values in the range [0, 1]. So we must encode NaN values + as real values in the range [0, 1]. + + After lossy JPEG-XL compression, there is slight "ringing" around the edges + of regions with filled with a constant number. In experiments, when NAN_VALUE = 0.025, + it appears that the values at the inner edges of a "NaN region" vary in the range + [0.0227, 0.0280]. But, to be safe, we use a nice wide margin: We don't set + the value of "NaNs" to be 0.00 because the ringing would cause the values + to drop below zero, which is illegal for JPEG-XL images. + + After decompression, reconstruct regions of NaNs using "image < NAN_THRESHOLD" + to find NaNs. + + See this comment for more info: + https://github.com/openclimatefix/Satip/issues/67#issuecomment-1036456502 + + Args: + data: The input data. All values must already + be in the range [0, 1]. The original data is modified in place. + + Returns: + The returned array. "Real" values will be shifted to + the range [LOWER_BOUND_FOR_REAL_PIXELS, 1]. + NaNs will be encoded as NAN_VALUE. + """ + assert issubclass( + data.dtype.type, np.floating + ), f"dataarray.dtype must be floating point not {data.dtype}!" + + # Shift all the "real" values up to the range [0.075, 1] + data += LOWER_BOUND_FOR_REAL_PIXELS * (1 + LOWER_BOUND_FOR_REAL_PIXELS) + data /= 1 + LOWER_BOUND_FOR_REAL_PIXELS + data = np.nan_to_num(data, nan=NAN_VALUE) + return data + + +def decode_nans(data: np.ndarray) -> np.ndarray: + """Reconstruct the NaNs encoded by encode_nans.""" + assert np.all(np.isfinite(data)) + assert issubclass(data.dtype.type, np.floating) + data[data <= NAN_THRESHOLD] = np.NaN + data *= 1 + LOWER_BOUND_FOR_REAL_PIXELS + data -= LOWER_BOUND_FOR_REAL_PIXELS * (1 + LOWER_BOUND_FOR_REAL_PIXELS) + return data + + +register_codec(JpegXlFloatWithNaNs) diff --git a/satip/jpeg_xl_future.py b/satip/jpeg_xl_future.py deleted file mode 100644 index bfb94ca6..00000000 --- a/satip/jpeg_xl_future.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Thin wrapper around imagecodecs.JpegXl. - -This this wrapper is only required while the most recent version of imagecodecs is version -2021.11.20. -""" -from typing import Optional - -import numpy as np -from imagecodecs.numcodecs import JpegXl -from numcodecs.registry import register_codec - - -class JpegXlFuture(JpegXl): - """Thin wrapper around imagecodecs.JpegXl. - - Simple hack to make the JpegXl compressor in the currently released - version of imagecodecs (version 2021.11.20) look like the version in development. - (We can't simply use the version in development because the imagecodecs author - does not develop on GitHub. The imagecodecs authors just uses GitHub as one of - several mechanisms to release imagecodecs.) - - See https://github.com/cgohlke/imagecodecs/issues/31#issuecomment-1026179413 - """ - - codec_id = "imagecodecs_jpegxl" - - def __init__( - self, - lossless: Optional[bool] = None, - distance: Optional[int] = None, - level: Optional[int] = None, - decodingspeed: Optional[float] = None, - *args, - **kwargs - ): - """Initialise JpegXlFuture. - - Args: - lossless: Set to True to enable lossless compression. - distance: Lowest settings are 0.00 or 0.01. If 0.0 then also set lossless to True. - To quote the cjxl man page: - The preferred way to specify quality. It is specified in multiples of a - just-noticeable difference. That is, -d 0 is mathematically lossless, - -d 1 should be visually lossless, and higher distances yield denser and - denser files with lower and lower fidelity. - effort: To quote the cjxl man page: - Controls the amount of effort that goes into producing an “optimal” file in - terms of quality/size. That is to say, all other parameters being equal, - a higher effort should yield a file that is at least as dense and possibly - denser, and with at least as high and possibly higher quality. - 1 is fastest. 9 is slowest. - level: DON'T SET THIS WITH THIS JpegXlFuture wrapper! - In imagecodecs version 2021.11.20, level is mapped (incorrectly) to the decoding - speed tier. Minimum is 0 (highest quality), and maximum is 4 (lowest quality). - Default is 0. - decodingspeed: DON'T SET THIS WITH THIS JpegXlFuture wrapper! - """ - assert decodingspeed is None - if lossless is not None: - if lossless: - assert ( - level is None - ) # level must be None to enable lossless in imagecodecs 2011.11.20. - assert distance is None or distance == 0 - else: - # Enable lossy compression. - # level must be set to 0, 1, 2, 3, or 4 to enable lossy - # compression in imagecodecs 2021.11.20. - level = 0 - - super().__init__(level=level, distance=distance, *args, **kwargs) - - def encode(self, buf: np.ndarray) -> None: - """Encode buf at JPEG-XL. - - Args: - buf: The input array to compress as JPEG-XL. - Expects buf to be of shape (n_timesteps, y, x, n_channels). - n_timesteps and n_channels must == 1 - """ - n_timesteps = buf.shape[0] - n_channels = buf.shape[-1] - assert n_timesteps == 1 - assert n_channels == 1 - return super().encode(buf) - - -register_codec(JpegXlFuture) diff --git a/satip/scale_to_zero_to_one.py b/satip/scale_to_zero_to_one.py index 54c03bcf..57840395 100644 --- a/satip/scale_to_zero_to_one.py +++ b/satip/scale_to_zero_to_one.py @@ -107,9 +107,8 @@ def rescale(self, dataarray: xr.DataArray) -> Union[xr.DataArray, None]: dataarray: DataArray to rescale. Returns: - The rescaled DataArray. NaNs in the original `dataarray` will have been - converted to 0.025, and "real" pixels will be in the range [0.075, 1]. - The returned DataArray will be float32. + The DataArray rescaled to [0, 1]. NaNs in the original `dataarray` will still + be present in the returned dataarray. The returned DataArray will be float32. """ for attr in ["mins", "maxs"]: assert ( @@ -124,12 +123,8 @@ def rescale(self, dataarray: xr.DataArray) -> Union[xr.DataArray, None]: dataarray -= self.mins dataarray /= range dataarray = dataarray.clip(min=0, max=1) - # JPEG-XL cannot handle NaN values, so we must encode NaNs as a different value. - # See the docstring of encode_nans_as_floats for more details. - dataarray = encode_nans_as_floats(dataarray) dataarray = dataarray.astype(np.float32) dataarray.attrs = serialize_attrs(dataarray.attrs) # Must be serializable - return dataarray def compress_mask(self, dataarray: xr.DataArray) -> Union[xr.DataArray, None]: @@ -162,11 +157,8 @@ def compress_mask(dataarray: xr.DataArray) -> xr.DataArray: """ dataarray = dataarray.transpose("time", "y_geostationary", "x_geostationary", "variable") dataarray = dataarray.round().clip(min=0, max=3) - dataarray += 1 - dataarray *= 64 - dataarray -= 1 # Because uint8 goes up to 255, not 256. - dataarray = dataarray.fillna(0) - dataarray = dataarray.astype(np.uint8) + dataarray = dataarray.fillna(-1) + dataarray = dataarray.astype(np.int8) dataarray.attrs = serialize_attrs(dataarray.attrs) # Must be serializable return dataarray @@ -184,45 +176,3 @@ def is_dataset_clean(dataarray: xr.DataArray) -> bool: return ( dataarray.notnull().compute().all().values and np.isfinite(dataarray).compute().all().values ) - - -def encode_nans_as_floats(dataarray: xr.DataArray) -> xr.DataArray: - """Encode NaNs as the value 0.025. Encode all other values in the range [0.075, 1]. - - JPEG-XL does not understand "NaN" values. JPEG-XL only understands floating - point values in the range [0, 1]. So we must encode NaN values - as real values in the range [0, 1]. - - After JPEG-XL compression, there is slight "ringing" around the edges - of regions with filled with a constant number. In experiments, it appears - that the values at the inner edges of a "NaN region" vary in the range - [0.0227, 0.0280]. But, to be safe, we use a nice wide margin: We don't set - the value of "NaNs" to be 0.00 because the ringing would cause the values - to drop below zero, which is illegal for JPEG-XL images. - - After decompression, reconstruct regions of NaNs using "image < 0.05" to find NaNs. - - See this comment for more info: - https://github.com/openclimatefix/Satip/issues/67#issuecomment-1036456502 - - Args: - dataarray (xr.DataArray): The input DataArray. All values must already - be in the range [0, 1]. The original dataarray is modified in place. - - Returns: - xr.DataArray: The returned DataArray. "Real" values will be shifted to - the range [0.075, 1]. NaNs will be encoded as 0.025. - """ - LOWER_BOUND_FOR_REAL_PIXELS = 0.075 - NAN_VALUE = 0.025 - - assert issubclass( - dataarray.dtype.type, np.floating - ), f"dataarray.dtype must be floating point not {dataarray.dtype}!" - dataarray = dataarray.clip(min=0, max=1) - - # Shift all the "real" values up to the range [0.075, 1] - dataarray /= 1 + LOWER_BOUND_FOR_REAL_PIXELS - dataarray += LOWER_BOUND_FOR_REAL_PIXELS - dataarray = dataarray.fillna(NAN_VALUE) - return dataarray diff --git a/satip/utils.py b/satip/utils.py index af75c94b..c8cf70ea 100644 --- a/satip/utils.py +++ b/satip/utils.py @@ -16,13 +16,14 @@ from pathlib import Path from typing import Any, Tuple +import numcodecs import numpy as np import pandas as pd import xarray as xr from satpy import Scene from satip.geospatial import GEOGRAPHIC_BOUNDS, lat_lon_to_osgb -from satip.jpeg_xl_future import JpegXlFuture +from satip.jpeg_xl_float_with_nans import JpegXlFloatWithNaNs from satip.scale_to_zero_to_one import ScaleToZeroToOne, compress_mask, is_dataset_clean warnings.filterwarnings("ignore", message="divide by zero encountered in true_divide") @@ -296,6 +297,7 @@ def convert_scene_to_dataarray( def save_dataset_to_zarr( dataarray: xr.DataArray, zarr_path: str, + compressor_name: str, zarr_mode: str = "a", timesteps_per_chunk: int = 1, y_size_per_chunk: int = 256, @@ -308,6 +310,7 @@ def save_dataset_to_zarr( Args: dataarray: DataArray to save zarr_path: Filename of the Zarr dataset + compressor_name: The name of the compression algorithm to use. Must be 'bz2' or 'jpeg-xl'. zarr_mode: Mode to write to the filename, either 'w' for write, or 'a' to append timesteps_per_chunk: Timesteps per Zarr chunk y_size_per_chunk: Y pixels per Zarr chunk @@ -331,12 +334,18 @@ def save_dataset_to_zarr( print("Failing clean check after chunking") return + compression_algos = { + "bz2": numcodecs.get_codec(dict(id="bz2", level=5)), + "jpeg-xl": JpegXlFloatWithNaNs(lossless=False, distance=0.4, effort=8), + } + compression_algo = compression_algos[compressor_name] + zarr_mode_to_extra_kwargs = { "a": {"append_dim": "time"}, "w": { "encoding": { "data": { - "compressor": JpegXlFuture(lossless=False, distance=0.4, effort=8), + "compressor": compression_algo, "chunks": chunks, }, "time": {"units": "nanoseconds since 1970-01-01"}, diff --git a/scripts/generate_test_plots.py b/scripts/generate_test_plots.py index e885e6ee..922b8c6b 100644 --- a/scripts/generate_test_plots.py +++ b/scripts/generate_test_plots.py @@ -121,18 +121,26 @@ def _plot_dataset(dataset: xr.DataArray, name: str, area: str) -> None: save_dataset_to_zarr( cloudmask_dataset, zarr_path=os.path.join(os.getcwd(), "cloud.zarr"), + compressor_name="bz2", zarr_mode="w", ) + del cloudmask_dataset + save_dataset_to_zarr( rss_dataset, zarr_path=os.path.join(os.getcwd(), "rss.zarr"), + compressor_name="jpeg-xl", zarr_mode="w", ) + del rss_dataset + save_dataset_to_zarr( hrv_dataset, zarr_path=os.path.join(os.getcwd(), "hrv.zarr"), + compressor_name="jpeg-xl", zarr_mode="w", ) + del hrv_dataset # Load them from Zarr to ensure its the same as the output from satip cloudmask_dataset = (