From 679bf160e3f17fa5d398021a5340715fd12f97f2 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Sep 2021 12:12:16 +0530 Subject: [PATCH 01/20] json validation --- hub/htype.py | 3 + hub/util/json_validation.py | 156 ++++++++++++++++++++++++++++++++++++ 2 files changed, 159 insertions(+) create mode 100644 hub/util/json_validation.py diff --git a/hub/htype.py b/hub/htype.py index cf135a3e25..00b9237b43 100644 --- a/hub/htype.py +++ b/hub/htype.py @@ -56,6 +56,9 @@ "dtype": "bool" }, # TODO: pack numpy arrays to store bools as 1 bit instead of 1 byte "segment_mask": {"dtype": "int32"}, + "json": { + "dtype": None + } } # these configs are added to every `htype` diff --git a/hub/util/json_validation.py b/hub/util/json_validation.py new file mode 100644 index 0000000000..0a7529989f --- /dev/null +++ b/hub/util/json_validation.py @@ -0,0 +1,156 @@ +from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta +from numpy import ndarray + + +scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray"] +types = ["Any", "Dict", "List", "Optional", "Union"] + + +def _norm_type(typ: str): + typ = typ.replace("typing.", "") + replacements = { + "numpy.ndarray": "ndarray", + "np.ndarray": "ndarray" + } + return replacements.get(typ, typ) + +def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]: + if isinstance(schema, GenericMeta): + schema = str(schema) + validate = False + else: + validate = True + + if schema in scalars: + return schema, [] + + if "[" not in schema: + return _norm_type(schema), [] + + typ, param_string = schema.split("[", 1) + typ = _norm_type(typ) + assert param_string[-1] == "]" + params = [] + buff = "" + level = 0 + for c in param_string: + if c == "[": + level += 1 + buff += c + elif c == "]": + if level == 0: + if buff: + params.append(buff) + if validate: + _validate_schema(typ, params) + return typ, params + else: + buff += c + level -= 1 + elif c == ",": + if level == 0: + params.append(buff) + buff = "" + else: + buff += c + elif c == " ": + continue + else: + buff += c + + +class InvalidJsonSchemaException(Exception): + pass + +class ArgumentMismatchException(InvalidJsonSchemaException): + def __init__(self, typ: str, actual: int, expected: int, exact: bool = False): + assert actual != expected + gt = actual > expected + super(ArgumentMismatchException, self).__init__( + f"Too {'many' if gt else 'few'} parameters for {typ};" + + f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}." + ) + + +def _validate_schema(typ: str, params: List[str]) -> Tuple[str, List[str]]: + if typ in scalars: + return typ, params + + if typ not in types: + raise InvalidJsonSchemaException(f"Unsupported type: {typ}") + + def _err(expected_num_params: int, exact: bool = False): + raise ArgumentMismatchException(typ, len(params), expected_num_params, exact) + + if typ == "Any": + if params: + _err(0) + elif typ == "Optional": + if len(params) > 1: + _err(1) + elif typ == "Union": + if len(params) == 0: + _err(1) + elif typ == "List": + if len(params) > 1: + _err(1) + elif typ == "Dict": + if len(params) not in (0, 2): + _err(2, True) + return typ, params + + +def _validate_any(obj: Any, params: List[str]): + assert not params + return True + + +def _validate_union(obj: Any, params: List[str]): + for schema in params: + if _validate_object(obj, schema): + return True + return False + + +def _validate_optional(obj: Any, params: List[str]) -> bool: + assert len(params) <= 1 + if obj is None: + return True + if params: + return _validate_object(obj, params[0]) + + +def _validate_list(obj: Any, params: List[str]) -> bool: + assert len(params) == 1 + assert isinstance(obj, (list, tuple)) + for item in obj: + if not _validate_object(item, params[0]): + return False + return True + + +def _validate_dict(obj: Any, params: List[str]) -> bool: + assert len(params) in (0, 2) + assert isinstance(obj, dict) + if params: + assert params[0] in ("str", "Any"), "Only string keys are allowed for json dicts." + for v in obj.values(): + if not _validate_object(v, params[1]): + return False + return True + + +def _validate_object(obj: Any, schema: Union[str, GenericMeta]) -> bool: + typ, params = _parse_schema(schema) + if typ in scalars: + return isinstance(obj, eval(typ)) + return globals()[f"_validate_{typ.lower()}"](obj, params) + + +class JsonValidationError(Exception): + pass + + +def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None: + if not _validate_object(obj, schema): + raise JsonValidationError() From de91ae076cbd8b65e3258571fa36799e226d93ea Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Sep 2021 12:38:38 +0530 Subject: [PATCH 02/20] updates --- hub/util/{json_validation.py => json.py} | 10 ++++++++++ 1 file changed, 10 insertions(+) rename hub/util/{json_validation.py => json.py} (95%) diff --git a/hub/util/json_validation.py b/hub/util/json.py similarity index 95% rename from hub/util/json_validation.py rename to hub/util/json.py index 0a7529989f..fbcac4b596 100644 --- a/hub/util/json_validation.py +++ b/hub/util/json.py @@ -1,5 +1,6 @@ from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta from numpy import ndarray +import json scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray"] @@ -154,3 +155,12 @@ class JsonValidationError(Exception): def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None: if not _validate_object(obj, schema): raise JsonValidationError() + + +class _NumpyEncoder(json.JSONEncoder): + def default(self, obj): + if isinstance(obj, ndarray): + { + "_type": "ndarray", + "data": "" + } From 6db239476a85387fc4af0afc9673315fb427b181 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 30 Sep 2021 16:11:08 +0530 Subject: [PATCH 03/20] initial --- hub/__init__.py | 2 +- hub/api/tests/test_api.py | 11 ++++--- hub/core/chunk_engine.py | 34 ++++++++++++++++++-- hub/core/dataset.py | 14 +++++++-- hub/core/meta/tensor_meta.py | 47 +++++++++++++++++++-------- hub/core/serialize.py | 22 +++++++++++-- hub/core/tensor.py | 4 +++ hub/htype.py | 6 ++-- hub/util/json.py | 61 ++++++++++++++++++++++++++++++------ 9 files changed, 164 insertions(+), 37 deletions(-) diff --git a/hub/__init__.py b/hub/__init__.py index 64c47a6bc2..6e78868062 100644 --- a/hub/__init__.py +++ b/hub/__init__.py @@ -27,7 +27,7 @@ from .htype import HTYPE_CONFIGURATIONS compressions = list(SUPPORTED_COMPRESSIONS) -htypes = list(HTYPE_CONFIGURATIONS.keys()) +htypes = sorted(list(HTYPE_CONFIGURATIONS)) list = dataset.list load = dataset.load empty = dataset.empty diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 380e93fcc2..36ce0d3807 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -690,13 +690,16 @@ def test_compressions_list(): def test_htypes_list(): assert hub.htypes == [ - "generic", - "image", - "class_label", "bbox", - "video", "binary_mask", + "class_label", + "generic", + "image", + "json", + "list", "segment_mask", + "text", + "video", ] diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index 7c9056df84..db8647b82c 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -8,7 +8,7 @@ from hub.core.version_control.commit_node import CommitNode # type: ignore from hub.core.version_control.commit_chunk_set import CommitChunkSet # type: ignore from hub.core.fast_forwarding import ffw_chunk_id_encoder -from hub.core.compression import decompress_array +from hub.core.compression import decompress_array, decompress_bytes from hub.core.sample import Sample, SampleValue # type: ignore from hub.core.meta.tensor_meta import TensorMeta from hub.core.index.index import Index @@ -30,6 +30,7 @@ CorruptedMetaError, DynamicTensorNumpyError, ) +from hub.util.json import HubJsonDecoder from hub.util.casting import get_dtype, intelligent_cast from hub.util.version_control import auto_checkout, commit, commit_chunk_set_exists @@ -684,6 +685,34 @@ def read_sample_from_chunk( return np.zeros(shape, dtype=dtype) chunk_compression = self.tensor_meta.chunk_compression + sample_compression = self.tensor_meta.sample_compression + + htype = self.tensor_meta.htype + + if htype in ("json", "text", "list"): + if chunk_compression: + decompressed = chunk.decompressed_data(compression=chunk_compression) + sb, eb = chunk.byte_positions_encoder[local_sample_index] + buffer = decompressed[sb:eb] + elif sample_compression: + buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression) + else: + buffer = buffer[sb:eb] + if htype == "json": + arr = np.empty(1, dtype=object) + arr[0] = json.loads(str.encode(buffer), cls=HubJsonDecoder) + return arr + elif htype == "list": + lst = json.loads(str.encode(buffer), cls=HubJsonDecoder) + arr = np.empty(len(lst), dtype=object) + arr[:] = lst + return arr + elif htype == "text": + arr = np.array(str.encode(buffer)).reshape( + 1, + ) + return arr + if chunk_compression: if get_compression_type(chunk_compression) == BYTE_COMPRESSION: decompressed = chunk.decompressed_data(compression=chunk_compression) @@ -691,10 +720,9 @@ def read_sample_from_chunk( return np.frombuffer(decompressed[sb:eb], dtype=dtype).reshape(shape) else: return chunk.decompressed_samples()[local_sample_index] + sb, eb = chunk.byte_positions_encoder[local_sample_index] buffer = buffer[sb:eb] - - sample_compression = self.tensor_meta.sample_compression if sample_compression: sample = decompress_array( buffer, shape, dtype=dtype, compression=sample_compression diff --git a/hub/core/dataset.py b/hub/core/dataset.py index 2e9ab9dba8..d691f3b67e 100644 --- a/hub/core/dataset.py +++ b/hub/core/dataset.py @@ -3,7 +3,17 @@ import warnings import posixpath import numpy as np -from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence +from typing import ( + Any, + Callable, + Dict, + Optional, + Union, + Tuple, + List, + Sequence, + GenericMeta, +) from hub.api.info import load_info from hub.client.log import logger @@ -208,7 +218,7 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - dtype: Union[str, np.dtype, type] = UNSPECIFIED, + dtype: Union[str, np.dtype, type, GenericMeta] = UNSPECIFIED, sample_compression: str = UNSPECIFIED, chunk_compression: str = UNSPECIFIED, **kwargs, diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index 46fca81caf..df29fdb9d4 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -1,6 +1,6 @@ import hub from hub.core.fast_forwarding import ffw_tensor_meta -from typing import Any, Callable, Dict, List, Tuple +from typing import Any, Callable, Dict, List, Tuple, GenericMeta import numpy as np from hub.util.exceptions import ( TensorMetaInvalidHtype, @@ -11,6 +11,7 @@ UnsupportedCompressionError, TensorInvalidSampleShapeError, ) +from hub.util.json import validate_json_schema from hub.constants import ( REQUIRE_USER_SPECIFICATION, UNSPECIFIED, @@ -53,8 +54,8 @@ def __init__( _validate_htype_exists(htype) _validate_htype_overwrites(htype, kwargs) _replace_unspecified_values(htype, kwargs) - _validate_required_htype_overwrites(kwargs) - _format_values(kwargs) + _validate_required_htype_overwrites(htype, kwargs) + _format_values(htype, kwargs) required_meta = _required_meta_from_htype(htype) required_meta.update(kwargs) @@ -158,6 +159,15 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict): htype, ["chunk_compression", "sample_compression"] # type: ignore ) + if htype in ("json", "list", "text"): + compr = htype_overwrite["chunk_compression"] + if compr in (None, UNSPECIFIED): + compr = htype_overwrite["sample_compression"] + if compr not in (None, UNSPECIFIED): + if get_compression_type(compr) != BYTE_COMPRESSION: + raise Exception() # TODO + dtype = htype_overwrite["dtype"] + def _replace_unspecified_values(htype: str, htype_overwrite: dict): """Replaces `UNSPECIFIED` values in `htype_overwrite` with the `htype`'s defaults.""" @@ -168,8 +178,11 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict): if v == UNSPECIFIED: htype_overwrite[k] = defaults[k] + if htype in ("json", "list") and not htype_overwrite["dtype"]: + htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"] + -def _validate_required_htype_overwrites(htype_overwrite: dict): +def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict): """Raises errors if `htype_overwrite` has invalid values.""" sample_compression = htype_overwrite["sample_compression"] @@ -188,19 +201,27 @@ def _validate_required_htype_overwrites(htype_overwrite: dict): ) if htype_overwrite["dtype"] is not None: - _raise_if_condition( - "dtype", - htype_overwrite, - lambda dtype: not _is_dtype_supported_by_numpy(dtype), - "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html", - ) + if htype in ("json", "list"): + validate_json_schema(htype_overwrite["dtype"]) + else: + _raise_if_condition( + "dtype", + htype_overwrite, + lambda dtype: not _is_dtype_supported_by_numpy(dtype), + "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html", + ) -def _format_values(htype_overwrite: dict): +def _format_values(htype: str, htype_overwrite: dict): """Replaces values in `htype_overwrite` with consistent types/formats.""" - if htype_overwrite["dtype"] is not None: - htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name + dtype = htype_overwrite["dtype"] + if dtype is not None: + if htype in ("json", "list"): + if isinstance(dtype, GenericMeta): + dtype = str(dtype) + else: + htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name for key, value in COMPRESSION_ALIASES.items(): if htype_overwrite.get("sample_compression") == key: diff --git a/hub/core/serialize.py b/hub/core/serialize.py index b4bab0b89d..dca2503500 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -1,8 +1,9 @@ from hub.core.meta.tensor_meta import TensorMeta from hub.util.exceptions import TensorInvalidSampleShapeError from hub.util.casting import intelligent_cast +from hub.util.json import HubJsonEncoder, validate_json_object from hub.core.sample import Sample, SampleValue # type: ignore -from hub.core.compression import compress_array +from hub.core.compression import compress_array, compress_bytes from typing import List, Optional, Sequence, Union, Tuple, Iterable from itertools import repeat import hub @@ -226,11 +227,26 @@ def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarra def _serialize_input_sample( sample: SampleValue, sample_compression: Optional[str], - expected_dtype: np.dtype, + expected_dtype: str, htype: str, ) -> Tuple[bytes, Tuple[int]]: """Converts the incoming sample into a buffer with the proper dtype and compression.""" + if htype in ("json", "list"): + validate_json_object(sample, expected_dtype) + byts = json.dumps(sample, cls=HubJsonEncoder).encode() + if sample_compression: + byts = compress_bytes(byts, compression=sample_compression) + shape = (len(sample),) if htype == "list" else 1 + return byts, shape + elif htype == "text": + if not isinstance(sample, str): + raise TypeError("Expected str, received: " + str(sample)) + byts = sample.encode() + if sample_compression: + byts = compress_bytes(byts, compression=sample_compression) + return byts, (1,) + if isinstance(sample, Sample): if ( sample_compression @@ -305,7 +321,7 @@ def serialize_input_samples( raise ValueError("Dtype must be set before input samples can be serialized.") sample_compression = meta.sample_compression - dtype = np.dtype(meta.dtype) + dtype = meta.dtype htype = meta.htype if sample_compression or not hasattr(samples, "dtype"): diff --git a/hub/core/tensor.py b/hub/core/tensor.py index 12fad2b587..21f587b584 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -404,3 +404,7 @@ def __ixor__(self, other): @_inplace_op def __ior__(self, other): pass + + def data(self) -> Any: + # TODO + pass diff --git a/hub/htype.py b/hub/htype.py index 00b9237b43..22c062c70b 100644 --- a/hub/htype.py +++ b/hub/htype.py @@ -57,8 +57,10 @@ }, # TODO: pack numpy arrays to store bools as 1 bit instead of 1 byte "segment_mask": {"dtype": "int32"}, "json": { - "dtype": None - } + "dtype": "Any", + }, + "list": {"dtype": "List"}, + "text": {"dtype": " Tuple[str, List[str]]: if isinstance(schema, GenericMeta): schema = str(schema) @@ -63,13 +69,14 @@ def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]: class InvalidJsonSchemaException(Exception): pass + class ArgumentMismatchException(InvalidJsonSchemaException): def __init__(self, typ: str, actual: int, expected: int, exact: bool = False): assert actual != expected gt = actual > expected super(ArgumentMismatchException, self).__init__( - f"Too {'many' if gt else 'few'} parameters for {typ};" + - f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}." + f"Too {'many' if gt else 'few'} parameters for {typ};" + + f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}." ) @@ -134,7 +141,10 @@ def _validate_dict(obj: Any, params: List[str]) -> bool: assert len(params) in (0, 2) assert isinstance(obj, dict) if params: - assert params[0] in ("str", "Any"), "Only string keys are allowed for json dicts." + assert params[0] in ( + "str", + "Any", + ), "Only string keys are allowed for json dicts." for v in obj.values(): if not _validate_object(v, params[1]): return False @@ -157,10 +167,43 @@ def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None: raise JsonValidationError() -class _NumpyEncoder(json.JSONEncoder): +def validate_json_schema(schema: str): + _parse_schema(schema) + + +class HubJsonEncoder(json.JSONEncoder): def default(self, obj): if isinstance(obj, ndarray): - { - "_type": "ndarray", - "data": "" + return { + "_hub_custom_type": "ndarray", + "data": base64.b64encode(obj.tobytes()).decode(), + "shape": obj.shape, + "dtype": obj.dtype.name, } + elif isinstance(obj, Sample): + if obj.compression: + return { + "_hub_custom_type": "Sample", + "data": base64.b64encode(obj.buffer).decode(), + "compression": obj.compression, + } + else: + return self.default(obj.array) + return obj + + +class HubJsonDecoder(json.JSONDecoder): + def __init__(self, *args, **kwargs): + json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) + + def object_hook(self, obj): + hub_custom_type = obj.get(hub_custom_type) + if hub_custom_type == "ndarray": + return np.frombuffer( + base64.b64encode(obj["data"]), dtype=obj["dtype"] + ).reshape(obj.shape) + elif hub_custom_type == "Sample": + return Sample( + buffer=base64.b64encode(obj["data"]), compression=obj["compression"] + ) + return obj From e4823e68eb2d87d6042fcaccbb23b64e275cc4b0 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:19:26 +0400 Subject: [PATCH 04/20] fixes --- hub/core/chunk_engine.py | 10 ++++---- hub/core/sample.py | 50 +++++++++++++++++++++++++++------------- hub/core/serialize.py | 3 ++- hub/util/json.py | 9 ++++---- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py index db8647b82c..3cab4e8b15 100644 --- a/hub/core/chunk_engine.py +++ b/hub/core/chunk_engine.py @@ -1,5 +1,6 @@ import hub import warnings +import json import numpy as np from math import ceil from typing import Any, Dict, Optional, Sequence, Union, Tuple, List, Set @@ -690,25 +691,26 @@ def read_sample_from_chunk( htype = self.tensor_meta.htype if htype in ("json", "text", "list"): + sb, eb = chunk.byte_positions_encoder[local_sample_index] if chunk_compression: decompressed = chunk.decompressed_data(compression=chunk_compression) - sb, eb = chunk.byte_positions_encoder[local_sample_index] buffer = decompressed[sb:eb] elif sample_compression: buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression) else: buffer = buffer[sb:eb] + buffer = bytes(buffer) if htype == "json": arr = np.empty(1, dtype=object) - arr[0] = json.loads(str.encode(buffer), cls=HubJsonDecoder) + arr[0] = json.loads(bytes.decode(buffer), cls=HubJsonDecoder) return arr elif htype == "list": - lst = json.loads(str.encode(buffer), cls=HubJsonDecoder) + lst = json.loads(bytes.decode(buffer), cls=HubJsonDecoder) arr = np.empty(len(lst), dtype=object) arr[:] = lst return arr elif htype == "text": - arr = np.array(str.encode(buffer)).reshape( + arr = np.array(bytes.decode(buffer)).reshape( 1, ) return arr diff --git a/hub/core/sample.py b/hub/core/sample.py index 1b1c6d196d..27aad3f065 100644 --- a/hub/core/sample.py +++ b/hub/core/sample.py @@ -22,6 +22,8 @@ def __init__( self, path: str = None, array: np.ndarray = None, + buffer: Union[bytes, memoryview] = None, + compression: str = None, verify: bool = False, ): """Represents a single sample for a tensor. Provides all important meta information in one place. @@ -34,6 +36,8 @@ def __init__( path (str): Path to a sample stored on the local file system that represents a single sample. If `path` is provided, `array` should not be. Implicitly makes `self.is_lazy == True`. array (np.ndarray): Array that represents a single sample. If `array` is provided, `path` should not be. Implicitly makes `self.is_lazy == False`. + buffer: (bytes): Byte buffer that represents a single sample. If compressed, `compression` argument should be provided. + compression (str): Specify in case of byte buffer. verify (bool): If a path is provided, verifies the sample if True. Raises: @@ -47,22 +51,31 @@ def __init__( self._uncompressed_bytes = None self._convert_grayscale = False + self._array = None + self._typestr = None + self._shape = None + self.path = None + if path is not None: self.path = path - self._array = None - self._typestr = None - self._shape = None self._compression = None self._verified = False self._verify = verify if array is not None: - self.path = None self._array = array self._shape = array.shape self._typestr = array.__array_interface__["typestr"] self._compression = None + if buffer is not None: + self._compression = compression + self._buffer = buffer + if compression is None: + self._uncompressed_bytes = buffer + else: + self._compressed_bytes[compression] = buffer + @property def dtype(self): self._read_meta() @@ -83,7 +96,7 @@ def _read_meta(self, f=None): if self._shape is not None: return if f is None: - f = self.path + f = self.path if self.path else self.compressed_bytes[self._compression] self._compression, self._shape, self._typestr = read_meta_from_compressed_file( f ) @@ -96,6 +109,18 @@ def is_lazy(self) -> bool: def is_empty(self) -> bool: return 0 in self.shape + def _recompress(self, buffer: bytes, compression: str) -> bytes: + if get_compression_type(self._compression) != IMAGE_COMPRESSION: + raise ValueError( + "Recompression with different format is only supported for images." + ) + img = Image.open(BytesIO(buffer)) + if img.mode == "1": + self._uncompressed_bytes = img.tobytes("raw", "L") + else: + self._uncompressed_bytes = img.tobytes() + return compress_array(self.array, compression) + def compressed_bytes(self, compression: str) -> bytes: """Returns this sample as compressed bytes. @@ -135,16 +160,9 @@ def compressed_bytes(self, compression: str) -> bytes: compressed_bytes, compression=self._compression ) else: - if get_compression_type(self._compression) != IMAGE_COMPRESSION: - raise ValueError( - "Recompression with different format is only supported for images." - ) - img = Image.open(BytesIO(compressed_bytes)) - if img.mode == "1": - self._uncompressed_bytes = img.tobytes("raw", "L") - else: - self._uncompressed_bytes = img.tobytes() - compressed_bytes = compress_array(self.array, compression) + compressed_bytes = self._recompress(compressed_bytes, compression) + elif self._buffer is not None: + compressed_bytes = self._recompress(self._buffer, compression) else: compressed_bytes = compress_array(self.array, compression) self._compressed_bytes[compression] = compressed_bytes @@ -184,7 +202,7 @@ def array(self) -> np.ndarray: compr = get_compression(path=self.path) if get_compression_type(compr) == AUDIO_COMPRESSION: self._compression = compr - array = decompress_array(self.path, compression=compr) + array = decompress_array(self.path or self._buffer, compression=compr) if self._shape is None: self._shape = array.shape self._typestr = array.__array_interface__["typestr"] diff --git a/hub/core/serialize.py b/hub/core/serialize.py index 852369b8cc..c6ceb98557 100644 --- a/hub/core/serialize.py +++ b/hub/core/serialize.py @@ -11,6 +11,7 @@ import numpy as np import struct import warnings +import json def infer_chunk_num_bytes( @@ -239,7 +240,7 @@ def _serialize_input_sample( byts = json.dumps(sample, cls=HubJsonEncoder).encode() if sample_compression: byts = compress_bytes(byts, compression=sample_compression) - shape = (len(sample),) if htype == "list" else 1 + shape = (len(sample),) if htype == "list" else (1,) return byts, shape elif htype == "text": if not isinstance(sample, str): diff --git a/hub/util/json.py b/hub/util/json.py index b3d0b0f0eb..66d0b4c3b6 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -129,11 +129,12 @@ def _validate_optional(obj: Any, params: List[str]) -> bool: def _validate_list(obj: Any, params: List[str]) -> bool: - assert len(params) == 1 + assert len(params) <= 1 assert isinstance(obj, (list, tuple)) - for item in obj: - if not _validate_object(item, params[0]): - return False + if params: + for item in obj: + if not _validate_object(item, params[0]): + return False return True From d6fc7fb905d9f19ba42cfa81f7fbb71135eba06f Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:27:44 +0400 Subject: [PATCH 05/20] test --- hub/api/tests/test_api.py | 17 +++++++++++++++++ hub/util/json.py | 2 +- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5256d5d8e0..2128d1dda8 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -751,3 +751,20 @@ def test_groups(local_ds_generator): ds.create_group("g") ds.g.create_tensor("g") + + +def test_json(memory_ds): + ds = memory_ds + ds.create_tensor("json", htype="json") + items = [ + {"x": [1, 2, 3], "y": [4, [5, 6]]}, + {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]} + ] + with ds: + for x in items: + ds.json.append(x) + ds.json.extend(items) + for i in [0, 2]: + assert ds.json[i].numpy()[0] == items[0] + for i in [1, 3]: + assert ds.json[i].numpy()[0] == items[1] diff --git a/hub/util/json.py b/hub/util/json.py index 66d0b4c3b6..38ffa973d9 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -198,7 +198,7 @@ def __init__(self, *args, **kwargs): json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) def object_hook(self, obj): - hub_custom_type = obj.get(hub_custom_type) + hub_custom_type = obj.get("_hub_custom_type") if hub_custom_type == "ndarray": return np.frombuffer( base64.b64encode(obj["data"]), dtype=obj["dtype"] From 3cefdff0b42746146cf0297319f70006267aab17 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:27:56 +0400 Subject: [PATCH 06/20] format --- hub/api/tests/test_api.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 2128d1dda8..5dc5155b61 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -758,7 +758,7 @@ def test_json(memory_ds): ds.create_tensor("json", htype="json") items = [ {"x": [1, 2, 3], "y": [4, [5, 6]]}, - {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]} + {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]}, ] with ds: for x in items: From daf0ec26dafb3a12c69d8475e7432fb19bc36b59 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:36:08 +0400 Subject: [PATCH 07/20] py38 fix --- hub/core/dataset.py | 7 ++++++- hub/core/meta/tensor_meta.py | 8 +++++++- hub/util/json.py | 7 ++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/hub/core/dataset.py b/hub/core/dataset.py index 0890142bda..d51499e922 100644 --- a/hub/core/dataset.py +++ b/hub/core/dataset.py @@ -12,7 +12,6 @@ Tuple, List, Sequence, - GenericMeta, ) from hub.api.info import load_info @@ -55,6 +54,12 @@ from hub.util.remove_cache import get_base_storage +try: + from typing import GenericMeta +except ImportError: + from typing import _GenericAlias as GenericMeta + + class Dataset: def __init__( self, diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index b6097039b3..e20d01989e 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -1,6 +1,6 @@ import hub from hub.core.fast_forwarding import ffw_tensor_meta -from typing import Any, Callable, Dict, List, Tuple, GenericMeta +from typing import Any, Callable, Dict, List, Tuple import numpy as np from hub.util.exceptions import ( TensorMetaInvalidHtype, @@ -30,6 +30,12 @@ from hub.core.meta.meta import Meta +try: + from typing import GenericMeta +except ImportError: + from typing import _GenericAlias as GenericMeta + + class TensorMeta(Meta): htype: str dtype: str diff --git a/hub/util/json.py b/hub/util/json.py index 38ffa973d9..052e87b0a7 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -1,10 +1,15 @@ -from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta +from typing import Any, Dict, List, Optional, Tuple, Union from numpy import ndarray import json import base64 from hub.core.sample import Sample +try: + from typing import GenericMeta +except ImportError: + from typing import _GenericAlias as GenericMeta + scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray", "Sample"] types = ["Any", "Dict", "List", "Optional", "Union"] From f127d1a7bb083fe7f52543f2b871548b6f257833 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:44:23 +0400 Subject: [PATCH 08/20] test list --- hub/api/tests/test_api.py | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 5dc5155b61..f55e08871b 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -753,7 +753,7 @@ def test_groups(local_ds_generator): ds.g.create_tensor("g") -def test_json(memory_ds): +def test_json_basic(memory_ds): ds = memory_ds ds.create_tensor("json", htype="json") items = [ @@ -764,7 +764,26 @@ def test_json(memory_ds): for x in items: ds.json.append(x) ds.json.extend(items) + assert ds.json.shape == (4, 1) for i in [0, 2]: assert ds.json[i].numpy()[0] == items[0] for i in [1, 3]: assert ds.json[i].numpy()[0] == items[1] + + +def test_json_list_basic(memory_ds): + ds = memory_ds + ds.create_tensor("list", htype="list") + items = [ + [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]], + [[], [[[]]], {"a": [0.1, 1, "a", []]}], + ] + with ds: + for x in items: + ds.list.append(x) + ds.list.extend(items) + assert ds.list.shape == (4, 3) + for i in [0, 2]: + assert list(ds.list[i].numpy()) == items[0] + for i in [1, 3]: + assert list(ds.list[i].numpy()) == items[1] From 523bcee3cfedf2b417d8e5937dea6c01c1832652 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Thu, 7 Oct 2021 18:51:00 +0400 Subject: [PATCH 09/20] test text --- hub/api/tests/test_api.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index f55e08871b..49f96c38fe 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -765,10 +765,8 @@ def test_json_basic(memory_ds): ds.json.append(x) ds.json.extend(items) assert ds.json.shape == (4, 1) - for i in [0, 2]: - assert ds.json[i].numpy()[0] == items[0] - for i in [1, 3]: - assert ds.json[i].numpy()[0] == items[1] + for i in range(4): + assert ds.json[i].numpy()[0] == items[i % 2] def test_json_list_basic(memory_ds): @@ -783,7 +781,18 @@ def test_json_list_basic(memory_ds): ds.list.append(x) ds.list.extend(items) assert ds.list.shape == (4, 3) - for i in [0, 2]: - assert list(ds.list[i].numpy()) == items[0] - for i in [1, 3]: - assert list(ds.list[i].numpy()) == items[1] + for i in range(4): + assert list(ds.list[i].numpy()) == items[i % 2] + + +def test_text(memory_ds): + ds = memory_ds + ds.create_tensor("text", htype="text") + items = ["abcd", "efgh", "0123456"] + with ds: + for x in items: + ds.text.append(x) + ds.text.extend(items) + assert ds.text.shape == (6, 1) + for i in range(6): + assert ds.text[i].numpy()[0] == items[i % 3] From 025e2f4994213ca64f559e78dc3dc835829fb9bf Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:20:39 +0400 Subject: [PATCH 10/20] refac --- hub/api/tests/test_api.py | 45 -------------------------------------- hub/api/tests/test_json.py | 35 +++++++++++++++++++++++++++++ hub/api/tests/test_text.py | 15 +++++++++++++ 3 files changed, 50 insertions(+), 45 deletions(-) create mode 100644 hub/api/tests/test_json.py create mode 100644 hub/api/tests/test_text.py diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py index 49f96c38fe..5256d5d8e0 100644 --- a/hub/api/tests/test_api.py +++ b/hub/api/tests/test_api.py @@ -751,48 +751,3 @@ def test_groups(local_ds_generator): ds.create_group("g") ds.g.create_tensor("g") - - -def test_json_basic(memory_ds): - ds = memory_ds - ds.create_tensor("json", htype="json") - items = [ - {"x": [1, 2, 3], "y": [4, [5, 6]]}, - {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]}, - ] - with ds: - for x in items: - ds.json.append(x) - ds.json.extend(items) - assert ds.json.shape == (4, 1) - for i in range(4): - assert ds.json[i].numpy()[0] == items[i % 2] - - -def test_json_list_basic(memory_ds): - ds = memory_ds - ds.create_tensor("list", htype="list") - items = [ - [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]], - [[], [[[]]], {"a": [0.1, 1, "a", []]}], - ] - with ds: - for x in items: - ds.list.append(x) - ds.list.extend(items) - assert ds.list.shape == (4, 3) - for i in range(4): - assert list(ds.list[i].numpy()) == items[i % 2] - - -def test_text(memory_ds): - ds = memory_ds - ds.create_tensor("text", htype="text") - items = ["abcd", "efgh", "0123456"] - with ds: - for x in items: - ds.text.append(x) - ds.text.extend(items) - assert ds.text.shape == (6, 1) - for i in range(6): - assert ds.text[i].numpy()[0] == items[i % 3] diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py new file mode 100644 index 0000000000..b6ee9d1d48 --- /dev/null +++ b/hub/api/tests/test_json.py @@ -0,0 +1,35 @@ +import numpy as np +import hub +import pytest + + +def test_json_basic(memory_ds): + ds = memory_ds + ds.create_tensor("json", htype="json") + items = [ + {"x": [1, 2, 3], "y": [4, [5, 6]]}, + {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]}, + ] + with ds: + for x in items: + ds.json.append(x) + ds.json.extend(items) + assert ds.json.shape == (4, 1) + for i in range(4): + assert ds.json[i].numpy()[0] == items[i % 2] + + +def test_json_list_basic(memory_ds): + ds = memory_ds + ds.create_tensor("list", htype="list") + items = [ + [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]], + [[], [[[]]], {"a": [0.1, 1, "a", []]}], + ] + with ds: + for x in items: + ds.list.append(x) + ds.list.extend(items) + assert ds.list.shape == (4, 3) + for i in range(4): + assert list(ds.list[i].numpy()) == items[i % 2] diff --git a/hub/api/tests/test_text.py b/hub/api/tests/test_text.py new file mode 100644 index 0000000000..1d3294a53f --- /dev/null +++ b/hub/api/tests/test_text.py @@ -0,0 +1,15 @@ +import hub +import pytest + + +def test_text(memory_ds): + ds = memory_ds + ds.create_tensor("text", htype="text") + items = ["abcd", "efgh", "0123456"] + with ds: + for x in items: + ds.text.append(x) + ds.text.extend(items) + assert ds.text.shape == (6, 1) + for i in range(6): + assert ds.text[i].numpy()[0] == items[i % 3] From 7c4ee2a55033c6e099fa3ec04e3290a00d579b07 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:23:32 +0400 Subject: [PATCH 11/20] better exception --- hub/core/meta/tensor_meta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index e20d01989e..e9d6fa9c0d 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -177,7 +177,7 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict): compr = htype_overwrite["sample_compression"] if compr not in (None, UNSPECIFIED): if get_compression_type(compr) != BYTE_COMPRESSION: - raise Exception() # TODO + raise UnsupportedCompressionError(compr, htype) elif htype == "audio": if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]: raise UnsupportedCompressionError("Chunk compression", htype=htype) From d68d448b0bc115c68e31b8505bd9b6de0646c29a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:35:12 +0400 Subject: [PATCH 12/20] numpy tests --- hub/api/tests/test_json.py | 17 +++++++++++++++++ hub/util/json.py | 7 ++++--- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index b6ee9d1d48..b1e9083299 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -1,6 +1,7 @@ import numpy as np import hub import pytest +from hub.util.json import JsonValidationError def test_json_basic(memory_ds): @@ -19,6 +20,22 @@ def test_json_basic(memory_ds): assert ds.json[i].numpy()[0] == items[i % 2] +def test_json_with_numpy(memory_ds): + ds = memory_ds + ds.create_tensor("json", htype="json") + items = [ + {"x": np.array([1, 2, 3], dtype=np.float32), "y": [4, [5, 6]]}, + {"x": np.array([1, 2, 3], dtype=np.uint8), "y": [4, {"z": [0.1, 0.2, []]}]}, + ] + with ds: + for x in items: + ds.json.append(x) + ds.json.extend(items) + for i in range(4): + assert ds.json[i].numpy()[0]["y"] == items[i % 2]["y"] + np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"]) + + def test_json_list_basic(memory_ds): ds = memory_ds ds.create_tensor("list", htype="list") diff --git a/hub/util/json.py b/hub/util/json.py index 052e87b0a7..7919baa6ef 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -1,4 +1,5 @@ from typing import Any, Dict, List, Optional, Tuple, Union +import numpy as np from numpy import ndarray import json import base64 @@ -206,10 +207,10 @@ def object_hook(self, obj): hub_custom_type = obj.get("_hub_custom_type") if hub_custom_type == "ndarray": return np.frombuffer( - base64.b64encode(obj["data"]), dtype=obj["dtype"] - ).reshape(obj.shape) + base64.b64decode(obj["data"]), dtype=obj["dtype"] + ).reshape(obj["shape"]) elif hub_custom_type == "Sample": return Sample( - buffer=base64.b64encode(obj["data"]), compression=obj["compression"] + buffer=base64.b64decode(obj["data"]), compression=obj["compression"] ) return obj From ac62f8627219a6668c5a2a93a725b0a1684dbcf0 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:50:37 +0400 Subject: [PATCH 13/20] hub.read support tests --- hub/api/tests/test_json.py | 24 ++++++++++++++++++++++++ hub/core/sample.py | 16 ++++++++++++++-- 2 files changed, 38 insertions(+), 2 deletions(-) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index b1e9083299..028eac9abc 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -36,6 +36,30 @@ def test_json_with_numpy(memory_ds): np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"]) +def test_json_with_hub_sample(memory_ds, compressed_image_paths): + ds = memory_ds + ds.create_tensor("json", htype="json") + items = [ + { + "x": [1, 2, 3], + "y": [4, [5, 6]], + "z": hub.read(compressed_image_paths["jpeg"][0]), + }, + { + "x": [1, 2, 3], + "y": [4, {"z": [0.1, 0.2, []]}], + "z": hub.read(compressed_image_paths["png"][0]), + }, + ] + with ds: + for x in items: + ds.json.append(x) + ds.json.extend(items) + assert ds.json.shape == (4, 1) + for i in range(4): + assert ds.json[i].numpy()[0] == items[i % 2] + + def test_json_list_basic(memory_ds): ds = memory_ds ds.create_tensor("list", htype="list") diff --git a/hub/core/sample.py b/hub/core/sample.py index 27aad3f065..af862f0c1b 100644 --- a/hub/core/sample.py +++ b/hub/core/sample.py @@ -44,8 +44,8 @@ def __init__( ValueError: Cannot create a sample from both a `path` and `array`. """ - if (path is None) == (array is None): - raise ValueError("Must pass either `path` or `array`.") + if not any((path, array, buffer)): + raise ValueError("Must pass one of `path`, `array` or `buffer`.") self._compressed_bytes = {} self._uncompressed_bytes = None @@ -55,6 +55,7 @@ def __init__( self._typestr = None self._shape = None self.path = None + self._buffer = None if path is not None: self.path = path @@ -76,6 +77,12 @@ def __init__( else: self._compressed_bytes[compression] = buffer + @property + def buffer(self): + if self._buffer is not None: + return self._buffer + return self.compressed_bytes(self.compression) + @property def dtype(self): self._read_meta() @@ -234,5 +241,10 @@ def __repr__(self): def __array__(self): return self.array + def __eq__(self, other): + if self.path is not None and other.path is not None: + return self.path == other.path + return self.buffer == other.buffer + SampleValue = Union[np.ndarray, int, float, bool, Sample] From 7110eee21643d8593ed2e388c581368241c5c70a Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:55:43 +0400 Subject: [PATCH 14/20] list + numpy test --- hub/api/tests/test_json.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index 028eac9abc..3e2f1c0a22 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -74,3 +74,26 @@ def test_json_list_basic(memory_ds): assert ds.list.shape == (4, 3) for i in range(4): assert list(ds.list[i].numpy()) == items[i % 2] + + +def test_list_with_numpy(memory_ds): + ds = memory_ds + ds.create_tensor("list", htype="list") + items = [ + [ + np.random.random((3, 4)), + {"x": [1, 2, 3], "y": [4, [5, 6]]}, + [[]], + [None, 0.1], + ], + [np.random.randint(0, 10, (4, 5)), [], [[[]]], {"a": [0.1, 1, "a", []]}], + ] + with ds: + for x in items: + ds.list.append(x) + ds.list.extend(items) + assert ds.list.shape == (4, 4) + for i in range(4): + actual, expected = list(ds.list[i].numpy()), items[i % 2] + np.testing.assert_array_equal(actual[0], expected[0]) + assert actual[1:] == expected[1:] From ce9c2e87e6e7ddd4663a22b5ad2bf98fc61bcbab Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 02:58:43 +0400 Subject: [PATCH 15/20] list + hub sample --- hub/api/tests/test_json.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index 3e2f1c0a22..f9b88b1ef7 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -97,3 +97,30 @@ def test_list_with_numpy(memory_ds): actual, expected = list(ds.list[i].numpy()), items[i % 2] np.testing.assert_array_equal(actual[0], expected[0]) assert actual[1:] == expected[1:] + + +def test_list_with_hub_sample(memory_ds, compressed_image_paths): + ds = memory_ds + ds.create_tensor("list", htype="list") + items = [ + [ + { + "x": [1, 2, 3], + "y": [4, [5, 6, hub.read(compressed_image_paths["jpeg"][0])]], + }, + [[hub.read(compressed_image_paths["jpeg"][1])]], + [None, 0.1], + ], + [ + [], + [[[hub.read(compressed_image_paths["png"][0])]]], + {"a": [0.1, 1, "a", hub.read(compressed_image_paths["png"][0])]}, + ], + ] + with ds: + for x in items: + ds.list.append(x) + ds.list.extend(items) + assert ds.list.shape == (4, 3) + for i in range(4): + assert list(ds.list[i].numpy()) == items[i % 2] From f4023754d192ae92bf659989b7a885e726fa27b7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 03:39:55 +0400 Subject: [PATCH 16/20] schema tests --- hub/api/tests/test_json.py | 24 ++++++++++++++++++++++++ hub/core/dataset.py | 8 +------- hub/core/meta/tensor_meta.py | 10 ++-------- hub/util/json.py | 28 +++++++++++++++++----------- 4 files changed, 44 insertions(+), 26 deletions(-) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index f9b88b1ef7..56833e8a7b 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -2,6 +2,7 @@ import hub import pytest from hub.util.json import JsonValidationError +from typing import Any, Optional, Union, List, Dict def test_json_basic(memory_ds): @@ -124,3 +125,26 @@ def test_list_with_hub_sample(memory_ds, compressed_image_paths): assert ds.list.shape == (4, 3) for i in range(4): assert list(ds.list[i].numpy()) == items[i % 2] + + +def test_json_with_schema(memory_ds): + ds = memory_ds + ds.create_tensor("json", htype="json", dtype=List[Dict[str, int]]) + ds.json.append([{"x": 1, "y": 2}]) + with pytest.raises(JsonValidationError): + ds.json.append({"x": 1, "y": 2}) + with pytest.raises(JsonValidationError): + ds.json.append([{"x": 1, "y": "2"}]) + + assert ds.json.numpy()[0, 0] == [{"x": 1, "y": 2}] + + ds.create_tensor("json2", htype="json", dtype=Optional[List[Dict[str, int]]]) + items = [ + [{"x": 1, "y": 2}], + None, + [{"x": 2, "y": 3}], + None, + ] + ds.json2.extend(items) + for i in range(len(items)): + assert ds.json2[i].numpy()[0] == ds.json2.numpy()[i][0] == items[i] diff --git a/hub/core/dataset.py b/hub/core/dataset.py index d51499e922..6e198eed6f 100644 --- a/hub/core/dataset.py +++ b/hub/core/dataset.py @@ -54,12 +54,6 @@ from hub.util.remove_cache import get_base_storage -try: - from typing import GenericMeta -except ImportError: - from typing import _GenericAlias as GenericMeta - - class Dataset: def __init__( self, @@ -223,7 +217,7 @@ def create_tensor( self, name: str, htype: str = DEFAULT_HTYPE, - dtype: Union[str, np.dtype, type, GenericMeta] = UNSPECIFIED, + dtype: Union[str, np.dtype] = UNSPECIFIED, sample_compression: str = UNSPECIFIED, chunk_compression: str = UNSPECIFIED, **kwargs, diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index e9d6fa9c0d..abda33c76f 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -30,12 +30,6 @@ from hub.core.meta.meta import Meta -try: - from typing import GenericMeta -except ImportError: - from typing import _GenericAlias as GenericMeta - - class TensorMeta(Meta): htype: str dtype: str @@ -243,8 +237,8 @@ def _format_values(htype: str, htype_overwrite: dict): dtype = htype_overwrite["dtype"] if dtype is not None: if htype in ("json", "list"): - if isinstance(dtype, GenericMeta): - dtype = str(dtype) + if getattr(dtype, "__module__", None) == "typing": + htype_overwrite["dtype"] = str(dtype) else: htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name diff --git a/hub/util/json.py b/hub/util/json.py index 7919baa6ef..d5b2896061 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -6,10 +6,8 @@ from hub.core.sample import Sample -try: - from typing import GenericMeta -except ImportError: - from typing import _GenericAlias as GenericMeta + +Schema = Any scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray", "Sample"] @@ -21,14 +19,14 @@ def _norm_type(typ: str): replacements = { "numpy.ndarray": "ndarray", "np.ndarray": "ndarray", - "hub.core.Sample": "Sample", + "hub.core.sample.Sample": "Sample", "hub.Sample": "Sample", } return replacements.get(typ, typ) -def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]: - if isinstance(schema, GenericMeta): +def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]: + if getattr(schema, "__module__", None) == "typing": schema = str(schema) validate = False else: @@ -37,6 +35,7 @@ def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]: if schema in scalars: return schema, [] + if "[" not in schema: return _norm_type(schema), [] @@ -136,7 +135,8 @@ def _validate_optional(obj: Any, params: List[str]) -> bool: def _validate_list(obj: Any, params: List[str]) -> bool: assert len(params) <= 1 - assert isinstance(obj, (list, tuple)) + if not isinstance(obj, (list, tuple)): + return False if params: for item in obj: if not _validate_object(item, params[0]): @@ -146,7 +146,8 @@ def _validate_list(obj: Any, params: List[str]) -> bool: def _validate_dict(obj: Any, params: List[str]) -> bool: assert len(params) in (0, 2) - assert isinstance(obj, dict) + if not isinstance(obj, dict): + return False if params: assert params[0] in ( "str", @@ -158,7 +159,12 @@ def _validate_dict(obj: Any, params: List[str]) -> bool: return True -def _validate_object(obj: Any, schema: Union[str, GenericMeta]) -> bool: +def _validate_nonetype(obj: Any, params: List[str]) -> bool: + assert not params + return obj is None + + +def _validate_object(obj: Any, schema: Union[str, Schema]) -> bool: typ, params = _parse_schema(schema) if typ in scalars: return isinstance(obj, eval(typ)) @@ -169,7 +175,7 @@ class JsonValidationError(Exception): pass -def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None: +def validate_json_object(obj: Any, schema: Union[str, Schema]) -> None: if not _validate_object(obj, schema): raise JsonValidationError() From 362c8999244fd358d2a90a0bdea19855f5c01be8 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 04:15:04 +0400 Subject: [PATCH 17/20] .data() --- hub/api/tests/test_json.py | 18 ++++++++++-------- hub/core/meta/tensor_meta.py | 3 +++ hub/core/tensor.py | 31 ++++++++++++++++++++++++++++--- hub/htype.py | 2 +- 4 files changed, 42 insertions(+), 12 deletions(-) diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py index 56833e8a7b..2bb81f2d66 100644 --- a/hub/api/tests/test_json.py +++ b/hub/api/tests/test_json.py @@ -18,7 +18,7 @@ def test_json_basic(memory_ds): ds.json.extend(items) assert ds.json.shape == (4, 1) for i in range(4): - assert ds.json[i].numpy()[0] == items[i % 2] + assert ds.json[i].data() == items[i % 2] def test_json_with_numpy(memory_ds): @@ -33,8 +33,8 @@ def test_json_with_numpy(memory_ds): ds.json.append(x) ds.json.extend(items) for i in range(4): - assert ds.json[i].numpy()[0]["y"] == items[i % 2]["y"] - np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"]) + assert ds.json[i].data()["y"] == items[i % 2]["y"] + np.testing.assert_array_equal(ds.json[i].data()["x"], items[i % 2]["x"]) def test_json_with_hub_sample(memory_ds, compressed_image_paths): @@ -58,7 +58,7 @@ def test_json_with_hub_sample(memory_ds, compressed_image_paths): ds.json.extend(items) assert ds.json.shape == (4, 1) for i in range(4): - assert ds.json[i].numpy()[0] == items[i % 2] + assert ds.json[i].data() == items[i % 2] def test_json_list_basic(memory_ds): @@ -74,7 +74,9 @@ def test_json_list_basic(memory_ds): ds.list.extend(items) assert ds.list.shape == (4, 3) for i in range(4): - assert list(ds.list[i].numpy()) == items[i % 2] + assert ds.list[i].data() == items[i % 2] + for i, x in enumerate(ds.list.data()): + assert x == items[i % 2] def test_list_with_numpy(memory_ds): @@ -95,7 +97,7 @@ def test_list_with_numpy(memory_ds): ds.list.extend(items) assert ds.list.shape == (4, 4) for i in range(4): - actual, expected = list(ds.list[i].numpy()), items[i % 2] + actual, expected = ds.list[i].data(), items[i % 2] np.testing.assert_array_equal(actual[0], expected[0]) assert actual[1:] == expected[1:] @@ -124,7 +126,7 @@ def test_list_with_hub_sample(memory_ds, compressed_image_paths): ds.list.extend(items) assert ds.list.shape == (4, 3) for i in range(4): - assert list(ds.list[i].numpy()) == items[i % 2] + assert ds.list[i].data() == items[i % 2] def test_json_with_schema(memory_ds): @@ -147,4 +149,4 @@ def test_json_with_schema(memory_ds): ] ds.json2.extend(items) for i in range(len(items)): - assert ds.json2[i].numpy()[0] == ds.json2.numpy()[i][0] == items[i] + assert ds.json2[i].data() == ds.json2.data()[i] == items[i] diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index abda33c76f..ec58255859 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -172,6 +172,9 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict): if compr not in (None, UNSPECIFIED): if get_compression_type(compr) != BYTE_COMPRESSION: raise UnsupportedCompressionError(compr, htype) + if htype == "text": + if htype_overwrite["dtype"] not in (str, "str"): + raise TensorMetaInvalidHtypeOverwriteValue("dtype", htype_overwrite["dtype"], "dtype for tensors with text htype should always be `str`") elif htype == "audio": if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]: raise UnsupportedCompressionError("Chunk compression", htype=htype) diff --git a/hub/core/tensor.py b/hub/core/tensor.py index 21f587b584..99b935dfa6 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -195,15 +195,28 @@ def shape(self) -> Tuple[Optional[int], ...]: tuple: Tuple where each value is either `None` (if that axis is dynamic) or an `int` (if that axis is fixed). """ + shape = self.shape_interval.astuple() + if self.index.values[0].subscriptable(): + return shape + return shape[1:] - return self.shape_interval.astuple() + + @property + def ndim(self) -> int: + return len(self.shape) @property def dtype(self) -> np.dtype: + if self.htype in ("json", "list"): + return self.dtype if self.meta.dtype: return np.dtype(self.meta.dtype) return None + @property + def htype(self): + return self.meta.htype + @property def shape_interval(self) -> ShapeInterval: """Returns a `ShapeInterval` object that describes this tensor's shape more accurately. Length is included. @@ -406,5 +419,17 @@ def __ior__(self, other): pass def data(self) -> Any: - # TODO - pass + htype = self.htype + if htype in ("json", "text"): + + if self.ndim == 1: + return self.numpy()[0] + else: + return [sample[0] for sample in self.numpy(aslist=True)] + elif htype == "list": + if self.ndim == 1: + return list(self.numpy()) + else: + return list(map(list, self.numpy(aslist=True))) + else: + return self.numpy() diff --git a/hub/htype.py b/hub/htype.py index 48bf35254a..ce499ea96c 100644 --- a/hub/htype.py +++ b/hub/htype.py @@ -61,7 +61,7 @@ "dtype": "Any", }, "list": {"dtype": "List"}, - "text": {"dtype": " Date: Mon, 11 Oct 2021 04:23:25 +0400 Subject: [PATCH 18/20] format --- hub/core/meta/tensor_meta.py | 6 +++++- hub/core/tensor.py | 3 +-- hub/util/json.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index ec58255859..ec28661285 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -174,7 +174,11 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict): raise UnsupportedCompressionError(compr, htype) if htype == "text": if htype_overwrite["dtype"] not in (str, "str"): - raise TensorMetaInvalidHtypeOverwriteValue("dtype", htype_overwrite["dtype"], "dtype for tensors with text htype should always be `str`") + raise TensorMetaInvalidHtypeOverwriteValue( + "dtype", + htype_overwrite["dtype"], + "dtype for tensors with text htype should always be `str`", + ) elif htype == "audio": if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]: raise UnsupportedCompressionError("Chunk compression", htype=htype) diff --git a/hub/core/tensor.py b/hub/core/tensor.py index 99b935dfa6..824b8e44e8 100644 --- a/hub/core/tensor.py +++ b/hub/core/tensor.py @@ -200,7 +200,6 @@ def shape(self) -> Tuple[Optional[int], ...]: return shape return shape[1:] - @property def ndim(self) -> int: return len(self.shape) @@ -421,7 +420,7 @@ def __ior__(self, other): def data(self) -> Any: htype = self.htype if htype in ("json", "text"): - + if self.ndim == 1: return self.numpy()[0] else: diff --git a/hub/util/json.py b/hub/util/json.py index d5b2896061..d2948c04ad 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -4,7 +4,7 @@ import json import base64 -from hub.core.sample import Sample +from hub.core.sample import Sample # type: ignore Schema = Any @@ -35,7 +35,6 @@ def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]: if schema in scalars: return schema, [] - if "[" not in schema: return _norm_type(schema), [] @@ -69,6 +68,7 @@ def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]: continue else: buff += c + raise InvalidJsonSchemaException() class InvalidJsonSchemaException(Exception): From 1cb89edff29a80b718acb24199d6de79c99674e9 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 05:55:12 +0400 Subject: [PATCH 19/20] smol fix --- hub/core/meta/tensor_meta.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py index ec28661285..04e76ebae9 100644 --- a/hub/core/meta/tensor_meta.py +++ b/hub/core/meta/tensor_meta.py @@ -172,13 +172,6 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict): if compr not in (None, UNSPECIFIED): if get_compression_type(compr) != BYTE_COMPRESSION: raise UnsupportedCompressionError(compr, htype) - if htype == "text": - if htype_overwrite["dtype"] not in (str, "str"): - raise TensorMetaInvalidHtypeOverwriteValue( - "dtype", - htype_overwrite["dtype"], - "dtype for tensors with text htype should always be `str`", - ) elif htype == "audio": if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]: raise UnsupportedCompressionError("Chunk compression", htype=htype) @@ -204,7 +197,7 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict): if v == UNSPECIFIED: htype_overwrite[k] = defaults[k] - if htype in ("json", "list") and not htype_overwrite["dtype"]: + if htype in ("json", "list", "text") and not htype_overwrite["dtype"]: htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"] @@ -237,6 +230,14 @@ def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict): "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html", ) + if htype == "text": + if htype_overwrite["dtype"] not in (str, "str"): + raise TensorMetaInvalidHtypeOverwriteValue( + "dtype", + htype_overwrite["dtype"], + "dtype for tensors with text htype should always be `str`", + ) + def _format_values(htype: str, htype_overwrite: dict): """Replaces values in `htype_overwrite` with consistent types/formats.""" From b32da45a60490b795ab69d4933658502f58fccf7 Mon Sep 17 00:00:00 2001 From: Fariz Rahman Date: Mon, 11 Oct 2021 06:06:25 +0400 Subject: [PATCH 20/20] mypy --- hub/util/json.py | 1 + 1 file changed, 1 insertion(+) diff --git a/hub/util/json.py b/hub/util/json.py index d2948c04ad..2630996d18 100644 --- a/hub/util/json.py +++ b/hub/util/json.py @@ -131,6 +131,7 @@ def _validate_optional(obj: Any, params: List[str]) -> bool: return True if params: return _validate_object(obj, params[0]) + return True def _validate_list(obj: Any, params: List[str]) -> bool: