From 679bf160e3f17fa5d398021a5340715fd12f97f2 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 30 Sep 2021 12:12:16 +0530
Subject: [PATCH 01/20] json validation
---
hub/htype.py | 3 +
hub/util/json_validation.py | 156 ++++++++++++++++++++++++++++++++++++
2 files changed, 159 insertions(+)
create mode 100644 hub/util/json_validation.py
diff --git a/hub/htype.py b/hub/htype.py
index cf135a3e25..00b9237b43 100644
--- a/hub/htype.py
+++ b/hub/htype.py
@@ -56,6 +56,9 @@
"dtype": "bool"
}, # TODO: pack numpy arrays to store bools as 1 bit instead of 1 byte
"segment_mask": {"dtype": "int32"},
+ "json": {
+ "dtype": None
+ }
}
# these configs are added to every `htype`
diff --git a/hub/util/json_validation.py b/hub/util/json_validation.py
new file mode 100644
index 0000000000..0a7529989f
--- /dev/null
+++ b/hub/util/json_validation.py
@@ -0,0 +1,156 @@
+from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta
+from numpy import ndarray
+
+
+scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray"]
+types = ["Any", "Dict", "List", "Optional", "Union"]
+
+
+def _norm_type(typ: str):
+ typ = typ.replace("typing.", "")
+ replacements = {
+ "numpy.ndarray": "ndarray",
+ "np.ndarray": "ndarray"
+ }
+ return replacements.get(typ, typ)
+
+def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]:
+ if isinstance(schema, GenericMeta):
+ schema = str(schema)
+ validate = False
+ else:
+ validate = True
+
+ if schema in scalars:
+ return schema, []
+
+ if "[" not in schema:
+ return _norm_type(schema), []
+
+ typ, param_string = schema.split("[", 1)
+ typ = _norm_type(typ)
+ assert param_string[-1] == "]"
+ params = []
+ buff = ""
+ level = 0
+ for c in param_string:
+ if c == "[":
+ level += 1
+ buff += c
+ elif c == "]":
+ if level == 0:
+ if buff:
+ params.append(buff)
+ if validate:
+ _validate_schema(typ, params)
+ return typ, params
+ else:
+ buff += c
+ level -= 1
+ elif c == ",":
+ if level == 0:
+ params.append(buff)
+ buff = ""
+ else:
+ buff += c
+ elif c == " ":
+ continue
+ else:
+ buff += c
+
+
+class InvalidJsonSchemaException(Exception):
+ pass
+
+class ArgumentMismatchException(InvalidJsonSchemaException):
+ def __init__(self, typ: str, actual: int, expected: int, exact: bool = False):
+ assert actual != expected
+ gt = actual > expected
+ super(ArgumentMismatchException, self).__init__(
+ f"Too {'many' if gt else 'few'} parameters for {typ};" +
+ f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}."
+ )
+
+
+def _validate_schema(typ: str, params: List[str]) -> Tuple[str, List[str]]:
+ if typ in scalars:
+ return typ, params
+
+ if typ not in types:
+ raise InvalidJsonSchemaException(f"Unsupported type: {typ}")
+
+ def _err(expected_num_params: int, exact: bool = False):
+ raise ArgumentMismatchException(typ, len(params), expected_num_params, exact)
+
+ if typ == "Any":
+ if params:
+ _err(0)
+ elif typ == "Optional":
+ if len(params) > 1:
+ _err(1)
+ elif typ == "Union":
+ if len(params) == 0:
+ _err(1)
+ elif typ == "List":
+ if len(params) > 1:
+ _err(1)
+ elif typ == "Dict":
+ if len(params) not in (0, 2):
+ _err(2, True)
+ return typ, params
+
+
+def _validate_any(obj: Any, params: List[str]):
+ assert not params
+ return True
+
+
+def _validate_union(obj: Any, params: List[str]):
+ for schema in params:
+ if _validate_object(obj, schema):
+ return True
+ return False
+
+
+def _validate_optional(obj: Any, params: List[str]) -> bool:
+ assert len(params) <= 1
+ if obj is None:
+ return True
+ if params:
+ return _validate_object(obj, params[0])
+
+
+def _validate_list(obj: Any, params: List[str]) -> bool:
+ assert len(params) == 1
+ assert isinstance(obj, (list, tuple))
+ for item in obj:
+ if not _validate_object(item, params[0]):
+ return False
+ return True
+
+
+def _validate_dict(obj: Any, params: List[str]) -> bool:
+ assert len(params) in (0, 2)
+ assert isinstance(obj, dict)
+ if params:
+ assert params[0] in ("str", "Any"), "Only string keys are allowed for json dicts."
+ for v in obj.values():
+ if not _validate_object(v, params[1]):
+ return False
+ return True
+
+
+def _validate_object(obj: Any, schema: Union[str, GenericMeta]) -> bool:
+ typ, params = _parse_schema(schema)
+ if typ in scalars:
+ return isinstance(obj, eval(typ))
+ return globals()[f"_validate_{typ.lower()}"](obj, params)
+
+
+class JsonValidationError(Exception):
+ pass
+
+
+def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None:
+ if not _validate_object(obj, schema):
+ raise JsonValidationError()
From de91ae076cbd8b65e3258571fa36799e226d93ea Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 30 Sep 2021 12:38:38 +0530
Subject: [PATCH 02/20] rename json_validation.py to json.py and add ndarray JSON encoder stub
---
hub/util/{json_validation.py => json.py} | 10 ++++++++++
1 file changed, 10 insertions(+)
rename hub/util/{json_validation.py => json.py} (95%)
diff --git a/hub/util/json_validation.py b/hub/util/json.py
similarity index 95%
rename from hub/util/json_validation.py
rename to hub/util/json.py
index 0a7529989f..fbcac4b596 100644
--- a/hub/util/json_validation.py
+++ b/hub/util/json.py
@@ -1,5 +1,6 @@
from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta
from numpy import ndarray
+import json
scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray"]
@@ -154,3 +155,12 @@ class JsonValidationError(Exception):
def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None:
if not _validate_object(obj, schema):
raise JsonValidationError()
+
+
+class _NumpyEncoder(json.JSONEncoder):
+ def default(self, obj):
+ if isinstance(obj, ndarray):
+ {
+ "_type": "ndarray",
+ "data": ""
+ }
From 6db239476a85387fc4af0afc9673315fb427b181 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 30 Sep 2021 16:11:08 +0530
Subject: [PATCH 03/20] initial support for json, list and text htypes
---
hub/__init__.py | 2 +-
hub/api/tests/test_api.py | 11 ++++---
hub/core/chunk_engine.py | 34 ++++++++++++++++++--
hub/core/dataset.py | 14 +++++++--
hub/core/meta/tensor_meta.py | 47 +++++++++++++++++++--------
hub/core/serialize.py | 22 +++++++++++--
hub/core/tensor.py | 4 +++
hub/htype.py | 6 ++--
hub/util/json.py | 61 ++++++++++++++++++++++++++++++------
9 files changed, 164 insertions(+), 37 deletions(-)
diff --git a/hub/__init__.py b/hub/__init__.py
index 64c47a6bc2..6e78868062 100644
--- a/hub/__init__.py
+++ b/hub/__init__.py
@@ -27,7 +27,7 @@
from .htype import HTYPE_CONFIGURATIONS
compressions = list(SUPPORTED_COMPRESSIONS)
-htypes = list(HTYPE_CONFIGURATIONS.keys())
+htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = dataset.list
load = dataset.load
empty = dataset.empty
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index 380e93fcc2..36ce0d3807 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -690,13 +690,16 @@ def test_compressions_list():
def test_htypes_list():
assert hub.htypes == [
- "generic",
- "image",
- "class_label",
"bbox",
- "video",
"binary_mask",
+ "class_label",
+ "generic",
+ "image",
+ "json",
+ "list",
"segment_mask",
+ "text",
+ "video",
]
diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index 7c9056df84..db8647b82c 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -8,7 +8,7 @@
from hub.core.version_control.commit_node import CommitNode # type: ignore
from hub.core.version_control.commit_chunk_set import CommitChunkSet # type: ignore
from hub.core.fast_forwarding import ffw_chunk_id_encoder
-from hub.core.compression import decompress_array
+from hub.core.compression import decompress_array, decompress_bytes
from hub.core.sample import Sample, SampleValue # type: ignore
from hub.core.meta.tensor_meta import TensorMeta
from hub.core.index.index import Index
@@ -30,6 +30,7 @@
CorruptedMetaError,
DynamicTensorNumpyError,
)
+from hub.util.json import HubJsonDecoder
from hub.util.casting import get_dtype, intelligent_cast
from hub.util.version_control import auto_checkout, commit, commit_chunk_set_exists
@@ -684,6 +685,34 @@ def read_sample_from_chunk(
return np.zeros(shape, dtype=dtype)
chunk_compression = self.tensor_meta.chunk_compression
+ sample_compression = self.tensor_meta.sample_compression
+
+ htype = self.tensor_meta.htype
+
+ if htype in ("json", "text", "list"):
+ if chunk_compression:
+ decompressed = chunk.decompressed_data(compression=chunk_compression)
+ sb, eb = chunk.byte_positions_encoder[local_sample_index]
+ buffer = decompressed[sb:eb]
+ elif sample_compression:
+ buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression)
+ else:
+ buffer = buffer[sb:eb]
+ if htype == "json":
+ arr = np.empty(1, dtype=object)
+ arr[0] = json.loads(str.encode(buffer), cls=HubJsonDecoder)
+ return arr
+ elif htype == "list":
+ lst = json.loads(str.encode(buffer), cls=HubJsonDecoder)
+ arr = np.empty(len(lst), dtype=object)
+ arr[:] = lst
+ return arr
+ elif htype == "text":
+ arr = np.array(str.encode(buffer)).reshape(
+ 1,
+ )
+ return arr
+
if chunk_compression:
if get_compression_type(chunk_compression) == BYTE_COMPRESSION:
decompressed = chunk.decompressed_data(compression=chunk_compression)
@@ -691,10 +720,9 @@ def read_sample_from_chunk(
return np.frombuffer(decompressed[sb:eb], dtype=dtype).reshape(shape)
else:
return chunk.decompressed_samples()[local_sample_index]
+
sb, eb = chunk.byte_positions_encoder[local_sample_index]
buffer = buffer[sb:eb]
-
- sample_compression = self.tensor_meta.sample_compression
if sample_compression:
sample = decompress_array(
buffer, shape, dtype=dtype, compression=sample_compression
diff --git a/hub/core/dataset.py b/hub/core/dataset.py
index 2e9ab9dba8..d691f3b67e 100644
--- a/hub/core/dataset.py
+++ b/hub/core/dataset.py
@@ -3,7 +3,17 @@
import warnings
import posixpath
import numpy as np
-from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence
+from typing import (
+ Any,
+ Callable,
+ Dict,
+ Optional,
+ Union,
+ Tuple,
+ List,
+ Sequence,
+ GenericMeta,
+)
from hub.api.info import load_info
from hub.client.log import logger
@@ -208,7 +218,7 @@ def create_tensor(
self,
name: str,
htype: str = DEFAULT_HTYPE,
- dtype: Union[str, np.dtype, type] = UNSPECIFIED,
+ dtype: Union[str, np.dtype, type, GenericMeta] = UNSPECIFIED,
sample_compression: str = UNSPECIFIED,
chunk_compression: str = UNSPECIFIED,
**kwargs,
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index 46fca81caf..df29fdb9d4 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -1,6 +1,6 @@
import hub
from hub.core.fast_forwarding import ffw_tensor_meta
-from typing import Any, Callable, Dict, List, Tuple
+from typing import Any, Callable, Dict, List, Tuple, GenericMeta
import numpy as np
from hub.util.exceptions import (
TensorMetaInvalidHtype,
@@ -11,6 +11,7 @@
UnsupportedCompressionError,
TensorInvalidSampleShapeError,
)
+from hub.util.json import validate_json_schema
from hub.constants import (
REQUIRE_USER_SPECIFICATION,
UNSPECIFIED,
@@ -53,8 +54,8 @@ def __init__(
_validate_htype_exists(htype)
_validate_htype_overwrites(htype, kwargs)
_replace_unspecified_values(htype, kwargs)
- _validate_required_htype_overwrites(kwargs)
- _format_values(kwargs)
+ _validate_required_htype_overwrites(htype, kwargs)
+ _format_values(htype, kwargs)
required_meta = _required_meta_from_htype(htype)
required_meta.update(kwargs)
@@ -158,6 +159,15 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
htype, ["chunk_compression", "sample_compression"] # type: ignore
)
+ if htype in ("json", "list", "text"):
+ compr = htype_overwrite["chunk_compression"]
+ if compr in (None, UNSPECIFIED):
+ compr = htype_overwrite["sample_compression"]
+ if compr not in (None, UNSPECIFIED):
+ if get_compression_type(compr) != BYTE_COMPRESSION:
+ raise Exception() # TODO
+ dtype = htype_overwrite["dtype"]
+
def _replace_unspecified_values(htype: str, htype_overwrite: dict):
"""Replaces `UNSPECIFIED` values in `htype_overwrite` with the `htype`'s defaults."""
@@ -168,8 +178,11 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict):
if v == UNSPECIFIED:
htype_overwrite[k] = defaults[k]
+ if htype in ("json", "list") and not htype_overwrite["dtype"]:
+ htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"]
+
-def _validate_required_htype_overwrites(htype_overwrite: dict):
+def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict):
"""Raises errors if `htype_overwrite` has invalid values."""
sample_compression = htype_overwrite["sample_compression"]
@@ -188,19 +201,27 @@ def _validate_required_htype_overwrites(htype_overwrite: dict):
)
if htype_overwrite["dtype"] is not None:
- _raise_if_condition(
- "dtype",
- htype_overwrite,
- lambda dtype: not _is_dtype_supported_by_numpy(dtype),
- "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
- )
+ if htype in ("json", "list"):
+ validate_json_schema(htype_overwrite["dtype"])
+ else:
+ _raise_if_condition(
+ "dtype",
+ htype_overwrite,
+ lambda dtype: not _is_dtype_supported_by_numpy(dtype),
+ "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
+ )
-def _format_values(htype_overwrite: dict):
+def _format_values(htype: str, htype_overwrite: dict):
"""Replaces values in `htype_overwrite` with consistent types/formats."""
- if htype_overwrite["dtype"] is not None:
- htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name
+ dtype = htype_overwrite["dtype"]
+ if dtype is not None:
+ if htype in ("json", "list"):
+ if isinstance(dtype, GenericMeta):
+ dtype = str(dtype)
+ else:
+ htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name
for key, value in COMPRESSION_ALIASES.items():
if htype_overwrite.get("sample_compression") == key:
diff --git a/hub/core/serialize.py b/hub/core/serialize.py
index b4bab0b89d..dca2503500 100644
--- a/hub/core/serialize.py
+++ b/hub/core/serialize.py
@@ -1,8 +1,9 @@
from hub.core.meta.tensor_meta import TensorMeta
from hub.util.exceptions import TensorInvalidSampleShapeError
from hub.util.casting import intelligent_cast
+from hub.util.json import HubJsonEncoder, validate_json_object
from hub.core.sample import Sample, SampleValue # type: ignore
-from hub.core.compression import compress_array
+from hub.core.compression import compress_array, compress_bytes
from typing import List, Optional, Sequence, Union, Tuple, Iterable
from itertools import repeat
import hub
@@ -226,11 +227,26 @@ def deserialize_chunkids(byts: Union[bytes, memoryview]) -> Tuple[str, np.ndarra
def _serialize_input_sample(
sample: SampleValue,
sample_compression: Optional[str],
- expected_dtype: np.dtype,
+ expected_dtype: str,
htype: str,
) -> Tuple[bytes, Tuple[int]]:
"""Converts the incoming sample into a buffer with the proper dtype and compression."""
+ if htype in ("json", "list"):
+ validate_json_object(sample, expected_dtype)
+ byts = json.dumps(sample, cls=HubJsonEncoder).encode()
+ if sample_compression:
+ byts = compress_bytes(byts, compression=sample_compression)
+ shape = (len(sample),) if htype == "list" else 1
+ return byts, shape
+ elif htype == "text":
+ if not isinstance(sample, str):
+ raise TypeError("Expected str, received: " + str(sample))
+ byts = sample.encode()
+ if sample_compression:
+ byts = compress_bytes(byts, compression=sample_compression)
+ return byts, (1,)
+
if isinstance(sample, Sample):
if (
sample_compression
@@ -305,7 +321,7 @@ def serialize_input_samples(
raise ValueError("Dtype must be set before input samples can be serialized.")
sample_compression = meta.sample_compression
- dtype = np.dtype(meta.dtype)
+ dtype = meta.dtype
htype = meta.htype
if sample_compression or not hasattr(samples, "dtype"):
diff --git a/hub/core/tensor.py b/hub/core/tensor.py
index 12fad2b587..21f587b584 100644
--- a/hub/core/tensor.py
+++ b/hub/core/tensor.py
@@ -404,3 +404,7 @@ def __ixor__(self, other):
@_inplace_op
def __ior__(self, other):
pass
+
+ def data(self) -> Any:
+ # TODO
+ pass
diff --git a/hub/htype.py b/hub/htype.py
index 00b9237b43..22c062c70b 100644
--- a/hub/htype.py
+++ b/hub/htype.py
@@ -57,8 +57,10 @@
}, # TODO: pack numpy arrays to store bools as 1 bit instead of 1 byte
"segment_mask": {"dtype": "int32"},
"json": {
- "dtype": None
- }
+ "dtype": "Any",
+ },
+ "list": {"dtype": "List"},
+ "text": {"dtype": " Tuple[str, List[str]]:
if isinstance(schema, GenericMeta):
schema = str(schema)
@@ -63,13 +69,14 @@ def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]:
class InvalidJsonSchemaException(Exception):
pass
+
class ArgumentMismatchException(InvalidJsonSchemaException):
def __init__(self, typ: str, actual: int, expected: int, exact: bool = False):
assert actual != expected
gt = actual > expected
super(ArgumentMismatchException, self).__init__(
- f"Too {'many' if gt else 'few'} parameters for {typ};" +
- f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}."
+ f"Too {'many' if gt else 'few'} parameters for {typ};"
+ + f" actual {actual},expected {'exatcly' if exact else ('at most' if gt else 'at least')} {expected}."
)
@@ -134,7 +141,10 @@ def _validate_dict(obj: Any, params: List[str]) -> bool:
assert len(params) in (0, 2)
assert isinstance(obj, dict)
if params:
- assert params[0] in ("str", "Any"), "Only string keys are allowed for json dicts."
+ assert params[0] in (
+ "str",
+ "Any",
+ ), "Only string keys are allowed for json dicts."
for v in obj.values():
if not _validate_object(v, params[1]):
return False
@@ -157,10 +167,43 @@ def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None:
raise JsonValidationError()
-class _NumpyEncoder(json.JSONEncoder):
+def validate_json_schema(schema: str):
+ _parse_schema(schema)
+
+
+class HubJsonEncoder(json.JSONEncoder):
def default(self, obj):
if isinstance(obj, ndarray):
- {
- "_type": "ndarray",
- "data": ""
+ return {
+ "_hub_custom_type": "ndarray",
+ "data": base64.b64encode(obj.tobytes()).decode(),
+ "shape": obj.shape,
+ "dtype": obj.dtype.name,
}
+ elif isinstance(obj, Sample):
+ if obj.compression:
+ return {
+ "_hub_custom_type": "Sample",
+ "data": base64.b64encode(obj.buffer).decode(),
+ "compression": obj.compression,
+ }
+ else:
+ return self.default(obj.array)
+ return obj
+
+
+class HubJsonDecoder(json.JSONDecoder):
+ def __init__(self, *args, **kwargs):
+ json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
+
+ def object_hook(self, obj):
+ hub_custom_type = obj.get(hub_custom_type)
+ if hub_custom_type == "ndarray":
+ return np.frombuffer(
+ base64.b64encode(obj["data"]), dtype=obj["dtype"]
+ ).reshape(obj.shape)
+ elif hub_custom_type == "Sample":
+ return Sample(
+ buffer=base64.b64encode(obj["data"]), compression=obj["compression"]
+ )
+ return obj
From e4823e68eb2d87d6042fcaccbb23b64e275cc4b0 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:19:26 +0400
Subject: [PATCH 04/20] fixes for json/list/text sample reads and byte-buffer Samples
---
hub/core/chunk_engine.py | 10 ++++----
hub/core/sample.py | 50 +++++++++++++++++++++++++++-------------
hub/core/serialize.py | 3 ++-
hub/util/json.py | 9 ++++----
4 files changed, 47 insertions(+), 25 deletions(-)
diff --git a/hub/core/chunk_engine.py b/hub/core/chunk_engine.py
index db8647b82c..3cab4e8b15 100644
--- a/hub/core/chunk_engine.py
+++ b/hub/core/chunk_engine.py
@@ -1,5 +1,6 @@
import hub
import warnings
+import json
import numpy as np
from math import ceil
from typing import Any, Dict, Optional, Sequence, Union, Tuple, List, Set
@@ -690,25 +691,26 @@ def read_sample_from_chunk(
htype = self.tensor_meta.htype
if htype in ("json", "text", "list"):
+ sb, eb = chunk.byte_positions_encoder[local_sample_index]
if chunk_compression:
decompressed = chunk.decompressed_data(compression=chunk_compression)
- sb, eb = chunk.byte_positions_encoder[local_sample_index]
buffer = decompressed[sb:eb]
elif sample_compression:
buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression)
else:
buffer = buffer[sb:eb]
+ buffer = bytes(buffer)
if htype == "json":
arr = np.empty(1, dtype=object)
- arr[0] = json.loads(str.encode(buffer), cls=HubJsonDecoder)
+ arr[0] = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
return arr
elif htype == "list":
- lst = json.loads(str.encode(buffer), cls=HubJsonDecoder)
+ lst = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
arr = np.empty(len(lst), dtype=object)
arr[:] = lst
return arr
elif htype == "text":
- arr = np.array(str.encode(buffer)).reshape(
+ arr = np.array(bytes.decode(buffer)).reshape(
1,
)
return arr
diff --git a/hub/core/sample.py b/hub/core/sample.py
index 1b1c6d196d..27aad3f065 100644
--- a/hub/core/sample.py
+++ b/hub/core/sample.py
@@ -22,6 +22,8 @@ def __init__(
self,
path: str = None,
array: np.ndarray = None,
+ buffer: Union[bytes, memoryview] = None,
+ compression: str = None,
verify: bool = False,
):
"""Represents a single sample for a tensor. Provides all important meta information in one place.
@@ -34,6 +36,8 @@ def __init__(
path (str): Path to a sample stored on the local file system that represents a single sample. If `path` is provided, `array` should not be.
Implicitly makes `self.is_lazy == True`.
array (np.ndarray): Array that represents a single sample. If `array` is provided, `path` should not be. Implicitly makes `self.is_lazy == False`.
+ buffer: (bytes): Byte buffer that represents a single sample. If compressed, `compression` argument should be provided.
+ compression (str): Specify in case of byte buffer.
verify (bool): If a path is provided, verifies the sample if True.
Raises:
@@ -47,22 +51,31 @@ def __init__(
self._uncompressed_bytes = None
self._convert_grayscale = False
+ self._array = None
+ self._typestr = None
+ self._shape = None
+ self.path = None
+
if path is not None:
self.path = path
- self._array = None
- self._typestr = None
- self._shape = None
self._compression = None
self._verified = False
self._verify = verify
if array is not None:
- self.path = None
self._array = array
self._shape = array.shape
self._typestr = array.__array_interface__["typestr"]
self._compression = None
+ if buffer is not None:
+ self._compression = compression
+ self._buffer = buffer
+ if compression is None:
+ self._uncompressed_bytes = buffer
+ else:
+ self._compressed_bytes[compression] = buffer
+
@property
def dtype(self):
self._read_meta()
@@ -83,7 +96,7 @@ def _read_meta(self, f=None):
if self._shape is not None:
return
if f is None:
- f = self.path
+ f = self.path if self.path else self.compressed_bytes[self._compression]
self._compression, self._shape, self._typestr = read_meta_from_compressed_file(
f
)
@@ -96,6 +109,18 @@ def is_lazy(self) -> bool:
def is_empty(self) -> bool:
return 0 in self.shape
+ def _recompress(self, buffer: bytes, compression: str) -> bytes:
+ if get_compression_type(self._compression) != IMAGE_COMPRESSION:
+ raise ValueError(
+ "Recompression with different format is only supported for images."
+ )
+ img = Image.open(BytesIO(buffer))
+ if img.mode == "1":
+ self._uncompressed_bytes = img.tobytes("raw", "L")
+ else:
+ self._uncompressed_bytes = img.tobytes()
+ return compress_array(self.array, compression)
+
def compressed_bytes(self, compression: str) -> bytes:
"""Returns this sample as compressed bytes.
@@ -135,16 +160,9 @@ def compressed_bytes(self, compression: str) -> bytes:
compressed_bytes, compression=self._compression
)
else:
- if get_compression_type(self._compression) != IMAGE_COMPRESSION:
- raise ValueError(
- "Recompression with different format is only supported for images."
- )
- img = Image.open(BytesIO(compressed_bytes))
- if img.mode == "1":
- self._uncompressed_bytes = img.tobytes("raw", "L")
- else:
- self._uncompressed_bytes = img.tobytes()
- compressed_bytes = compress_array(self.array, compression)
+ compressed_bytes = self._recompress(compressed_bytes, compression)
+ elif self._buffer is not None:
+ compressed_bytes = self._recompress(self._buffer, compression)
else:
compressed_bytes = compress_array(self.array, compression)
self._compressed_bytes[compression] = compressed_bytes
@@ -184,7 +202,7 @@ def array(self) -> np.ndarray:
compr = get_compression(path=self.path)
if get_compression_type(compr) == AUDIO_COMPRESSION:
self._compression = compr
- array = decompress_array(self.path, compression=compr)
+ array = decompress_array(self.path or self._buffer, compression=compr)
if self._shape is None:
self._shape = array.shape
self._typestr = array.__array_interface__["typestr"]
diff --git a/hub/core/serialize.py b/hub/core/serialize.py
index 852369b8cc..c6ceb98557 100644
--- a/hub/core/serialize.py
+++ b/hub/core/serialize.py
@@ -11,6 +11,7 @@
import numpy as np
import struct
import warnings
+import json
def infer_chunk_num_bytes(
@@ -239,7 +240,7 @@ def _serialize_input_sample(
byts = json.dumps(sample, cls=HubJsonEncoder).encode()
if sample_compression:
byts = compress_bytes(byts, compression=sample_compression)
- shape = (len(sample),) if htype == "list" else 1
+ shape = (len(sample),) if htype == "list" else (1,)
return byts, shape
elif htype == "text":
if not isinstance(sample, str):
diff --git a/hub/util/json.py b/hub/util/json.py
index b3d0b0f0eb..66d0b4c3b6 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -129,11 +129,12 @@ def _validate_optional(obj: Any, params: List[str]) -> bool:
def _validate_list(obj: Any, params: List[str]) -> bool:
- assert len(params) == 1
+ assert len(params) <= 1
assert isinstance(obj, (list, tuple))
- for item in obj:
- if not _validate_object(item, params[0]):
- return False
+ if params:
+ for item in obj:
+ if not _validate_object(item, params[0]):
+ return False
return True
From d6fc7fb905d9f19ba42cfa81f7fbb71135eba06f Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:27:44 +0400
Subject: [PATCH 05/20] add basic json htype test and fix _hub_custom_type lookup in decoder
---
hub/api/tests/test_api.py | 17 +++++++++++++++++
hub/util/json.py | 2 +-
2 files changed, 18 insertions(+), 1 deletion(-)
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index 5256d5d8e0..2128d1dda8 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -751,3 +751,20 @@ def test_groups(local_ds_generator):
ds.create_group("g")
ds.g.create_tensor("g")
+
+
+def test_json(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("json", htype="json")
+ items = [
+ {"x": [1, 2, 3], "y": [4, [5, 6]]},
+ {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]}
+ ]
+ with ds:
+ for x in items:
+ ds.json.append(x)
+ ds.json.extend(items)
+ for i in [0, 2]:
+ assert ds.json[i].numpy()[0] == items[0]
+ for i in [1, 3]:
+ assert ds.json[i].numpy()[0] == items[1]
diff --git a/hub/util/json.py b/hub/util/json.py
index 66d0b4c3b6..38ffa973d9 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -198,7 +198,7 @@ def __init__(self, *args, **kwargs):
json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs)
def object_hook(self, obj):
- hub_custom_type = obj.get(hub_custom_type)
+ hub_custom_type = obj.get("_hub_custom_type")
if hub_custom_type == "ndarray":
return np.frombuffer(
base64.b64encode(obj["data"]), dtype=obj["dtype"]
From 3cefdff0b42746146cf0297319f70006267aab17 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:27:56 +0400
Subject: [PATCH 06/20] format
---
hub/api/tests/test_api.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index 2128d1dda8..5dc5155b61 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -758,7 +758,7 @@ def test_json(memory_ds):
ds.create_tensor("json", htype="json")
items = [
{"x": [1, 2, 3], "y": [4, [5, 6]]},
- {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]}
+ {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
]
with ds:
for x in items:
From daf0ec26dafb3a12c69d8475e7432fb19bc36b59 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:36:08 +0400
Subject: [PATCH 07/20] py38 fix: fall back to typing._GenericAlias when GenericMeta is unavailable
---
hub/core/dataset.py | 7 ++++++-
hub/core/meta/tensor_meta.py | 8 +++++++-
hub/util/json.py | 7 ++++++-
3 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/hub/core/dataset.py b/hub/core/dataset.py
index 0890142bda..d51499e922 100644
--- a/hub/core/dataset.py
+++ b/hub/core/dataset.py
@@ -12,7 +12,6 @@
Tuple,
List,
Sequence,
- GenericMeta,
)
from hub.api.info import load_info
@@ -55,6 +54,12 @@
from hub.util.remove_cache import get_base_storage
+try:
+ from typing import GenericMeta
+except ImportError:
+ from typing import _GenericAlias as GenericMeta
+
+
class Dataset:
def __init__(
self,
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index b6097039b3..e20d01989e 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -1,6 +1,6 @@
import hub
from hub.core.fast_forwarding import ffw_tensor_meta
-from typing import Any, Callable, Dict, List, Tuple, GenericMeta
+from typing import Any, Callable, Dict, List, Tuple
import numpy as np
from hub.util.exceptions import (
TensorMetaInvalidHtype,
@@ -30,6 +30,12 @@
from hub.core.meta.meta import Meta
+try:
+ from typing import GenericMeta
+except ImportError:
+ from typing import _GenericAlias as GenericMeta
+
+
class TensorMeta(Meta):
htype: str
dtype: str
diff --git a/hub/util/json.py b/hub/util/json.py
index 38ffa973d9..052e87b0a7 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -1,10 +1,15 @@
-from typing import Any, Dict, List, Optional, Tuple, Union, GenericMeta
+from typing import Any, Dict, List, Optional, Tuple, Union
from numpy import ndarray
import json
import base64
from hub.core.sample import Sample
+try:
+ from typing import GenericMeta
+except ImportError:
+ from typing import _GenericAlias as GenericMeta
+
scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray", "Sample"]
types = ["Any", "Dict", "List", "Optional", "Union"]
From f127d1a7bb083fe7f52543f2b871548b6f257833 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:44:23 +0400
Subject: [PATCH 08/20] test list
---
hub/api/tests/test_api.py | 21 ++++++++++++++++++++-
1 file changed, 20 insertions(+), 1 deletion(-)
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index 5dc5155b61..f55e08871b 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -753,7 +753,7 @@ def test_groups(local_ds_generator):
ds.g.create_tensor("g")
-def test_json(memory_ds):
+def test_json_basic(memory_ds):
ds = memory_ds
ds.create_tensor("json", htype="json")
items = [
@@ -764,7 +764,26 @@ def test_json(memory_ds):
for x in items:
ds.json.append(x)
ds.json.extend(items)
+ assert ds.json.shape == (4, 1)
for i in [0, 2]:
assert ds.json[i].numpy()[0] == items[0]
for i in [1, 3]:
assert ds.json[i].numpy()[0] == items[1]
+
+
+def test_json_list_basic(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("list", htype="list")
+ items = [
+ [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]],
+ [[], [[[]]], {"a": [0.1, 1, "a", []]}],
+ ]
+ with ds:
+ for x in items:
+ ds.list.append(x)
+ ds.list.extend(items)
+ assert ds.list.shape == (4, 3)
+ for i in [0, 2]:
+ assert list(ds.list[i].numpy()) == items[0]
+ for i in [1, 3]:
+ assert list(ds.list[i].numpy()) == items[1]
From 523bcee3cfedf2b417d8e5937dea6c01c1832652 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Thu, 7 Oct 2021 18:51:00 +0400
Subject: [PATCH 09/20] test text
---
hub/api/tests/test_api.py | 25 +++++++++++++++++--------
1 file changed, 17 insertions(+), 8 deletions(-)
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index f55e08871b..49f96c38fe 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -765,10 +765,8 @@ def test_json_basic(memory_ds):
ds.json.append(x)
ds.json.extend(items)
assert ds.json.shape == (4, 1)
- for i in [0, 2]:
- assert ds.json[i].numpy()[0] == items[0]
- for i in [1, 3]:
- assert ds.json[i].numpy()[0] == items[1]
+ for i in range(4):
+ assert ds.json[i].numpy()[0] == items[i % 2]
def test_json_list_basic(memory_ds):
@@ -783,7 +781,18 @@ def test_json_list_basic(memory_ds):
ds.list.append(x)
ds.list.extend(items)
assert ds.list.shape == (4, 3)
- for i in [0, 2]:
- assert list(ds.list[i].numpy()) == items[0]
- for i in [1, 3]:
- assert list(ds.list[i].numpy()) == items[1]
+ for i in range(4):
+ assert list(ds.list[i].numpy()) == items[i % 2]
+
+
+def test_text(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("text", htype="text")
+ items = ["abcd", "efgh", "0123456"]
+ with ds:
+ for x in items:
+ ds.text.append(x)
+ ds.text.extend(items)
+ assert ds.text.shape == (6, 1)
+ for i in range(6):
+ assert ds.text[i].numpy()[0] == items[i % 3]
From 025e2f4994213ca64f559e78dc3dc835829fb9bf Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:20:39 +0400
Subject: [PATCH 10/20] refac
---
hub/api/tests/test_api.py | 45 --------------------------------------
hub/api/tests/test_json.py | 35 +++++++++++++++++++++++++++++
hub/api/tests/test_text.py | 15 +++++++++++++
3 files changed, 50 insertions(+), 45 deletions(-)
create mode 100644 hub/api/tests/test_json.py
create mode 100644 hub/api/tests/test_text.py
diff --git a/hub/api/tests/test_api.py b/hub/api/tests/test_api.py
index 49f96c38fe..5256d5d8e0 100644
--- a/hub/api/tests/test_api.py
+++ b/hub/api/tests/test_api.py
@@ -751,48 +751,3 @@ def test_groups(local_ds_generator):
ds.create_group("g")
ds.g.create_tensor("g")
-
-
-def test_json_basic(memory_ds):
- ds = memory_ds
- ds.create_tensor("json", htype="json")
- items = [
- {"x": [1, 2, 3], "y": [4, [5, 6]]},
- {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
- ]
- with ds:
- for x in items:
- ds.json.append(x)
- ds.json.extend(items)
- assert ds.json.shape == (4, 1)
- for i in range(4):
- assert ds.json[i].numpy()[0] == items[i % 2]
-
-
-def test_json_list_basic(memory_ds):
- ds = memory_ds
- ds.create_tensor("list", htype="list")
- items = [
- [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]],
- [[], [[[]]], {"a": [0.1, 1, "a", []]}],
- ]
- with ds:
- for x in items:
- ds.list.append(x)
- ds.list.extend(items)
- assert ds.list.shape == (4, 3)
- for i in range(4):
- assert list(ds.list[i].numpy()) == items[i % 2]
-
-
-def test_text(memory_ds):
- ds = memory_ds
- ds.create_tensor("text", htype="text")
- items = ["abcd", "efgh", "0123456"]
- with ds:
- for x in items:
- ds.text.append(x)
- ds.text.extend(items)
- assert ds.text.shape == (6, 1)
- for i in range(6):
- assert ds.text[i].numpy()[0] == items[i % 3]
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
new file mode 100644
index 0000000000..b6ee9d1d48
--- /dev/null
+++ b/hub/api/tests/test_json.py
@@ -0,0 +1,35 @@
+import numpy as np
+import hub
+import pytest
+
+
+def test_json_basic(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("json", htype="json")
+ items = [
+ {"x": [1, 2, 3], "y": [4, [5, 6]]},
+ {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
+ ]
+ with ds:
+ for x in items:
+ ds.json.append(x)
+ ds.json.extend(items)
+ assert ds.json.shape == (4, 1)
+ for i in range(4):
+ assert ds.json[i].numpy()[0] == items[i % 2]
+
+
+def test_json_list_basic(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("list", htype="list")
+ items = [
+ [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]],
+ [[], [[[]]], {"a": [0.1, 1, "a", []]}],
+ ]
+ with ds:
+ for x in items:
+ ds.list.append(x)
+ ds.list.extend(items)
+ assert ds.list.shape == (4, 3)
+ for i in range(4):
+ assert list(ds.list[i].numpy()) == items[i % 2]
diff --git a/hub/api/tests/test_text.py b/hub/api/tests/test_text.py
new file mode 100644
index 0000000000..1d3294a53f
--- /dev/null
+++ b/hub/api/tests/test_text.py
@@ -0,0 +1,15 @@
+import hub
+import pytest
+
+
+def test_text(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("text", htype="text")
+ items = ["abcd", "efgh", "0123456"]
+ with ds:
+ for x in items:
+ ds.text.append(x)
+ ds.text.extend(items)
+ assert ds.text.shape == (6, 1)
+ for i in range(6):
+ assert ds.text[i].numpy()[0] == items[i % 3]
From 7c4ee2a55033c6e099fa3ec04e3290a00d579b07 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:23:32 +0400
Subject: [PATCH 11/20] better exception
---
hub/core/meta/tensor_meta.py | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index e20d01989e..e9d6fa9c0d 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -177,7 +177,7 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
compr = htype_overwrite["sample_compression"]
if compr not in (None, UNSPECIFIED):
if get_compression_type(compr) != BYTE_COMPRESSION:
- raise Exception() # TODO
+ raise UnsupportedCompressionError(compr, htype)
elif htype == "audio":
if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
raise UnsupportedCompressionError("Chunk compression", htype=htype)
From d68d448b0bc115c68e31b8505bd9b6de0646c29a Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:35:12 +0400
Subject: [PATCH 12/20] numpy tests
---
hub/api/tests/test_json.py | 17 +++++++++++++++++
hub/util/json.py | 7 ++++---
2 files changed, 21 insertions(+), 3 deletions(-)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index b6ee9d1d48..b1e9083299 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -1,6 +1,7 @@
import numpy as np
import hub
import pytest
+from hub.util.json import JsonValidationError
def test_json_basic(memory_ds):
@@ -19,6 +20,22 @@ def test_json_basic(memory_ds):
assert ds.json[i].numpy()[0] == items[i % 2]
+def test_json_with_numpy(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("json", htype="json")
+ items = [
+ {"x": np.array([1, 2, 3], dtype=np.float32), "y": [4, [5, 6]]},
+ {"x": np.array([1, 2, 3], dtype=np.uint8), "y": [4, {"z": [0.1, 0.2, []]}]},
+ ]
+ with ds:
+ for x in items:
+ ds.json.append(x)
+ ds.json.extend(items)
+ for i in range(4):
+ assert ds.json[i].numpy()[0]["y"] == items[i % 2]["y"]
+ np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"])
+
+
def test_json_list_basic(memory_ds):
ds = memory_ds
ds.create_tensor("list", htype="list")
diff --git a/hub/util/json.py b/hub/util/json.py
index 052e87b0a7..7919baa6ef 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -1,4 +1,5 @@
from typing import Any, Dict, List, Optional, Tuple, Union
+import numpy as np
from numpy import ndarray
import json
import base64
@@ -206,10 +207,10 @@ def object_hook(self, obj):
hub_custom_type = obj.get("_hub_custom_type")
if hub_custom_type == "ndarray":
return np.frombuffer(
- base64.b64encode(obj["data"]), dtype=obj["dtype"]
- ).reshape(obj.shape)
+ base64.b64decode(obj["data"]), dtype=obj["dtype"]
+ ).reshape(obj["shape"])
elif hub_custom_type == "Sample":
return Sample(
- buffer=base64.b64encode(obj["data"]), compression=obj["compression"]
+ buffer=base64.b64decode(obj["data"]), compression=obj["compression"]
)
return obj
From ac62f8627219a6668c5a2a93a725b0a1684dbcf0 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:50:37 +0400
Subject: [PATCH 13/20] hub.read support tests
---
hub/api/tests/test_json.py | 24 ++++++++++++++++++++++++
hub/core/sample.py | 16 ++++++++++++++--
2 files changed, 38 insertions(+), 2 deletions(-)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index b1e9083299..028eac9abc 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -36,6 +36,30 @@ def test_json_with_numpy(memory_ds):
np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"])
+def test_json_with_hub_sample(memory_ds, compressed_image_paths):
+ ds = memory_ds
+ ds.create_tensor("json", htype="json")
+ items = [
+ {
+ "x": [1, 2, 3],
+ "y": [4, [5, 6]],
+ "z": hub.read(compressed_image_paths["jpeg"][0]),
+ },
+ {
+ "x": [1, 2, 3],
+ "y": [4, {"z": [0.1, 0.2, []]}],
+ "z": hub.read(compressed_image_paths["png"][0]),
+ },
+ ]
+ with ds:
+ for x in items:
+ ds.json.append(x)
+ ds.json.extend(items)
+ assert ds.json.shape == (4, 1)
+ for i in range(4):
+ assert ds.json[i].numpy()[0] == items[i % 2]
+
+
def test_json_list_basic(memory_ds):
ds = memory_ds
ds.create_tensor("list", htype="list")
diff --git a/hub/core/sample.py b/hub/core/sample.py
index 27aad3f065..af862f0c1b 100644
--- a/hub/core/sample.py
+++ b/hub/core/sample.py
@@ -44,8 +44,8 @@ def __init__(
ValueError: Cannot create a sample from both a `path` and `array`.
"""
- if (path is None) == (array is None):
- raise ValueError("Must pass either `path` or `array`.")
+ if path is None and array is None and buffer is None:  # identity checks: truth-testing a multi-element ndarray raises ValueError
+ raise ValueError("Must pass one of `path`, `array` or `buffer`.")
self._compressed_bytes = {}
self._uncompressed_bytes = None
@@ -55,6 +55,7 @@ def __init__(
self._typestr = None
self._shape = None
self.path = None
+ self._buffer = None
if path is not None:
self.path = path
@@ -76,6 +77,12 @@ def __init__(
else:
self._compressed_bytes[compression] = buffer
+ @property
+ def buffer(self):
+ if self._buffer is not None:
+ return self._buffer
+ return self.compressed_bytes(self.compression)
+
@property
def dtype(self):
self._read_meta()
@@ -234,5 +241,10 @@ def __repr__(self):
def __array__(self):
return self.array
+ def __eq__(self, other):
+ if self.path is not None and other.path is not None:
+ return self.path == other.path
+ return self.buffer == other.buffer
+
SampleValue = Union[np.ndarray, int, float, bool, Sample]
From 7110eee21643d8593ed2e388c581368241c5c70a Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:55:43 +0400
Subject: [PATCH 14/20] list + numpy test
---
hub/api/tests/test_json.py | 23 +++++++++++++++++++++++
1 file changed, 23 insertions(+)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index 028eac9abc..3e2f1c0a22 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -74,3 +74,26 @@ def test_json_list_basic(memory_ds):
assert ds.list.shape == (4, 3)
for i in range(4):
assert list(ds.list[i].numpy()) == items[i % 2]
+
+
+def test_list_with_numpy(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("list", htype="list")
+ items = [
+ [
+ np.random.random((3, 4)),
+ {"x": [1, 2, 3], "y": [4, [5, 6]]},
+ [[]],
+ [None, 0.1],
+ ],
+ [np.random.randint(0, 10, (4, 5)), [], [[[]]], {"a": [0.1, 1, "a", []]}],
+ ]
+ with ds:
+ for x in items:
+ ds.list.append(x)
+ ds.list.extend(items)
+ assert ds.list.shape == (4, 4)
+ for i in range(4):
+ actual, expected = list(ds.list[i].numpy()), items[i % 2]
+ np.testing.assert_array_equal(actual[0], expected[0])
+ assert actual[1:] == expected[1:]
From ce9c2e87e6e7ddd4663a22b5ad2bf98fc61bcbab Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 02:58:43 +0400
Subject: [PATCH 15/20] list + hub sample
---
hub/api/tests/test_json.py | 27 +++++++++++++++++++++++++++
1 file changed, 27 insertions(+)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index 3e2f1c0a22..f9b88b1ef7 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -97,3 +97,30 @@ def test_list_with_numpy(memory_ds):
actual, expected = list(ds.list[i].numpy()), items[i % 2]
np.testing.assert_array_equal(actual[0], expected[0])
assert actual[1:] == expected[1:]
+
+
+def test_list_with_hub_sample(memory_ds, compressed_image_paths):
+ ds = memory_ds
+ ds.create_tensor("list", htype="list")
+ items = [
+ [
+ {
+ "x": [1, 2, 3],
+ "y": [4, [5, 6, hub.read(compressed_image_paths["jpeg"][0])]],
+ },
+ [[hub.read(compressed_image_paths["jpeg"][1])]],
+ [None, 0.1],
+ ],
+ [
+ [],
+ [[[hub.read(compressed_image_paths["png"][0])]]],
+ {"a": [0.1, 1, "a", hub.read(compressed_image_paths["png"][0])]},
+ ],
+ ]
+ with ds:
+ for x in items:
+ ds.list.append(x)
+ ds.list.extend(items)
+ assert ds.list.shape == (4, 3)
+ for i in range(4):
+ assert list(ds.list[i].numpy()) == items[i % 2]
From f4023754d192ae92bf659989b7a885e726fa27b7 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 03:39:55 +0400
Subject: [PATCH 16/20] schema tests
---
hub/api/tests/test_json.py | 24 ++++++++++++++++++++++++
hub/core/dataset.py | 8 +-------
hub/core/meta/tensor_meta.py | 10 ++--------
hub/util/json.py | 28 +++++++++++++++++-----------
4 files changed, 44 insertions(+), 26 deletions(-)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index f9b88b1ef7..56833e8a7b 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -2,6 +2,7 @@
import hub
import pytest
from hub.util.json import JsonValidationError
+from typing import Any, Optional, Union, List, Dict
def test_json_basic(memory_ds):
@@ -124,3 +125,26 @@ def test_list_with_hub_sample(memory_ds, compressed_image_paths):
assert ds.list.shape == (4, 3)
for i in range(4):
assert list(ds.list[i].numpy()) == items[i % 2]
+
+
+def test_json_with_schema(memory_ds):
+ ds = memory_ds
+ ds.create_tensor("json", htype="json", dtype=List[Dict[str, int]])
+ ds.json.append([{"x": 1, "y": 2}])
+ with pytest.raises(JsonValidationError):
+ ds.json.append({"x": 1, "y": 2})
+ with pytest.raises(JsonValidationError):
+ ds.json.append([{"x": 1, "y": "2"}])
+
+ assert ds.json.numpy()[0, 0] == [{"x": 1, "y": 2}]
+
+ ds.create_tensor("json2", htype="json", dtype=Optional[List[Dict[str, int]]])
+ items = [
+ [{"x": 1, "y": 2}],
+ None,
+ [{"x": 2, "y": 3}],
+ None,
+ ]
+ ds.json2.extend(items)
+ for i in range(len(items)):
+ assert ds.json2[i].numpy()[0] == ds.json2.numpy()[i][0] == items[i]
diff --git a/hub/core/dataset.py b/hub/core/dataset.py
index d51499e922..6e198eed6f 100644
--- a/hub/core/dataset.py
+++ b/hub/core/dataset.py
@@ -54,12 +54,6 @@
from hub.util.remove_cache import get_base_storage
-try:
- from typing import GenericMeta
-except ImportError:
- from typing import _GenericAlias as GenericMeta
-
-
class Dataset:
def __init__(
self,
@@ -223,7 +217,7 @@ def create_tensor(
self,
name: str,
htype: str = DEFAULT_HTYPE,
- dtype: Union[str, np.dtype, type, GenericMeta] = UNSPECIFIED,
+ dtype: Union[str, np.dtype] = UNSPECIFIED,
sample_compression: str = UNSPECIFIED,
chunk_compression: str = UNSPECIFIED,
**kwargs,
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index e9d6fa9c0d..abda33c76f 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -30,12 +30,6 @@
from hub.core.meta.meta import Meta
-try:
- from typing import GenericMeta
-except ImportError:
- from typing import _GenericAlias as GenericMeta
-
-
class TensorMeta(Meta):
htype: str
dtype: str
@@ -243,8 +237,8 @@ def _format_values(htype: str, htype_overwrite: dict):
dtype = htype_overwrite["dtype"]
if dtype is not None:
if htype in ("json", "list"):
- if isinstance(dtype, GenericMeta):
- dtype = str(dtype)
+ if getattr(dtype, "__module__", None) == "typing":
+ htype_overwrite["dtype"] = str(dtype)
else:
htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name
diff --git a/hub/util/json.py b/hub/util/json.py
index 7919baa6ef..d5b2896061 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -6,10 +6,8 @@
from hub.core.sample import Sample
-try:
- from typing import GenericMeta
-except ImportError:
- from typing import _GenericAlias as GenericMeta
+
+Schema = Any
scalars = ["int", "float", "bool", "str", "list", "dict", "ndarray", "Sample"]
@@ -21,14 +19,14 @@ def _norm_type(typ: str):
replacements = {
"numpy.ndarray": "ndarray",
"np.ndarray": "ndarray",
- "hub.core.Sample": "Sample",
+ "hub.core.sample.Sample": "Sample",
"hub.Sample": "Sample",
}
return replacements.get(typ, typ)
-def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]:
- if isinstance(schema, GenericMeta):
+def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]:
+ if getattr(schema, "__module__", None) == "typing":
schema = str(schema)
validate = False
else:
@@ -37,6 +35,7 @@ def _parse_schema(schema: Union[str, GenericMeta]) -> Tuple[str, List[str]]:
if schema in scalars:
return schema, []
+
if "[" not in schema:
return _norm_type(schema), []
@@ -136,7 +135,8 @@ def _validate_optional(obj: Any, params: List[str]) -> bool:
def _validate_list(obj: Any, params: List[str]) -> bool:
assert len(params) <= 1
- assert isinstance(obj, (list, tuple))
+ if not isinstance(obj, (list, tuple)):
+ return False
if params:
for item in obj:
if not _validate_object(item, params[0]):
@@ -146,7 +146,8 @@ def _validate_list(obj: Any, params: List[str]) -> bool:
def _validate_dict(obj: Any, params: List[str]) -> bool:
assert len(params) in (0, 2)
- assert isinstance(obj, dict)
+ if not isinstance(obj, dict):
+ return False
if params:
assert params[0] in (
"str",
@@ -158,7 +159,12 @@ def _validate_dict(obj: Any, params: List[str]) -> bool:
return True
-def _validate_object(obj: Any, schema: Union[str, GenericMeta]) -> bool:
+def _validate_nonetype(obj: Any, params: List[str]) -> bool:
+ assert not params
+ return obj is None
+
+
+def _validate_object(obj: Any, schema: Union[str, Schema]) -> bool:
typ, params = _parse_schema(schema)
if typ in scalars:
return isinstance(obj, eval(typ))
@@ -169,7 +175,7 @@ class JsonValidationError(Exception):
pass
-def validate_json_object(obj: Any, schema: Union[str, GenericMeta]) -> None:
+def validate_json_object(obj: Any, schema: Union[str, Schema]) -> None:
if not _validate_object(obj, schema):
raise JsonValidationError()
From 362c8999244fd358d2a90a0bdea19855f5c01be8 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 04:15:04 +0400
Subject: [PATCH 17/20] .data()
---
hub/api/tests/test_json.py | 18 ++++++++++--------
hub/core/meta/tensor_meta.py | 3 +++
hub/core/tensor.py | 31 ++++++++++++++++++++++++++++---
hub/htype.py | 2 +-
4 files changed, 42 insertions(+), 12 deletions(-)
diff --git a/hub/api/tests/test_json.py b/hub/api/tests/test_json.py
index 56833e8a7b..2bb81f2d66 100644
--- a/hub/api/tests/test_json.py
+++ b/hub/api/tests/test_json.py
@@ -18,7 +18,7 @@ def test_json_basic(memory_ds):
ds.json.extend(items)
assert ds.json.shape == (4, 1)
for i in range(4):
- assert ds.json[i].numpy()[0] == items[i % 2]
+ assert ds.json[i].data() == items[i % 2]
def test_json_with_numpy(memory_ds):
@@ -33,8 +33,8 @@ def test_json_with_numpy(memory_ds):
ds.json.append(x)
ds.json.extend(items)
for i in range(4):
- assert ds.json[i].numpy()[0]["y"] == items[i % 2]["y"]
- np.testing.assert_array_equal(ds.json[i].numpy()[0]["x"], items[i % 2]["x"])
+ assert ds.json[i].data()["y"] == items[i % 2]["y"]
+ np.testing.assert_array_equal(ds.json[i].data()["x"], items[i % 2]["x"])
def test_json_with_hub_sample(memory_ds, compressed_image_paths):
@@ -58,7 +58,7 @@ def test_json_with_hub_sample(memory_ds, compressed_image_paths):
ds.json.extend(items)
assert ds.json.shape == (4, 1)
for i in range(4):
- assert ds.json[i].numpy()[0] == items[i % 2]
+ assert ds.json[i].data() == items[i % 2]
def test_json_list_basic(memory_ds):
@@ -74,7 +74,9 @@ def test_json_list_basic(memory_ds):
ds.list.extend(items)
assert ds.list.shape == (4, 3)
for i in range(4):
- assert list(ds.list[i].numpy()) == items[i % 2]
+ assert ds.list[i].data() == items[i % 2]
+ for i, x in enumerate(ds.list.data()):
+ assert x == items[i % 2]
def test_list_with_numpy(memory_ds):
@@ -95,7 +97,7 @@ def test_list_with_numpy(memory_ds):
ds.list.extend(items)
assert ds.list.shape == (4, 4)
for i in range(4):
- actual, expected = list(ds.list[i].numpy()), items[i % 2]
+ actual, expected = ds.list[i].data(), items[i % 2]
np.testing.assert_array_equal(actual[0], expected[0])
assert actual[1:] == expected[1:]
@@ -124,7 +126,7 @@ def test_list_with_hub_sample(memory_ds, compressed_image_paths):
ds.list.extend(items)
assert ds.list.shape == (4, 3)
for i in range(4):
- assert list(ds.list[i].numpy()) == items[i % 2]
+ assert ds.list[i].data() == items[i % 2]
def test_json_with_schema(memory_ds):
@@ -147,4 +149,4 @@ def test_json_with_schema(memory_ds):
]
ds.json2.extend(items)
for i in range(len(items)):
- assert ds.json2[i].numpy()[0] == ds.json2.numpy()[i][0] == items[i]
+ assert ds.json2[i].data() == ds.json2.data()[i] == items[i]
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index abda33c76f..ec58255859 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -172,6 +172,9 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
if compr not in (None, UNSPECIFIED):
if get_compression_type(compr) != BYTE_COMPRESSION:
raise UnsupportedCompressionError(compr, htype)
+ if htype == "text":
+ if htype_overwrite["dtype"] not in (str, "str"):
+ raise TensorMetaInvalidHtypeOverwriteValue("dtype", htype_overwrite["dtype"], "dtype for tensors with text htype should always be `str`")
elif htype == "audio":
if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
raise UnsupportedCompressionError("Chunk compression", htype=htype)
diff --git a/hub/core/tensor.py b/hub/core/tensor.py
index 21f587b584..99b935dfa6 100644
--- a/hub/core/tensor.py
+++ b/hub/core/tensor.py
@@ -195,15 +195,28 @@ def shape(self) -> Tuple[Optional[int], ...]:
tuple: Tuple where each value is either `None` (if that axis is dynamic) or
an `int` (if that axis is fixed).
"""
+ shape = self.shape_interval.astuple()
+ if self.index.values[0].subscriptable():
+ return shape
+ return shape[1:]
- return self.shape_interval.astuple()
+
+ @property
+ def ndim(self) -> int:
+ return len(self.shape)
@property
def dtype(self) -> np.dtype:
+ if self.htype in ("json", "list"):
+ return np.dtype(str)  # was `self.dtype`: the property re-entered itself until RecursionError
if self.meta.dtype:
return np.dtype(self.meta.dtype)
return None
+ @property
+ def htype(self):
+ return self.meta.htype
+
@property
def shape_interval(self) -> ShapeInterval:
"""Returns a `ShapeInterval` object that describes this tensor's shape more accurately. Length is included.
@@ -406,5 +419,17 @@ def __ior__(self, other):
pass
def data(self) -> Any:
- # TODO
- pass
+ htype = self.htype
+ if htype in ("json", "text"):
+
+ if self.ndim == 1:
+ return self.numpy()[0]
+ else:
+ return [sample[0] for sample in self.numpy(aslist=True)]
+ elif htype == "list":
+ if self.ndim == 1:
+ return list(self.numpy())
+ else:
+ return list(map(list, self.numpy(aslist=True)))
+ else:
+ return self.numpy()
diff --git a/hub/htype.py b/hub/htype.py
index 48bf35254a..ce499ea96c 100644
--- a/hub/htype.py
+++ b/hub/htype.py
@@ -61,7 +61,7 @@
"dtype": "Any",
},
"list": {"dtype": "List"},
- "text": {"dtype": "
Date: Mon, 11 Oct 2021 04:23:25 +0400
Subject: [PATCH 18/20] format
---
hub/core/meta/tensor_meta.py | 6 +++++-
hub/core/tensor.py | 3 +--
hub/util/json.py | 4 ++--
3 files changed, 8 insertions(+), 5 deletions(-)
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index ec58255859..ec28661285 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -174,7 +174,11 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
raise UnsupportedCompressionError(compr, htype)
if htype == "text":
if htype_overwrite["dtype"] not in (str, "str"):
- raise TensorMetaInvalidHtypeOverwriteValue("dtype", htype_overwrite["dtype"], "dtype for tensors with text htype should always be `str`")
+ raise TensorMetaInvalidHtypeOverwriteValue(
+ "dtype",
+ htype_overwrite["dtype"],
+ "dtype for tensors with text htype should always be `str`",
+ )
elif htype == "audio":
if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
raise UnsupportedCompressionError("Chunk compression", htype=htype)
diff --git a/hub/core/tensor.py b/hub/core/tensor.py
index 99b935dfa6..824b8e44e8 100644
--- a/hub/core/tensor.py
+++ b/hub/core/tensor.py
@@ -200,7 +200,6 @@ def shape(self) -> Tuple[Optional[int], ...]:
return shape
return shape[1:]
-
@property
def ndim(self) -> int:
return len(self.shape)
@@ -421,7 +420,7 @@ def __ior__(self, other):
def data(self) -> Any:
htype = self.htype
if htype in ("json", "text"):
-
+
if self.ndim == 1:
return self.numpy()[0]
else:
diff --git a/hub/util/json.py b/hub/util/json.py
index d5b2896061..d2948c04ad 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -4,7 +4,7 @@
import json
import base64
-from hub.core.sample import Sample
+from hub.core.sample import Sample # type: ignore
Schema = Any
@@ -35,7 +35,6 @@ def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]:
if schema in scalars:
return schema, []
-
if "[" not in schema:
return _norm_type(schema), []
@@ -69,6 +68,7 @@ def _parse_schema(schema: Union[str, Schema]) -> Tuple[str, List[str]]:
continue
else:
buff += c
+ raise InvalidJsonSchemaException()
class InvalidJsonSchemaException(Exception):
From 1cb89edff29a80b718acb24199d6de79c99674e9 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 05:55:12 +0400
Subject: [PATCH 19/20] smol fix
---
hub/core/meta/tensor_meta.py | 17 +++++++++--------
1 file changed, 9 insertions(+), 8 deletions(-)
diff --git a/hub/core/meta/tensor_meta.py b/hub/core/meta/tensor_meta.py
index ec28661285..04e76ebae9 100644
--- a/hub/core/meta/tensor_meta.py
+++ b/hub/core/meta/tensor_meta.py
@@ -172,13 +172,6 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
if compr not in (None, UNSPECIFIED):
if get_compression_type(compr) != BYTE_COMPRESSION:
raise UnsupportedCompressionError(compr, htype)
- if htype == "text":
- if htype_overwrite["dtype"] not in (str, "str"):
- raise TensorMetaInvalidHtypeOverwriteValue(
- "dtype",
- htype_overwrite["dtype"],
- "dtype for tensors with text htype should always be `str`",
- )
elif htype == "audio":
if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
raise UnsupportedCompressionError("Chunk compression", htype=htype)
@@ -204,7 +197,7 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict):
if v == UNSPECIFIED:
htype_overwrite[k] = defaults[k]
- if htype in ("json", "list") and not htype_overwrite["dtype"]:
+ if htype in ("json", "list", "text") and not htype_overwrite["dtype"]:
htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"]
@@ -237,6 +230,14 @@ def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict):
"Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
)
+ if htype == "text":
+ if htype_overwrite["dtype"] not in (str, "str"):
+ raise TensorMetaInvalidHtypeOverwriteValue(
+ "dtype",
+ htype_overwrite["dtype"],
+ "dtype for tensors with text htype should always be `str`",
+ )
+
def _format_values(htype: str, htype_overwrite: dict):
"""Replaces values in `htype_overwrite` with consistent types/formats."""
From b32da45a60490b795ab69d4933658502f58fccf7 Mon Sep 17 00:00:00 2001
From: Fariz Rahman
Date: Mon, 11 Oct 2021 06:06:25 +0400
Subject: [PATCH 20/20] mypy
---
hub/util/json.py | 1 +
1 file changed, 1 insertion(+)
diff --git a/hub/util/json.py b/hub/util/json.py
index d2948c04ad..2630996d18 100644
--- a/hub/util/json.py
+++ b/hub/util/json.py
@@ -131,6 +131,7 @@ def _validate_optional(obj: Any, params: List[str]) -> bool:
return True
if params:
return _validate_object(obj, params[0])
+ return True
def _validate_list(obj: Any, params: List[str]) -> bool: