
Merge pull request #1214 from activeloopai/fr_json
Json, List, Text
verbii-1 authored Oct 12, 2021
2 parents e085e02 + b32da45 commit 872ce9f
Showing 12 changed files with 585 additions and 46 deletions.
2 changes: 1 addition & 1 deletion hub/__init__.py
@@ -27,7 +27,7 @@
from .htype import HTYPE_CONFIGURATIONS

compressions = list(SUPPORTED_COMPRESSIONS)
-htypes = list(HTYPE_CONFIGURATIONS.keys())
+htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = dataset.list
load = dataset.load
empty = dataset.empty
13 changes: 8 additions & 5 deletions hub/api/tests/test_api.py
@@ -694,14 +694,17 @@ def test_compressions_list():

def test_htypes_list():
    assert hub.htypes == [
-        "generic",
-        "image",
-        "class_label",
-        "bbox",
        "audio",
-        "video",
+        "bbox",
        "binary_mask",
+        "class_label",
+        "generic",
+        "image",
+        "json",
+        "list",
        "segment_mask",
+        "text",
+        "video",
    ]


152 changes: 152 additions & 0 deletions hub/api/tests/test_json.py
@@ -0,0 +1,152 @@
import numpy as np
import hub
import pytest
from hub.util.json import JsonValidationError
from typing import Any, Optional, Union, List, Dict


def test_json_basic(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {"x": [1, 2, 3], "y": [4, [5, 6]]},
        {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    assert ds.json.shape == (4, 1)
    for i in range(4):
        assert ds.json[i].data() == items[i % 2]


def test_json_with_numpy(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {"x": np.array([1, 2, 3], dtype=np.float32), "y": [4, [5, 6]]},
        {"x": np.array([1, 2, 3], dtype=np.uint8), "y": [4, {"z": [0.1, 0.2, []]}]},
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    for i in range(4):
        assert ds.json[i].data()["y"] == items[i % 2]["y"]
        np.testing.assert_array_equal(ds.json[i].data()["x"], items[i % 2]["x"])


def test_json_with_hub_sample(memory_ds, compressed_image_paths):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {
            "x": [1, 2, 3],
            "y": [4, [5, 6]],
            "z": hub.read(compressed_image_paths["jpeg"][0]),
        },
        {
            "x": [1, 2, 3],
            "y": [4, {"z": [0.1, 0.2, []]}],
            "z": hub.read(compressed_image_paths["png"][0]),
        },
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    assert ds.json.shape == (4, 1)
    for i in range(4):
        assert ds.json[i].data() == items[i % 2]


def test_json_list_basic(memory_ds):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]],
        [[], [[[]]], {"a": [0.1, 1, "a", []]}],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 3)
    for i in range(4):
        assert ds.list[i].data() == items[i % 2]
    for i, x in enumerate(ds.list.data()):
        assert x == items[i % 2]


def test_list_with_numpy(memory_ds):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [
            np.random.random((3, 4)),
            {"x": [1, 2, 3], "y": [4, [5, 6]]},
            [[]],
            [None, 0.1],
        ],
        [np.random.randint(0, 10, (4, 5)), [], [[[]]], {"a": [0.1, 1, "a", []]}],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 4)
    for i in range(4):
        actual, expected = ds.list[i].data(), items[i % 2]
        np.testing.assert_array_equal(actual[0], expected[0])
        assert actual[1:] == expected[1:]


def test_list_with_hub_sample(memory_ds, compressed_image_paths):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [
            {
                "x": [1, 2, 3],
                "y": [4, [5, 6, hub.read(compressed_image_paths["jpeg"][0])]],
            },
            [[hub.read(compressed_image_paths["jpeg"][1])]],
            [None, 0.1],
        ],
        [
            [],
            [[[hub.read(compressed_image_paths["png"][0])]]],
            {"a": [0.1, 1, "a", hub.read(compressed_image_paths["png"][0])]},
        ],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 3)
    for i in range(4):
        assert ds.list[i].data() == items[i % 2]


def test_json_with_schema(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json", dtype=List[Dict[str, int]])
    ds.json.append([{"x": 1, "y": 2}])
    with pytest.raises(JsonValidationError):
        ds.json.append({"x": 1, "y": 2})
    with pytest.raises(JsonValidationError):
        ds.json.append([{"x": 1, "y": "2"}])

    assert ds.json.numpy()[0, 0] == [{"x": 1, "y": 2}]

    ds.create_tensor("json2", htype="json", dtype=Optional[List[Dict[str, int]]])
    items = [
        [{"x": 1, "y": 2}],
        None,
        [{"x": 2, "y": 3}],
        None,
    ]
    ds.json2.extend(items)
    for i in range(len(items)):
        assert ds.json2[i].data() == ds.json2.data()[i] == items[i]
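
These tests double as usage documentation for the new htype. A minimal end-to-end sketch of the json htype follows; hub.empty with a mem:// path is an assumed stand-in for the memory_ds fixture used above:

from typing import Dict, List, Optional

import hub

ds = hub.empty("mem://json-demo")  # assumed equivalent of the memory_ds fixture

# Free-form JSON: any JSON-serializable object per sample.
ds.create_tensor("meta", htype="json")
ds.meta.append({"x": [1, 2, 3], "y": [4, [5, 6]]})

# Schema-constrained JSON: pass a typing annotation as dtype;
# non-conforming samples raise JsonValidationError.
ds.create_tensor("boxes", htype="json", dtype=Optional[List[Dict[str, int]]])
ds.boxes.append([{"x": 1, "y": 2}])
ds.boxes.append(None)  # allowed because the schema is Optional

print(ds.meta[0].data())   # {'x': [1, 2, 3], 'y': [4, [5, 6]]}
print(ds.boxes[0].data())  # [{'x': 1, 'y': 2}]
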
15 changes: 15 additions & 0 deletions hub/api/tests/test_text.py
@@ -0,0 +1,15 @@
import hub
import pytest


def test_text(memory_ds):
    ds = memory_ds
    ds.create_tensor("text", htype="text")
    items = ["abcd", "efgh", "0123456"]
    with ds:
        for x in items:
            ds.text.append(x)
        ds.text.extend(items)
    assert ds.text.shape == (6, 1)
    for i in range(6):
        assert ds.text[i].numpy()[0] == items[i % 3]
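
And the same kind of sketch for the text htype (again assuming hub.empty with a mem:// path in place of the memory_ds fixture):

import hub

ds = hub.empty("mem://text-demo")  # assumed stand-in for memory_ds
ds.create_tensor("captions", htype="text")
ds.captions.append("abcd")
ds.captions.extend(["efgh", "0123456"])

# Each sample comes back as a 1-element array of str, hence the trailing 1.
assert ds.captions.shape == (3, 1)
print(ds.captions[0].numpy()[0])  # abcd
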
36 changes: 33 additions & 3 deletions hub/core/chunk_engine.py
@@ -1,5 +1,6 @@
import hub
import warnings
+import json
import numpy as np
from math import ceil
from typing import Any, Dict, Optional, Sequence, Union, Tuple, List, Set
@@ -8,7 +9,7 @@
from hub.core.version_control.commit_node import CommitNode  # type: ignore
from hub.core.version_control.commit_chunk_set import CommitChunkSet  # type: ignore
from hub.core.fast_forwarding import ffw_chunk_id_encoder
-from hub.core.compression import decompress_array
+from hub.core.compression import decompress_array, decompress_bytes
from hub.core.sample import Sample, SampleValue  # type: ignore
from hub.core.meta.tensor_meta import TensorMeta
from hub.core.index.index import Index
@@ -30,6 +31,7 @@
    CorruptedMetaError,
    DynamicTensorNumpyError,
)
+from hub.util.json import HubJsonDecoder
from hub.util.casting import get_dtype, intelligent_cast
from hub.util.version_control import auto_checkout, commit, commit_chunk_set_exists

@@ -684,17 +686,45 @@ def read_sample_from_chunk(
            return np.zeros(shape, dtype=dtype)

        chunk_compression = self.tensor_meta.chunk_compression
+        sample_compression = self.tensor_meta.sample_compression
+
+        htype = self.tensor_meta.htype
+
+        if htype in ("json", "text", "list"):
+            sb, eb = chunk.byte_positions_encoder[local_sample_index]
+            if chunk_compression:
+                decompressed = chunk.decompressed_data(compression=chunk_compression)
+                buffer = decompressed[sb:eb]
+            elif sample_compression:
+                buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression)
+            else:
+                buffer = buffer[sb:eb]
+            buffer = bytes(buffer)
+            if htype == "json":
+                arr = np.empty(1, dtype=object)
+                arr[0] = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
+                return arr
+            elif htype == "list":
+                lst = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
+                arr = np.empty(len(lst), dtype=object)
+                arr[:] = lst
+                return arr
+            elif htype == "text":
+                arr = np.array(bytes.decode(buffer)).reshape(
+                    1,
+                )
+                return arr

        if chunk_compression:
            if get_compression_type(chunk_compression) == BYTE_COMPRESSION:
                decompressed = chunk.decompressed_data(compression=chunk_compression)
                sb, eb = chunk.byte_positions_encoder[local_sample_index]
                return np.frombuffer(decompressed[sb:eb], dtype=dtype).reshape(shape)
            else:
                return chunk.decompressed_samples()[local_sample_index]

        sb, eb = chunk.byte_positions_encoder[local_sample_index]
        buffer = buffer[sb:eb]

-        sample_compression = self.tensor_meta.sample_compression
        if sample_compression:
            sample = decompress_array(
                buffer, shape, dtype=dtype, compression=sample_compression
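
In short, the new read path slices the sample's bytes out of the chunk (decompressing with the byte-level codec if one is set) and then rebuilds a Python object rather than a numeric array. A standalone sketch of that decode step, using the stdlib json module in place of HubJsonDecoder (which, per the tests above, additionally revives nested numpy arrays and hub.read samples):

import json
import numpy as np

def read_json_sample(buffer: bytes) -> np.ndarray:
    # One object-dtype slot per sample; the payload is parsed JSON.
    arr = np.empty(1, dtype=object)
    arr[0] = json.loads(buffer.decode())
    return arr

def read_list_sample(buffer: bytes) -> np.ndarray:
    # One object-dtype slot per top-level list element.
    lst = json.loads(buffer.decode())
    arr = np.empty(len(lst), dtype=object)
    arr[:] = lst
    return arr

def read_text_sample(buffer: bytes) -> np.ndarray:
    # A shape-(1,) array holding the decoded string.
    return np.array(buffer.decode()).reshape(1)

print(read_json_sample(b'{"x": [1, 2, 3]}')[0])     # {'x': [1, 2, 3]}
print(read_list_sample(b'[1, "a", [2, 3]]').shape)  # (3,)
print(read_text_sample(b"abcd")[0])                 # abcd
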
13 changes: 11 additions & 2 deletions hub/core/dataset.py
@@ -4,7 +4,16 @@
import warnings
import posixpath
import numpy as np
-from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Optional,
+    Union,
+    Tuple,
+    List,
+    Sequence,
+)

from hub.api.info import load_info
from hub.client.log import logger
@@ -209,7 +218,7 @@ def create_tensor(
        self,
        name: str,
        htype: str = DEFAULT_HTYPE,
-        dtype: Union[str, np.dtype, type] = UNSPECIFIED,
+        dtype: Union[str, np.dtype] = UNSPECIFIED,
        sample_compression: str = UNSPECIFIED,
        chunk_compression: str = UNSPECIFIED,
        **kwargs,
53 changes: 40 additions & 13 deletions hub/core/meta/tensor_meta.py
@@ -11,6 +11,7 @@
    UnsupportedCompressionError,
    TensorInvalidSampleShapeError,
)
+from hub.util.json import validate_json_schema
from hub.constants import (
    REQUIRE_USER_SPECIFICATION,
    UNSPECIFIED,
@@ -59,8 +60,8 @@ def __init__(
        _validate_htype_exists(htype)
        _validate_htype_overwrites(htype, kwargs)
        _replace_unspecified_values(htype, kwargs)
-        _validate_required_htype_overwrites(kwargs)
-        _format_values(kwargs)
+        _validate_required_htype_overwrites(htype, kwargs)
+        _format_values(htype, kwargs)

        required_meta = _required_meta_from_htype(htype)
        required_meta.update(kwargs)
@@ -164,7 +165,14 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
        htype, ["chunk_compression", "sample_compression"]  # type: ignore
    )

-    if htype == "audio":
+    if htype in ("json", "list", "text"):
+        compr = htype_overwrite["chunk_compression"]
+        if compr in (None, UNSPECIFIED):
+            compr = htype_overwrite["sample_compression"]
+        if compr not in (None, UNSPECIFIED):
+            if get_compression_type(compr) != BYTE_COMPRESSION:
+                raise UnsupportedCompressionError(compr, htype)
+    elif htype == "audio":
        if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
            raise UnsupportedCompressionError("Chunk compression", htype=htype)
        elif htype_overwrite["sample_compression"] == UNSPECIFIED:
@@ -189,8 +197,11 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict):
        if v == UNSPECIFIED:
            htype_overwrite[k] = defaults[k]

+    if htype in ("json", "list", "text") and not htype_overwrite["dtype"]:
+        htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"]
+

-def _validate_required_htype_overwrites(htype_overwrite: dict):
+def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict):
    """Raises errors if `htype_overwrite` has invalid values."""

    sample_compression = htype_overwrite["sample_compression"]
@@ -209,19 +220,35 @@ def _validate_required_htype_overwrites(htype_overwrite: dict):
    )

    if htype_overwrite["dtype"] is not None:
-        _raise_if_condition(
-            "dtype",
-            htype_overwrite,
-            lambda dtype: not _is_dtype_supported_by_numpy(dtype),
-            "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
-        )
+        if htype in ("json", "list"):
+            validate_json_schema(htype_overwrite["dtype"])
+        else:
+            _raise_if_condition(
+                "dtype",
+                htype_overwrite,
+                lambda dtype: not _is_dtype_supported_by_numpy(dtype),
+                "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
+            )
+
+    if htype == "text":
+        if htype_overwrite["dtype"] not in (str, "str"):
+            raise TensorMetaInvalidHtypeOverwriteValue(
+                "dtype",
+                htype_overwrite["dtype"],
+                "dtype for tensors with text htype should always be `str`",
+            )


-def _format_values(htype_overwrite: dict):
+def _format_values(htype: str, htype_overwrite: dict):
    """Replaces values in `htype_overwrite` with consistent types/formats."""

-    if htype_overwrite["dtype"] is not None:
-        htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name
+    dtype = htype_overwrite["dtype"]
+    if dtype is not None:
+        if htype in ("json", "list"):
+            if getattr(dtype, "__module__", None) == "typing":
+                htype_overwrite["dtype"] = str(dtype)
+        else:
+            htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name

    for key, value in COMPRESSION_ALIASES.items():
        if htype_overwrite.get("sample_compression") == key:
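
The practical effect of the new _format_values branch: for json/list tensors a typing-based schema is persisted as its string form instead of being forced through np.dtype. A quick plain-Python illustration (no Hub required):

import numpy as np
from typing import Dict, List

schema = List[Dict[str, int]]
print(getattr(schema, "__module__", None))  # typing  -> treated as a JSON schema
print(str(schema))                          # typing.List[typing.Dict[str, int]]

# Any other htype still normalizes its dtype through numpy:
print(np.dtype("float32").name)             # float32
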

