
Merge pull request #1214 from activeloopai/fr_json
Json, List, Text
verbii-1 authored Oct 12, 2021
2 parents e085e02 + b32da45 commit 872ce9f
Showing 12 changed files with 585 additions and 46 deletions.
2 changes: 1 addition & 1 deletion hub/__init__.py
@@ -27,7 +27,7 @@
from .htype import HTYPE_CONFIGURATIONS

compressions = list(SUPPORTED_COMPRESSIONS)
-htypes = list(HTYPE_CONFIGURATIONS.keys())
+htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = dataset.list
load = dataset.load
empty = dataset.empty
13 changes: 8 additions & 5 deletions hub/api/tests/test_api.py
@@ -694,14 +694,17 @@ def test_compressions_list():

def test_htypes_list():
    assert hub.htypes == [
-        "generic",
-        "image",
-        "class_label",
-        "bbox",
        "audio",
-        "video",
+        "bbox",
        "binary_mask",
+        "class_label",
+        "generic",
+        "image",
+        "json",
+        "list",
        "segment_mask",
+        "text",
+        "video",
    ]


152 changes: 152 additions & 0 deletions hub/api/tests/test_json.py
@@ -0,0 +1,152 @@
import numpy as np
import hub
import pytest
from hub.util.json import JsonValidationError
from typing import Any, Optional, Union, List, Dict


def test_json_basic(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {"x": [1, 2, 3], "y": [4, [5, 6]]},
        {"x": [1, 2, 3], "y": [4, {"z": [0.1, 0.2, []]}]},
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    assert ds.json.shape == (4, 1)
    for i in range(4):
        assert ds.json[i].data() == items[i % 2]


def test_json_with_numpy(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {"x": np.array([1, 2, 3], dtype=np.float32), "y": [4, [5, 6]]},
        {"x": np.array([1, 2, 3], dtype=np.uint8), "y": [4, {"z": [0.1, 0.2, []]}]},
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    for i in range(4):
        assert ds.json[i].data()["y"] == items[i % 2]["y"]
        np.testing.assert_array_equal(ds.json[i].data()["x"], items[i % 2]["x"])


def test_json_with_hub_sample(memory_ds, compressed_image_paths):
    ds = memory_ds
    ds.create_tensor("json", htype="json")
    items = [
        {
            "x": [1, 2, 3],
            "y": [4, [5, 6]],
            "z": hub.read(compressed_image_paths["jpeg"][0]),
        },
        {
            "x": [1, 2, 3],
            "y": [4, {"z": [0.1, 0.2, []]}],
            "z": hub.read(compressed_image_paths["png"][0]),
        },
    ]
    with ds:
        for x in items:
            ds.json.append(x)
        ds.json.extend(items)
    assert ds.json.shape == (4, 1)
    for i in range(4):
        assert ds.json[i].data() == items[i % 2]


def test_json_list_basic(memory_ds):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [{"x": [1, 2, 3], "y": [4, [5, 6]]}, [[]], [None, 0.1]],
        [[], [[[]]], {"a": [0.1, 1, "a", []]}],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 3)
    for i in range(4):
        assert ds.list[i].data() == items[i % 2]
    for i, x in enumerate(ds.list.data()):
        assert x == items[i % 2]


def test_list_with_numpy(memory_ds):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [
            np.random.random((3, 4)),
            {"x": [1, 2, 3], "y": [4, [5, 6]]},
            [[]],
            [None, 0.1],
        ],
        [np.random.randint(0, 10, (4, 5)), [], [[[]]], {"a": [0.1, 1, "a", []]}],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 4)
    for i in range(4):
        actual, expected = ds.list[i].data(), items[i % 2]
        np.testing.assert_array_equal(actual[0], expected[0])
        assert actual[1:] == expected[1:]


def test_list_with_hub_sample(memory_ds, compressed_image_paths):
    ds = memory_ds
    ds.create_tensor("list", htype="list")
    items = [
        [
            {
                "x": [1, 2, 3],
                "y": [4, [5, 6, hub.read(compressed_image_paths["jpeg"][0])]],
            },
            [[hub.read(compressed_image_paths["jpeg"][1])]],
            [None, 0.1],
        ],
        [
            [],
            [[[hub.read(compressed_image_paths["png"][0])]]],
            {"a": [0.1, 1, "a", hub.read(compressed_image_paths["png"][0])]},
        ],
    ]
    with ds:
        for x in items:
            ds.list.append(x)
        ds.list.extend(items)
    assert ds.list.shape == (4, 3)
    for i in range(4):
        assert ds.list[i].data() == items[i % 2]


def test_json_with_schema(memory_ds):
    ds = memory_ds
    ds.create_tensor("json", htype="json", dtype=List[Dict[str, int]])
    ds.json.append([{"x": 1, "y": 2}])
    with pytest.raises(JsonValidationError):
        ds.json.append({"x": 1, "y": 2})
    with pytest.raises(JsonValidationError):
        ds.json.append([{"x": 1, "y": "2"}])

    assert ds.json.numpy()[0, 0] == [{"x": 1, "y": 2}]

    ds.create_tensor("json2", htype="json", dtype=Optional[List[Dict[str, int]]])
    items = [
        [{"x": 1, "y": 2}],
        None,
        [{"x": 2, "y": 3}],
        None,
    ]
    ds.json2.extend(items)
    for i in range(len(items)):
        assert ds.json2[i].data() == ds.json2.data()[i] == items[i]
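
These tests double as usage documentation for the new htype. A minimal end-to-end sketch of the json htype follows; hub.empty with a mem:// path is an assumed stand-in for the memory_ds fixture used above:

from typing import Dict, List, Optional

import hub

ds = hub.empty("mem://json-demo")  # assumed equivalent of the memory_ds fixture

# Free-form JSON: any JSON-serializable object per sample.
ds.create_tensor("meta", htype="json")
ds.meta.append({"x": [1, 2, 3], "y": [4, [5, 6]]})

# Schema-constrained JSON: pass a typing annotation as dtype;
# non-conforming samples raise JsonValidationError.
ds.create_tensor("boxes", htype="json", dtype=Optional[List[Dict[str, int]]])
ds.boxes.append([{"x": 1, "y": 2}])
ds.boxes.append(None)  # allowed because the schema is Optional

print(ds.meta[0].data())   # {'x': [1, 2, 3], 'y': [4, [5, 6]]}
print(ds.boxes[0].data())  # [{'x': 1, 'y': 2}]
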
15 changes: 15 additions & 0 deletions hub/api/tests/test_text.py
@@ -0,0 +1,15 @@
import hub
import pytest


def test_text(memory_ds):
    ds = memory_ds
    ds.create_tensor("text", htype="text")
    items = ["abcd", "efgh", "0123456"]
    with ds:
        for x in items:
            ds.text.append(x)
        ds.text.extend(items)
    assert ds.text.shape == (6, 1)
    for i in range(6):
        assert ds.text[i].numpy()[0] == items[i % 3]
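
And the same kind of sketch for the text htype (again assuming hub.empty with a mem:// path in place of the memory_ds fixture):

import hub

ds = hub.empty("mem://text-demo")  # assumed stand-in for memory_ds
ds.create_tensor("captions", htype="text")
ds.captions.append("abcd")
ds.captions.extend(["efgh", "0123456"])

# Each sample comes back as a 1-element array of str, hence the trailing 1.
assert ds.captions.shape == (3, 1)
print(ds.captions[0].numpy()[0])  # abcd
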
36 changes: 33 additions & 3 deletions hub/core/chunk_engine.py
@@ -1,5 +1,6 @@
import hub
import warnings
+import json
import numpy as np
from math import ceil
from typing import Any, Dict, Optional, Sequence, Union, Tuple, List, Set
@@ -8,7 +9,7 @@
from hub.core.version_control.commit_node import CommitNode  # type: ignore
from hub.core.version_control.commit_chunk_set import CommitChunkSet  # type: ignore
from hub.core.fast_forwarding import ffw_chunk_id_encoder
-from hub.core.compression import decompress_array
+from hub.core.compression import decompress_array, decompress_bytes
from hub.core.sample import Sample, SampleValue  # type: ignore
from hub.core.meta.tensor_meta import TensorMeta
from hub.core.index.index import Index
@@ -30,6 +31,7 @@
    CorruptedMetaError,
    DynamicTensorNumpyError,
)
+from hub.util.json import HubJsonDecoder
from hub.util.casting import get_dtype, intelligent_cast
from hub.util.version_control import auto_checkout, commit, commit_chunk_set_exists

@@ -684,17 +686,45 @@ def read_sample_from_chunk(
            return np.zeros(shape, dtype=dtype)

        chunk_compression = self.tensor_meta.chunk_compression
+        sample_compression = self.tensor_meta.sample_compression
+
+        htype = self.tensor_meta.htype
+
+        if htype in ("json", "text", "list"):
+            sb, eb = chunk.byte_positions_encoder[local_sample_index]
+            if chunk_compression:
+                decompressed = chunk.decompressed_data(compression=chunk_compression)
+                buffer = decompressed[sb:eb]
+            elif sample_compression:
+                buffer = decompress_bytes(buffer[sb:eb], compression=sample_compression)
+            else:
+                buffer = buffer[sb:eb]
+            buffer = bytes(buffer)
+            if htype == "json":
+                arr = np.empty(1, dtype=object)
+                arr[0] = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
+                return arr
+            elif htype == "list":
+                lst = json.loads(bytes.decode(buffer), cls=HubJsonDecoder)
+                arr = np.empty(len(lst), dtype=object)
+                arr[:] = lst
+                return arr
+            elif htype == "text":
+                arr = np.array(bytes.decode(buffer)).reshape(
+                    1,
+                )
+                return arr

        if chunk_compression:
            if get_compression_type(chunk_compression) == BYTE_COMPRESSION:
                decompressed = chunk.decompressed_data(compression=chunk_compression)
                sb, eb = chunk.byte_positions_encoder[local_sample_index]
                return np.frombuffer(decompressed[sb:eb], dtype=dtype).reshape(shape)
            else:
                return chunk.decompressed_samples()[local_sample_index]

        sb, eb = chunk.byte_positions_encoder[local_sample_index]
        buffer = buffer[sb:eb]

-        sample_compression = self.tensor_meta.sample_compression
        if sample_compression:
            sample = decompress_array(
                buffer, shape, dtype=dtype, compression=sample_compression
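
In short, the new read path slices the sample's bytes out of the chunk (decompressing with the byte-level codec if one is set) and then rebuilds a Python object rather than a numeric array. A standalone sketch of that decode step, using the stdlib json module in place of HubJsonDecoder (which, per the tests above, additionally revives nested numpy arrays and hub.read samples):

import json
import numpy as np

def read_json_sample(buffer: bytes) -> np.ndarray:
    # One object-dtype slot per sample; the payload is parsed JSON.
    arr = np.empty(1, dtype=object)
    arr[0] = json.loads(buffer.decode())
    return arr

def read_list_sample(buffer: bytes) -> np.ndarray:
    # One object-dtype slot per top-level list element.
    lst = json.loads(buffer.decode())
    arr = np.empty(len(lst), dtype=object)
    arr[:] = lst
    return arr

def read_text_sample(buffer: bytes) -> np.ndarray:
    # A shape-(1,) array holding the decoded string.
    return np.array(buffer.decode()).reshape(1)

print(read_json_sample(b'{"x": [1, 2, 3]}')[0])     # {'x': [1, 2, 3]}
print(read_list_sample(b'[1, "a", [2, 3]]').shape)  # (3,)
print(read_text_sample(b"abcd")[0])                 # abcd
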
13 changes: 11 additions & 2 deletions hub/core/dataset.py
@@ -4,7 +4,16 @@
import warnings
import posixpath
import numpy as np
-from typing import Any, Callable, Dict, Optional, Union, Tuple, List, Sequence
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    Optional,
+    Union,
+    Tuple,
+    List,
+    Sequence,
+)

from hub.api.info import load_info
from hub.client.log import logger
@@ -209,7 +218,7 @@ def create_tensor(
        self,
        name: str,
        htype: str = DEFAULT_HTYPE,
-        dtype: Union[str, np.dtype, type] = UNSPECIFIED,
+        dtype: Union[str, np.dtype] = UNSPECIFIED,
        sample_compression: str = UNSPECIFIED,
        chunk_compression: str = UNSPECIFIED,
        **kwargs,
53 changes: 40 additions & 13 deletions hub/core/meta/tensor_meta.py
@@ -11,6 +11,7 @@
    UnsupportedCompressionError,
    TensorInvalidSampleShapeError,
)
+from hub.util.json import validate_json_schema
from hub.constants import (
    REQUIRE_USER_SPECIFICATION,
    UNSPECIFIED,
@@ -59,8 +60,8 @@ def __init__(
        _validate_htype_exists(htype)
        _validate_htype_overwrites(htype, kwargs)
        _replace_unspecified_values(htype, kwargs)
-        _validate_required_htype_overwrites(kwargs)
-        _format_values(kwargs)
+        _validate_required_htype_overwrites(htype, kwargs)
+        _format_values(htype, kwargs)

        required_meta = _required_meta_from_htype(htype)
        required_meta.update(kwargs)
@@ -164,7 +165,14 @@ def _validate_htype_overwrites(htype: str, htype_overwrite: dict):
        htype, ["chunk_compression", "sample_compression"]  # type: ignore
    )

-    if htype == "audio":
+    if htype in ("json", "list", "text"):
+        compr = htype_overwrite["chunk_compression"]
+        if compr in (None, UNSPECIFIED):
+            compr = htype_overwrite["sample_compression"]
+        if compr not in (None, UNSPECIFIED):
+            if get_compression_type(compr) != BYTE_COMPRESSION:
+                raise UnsupportedCompressionError(compr, htype)
+    elif htype == "audio":
        if htype_overwrite["chunk_compression"] not in [UNSPECIFIED, None]:
            raise UnsupportedCompressionError("Chunk compression", htype=htype)
        elif htype_overwrite["sample_compression"] == UNSPECIFIED:
@@ -189,8 +197,11 @@ def _replace_unspecified_values(htype: str, htype_overwrite: dict):
        if v == UNSPECIFIED:
            htype_overwrite[k] = defaults[k]

+    if htype in ("json", "list", "text") and not htype_overwrite["dtype"]:
+        htype_overwrite["dtype"] = HTYPE_CONFIGURATIONS[htype]["dtype"]
+

-def _validate_required_htype_overwrites(htype_overwrite: dict):
+def _validate_required_htype_overwrites(htype: str, htype_overwrite: dict):
    """Raises errors if `htype_overwrite` has invalid values."""

    sample_compression = htype_overwrite["sample_compression"]
@@ -209,19 +220,35 @@ def _validate_required_htype_overwrites(htype_overwrite: dict):
    )

    if htype_overwrite["dtype"] is not None:
-        _raise_if_condition(
-            "dtype",
-            htype_overwrite,
-            lambda dtype: not _is_dtype_supported_by_numpy(dtype),
-            "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
-        )
+        if htype in ("json", "list"):
+            validate_json_schema(htype_overwrite["dtype"])
+        else:
+            _raise_if_condition(
+                "dtype",
+                htype_overwrite,
+                lambda dtype: not _is_dtype_supported_by_numpy(dtype),
+                "Datatype must be supported by numpy. Can be an `str`, `np.dtype`, or normal python type (like `bool`, `float`, `int`, etc.). List of available numpy dtypes found here: https://numpy.org/doc/stable/user/basics.types.html",
+            )
+
+    if htype == "text":
+        if htype_overwrite["dtype"] not in (str, "str"):
+            raise TensorMetaInvalidHtypeOverwriteValue(
+                "dtype",
+                htype_overwrite["dtype"],
+                "dtype for tensors with text htype should always be `str`",
+            )


-def _format_values(htype_overwrite: dict):
+def _format_values(htype: str, htype_overwrite: dict):
    """Replaces values in `htype_overwrite` with consistent types/formats."""

-    if htype_overwrite["dtype"] is not None:
-        htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name
+    dtype = htype_overwrite["dtype"]
+    if dtype is not None:
+        if htype in ("json", "list"):
+            if getattr(dtype, "__module__", None) == "typing":
+                htype_overwrite["dtype"] = str(dtype)
+        else:
+            htype_overwrite["dtype"] = np.dtype(htype_overwrite["dtype"]).name

    for key, value in COMPRESSION_ALIASES.items():
        if htype_overwrite.get("sample_compression") == key:
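
The practical effect of the new _format_values branch: for json/list tensors a typing-based schema is persisted as its string form instead of being forced through np.dtype. A quick plain-Python illustration (no Hub required):

import numpy as np
from typing import Dict, List

schema = List[Dict[str, int]]
print(getattr(schema, "__module__", None))  # typing  -> treated as a JSON schema
print(str(schema))                          # typing.List[typing.Dict[str, int]]

# Any other htype still normalizes its dtype through numpy:
print(np.dtype("float32").name)             # float32
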

