
Commit

Merge branch 'main' of github.com:activeloopai/deeplake into ingest_standartization
ProgerDav committed Feb 21, 2023
2 parents 757f5cf + 6ff1cf6 commit 0b70e8d
Showing 58 changed files with 1,567 additions and 680 deletions.
5 changes: 2 additions & 3 deletions deeplake/__init__.py
@@ -1,5 +1,5 @@
r"""
The deeplake package provides a database which stores data as compressed chunked arrays that can be stored anywhere and
The deeplake package provides a database which stores data as compressed chunked arrays that can be stored anywhere and
later streamed to deep learning models.
"""

@@ -33,7 +33,6 @@

compressions = list(SUPPORTED_COMPRESSIONS)
htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = api_dataset.list
exists = api_dataset.exists
load = api_dataset.load
empty = api_dataset.empty
@@ -80,7 +79,7 @@
]


__version__ = "3.2.7"
__version__ = "3.3.0"
warn_if_update_required(__version__)
__encoded_version__ = np.array(__version__)
config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}
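Given the removal of `list = api_dataset.list` above (the matching `list` staticmethod is also deleted from deeplake/api/dataset.py below), `deeplake.list` no longer appears to be exported as of 3.3.0. A minimal, hypothetical compatibility guard for downstream code; the fallback and the `org_id` value are placeholders, not anything prescribed by this commit:

```python
import deeplake

# Hypothetical guard: `deeplake.list` was exported through 3.2.x but is removed
# by this commit, so downstream code should not call it unconditionally.
if hasattr(deeplake, "list"):
    datasets = deeplake.list(org_id="my_org")  # signature per the removed staticmethod
else:
    datasets = []  # no drop-in replacement is shown in this diff
print(datasets)
```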
46 changes: 14 additions & 32 deletions deeplake/api/dataset.py
@@ -942,7 +942,7 @@ def process_meta(k):
if tensors:
assert metas
len_keys = len(keys)
if num_workers == 0:
if num_workers <= 1:
keys = [keys]
else:
keys = [keys[i::num_workers] for i in range(num_workers)]
@@ -1034,6 +1034,7 @@ def ingest_coco(
dest_creds: Optional[Dict] = None,
inspect_limit: int = 1000000,
progressbar: bool = True,
shuffle: bool = False,
num_workers: int = 0,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
@@ -1085,6 +1086,7 @@ def ingest_coco(
dest_creds (Optional[Dict]): A dictionary containing credentials used to access the destination path of the dataset.
inspect_limit (int): The maximum number of samples to inspect in the annotations json, in order to generate the set of COCO annotation keys. Set to ``1000000`` by default.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Set to ``False`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
@@ -1134,11 +1136,7 @@ def ingest_coco(

structure.create_missing(ds)

unstructured.structure(
ds,
progressbar,
num_workers,
)
unstructured.structure(ds, progressbar, num_workers, shuffle)

return ds

@@ -1157,6 +1155,7 @@ def ingest_yolo(
image_creds_key: Optional[str] = None,
inspect_limit: int = 1000,
progressbar: bool = True,
shuffle: bool = False,
num_workers: int = 0,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
@@ -1202,6 +1201,7 @@ def ingest_yolo(
image_creds_key (Optional[str]): creds_key for linked tensors, applicable if the htype for the images tensor is specified as 'link[image]' in the 'image_params' input.
inspect_limit (int): The maximum number of annotations to inspect, in order to infer whether they are bounding boxes or polygons. This input is ignored if the htype is specified in the 'coordinates_params'.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Set to ``False`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
@@ -1264,6 +1264,7 @@ def ingest_yolo(
ds,
progressbar,
num_workers,
shuffle,
)

return ds
@@ -1278,6 +1279,7 @@ def ingest_classification(
progressbar: bool = True,
summary: bool = True,
num_workers: int = 0,
shuffle: bool = True,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
@@ -1297,6 +1299,7 @@ def ingest_classification(
progressbar (bool): Enables or disables ingestion progress bar. Defaults to ``True``.
summary (bool): If ``True``, a summary of skipped files will be printed after completion. Defaults to ``True``.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``.
token (Optional[str]): The token to use for accessing the dataset.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function see :func:`deeplake.empty`.
@@ -1411,6 +1414,8 @@ def ingest_classification(
image_tensor_args=image_params,
label_tensor_args=label_params,
num_workers=num_workers,
shuffle=shuffle,
image_tensor_args={"sample_compression": images_compression},
)

return ds # type: ignore
@@ -1426,6 +1431,7 @@ def ingest_kaggle(
kaggle_credentials: Optional[dict] = None,
progressbar: bool = True,
summary: bool = True,
shuffle: bool = True,
**dataset_kwargs,
) -> Dataset:
"""Download and ingest a kaggle dataset and store it as a structured dataset to destination.
@@ -1444,6 +1450,7 @@ def ingest_kaggle(
kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If ``None``, environment variables/the kaggle.json file will be used if available.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
summary (bool): Generates ingestion summary. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.dataset`.
Returns:
@@ -1488,6 +1495,7 @@ def ingest_kaggle(
dest_creds=dest_creds,
progressbar=progressbar,
summary=summary,
shuffle=shuffle,
**dataset_kwargs,
)

@@ -1551,29 +1559,3 @@ def ingest_dataframe(
structured.fill_dataset(ds, progressbar) # type: ignore

return ds # type: ignore

@staticmethod
def list(
org_id: str = "",
token: Optional[str] = None,
) -> None:
"""List all available Deep Lake cloud datasets.
Args:
org_id (str): Specify organization id. If not given,
returns a list of all datasets that can be accessed, regardless of what workspace they are in.
Otherwise, lists all datasets in the given organization.
token (str, optional): Activeloop token, used for fetching credentials for Deep Lake datasets. This is optional, tokens are normally autogenerated.
Returns:
List: List of dataset names.
"""

deeplake_reporter.feature_report(
feature_name="list",
parameters={"org_id": org_id},
)

client = DeepLakeBackendClient(token=token)
datasets = client.get_datasets(workspace=org_id)
return datasets
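
The hunks above add a `shuffle` flag to `ingest_coco`, `ingest_yolo`, `ingest_classification`, and `ingest_kaggle` (defaulting to `False` for the annotation-based ingestors and `True` for the folder-by-class ones). A rough usage sketch follows; the source and destination paths are placeholders, and the leading positional parameter names are assumed because they fall outside the hunks shown:

```python
import deeplake

# Annotation-based ingestion: shuffle defaults to False, so enable it explicitly if needed.
ds = deeplake.ingest_coco(
    "./coco/images",          # placeholder images directory
    ["./annotations.json"],   # placeholder annotation file list
    dest="./ingested_coco",   # placeholder destination
    shuffle=True,             # new in this commit
    num_workers=4,
    progressbar=True,
)

# Folder-by-class ingestion: shuffle defaults to True because such data is highly
# non-random (see the docstring added above); pass shuffle=False to keep file order.
ds2 = deeplake.ingest_classification(
    "./images_by_class",            # placeholder source directory
    "./ingested_classification",    # placeholder destination
    shuffle=False,
)
```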
63 changes: 63 additions & 0 deletions deeplake/api/tests/test_api.py
@@ -2131,6 +2131,7 @@ def token_permission_error_check(

with pytest.raises(TokenPermissionError):
ds = deeplake.load("hub://activeloop/fake-path")
runner.invoke(logout)


def invalid_token_exception_check():
@@ -2154,6 +2155,7 @@ def dataset_handler_error_check(runner, username, password):
result = runner.invoke(login, f"-u {username} -p {password}")
with pytest.raises(DatasetHandlerError):
ds = deeplake.load(f"hub://{username}/wrong-path")
runner.invoke(logout)


def test_hub_related_permission_exceptions(
@@ -2376,3 +2378,64 @@ def test_pickle_bug(local_ds):
ds["__temp_123"].numpy()

assert ds._temp_tensors == []


def test_max_view(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")

ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

expected = {
"abc": [[1], [2], [3], [4]],
"xyz": [[1], [2], [3], []],
"pqr": [[1], [2], [], []],
}

for i, sample in enumerate(ds.max_view):
np.testing.assert_array_equal(sample.abc.numpy(), expected["abc"][i])


def test_min_view(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")

ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

expected = {
"abc": [[1], [2]],
"xyz": [[1], [2]],
"pqr": [[1], [2]],
}

for i, sample in enumerate(ds.min_view):
np.testing.assert_array_equal(sample.abc.numpy(), expected["abc"][i])


def test_extend_with_empty_tensor(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.abc.extend([None, None, None])

ds.create_tensor("xyz")
ds.xyz.extend(ds.abc)
ds.xyz.extend([ds.abc[0], ds.abc[1]])

with pytest.raises(EmptyTensorError):
ds.xyz.numpy()

ds.xyz.append(1)

data = ds.xyz.numpy(aslist=True)
expected = [[]] * 5 + [1]

for i in range(len(data)):
np.testing.assert_array_equal(data[i], expected[i])
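
The `test_max_view` and `test_min_view` additions above pin down how views over unequal-length tensors behave: `max_view` spans the longest tensor and reports empty samples for the missing rows of shorter tensors, while `min_view` stops at the shortest tensor. A small sketch of the same behaviour outside pytest; the `mem://` path is a placeholder and the assertions restate what the expected dicts above encode:

```python
import deeplake

ds = deeplake.empty("mem://view_demo")  # placeholder in-memory path
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")
ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

# max_view iterates over all 4 rows; xyz and pqr report empty samples at the end.
assert len(list(ds.max_view)) == 4

# min_view truncates every tensor to the 2 rows that all tensors share.
assert len(list(ds.min_view)) == 2
```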
81 changes: 81 additions & 0 deletions deeplake/api/tests/test_downsample.py
@@ -1,5 +1,6 @@
import deeplake
import numpy as np
import pytest


def test_downsample(local_ds_generator, cat_path):
@@ -105,3 +106,83 @@ def test_downsample_tiled(memory_ds):
for i in range(4):
x = i * 5472
ds.image[0][0:3648, x : x + 5472, :] = arr


@pytest.mark.parametrize(
"sample_compression", [None]
) # add back apng when bug is fixed
def test_downsample_binary_mask(memory_ds, sample_compression):
with memory_ds as ds:
ds.create_tensor(
"mask",
htype="binary_mask",
sample_compression=sample_compression,
downsampling=(2, 5),
)
binary_masks = [
np.ones((1000, 1000, 3), dtype=bool),
np.zeros((1000, 1000, 3), dtype=bool),
]
ds.mask.extend(binary_masks)

for i in range(1, 6):
tensor = ds[f"_mask_downsampled_{2 ** i}"]
assert len(tensor) == 2
for j in range(2):
np.testing.assert_array_equal(
tensor[j], binary_masks[j][:: 2**i, :: 2**i, :]
)


def test_downsample_group_bug(memory_ds):
with memory_ds as ds:
ds.create_group("stuff")
ds.create_tensor(
"mask", htype="binary_mask", sample_compression="lz4", downsampling=(2, 2)
)
ds.create_tensor(
"stuff/mask",
htype="binary_mask",
sample_compression="lz4",
downsampling=(2, 2),
)


def test_downsample_image(memory_ds):
with memory_ds as ds:
ds.create_tensor(
"image", htype="image", sample_compression="jpeg", downsampling=(2, 2)
)
ds.image.append(np.ones((100, 100, 3), dtype="uint8"))
ds.image.append(np.ones((100, 100, 1), dtype="uint8"))
ds.image.append(np.ones((100, 100, 0), dtype="uint8"))
ds.image.append(np.ones((100, 0, 3), dtype="uint8"))
ds.image.append(np.ones((100, 100), dtype="uint8"))

target_shapes = {
"image": [
(100, 100, 3),
(100, 100, 1),
(100, 100, 0),
(100, 0, 3),
(100, 100, 1),
],
"_image_downsampled_2": [
(50, 50, 3),
(50, 50, 1),
(0, 0, 0),
(0, 0, 0),
(50, 50, 1),
],
"_image_downsampled_4": [
(25, 25, 3),
(25, 25, 1),
(0, 0, 0),
(0, 0, 0),
(25, 25, 1),
],
}
for tensor, target_shape in target_shapes.items():
shapes = [ds[tensor][i].shape for i in range(5)]
numpy_shapes = [ds[tensor][i].numpy().shape for i in range(5)]
assert shapes == target_shape == numpy_shapes
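
The downsample tests above rely on the per-tensor `downsampling=(factor, number_of_levels)` option and read results back from hidden tensors named `_<tensor>_downsampled_<factor>`. A brief sketch of that pattern, mirroring `test_downsample_image`; the `mem://` path is a placeholder and the expected shapes are taken from the `target_shapes` dict above:

```python
import deeplake
import numpy as np

ds = deeplake.empty("mem://downsample_demo")  # placeholder in-memory path

# downsampling=(2, 2): halve each spatial dimension per level, keep two levels.
ds.create_tensor("image", htype="image", sample_compression="jpeg", downsampling=(2, 2))
ds.image.append(np.ones((100, 100, 3), dtype="uint8"))

# The reduced copies are stored in hidden tensors, per the tests above.
print(ds["_image_downsampled_2"][0].shape)  # expected (50, 50, 3)
print(ds["_image_downsampled_4"][0].shape)  # expected (25, 25, 3)
```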
2 changes: 1 addition & 1 deletion deeplake/api/tests/test_link.py
@@ -27,7 +27,7 @@ def test_complex_htype_parsing():
is_sequence, is_link, htype = parse_complex_htype("sequence")
assert is_sequence
assert not is_link
assert htype == "generic"
assert htype is None

with pytest.raises(ValueError):
is_sequence, is_link, htype = parse_complex_htype("sequence[link]")
25 changes: 25 additions & 0 deletions deeplake/api/tests/test_rechunk.py
@@ -269,3 +269,28 @@ def test_rechunk_cloud_link(local_ds_generator):
assert sample_2.path == s3_path_1, sample_2.creds_key == "my_s3_key_1"

assert ds.abc.chunk_engine.creds_encoder.num_samples == 3


@deeplake.compute
def add_samples(sample_in, samples_out):
samples_out.labels.append(np.ones((200,), dtype=np.int64))


def test_rechunk_vc_bug(local_ds):
ds = local_ds
with ds:
ds.create_tensor("labels", dtype="int64")
add_samples().eval(list(range(200)), ds, num_workers=2)
ds.commit()
add_samples().eval(list(range(100)), ds, num_workers=2)
ds.commit()
ds.checkout("alt", True)
ds.labels[8] = ds.labels[8].numpy()
ds.commit()
np.testing.assert_array_equal(
ds.labels.numpy(), np.ones((300, 200), dtype=np.int64)
)
ds.checkout("main")
np.testing.assert_array_equal(
ds.labels.numpy(), np.ones((300, 200), dtype=np.int64)
)