
Commit

Merge branch 'main' of github.com:activeloopai/deeplake into ingest_standartization
ProgerDav committed Feb 21, 2023
2 parents 757f5cf + 6ff1cf6 commit 0b70e8d
Showing 58 changed files with 1,567 additions and 680 deletions.
5 changes: 2 additions & 3 deletions deeplake/__init__.py
@@ -1,5 +1,5 @@
r"""
The deeplake package provides a database which stores data as compressed chunked arrays that can be stored anywhere and
The deeplake package provides a database which stores data as compressed chunked arrays that can be stored anywhere and
later streamed to deep learning models.
"""

@@ -33,7 +33,6 @@

compressions = list(SUPPORTED_COMPRESSIONS)
htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = api_dataset.list
exists = api_dataset.exists
load = api_dataset.load
empty = api_dataset.empty
@@ -80,7 +79,7 @@
]


__version__ = "3.2.7"
__version__ = "3.3.0"
warn_if_update_required(__version__)
__encoded_version__ = np.array(__version__)
config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}
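Given the removal of `list = api_dataset.list` above (the matching `list` staticmethod is also deleted from deeplake/api/dataset.py below), `deeplake.list` no longer appears to be exported as of 3.3.0. A minimal, hypothetical compatibility guard for downstream code; the fallback and the `org_id` value are placeholders, not anything prescribed by this commit:

```python
import deeplake

# Hypothetical guard: `deeplake.list` was exported through 3.2.x but is removed
# by this commit, so downstream code should not call it unconditionally.
if hasattr(deeplake, "list"):
    datasets = deeplake.list(org_id="my_org")  # signature per the removed staticmethod
else:
    datasets = []  # no drop-in replacement is shown in this diff
print(datasets)
```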
46 changes: 14 additions & 32 deletions deeplake/api/dataset.py
@@ -942,7 +942,7 @@ def process_meta(k):
if tensors:
assert metas
len_keys = len(keys)
if num_workers == 0:
if num_workers <= 1:
keys = [keys]
else:
keys = [keys[i::num_workers] for i in range(num_workers)]
@@ -1034,6 +1034,7 @@ def ingest_coco(
dest_creds: Optional[Dict] = None,
inspect_limit: int = 1000000,
progressbar: bool = True,
shuffle: bool = False,
num_workers: int = 0,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
@@ -1085,6 +1086,7 @@ def ingest_coco(
dest_creds (Optional[Dict]): A dictionary containing credentials used to access the destination path of the dataset.
inspect_limit (int): The maximum number of samples to inspect in the annotations json, in order to generate the set of COCO annotation keys. Set to ``1000000`` by default.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Set to ``False`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
@@ -1134,11 +1136,7 @@ def ingest_coco(

structure.create_missing(ds)

unstructured.structure(
ds,
progressbar,
num_workers,
)
unstructured.structure(ds, progressbar, num_workers, shuffle)

return ds

@@ -1157,6 +1155,7 @@ def ingest_yolo(
image_creds_key: Optional[str] = None,
inspect_limit: int = 1000,
progressbar: bool = True,
shuffle: bool = False,
num_workers: int = 0,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
@@ -1202,6 +1201,7 @@ def ingest_yolo(
image_creds_key (Optional[str]): creds_key for linked tensors, applicable if the htype for the images tensor is specified as 'link[image]' in the 'image_params' input.
inspect_limit (int): The maximum number of annotations to inspect, in order to infer whether they are bounding boxes or polygons. This input is ignored if the htype is specified in the 'coordinates_params'.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Set to ``False`` by default.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
token (Optional[str]): The token to use for accessing the dataset and/or connecting it to Deep Lake.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
@@ -1264,6 +1264,7 @@ def ingest_yolo(
ds,
progressbar,
num_workers,
shuffle,
)

return ds
@@ -1278,6 +1279,7 @@ def ingest_classification(
progressbar: bool = True,
summary: bool = True,
num_workers: int = 0,
shuffle: bool = True,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
@@ -1297,6 +1299,7 @@ def ingest_classification(
progressbar (bool): Enables or disables ingestion progress bar. Defaults to ``True``.
summary (bool): If ``True``, a summary of skipped files will be printed after completion. Defaults to ``True``.
num_workers (int): The number of workers to use for ingestion. Set to ``0`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``.
token (Optional[str]): The token to use for accessing the dataset.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function see :func:`deeplake.empty`.
@@ -1411,6 +1414,8 @@ def ingest_classification(
image_tensor_args=image_params,
label_tensor_args=label_params,
num_workers=num_workers,
shuffle=shuffle,
image_tensor_args={"sample_compression": images_compression},
)

return ds # type: ignore
@@ -1426,6 +1431,7 @@ def ingest_kaggle(
kaggle_credentials: Optional[dict] = None,
progressbar: bool = True,
summary: bool = True,
shuffle: bool = True,
**dataset_kwargs,
) -> Dataset:
"""Download and ingest a kaggle dataset and store it as a structured dataset to destination.
@@ -1444,6 +1450,7 @@ def ingest_kaggle(
kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If ``None``, environment variables/the kaggle.json file will be used if available.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
summary (bool): Generates ingestion summary. Set to ``True`` by default.
shuffle (bool): Shuffles the input data prior to ingestion. Since data arranged in folders by class is highly non-random, shuffling is important in order to produce optimal results when training. Defaults to ``True``.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.dataset`.
Returns:
@@ -1488,6 +1495,7 @@ def ingest_kaggle(
dest_creds=dest_creds,
progressbar=progressbar,
summary=summary,
shuffle=shuffle,
**dataset_kwargs,
)

@@ -1551,29 +1559,3 @@ def ingest_dataframe(
structured.fill_dataset(ds, progressbar) # type: ignore

return ds # type: ignore

@staticmethod
def list(
org_id: str = "",
token: Optional[str] = None,
) -> None:
"""List all available Deep Lake cloud datasets.
Args:
org_id (str): Specify organization id. If not given,
returns a list of all datasets that can be accessed, regardless of what workspace they are in.
Otherwise, lists all datasets in the given organization.
token (str, optional): Activeloop token, used for fetching credentials for Deep Lake datasets. This is optional, tokens are normally autogenerated.
Returns:
List: List of dataset names.
"""

deeplake_reporter.feature_report(
feature_name="list",
parameters={"org_id": org_id},
)

client = DeepLakeBackendClient(token=token)
datasets = client.get_datasets(workspace=org_id)
return datasets
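
The hunks above add a `shuffle` flag to `ingest_coco`, `ingest_yolo`, `ingest_classification`, and `ingest_kaggle` (defaulting to `False` for the annotation-based ingestors and `True` for the folder-by-class ones). A rough usage sketch follows; the source and destination paths are placeholders, and the leading positional parameter names are assumed because they fall outside the hunks shown:

```python
import deeplake

# Annotation-based ingestion: shuffle defaults to False, so enable it explicitly if needed.
ds = deeplake.ingest_coco(
    "./coco/images",          # placeholder images directory
    ["./annotations.json"],   # placeholder annotation file list
    dest="./ingested_coco",   # placeholder destination
    shuffle=True,             # new in this commit
    num_workers=4,
    progressbar=True,
)

# Folder-by-class ingestion: shuffle defaults to True because such data is highly
# non-random (see the docstring added above); pass shuffle=False to keep file order.
ds2 = deeplake.ingest_classification(
    "./images_by_class",            # placeholder source directory
    "./ingested_classification",    # placeholder destination
    shuffle=False,
)
```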
63 changes: 63 additions & 0 deletions deeplake/api/tests/test_api.py
@@ -2131,6 +2131,7 @@ def token_permission_error_check(

with pytest.raises(TokenPermissionError):
ds = deeplake.load("hub://activeloop/fake-path")
runner.invoke(logout)


def invalid_token_exception_check():
@@ -2154,6 +2155,7 @@ def dataset_handler_error_check(runner, username, password):
result = runner.invoke(login, f"-u {username} -p {password}")
with pytest.raises(DatasetHandlerError):
ds = deeplake.load(f"hub://{username}/wrong-path")
runner.invoke(logout)


def test_hub_related_permission_exceptions(
@@ -2376,3 +2378,64 @@ def test_pickle_bug(local_ds):
ds["__temp_123"].numpy()

assert ds._temp_tensors == []


def test_max_view(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")

ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

expected = {
"abc": [[1], [2], [3], [4]],
"xyz": [[1], [2], [3], []],
"pqr": [[1], [2], [], []],
}

for i, sample in enumerate(ds.max_view):
np.testing.assert_array_equal(sample.abc.numpy(), expected["abc"][i])


def test_min_view(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")

ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

expected = {
"abc": [[1], [2]],
"xyz": [[1], [2]],
"pqr": [[1], [2]],
}

for i, sample in enumerate(ds.min_view):
np.testing.assert_array_equal(sample.abc.numpy(), expected["abc"][i])


def test_extend_with_empty_tensor(memory_ds):
with memory_ds as ds:
ds.create_tensor("abc")
ds.abc.extend([None, None, None])

ds.create_tensor("xyz")
ds.xyz.extend(ds.abc)
ds.xyz.extend([ds.abc[0], ds.abc[1]])

with pytest.raises(EmptyTensorError):
ds.xyz.numpy()

ds.xyz.append(1)

data = ds.xyz.numpy(aslist=True)
expected = [[]] * 5 + [1]

for i in range(len(data)):
np.testing.assert_array_equal(data[i], expected[i])
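
The `test_max_view` and `test_min_view` additions above pin down how views over unequal-length tensors behave: `max_view` spans the longest tensor and reports empty samples for the missing rows of shorter tensors, while `min_view` stops at the shortest tensor. A small sketch of the same behaviour outside pytest; the `mem://` path is a placeholder and the assertions restate what the expected dicts above encode:

```python
import deeplake

ds = deeplake.empty("mem://view_demo")  # placeholder in-memory path
ds.create_tensor("abc")
ds.create_tensor("xyz")
ds.create_tensor("pqr")
ds.abc.extend([1, 2, 3, 4])
ds.xyz.extend([1, 2, 3])
ds.pqr.extend([1, 2])

# max_view iterates over all 4 rows; xyz and pqr report empty samples at the end.
assert len(list(ds.max_view)) == 4

# min_view truncates every tensor to the 2 rows that all tensors share.
assert len(list(ds.min_view)) == 2
```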
81 changes: 81 additions & 0 deletions deeplake/api/tests/test_downsample.py
@@ -1,5 +1,6 @@
import deeplake
import numpy as np
import pytest


def test_downsample(local_ds_generator, cat_path):
@@ -105,3 +106,83 @@ def test_downsample_tiled(memory_ds):
for i in range(4):
x = i * 5472
ds.image[0][0:3648, x : x + 5472, :] = arr


@pytest.mark.parametrize(
"sample_compression", [None]
) # add back apng when bug is fixed
def test_downsample_binary_mask(memory_ds, sample_compression):
with memory_ds as ds:
ds.create_tensor(
"mask",
htype="binary_mask",
sample_compression=sample_compression,
downsampling=(2, 5),
)
binary_masks = [
np.ones((1000, 1000, 3), dtype=bool),
np.zeros((1000, 1000, 3), dtype=bool),
]
ds.mask.extend(binary_masks)

for i in range(1, 6):
tensor = ds[f"_mask_downsampled_{2 ** i}"]
assert len(tensor) == 2
for j in range(2):
np.testing.assert_array_equal(
tensor[j], binary_masks[j][:: 2**i, :: 2**i, :]
)


def test_downsample_group_bug(memory_ds):
with memory_ds as ds:
ds.create_group("stuff")
ds.create_tensor(
"mask", htype="binary_mask", sample_compression="lz4", downsampling=(2, 2)
)
ds.create_tensor(
"stuff/mask",
htype="binary_mask",
sample_compression="lz4",
downsampling=(2, 2),
)


def test_downsample_image(memory_ds):
with memory_ds as ds:
ds.create_tensor(
"image", htype="image", sample_compression="jpeg", downsampling=(2, 2)
)
ds.image.append(np.ones((100, 100, 3), dtype="uint8"))
ds.image.append(np.ones((100, 100, 1), dtype="uint8"))
ds.image.append(np.ones((100, 100, 0), dtype="uint8"))
ds.image.append(np.ones((100, 0, 3), dtype="uint8"))
ds.image.append(np.ones((100, 100), dtype="uint8"))

target_shapes = {
"image": [
(100, 100, 3),
(100, 100, 1),
(100, 100, 0),
(100, 0, 3),
(100, 100, 1),
],
"_image_downsampled_2": [
(50, 50, 3),
(50, 50, 1),
(0, 0, 0),
(0, 0, 0),
(50, 50, 1),
],
"_image_downsampled_4": [
(25, 25, 3),
(25, 25, 1),
(0, 0, 0),
(0, 0, 0),
(25, 25, 1),
],
}
for tensor, target_shape in target_shapes.items():
shapes = [ds[tensor][i].shape for i in range(5)]
numpy_shapes = [ds[tensor][i].numpy().shape for i in range(5)]
assert shapes == target_shape == numpy_shapes
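
The downsample tests above rely on the per-tensor `downsampling=(factor, number_of_levels)` option and read results back from hidden tensors named `_<tensor>_downsampled_<factor>`. A brief sketch of that pattern, mirroring `test_downsample_image`; the `mem://` path is a placeholder and the expected shapes are taken from the `target_shapes` dict above:

```python
import deeplake
import numpy as np

ds = deeplake.empty("mem://downsample_demo")  # placeholder in-memory path

# downsampling=(2, 2): halve each spatial dimension per level, keep two levels.
ds.create_tensor("image", htype="image", sample_compression="jpeg", downsampling=(2, 2))
ds.image.append(np.ones((100, 100, 3), dtype="uint8"))

# The reduced copies are stored in hidden tensors, per the tests above.
print(ds["_image_downsampled_2"][0].shape)  # expected (50, 50, 3)
print(ds["_image_downsampled_4"][0].shape)  # expected (25, 25, 3)
```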
2 changes: 1 addition & 1 deletion deeplake/api/tests/test_link.py
@@ -27,7 +27,7 @@ def test_complex_htype_parsing():
is_sequence, is_link, htype = parse_complex_htype("sequence")
assert is_sequence
assert not is_link
assert htype == "generic"
assert htype is None

with pytest.raises(ValueError):
is_sequence, is_link, htype = parse_complex_htype("sequence[link]")
25 changes: 25 additions & 0 deletions deeplake/api/tests/test_rechunk.py
@@ -269,3 +269,28 @@ def test_rechunk_cloud_link(local_ds_generator):
assert sample_2.path == s3_path_1, sample_2.creds_key == "my_s3_key_1"

assert ds.abc.chunk_engine.creds_encoder.num_samples == 3


@deeplake.compute
def add_samples(sample_in, samples_out):
samples_out.labels.append(np.ones((200,), dtype=np.int64))


def test_rechunk_vc_bug(local_ds):
ds = local_ds
with ds:
ds.create_tensor("labels", dtype="int64")
add_samples().eval(list(range(200)), ds, num_workers=2)
ds.commit()
add_samples().eval(list(range(100)), ds, num_workers=2)
ds.commit()
ds.checkout("alt", True)
ds.labels[8] = ds.labels[8].numpy()
ds.commit()
np.testing.assert_array_equal(
ds.labels.numpy(), np.ones((300, 200), dtype=np.int64)
)
ds.checkout("main")
np.testing.assert_array_equal(
ds.labels.numpy(), np.ones((300, 200), dtype=np.int64)
)