
Commit 4b92ebe

Merge remote-tracking branch 'origin' into fix/boto3_threading_issue

AbhinavTuli committed Mar 24, 2022
2 parents: 4413bc8 + 8a51d43
Showing 35 changed files with 1,310 additions and 62 deletions.
3 changes: 2 additions & 1 deletion hub/__init__.py
@@ -43,6 +43,7 @@
 empty = dataset.empty
 like = dataset.like
 delete = dataset.delete
+rename = dataset.rename
 copy = dataset.copy
 dataset_cl = Dataset
 ingest = dataset.ingest
@@ -72,7 +73,7 @@
"delete",
]

__version__ = "2.3.2"
__version__ = "2.3.3"
__encoded_version__ = np.array(__version__)
config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}

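
The `config` dict above is the knob the boto3 threading fix revolves around. As a hedged sketch (it assumes hub consults `hub.config["s3"]` when building its S3 clients, which this diff does not show), the pool could be tuned before opening a dataset:

    # Sketch only: assumes hub reads hub.config["s3"] for its S3 clients.
    from botocore.config import Config
    import hub

    hub.config["s3"] = Config(
        max_pool_connections=128,  # this release ships with 50
        connect_timeout=60,
        read_timeout=60,
    )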
35 changes: 34 additions & 1 deletion hub/api/dataset.py
@@ -262,6 +262,39 @@ def load(
         except AgreementError as e:
             raise e from None

+    @staticmethod
+    def rename(
+        old_path: str,
+        new_path: str,
+        creds: Optional[dict] = None,
+        token: Optional[str] = None,
+    ) -> Dataset:
+        """Renames dataset at `old_path` to `new_path`.
+
+        Args:
+            old_path (str): The path to the dataset to be renamed.
+            new_path (str): Path to the dataset after renaming.
+            creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
+                This takes precedence over credentials present in the environment. Currently only works with s3 paths.
+                It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
+            token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Hub dataset. This is optional, tokens are normally autogenerated.
+
+        Returns:
+            Dataset object after renaming.
+
+        Raises:
+            DatasetHandlerError: If a Dataset does not exist at the given path or if the new path is in a different directory.
+        """
+        if creds is None:
+            creds = {}
+
+        feature_report_path(old_path, "rename", {})
+
+        ds = hub.load(old_path, verbose=False, token=token, creds=creds)
+        ds.rename(new_path)
+
+        return ds  # type: ignore
+
     @staticmethod
     def delete(
         path: str,
@@ -354,7 +387,7 @@ def like(
         if isinstance(source, str):
             source_ds = dataset.load(source)

-        for tensor_name in source_ds.version_state["meta"].tensors:  # type: ignore
+        for tensor_name in source_ds.tensors:  # type: ignore
             destination_ds.create_tensor_like(tensor_name, source_ds[tensor_name])

         destination_ds.info.update(source_ds.info.__getstate__())  # type: ignore
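
For reference, a minimal usage sketch of the `rename` API added above (paths are hypothetical; per the docstring, `creds` currently only applies to s3 paths):

    import hub

    # Rename a local dataset; returns the Dataset object at its new path.
    ds = hub.rename("./animals_ds", "./animals_ds_v2")
    assert ds.path == "./animals_ds_v2"

    # For an S3 dataset, credentials may be passed explicitly.
    ds = hub.rename(
        "s3://my-bucket/old_ds",
        "s3://my-bucket/new_ds",
        creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    )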
49 changes: 40 additions & 9 deletions hub/api/tests/test_api.py
@@ -10,6 +10,7 @@
 from hub.tests.storage_fixtures import enabled_remote_storages
 from hub.core.storage import GCSProvider
 from hub.util.exceptions import (
+    RenameError,
     InvalidOperationError,
     TensorDtypeMismatchError,
     TensorAlreadyExistsError,
@@ -73,6 +74,8 @@ def test_persist_keys(local_ds_generator):
"dataset_meta.json",
"image/commit_diff",
"image/tensor_meta.json",
"_image_id/tensor_meta.json",
"_image_id/commit_diff",
}


@@ -128,9 +131,7 @@ def test_populate_dataset(local_ds):
     local_ds.image.extend([np.ones((28, 28)), np.ones((28, 28))])
     assert len(local_ds.image) == 16

-    assert local_ds.meta.tensors == [
-        "image",
-    ]
+    assert local_ds.meta.tensors == ["image", "_image_id"]
     assert local_ds.meta.version == hub.__version__


@@ -704,6 +705,37 @@ def test_dataset_delete():
     hub.constants.DELETE_SAFETY_SIZE = old_size


+@pytest.mark.parametrize(
+    ("ds_generator", "path", "hub_token"),
+    [
+        ("local_ds_generator", "local_path", "hub_cloud_dev_token"),
+        ("s3_ds_generator", "s3_path", "hub_cloud_dev_token"),
+        ("gcs_ds_generator", "gcs_path", "hub_cloud_dev_token"),
+        ("hub_cloud_ds_generator", "hub_cloud_path", "hub_cloud_dev_token"),
+    ],
+    indirect=True,
+)
+def test_dataset_rename(ds_generator, path, hub_token):
+    ds = ds_generator()
+    ds.create_tensor("abc")
+    ds.abc.append([1, 2, 3, 4])
+
+    new_path = "_".join([path, "renamed"])
+
+    with pytest.raises(RenameError):
+        ds.rename("wrongfolder/new_ds")
+
+    ds = hub.rename(ds.path, new_path, token=hub_token)
+
+    assert ds.path == new_path
+    np.testing.assert_array_equal(ds.abc.numpy(), np.array([[1, 2, 3, 4]]))
+
+    ds = hub.load(new_path, token=hub_token)
+    np.testing.assert_array_equal(ds.abc.numpy(), np.array([[1, 2, 3, 4]]))
+
+    hub.delete(new_path, token=hub_token)
+
+
 @pytest.mark.parametrize(
     "path,hub_token",
     [
@@ -745,8 +777,7 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
         progress_bar=progress_bar,
     )

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
-
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     assert dest_ds.a.meta.htype == "image"
     assert dest_ds.a.meta.sample_compression == "png"
     assert dest_ds.b.meta.htype == "class_label"
@@ -772,12 +803,12 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
         progress_bar=progress_bar,
     )

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors:
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

     dest_ds = hub.load(dest_path, token=hub_token)
-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors.keys():
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

Expand All @@ -792,7 +823,7 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
     )
     dest_ds = hub.load(dest_path, token=hub_token)

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors:
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

@@ -948,7 +979,7 @@ def test_vc_bug(local_ds_generator):
     a = ds.commit("first")
     ds.checkout(a)
     ds.create_tensor("a/b/c/d")
-    assert ds._all_tensors_filtered() == ["abc", "a/b/c/d"]
+    assert list(ds.tensors) == ["abc", "a/b/c/d"]


 def test_tobytes(memory_ds, compressed_image_paths, audio_paths):
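
The new test above also pins down rename's main constraint: the destination must stay in the same parent directory, otherwise `RenameError` is raised. A minimal sketch with a hypothetical local dataset:

    import hub
    from hub.util.exceptions import RenameError

    ds = hub.load("./my_ds")
    try:
        ds.rename("wrongfolder/my_ds")  # different parent directory
    except RenameError:
        pass  # only the final path component may change
    ds.rename("./my_ds_renamed")  # same directory: allowed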
50 changes: 50 additions & 0 deletions hub/api/tests/test_linking.py
@@ -0,0 +1,50 @@
+import pytest
+import hub
+import numpy as np
+import uuid
+from hub.tests.common import LinkTransformTestContext
+
+
+def test_linking(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x")
+        ds.create_tensor("y")
+        ds._link_tensors("x", "y", "append_test")
+        ds.x.extend(list(range(10)))
+    np.testing.assert_array_equal(ds.x.numpy(), np.arange(10).reshape(-1, 1))
+    np.testing.assert_array_equal(ds.x.numpy(), ds.y.numpy())
+
+
+def test_linking_sequence(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x", htype="sequence")
+        ds.create_tensor("x_id")
+        id_f = lambda _: 0
+        with LinkTransformTestContext(id_f, "id"):
+            ds._link_tensors("x", "x_id", "id", flatten_sequence=False)
+            ds.x.extend(np.random.random((10, 5, 3, 2)))
+            assert len(ds.x) == len(ds.x_id) == 10
+            np.testing.assert_array_equal(ds.x_id.numpy(), np.zeros((10, 1)))
+
+
+def test_linking_sequence_update(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x", htype="sequence")
+        ds.create_tensor("x_id")
+        id_f = lambda _: 0
+        id_f2 = lambda *_: 1  # updated samples will have x_id=1
+        with LinkTransformTestContext(id_f, "id"):
+            with LinkTransformTestContext(id_f2, "id2"):
+                ds._link_tensors(
+                    "x", "x_id", append_f="id", update_f="id2", flatten_sequence=False
+                )
+                ds.x.extend(np.random.random((10, 5, 3, 2)))
+                ds.x[0] += 1
+                ds.x[3] += 1
+                expected = np.zeros((10, 1))
+                expected[0] = 1
+                expected[3] = 1
+                np.testing.assert_array_equal(ds.x_id.numpy(), expected)
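
For readers unfamiliar with tensor linking, a conceptual sketch of what the tests above exercise (plain Python, not hub's internals): `_link_tensors` registers transforms that hub applies whenever the source tensor is appended to or updated, keeping the derived tensor in sync.

    # Conceptual model of a linked tensor pair; hub's real implementation differs.
    class LinkedPair:
        def __init__(self, append_f, update_f=None):
            self.x, self.y = [], []
            self.append_f = append_f
            self.update_f = update_f or append_f

        def append(self, sample):
            self.x.append(sample)
            self.y.append(self.append_f(sample))  # derived value stays in sync

        def update(self, index, sample):
            self.x[index] = sample
            self.y[index] = self.update_f(sample)

    pair = LinkedPair(append_f=lambda _: 0, update_f=lambda _: 1)
    for s in range(3):
        pair.append(s)
    pair.update(1, 42)
    assert pair.y == [0, 1, 0]  # same pattern the update test asserts via x_id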
6 changes: 6 additions & 0 deletions hub/client/client.py
@@ -219,6 +219,12 @@ def delete_dataset_entry(self, username, dataset_name):
             endpoint=self.endpoint(),
         ).json()

+    def rename_dataset_entry(self, username, old_name, new_name):
+        suffix = UPDATE_SUFFIX.format(username, old_name)
+        self.request(
+            "PUT", suffix, endpoint=self.endpoint(), json={"basename": new_name}
+        )
+
     def get_user_organizations(self):
         """Get list of user organizations from the backend. If user is not logged in, returns ['public'].
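
As a rough sketch of the request this method issues (the exact route behind `UPDATE_SUFFIX` and the auth scheme are assumptions; neither appears in this diff):

    # Hypothetical raw-HTTP equivalent of rename_dataset_entry.
    import requests

    def rename_dataset_entry(endpoint, token, username, old_name, new_name):
        url = f"{endpoint}/api/datasets/{username}/{old_name}"  # assumed route
        resp = requests.put(
            url,
            json={"basename": new_name},  # payload matches the diff
            headers={"Authorization": f"Bearer {token}"},  # assumed auth scheme
        )
        resp.raise_for_status()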
3 changes: 0 additions & 3 deletions hub/constants.py
@@ -62,9 +62,6 @@
 AGREEMENT_FILENAME = "agreement.txt"

 ENCODING_DTYPE = np.uint32
-# calculate the number of bits to shift right when converting a 128-bit uuid into `ENCODING_DTYPE`
-UUID_SHIFT_AMOUNT = 128 - (8 * ENCODING_DTYPE(1).itemsize)
-

 # environment variables
 ENV_HUB_DEV_USERNAME = "ACTIVELOOP_HUB_USERNAME"
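
For context on the constant removed above: `ENCODING_DTYPE(1).itemsize` is 4 bytes, so `UUID_SHIFT_AMOUNT` evaluated to 128 - 32 = 96, i.e. a 128-bit UUID was truncated to its top 32 bits. A self-contained sketch of that now-removed conversion:

    import uuid
    import numpy as np

    ENCODING_DTYPE = np.uint32
    UUID_SHIFT_AMOUNT = 128 - (8 * ENCODING_DTYPE(1).itemsize)  # 96

    encoded = ENCODING_DTYPE(uuid.uuid4().int >> UUID_SHIFT_AMOUNT)  # top 32 bits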
(Diffs for the remaining changed files are not shown.)
