
Commit 4b92ebe

Merge remote-tracking branch 'origin' into fix/boto3_threading_issue

AbhinavTuli committed Mar 24, 2022
2 parents: 4413bc8 + 8a51d43
Showing 35 changed files with 1,310 additions and 62 deletions.
3 changes: 2 additions & 1 deletion hub/__init__.py
@@ -43,6 +43,7 @@
 empty = dataset.empty
 like = dataset.like
 delete = dataset.delete
+rename = dataset.rename
 copy = dataset.copy
 dataset_cl = Dataset
 ingest = dataset.ingest
@@ -72,7 +73,7 @@
"delete",
]

__version__ = "2.3.2"
__version__ = "2.3.3"
__encoded_version__ = np.array(__version__)
config = {"s3": Config(max_pool_connections=50, connect_timeout=300, read_timeout=300)}

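
The `config` dict above is the knob the boto3 threading fix revolves around. As a hedged sketch (it assumes hub consults `hub.config["s3"]` when building its S3 clients, which this diff does not show), the pool could be tuned before opening a dataset:

    # Sketch only: assumes hub reads hub.config["s3"] for its S3 clients.
    from botocore.config import Config
    import hub

    hub.config["s3"] = Config(
        max_pool_connections=128,  # this release ships with 50
        connect_timeout=60,
        read_timeout=60,
    )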
35 changes: 34 additions & 1 deletion hub/api/dataset.py
@@ -262,6 +262,39 @@ def load(
         except AgreementError as e:
             raise e from None

+    @staticmethod
+    def rename(
+        old_path: str,
+        new_path: str,
+        creds: Optional[dict] = None,
+        token: Optional[str] = None,
+    ) -> Dataset:
+        """Renames dataset at `old_path` to `new_path`.
+
+        Args:
+            old_path (str): The path to the dataset to be renamed.
+            new_path (str): Path to the dataset after renaming.
+            creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
+                This takes precedence over credentials present in the environment. Currently only works with s3 paths.
+                It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
+            token (str, optional): Activeloop token, used for fetching credentials to the dataset at path if it is a Hub dataset. This is optional, tokens are normally autogenerated.
+
+        Returns:
+            Dataset object after renaming.
+
+        Raises:
+            DatasetHandlerError: If a Dataset does not exist at the given path or if the new path is in a different directory.
+        """
+        if creds is None:
+            creds = {}
+
+        feature_report_path(old_path, "rename", {})
+
+        ds = hub.load(old_path, verbose=False, token=token, creds=creds)
+        ds.rename(new_path)
+
+        return ds  # type: ignore
+
     @staticmethod
     def delete(
         path: str,
@@ -354,7 +387,7 @@ def like(
         if isinstance(source, str):
             source_ds = dataset.load(source)

-        for tensor_name in source_ds.version_state["meta"].tensors:  # type: ignore
+        for tensor_name in source_ds.tensors:  # type: ignore
             destination_ds.create_tensor_like(tensor_name, source_ds[tensor_name])

         destination_ds.info.update(source_ds.info.__getstate__())  # type: ignore
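
For reference, a minimal usage sketch of the `rename` API added above (paths are hypothetical; per the docstring, `creds` currently only applies to s3 paths):

    import hub

    # Rename a local dataset; returns the Dataset object at its new path.
    ds = hub.rename("./animals_ds", "./animals_ds_v2")
    assert ds.path == "./animals_ds_v2"

    # For an S3 dataset, credentials may be passed explicitly.
    ds = hub.rename(
        "s3://my-bucket/old_ds",
        "s3://my-bucket/new_ds",
        creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    )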
49 changes: 40 additions & 9 deletions hub/api/tests/test_api.py
@@ -10,6 +10,7 @@
 from hub.tests.storage_fixtures import enabled_remote_storages
 from hub.core.storage import GCSProvider
 from hub.util.exceptions import (
+    RenameError,
     InvalidOperationError,
     TensorDtypeMismatchError,
     TensorAlreadyExistsError,
@@ -73,6 +74,8 @@ def test_persist_keys(local_ds_generator):
"dataset_meta.json",
"image/commit_diff",
"image/tensor_meta.json",
"_image_id/tensor_meta.json",
"_image_id/commit_diff",
}


@@ -128,9 +131,7 @@ def test_populate_dataset(local_ds):
     local_ds.image.extend([np.ones((28, 28)), np.ones((28, 28))])
     assert len(local_ds.image) == 16

-    assert local_ds.meta.tensors == [
-        "image",
-    ]
+    assert local_ds.meta.tensors == ["image", "_image_id"]
     assert local_ds.meta.version == hub.__version__


@@ -704,6 +705,37 @@ def test_dataset_delete():
     hub.constants.DELETE_SAFETY_SIZE = old_size


+@pytest.mark.parametrize(
+    ("ds_generator", "path", "hub_token"),
+    [
+        ("local_ds_generator", "local_path", "hub_cloud_dev_token"),
+        ("s3_ds_generator", "s3_path", "hub_cloud_dev_token"),
+        ("gcs_ds_generator", "gcs_path", "hub_cloud_dev_token"),
+        ("hub_cloud_ds_generator", "hub_cloud_path", "hub_cloud_dev_token"),
+    ],
+    indirect=True,
+)
+def test_dataset_rename(ds_generator, path, hub_token):
+    ds = ds_generator()
+    ds.create_tensor("abc")
+    ds.abc.append([1, 2, 3, 4])
+
+    new_path = "_".join([path, "renamed"])
+
+    with pytest.raises(RenameError):
+        ds.rename("wrongfolder/new_ds")
+
+    ds = hub.rename(ds.path, new_path, token=hub_token)
+
+    assert ds.path == new_path
+    np.testing.assert_array_equal(ds.abc.numpy(), np.array([[1, 2, 3, 4]]))
+
+    ds = hub.load(new_path, token=hub_token)
+    np.testing.assert_array_equal(ds.abc.numpy(), np.array([[1, 2, 3, 4]]))
+
+    hub.delete(new_path, token=hub_token)
+
+
 @pytest.mark.parametrize(
     "path,hub_token",
     [
@@ -745,8 +777,7 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
         progress_bar=progress_bar,
     )

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
-
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     assert dest_ds.a.meta.htype == "image"
     assert dest_ds.a.meta.sample_compression == "png"
     assert dest_ds.b.meta.htype == "class_label"
@@ -772,12 +803,12 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
         progress_bar=progress_bar,
     )

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors:
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

     dest_ds = hub.load(dest_path, token=hub_token)
-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors.keys():
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

Expand All @@ -792,7 +823,7 @@ def test_dataset_copy(path, hub_token, num_workers, progress_bar):
     )
     dest_ds = hub.load(dest_path, token=hub_token)

-    assert dest_ds.meta.tensors == ["a", "b", "c", "d"]
+    assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
     for tensor in dest_ds.tensors:
         np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

@@ -948,7 +979,7 @@ def test_vc_bug(local_ds_generator):
     a = ds.commit("first")
     ds.checkout(a)
     ds.create_tensor("a/b/c/d")
-    assert ds._all_tensors_filtered() == ["abc", "a/b/c/d"]
+    assert list(ds.tensors) == ["abc", "a/b/c/d"]


 def test_tobytes(memory_ds, compressed_image_paths, audio_paths):
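
The new test above also pins down rename's main constraint: the destination must stay in the same parent directory, otherwise `RenameError` is raised. A minimal sketch with a hypothetical local dataset:

    import hub
    from hub.util.exceptions import RenameError

    ds = hub.load("./my_ds")
    try:
        ds.rename("wrongfolder/my_ds")  # different parent directory
    except RenameError:
        pass  # only the final path component may change
    ds.rename("./my_ds_renamed")  # same directory: allowed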
50 changes: 50 additions & 0 deletions hub/api/tests/test_linking.py
@@ -0,0 +1,50 @@
+import pytest
+import hub
+import numpy as np
+import uuid
+from hub.tests.common import LinkTransformTestContext
+
+
+def test_linking(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x")
+        ds.create_tensor("y")
+        ds._link_tensors("x", "y", "append_test")
+        ds.x.extend(list(range(10)))
+    np.testing.assert_array_equal(ds.x.numpy(), np.arange(10).reshape(-1, 1))
+    np.testing.assert_array_equal(ds.x.numpy(), ds.y.numpy())
+
+
+def test_linking_sequence(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x", htype="sequence")
+        ds.create_tensor("x_id")
+        id_f = lambda _: 0
+        with LinkTransformTestContext(id_f, "id"):
+            ds._link_tensors("x", "x_id", "id", flatten_sequence=False)
+            ds.x.extend(np.random.random((10, 5, 3, 2)))
+            assert len(ds.x) == len(ds.x_id) == 10
+            np.testing.assert_array_equal(ds.x_id.numpy(), np.zeros((10, 1)))
+
+
+def test_linking_sequence_update(memory_ds):
+    ds = memory_ds
+    with ds:
+        ds.create_tensor("x", htype="sequence")
+        ds.create_tensor("x_id")
+        id_f = lambda _: 0
+        id_f2 = lambda *_: 1  # updated samples will have x_id=1
+        with LinkTransformTestContext(id_f, "id"):
+            with LinkTransformTestContext(id_f2, "id2"):
+                ds._link_tensors(
+                    "x", "x_id", append_f="id", update_f="id2", flatten_sequence=False
+                )
+                ds.x.extend(np.random.random((10, 5, 3, 2)))
+                ds.x[0] += 1
+                ds.x[3] += 1
+                expected = np.zeros((10, 1))
+                expected[0] = 1
+                expected[3] = 1
+                np.testing.assert_array_equal(ds.x_id.numpy(), expected)
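
For readers unfamiliar with tensor linking, a conceptual sketch of what the tests above exercise (plain Python, not hub's internals): `_link_tensors` registers transforms that hub applies whenever the source tensor is appended to or updated, keeping the derived tensor in sync.

    # Conceptual model of a linked tensor pair; hub's real implementation differs.
    class LinkedPair:
        def __init__(self, append_f, update_f=None):
            self.x, self.y = [], []
            self.append_f = append_f
            self.update_f = update_f or append_f

        def append(self, sample):
            self.x.append(sample)
            self.y.append(self.append_f(sample))  # derived value stays in sync

        def update(self, index, sample):
            self.x[index] = sample
            self.y[index] = self.update_f(sample)

    pair = LinkedPair(append_f=lambda _: 0, update_f=lambda _: 1)
    for s in range(3):
        pair.append(s)
    pair.update(1, 42)
    assert pair.y == [0, 1, 0]  # same pattern the update test asserts via x_id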
6 changes: 6 additions & 0 deletions hub/client/client.py
@@ -219,6 +219,12 @@ def delete_dataset_entry(self, username, dataset_name):
             endpoint=self.endpoint(),
         ).json()

+    def rename_dataset_entry(self, username, old_name, new_name):
+        suffix = UPDATE_SUFFIX.format(username, old_name)
+        self.request(
+            "PUT", suffix, endpoint=self.endpoint(), json={"basename": new_name}
+        )
+
     def get_user_organizations(self):
         """Get list of user organizations from the backend. If user is not logged in, returns ['public'].
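
As a rough sketch of the request this method issues (the exact route behind `UPDATE_SUFFIX` and the auth scheme are assumptions; neither appears in this diff):

    # Hypothetical raw-HTTP equivalent of rename_dataset_entry.
    import requests

    def rename_dataset_entry(endpoint, token, username, old_name, new_name):
        url = f"{endpoint}/api/datasets/{username}/{old_name}"  # assumed route
        resp = requests.put(
            url,
            json={"basename": new_name},  # payload matches the diff
            headers={"Authorization": f"Bearer {token}"},  # assumed auth scheme
        )
        resp.raise_for_status()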
3 changes: 0 additions & 3 deletions hub/constants.py
@@ -62,9 +62,6 @@
 AGREEMENT_FILENAME = "agreement.txt"

 ENCODING_DTYPE = np.uint32
-# calculate the number of bits to shift right when converting a 128-bit uuid into `ENCODING_DTYPE`
-UUID_SHIFT_AMOUNT = 128 - (8 * ENCODING_DTYPE(1).itemsize)
-

 # environment variables
 ENV_HUB_DEV_USERNAME = "ACTIVELOOP_HUB_USERNAME"
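
For context on the constant removed above: `ENCODING_DTYPE(1).itemsize` is 4 bytes, so `UUID_SHIFT_AMOUNT` evaluated to 128 - 32 = 96, i.e. a 128-bit UUID was truncated to its top 32 bits. A self-contained sketch of that now-removed conversion:

    import uuid
    import numpy as np

    ENCODING_DTYPE = np.uint32
    UUID_SHIFT_AMOUNT = 128 - (8 * ENCODING_DTYPE(1).itemsize)  # 96

    encoded = ENCODING_DTYPE(uuid.uuid4().int >> UUID_SHIFT_AMOUNT)  # top 32 bits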
(Diffs for the remaining changed files are not shown.)
