[DL-815] Unifying src_token and dest_token to token (activeloopai#2038)
* First Commit

* Changed exception message

* python linting fix

* Linting Fix

* Darglint fix

* Changed according to comments

* Addressed Fayaz comments

* Added token to rst

* Update docs/source/deeplake.api.dataset.rst

Co-authored-by: Fayaz Rahman <fayazrahman4u@gmail.com>
adolkhan and FayazRahman authored Dec 9, 2022
1 parent 048fdf4 commit a6ce856
Showing 5 changed files with 69 additions and 30 deletions.
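In short, this commit is an API migration: `copy` and `deepcopy` replace the separate `src_token` and `dest_token` parameters with a single `token`, and the old names are intercepted via `**kwargs` so stale call sites fail loudly instead of being silently ignored. A minimal before/after sketch (the paths and token value are placeholders, not taken from this diff):

```python
import deeplake

MY_TOKEN = "..."  # placeholder Activeloop API token

# Before this commit: separate tokens for the source and destination datasets.
# deeplake.deepcopy("hub://org/src", "hub://org/dst",
#                   src_token=MY_TOKEN, dest_token=MY_TOKEN)

# After this commit: a single token is used for both ends of the copy.
deeplake.deepcopy("hub://org/src", "hub://org/dst", token=MY_TOKEN)
```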
52 changes: 36 additions & 16 deletions deeplake/api/dataset.py
@@ -44,6 +44,7 @@
AuthorizationException,
UserNotLoggedInException,
TokenPermissionError,
UnsupportedParameterException,
)
from deeplake.util.storage import (
get_storage_and_cache_chain,
@@ -685,12 +686,12 @@ def copy(
tensors: Optional[List[str]] = None,
overwrite: bool = False,
src_creds=None,
src_token=None,
token=None,
dest_creds=None,
dest_token=None,
num_workers: int = 0,
scheduler="threaded",
progressbar=True,
**kwargs,
):
"""Copies dataset at ``src`` to ``dest``. Version control history is not included.
@@ -702,26 +703,34 @@ def copy(
src_creds (dict, optional): - A dictionary containing credentials used to access the dataset at the path.
- If 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token' are present, these take precedence over credentials present in the environment or in credentials file. Currently only works with s3 paths.
- It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url', 'aws_region', 'profile_name' as keys.
src_token (str, optional): Activeloop token, used for fetching credentials to the dataset at ``src`` if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
dest_creds (dict, optional): creds required to create / overwrite datasets at ``dest``.
dest_token (str, optional): token used for fetching credentials to ``dest``.
token (str, optional): Activeloop token, used for fetching credentials to the datasets at ``src`` and ``dest`` if they are Deep Lake datasets. This is optional, tokens are normally autogenerated.
num_workers (int): The number of workers to use for copying. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
scheduler (str): The scheduler to be used for copying. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
Defaults to 'threaded'.
progressbar (bool): Displays a progress bar if True (default).
**kwargs (dict): Additional keyword arguments
Returns:
Dataset: New dataset object.
Raises:
DatasetHandlerError: If a dataset already exists at destination path and overwrite is False.
UnsupportedParameterException: If a parameter that is no longer supported is specified.
"""
if "src_token" in kwargs:
raise UnsupportedParameterException(
"src_token is now not supported. You should use `token` instead."
)

if "dest_token" in kwargs:
raise UnsupportedParameterException(
"dest_token is now not supported. You should use `token` instead."
)

if isinstance(src, (str, pathlib.Path)):
src = convert_pathlib_to_string_if_needed(src)
src_ds = deeplake.load(
src, read_only=True, creds=src_creds, token=src_token
)
src_ds = deeplake.load(src, read_only=True, creds=src_creds, token=token)
else:
src_ds = src
src_ds.path = str(src_ds.path)
@@ -733,7 +742,7 @@ def copy(
tensors=tensors,
overwrite=overwrite,
creds=dest_creds,
token=dest_token,
token=token,
num_workers=num_workers,
scheduler=scheduler,
progressbar=progressbar,
@@ -746,14 +755,14 @@ def deepcopy(
tensors: Optional[List[str]] = None,
overwrite: bool = False,
src_creds=None,
src_token=None,
dest_creds=None,
dest_token=None,
token=None,
num_workers: int = 0,
scheduler="threaded",
progressbar=True,
public: bool = False,
verbose: bool = True,
**kwargs,
):
"""Copies dataset at ``src`` to ``dest`` including version control history.
@@ -765,24 +774,35 @@ def deepcopy(
src_creds (dict, optional): - A dictionary containing credentials used to access the dataset at the path.
- If 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token' are present, these take precedence over credentials present in the environment or in credentials file. Currently only works with s3 paths.
- It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url', 'aws_region', 'profile_name' as keys.
src_token (str, optional): Activeloop token, used for fetching credentials to the dataset at `src` if it is a Deep Lake dataset. This is optional, tokens are normally autogenerated.
dest_creds (dict, optional): creds required to create / overwrite datasets at ``dest``.
dest_token (str, optional): token used for fetching credentials to ``dest``.
token (str, optional): Activeloop token, used for fetching credentials to the datasets at ``src`` and ``dest`` if they are Deep Lake datasets. This is optional, tokens are normally autogenerated.
num_workers (int): The number of workers to use for copying. Defaults to 0. When set to 0, it will always use serial processing, irrespective of the scheduler.
scheduler (str): The scheduler to be used for copying. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
Defaults to 'threaded'.
progressbar (bool): Displays a progress bar if True (default).
public (bool): Defines if the dataset will have public access. Applicable only if Deep Lake cloud storage is used and a new Dataset is being created. Defaults to ``False``.
verbose (bool): If True, logs will be printed. Defaults to ``True``.
**kwargs: Additional keyword arguments
Returns:
Dataset: New dataset object.
Raises:
DatasetHandlerError: If a dataset already exists at destination path and overwrite is False.
TypeError: If source is not a path to a dataset.
UnsupportedParameterException: If a parameter that is no longer supported is specified.
"""

if "src_token" in kwargs:
raise UnsupportedParameterException(
"src_token is now not supported. You should use `token` instead."
)

if "dest_token" in kwargs:
raise UnsupportedParameterException(
"dest_token is now not supported. You should use `token` instead."
)

if not isinstance(src, (str, pathlib.Path)):
raise TypeError(
f"Source for `deepcopy` should be path to a dataset. Got {type(src)}."
@@ -802,17 +822,17 @@ def deepcopy(
}
if dest.startswith("hub://"):
report_params["Dest"] = dest
feature_report_path(src, "deepcopy", report_params, token=dest_token)
feature_report_path(src, "deepcopy", report_params, token=token)

src_ds = deeplake.load(
src, read_only=True, creds=src_creds, token=src_token, verbose=False
src, read_only=True, creds=src_creds, token=token, verbose=False
)
src_storage = get_base_storage(src_ds.storage)

dest_storage, cache_chain = get_storage_and_cache_chain(
dest,
creds=dest_creds,
token=dest_token,
token=token,
read_only=False,
memory_cache_size=DEFAULT_MEMORY_CACHE_SIZE,
local_cache_size=DEFAULT_LOCAL_CACHE_SIZE,
@@ -924,7 +944,7 @@ def process_meta(k):
path=dest,
storage=cache_chain,
public=public,
token=dest_token,
token=token,
verbose=verbose,
)
ret._register_dataset()
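Since the removed parameters are caught explicitly rather than swallowed by `**kwargs`, existing callers get an actionable error. A hedged sketch of the new failure mode (paths are placeholders):

```python
import deeplake
from deeplake.util.exceptions import UnsupportedParameterException

try:
    deeplake.copy("hub://org/src", "hub://org/dst", src_token="...")
except UnsupportedParameterException as exc:
    print(exc)  # e.g. "src_token is no longer supported. Use `token` instead."
```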
37 changes: 26 additions & 11 deletions deeplake/api/tests/test_api.py
@@ -39,6 +39,7 @@
SampleAppendingError,
DatasetTooLargeToDelete,
InvalidDatasetNameException,
UnsupportedParameterException,
)
from deeplake.util.path import convert_string_to_pathlib_if_needed, verify_dataset_name
from deeplake.util.pretty_print import summary_tensor, summary_dataset
@@ -901,8 +902,7 @@ def test_dataset_deepcopy(path, hub_token, num_workers, progressbar):
src_path,
dest_path,
overwrite=True,
src_token=hub_token,
dest_token=hub_token,
token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)
@@ -921,20 +921,37 @@ def test_dataset_deepcopy(path, hub_token, num_workers, progressbar):
np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())

with pytest.raises(DatasetHandlerError):
deeplake.deepcopy(
src_path, dest_path, src_token=hub_token, dest_token=hub_token
)
deeplake.deepcopy(src_path, dest_path, token=hub_token)

deeplake.deepcopy(
src_path,
dest_path,
overwrite=True,
src_token=hub_token,
dest_token=hub_token,
token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)

with pytest.raises(UnsupportedParameterException):
deeplake.deepcopy(
src_path,
dest_path,
overwrite=True,
src_token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)

with pytest.raises(UnsupportedParameterException):
deeplake.deepcopy(
src_path,
dest_path,
overwrite=True,
dest_token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)

assert list(dest_ds.tensors) == ["a", "b", "c", "d"]
for tensor in dest_ds.tensors:
np.testing.assert_array_equal(src_ds[tensor].numpy(), dest_ds[tensor].numpy())
@@ -949,8 +966,7 @@ def test_dataset_deepcopy(path, hub_token, num_workers, progressbar):
src_path,
dest_path,
overwrite=True,
src_token=hub_token,
dest_token=hub_token,
token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)
@@ -965,8 +981,7 @@ def test_dataset_deepcopy(path, hub_token, num_workers, progressbar):
dest_path,
tensors=["a", "d"],
overwrite=True,
src_token=hub_token,
dest_token=hub_token,
token=hub_token,
num_workers=num_workers,
progressbar=progressbar,
)
2 changes: 1 addition & 1 deletion deeplake/util/access_method.py
@@ -74,7 +74,7 @@ def get_local_dataset(
path,
local_path,
src_creds=creds,
src_token=token,
token=token,
num_workers=num_workers,
scheduler=scheduler,
progressbar=True,
4 changes: 4 additions & 0 deletions deeplake/util/exceptions.py
@@ -833,3 +833,7 @@ def __init__(self, path_type):
else:
message = "Please specify a dataset name that contains only letters, numbers, hyphens and underscores."
super().__init__(message)


class UnsupportedParameterException(Exception):
pass
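`UnsupportedParameterException` is a plain `Exception` subclass, so codebases migrating incrementally can catch it, or pre-empt it, without touching other error handling. One illustrative approach, assuming a hypothetical `copy_compat` shim that is not part of this commit:

```python
import deeplake

def copy_compat(src, dest, **kwargs):
    """Hypothetical shim: map the removed src_token/dest_token
    onto the unified `token` parameter before calling deeplake.copy."""
    src_tok = kwargs.pop("src_token", None)
    dest_tok = kwargs.pop("dest_token", None)
    # Prefer an explicit `token`; fall back to either legacy value.
    token = kwargs.pop("token", None) or src_tok or dest_tok
    if token is not None:
        kwargs["token"] = token
    return deeplake.copy(src, dest, **kwargs)
```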
4 changes: 2 additions & 2 deletions docs/source/deeplake.api.dataset.rst
@@ -27,11 +27,11 @@ deeplake.api.dataset

See :func:`deeplake.like`.

.. staticmethod:: copy(src: Union[str, pathlib.Path, Dataset], dest: Union[str, pathlib.Path], tensors: Optional[List[str]] = None, overwrite: bool = False, src_creds=None, src_token=None, dest_creds=None, dest_token=None, num_workers: int = 0, scheduler="threaded", progressbar=True)
.. staticmethod:: copy(src: Union[str, pathlib.Path, Dataset], dest: Union[str, pathlib.Path], tensors: Optional[List[str]] = None, overwrite: bool = False, src_creds=None, dest_creds=None, token=None, num_workers: int = 0, scheduler="threaded", progressbar=True)

See :func:`deeplake.copy`.

.. staticmethod:: deepcopy(src: Union[str, pathlib.Path], dest: Union[str, pathlib.Path], tensors: Optional[List[str]] = None, overwrite: bool = False, src_creds=None, src_token=None, dest_creds=None, dest_token=None, num_workers: int = 0, scheduler="threaded", progressbar=True, public: bool = False, verbose: bool = True)
.. staticmethod:: deepcopy(src: Union[str, pathlib.Path], dest: Union[str, pathlib.Path], tensors: Optional[List[str]] = None, overwrite: bool = False, src_creds=None, dest_creds=None, token=None, num_workers: int = 0, scheduler="threaded", progressbar=True, public: bool = False, verbose: bool = True)

See :func:`deeplake.deepcopy`.

