Added connect_kwargs and tests
ProgerDav committed Feb 3, 2023
1 parent 2ee7ddd commit ed532d4
Showing 5 changed files with 141 additions and 22 deletions.
16 changes: 14 additions & 2 deletions deeplake/api/dataset.py
@@ -1129,7 +1129,7 @@ def ingest_coco(
dest, creds=dest_creds, verbose=False, token=token, **dataset_kwargs
)
if connect_kwargs is not None:
connect_kwargs["token"] = token or connect_kwargs.get(token)
connect_kwargs["token"] = token or connect_kwargs.get("token", None)
ds.connect(**connect_kwargs)

structure.create_missing(ds)
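
Note on the fix above: the old line called connect_kwargs.get(token), which used the value of token as the lookup key (so with token=None it asked for connect_kwargs.get(None)), meaning a token stored under the "token" key was never found. The corrected line looks up the literal key "token". A minimal standalone sketch of the precedence rule now shared by every call site in this commit (names and values are illustrative, not from the library):

from typing import Dict, Optional

def resolve_token(token: Optional[str], connect_kwargs: Dict) -> Optional[str]:
    # An explicitly passed token wins; otherwise fall back to a token
    # already present in connect_kwargs; otherwise None.
    return token or connect_kwargs.get("token", None)

assert resolve_token("abc", {}) == "abc"
assert resolve_token(None, {"token": "xyz"}) == "xyz"
assert resolve_token(None, {}) is None
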
@@ -1255,7 +1255,7 @@ def ingest_yolo(
dest, creds=dest_creds, verbose=False, token=token, **dataset_kwargs
)
if connect_kwargs is not None:
connect_kwargs["token"] = token or connect_kwargs.get("token")
connect_kwargs["token"] = token or connect_kwargs.get("token", None)
ds.connect(**connect_kwargs)

structure.create_missing(ds)
@@ -1278,6 +1278,7 @@ def ingest_classification(
progressbar: bool = True,
summary: bool = True,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
) -> Dataset:
"""Ingests a dataset from a source and stores it as a structured dataset to destination.
@@ -1295,6 +1296,7 @@ def ingest_classification(
progressbar (bool): Enables or disables ingestion progress bar. Defaults to ``True``.
summary (bool): If ``True``, a summary of skipped files will be printed after completion. Defaults to ``True``.
token (Optional[str]): The token to use for accessing the dataset.
connect_kwargs (Optional[Dict]): If specified, the dataset will be connected to Deep Lake, and connect_kwargs will be passed to :meth:`Dataset.connect <deeplake.core.dataset.Dataset.connect>`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`.
Returns:
@@ -1395,6 +1397,9 @@ def ingest_classification(
ds = deeplake.empty(
dest, creds=dest_creds, token=token, verbose=False, **dataset_kwargs
)
if connect_kwargs is not None:
connect_kwargs["token"] = token or connect_kwargs.get("token", None)
ds.connect(**connect_kwargs)

# TODO: auto detect compression
unstructured.structure(
@@ -1404,6 +1409,7 @@
image_tensor_args=image_params,
label_tensor_args=label_params,
)

return ds # type: ignore
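
With the new parameter, ingest_classification can stage a dataset and connect it to Deep Lake in a single call. A sketch under assumed values (the source folder, paths, creds key, and token below are placeholders, not values from this commit):

import deeplake

ds = deeplake.ingest_classification(
    src="./image_classification",      # hypothetical local source folder
    dest="s3://my-bucket/my-dataset",  # hypothetical staging destination
    connect_kwargs={
        "dest_path": "hub://my-org/my-dataset",  # final Deep Lake path
        "creds_key": "my_managed_creds",
        "token": "my-api-token",
    },
)
assert ds.path == "hub://my-org/my-dataset"  # connect rewrites the path, per the tests below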

@staticmethod
@@ -1491,6 +1497,7 @@ def ingest_dataframe(
dest_creds: Optional[Dict] = None,
progressbar: bool = True,
token: Optional[str] = None,
connect_kwargs: Optional[Dict] = None,
**dataset_kwargs,
):
"""Convert pandas dataframe to a Deep Lake Dataset.
@@ -1506,6 +1513,7 @@
dest_creds (Optional[Dict]): A dictionary containing credentials used to access the destination path of the dataset.
progressbar (bool): Enables or disables ingestion progress bar. Set to ``True`` by default.
token (Optional[str]): The token to use for accessing the dataset.
connect_kwargs (Optional[Dict]): A dictionary containing arguments to be passed to the dataset connect method. See :meth:`Dataset.connect`.
**dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function. See :func:`deeplake.empty`.
Returns:
@@ -1533,8 +1541,12 @@
ds = deeplake.empty(
dest, creds=dest_creds, token=token, verbose=False, **dataset_kwargs
)
if connect_kwargs is not None:
connect_kwargs["token"] = token or connect_kwargs.get("token", None)
ds.connect(**connect_kwargs)

structured.fill_dataset(ds, progressbar) # type: ignore

return ds # type: ignore

@staticmethod
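
ingest_dataframe follows the same pattern: when connect_kwargs is given, ds.connect(...) runs right after deeplake.empty(...), before fill_dataset writes the dataframe. A minimal sketch, again with placeholder paths and credentials:

import deeplake
import pandas as pd

df = pd.DataFrame({"Year": [1968, 1970], "Title": ["Greetings", "Hi, Mom!"]})

ds = deeplake.ingest_dataframe(
    df,
    "s3://my-bucket/deniro",                 # hypothetical staging path
    connect_kwargs={
        "dest_path": "hub://my-org/deniro",  # hypothetical final path
        "creds_key": "my_managed_creds",
        "token": "my-api-token",
    },
)
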
45 changes: 41 additions & 4 deletions deeplake/auto/tests/test_coco_template.py
@@ -122,20 +122,57 @@ def test_minimal_coco_ingestion(local_path, coco_ingestion_data):
assert "group2/iscrowd" not in ds.tensors


def test_coco_ingestion_with_linked_images(local_path, coco_ingestion_data):
def test_minimal_coco_with_connect(
s3_path,
coco_ingestion_data,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):
params = {**coco_ingestion_data}

ds = deeplake.ingest_coco(
**params,
dest=s3_path,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
"token": hub_cloud_dev_token,
},
)

assert ds.path == hub_cloud_path
assert "images" in ds.tensors
assert "annotations1/bbox" in ds.tensors


def test_coco_ingestion_with_linked_images(
s3_path,
coco_ingestion_data,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):
file_to_group = {"annotations1.json": "base_annotations"}
ds = deeplake.ingest_coco(
**coco_ingestion_data,
file_to_group_mapping=file_to_group,
dest=local_path,
dest=s3_path,
image_params={"name": "linked_images", "htype": "link[image]"},
image_creds_key=hub_cloud_dev_managed_creds_key,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
"token": hub_cloud_dev_token,
},
)

assert ds.path == local_path
assert ds.path == hub_cloud_path
assert "linked_images" in ds.tensors
assert ds.linked_images.num_samples > 0
assert ds.linked_images.htype == "link[image]"
assert "base_annotations/bbox" in ds.tensors
assert "base_annotations/segmentation" in ds.tensors
assert ds.linked_images.htype == "link[image]"


def test_flat_coco_ingestion(local_path, coco_ingestion_data):
58 changes: 58 additions & 0 deletions deeplake/auto/tests/test_ingestion.py
@@ -159,6 +159,32 @@ def test_overwrite(local_ds: Dataset):
)


def test_ingestion_with_connection(
s3_path,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):
path = get_dummy_data_path("tests_auto/image_classification")
ds = deeplake.ingest_classification(
src=path,
dest=s3_path,
progressbar=False,
summary=False,
overwrite=False,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
"token": hub_cloud_dev_token,
},
)

assert ds.path == hub_cloud_path
assert "images" in ds.tensors
assert "labels" in ds.tensors
assert len(ds.labels.info["class_names"]) > 0


def test_csv(memory_ds: Dataset):
path = get_dummy_data_path("tests_auto/csv/deniro.csv")
with pytest.raises(InvalidPathException):
@@ -215,3 +241,35 @@ def test_dataframe(memory_ds: Dataset, convert_to_pathlib: bool):
assert ds["Title"].htype == "text"
assert ds["Title"].dtype == str
np.testing.assert_array_equal(ds["Title"].numpy().reshape(-1), df["Title"].values)


def test_dataframe_with_connect(
s3_path,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):
path = get_dummy_data_path("tests_auto/csv/deniro.csv")
df = pd.read_csv(path, quotechar='"', skipinitialspace=True)
ds = deeplake.ingest_dataframe(
df,
s3_path,
progressbar=False,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
"token": hub_cloud_dev_token,
},
)

assert ds.path == hub_cloud_path
assert list(ds.tensors) == ["Year", "Score", "Title"]
assert ds["Year"].dtype == df["Year"].dtype
np.testing.assert_array_equal(ds["Year"].numpy().reshape(-1), df["Year"].values)

assert ds["Score"].dtype == df["Score"].dtype
np.testing.assert_array_equal(ds["Score"].numpy().reshape(-1), df["Score"].values)

assert ds["Title"].htype == "text"
assert ds["Title"].dtype == str
np.testing.assert_array_equal(ds["Title"].numpy().reshape(-1), df["Title"].values)
38 changes: 25 additions & 13 deletions deeplake/auto/tests/test_yolo_template.py
@@ -118,38 +118,44 @@ def test_minimal_yolo_ingestion_poly(local_path, yolo_ingestion_data):
assert ds.polygons.htype == "polygon"


def test_minimal_yolo_ingestion_with_linked_images(local_path, yolo_ingestion_data):

def test_minimal_yolo_with_connect(
s3_path,
yolo_ingestion_data,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):
params = {
"data_directory": yolo_ingestion_data["data_directory"],
"class_names_file": yolo_ingestion_data["class_names_file"],
}

ds = deeplake.ingest_yolo(
**params,
dest=local_path,
image_params={
"name": "linked_images",
"htype": "link[image]",
"sample_compression": "png",
dest=s3_path,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
"token": hub_cloud_dev_token,
},
)

assert ds.path == local_path
assert "linked_images" in ds.tensors
assert ds.path == hub_cloud_path
assert "images" in ds.tensors
assert "boxes" in ds.tensors
assert "labels" in ds.tensors
assert len(ds.labels.info["class_names"]) > 0
assert ds.linked_images.htype == "link[image]"
assert ds.boxes.htype == "bbox"


def test_minimal_yolo_with_connect(
def test_minimal_yolo_ingestion_with_linked_images(
s3_path,
yolo_ingestion_data,
hub_cloud_path,
hub_cloud_dev_token,
hub_cloud_dev_managed_creds_key,
):

params = {
"data_directory": yolo_ingestion_data["data_directory"],
"class_names_file": yolo_ingestion_data["class_names_file"],
@@ -158,6 +164,12 @@ def test_minimal_yolo_with_connect(
ds = deeplake.ingest_yolo(
**params,
dest=s3_path,
image_params={
"name": "linked_images",
"htype": "link[image]",
"sample_compression": "png",
},
image_creds_key=hub_cloud_dev_managed_creds_key,
connect_kwargs={
"dest_path": hub_cloud_path,
"creds_key": hub_cloud_dev_managed_creds_key,
@@ -166,8 +178,8 @@ def test_minimal_yolo_with_connect(
)

assert ds.path == hub_cloud_path
assert "images" in ds.tensors
assert "linked_images" in ds.tensors
assert "boxes" in ds.tensors
assert "labels" in ds.tensors
assert len(ds.labels.info["class_names"]) > 0
assert ds.boxes.htype == "bbox"
assert ds.linked_images.htype == "link[image]"
6 changes: 3 additions & 3 deletions docs/source/deeplake.api.dataset.rst
@@ -39,11 +39,11 @@ deeplake.api.dataset

See :func:`deeplake.connect`.

.. staticmethod:: ingest_classification(src: Union[str, pathlib.Path], dest: Union[str, pathlib.Path], image_params: Optional[Dict] = None, label_params: Optional[Dict] = None, dest_creds: Optional[Dict] = None, progressbar: bool = True, summary: bool = True, token: Optional[str] = None, **dataset_kwargs) -> Dataset
.. staticmethod:: ingest_classification(src: Union[str, pathlib.Path], dest: Union[str, pathlib.Path], image_params: Optional[Dict] = None, label_params: Optional[Dict] = None, dest_creds: Optional[Dict] = None, progressbar: bool = True, summary: bool = True, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, **dataset_kwargs) -> Dataset

See :func:`deeplake.ingest_classification`.

.. staticmethod:: ingest_coco(images_directory: Union[str, pathlib.Path], annotation_files: Union[str, pathlib.Path, List[str]], dest: Union[str, pathlib.Path], key_to_tensor_mapping: Optional[Dict] = None, file_to_group_mapping: Optional[Dict] = None, ignore_one_group: bool = False, ignore_keys: Optional[List[str]] = None, image_settings: Optional[Dict] = None, src_creds: Optional[Dict] = None, dest_creds: Optional[Dict] = None, inspect_limit: int = 1000000, progressbar: bool = True, num_workers: int = 0, **dataset_kwargs) -> Dataset
.. staticmethod:: ingest_coco(images_directory: Union[str, pathlib.Path], annotation_files: Union[str, pathlib.Path, List[str]], dest: Union[str, pathlib.Path], key_to_tensor_mapping: Optional[Dict] = None, file_to_group_mapping: Optional[Dict] = None, ignore_one_group: bool = False, ignore_keys: Optional[List[str]] = None, image_settings: Optional[Dict] = None, src_creds: Optional[Dict] = None, dest_creds: Optional[Dict] = None, inspect_limit: int = 1000000, progressbar: bool = True, num_workers: int = 0, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, **dataset_kwargs) -> Dataset

See :func:`deeplake.ingest_coco`.

@@ -55,7 +55,7 @@ deeplake.api.dataset

See :func:`deeplake.ingest_kaggle`.

.. staticmethod:: ingest_dataframe(src, dest: Union[str, pathlib.Path], dest_creds: Optional[Dict] = None, progressbar: bool = True, token: Optional[str] = None, **dataset_kwargs)
.. staticmethod:: ingest_dataframe(src, dest: Union[str, pathlib.Path], dest_creds: Optional[Dict] = None, progressbar: bool = True, token: Optional[str] = None, connect_kwargs: Optional[Dict] = None, **dataset_kwargs)

See :func:`deeplake.ingest_dataframe`.

