diff --git a/.circleci/config.yml b/.circleci/config.yml
index 8ea761d075..fd10ee4bb1 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -90,11 +90,17 @@ commands:
   get-python:
     steps:
       - run:
-          name: "Install Python"
+          name: "Install Python & ffmpeg"
           command: |
            brew update
            brew install python@3.7
            brew link --overwrite python@3.7
+           brew install ffmpeg
+  get-linux-ffmpeg:
+    steps:
+      - run:
+          name: "Install ffmpeg"
+          command: sudo apt-get update && sudo apt-get install -y ffmpeg
   info:
     steps:
       - run:
@@ -168,6 +174,7 @@ commands:
           pytest --cov-report=xml --cov=./
       - when:
           condition: << parameters.unix-like >>
+          resource_class: medium+
           steps:
             - run:
                 name: "Running tests - Unix"
@@ -244,6 +251,7 @@ jobs:
       type: boolean
       default: false
     executor: << parameters.e >>
+    resource_class: medium+
     steps:
       - checkout
       - unless:
@@ -254,6 +262,14 @@
           condition: << parameters.mac-like >>
           steps:
             - get-python
+      - when:
+          condition:
+            and:
+              - not: << parameters.mac-like >>
+              - << parameters.unix-like >>
+              - << parameters.optional >>
+          steps:
+            - get-linux-ffmpeg
       - info
       - google-creds:
           unix-like: << parameters.unix-like >>
@@ -272,6 +288,7 @@
     executor: linux
     environment:
       IMAGE_NAME: snarkai/hub
+    resource_class: medium+
     steps:
      - setup_remote_docker
      - checkout
diff --git a/docs/source/integrations/supervisely.md b/docs/source/integrations/supervisely.md
new file mode 100644
index 0000000000..1e54d9f188
--- /dev/null
+++ b/docs/source/integrations/supervisely.md
@@ -0,0 +1,38 @@
+# Supervisely
+
+## Dataset to Supervisely Project
+Here is an example of converting a Hub dataset into a Supervisely project:
+
+```python
+from hub import Dataset, schema
+
+# Create dataset
+ds = Dataset(
+    "./dataset",
+    shape=(64,),
+    schema={
+        "image": schema.Image((512, 512, 3)),
+    },
+)
+
+# transform into Supervisely project
+project = ds.to_supervisely("sample-project")
+```
+
+## Supervisely Project to Dataset
+Conversely, a Hub dataset can be created from a Supervisely project:
+
+```python
+import hub
+
+out_ds = hub.Dataset.from_supervisely("sample-project")
+res_ds = out_ds.store("./dataset")
+```
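+
+`from_supervisely` also accepts optional `scheduler` ("single", "threaded", or "processed") and `workers` arguments to parallelize the conversion. A minimal sketch, reusing the `"sample-project"` from above:
+
+```python
+out_ds = hub.Dataset.from_supervisely(
+    "sample-project", scheduler="threaded", workers=4
+)
+```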
+ """ + from .integrations import _to_supervisely + + project = _to_supervisely(self, output) + return project + def _get_dictionary(self, subpath, slice_=None): """Gets dictionary from dataset given incomplete subpath""" tensor_dict = {} @@ -1067,6 +1079,24 @@ def from_pytorch(dataset, scheduler: str = "single", workers: int = 1): ds = _from_pytorch(dataset, scheduler, workers) return ds + @staticmethod + def from_supervisely(project, scheduler: str = "single", workers: int = 1): + """| Converts a supervisely project into hub format + + Parameters + ---------- + dataset: + The path to the supervisely project that needs to be converted into hub format + scheduler: str + choice between "single", "threaded", "processed" + workers: int + how many threads or processes to use + """ + from .integrations import _from_supervisely + + ds = _from_supervisely(project, scheduler, workers) + return ds + @staticmethod def from_path(path, scheduler="single", workers=1): # infer schema & get data (label -> input mapping with file refs) diff --git a/hub/api/integrations.py b/hub/api/integrations.py index f146888aa2..f8fc17961f 100644 --- a/hub/api/integrations.py +++ b/hub/api/integrations.py @@ -5,10 +5,15 @@ """ import sys +import numpy as np +import json +from itertools import chain from collections import defaultdict +import PIL.Image +import PIL.ImageDraw from hub.exceptions import ModuleNotInstalledException, OutOfBoundsError from hub.schema.features import Primitive, Tensor, SchemaDict -from hub.schema import Audio, BBox, ClassLabel, Image, Sequence, Text, Video +from hub.schema import Audio, BBox, ClassLabel, Image, Sequence, Text, Video, Mask from .dataset import Dataset import hub.store.pickle_s3_storage import hub.schema.serialize @@ -702,3 +707,211 @@ def __iter__(self): self._init_ds() for i in range(len(self)): yield self[i] + + +def _from_supervisely(project, scheduler: str = "single", workers: int = 1): + try: + import supervisely_lib as sly + from supervisely_lib.project import project as sly_image_project + from supervisely_lib.project import video_project as sly_video_project + from skvideo.io import FFmpegReader, vread + except ModuleNotFoundError: + raise ModuleNotInstalledException("supervisely") + if isinstance(project, str): + with open(project + "meta.json") as meta_file: + project_meta_dict = json.load(meta_file) + instantiated = False + else: + project_meta_dict = project.meta.to_json() + instantiated = True + project_type = project_meta_dict["projectType"] + mode = sly.OpenMode.READ + + def infer_image(paths): + bboxes, masks = [], [] + classes_bb, classes_mask = [], [] + item_path, item_ann_path = paths + + ann = sly.Annotation.load_json_file(item_ann_path, project.meta) + ann_dict = ann.to_json() + sizes = (ann_dict["size"]["height"], ann_dict["size"]["width"]) + for obj in ann_dict["objects"]: + if obj["geometryType"] == "rectangle": + bboxes.append( + [item for sublist in obj["points"]["exterior"] for item in sublist] + ) + classes_bb.append(obj["classTitle"]) + elif obj["geometryType"] == "polygon": + img = PIL.Image.new("L", (sizes[1], sizes[0]), 0) + PIL.ImageDraw.Draw(img).polygon( + [tuple(obj) for obj in obj["points"]["exterior"]], + outline=1, + fill=1, + ) + masks.append(np.array(img)) + classes_mask.append(obj["classTitle"]) + return sizes, bboxes, masks, classes_bb, classes_mask + + def infer_video(paths): + item_path, item_ann_path = paths + vreader = FFmpegReader(item_path) + return (vreader.getShape(),) + + def infer_project(project, project_type, read_mode): + 
+    def infer_project(project, project_type, read_mode):
+        if project_type == "images":
+            if not instantiated:
+                project = sly_image_project.Project(project, read_mode)
+            max_shape = (0, 0)
+            return (
+                project,
+                Image,
+                infer_image,
+                max_shape,
+            )
+        elif project_type == "videos":
+            if not instantiated:
+                project = sly_video_project.VideoProject(project, read_mode)
+            max_shape = (0, 0, 0, 0)
+            return (
+                project,
+                Video,
+                infer_video,
+                max_shape,
+            )
+
+    project, main_blob, infer_ds, max_shape = infer_project(project, project_type, mode)
+
+    image_paths = []
+    label_names = []
+    max_num_bboxes = 0
+    max_num_polys = 0
+    masks = False
+    datasets = project.datasets.items()
+    uniform = True
+    for ds in datasets:
+        for i, item in enumerate(ds):
+            path = ds.get_item_paths(item)
+            image_paths.append(path)
+            inf = infer_ds(path)
+            if len(inf) > 1:
+                if inf[3]:
+                    label_names.extend(inf[3])
+                    if len(inf[3]) > max_num_bboxes:
+                        max_num_bboxes = len(inf[3])
+                if inf[4]:
+                    label_names.extend(inf[4])
+                    if len(inf[4]) > max_num_polys:
+                        max_num_polys = len(inf[4])
+                if inf[2]:
+                    masks = True
+            shape = inf[0]
+            max_shape = np.maximum(shape, max_shape)
+            if uniform and max_shape.any() and (shape != max_shape).any():
+                uniform = False
+    label_names = list(np.unique(label_names))
+    items = chain(*datasets)
+    idatasets = iter(datasets)
+    ds, i = next(idatasets), 0
+    key = "shape" if uniform else "max_shape"
+    if project_type == "images":
+        read = sly.imaging.image.read
+        blob_shape = {key: (*max_shape.tolist(), 3)}
+    elif project_type == "videos":
+        read = vread
+        blob_shape = {key: max_shape.tolist()}
+        if key == "max_shape":
+            blob_shape["shape"] = (None, None, None, 3)
+
+    schema = {
+        project_type: main_blob(**blob_shape),
+    }
+    if max_num_bboxes:
+        schema["bbox"] = BBox(shape=(None, 4), max_shape=(max_num_bboxes, 4))
+    if label_names:
+        schema["label"] = ClassLabel(
+            shape=(None,),
+            max_shape=(max(max_num_bboxes, max_num_polys),),
+            names=label_names,
+        )
+    if masks:
+        schema["mask"] = Mask(
+            shape=(None, None, None), max_shape=(*max_shape.tolist(), 1)
+        )
+
+    @hub.transform(schema=schema, scheduler=scheduler, workers=workers)
+    def transformation(item):
+        nonlocal i, ds
+        sample = {}
+        if i >= len(ds):
+            ds, i = next(idatasets), 0
+        item_path, item_ann_path = ds.get_item_paths(item)
+        i += 1
+        _, bboxes, masks, classes_bbox, classes_mask = infer_ds(
+            (item_path, item_ann_path)
+        )
+        sample[project_type] = read(item_path)
+        if bboxes:
+            sample["bbox"] = np.array(bboxes)
+            sample["label"] = [label_names.index(c) for c in classes_bbox]
+        if masks:
+            sample["mask"] = np.expand_dims(masks[0], -1)
+            sample["label"] = [label_names.index(c) for c in classes_mask]
+        return sample
+
+    return transformation(list(items))
+
+
+def _to_supervisely(dataset, output):
+    try:
+        import supervisely_lib as sly
+        from skvideo.io import vwrite
+    except ModuleNotFoundError:
+        raise ModuleNotInstalledException("supervisely")
+    schema_dict = dataset.schema.dict_
+    for key, schem in schema_dict.items():
+        if isinstance(schem, Image):
+            project_type = "images"
+            extension = "jpeg"
+            break
+        elif isinstance(schem, Video):
+            project_type = "videos"
+            extension = "mp4"
+            break
+    else:
+        raise TypeError("dataset schema must contain an Image or Video feature")
+    mode = sly.OpenMode.CREATE
+    if project_type == "images":
+        _project = sly.Project
+    elif project_type == "videos":
+        _project = sly.VideoProject
+    else:
+        raise ValueError(f"unsupported project type: {project_type}")
+    pr = _project(output, mode)
+    meta = pr.meta
+    meta._project_type = project_type
+    # probably here we can create multiple datasets
+    out_ds = pr.create_dataset(output)
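+    # use the dataset's "filename" tensor for item names if it exists,
+    # otherwise fall back to zero-padded indices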
+    try:
+        fn_key = "filename"
+        dataset[fn_key]
+    except KeyError:
+        fn_key = None
+    zeroes = len(str(len(dataset)))
+    for idx, view in enumerate(dataset):
+        obj = view[key].compute()
+        if fn_key:
+            fn = view[fn_key].compute()
+        else:
+            fn = f"{idx:0{zeroes}}"
+        fn = "{}.{}".format(fn, extension)
+        # strangely, supervisely prevents using this method on videos
+        try:
+            out_ds.add_item_np(fn, obj)
+        except RuntimeError:
+            # fix with an in-memory file
+            path = "{}/{}".format(out_ds.item_dir, fn)
+            vwrite(path, obj)
+            out_ds._item_to_ann[fn] = fn + ".json"
+            out_ds.set_ann(fn, out_ds._get_empty_annotaion(path))
+    pr.set_meta(meta)
+    return pr
diff --git a/hub/api/tests/test_converters.py b/hub/api/tests/test_converters.py
index 9b0d269ee5..c15936d043 100644
--- a/hub/api/tests/test_converters.py
+++ b/hub/api/tests/test_converters.py
@@ -7,9 +7,16 @@
 import hub.api.tests.test_converters
 from hub.schema.features import Tensor
 import numpy as np
-from hub.utils import tfds_loaded, tensorflow_loaded, pytorch_loaded
+import shutil
+import os.path
+from hub.utils import (
+    tfds_loaded,
+    tensorflow_loaded,
+    pytorch_loaded,
+    supervisely_loaded,
+    Timer,
+)
 import pytest
-from hub.utils import Timer
 
 
 @pytest.mark.skipif(not tfds_loaded(), reason="requires tfds to be loaded")
@@ -464,6 +471,121 @@ def test_to_tensorflow_bug():
     data = ds.to_tensorflow()
 
 
+@pytest.mark.skipif(
+    not supervisely_loaded(), reason="requires supervisely to be loaded"
+)
+def test_to_supervisely():
+    data_path = "./data/test_supervisely/to_from"
+    dataset_name = "rock_paper_scissors_test"
+    if os.path.exists(data_path):
+        shutil.rmtree(data_path)
+    original_dataset = hub.Dataset(f"activeloop/{dataset_name}", mode="r")
+    project = original_dataset.to_supervisely(os.path.join(data_path, dataset_name))
+    trans = hub.Dataset.from_supervisely(project)
+    new_dataset = trans.store(os.path.join(data_path, "new_rpst"))
+
+
+@pytest.mark.skipif(
+    not supervisely_loaded(), reason="requires supervisely to be loaded"
+)
+def test_from_supervisely():
+    import supervisely_lib as sly
+
+    data_path = "./data/test_supervisely/from_to"
+    if os.path.exists(data_path):
+        shutil.rmtree(data_path)
+    project_name = "pixel_project"
+    project_path = os.path.join(data_path, project_name)
+    project = sly.Project(project_path, sly.OpenMode.CREATE)
+    init_meta = project.meta
+    project.meta._project_type = "images"
+    project_ds = project.create_dataset(project_name)
+    img = np.ones((30, 30, 3))
+    project_ds.add_item_np("pixel.jpeg", img)
+    item_path, item_ann_path = project_ds.get_item_paths("pixel.jpeg")
+    ann = sly.Annotation.load_json_file(item_ann_path, project.meta)
+    bbox_class = sly.ObjClass(name="_bbox", geometry_type=sly.Rectangle)
+    meta_with_bboxes = project.meta.add_obj_classes([bbox_class])
+    bbox_label = sly.Label(
+        geometry=sly.Rectangle(0, 0, 10, 10),
+        obj_class=meta_with_bboxes.obj_classes.get("_bbox"),
+    )
+    ann_with_bboxes = ann.add_labels([bbox_label])
+    project_ds.set_ann("pixel.jpeg", ann_with_bboxes)
+    project.set_meta(meta_with_bboxes)
+
+    trans = hub.Dataset.from_supervisely(project)
+    dataset = trans.store(os.path.join(data_path, "pixel_dataset_bbox"))
+    project_back = dataset.to_supervisely(
+        os.path.join(data_path, "pixel_project_bbox_back")
+    )
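+    # restore the original meta, then repeat the round trip with a polygon annotation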
+    project.set_meta(init_meta)
+    poly_class = sly.ObjClass(name="_poly", geometry_type=sly.Polygon)
+    meta_with_poly = project.meta.add_obj_classes([poly_class])
+    points = [[0, 0], [0, 10], [10, 0], [10, 10]]
+    point_loc_points = [
+        sly.geometry.point_location.PointLocation(*point) for point in points
+    ]
+    poly_label = sly.Label(
+        geometry=sly.Polygon(exterior=point_loc_points, interior=[]),
+        obj_class=meta_with_poly.obj_classes.get("_poly"),
+    )
+    ann_with_polys = ann.add_labels([poly_label])
+    project_ds.set_ann("pixel.jpeg", ann_with_polys)
+    project.set_meta(meta_with_poly)
+    trans = hub.Dataset.from_supervisely(project)
+    dataset = trans.store(os.path.join(data_path, "pixel_dataset_poly"))
+    project_back = dataset.to_supervisely(
+        os.path.join(data_path, "pixel_project_poly_back")
+    )
+
+
+@pytest.mark.skipif(
+    not supervisely_loaded(), reason="requires supervisely to be loaded"
+)
+def test_to_supervisely_video():
+    data_path = "./data/test_supervisely/video_to"
+    if os.path.exists(data_path):
+        shutil.rmtree(data_path)
+    schema = {
+        "vid": hub.schema.Video(shape=(1, 1, 2, 3)),
+        "filename": hub.schema.Text(max_shape=5),
+    }
+    ds = hub.Dataset(
+        os.path.join(data_path, "hub_video_dataset"), schema=schema, shape=3
+    )
+    filenames = ["one", "two", "three"]
+    ds["filename"][:] = filenames
+    project = ds.to_supervisely(os.path.join(data_path, "sly_video_dataset"))
+
+
+@pytest.mark.skipif(
+    not supervisely_loaded(), reason="requires supervisely to be loaded"
+)
+def test_from_supervisely_video():
+    import supervisely_lib as sly
+    from skvideo.io import vwrite
+
+    data_path = "./data/test_supervisely/video_from"
+    if os.path.exists(data_path):
+        shutil.rmtree(data_path)
+    project_name = "minuscule_videos"
+    project_path = os.path.join(data_path, project_name)
+    project = sly.VideoProject(project_path, sly.OpenMode.CREATE)
+    project.meta._project_type = "videos"
+    item_name = "item.mp4"
+    np.random.seed(0)
+    for name in ["foofoo", "bar"]:
+        ds = project.create_dataset(name)
+        item_path = os.path.join(ds.item_dir, item_name)
+        vwrite(item_path, (np.random.rand(len(name), 2, 2, 3) * 255).astype("uint8"))
+        ds._item_to_ann[item_name] = item_name + ".json"
+        ds.set_ann(item_name, ds._get_empty_annotaion(item_path))
+    project.set_meta(project.meta)
+    trans = hub.Dataset.from_supervisely(os.path.join(data_path, project_name))
+
+
 @pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")
 def test_to_pytorch_shuffle():
     schema = {
diff --git a/hub/utils.py b/hub/utils.py
index 5ca5385362..c43432dd9e 100644
--- a/hub/utils.py
+++ b/hub/utils.py
@@ -157,6 +157,14 @@ def pathos_loaded():
     return True
 
 
+def supervisely_loaded():
+    try:
+        import supervisely_lib
+    except ImportError:
+        return False
+    return True
+
+
 def compute_lcm(a):
     """
     Lowest Common Multiple of a list a
diff --git a/requirements-optional.txt b/requirements-optional.txt
index acc2ede88c..a5fd7959f8 100644
--- a/requirements-optional.txt
+++ b/requirements-optional.txt
@@ -5,3 +5,4 @@ transformers>=3.5.1
 dask[complete]>=2.30
 tensorflow_datasets
 ray==1.3.0
+supervisely==6.1.64