merge

Gyanachand1 · Nov 16, 2020 · af43b91 · af43b91
2 parents ac4547e + 1fdaa54
commit af43b91
Show file tree

Hide file tree

Showing 10 changed files with 157 additions and 54 deletions.
diff --git a/examples/eurosat.py b/examples/eurosat.py
@@ -0,0 +1,41 @@
+import hub
+import torch
+
+
+def _print_first_batch(batches, keys):
+    """Print the `keys` entries of the first batch produced by `batches`, if any."""
+    for batch in batches:
+        print(*(batch[key] for key in keys))
+        return
+
+
+if __name__ == "__main__":
+    ds = hub.Dataset("eurosat/eurosat-rgb")
+
+    # 26000 samples in dataset, accessing values
+    print(ds["image"][10].numpy())
+    print(ds["label", 15].numpy())  # key and sample number can also be given together
+    print(ds["filename", 20:22].numpy())  # a slice reads several samples at once
+
+    # Split into train and test sets: first 13000 samples / everything after them
+    train_ds = ds[:13000]
+    test_ds = ds[13000:]
+
+    # Using hub with tensorflow
+    tf_keys = ("label", "filename", "image")
+    train_tf_ds = train_ds.to_tensorflow().batch(2)
+    _print_first_batch(train_tf_ds, tf_keys)
+
+    test_tf_ds = test_ds.to_tensorflow().batch(2)
+    _print_first_batch(test_tf_ds, tf_keys)
+
+    # Using hub with pytorch
+    # pytorch tensors don't support text labels such as filename, so it is left out
+    pt_keys = ("label", "image")
+    train_pt_ds = train_ds.to_pytorch()
+    train_loader = torch.utils.data.DataLoader(train_pt_ds, batch_size=2)
+    _print_first_batch(train_loader, pt_keys)
+
+    test_pt_ds = test_ds.to_pytorch()
+    test_loader = torch.utils.data.DataLoader(test_pt_ds, batch_size=2)
+    _print_first_batch(test_loader, pt_keys)
diff --git a/hub/api/dataset.py b/hub/api/dataset.py
@@ -36,7 +36,8 @@
     ShapeArgumentNotFoundException,
     SchemaArgumentNotFoundException,
     ModuleNotInstalledException,
-    WrongUsernameException,
+    NoneValueException,
+    ShapeLengthException
 )
 from hub.store.metastore import MetaStorage
 from hub.client.hub_control import HubControlClient
@@ -76,7 +77,6 @@ def __init__(
         lock_cache=True,
     ):
         """Open a new or existing dataset for read/write
-
         Parameters
         ----------
         url: str
@@ -106,8 +106,10 @@ def __init__(
         if isinstance(shape, int):
             shape = [shape]
         if shape is not None:
-            assert len(tuple(shape)) == 1
-        assert mode is not None
+            if len(tuple(shape)) != 1:
+                raise ShapeLengthException
+        if mode is None:
+            raise NoneValueException('mode')
 
         self.url = url
         self.token = token
@@ -177,12 +179,6 @@ def _check_and_prepare_dir(self):
         Returns True dataset needs to be created opposed to read.
         """
         fs, path, mode = self._fs, self._path, self.mode
-        if path.startswith("s3://"):
-            with open(os.path.expanduser("~/.activeloop/store"), "rb") as f:
-                stored_username = json.load(f)["_id"]
-            current_username = path.split("/")[-2]
-            if stored_username != current_username:
-                raise WrongUsernameException(current_username)
         exist_meta = fs.exists(posixpath.join(path, "meta.json"))
         if exist_meta:
             if "w" in mode:
@@ -258,12 +254,9 @@ def _open_storage_tensors(self):
     def __getitem__(self, slice_):
         """| Gets a slice or slices from dataset
         | Usage:
-
         >>> return ds["image", 5, 0:1920, 0:1080, 0:3].numpy() # returns numpy array
-
         >>> images = ds["image"]
         >>> return images[5].numpy() # returns numpy array
-
         >>> images = ds["image"]
         >>> image = images[5]
         >>> return image[0:1920, 0:1080, 0:3].numpy()
@@ -296,9 +289,7 @@ def __getitem__(self, slice_):
     def __setitem__(self, slice_, value):
         """| Sets a slice or slices with a value
         | Usage
-
          >>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
-
         >>> images = ds["image"]
         >>> image = images[5]
         >>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
@@ -327,19 +318,35 @@ def delete(self):
             return True
         return False
 
-    def to_pytorch(self, Transform=None):
-        """Converts the dataset into a pytorch compatible format"""
+    def to_pytorch(self, Transform=None, offset=None, num_samples=None):
+        """Converts the dataset into a pytorch compatible format
+        Parameters
+        ----------
+        Transform: callable, optional
+            Forwarded to TorchDataset as its transform
+        offset: int, optional
+            Forwarded to TorchDataset, which adds it to every sample index it is asked for
+        num_samples: int, optional
+            Forwarded to TorchDataset, which reports it as its length when it is not None
+        Raises
+        ------
+        ModuleNotInstalledException
+            If torch has not been imported
+        """
         if "torch" not in sys.modules:
             raise ModuleNotInstalledException("torch")
-        return TorchDataset(self, Transform)
+        return TorchDataset(
+            self,
+            transform=Transform,
+            num_samples=num_samples,
+            offset=offset,
+        )
 
-    def to_tensorflow(self):
-        """Converts the dataset into a tensorflow compatible format"""
+    def to_tensorflow(self, offset=None, num_samples=None):
+        """Converts the dataset into a tensorflow compatible format
+        Parameters
+        ----------
+        offset: int, optional
+            Index of the first sample to convert. Defaults to 0
+        num_samples: int, optional
+            Number of samples to convert, counted from offset.
+            Defaults to every sample from offset up to the end of the dataset
+        Raises
+        ------
+        ModuleNotInstalledException
+            If tensorflow has not been imported
+        """
         if "tensorflow" not in sys.modules:
             raise ModuleNotInstalledException("tensorflow")
+        offset = 0 if offset is None else offset
+        # Default to the samples remaining after `offset`: using the full length here
+        # would make tf_gen index past the end of the dataset whenever offset > 0.
+        num_samples = (self.shape[0] - offset) if num_samples is None else num_samples
 
         def tf_gen():
-            for index in range(self.shape[0]):
+            for index in range(offset, offset + num_samples):
                 d = {}
                 for key in self._tensors.keys():
                     split_key = key.split("/")
@@ -368,6 +375,8 @@ def dtype_to_tf(my_dtype):
             elif isinstance(my_dtype, Tensor):
                 return tensor_to_tf(my_dtype)
             elif isinstance(my_dtype, Primitive):
+                if str(my_dtype._dtype) == "object":
+                    return "string"
                 return str(my_dtype._dtype)
 
         def get_output_shapes(my_dtype):
@@ -466,7 +475,6 @@ def from_tensorflow(ds):
         ds = ds.to_tensorflow()
         out_ds = hub.Dataset.from_tensorflow(ds)
         res_ds = out_ds.store("username/new_dataset") # res_ds is now a usable hub dataset
-
         """
         if "tensorflow" not in sys.modules:
             raise ModuleNotInstalledException("tensorflow")
@@ -483,8 +491,9 @@ def tf_to_hub(tf_dt):
                 return TensorSpec_to_hub(tf_dt)
 
         def TensorSpec_to_hub(tf_dt):
+            # tensorflow reports text as "string"; the hub schema dtype used for it is "object"
+            tf_name = tf_dt.dtype.name
+            hub_dtype = "object" if tf_name == "string" else tf_name
             shape = tf_dt.shape if tf_dt.shape.rank is not None else (None,)
-            return Tensor(shape=shape, dtype=tf_dt.dtype.name)
+            return Tensor(shape=shape, dtype=hub_dtype)
 
         def dict_to_hub(tf_dt):
             d = {
@@ -623,11 +632,13 @@ def my_transform(sample):
 
 
 class TorchDataset:
-    def __init__(self, ds, transform=None):
+    def __init__(self, ds, transform=None, num_samples=None, offset=None):
+        """Records what is needed to serve samples of a hub dataset
+        Parameters
+        ----------
+        ds:
+            Source dataset; only its url and token are stored here
+        transform: callable, optional
+            Used by _do_transform, which returns its input unchanged when no transform is set
+        num_samples: int, optional
+            Length reported by __len__ when it is not None
+        offset: int, optional
+            Added to every index received by __getitem__ when it is not None
+        """
         self._ds = None
         self._url = ds.url
         self._token = ds.token
         self._transform = transform
+        self.num_samples = num_samples  # explicit length of the served window, if any
+        self.offset = offset  # index shift applied by __getitem__, if any
 
     def _do_transform(self, data):
         return self._transform(data) if self._transform else data
@@ -641,9 +652,10 @@ def _init_ds(self):
 
     def __len__(self):
+        """Returns the number of samples that can be indexed through this object"""
         self._init_ds()
-        return self._ds.shape[0]
+        if self.num_samples is not None:
+            return self.num_samples
+        # __getitem__ shifts every index by `offset`, so without an explicit
+        # num_samples only the samples from `offset` to the end are reachable;
+        # reporting the full length would let the last indices overrun the dataset.
+        return self._ds.shape[0] - (self.offset or 0)
 
     def __getitem__(self, index):
+        index = index + self.offset if self.offset is not None else index
         self._init_ds()
         d = {}
         for key in self._ds._tensors.keys():
@@ -655,8 +667,8 @@ def __getitem__(self, index):
                 else:
                     cur[split_key[i]] = {}
                     cur = cur[split_key[i]]
-
-            cur[split_key[-1]] = torch.tensor(self._ds._tensors[key][index])
+            if not isinstance(self._ds._tensors[key][index], bytes):
+                cur[split_key[-1]] = torch.tensor(self._ds._tensors[key][index])
         return d
 
     def __iter__(self):

diff --git a/hub/api/datasetview.py b/hub/api/datasetview.py
@@ -1,5 +1,6 @@
 from hub.api.tensorview import TensorView
 from hub.api.dataset_utils import slice_extract_info, slice_split
+from hub.exceptions import NoneValueException
 import collections.abc as abc
 
 
@@ -21,9 +22,12 @@ def __init__(
         offset: int
             The offset from which the DatasetView starts
         """
-        assert dataset is not None
-        assert num_samples is not None
-        assert offset is not None
+        if dataset is None:
+            raise NoneValueException('dataset')
+        if num_samples is None:
+            raise NoneValueException('num_samples')
+        if offset is None:
+            raise NoneValueException('offset')
 
         self.dataset = dataset
         self.num_samples = num_samples
@@ -132,3 +136,12 @@ def __iter__(self):
 
     def __len__(self):
         return self.num_samples
+
+    def to_tensorflow(self):
+        """Converts this view into a tensorflow compatible format
+        The call is forwarded to the underlying dataset together with the
+        view's offset and num_samples.
+        """
+        window = {"offset": self.offset, "num_samples": self.num_samples}
+        return self.dataset.to_tensorflow(**window)
+
+    def to_pytorch(self, Transform=None):
+        """Converts this view into a pytorch compatible format
+        Parameters
+        ----------
+        Transform: optional
+            Passed through unchanged to the underlying dataset's to_pytorch,
+            together with the view's offset and num_samples
+        """
+        window = {"offset": self.offset, "num_samples": self.num_samples}
+        return self.dataset.to_pytorch(Transform=Transform, **window)
+
diff --git a/hub/api/tensorview.py b/hub/api/tensorview.py
@@ -1,5 +1,6 @@
 import collections.abc as abc
 from hub.api.dataset_utils import slice_split
+from hub.exceptions import NoneValueException
 
 
 class TensorView:
@@ -21,8 +22,10 @@ def __init__(
             The `slice_` of this Tensor that needs to be accessed
         """
 
-        assert dataset is not None
-        assert subpath is not None
+        if dataset is None:
+            raise NoneValueException('dataset')
+        if subpath is None:
+            raise NoneValueException('subpath')
 
         self.dataset = dataset
         self.subpath = subpath

diff --git a/hub/api/tests/test_converters.py b/hub/api/tests/test_converters.py
@@ -50,18 +50,23 @@ def test_to_from_tensorflow():
             "d": {"e": Tensor((5, 3), "uint8")},
             "f": "float"
         },
+        "named_label": "object"
     }
 
     ds = hub.Dataset(schema=my_schema, shape=(10,), url="./data/test_from_tf/ds3", mode="w")
     for i in range(10):
         ds["label", "d", "e", i] = i * np.ones((5, 3))
+        ds["named_label", i] = 'try' + str(i)
     ds = ds.to_tensorflow()
     out_ds = hub.Dataset.from_tensorflow(ds)
     res_ds = out_ds.store("./data/test_from_tf/ds4", length=10)  # generator has no length, argument needed
     for i in range(10):
         assert (
             res_ds["label", "d", "e", i].numpy() == i * np.ones((5, 3))
         ).all()
+        assert (
+            res_ds["named_label", i].numpy().decode('utf-8') == 'try' + str(i)
+        )
 
 
 @pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")

diff --git a/hub/compute/tests/test_pipeline.py b/hub/compute/tests/test_pipeline.py
@@ -35,8 +35,8 @@ def my_transform(sample, multiplier: int = 2):
         }
 
     out_ds = my_transform(ds, multiplier=2)
-    assert (out_ds["image", 0:2].numpy() == 2).all()
-    # assert len(list(out_ds)) == 100
+    assert (out_ds["image", 0].numpy() == 2).all()
+    assert len(list(out_ds)) == 100
     res_ds = out_ds.store("./data/test/test_pipeline_basic_output")
 
     assert res_ds["label", 5].numpy() == "hello 5"
@@ -133,12 +133,13 @@ def my_transform(sample):
 
         with Timer(name):
             out_ds = my_transform(ds_fs)
-            res_ds = out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
+            out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
 
 if __name__ == "__main__":
     test_pipeline_basic()
     test_pipeline_dynamic()
-    # test_pipeline_basic()
-
+
     # test_pathos()
-    # benchmark()
+    # benchmark()
+
+
diff --git a/hub/compute/transform.py b/hub/compute/transform.py
@@ -1,6 +1,8 @@
-from hub.api.dataset import Dataset
+import os
 from typing import Dict
+
 import hub
+from hub.api.dataset import Dataset
 from tqdm import tqdm
 from collections.abc import MutableMapping
 from hub.features.features import Primitive
@@ -203,7 +205,8 @@ def __getitem__(self, slice_):
         num, ofs = slice_extract_info(slice_list[0], self.shape[0])
         ds_view = DatasetView(dataset=self._ds, num_samples=num, offset=ofs)
 
-        new_ds = self.store("~/.activeloop/tmp/array", length=num, ds=ds_view, progressbar=True)
+        path = os.path.expanduser("~/.activeloop/tmp")
+        new_ds = self.store(path, length=num, ds=ds_view, progressbar=True)
         slice_[1] = slice(None, None, None)  # Get all shape dimension since we already sliced
         return new_ds[slice_]
 

diff --git a/hub/exceptions.py b/hub/exceptions.py
@@ -169,6 +169,15 @@ def __init__(self, correct_shape, wrong_shape):
         message = f"parameter 'value': expected array with shape {correct_shape}, got {wrong_shape}"
         super(HubException, self).__init__(message=message)
 
+class NoneValueException(HubException):
+    """Raised when a required parameter is None"""
+
+    def __init__(self, param):
+        # `param` is the name of the offending parameter; it is embedded in the message
+        super(HubException, self).__init__(
+            message=f"Parameter '{param}' should be provided"
+        )
+
+class ShapeLengthException(HubException):
+    """Raised when the 'shape' parameter does not have exactly one element"""
+
+    def __init__(self):
+        super(HubException, self).__init__(
+            message="Parameter 'shape' should be a tuple of length 1"
+        )
 
 class ModuleNotInstalledException(HubException):
     def __init__(self, module_name):
@@ -196,6 +205,22 @@ def __init__(self):
                   "The provided directory is not empty and doesn't contain information about any Hub Dataset "
         super(HubException, self).__init__(message=message)
 
+class DynamicTensorNotFoundException(HubException):
+    """Raised when a dynamic tensor cannot be found"""
+
+    def __init__(self):
+        # Must derive from HubException: the two-argument super() call below raises
+        # TypeError unless self is an instance of HubException.
+        message = "Unable to find dynamic tensor"
+        super(HubException, self).__init__(message=message)
+
+class DynamicTensorShapeException(HubException):
+    """Raised when the 'shape' and 'max_shape' parameters are inconsistent
+    Parameters
+    ----------
+    exc_type: str
+        Selects the message: 'none', 'length' or 'not_equal'.
+        Any other value produces a generic message
+    """
+
+    def __init__(self, exc_type):
+        if exc_type == 'none':
+            message = "Parameter 'max_shape' shouldn't contain any 'None' value"
+        elif exc_type == 'length':
+            message = "Lengths of 'shape' and 'max_shape' should be equal"
+        elif exc_type == 'not_equal':
+            message = "All not-None values from 'shape' should be equal to the corresponding values in 'max_shape'"
+        else:
+            message = "Wrong 'shape' or 'max_shape' values"
+        # Must derive from HubException: the two-argument super() call below raises
+        # TypeError unless self is an instance of HubException.
+        super(HubException, self).__init__(message=message)
 
 class NotIterable(HubException):
     def __init__(self):
@@ -212,10 +237,5 @@ def __init__(self):
 class NotZarrFolderException(Exception):
     pass
 
-
 class StorageTensorNotFoundException(Exception):
     pass
-
-
-class DynamicTensorNotFoundException(Exception):
-    pass