Merge branch 'main' of github.com:activeloopai/Hub into task/remove-chunk-sizes
verbose-void committed Jul 13, 2021
2 parents 536bc1d + b6e5601 commit cbdc896
Showing 17 changed files with 93 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -112,7 +112,7 @@ commands:
steps:
- run:
name: "Gather machine info"
command: python3 -c "import platform as p; print('{}\nPython {}'.format(p.platform(), p.sys.version))"
command: python3 -c "import platform as p; print(f'{p.platform()}\nPython {p.sys.version}')"

google-creds:
parameters:
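
Note: most edits in this merge swap %-interpolation and .format() calls for f-strings. The three forms are equivalent; a minimal sketch:

    name, version = "Hub", "2.0"
    assert ("%s v%s" % (name, version)
            == "{} v{}".format(name, version)
            == f"{name} v{version}")
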
11 changes: 5 additions & 6 deletions conftest.py
@@ -87,15 +87,14 @@ def pytest_addoption(parser):
CACHE_OPT,
action="store_true",
help="Tests using the `storage` fixture may run with combinations of all enabled providers in cache chains. "
"For example, if the option `%s` is not provided, all cache chains that use `S3Provider`"
" are skipped." % (S3_OPT),
f"For example, if the option `{S3_OPT}` is not provided, all cache chains that use `S3Provider`"
" are skipped.",
)
parser.addoption(
CACHE_ONLY_OPT,
action="store_true",
help="Force enables `%s`. `storage` fixture only returns cache chains. For example, if `%s` is provided, \
`storage` will never be just `S3Provider`."
% (CACHE_OPT, S3_OPT),
help=f"Force enables `{CACHE_OPT}`. `storage` fixture only returns cache chains. For example, if `{S3_OPT}` is provided, \
`storage` will never be just `S3Provider`.",
)
parser.addoption(
S3_PATH_OPT,
@@ -273,7 +272,7 @@ def flower_path():

def print_session_id():
print("\n\n----------------------------------------------------------")
print("Testing session ID: %s" % SESSION_ID)
print(f"Testing session ID: {SESSION_ID}")
print("----------------------------------------------------------")


2 changes: 1 addition & 1 deletion hub/api/dataset.py
@@ -172,6 +172,7 @@ def create_tensor(
if tensor_exists(name, self.storage):
raise TensorAlreadyExistsError(name)

self.meta.tensors.append(name)
create_tensor(
name,
self.storage,
@@ -183,7 +184,6 @@ def create_tensor(
tensor = Tensor(name, self.storage) # type: ignore

self.tensors[name] = tensor
self.meta.tensors.append(name)

return tensor

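
Note: appending the name to self.meta.tensors before the storage-level create_tensor call means the dataset meta lists the tensor as soon as it is written out, so an empty tensor survives a reopen (the new test_empty_dataset below relies on this). A hedged sketch of the behavior, using the API as shown in the diff:

    ds = Dataset("test")
    ds.create_tensor("x")     # name recorded in ds.meta.tensors during creation
    ds.flush()                # meta now persists "x" even though no samples exist

    ds = Dataset("test")      # reopening reads the meta back
    assert list(ds.tensors) == ["x"]
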
40 changes: 31 additions & 9 deletions hub/api/tests/test_api.py
@@ -12,6 +12,7 @@
)
from hub.client.client import HubBackendClient
from hub.client.utils import has_hub_testing_creds
from click.testing import CliRunner


# need this for 32-bit and 64-bit systems to have correct tests
@@ -448,20 +449,41 @@ def test_fails_on_wrong_tensor_syntax(memory_ds):
def test_hub_cloud_dataset():
username = "testingacc"
password = os.getenv("ACTIVELOOP_HUB_PASSWORD")
id = str(uuid.uuid1())

uri = f"hub://{username}/hub2ds2_{id}"

client = HubBackendClient()
token = client.request_auth_token(username, password)
id = str(uuid.uuid1())
ds = Dataset(f"hub://testingacc/hub2ds2_{id}", token=token)
ds.create_tensor("image")

for i in range(10):
ds.image.append(i * np.ones((100, 100)))
with Dataset(uri, token=token) as ds:
ds.create_tensor("image")
ds.create_tensor("label", htype="class_label")

token = ds.token
del ds
ds = Dataset(f"hub://testingacc/hub2ds2_{id}", token=token)
for i in range(10):
for i in range(1000):
ds.image.append(i * np.ones((100, 100)))
ds.label.append(np.uint32(i))

ds = Dataset(uri, token=token)
for i in range(1000):
np.testing.assert_array_equal(ds.image[i].numpy(), i * np.ones((100, 100)))
np.testing.assert_array_equal(ds.label[i].numpy(), np.uint32(i))

ds.delete()


@parametrize_all_dataset_storages
def test_hub_dataset_suffix_bug(ds):
# creating dataset with similar name but some suffix removed from end
ds2 = Dataset(ds.path[:-1])
ds2.delete()


def test_empty_dataset():
with CliRunner().isolated_filesystem():
ds = Dataset("test")
ds.create_tensor("x")
ds.create_tensor("y")
ds.create_tensor("z")
ds = Dataset("test")
assert list(ds.tensors) == ["x", "y", "z"]
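
Note: the rewritten cloud test opens the dataset as a context manager; assuming __exit__ flushes pending writes, the pattern is:

    with Dataset(uri, token=token) as ds:
        ds.create_tensor("image")
        ds.image.append(np.ones((100, 100)))
    # exiting the block flushes, so a fresh handle sees the data
    ds = Dataset(uri, token=token)
    assert ds.image[0].numpy().shape == (100, 100)
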
2 changes: 1 addition & 1 deletion hub/client/client.py
@@ -157,7 +157,7 @@ def get_dataset_credentials(
Returns:
tuple: containing full url to dataset, credentials, mode and expiration time respectively.
"""
relative_url = GET_DATASET_CREDENTIALS_SUFFIX % (org_id, ds_name)
relative_url = GET_DATASET_CREDENTIALS_SUFFIX.format(org_id, ds_name)
response = self.request(
"GET",
relative_url,
2 changes: 1 addition & 1 deletion hub/client/config.py
@@ -11,7 +11,7 @@

GET_TOKEN_SUFFIX = "/api/user/token"
REGISTER_USER_SUFFIX = "/api/user/register"
GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/%s/ds/%s/creds"
GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/{}/ds/{}/creds"
CREATE_DATASET_SUFFIX = "/api/dataset/create"
DATASET_SUFFIX = "/api/dataset"
UPDATE_SUFFIX = "/api/org/{}/dataset/{}"
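
Note: moving the route template from %s placeholders to {} placeholders shifts interpolation from the % operator to str.format; with hypothetical IDs:

    GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/{}/ds/{}/creds"
    url = GET_DATASET_CREDENTIALS_SUFFIX.format("my_org", "my_ds")
    assert url == "/api/org/my_org/ds/my_ds/creds"
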
4 changes: 1 addition & 3 deletions hub/core/meta/tensor_meta.py
@@ -83,9 +83,7 @@ def check_compatibility(self, shape: Tuple[int], dtype):
actual_shape_len = len(shape)
if expected_shape_len != actual_shape_len:
raise TensorInvalidSampleShapeError(
"Sample shape length is expected to be {}, actual length is {}.".format(
expected_shape_len, actual_shape_len
),
f"Sample shape length is expected to be {expected_shape_len}, actual length is {actual_shape_len}.",
shape,
)

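
Note: check_compatibility compares only dimension counts at this point, so the message reports shape lengths rather than the shapes themselves. A hedged sketch of how the error fires (exception class from hub/util/exceptions.py below):

    from hub.util.exceptions import TensorInvalidSampleShapeError

    expected_shape_len = len((100, 100))   # e.g. the tensor's first sample
    incoming = (100, 100, 3)               # incoming sample has an extra axis
    if expected_shape_len != len(incoming):
        raise TensorInvalidSampleShapeError(
            f"Sample shape length is expected to be {expected_shape_len}, actual length is {len(incoming)}.",
            incoming,
        )
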
4 changes: 3 additions & 1 deletion hub/core/storage/s3.py
@@ -149,7 +149,7 @@ def _list_keys(self):
items = items["Contents"]
names = [item["Key"] for item in items]
# removing the prefix from the names
len_path = len(self.path.split("/"))
len_path = len(self.path.split("/")) - 1
names = ["/".join(name.split("/")[len_path:]) for name in names]
return names
except Exception as err:
@@ -218,6 +218,8 @@ def _set_bucket_and_path(self):
root = self.root.replace("s3://", "")
self.bucket = root.split("/")[0]
self.path = "/".join(root.split("/")[1:])
if not self.path.endswith("/"):
self.path += "/"

def _set_hub_creds_info(self, tag: str, expiration: str):
"""Sets the tag and expiration of the credentials. These are only relevant to datasets using Hub storage.
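
Note: the two s3.py changes work together. self.path is normalized to end with "/", so the prefix-length count must ignore the trailing empty segment, hence the - 1. A worked example with a hypothetical layout:

    path = "my-dataset/"                  # normalized with a trailing slash
    len_path = len(path.split("/")) - 1   # ["my-dataset", ""] -> 2 - 1 = 1
    key = "my-dataset/images/chunk0"
    assert "/".join(key.split("/")[len_path:]) == "images/chunk0"
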
11 changes: 4 additions & 7 deletions hub/integrations/pytorch.py
@@ -1,3 +1,4 @@
from hub.util.dataset import try_flushing
from hub.constants import MB
from hub.util.keys import get_chunk_key
from hub.core.storage.lru_cache import LRUCache
@@ -12,7 +13,6 @@
from hub.util.exceptions import (
DatasetUnsupportedPytorch,
ModuleNotInstalledException,
TensorDoesNotExistError,
)
from hub.util.shared_memory import (
remove_shared_memory_from_resource_tracker,
@@ -79,7 +79,7 @@ def dataset_to_pytorch(
collate_fn: Optional[Callable] = None,
pin_memory: Optional[bool] = False,
):
dataset.flush()
try_flushing(dataset)
_import_torch()
# TODO new pytorch approach doesn't support 0 workers currently
num_workers = max(num_workers, 1)
@@ -145,9 +145,6 @@ def __init__(
# keeps track of names of all shared_memory that have data in them
self.all_shared_memory_names: Dict[str, List[str]] = defaultdict(list)

# keeps pointers to shared memory across tensors so they don't get closed between calls to getitem
self.all_shared_memory: Dict = defaultdict(list)

self.last_chunk_num_generated = -1

def __len__(self):
@@ -270,8 +267,8 @@ def _get_data_from_chunks(
for chunk_name, shared_memory_name, chunk_size in zip(
chunk_names, shared_memory_names, chunk_sizes
):
self.all_shared_memory[key].append(SharedMemory(name=shared_memory_name))
chunk = Chunk.frombuffer(self.all_shared_memory[key][-1].buf[:chunk_size])
shared_memory = SharedMemory(name=shared_memory_name)
chunk = Chunk.frombuffer(shared_memory.buf[:chunk_size])
chunk_map[chunk_name] = chunk

# saves np array for each index in memory
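
Note: the worker now attaches to each shared-memory segment with a short-lived local handle instead of caching handles per tensor; this is only safe if Chunk.frombuffer copies the bytes out of the buffer, which the change implies. Attaching and copying looks roughly like:

    from multiprocessing.shared_memory import SharedMemory

    shm = SharedMemory(name="chunk_shm_0")  # attach to an existing segment (hypothetical name)
    payload = bytes(shm.buf[:1024])         # copy the bytes out while the handle is alive
    shm.close()                             # safe to close once the data has been copied
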
4 changes: 3 additions & 1 deletion hub/integrations/pytorch_old.py
@@ -1,3 +1,4 @@
from hub.util.dataset import try_flushing
from hub.core.storage.memory import MemoryProvider
from hub.util.remove_cache import get_base_storage
from typing import Callable, Union, List, Optional, Dict, Tuple, Sequence
@@ -20,6 +21,8 @@ def dataset_to_pytorch(
pin_memory: Optional[bool] = False,
python_version_warning: bool = True,
):
try_flushing(dataset)

global torch
try:
import torch
@@ -28,7 +31,6 @@
"'torch' should be installed to convert the Dataset into pytorch format"
)

dataset.flush()
pytorch_ds = TorchDataset(
dataset,
transform,
19 changes: 19 additions & 0 deletions hub/integrations/tests/test_pytorch.py
@@ -190,3 +190,22 @@ def test_custom_tensor_order(ds):
np.testing.assert_array_equal(a1[0], ds.a.numpy()[i])
np.testing.assert_array_equal(c1[0], ds.c.numpy()[i])
np.testing.assert_array_equal(d1[0], ds.d.numpy()[i])


@requires_torch
def test_readonly(local_ds):
path = local_ds.path

local_ds.create_tensor("images")
local_ds.create_tensor("labels")
local_ds.images.extend(np.ones((10, 28, 28)))
local_ds.labels.extend(np.ones(10))

del local_ds

local_ds = Dataset(path)
local_ds.mode = "r"

# no need to check input, only care that readonly works
for sample in local_ds.pytorch():
pass
2 changes: 1 addition & 1 deletion hub/tests/common.py
@@ -72,7 +72,7 @@ def get_random_array(shape: Tuple[int], dtype: str) -> np.ndarray:
a = np.random.uniform(size=shape)
return a > 0.5

raise ValueError("Dtype %s not supported." % dtype)
raise ValueError(f"Dtype '{dtype}' not supported.")


@parametrize_dtypes
8 changes: 8 additions & 0 deletions hub/util/dataset.py
@@ -0,0 +1,8 @@
from hub.util.exceptions import ReadOnlyModeError


def try_flushing(ds):
try:
ds.flush()
except ReadOnlyModeError:
pass
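
Note: try_flushing lets read-only datasets pass through code paths that flush before iteration (see the pytorch integrations above); a sketch with a hypothetical dataset path:

    ds = Dataset(path)
    ds.mode = "r"       # read-only: ds.flush() would raise ReadOnlyModeError
    try_flushing(ds)    # swallows the error instead of propagating it
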
26 changes: 10 additions & 16 deletions hub/util/exceptions.py
@@ -5,28 +5,28 @@

class TensorInvalidSampleShapeError(Exception):
def __init__(self, message: str, shape: Sequence[int]):
super().__init__("{} Incoming sample shape: {}".format(message, str(shape)))
super().__init__(f"{message} Incoming sample shape: {str(shape)}")


class TensorMetaMissingKey(Exception):
def __init__(self, key: str, meta: dict):
super().__init__("Key {} missing from tensor meta {}.".format(key, str(meta)))
super().__init__(f"Key '{key}' missing from tensor meta '{str(meta)}'.")


class TensorDoesNotExistError(KeyError):
def __init__(self, tensor_name: str):
super().__init__("Tensor {} does not exist.".format(tensor_name))
super().__init__(f"Tensor '{tensor_name}' does not exist.")


class TensorAlreadyExistsError(Exception):
def __init__(self, key: str):
super().__init__("Tensor {} already exists.".format(key))
super().__init__(f"Tensor '{key}' already exists.")


class DynamicTensorNumpyError(Exception):
def __init__(self, key: str, index, property_key: str):
super().__init__(
f"Tensor {key} with index = {str(index)} is has a dynamic '{property_key}' and cannot be converted into a `np.ndarray`. Try setting the parameter `aslist=True`"
f"Tensor '{key}' with index = {str(index)} is a dynamic '{property_key}' and cannot be converted into a `np.ndarray`. Try setting the parameter `aslist=True`"
)


@@ -37,29 +37,25 @@ def __init__(
s = message

if lower is not None:
s += " lower={}".format(str(lower))
s += f" lower={str(lower)}"

if upper is not None:
s += " upper={}".format(str(upper))
s += f" upper={str(upper)}"

super().__init__(s)


class InvalidKeyTypeError(TypeError):
def __init__(self, item: Any):
super().__init__(
"Item {} is of type {} is not a valid key".format(
str(item), type(item).__name__
)
f"Item '{str(item)}' of type '{type(item).__name__}' is not a valid key."
)


class UnsupportedTensorTypeError(TypeError):
def __init__(self, item: Any):
super().__init__(
"Key of type {} is not currently supported to convert to a tensor.".format(
type(item).__name__
)
f"Key of type '{type(item).__name__}' is not currently supported to convert to a tensor."
)


@@ -309,9 +305,7 @@ def __init__(self, htype: str, available_htypes: Sequence[str]):
class TensorMetaInvalidHtypeOverwriteValue(MetaError):
def __init__(self, key: str, value: Any, explanation: str = ""):
super().__init__(
"Invalid value '{}' for tensor meta key {}. {}".format(
str(value), key, explanation
)
f"Invalid value '{value}' for tensor meta key '{key}'. {explanation}"
)


4 changes: 2 additions & 2 deletions hub/util/shape.py
@@ -74,9 +74,9 @@ def __str__(self):
if l == u:
intervals.append(str(l))
else:
intervals.append("{}:{}".format(l, u))
intervals.append(f"{l}:{u}")

return "({})".format(", ".join(intervals))
return f"({', '.join(intervals)})"

def __repr__(self):
return str(self)
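
Note: with the f-string rewrite, fixed dimensions render as a single number and variable ones as lower:upper; tracing the join by hand:

    intervals = []
    for l, u in [(10, 10), (3, 5)]:   # one (lower, upper) pair per dimension
        intervals.append(str(l) if l == u else f"{l}:{u}")
    assert f"({', '.join(intervals)})" == "(10, 3:5)"
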
5 changes: 1 addition & 4 deletions hub/util/subscript_namedtuple.py
@@ -65,10 +65,7 @@ def __len__(self):
return len(self._dict)

def __repr__(self):
return "%s(%s)" % (
T,
", ".join(["%s=%s" % (k, v) for k, v in self.items()]),
)
return f"{T}{', '.join([f'{k}={v}' for k, v in self.items()])}"

def __eq__(self, other):
try:
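
Note: the %-style __repr__ rendered as T(field=value, ...), parentheses included, and the f-string version keeps that shape; a quick check with a hypothetical tuple name:

    T = "Sample"
    items = [("x", 1), ("y", 2)]
    assert f"{T}({', '.join(f'{k}={v}' for k, v in items)})" == "Sample(x=1, y=2)"
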
2 changes: 1 addition & 1 deletion setup.py
@@ -24,7 +24,7 @@
def get_property(prop):
result = re.search(
# find variable with name `prop` in the __init__.py file
r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop),
fr'{prop}\s*=\s*[\'"]([^\'"]*)[\'"]',
open(init_file).read(),
)
return result.group(1)
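
Note: the fr'...' prefix keeps the regex backslashes literal while still interpolating prop; a sketch against hypothetical file contents:

    import re

    text = '__version__ = "2.0.0"'
    prop = "__version__"
    match = re.search(fr'{prop}\s*=\s*[\'"]([^\'"]*)[\'"]', text)
    assert match.group(1) == "2.0.0"
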
