Merge branch 'main' of github.com:activeloopai/Hub into task/remove-chunk-sizes
verbose-void committed Jul 13, 2021
2 parents 536bc1d + b6e5601 commit cbdc896
Showing 17 changed files with 93 additions and 55 deletions.
2 changes: 1 addition & 1 deletion .circleci/config.yml
@@ -112,7 +112,7 @@ commands:
steps:
- run:
name: "Gather machine info"
command: python3 -c "import platform as p; print('{}\nPython {}'.format(p.platform(), p.sys.version))"
command: python3 -c "import platform as p; print(f'{p.platform()}\nPython {p.sys.version}')"

google-creds:
parameters:
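
Note: most edits in this merge swap %-interpolation and .format() calls for f-strings. The three forms are equivalent; a minimal sketch:

    name, version = "Hub", "2.0"
    assert ("%s v%s" % (name, version)
            == "{} v{}".format(name, version)
            == f"{name} v{version}")
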
11 changes: 5 additions & 6 deletions conftest.py
@@ -87,15 +87,14 @@ def pytest_addoption(parser):
CACHE_OPT,
action="store_true",
help="Tests using the `storage` fixture may run with combinations of all enabled providers in cache chains. "
"For example, if the option `%s` is not provided, all cache chains that use `S3Provider`"
" are skipped." % (S3_OPT),
f"For example, if the option `{S3_OPT}` is not provided, all cache chains that use `S3Provider`"
" are skipped.",
)
parser.addoption(
CACHE_ONLY_OPT,
action="store_true",
help="Force enables `%s`. `storage` fixture only returns cache chains. For example, if `%s` is provided, \
`storage` will never be just `S3Provider`."
% (CACHE_OPT, S3_OPT),
help=f"Force enables `{CACHE_OPT}`. `storage` fixture only returns cache chains. For example, if `{S3_OPT}` is provided, \
`storage` will never be just `S3Provider`.",
)
parser.addoption(
S3_PATH_OPT,
@@ -273,7 +272,7 @@ def flower_path():

def print_session_id():
print("\n\n----------------------------------------------------------")
print("Testing session ID: %s" % SESSION_ID)
print(f"Testing session ID: {SESSION_ID}")
print("----------------------------------------------------------")


2 changes: 1 addition & 1 deletion hub/api/dataset.py
@@ -172,6 +172,7 @@ def create_tensor(
if tensor_exists(name, self.storage):
raise TensorAlreadyExistsError(name)

self.meta.tensors.append(name)
create_tensor(
name,
self.storage,
@@ -183,7 +184,6 @@ def create_tensor(
tensor = Tensor(name, self.storage) # type: ignore

self.tensors[name] = tensor
self.meta.tensors.append(name)

return tensor

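
Note: appending the name to self.meta.tensors before the storage-level create_tensor call means the dataset meta lists the tensor as soon as it is written out, so an empty tensor survives a reopen (the new test_empty_dataset below relies on this). A hedged sketch of the behavior, using the API as shown in the diff:

    ds = Dataset("test")
    ds.create_tensor("x")     # name recorded in ds.meta.tensors during creation
    ds.flush()                # meta now persists "x" even though no samples exist

    ds = Dataset("test")      # reopening reads the meta back
    assert list(ds.tensors) == ["x"]
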
40 changes: 31 additions & 9 deletions hub/api/tests/test_api.py
@@ -12,6 +12,7 @@
)
from hub.client.client import HubBackendClient
from hub.client.utils import has_hub_testing_creds
from click.testing import CliRunner


# need this for 32-bit and 64-bit systems to have correct tests
@@ -448,20 +449,41 @@ def test_fails_on_wrong_tensor_syntax(memory_ds):
def test_hub_cloud_dataset():
username = "testingacc"
password = os.getenv("ACTIVELOOP_HUB_PASSWORD")
id = str(uuid.uuid1())

uri = f"hub://{username}/hub2ds2_{id}"

client = HubBackendClient()
token = client.request_auth_token(username, password)
id = str(uuid.uuid1())
ds = Dataset(f"hub://testingacc/hub2ds2_{id}", token=token)
ds.create_tensor("image")

for i in range(10):
ds.image.append(i * np.ones((100, 100)))
with Dataset(uri, token=token) as ds:
ds.create_tensor("image")
ds.create_tensor("label", htype="class_label")

token = ds.token
del ds
ds = Dataset(f"hub://testingacc/hub2ds2_{id}", token=token)
for i in range(10):
for i in range(1000):
ds.image.append(i * np.ones((100, 100)))
ds.label.append(np.uint32(i))

ds = Dataset(uri, token=token)
for i in range(1000):
np.testing.assert_array_equal(ds.image[i].numpy(), i * np.ones((100, 100)))
np.testing.assert_array_equal(ds.label[i].numpy(), np.uint32(i))

ds.delete()


@parametrize_all_dataset_storages
def test_hub_dataset_suffix_bug(ds):
# creating dataset with similar name but some suffix removed from end
ds2 = Dataset(ds.path[:-1])
ds2.delete()


def test_empty_dataset():
with CliRunner().isolated_filesystem():
ds = Dataset("test")
ds.create_tensor("x")
ds.create_tensor("y")
ds.create_tensor("z")
ds = Dataset("test")
assert list(ds.tensors) == ["x", "y", "z"]
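
Note: the rewritten cloud test opens the dataset as a context manager; assuming __exit__ flushes pending writes, the pattern is:

    with Dataset(uri, token=token) as ds:
        ds.create_tensor("image")
        ds.image.append(np.ones((100, 100)))
    # exiting the block flushes, so a fresh handle sees the data
    ds = Dataset(uri, token=token)
    assert ds.image[0].numpy().shape == (100, 100)
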
2 changes: 1 addition & 1 deletion hub/client/client.py
@@ -157,7 +157,7 @@ def get_dataset_credentials(
Returns:
tuple: containing full url to dataset, credentials, mode and expiration time respectively.
"""
relative_url = GET_DATASET_CREDENTIALS_SUFFIX % (org_id, ds_name)
relative_url = GET_DATASET_CREDENTIALS_SUFFIX.format(org_id, ds_name)
response = self.request(
"GET",
relative_url,
2 changes: 1 addition & 1 deletion hub/client/config.py
@@ -11,7 +11,7 @@

GET_TOKEN_SUFFIX = "/api/user/token"
REGISTER_USER_SUFFIX = "/api/user/register"
GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/%s/ds/%s/creds"
GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/{}/ds/{}/creds"
CREATE_DATASET_SUFFIX = "/api/dataset/create"
DATASET_SUFFIX = "/api/dataset"
UPDATE_SUFFIX = "/api/org/{}/dataset/{}"
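
Note: moving the route template from %s placeholders to {} placeholders shifts interpolation from the % operator to str.format; with hypothetical IDs:

    GET_DATASET_CREDENTIALS_SUFFIX = "/api/org/{}/ds/{}/creds"
    url = GET_DATASET_CREDENTIALS_SUFFIX.format("my_org", "my_ds")
    assert url == "/api/org/my_org/ds/my_ds/creds"
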
4 changes: 1 addition & 3 deletions hub/core/meta/tensor_meta.py
@@ -83,9 +83,7 @@ def check_compatibility(self, shape: Tuple[int], dtype):
actual_shape_len = len(shape)
if expected_shape_len != actual_shape_len:
raise TensorInvalidSampleShapeError(
"Sample shape length is expected to be {}, actual length is {}.".format(
expected_shape_len, actual_shape_len
),
f"Sample shape length is expected to be {expected_shape_len}, actual length is {actual_shape_len}.",
shape,
)

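
Note: check_compatibility compares only dimension counts at this point, so the message reports shape lengths rather than the shapes themselves. A hedged sketch of how the error fires (exception class from hub/util/exceptions.py below):

    from hub.util.exceptions import TensorInvalidSampleShapeError

    expected_shape_len = len((100, 100))   # e.g. the tensor's first sample
    incoming = (100, 100, 3)               # incoming sample has an extra axis
    if expected_shape_len != len(incoming):
        raise TensorInvalidSampleShapeError(
            f"Sample shape length is expected to be {expected_shape_len}, actual length is {len(incoming)}.",
            incoming,
        )
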
4 changes: 3 additions & 1 deletion hub/core/storage/s3.py
@@ -149,7 +149,7 @@ def _list_keys(self):
items = items["Contents"]
names = [item["Key"] for item in items]
# removing the prefix from the names
len_path = len(self.path.split("/"))
len_path = len(self.path.split("/")) - 1
names = ["/".join(name.split("/")[len_path:]) for name in names]
return names
except Exception as err:
@@ -218,6 +218,8 @@ def _set_bucket_and_path(self):
root = self.root.replace("s3://", "")
self.bucket = root.split("/")[0]
self.path = "/".join(root.split("/")[1:])
if not self.path.endswith("/"):
self.path += "/"

def _set_hub_creds_info(self, tag: str, expiration: str):
"""Sets the tag and expiration of the credentials. These are only relevant to datasets using Hub storage.
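
Note: the two s3.py changes work together. self.path is normalized to end with "/", so the prefix-length count must ignore the trailing empty segment, hence the - 1. A worked example with a hypothetical layout:

    path = "my-dataset/"                  # normalized with a trailing slash
    len_path = len(path.split("/")) - 1   # ["my-dataset", ""] -> 2 - 1 = 1
    key = "my-dataset/images/chunk0"
    assert "/".join(key.split("/")[len_path:]) == "images/chunk0"
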
11 changes: 4 additions & 7 deletions hub/integrations/pytorch.py
@@ -1,3 +1,4 @@
from hub.util.dataset import try_flushing
from hub.constants import MB
from hub.util.keys import get_chunk_key
from hub.core.storage.lru_cache import LRUCache
@@ -12,7 +13,6 @@
from hub.util.exceptions import (
DatasetUnsupportedPytorch,
ModuleNotInstalledException,
TensorDoesNotExistError,
)
from hub.util.shared_memory import (
remove_shared_memory_from_resource_tracker,
@@ -79,7 +79,7 @@ def dataset_to_pytorch(
collate_fn: Optional[Callable] = None,
pin_memory: Optional[bool] = False,
):
dataset.flush()
try_flushing(dataset)
_import_torch()
# TODO new pytorch approach doesn't support 0 workers currently
num_workers = max(num_workers, 1)
@@ -145,9 +145,6 @@ def __init__(
# keeps track of names of all shared_memory that have data in them
self.all_shared_memory_names: Dict[str, List[str]] = defaultdict(list)

# keeps pointers to shared memory across tensors so they don't get closed between calls to getitem
self.all_shared_memory: Dict = defaultdict(list)

self.last_chunk_num_generated = -1

def __len__(self):
@@ -270,8 +267,8 @@ def _get_data_from_chunks(
for chunk_name, shared_memory_name, chunk_size in zip(
chunk_names, shared_memory_names, chunk_sizes
):
self.all_shared_memory[key].append(SharedMemory(name=shared_memory_name))
chunk = Chunk.frombuffer(self.all_shared_memory[key][-1].buf[:chunk_size])
shared_memory = SharedMemory(name=shared_memory_name)
chunk = Chunk.frombuffer(shared_memory.buf[:chunk_size])
chunk_map[chunk_name] = chunk

# saves np array for each index in memory
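
Note: the worker now attaches to each shared-memory segment with a short-lived local handle instead of caching handles per tensor; this is only safe if Chunk.frombuffer copies the bytes out of the buffer, which the change implies. Attaching and copying looks roughly like:

    from multiprocessing.shared_memory import SharedMemory

    shm = SharedMemory(name="chunk_shm_0")  # attach to an existing segment (hypothetical name)
    payload = bytes(shm.buf[:1024])         # copy the bytes out while the handle is alive
    shm.close()                             # safe to close once the data has been copied
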
4 changes: 3 additions & 1 deletion hub/integrations/pytorch_old.py
@@ -1,3 +1,4 @@
from hub.util.dataset import try_flushing
from hub.core.storage.memory import MemoryProvider
from hub.util.remove_cache import get_base_storage
from typing import Callable, Union, List, Optional, Dict, Tuple, Sequence
@@ -20,6 +21,8 @@ def dataset_to_pytorch(
pin_memory: Optional[bool] = False,
python_version_warning: bool = True,
):
try_flushing(dataset)

global torch
try:
import torch
@@ -28,7 +31,6 @@
"'torch' should be installed to convert the Dataset into pytorch format"
)

dataset.flush()
pytorch_ds = TorchDataset(
dataset,
transform,
19 changes: 19 additions & 0 deletions hub/integrations/tests/test_pytorch.py
@@ -190,3 +190,22 @@ def test_custom_tensor_order(ds):
np.testing.assert_array_equal(a1[0], ds.a.numpy()[i])
np.testing.assert_array_equal(c1[0], ds.c.numpy()[i])
np.testing.assert_array_equal(d1[0], ds.d.numpy()[i])


@requires_torch
def test_readonly(local_ds):
path = local_ds.path

local_ds.create_tensor("images")
local_ds.create_tensor("labels")
local_ds.images.extend(np.ones((10, 28, 28)))
local_ds.labels.extend(np.ones(10))

del local_ds

local_ds = Dataset(path)
local_ds.mode = "r"

# no need to check input, only care that readonly works
for sample in local_ds.pytorch():
pass
2 changes: 1 addition & 1 deletion hub/tests/common.py
@@ -72,7 +72,7 @@ def get_random_array(shape: Tuple[int], dtype: str) -> np.ndarray:
a = np.random.uniform(size=shape)
return a > 0.5

raise ValueError("Dtype %s not supported." % dtype)
raise ValueError(f"Dtype '{dtype}' not supported.")


@parametrize_dtypes
8 changes: 8 additions & 0 deletions hub/util/dataset.py
@@ -0,0 +1,8 @@
from hub.util.exceptions import ReadOnlyModeError


def try_flushing(ds):
try:
ds.flush()
except ReadOnlyModeError:
pass
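
Note: try_flushing lets read-only datasets pass through code paths that flush before iteration (see the pytorch integrations above); a sketch with a hypothetical dataset path:

    ds = Dataset(path)
    ds.mode = "r"       # read-only: ds.flush() would raise ReadOnlyModeError
    try_flushing(ds)    # swallows the error instead of propagating it
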
26 changes: 10 additions & 16 deletions hub/util/exceptions.py
@@ -5,28 +5,28 @@

class TensorInvalidSampleShapeError(Exception):
def __init__(self, message: str, shape: Sequence[int]):
super().__init__("{} Incoming sample shape: {}".format(message, str(shape)))
super().__init__(f"{message} Incoming sample shape: {str(shape)}")


class TensorMetaMissingKey(Exception):
def __init__(self, key: str, meta: dict):
super().__init__("Key {} missing from tensor meta {}.".format(key, str(meta)))
super().__init__(f"Key '{key}' missing from tensor meta '{str(meta)}'.")


class TensorDoesNotExistError(KeyError):
def __init__(self, tensor_name: str):
super().__init__("Tensor {} does not exist.".format(tensor_name))
super().__init__(f"Tensor '{tensor_name}' does not exist.")


class TensorAlreadyExistsError(Exception):
def __init__(self, key: str):
super().__init__("Tensor {} already exists.".format(key))
super().__init__(f"Tensor '{key}' already exists.")


class DynamicTensorNumpyError(Exception):
def __init__(self, key: str, index, property_key: str):
super().__init__(
f"Tensor {key} with index = {str(index)} is has a dynamic '{property_key}' and cannot be converted into a `np.ndarray`. Try setting the parameter `aslist=True`"
f"Tensor '{key}' with index = {str(index)} is a dynamic '{property_key}' and cannot be converted into a `np.ndarray`. Try setting the parameter `aslist=True`"
)


@@ -37,29 +37,25 @@ def __init__(
s = message

if lower is not None:
s += " lower={}".format(str(lower))
s += f" lower={str(lower)}"

if upper is not None:
s += " upper={}".format(str(upper))
s += f" upper={str(upper)}"

super().__init__(s)


class InvalidKeyTypeError(TypeError):
def __init__(self, item: Any):
super().__init__(
"Item {} is of type {} is not a valid key".format(
str(item), type(item).__name__
)
f"Item '{str(item)}' of type '{type(item).__name__}' is not a valid key."
)


class UnsupportedTensorTypeError(TypeError):
def __init__(self, item: Any):
super().__init__(
"Key of type {} is not currently supported to convert to a tensor.".format(
type(item).__name__
)
f"Key of type '{type(item).__name__}' is not currently supported to convert to a tensor."
)


@@ -309,9 +305,7 @@ def __init__(self, htype: str, available_htypes: Sequence[str]):
class TensorMetaInvalidHtypeOverwriteValue(MetaError):
def __init__(self, key: str, value: Any, explanation: str = ""):
super().__init__(
"Invalid value '{}' for tensor meta key {}. {}".format(
str(value), key, explanation
)
f"Invalid value '{value}' for tensor meta key '{key}'. {explanation}"
)


4 changes: 2 additions & 2 deletions hub/util/shape.py
@@ -74,9 +74,9 @@ def __str__(self):
if l == u:
intervals.append(str(l))
else:
intervals.append("{}:{}".format(l, u))
intervals.append(f"{l}:{u}")

return "({})".format(", ".join(intervals))
return f"({', '.join(intervals)})"

def __repr__(self):
return str(self)
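
Note: with the f-string rewrite, fixed dimensions render as a single number and variable ones as lower:upper; tracing the join by hand:

    intervals = []
    for l, u in [(10, 10), (3, 5)]:   # one (lower, upper) pair per dimension
        intervals.append(str(l) if l == u else f"{l}:{u}")
    assert f"({', '.join(intervals)})" == "(10, 3:5)"
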
5 changes: 1 addition & 4 deletions hub/util/subscript_namedtuple.py
@@ -65,10 +65,7 @@ def __len__(self):
return len(self._dict)

def __repr__(self):
return "%s(%s)" % (
T,
", ".join(["%s=%s" % (k, v) for k, v in self.items()]),
)
return f"{T}{', '.join([f'{k}={v}' for k, v in self.items()])}"

def __eq__(self, other):
try:
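
Note: the %-style __repr__ rendered as T(field=value, ...), parentheses included, and the f-string version keeps that shape; a quick check with a hypothetical tuple name:

    T = "Sample"
    items = [("x", 1), ("y", 2)]
    assert f"{T}({', '.join(f'{k}={v}' for k, v in items)})" == "Sample(x=1, y=2)"
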
2 changes: 1 addition & 1 deletion setup.py
@@ -24,7 +24,7 @@
def get_property(prop):
result = re.search(
# find variable with name `prop` in the __init__.py file
r'{}\s*=\s*[\'"]([^\'"]*)[\'"]'.format(prop),
fr'{prop}\s*=\s*[\'"]([^\'"]*)[\'"]',
open(init_file).read(),
)
return result.group(1)
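
Note: the fr'...' prefix keeps the regex backslashes literal while still interpolating prop; a sketch against hypothetical file contents:

    import re

    text = '__version__ = "2.0.0"'
    prop = "__version__"
    match = re.search(fr'{prop}\s*=\s*[\'"]([^\'"]*)[\'"]', text)
    assert match.group(1) == "2.0.0"
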
