Skip to content

Commit

Permalink
merge
Browse files Browse the repository at this point in the history
  • Loading branch information
davidbuniat committed Nov 16, 2020
2 parents ac4547e + 1fdaa54 commit af43b91
Show file tree
Hide file tree
Showing 10 changed files with 157 additions and 54 deletions.
41 changes: 41 additions & 0 deletions examples/eurosat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import hub
import torch

if __name__ == "__main__":
    ds = hub.Dataset("eurosat/eurosat-rgb")

    # The dataset holds 26000 samples; .numpy() materialises the selected values.
    print(ds["image"][10].numpy())
    # Key and sample index can also be given together in a single subscript.
    print(ds["label", 15].numpy())
    # A slice in the sample position fetches several elements at once.
    print(ds["filename", 20:22].numpy())

    # First 13000 samples form the train split, the remainder the test split.
    train_ds, test_ds = ds[:13000], ds[13000:]

    # Using hub with tensorflow: print only the first batch of each split.
    for split in (train_ds, test_ds):
        tf_batches = split.to_tensorflow().batch(2)
        for batch in tf_batches:
            print(batch["label"], batch["filename"], batch["image"])
            break

    # Using hub with pytorch: print only the first batch of each split.
    # Text fields such as "filename" are left out because pytorch tensors
    # don't support text labels.
    for split in (train_ds, test_ds):
        loader = torch.utils.data.DataLoader(split.to_pytorch(), batch_size=2)
        for batch in loader:
            print(batch["label"], batch["image"])
            break
66 changes: 39 additions & 27 deletions hub/api/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,8 @@
ShapeArgumentNotFoundException,
SchemaArgumentNotFoundException,
ModuleNotInstalledException,
WrongUsernameException,
NoneValueException,
ShapeLengthException
)
from hub.store.metastore import MetaStorage
from hub.client.hub_control import HubControlClient
Expand Down Expand Up @@ -76,7 +77,6 @@ def __init__(
lock_cache=True,
):
"""Open a new or existing dataset for read/write
Parameters
----------
url: str
Expand Down Expand Up @@ -106,8 +106,10 @@ def __init__(
if isinstance(shape, int):
shape = [shape]
if shape is not None:
assert len(tuple(shape)) == 1
assert mode is not None
if len(tuple(shape)) != 1:
raise ShapeLengthException
if mode is None:
raise NoneValueException('mode')

self.url = url
self.token = token
Expand Down Expand Up @@ -177,12 +179,6 @@ def _check_and_prepare_dir(self):
Returns True if the dataset needs to be created as opposed to read.
"""
fs, path, mode = self._fs, self._path, self.mode
if path.startswith("s3://"):
with open(os.path.expanduser("~/.activeloop/store"), "rb") as f:
stored_username = json.load(f)["_id"]
current_username = path.split("/")[-2]
if stored_username != current_username:
raise WrongUsernameException(current_username)
exist_meta = fs.exists(posixpath.join(path, "meta.json"))
if exist_meta:
if "w" in mode:
Expand Down Expand Up @@ -258,12 +254,9 @@ def _open_storage_tensors(self):
def __getitem__(self, slice_):
"""| Gets a slice or slices from dataset
| Usage:
>>> return ds["image", 5, 0:1920, 0:1080, 0:3].numpy() # returns numpy array
>>> images = ds["image"]
>>> return images[5].numpy() # returns numpy array
>>> images = ds["image"]
>>> image = images[5]
>>> return image[0:1920, 0:1080, 0:3].numpy()
Expand Down Expand Up @@ -296,9 +289,7 @@ def __getitem__(self, slice_):
def __setitem__(self, slice_, value):
"""| Sets a slice or slices with a value
| Usage
>>> ds["image", 5, 0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
>>> images = ds["image"]
>>> image = images[5]
>>> image[0:1920, 0:1080, 0:3] = np.zeros((1920, 1080, 3), "uint8")
Expand Down Expand Up @@ -327,19 +318,35 @@ def delete(self):
return True
return False

def to_pytorch(self, Transform=None):
"""Converts the dataset into a pytorch compatible format"""
def to_pytorch(self, Transform=None, offset=None, num_samples=None):
"""Converts the dataset into a pytorch compatible format
Parameters
----------
offset: int, optional
The offset from which dataset needs to be converted
num_samples: int, optional
The number of samples required of the dataset that needs to be converted
"""
if "torch" not in sys.modules:
raise ModuleNotInstalledException("torch")
return TorchDataset(self, Transform)
return TorchDataset(self, Transform, offset=offset, num_samples=num_samples)

def to_tensorflow(self):
"""Converts the dataset into a tensorflow compatible format"""
def to_tensorflow(self, offset=None, num_samples=None):
"""Converts the dataset into a tensorflow compatible format
Parameters
----------
offset: int, optional
The offset from which dataset needs to be converted
num_samples: int, optional
The number of samples required of the dataset that needs to be converted
"""
if "tensorflow" not in sys.modules:
raise ModuleNotInstalledException("tensorflow")
offset = 0 if offset is None else offset
num_samples = self.shape[0] if num_samples is None else num_samples

def tf_gen():
for index in range(self.shape[0]):
for index in range(offset, offset + num_samples):
d = {}
for key in self._tensors.keys():
split_key = key.split("/")
Expand Down Expand Up @@ -368,6 +375,8 @@ def dtype_to_tf(my_dtype):
elif isinstance(my_dtype, Tensor):
return tensor_to_tf(my_dtype)
elif isinstance(my_dtype, Primitive):
if str(my_dtype._dtype) == "object":
return "string"
return str(my_dtype._dtype)

def get_output_shapes(my_dtype):
Expand Down Expand Up @@ -466,7 +475,6 @@ def from_tensorflow(ds):
ds = ds.to_tensorflow()
out_ds = hub.Dataset.from_tensorflow(ds)
res_ds = out_ds.store("username/new_dataset") # res_ds is now a usable hub dataset
"""
if "tensorflow" not in sys.modules:
raise ModuleNotInstalledException("tensorflow")
Expand All @@ -483,8 +491,9 @@ def tf_to_hub(tf_dt):
return TensorSpec_to_hub(tf_dt)

def TensorSpec_to_hub(tf_dt):
dt = tf_dt.dtype.name if tf_dt.dtype.name != "string" else "object"
shape = tf_dt.shape if tf_dt.shape.rank is not None else (None,)
return Tensor(shape=shape, dtype=tf_dt.dtype.name)
return Tensor(shape=shape, dtype=dt)

def dict_to_hub(tf_dt):
d = {
Expand Down Expand Up @@ -623,11 +632,13 @@ def my_transform(sample):


class TorchDataset:
def __init__(self, ds, transform=None):
def __init__(self, ds, transform=None, num_samples=None, offset=None):
self._ds = None
self._url = ds.url
self._token = ds.token
self._transform = transform
self.num_samples = num_samples
self.offset = offset

def _do_transform(self, data):
return self._transform(data) if self._transform else data
Expand All @@ -641,9 +652,10 @@ def _init_ds(self):

def __len__(self):
self._init_ds()
return self._ds.shape[0]
return self.num_samples if self.num_samples is not None else self._ds.shape[0]

def __getitem__(self, index):
index = index + self.offset if self.offset is not None else index
self._init_ds()
d = {}
for key in self._ds._tensors.keys():
Expand All @@ -655,8 +667,8 @@ def __getitem__(self, index):
else:
cur[split_key[i]] = {}
cur = cur[split_key[i]]

cur[split_key[-1]] = torch.tensor(self._ds._tensors[key][index])
if not isinstance(self._ds._tensors[key][index], bytes):
cur[split_key[-1]] = torch.tensor(self._ds._tensors[key][index])
return d

def __iter__(self):
Expand Down
19 changes: 16 additions & 3 deletions hub/api/datasetview.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from hub.api.tensorview import TensorView
from hub.api.dataset_utils import slice_extract_info, slice_split
from hub.exceptions import NoneValueException
import collections.abc as abc


Expand All @@ -21,9 +22,12 @@ def __init__(
offset: int
The offset from which the DatasetView starts
"""
assert dataset is not None
assert num_samples is not None
assert offset is not None
if dataset is None:
raise NoneValueException('dataset')
if num_samples is None:
raise NoneValueException('num_samples')
if offset is None:
raise NoneValueException('offset')

self.dataset = dataset
self.num_samples = num_samples
Expand Down Expand Up @@ -132,3 +136,12 @@ def __iter__(self):

def __len__(self):
return self.num_samples

def to_tensorflow(self):
    """Convert the samples covered by this view into a tensorflow compatible format.

    Delegates to the underlying dataset's ``to_tensorflow``, passing this
    view's ``offset`` and ``num_samples`` so only the viewed range is converted.
    """
    window = {"offset": self.offset, "num_samples": self.num_samples}
    return self.dataset.to_tensorflow(**window)

def to_pytorch(self, Transform=None):
    """Convert the samples covered by this view into a pytorch compatible format.

    Delegates to the underlying dataset's ``to_pytorch``, forwarding
    ``Transform`` unchanged together with this view's ``offset`` and
    ``num_samples`` so only the viewed range is converted.
    """
    source = self.dataset
    return source.to_pytorch(
        Transform=Transform,
        offset=self.offset,
        num_samples=self.num_samples,
    )

7 changes: 5 additions & 2 deletions hub/api/tensorview.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import collections.abc as abc
from hub.api.dataset_utils import slice_split
from hub.exceptions import NoneValueException


class TensorView:
Expand All @@ -21,8 +22,10 @@ def __init__(
The `slice_` of this Tensor that needs to be accessed
"""

assert dataset is not None
assert subpath is not None
if dataset is None:
raise NoneValueException('dataset')
if subpath is None:
raise NoneValueException('subpath')

self.dataset = dataset
self.subpath = subpath
Expand Down
5 changes: 5 additions & 0 deletions hub/api/tests/test_converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,18 +50,23 @@ def test_to_from_tensorflow():
"d": {"e": Tensor((5, 3), "uint8")},
"f": "float"
},
"named_label": "object"
}

ds = hub.Dataset(schema=my_schema, shape=(10,), url="./data/test_from_tf/ds3", mode="w")
for i in range(10):
ds["label", "d", "e", i] = i * np.ones((5, 3))
ds["named_label", i] = 'try' + str(i)
ds = ds.to_tensorflow()
out_ds = hub.Dataset.from_tensorflow(ds)
res_ds = out_ds.store("./data/test_from_tf/ds4", length=10) # generator has no length, argument needed
for i in range(10):
assert (
res_ds["label", "d", "e", i].numpy() == i * np.ones((5, 3))
).all()
assert (
res_ds["named_label", i].numpy().decode('utf-8') == 'try' + str(i)
)


@pytest.mark.skipif(not pytorch_loaded(), reason="requires pytorch to be loaded")
Expand Down
13 changes: 7 additions & 6 deletions hub/compute/tests/test_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,8 @@ def my_transform(sample, multiplier: int = 2):
}

out_ds = my_transform(ds, multiplier=2)
assert (out_ds["image", 0:2].numpy() == 2).all()
# assert len(list(out_ds)) == 100
assert (out_ds["image", 0].numpy() == 2).all()
assert len(list(out_ds)) == 100
res_ds = out_ds.store("./data/test/test_pipeline_basic_output")

assert res_ds["label", 5].numpy() == "hello 5"
Expand Down Expand Up @@ -133,12 +133,13 @@ def my_transform(sample):

with Timer(name):
out_ds = my_transform(ds_fs)
res_ds = out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")
out_ds.store(f"./data/test/test_pipeline_basic_output_{name}")

if __name__ == "__main__":
test_pipeline_basic()
test_pipeline_dynamic()
# test_pipeline_basic()


# test_pathos()
# benchmark()
# benchmark()


7 changes: 5 additions & 2 deletions hub/compute/transform.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from hub.api.dataset import Dataset
import os
from typing import Dict

import hub
from hub.api.dataset import Dataset
from tqdm import tqdm
from collections.abc import MutableMapping
from hub.features.features import Primitive
Expand Down Expand Up @@ -203,7 +205,8 @@ def __getitem__(self, slice_):
num, ofs = slice_extract_info(slice_list[0], self.shape[0])
ds_view = DatasetView(dataset=self._ds, num_samples=num, offset=ofs)

new_ds = self.store("~/.activeloop/tmp/array", length=num, ds=ds_view, progressbar=True)
path = os.path.expanduser("~/.activeloop/tmp")
new_ds = self.store(path, length=num, ds=ds_view, progressbar=True)
slice_[1] = slice(None, None, None) # Get all shape dimension since we already sliced
return new_ds[slice_]

Expand Down
30 changes: 25 additions & 5 deletions hub/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,15 @@ def __init__(self, correct_shape, wrong_shape):
message = f"parameter 'value': expected array with shape {correct_shape}, got {wrong_shape}"
super(HubException, self).__init__(message=message)

class NoneValueException(HubException):
    """Raised when a required parameter was passed as ``None``."""

    def __init__(self, param):
        # `param` is the name of the missing parameter; it is quoted in the message.
        # The super call starts the MRO lookup after HubException, the same
        # pattern the neighbouring exceptions in this module use.
        super(HubException, self).__init__(
            message=f"Parameter '{param}' should be provided"
        )

class ShapeLengthException(HubException):
    """Raised when the 'shape' parameter is not a tuple of length 1."""

    def __init__(self):
        # The super call starts the MRO lookup after HubException, the same
        # pattern the neighbouring exceptions in this module use.
        super(HubException, self).__init__(
            message="Parameter 'shape' should be a tuple of length 1"
        )

class ModuleNotInstalledException(HubException):
def __init__(self, module_name):
Expand Down Expand Up @@ -196,6 +205,22 @@ def __init__(self):
"The provided directory is not empty and doesn't contain information about any Hub Dataset "
super(HubException, self).__init__(message=message)

class DynamicTensorNotFoundException(Exception):
    """Raised when a dynamic tensor cannot be found."""

    def __init__(self):
        message = "Unable to find dynamic tensor"
        # This class derives from Exception, not HubException, so the
        # two-argument super(HubException, self) form raises TypeError, and
        # Exception.__init__ accepts no keyword arguments. Use the plain
        # zero-argument super with a positional message instead.
        # NOTE(review): sibling exceptions derive from HubException; confirm
        # whether this one was meant to as well.
        super().__init__(message)

class DynamicTensorShapeException(Exception):
    """Raised when 'shape' and 'max_shape' of a dynamic tensor are inconsistent.

    Parameters
    ----------
    exc_type: str
        Selects the message: 'none' ('max_shape' contains None), 'length'
        (lengths differ) or 'not_equal' (non-None values of 'shape' differ
        from 'max_shape'). Any other value yields a generic message.
    """

    def __init__(self, exc_type):
        if exc_type == 'none':
            message = "Parameter 'max_shape' shouldn't contain any 'None' value"
        elif exc_type == 'length':
            message = "Lengths of 'shape' and 'max_shape' should be equal"
        elif exc_type == 'not_equal':
            message = "All not-None values from 'shape' should be equal to the corresponding values in 'max_shape'"
        else:
            message = "Wrong 'shape' or 'max_shape' values"
        # This class derives from Exception, not HubException, so the
        # two-argument super(HubException, self) form raises TypeError, and
        # Exception.__init__ accepts no keyword arguments. Use the plain
        # zero-argument super with a positional message instead.
        # NOTE(review): sibling exceptions derive from HubException; confirm
        # whether this one was meant to as well.
        super().__init__(message)

class NotIterable(HubException):
def __init__(self):
Expand All @@ -212,10 +237,5 @@ def __init__(self):
class NotZarrFolderException(Exception):
pass


class StorageTensorNotFoundException(Exception):
pass


class DynamicTensorNotFoundException(Exception):
pass
Loading

0 comments on commit af43b91

Please sign in to comment.