Merge branch 'master' into release/v0.12.1
davidbuniat committed Sep 3, 2020
2 parents 21603d3 + 26fe092 commit 9ff2634
Showing 9 changed files with 384 additions and 11 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -7,11 +7,13 @@
<a href="http://docs.activeloop.ai/">
<img alt="Docs" src="https://readthedocs.org/projects/hubdb/badge/?version=latest">
</a>
<a href="https://pypi.org/project/hub/"><img src="https://badge.fury.io/py/hub.svg" alt="PyPI version" height="18"></a>
<a href="https://pypi.org/project/hub/"><img src="https://img.shields.io/pypi/dm/hub.svg" alt="PyPI version" height="18"></a>
<a href="https://twitter.com/intent/tweet?text=The%20fastest%20way%20to%20access%20and%20manage%20PyTorch%20and%20Tensorflow%20datasets%20is%20open-source&url=https://activeloop.ai/&via=activeloopai&hashtags=opensource,pytorch,tensorflow,data,datascience,datapipelines,sqlforimages,activeloop">
<img alt="tweet" src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social">
</a>
</p>




<h3 align="center">
The fastest way to access and manage datasets for PyTorch and TensorFlow
</h3>
4 changes: 2 additions & 2 deletions docs/source/dataset.md
@@ -21,9 +21,9 @@ ds.store("username/dataset") # Upload

## Notes

-For small datasets that fit in RAM, you can upload directly by converting a numpy array into a hub tensor. For complete examples, see [Uploading MNIST](https://github.com/activeloopai/Hub/blob/master/examples/mnist.py) and [Uploading CIFAR](https://github.com/activeloopai/Hub/blob/master/examples/cifar100.py).
+For small datasets that fit in RAM, you can upload directly by converting a numpy array into a hub tensor. For complete examples, see [Uploading MNIST](https://github.com/activeloopai/Hub/blob/master/examples/mnist/upload.py) and [Uploading CIFAR](https://github.com/activeloopai/Hub/blob/master/examples/cifar/upload_cifar10.py).
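As an illustration, a minimal sketch of that flow, mirroring the `examples/fashion-mnist/upload.py` script added in this commit (the arrays and the dataset name are placeholders):

```python
import numpy as np

from hub.collections import dataset, tensor

# placeholder arrays standing in for a small dataset that fits in RAM
images = np.zeros((100, 28, 28), dtype="uint8")
labels = np.zeros((100,), dtype="int8")

# wrap each array as a hub tensor, bundle them into a dataset, and upload
images_t = tensor.from_array(images)
labels_t = tensor.from_array(labels)
ds = dataset.from_tensors({"data": images_t, "labels": labels_t})
ds.store("username/toy-dataset")  # placeholder dataset name
```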

-For larger datasets you need to define a dataset generator and apply the transformation iteratively; see the [Uploading COCO](https://github.com/activeloopai/Hub/blob/master/examples/coco2017.py) example.
+For larger datasets you need to define a dataset generator and apply the transformation iteratively; see the [Uploading COCO](https://github.com/activeloopai/Hub/blob/master/examples/coco/upload_coco2017.py) example.
Pay careful attention to the `meta(...)` function, where you describe the properties of each tensor; provide the full meta description, including shape, dtype, dtag, chunk_shape, etc. A hedged sketch of this pattern follows.
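A sketch of the generator pattern, assuming `DatasetGenerator` is importable from `hub.collections.dataset` (its name appears in the `generate(...)` signature changed in this commit); the exact meta keys, all shapes, and the `load_image`/`label_for` helpers are illustrative, not the library's confirmed API:

```python
from hub.collections import dataset
from hub.collections.dataset import DatasetGenerator  # assumed import path


class ImageGenerator(DatasetGenerator):
    def meta(self):
        # one property dict per tensor; keys shown here are illustrative
        return {
            "image": {
                "shape": (1, 256, 256, 3),
                "dtype": "uint8",
                "dtag": "image",
                "chunk_shape": (16, 256, 256, 3),
            },
            "label": {"shape": (1,), "dtype": "int8"},
        }

    def __call__(self, path):
        # called once per input item; returns a dict matching meta()
        image = load_image(path)  # hypothetical helper
        label = label_for(path)   # hypothetical helper
        return {"image": image, "label": label}


ds = dataset.generate(ImageGenerator(), list_of_image_paths)
ds.store("username/my-dataset")
```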

## Dtag
94 changes: 94 additions & 0 deletions examples/fashion-mnist/train_pytorch.py
@@ -0,0 +1,94 @@
from hub import dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class CNN(nn.Module):
def __init__(self):
super(CNN, self).__init__()
self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
self.conv2_drop = nn.Dropout2d()
self.fc1 = nn.Linear(320, 50)
self.fc2 = nn.Linear(50, 10)

def forward(self, x):
x = F.relu(F.max_pool2d(self.conv1(x.float()), 2))
x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
x = x.view(-1, 320)
x = F.relu(self.fc1(x))
x = F.dropout(x, training=self.training)
x = self.fc2(x)
return F.log_softmax(x, dim=1)


def train(model, train_loader, optimizer):
model.train()
for batch_idx, batch in enumerate(train_loader):
data = batch["data"]
data = torch.unsqueeze(data, 1)
labels = batch["labels"]
labels = labels.type(torch.LongTensor)
optimizer.zero_grad()
output = model(data)
loss = F.nll_loss(output, labels)
loss.backward()
optimizer.step()


def test(model, test_loader):
model.eval()
print("Evaluating on Test Set")
test_loss = correct = 0
with torch.no_grad():
for batch in test_loader:
data = batch["data"]
data = torch.unsqueeze(data, 1)
labels = batch["labels"]
labels = labels.type(torch.LongTensor)
output = model(data)
test_loss += F.nll_loss(output, labels, reduction='sum').item()
pred = output.data.max(1, keepdim=True)[1]
correct += pred.eq(labels.data.view_as(pred)).sum()

test_loss /= len(test_loader.dataset)
print('Test set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
test_loss, correct, len(test_loader.dataset), 100. * correct / len(test_loader.dataset)))


def main():
EPOCHS = 3
BATCH_SIZE = 64
LEARNING_RATE = 0.01
MOMENTUM = 0.5
torch.backends.cudnn.enabled = False
random_seed = 2
torch.manual_seed(random_seed)

# Load data
ds = dataset.load("abhinavtuli/fashion-mnist")

# Transform into pytorch
ds = ds.to_pytorch()

# Splitting back into the original train and test sets, instead of random split
train_dataset = torch.utils.data.Subset(ds, range(60000))
test_dataset = torch.utils.data.Subset(ds, range(60000, 70000))

    # hub's collate_fn batches the dict-style samples returned by to_pytorch()
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, collate_fn=ds.collate_fn)
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=BATCH_SIZE, collate_fn=ds.collate_fn)

model = CNN()
optimizer = optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

for epoch in range(EPOCHS):
print("Starting Training Epoch {}".format(epoch))
train(model, train_loader, optimizer)
print("Training Epoch {} finished\n".format(epoch))
test(model, test_loader)


if __name__ == "__main__":
main()
54 changes: 54 additions & 0 deletions examples/fashion-mnist/train_tf_fit.py
@@ -0,0 +1,54 @@
from hub import dataset
import tensorflow as tf


def create_CNN():
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
return model


def to_model_fit(item):
data = item["data"]
data = tf.expand_dims(data, axis=2)
labels = item["labels"]
return (data, labels)


def main():
BATCH_SIZE = 64
EPOCHS = 3

# Load data
ds = dataset.load("abhinavtuli/fashion-mnist")

# transform into Tensorflow dataset
ds = ds.to_tensorflow()

    # map items to (data, labels) tuples so the dataset can be passed directly to model.fit
    ds = ds.map(to_model_fit)

# Splitting back into the original train and test sets
train_dataset = ds.take(60000)
test_dataset = ds.skip(60000)

train_dataset = train_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

model = create_CNN()
# model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_dataset, epochs=EPOCHS, validation_data=test_dataset, validation_steps=1)


if __name__ == "__main__":
main()
82 changes: 82 additions & 0 deletions examples/fashion-mnist/train_tf_gradient_tape.py
@@ -0,0 +1,82 @@
from hub import dataset
import tensorflow as tf
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam


def create_CNN():
model = tf.keras.Sequential()
model.add(tf.keras.layers.Conv2D(filters=64, kernel_size=2, padding='same', activation='relu', input_shape=(28, 28, 1)))
model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Conv2D(filters=32, kernel_size=2, padding='same', activation='relu'))
model.add(tf.keras.layers.MaxPooling2D(pool_size=2))
model.add(tf.keras.layers.Dropout(0.3))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(256, activation='relu'))
model.add(tf.keras.layers.Dropout(0.5))
model.add(tf.keras.layers.Dense(10, activation='softmax'))
return model


def train(model, train_dataset, optimizer, loss_fn, train_acc_metric):
for batch in train_dataset:
with tf.GradientTape() as tape:
            pred = model(tf.expand_dims(batch["data"], axis=3))  # (B, 28, 28) -> (B, 28, 28, 1)
loss = loss_fn(batch["labels"], pred)

# calculate gradients and update the model weights
grads = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(grads, model.trainable_variables))
train_acc_metric.update_state(batch["labels"], pred)

train_acc = train_acc_metric.result()
print("Training acc: %.4f" % (float(train_acc),))
train_acc_metric.reset_states()


def test(model, test_dataset, test_acc_metric):
print("Evaluating on Test Set")
for batch in test_dataset:
pred = model(tf.expand_dims(batch["data"], axis=3), training=False)
test_acc_metric.update_state(batch["labels"], pred)

test_acc = test_acc_metric.result()
print("Test acc: %.4f" % (float(test_acc),))
test_acc_metric.reset_states()


def main():
BATCH_SIZE = 64
EPOCHS = 3

optimizer = Adam()
train_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
test_acc_metric = tf.keras.metrics.SparseCategoricalAccuracy()
loss_fn = SparseCategoricalCrossentropy()

# Load data
ds = dataset.load("abhinavtuli/fashion-mnist")

# transform into Tensorflow dataset
ds = ds.to_tensorflow()

# Splitting back into the original train and test sets
train_dataset = ds.take(60000)
test_dataset = ds.skip(60000)

train_dataset = train_dataset.batch(BATCH_SIZE)
test_dataset = test_dataset.batch(BATCH_SIZE)

model = create_CNN()
# model.summary()

for epoch in range(EPOCHS):
print("\nStarting Training Epoch {}".format(epoch))
train(model, train_dataset, optimizer, loss_fn, train_acc_metric)
print("Training Epoch {} finished\n".format(epoch))
test(model, test_dataset, test_acc_metric)


if __name__ == "__main__":
main()
75 changes: 75 additions & 0 deletions examples/fashion-mnist/upload.py
@@ -0,0 +1,75 @@
import argparse
import os
import struct

import numpy as np
from array import array as pyarray

from hub.collections import dataset, tensor


def load_fashion_mnist(dataset="training", digits=np.arange(10), path=".", size=60000):
if dataset == "training":
fname_img = os.path.join(path, "train-images-idx3-ubyte")
fname_lbl = os.path.join(path, "train-labels-idx1-ubyte")
elif dataset == "testing":
fname_img = os.path.join(path, "t10k-images-idx3-ubyte")
fname_lbl = os.path.join(path, "t10k-labels-idx1-ubyte")

else:
raise ValueError("dataset must be 'testing' or 'training'")

    # IDX label file: big-endian magic number and item count, then raw label bytes
    flbl = open(fname_lbl, "rb")
    magic_nr, size = struct.unpack(">II", flbl.read(8))
    lbl = pyarray("b", flbl.read())
    flbl.close()

    # IDX image file: magic number, image count, row and column dims, then pixels
    fimg = open(fname_img, "rb")
    magic_nr, size, rows, cols = struct.unpack(">IIII", fimg.read(16))
    img = pyarray("B", fimg.read())
    fimg.close()

ind = [k for k in range(size) if lbl[k] in digits]
N = size # int(len(ind) * size/100.)
images = np.zeros((N, rows, cols), dtype=np.uint8)
labels = np.zeros((N, 1), dtype=np.int8)
for i in range(N): # int(len(ind) * size/100.)):
images[i] = np.array(
img[ind[i] * rows * cols : (ind[i] + 1) * rows * cols]
).reshape((rows, cols))
labels[i] = lbl[ind[i]]
labels = [label[0] for label in labels]
return images, labels


def main():
parser = argparse.ArgumentParser()
parser.add_argument(
"-d",
"--dataset_path",
type=str,
help="Path to fashion-mnist dataset",
default="./data/fashion-mnist",
)
parser.add_argument(
"-o", "--output_name", type=str, help="Dataset output name", default="fashion-mnist",
)
args = parser.parse_args()
files = ["training", "testing"]
dicts = []
for f in files:
images, labels = load_fashion_mnist(f, path=args.dataset_path)
dicts += [{"images": images, "labels": labels}]
images = np.concatenate([d["images"] for d in dicts])
labels = np.concatenate([np.array(d["labels"], dtype="int8") for d in dicts])
print(images.shape, labels.shape)

images_t = tensor.from_array(images, dtag="mask")
labels_t = tensor.from_array(labels)

ds = dataset.from_tensors({"data": images_t, "labels": labels_t})
ds.store(f"{args.output_name}")


if __name__ == "__main__":
main()
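Assuming the raw Fashion-MNIST IDX files have been downloaded into the dataset path, this uploader would be invoked with the argparse defaults shown above, e.g. `python examples/fashion-mnist/upload.py --dataset_path ./data/fashion-mnist --output_name fashion-mnist`.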
18 changes: 16 additions & 2 deletions hub/collections/dataset/__init__.py
@@ -73,10 +73,24 @@ def generate(generator: DatasetGenerator, input) -> Dataset:
)


-def from_tensors(tensors: dict) -> Dataset:
+def from_tensors(
+    tensors: dict,
+    license: str = None,
+    description: str = None,
+    citation: str = None,
+    howtoload: str = None,
+) -> Dataset:
     """ Creates a dataset from a dict of tensors
     """
-    return Dataset(tensors)
+    return Dataset(
+        tensors,
+        metainfo={
+            "license": license,
+            "description": description,
+            "citation": citation,
+            "howtoload": howtoload,
+        },
+    )


def _meta_concat(metas: Tuple[Dict[str, object]]):
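A minimal usage sketch of the extended `from_tensors` signature above (all metainfo values below are placeholders):

```python
import numpy as np

from hub.collections import dataset, tensor

ds = dataset.from_tensors(
    {
        "data": tensor.from_array(np.zeros((10, 28, 28), dtype="uint8")),
        "labels": tensor.from_array(np.zeros((10,), dtype="int8")),
    },
    license="CC BY 4.0",                            # placeholder
    description="toy Fashion-MNIST-style dataset",  # placeholder
    citation="...",                                 # placeholder
    howtoload="dataset.load('username/toy')",       # placeholder
)
ds.store("username/toy")  # placeholder name
```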