Skip to content

Commit

Permalink
Merge pull request activeloopai#654 from thisiseshan/update_schema
Browse files Browse the repository at this point in the history
Update schema docs
  • Loading branch information
mynameisvinn authored Mar 23, 2021
2 parents 76b602d + 175e673 commit 3e3a363
Show file tree
Hide file tree
Showing 6 changed files with 298 additions and 28 deletions.
6 changes: 0 additions & 6 deletions docs/source/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,6 @@
```

## Schema
### Serialization
```eval_rst
.. automodule:: hub.schema.serialize
:members:
```
### Schema
```eval_rst
.. autoclass:: hub.schema.audio.Audio
:members:
Expand Down
55 changes: 55 additions & 0 deletions hub/schema/audio.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,61 @@


class Audio(Tensor):

"""Schema for audio would define the maximum shape of the audio dataset and their respective sampling rate.
Example: This example uploads an `audio file` to a Hub dataset `audio_dataset` with `HubSchema` and retrieves it.
----------
>>> import hub
>>> from glob import glob
>>> from hub.schema import Audio, Primitive
>>> from hub import Dataset, transform, schema
>>> import librosa
>>> from librosa import display
>>> import numpy as np
>>> # Define schema
>>> my_schema={
>>> "wav": Audio(shape=(None,), max_shape=(1920000,), file_format="wav", dtype=float),
>>> "sampling_rate": Primitive(dtype=int),
>>> }
>>>
>>> sample = glob("audio.wav")
>>> # Define transform
>>> @transform(schema=my_schema)
>>> def load_transform(sample):
>>> audio, sr = librosa.load(sample, sr=None)
>>>
>>> return {
>>> "wav": audio,
>>> "sampling_rate": sr
>>> }
>>>
>>> # Returns a transform object
>>> ds = load_transform(sample)
>>>
>>> tag = "username/audio_dataset"
>>>
>>> # Pushes to hub
>>> ds2 = ds.store(tag)
>>> # Fetching from Hub
>>> data = Dataset(tag)
>>>
>>> # Fetch the first sample
>>> audio_sample = data["wav"][0].compute()
>>>
>>> # Audio file
array([ 9.15527344e-05, 2.13623047e-04, 0.00000000e+00, ...,
-2.73132324e-02, -2.99072266e-02, -2.44750977e-02])
"""

def __init__(
self,
shape: Tuple[int, ...] = (None,),
Expand Down
32 changes: 31 additions & 1 deletion hub/schema/bbox.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,37 @@ class BBox(Tensor):
"""| HubSchema` for a normalized bounding box.
Output: Tensor of type `float32` and shape `[4,]` which contains the
normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]`
normalized coordinates of the bounding box `[ymin, xmin, ymax, xmax]`
Example: This example uploads a dataset with a Bounding box schema and retrieves it.
----------
>>> import hub
>>> from hub import Dataset, schema
>>> from hub.schema import BBox
>>> import numpy as np
>>> tag = "username/dataset"
>>>
>>> # Create dataset
>>> ds = Dataset(
>>> tag,
>>> shape=(10,),
>>> schema={
>>> "bbox": schema.BBox(dtype="uint8"),
>>> },
>>> )
>>>
>>> ds["bbox", 1] = np.array([1,2,3,4])
>>> ds.flush()
>>> # Load data
>>> ds = Dataset(tag)
>>>
>>> print(ds["bbox"][1].compute())
[1 2 3 4]
"""

def __init__(
Expand Down
125 changes: 111 additions & 14 deletions hub/schema/class_label.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,116 @@ def _load_names_from_file(names_filepath):


class ClassLabel(Tensor):
"""`HubSchema` for integer class labels."""
"""
| Constructs a ClassLabel HubSchema.
| Returns an integer representation of the given classes. Preserves the class names so the integers can be converted back to strings if needed.
| There are 3 ways to define a ClassLabel, which correspond to the 3 arguments:
Note: In python2, the strings are encoded as utf-8.
>>> import hub
>>> from hub import Dataset, schema
>>> from hub.schema import ClassLabel
| 1. `num_classes`: create 0 to (num_classes-1) labels using ClassLabel(num_classes=`number of classes`)
----------
>>> tag = "username/dataset"
>>>
>>> # Create dataset
>>> ds=Dataset(
>>> tag,
>>> shape=(10,),
>>> schema = {
>>> "label_1": ClassLabel(num_classes=3),
>>> },
>>> )
>>>
>>> ds["label_1",0] = 0
>>> ds["label_1",1] = 1
>>> ds["label_1",2] = 2
>>>
>>> ds.flush()
>>>
>>> # Load data
>>> ds = Dataset(tag)
>>>
>>> print(ds["label_1"][0].compute(True))
>>> print(ds["label_1"][1].compute(True))
>>> print(ds["label_1"][2].compute(True))
0
1
2
| 2. `names`: a list of label strings, using ClassLabel(names=[`class1`,`class2`])
----------
>>> tag = "username/dataset"
>>>
>>> # Define schema
>>> my_schema = {
>>> "label_2": ClassLabel(names=['class1', 'class2', 'class3']),
>>> }
>>>
>>> # Create dataset
>>> ds=Dataset(
>>> tag,
>>> shape=(10,),
>>> schema = my_schema,
>>> )
>>>
>>> ds.flush()
>>>
>>> # Load data
>>> ds = Dataset(tag)
| Note: ClassLabel HubSchema returns an integer representation of classes.
| Hence use `str2int()` and `int2str()` to convert between class names and their integer labels.
>>> print(my_schema["label_2"].str2int("class1"))
>>> print(my_schema["label_2"].int2str(0))
0
class1
| 3. `names_file`: a file containing the list of labels. ClassLabel(names_file="/path/to/file/names.txt")
Let's assume `names.txt` is located at `/content`:
----------
>>> # Contents of "names.txt"
welcome
to
hub
>>> tag = "username/dataset"
>>>
>>> # Define Schema
>>> my_schema = {
>>> "label_3": ClassLabel(names_file="/content/names.txt"),
>>> }
>>>
>>> # Create dataset
>>> ds=Dataset(
>>> tag,
>>> shape=(10,),
>>> schema = my_schema,
>>> )
>>>
>>> ds.flush()
>>>
>>> # Load data
>>> ds = Dataset(tag)
>>>
>>> print(my_schema["label_3"].int2str(0))
>>> print(my_schema["label_3"].int2str(1))
>>> print(my_schema["label_3"].int2str(2))
welcome
to
hub
"""

def __init__(
self,
Expand All @@ -26,19 +135,7 @@ def __init__(
chunks=None,
compressor="lz4",
):
"""| Constructs a ClassLabel HubSchema.
| Returns an integer representations of given classes. Preserves the names of classes to convert those back to strings if needed.
| There are 3 ways to define a ClassLabel, which correspond to the 3 arguments:
| * `num_classes`: create 0 to (num_classes-1) labels
| * `names`: a list of label strings
| * `names_file`: a file containing the list of labels.
Note: In python2, the strings are encoded as utf-8.
| Usage:
----------
>>> class_label_tensor = ClassLabel(num_classes=10)
>>> class_label_tensor = ClassLabel(names=['class1', 'class2', 'class3', ...])
>>> class_label_tensor = ClassLabel(names_file='/path/to/file/with/names')
"""
Parameters
----------
Expand Down
46 changes: 40 additions & 6 deletions hub/schema/image.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,56 @@


class Image(Tensor):
"""| `HubSchema` for images.
"""Schema for images would define the shape and structure for the dataset.
Output: `tf.Tensor` of type `tf.uint8` and shape `[height, width, num_channels]`
Output: `Tensor` of type `uint8` and shape `[height, width, num_channels]`
for BMP, JPEG, and PNG images
Example:
Example: This example uploads an `image` to a Hub dataset `image_dataset` with `HubSchema` and retrieves it.
----------
>>> image_tensor = Image(shape=(None, None, 1),
>>> encoding_format='png')
>>> import os
>>> import hub
>>> from hub import Dataset, schema
>>> from hub.schema import Image
>>> from numpy import asarray
>>> from PIL import Image as PILImage
>>> tag = "username/image_dataset"
>>>
>>> # Create dataset
>>> ds=Dataset(
>>> tag,
>>> shape=(10,),
>>> schema={
>>> "image": schema.Image((height, width, 3), dtype="uint8"),
>>> },
>>> )
>>>
>>> for index, image in enumerate(os.listdir("path/to/folder")):
>>> data = asarray(PILImage.open(os.path.join("path/to/folder", image)))
>>>
>>> # Upload data
>>> ds["image"][index] = data
>>>
>>> ds.flush()
>>> # Load data
>>> ds = Dataset(tag)
>>>
>>> for i in range(len(ds)):
>>> print(ds["image"][i].compute())
[[[124 112 64]
[124 112 64]
[124 112 64]
...
[236 237 232]
[238 239 234]
[238 239 234]]]
"""

def __init__(
self,
shape: Tuple[int, ...] = (None, None, 3),
dtype="uint8",
# TODO Add back encoding_format (probably named compress) when support for png and jpg support will be added
max_shape: Tuple[int, ...] = None,
chunks=None,
compressor="lz4",
Expand Down
62 changes: 61 additions & 1 deletion hub/schema/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,67 @@


class Text(Tensor):
"""`HubSchema` for text"""
"""Schema for text would define the shape and structure for the dataset.
Output: `Tensor` of shape `[length,]` that stores the encoded characters of a string
Example: This example uploads `text` to a Hub dataset with `HubSchema` and retrieves it.
For data with fixed `shape`
----------
>>> import hub
>>> from hub import Dataset, schema
>>> from hub.schema import Text
>>> tag = "username/dataset"
>>>
>>> # Create dataset
>>> ds = Dataset(
>>> tag,
>>> shape=(5,),
>>> schema = {
>>> "text": Text(shape=(11,)),
>>> },
>>> )
>>>
>>> ds["text",0] = "Hello There"
>>>
>>> ds.flush()
>>>
>>> # Load the data
>>> ds = Dataset(tag)
>>>
>>> print(ds["text"][0].compute())
Hello There
For data with variable `shape`, it is recommended to use `max_shape`
>>> ds = Dataset(
>>> tag,
>>> shape=(5,),
>>> schema = {
>>> "text": Text(max_shape=(10,)),
>>> },
>>> )
>>>
>>> ds["text",0] = "Welcome"
>>> ds["text",1] = "to"
>>> ds["text",2] = "Hub"
>>>
>>> ds.flush()
>>>
>>> # Load data
>>> ds = Dataset(tag)
>>>
>>> print(ds["text"][0].compute())
>>> print(ds["text"][1].compute())
>>> print(ds["text"][2].compute())
Welcome
to
Hub
"""

def __init__(
self,
Expand Down

0 comments on commit 3e3a363

Please sign in to comment.