diff --git a/docs/source/api.md b/docs/source/api.md index 04ba1748bc..1f937469f5 100644 --- a/docs/source/api.md +++ b/docs/source/api.md @@ -53,12 +53,6 @@ ``` ## Schema -### Serialization -```eval_rst -.. automodule:: hub.schema.serialize - :members: -``` -### Schema ```eval_rst .. autoclass:: hub.schema.audio.Audio :members: diff --git a/hub/schema/audio.py b/hub/schema/audio.py index 1b10ae50dd..62f53d01c7 100644 --- a/hub/schema/audio.py +++ b/hub/schema/audio.py @@ -10,6 +10,59 @@ class Audio(Tensor): + + """Schema for audio defines the maximum shape of the audio samples in a dataset and their respective sampling rate. + + Example: This example uploads an `audio file` to a Hub dataset `audio_dataset` with `HubSchema` and retrieves it. + + ---------- + >>> from glob import glob + >>> import hub + >>> from hub.schema import Audio, Primitive + >>> from hub import Dataset, transform, schema + >>> import librosa + >>> from librosa import display + >>> import numpy as np + + >>> # Define schema + >>> my_schema={ + >>> "wav": Audio(shape=(None,), max_shape=(1920000,), file_format="wav", dtype=float), + >>> "sampling_rate": Primitive(dtype=int), + >>> } + >>> + >>> sample = glob("audio.wav") + + >>> # Define transform + >>> @transform(schema=my_schema) + >>> def load_transform(sample): + >>> audio, sr = librosa.load(sample, sr=None) + >>> + >>> return { + >>> "wav": audio, + >>> "sampling_rate": sr + >>> } + >>> + >>> # Returns a transform object + >>> ds = load_transform(sample) + + >>> tag = "username/audio_dataset" + >>> + >>> # Pushes to hub + >>> ds2 = ds.store(tag) + + >>> # Fetching from Hub + >>> data = Dataset(tag) + >>> + >>> # Fetch the first sample + >>> audio_sample = data["wav"][0].compute() + >>> + >>> # Audio file + array([ 9.15527344e-05, 2.13623047e-04, 0.00000000e+00, ..., + -2.73132324e-02, -2.99072266e-02, -2.44750977e-02]) + + + """ + def __init__( self, shape: Tuple[int, ...] 
= (None,), diff --git a/hub/schema/bbox.py b/hub/schema/bbox.py index 8fa256ec02..cf1e702aef 100644 --- a/hub/schema/bbox.py +++ b/hub/schema/bbox.py @@ -12,7 +12,37 @@ class BBox(Tensor): """| HubSchema` for a normalized bounding box. Output: Tensor of type `float32` and shape `[4,]` which contains the - normalized coordinates of the bounding box `[xmin, ymin, xmax, ymax]` + normalized coordinates of the bounding box `[ymin, xmin, ymax, xmax]` + + + Example: This example uploads a dataset with a bounding box schema and retrieves it. + + ---------- + >>> import hub + >>> from hub import Dataset, schema + >>> from hub.schema import BBox + >>> import numpy as np + + >>> tag = "username/dataset" + >>> + >>> # Create dataset + >>> ds = Dataset( + >>> tag, + >>> shape=(10,), + >>> schema={ + >>> "bbox": schema.BBox(dtype="uint8"), + >>> }, + >>> ) + >>> + >>> ds["bbox", 1] = np.array([1,2,3,4]) + >>> ds.flush() + + >>> # Load data + >>> ds = Dataset(tag) + >>> + >>> print(ds["bbox"][1].compute()) + [1 2 3 4] + """ def __init__( diff --git a/hub/schema/class_label.py b/hub/schema/class_label.py index c44a2948b1..53caf6ef87 100644 --- a/hub/schema/class_label.py +++ b/hub/schema/class_label.py @@ -14,7 +14,116 @@ def _load_names_from_file(names_filepath): class ClassLabel(Tensor): - """`HubSchema` for integer class labels.""" + """ + | Constructs a ClassLabel HubSchema. + | Returns an integer representation of given classes. Preserves the names of classes to convert those back to strings if needed. + | There are 3 ways to define a ClassLabel, which correspond to the 3 arguments: + Note: In python2, the strings are encoded as utf-8. + + >>> import hub + >>> from hub import Dataset, schema + >>> from hub.schema import ClassLabel + + | 1. 
`num_classes`: create 0 to (num_classes-1) labels using ClassLabel(num_classes=`number of classes`) + + ---------- + >>> tag = "username/dataset" + >>> + >>> # Create dataset + >>> ds=Dataset( + >>> tag, + >>> shape=(10,), + >>> schema = { + >>> "label_1": ClassLabel(num_classes=3), + >>> }, + >>> ) + >>> + >>> ds["label_1",0] = 0 + >>> ds["label_1",1] = 1 + >>> ds["label_1",2] = 2 + >>> + >>> ds.flush() + >>> + >>> # Load data + >>> ds = Dataset(tag) + >>> + >>> print(ds["label_1"][0].compute(True)) + >>> print(ds["label_1"][1].compute(True)) + >>> print(ds["label_1"][2].compute(True)) + 0 + 1 + 2 + + + | 2. `names`: a list of label strings. ClassLabel(names=[`class1`,`class2`]) + + ---------- + >>> tag = "username/dataset" + >>> + >>> # Define schema + >>> my_schema = { + >>> "label_2": ClassLabel(names=['class1', 'class2', 'class3']), + >>> } + >>> + >>> # Create dataset + >>> ds=Dataset( + >>> tag, + >>> shape=(10,), + >>> schema = my_schema, + >>> ) + >>> + >>> ds.flush() + >>> + >>> # Load data + >>> ds = Dataset(tag) + + | Note: ClassLabel HubSchema returns an integer representation of classes. + | Hence use `str2int()` and `int2str()` to load classes. + + >>> print(my_schema["label_2"].str2int("class1")) + >>> print(my_schema["label_2"].int2str(0)) + 0 + class1 + + + | 3. `names_file`: a file containing the list of labels. 
ClassLabel(names_file="/path/to/file/names.txt") + + Let's assume `names.txt` is located at `/content`: + + ---------- + >>> # Contents of "names.txt" + welcome + to + hub + + + >>> tag = "username/dataset" + >>> + >>> # Define Schema + >>> my_schema = { + >>> "label_3": ClassLabel(names_file="/content/names.txt"), + >>> } + >>> + >>> # Create dataset + >>> ds=Dataset( + >>> tag, + >>> shape=(10,), + >>> schema = my_schema, + >>> ) + >>> + >>> ds.flush() + >>> + >>> # Load data + >>> ds = Dataset(tag) + >>> + >>> print(my_schema["label_3"].int2str(0)) + >>> print(my_schema["label_3"].int2str(1)) + >>> print(my_schema["label_3"].int2str(2)) + welcome + to + hub + + """ def __init__( self, @@ -26,19 +135,7 @@ def __init__( chunks=None, compressor="lz4", ): - """| Constructs a ClassLabel HubSchema. - | Returns an integer representations of given classes. Preserves the names of classes to convert those back to strings if needed. - | There are 3 ways to define a ClassLabel, which correspond to the 3 arguments: - | * `num_classes`: create 0 to (num_classes-1) labels - | * `names`: a list of label strings - | * `names_file`: a file containing the list of labels. - Note: In python2, the strings are encoded as utf-8. - - | Usage: - ---------- - >>> class_label_tensor = ClassLabel(num_classes=10) - >>> class_label_tensor = ClassLabel(names=['class1', 'class2', 'class3', ...]) - >>> class_label_tensor = ClassLabel(names_file='/path/to/file/with/names') + """ Parameters ---------- diff --git a/hub/schema/image.py b/hub/schema/image.py index d7f19fe339..fd447f7a25 100644 --- a/hub/schema/image.py +++ b/hub/schema/image.py @@ -12,22 +12,56 @@ class Image(Tensor): - """| `HubSchema` for images. + """Schema for images defines the shape and structure for the dataset. 
- Output: `tf.Tensor` of type `tf.uint8` and shape `[height, width, num_channels]` + Output: `Tensor` of type `uint8` and shape `[height, width, num_channels]` for BMP, JPEG, and PNG images - Example: + Example: This example uploads an `image` to a Hub dataset `image_dataset` with `HubSchema` and retrieves it. + ---------- - >>> image_tensor = Image(shape=(None, None, 1), - >>> encoding_format='png') + >>> import hub + >>> from hub import Dataset, schema + >>> from PIL import Image + >>> from numpy import asarray + + >>> tag = "username/image_dataset" + >>> + >>> # Create dataset + >>> ds=Dataset( + >>> tag, + >>> shape=(10,), + >>> schema={ + >>> "image": schema.Image((height, width, 3), dtype="uint8"), + >>> }, + >>> ) + >>> + >>> for index, image in enumerate(os.listdir("path/to/folder")): + >>> data = asarray(Image.open(image)) + >>> + >>> # Upload data + >>> ds["image"][index] = data + >>> + >>> ds.flush() + + >>> # Load data + >>> ds = Dataset(tag) + >>> + >>> for i in range(len(ds)): + >>> print(ds["image"][i].compute()) + [[[124 112 64] + [124 112 64] + [124 112 64] + ... + [236 237 232] + [238 239 234] + [238 239 234]]] """ def __init__( self, shape: Tuple[int, ...] = (None, None, 3), dtype="uint8", - # TODO Add back encoding_format (probably named compress) when support for png and jpg support will be added max_shape: Tuple[int, ...] = None, chunks=None, compressor="lz4", diff --git a/hub/schema/text.py b/hub/schema/text.py index 5993ae47a0..5c304070f9 100644 --- a/hub/schema/text.py +++ b/hub/schema/text.py @@ -12,7 +12,67 @@ class Text(Tensor): - """`HubSchema` for text""" + """Schema for text defines the shape and structure for the dataset. + + Output: `Tensor` of shape `shape` for fixed-length text, + or of variable shape bounded by `max_shape` + + Example: This example uploads `text` to a Hub dataset with `HubSchema` and retrieves it. 
+ + For data with fixed `shape` + ---------- + >>> import hub + >>> from hub import Dataset, schema + >>> from hub.schema import Text + + >>> tag = "username/dataset" + >>> + >>> # Create dataset + >>> ds = Dataset( + >>> tag, + >>> shape=(5,), + >>> schema = { + >>> "text": Text(shape=(11,)), + >>> }, + >>> ) + >>> + >>> ds["text",0] = "Hello There" + >>> + >>> ds.flush() + >>> + >>> # Load the data + >>> ds = Dataset(tag) + >>> + >>> print(ds["text"][0].compute()) + Hello There + + + For data with variable `shape`, it is recommended to use `max_shape` + + >>> ds = Dataset( + >>> tag, + >>> shape=(5,), + >>> schema = { + >>> "text": Text(max_shape=(10,)), + >>> }, + >>> ) + >>> + >>> ds["text",0] = "Welcome" + >>> ds["text",1] = "to" + >>> ds["text",2] = "Hub" + >>> + >>> ds.flush() + >>> + >>> # Load data + >>> ds = Dataset(tag) + >>> + >>> print(ds["text"][0].compute()) + >>> print(ds["text"][1].compute()) + >>> print(ds["text"][2].compute()) + Welcome + to + Hub + """ def __init__( self,