From 1bcc7d37b5840c0395e2cbfe8ddf7ee9e63f0227 Mon Sep 17 00:00:00 2001 From: prithviraj-maurya Date: Sat, 17 Oct 2020 14:17:42 +0530 Subject: [PATCH 01/16] fixing to_tensorflow method to take any shape #44 --- hub/collections/dataset/core.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hub/collections/dataset/core.py b/hub/collections/dataset/core.py index 8657d5ffef..1f76f18bdd 100644 --- a/hub/collections/dataset/core.py +++ b/hub/collections/dataset/core.py @@ -630,7 +630,7 @@ def tf_dtype(np_dtype): return tf.data.Dataset.from_generator( tf_gen, output_types=output_types, - output_shapes=output_shapes, + # output_shapes=output_shapes, # commenting out this so TF takes the shape whatever is passed ) From 9de6e20f3fafae2595610c91805b09b891192e47 Mon Sep 17 00:00:00 2001 From: prithviraj-maurya Date: Sat, 17 Oct 2020 15:26:53 +0530 Subject: [PATCH 02/16] adding dtypes to COCO examples #44 --- examples/coco/upload_coco2017.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/coco/upload_coco2017.py b/examples/coco/upload_coco2017.py index c5aff4369c..686d6edd65 100644 --- a/examples/coco/upload_coco2017.py +++ b/examples/coco/upload_coco2017.py @@ -25,9 +25,9 @@ def meta(self): "iscrowd": {"shape": (1,), "dtype": "object", "chunksize": 1000}, "image_id": {"shape": (1,), "dtype": "int64"}, "bbox": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "category_id": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "id": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "image": {"shape": (1,), "dtype": "object", "chunksize": 100}, + "category_id": {"shape": (1,), "int64": "object", "chunksize": 1000}, + "id": {"shape": (1,), "dtype": "uint8", "chunksize": 1000}, + "image": {"shape": (1,), "dtype": "uint8", "chunksize": 100}, } def __call__(self, input): @@ -36,21 +36,21 @@ def __call__(self, input): # print(f"Image id: {input['image_id']}") ds["image_id"] = input["image_id"] info = input["info"] - ds["image"] = np.empty(1, object) + ds["image"] = np.empty(1, np.uint8) ds["image"][0] = np.array( Image.open( os.path.join( self._args.dataset_path, get_image_name(self._args, self._tag, input["image_id"]), ) - ) + ), dtype=np.uint8 ) ds["segmentation"] = np.empty(1, object) ds["area"] = np.empty(1, object) ds["iscrowd"] = np.empty(1, object) ds["bbox"] = np.empty(1, object) - ds["category_id"] = np.empty(1, object) - ds["id"] = np.empty(1, object) + ds["category_id"] = np.empty(1, np.uint8) + ds["id"] = np.empty(1, np.uint8) ds["segmentation"][0] = [anno["segmentation"] for anno in info] ds["area"][0] = [anno["area"] for anno in info] From 8dd2222fb58efb27485f4d9ce9b904173821832f Mon Sep 17 00:00:00 2001 From: prithviraj-maurya Date: Sat, 24 Oct 2020 17:21:35 +0530 Subject: [PATCH 03/16] fixing to_tensorflow method to take any shape #44 #90 --- examples/coco/upload_coco2017.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/examples/coco/upload_coco2017.py b/examples/coco/upload_coco2017.py index 686d6edd65..d1b575f9cb 100644 --- a/examples/coco/upload_coco2017.py +++ b/examples/coco/upload_coco2017.py @@ -20,14 +20,14 @@ def __init__(self, args, tag): def meta(self): return { - "segmentation": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "area": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "iscrowd": {"shape": (1,), "dtype": "object", "chunksize": 1000}, + "segmentation": {"shape": (1,), "dtype": "uint32", "chunksize": 1000}, + "area": {"shape": 
(1,), "dtype": "uint32", "chunksize": 1000}, + "iscrowd": {"shape": (1,), "dtype": "uint8", "chunksize": 1000}, "image_id": {"shape": (1,), "dtype": "int64"}, - "bbox": {"shape": (1,), "dtype": "object", "chunksize": 1000}, - "category_id": {"shape": (1,), "int64": "object", "chunksize": 1000}, - "id": {"shape": (1,), "dtype": "uint8", "chunksize": 1000}, - "image": {"shape": (1,), "dtype": "uint8", "chunksize": 100}, + "bbox": {"shape": (1,), "dtype": "uint16", "chunksize": 1000}, + "category_id": {"shape": (1,), "dtype": "int64", "chunksize": 1000}, + "id": {"shape": (1,), "dtype": "uint32", "chunksize": 1000}, + "image": {"shape": (1,), "dtype": "uint32", "chunksize": 100}, } def __call__(self, input): @@ -36,21 +36,21 @@ def __call__(self, input): # print(f"Image id: {input['image_id']}") ds["image_id"] = input["image_id"] info = input["info"] - ds["image"] = np.empty(1, np.uint8) + ds["image"] = np.empty(1, np.uint32) ds["image"][0] = np.array( Image.open( os.path.join( self._args.dataset_path, get_image_name(self._args, self._tag, input["image_id"]), ) - ), dtype=np.uint8 + ), dtype=np.uint32 ) - ds["segmentation"] = np.empty(1, object) - ds["area"] = np.empty(1, object) - ds["iscrowd"] = np.empty(1, object) - ds["bbox"] = np.empty(1, object) + ds["segmentation"] = np.empty(1, np.uint32) + ds["area"] = np.empty(1, np.uint32) + ds["iscrowd"] = np.empty(1, np.uint8) + ds["bbox"] = np.empty(1, np.np.uint16) ds["category_id"] = np.empty(1, np.uint8) - ds["id"] = np.empty(1, np.uint8) + ds["id"] = np.empty(1, np.uint32) ds["segmentation"][0] = [anno["segmentation"] for anno in info] ds["area"][0] = [anno["area"] for anno in info] From 6a384daea71b6af4e77aab9c42448a26235c870e Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 09:36:59 +0530 Subject: [PATCH 04/16] add upload.py --- examples/3D Object Dataset/upload.py | 98 ++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/3D Object Dataset/upload.py diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py new file mode 100644 index 0000000000..4322cc77ca --- /dev/null +++ b/examples/3D Object Dataset/upload.py @@ -0,0 +1,98 @@ +""" +Dataset Download Source: http://cvgl.stanford.edu/data2/3Ddataset.zip +Dataset format: Images(.bmp file) +Dataset Features: bicycle, car, cellphone, head, iron, monitor, mouse, shoe, stapler, toaster + +Folder Structure: +3Ddataset + -bicycle + -bicycle_1 + - Various Images in .bmp format + -bicycle_2 + -bicycle_3 + ... + -bicycle_10 + -car + -car_1 + -car_2 + ... + -car_10 + ... 
+ Total 10 features +""" + +import os +import numpy as np +from PIL import Image +import torchvision.transforms as transforms +from hub import Transform, dataset +import pandas as pd + +NUM_FEATURES = 10 + + +class DatasetGenerator(Transform): + def meta(self): + # here we specify the attributes of return type + return { + "image_label": {"shape": (1,), "dtype": "int", "dtag": "text"}, + "named_image_label": {"shape": (1,), "dtype": "object", "dtag": "text"}, + "image": { + "shape": (1,), + "dtype": "object", + "chunksize": 100, + "dtag": "image", + }, + } + + def forward(self, image_info): + # we need to return a dictionary of numpy arrays from here + ds = {} + ds["image_label"] = np.empty(1, dtype="int") + ds["image_label"][0] = image_info["image_label"] + + ds["named_image_label"] = np.empty(1, dtype="object") + ds["named_image_label"][0] = image_info["named_image_label"] + + ds["image"] = np.empty(1, object) + ds["image"][0] = np.array(Image.open(image_info["image_path"]).convert("RGB")) + print("------------------------------------------------") + print(ds["named_image_label"][0] + " image loaded successfully") + return ds + + +def map_labels(labels_list): + dic = {labels_list[i]: i for i in range(1, NUM_FEATURES + 1)} + return dic + + +def load_dataset(base_path): + labels_list = os.listdir(base_path) + labels_dict = map_labels(labels_list) + image_info_list = [] + for label in labels_list: + for label_num in range(1, NUM_FEATURES + 1): + curr_path = base_path + "/" + label + "/" + label + "_" + str(label_num) + images_list = os.listdir(curr_path) + for image in images_list: + image_info = {} + if image.lower().startswith(label): # all images' name starts with the feature name (observation) + image_info["image_path"] = curr_path + "/" + image + image_info["image_label"] = labels_dict[label] + image_info["named_image_label"] = label + image_info_list.append(image_info) + + # the generator iterates through the argument given, one by one and applies forward. This is done lazily. 
+ ds = dataset.generate(DatasetGenerator(), image_info_list) + return ds + + +def main(): + base_path = "./3Ddataset" + # stores the dataset in username/datasetname + ds = load_dataset(base_path) + ds.store("ThreeDimensionalDataset") + + +if __name__ == "__main__": + main() From b2b7ee321d98331b3643d5cafa6420cb89e3e3b8 Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 09:45:29 +0530 Subject: [PATCH 05/16] format with black --- examples/3D Object Dataset/upload.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py index 4322cc77ca..d25605d60e 100644 --- a/examples/3D Object Dataset/upload.py +++ b/examples/3D Object Dataset/upload.py @@ -76,7 +76,9 @@ def load_dataset(base_path): images_list = os.listdir(curr_path) for image in images_list: image_info = {} - if image.lower().startswith(label): # all images' name starts with the feature name (observation) + if image.lower().startswith( + label + ): # all images' name starts with the feature name (observation) image_info["image_path"] = curr_path + "/" + image image_info["image_label"] = labels_dict[label] image_info["named_image_label"] = label From d951d27d0d58c25c7375dcf2d247ffbb30cdc1d6 Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 09:53:44 +0530 Subject: [PATCH 06/16] add code of conduct --- CODE_OF_CONDUCT.md | 76 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 CODE_OF_CONDUCT.md diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000..09827eb16a --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,76 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +In the interest of fostering an open and welcoming environment, we as +contributors and maintainers pledge to making participation in our project and +our community a harassment-free experience for everyone, regardless of age, body +size, disability, ethnicity, sex characteristics, gender identity and expression, +level of experience, education, socio-economic status, nationality, personal +appearance, race, religion, or sexual identity and orientation. + +## Our Standards + +Examples of behavior that contributes to creating a positive environment +include: + +* Using welcoming and inclusive language +* Being respectful of differing viewpoints and experiences +* Gracefully accepting constructive criticism +* Focusing on what is best for the community +* Showing empathy towards other community members + +Examples of unacceptable behavior by participants include: + +* The use of sexualized language or imagery and unwelcome sexual attention or + advances +* Trolling, insulting/derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or electronic + address, without explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Our Responsibilities + +Project maintainers are responsible for clarifying the standards of acceptable +behavior and are expected to take appropriate and fair corrective action in +response to any instances of unacceptable behavior. 
+ +Project maintainers have the right and responsibility to remove, edit, or +reject comments, commits, code, wiki edits, issues, and other contributions +that are not aligned to this Code of Conduct, or to ban temporarily or +permanently any contributor for other behaviors that they deem inappropriate, +threatening, offensive, or harmful. + +## Scope + +This Code of Conduct applies both within project spaces and in public spaces +when an individual is representing the project or its community. Examples of +representing a project or community include using an official project e-mail +address, posting via an official social media account, or acting as an appointed +representative at an online or offline event. Representation of a project may be +further defined and clarified by project maintainers. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported by contacting the project team at . All +complaints will be reviewed and investigated and will result in a response that +is deemed necessary and appropriate to the circumstances. The project team is +obligated to maintain confidentiality with regard to the reporter of an incident. +Further details of specific enforcement policies may be posted separately. + +Project maintainers who do not follow or enforce the Code of Conduct in good +faith may face temporary or permanent repercussions as determined by other +members of the project's leadership. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4, +available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see +https://www.contributor-covenant.org/faq From 0efce91d6486c277e5226aa723285d281eb658ce Mon Sep 17 00:00:00 2001 From: Davit Buniatyan Date: Wed, 28 Oct 2020 23:08:44 -0700 Subject: [PATCH 07/16] Delete creds.md --- docs/source/tutorials/creds.md | 31 ------------------------------- 1 file changed, 31 deletions(-) delete mode 100644 docs/source/tutorials/creds.md diff --git a/docs/source/tutorials/creds.md b/docs/source/tutorials/creds.md deleted file mode 100644 index e3f27f1c81..0000000000 --- a/docs/source/tutorials/creds.md +++ /dev/null @@ -1,31 +0,0 @@ -# Setup Credentials - -This guide will walk you through setting up your credentials so you can start using Snark: Dataflow right away. - -In your project directory create `` file and fill the following: - -``` -[default] -aws_access_key_id = -aws_secret_access_key = -region: us-east-1 -``` - -Run `usecases/setup_creds.py` to set up your Agmri and Intelinair credentials (the script is in the `dataflow` directory). - -A prompt will appear asking for your AgMRI credentials. - -``` -> Admin_username: -> Admin_password: -> Environment: production -``` - -The prompt will then ask you for the IntelinAir AWS credentials for field access. - -``` -> Aws_access_key_id: -> Aws_secret_access_key: -``` - -That's it! 
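A quick illustration of the TensorFlow behavior that PATCH 01/16 relies on: when `output_shapes` is not passed to `tf.data.Dataset.from_generator`, every component shape is recorded as unknown, so the generator may yield arrays whose shapes vary from element to element. The sketch below is illustrative only; it assumes TensorFlow 2.x and uses a toy generator in place of Hub's `tf_gen`:

```python
# Minimal sketch (assumes TensorFlow 2.x): with output_shapes omitted, the
# element shape stays unknown, so variable-shape arrays are accepted.
import numpy as np
import tensorflow as tf

def toy_gen():
    # illustrative stand-in for Hub's tf_gen, yielding images of different sizes
    yield np.zeros((28, 28, 3), dtype=np.uint8)
    yield np.zeros((64, 32, 3), dtype=np.uint8)

ds = tf.data.Dataset.from_generator(toy_gen, output_types=tf.uint8)
print(ds.element_spec)   # TensorSpec(shape=<unknown>, dtype=tf.uint8, name=None)
for x in ds:
    print(x.shape)       # (28, 28, 3), then (64, 32, 3)
```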
From 5637b2860e4530c53559c27138647d92e25bed7a Mon Sep 17 00:00:00 2001 From: Michael Zhou Date: Wed, 28 Oct 2020 21:51:43 -0700 Subject: [PATCH 08/16] Resolve errors and change cases in documentation --- docs/source/concepts/tensor.md | 2 +- docs/source/integrations/pytorch.md | 4 ++-- docs/source/integrations/tensorflow.md | 4 ++-- docs/source/simple.md | 2 +- docs/source/storage/tutorials.md | 4 ++-- docs/source/tutorials/pytorch.md | 2 +- docs/source/why.md | 2 +- 7 files changed, 10 insertions(+), 10 deletions(-) diff --git a/docs/source/concepts/tensor.md b/docs/source/concepts/tensor.md index 2cfdfc8f03..d81c262f02 100644 --- a/docs/source/concepts/tensor.md +++ b/docs/source/concepts/tensor.md @@ -28,7 +28,7 @@ t = tensor.from_zeros(np.zeros((10, 512, 512))) ## Concat or Stack -Concating or stacking tensors works as in other frameworks. +Concatenating or stacking tensors works as in other frameworks. ```python from hub import tensor diff --git a/docs/source/integrations/pytorch.md b/docs/source/integrations/pytorch.md index 22b8e034b1..545d92aacf 100644 --- a/docs/source/integrations/pytorch.md +++ b/docs/source/integrations/pytorch.md @@ -1,6 +1,6 @@ # PyTorch -Here is an example to transform the dataset into pytorch form. +Here is an example to transform the dataset into Pytorch form. ``` import torch @@ -9,7 +9,7 @@ from hub import dataset # Load data ds = dataset.load("mnist/mnist") -# Transform into pytorch +# Transform into Pytorch ds = ds.to_pytorch(transform=None) ds = torch.utils.data.DataLoader( ds, batch_size=8, num_workers=8, collate_fn=ds.collate_fn diff --git a/docs/source/integrations/tensorflow.md b/docs/source/integrations/tensorflow.md index 4519c43946..0062494c6e 100644 --- a/docs/source/integrations/tensorflow.md +++ b/docs/source/integrations/tensorflow.md @@ -1,6 +1,6 @@ # Tensorflow -Here is an example to transform the dataset into tensorflow form. +Here is an example to transform the dataset into Tensorflow form. ```python from hub import dataset @@ -8,7 +8,7 @@ from hub import dataset # Load data ds = dataset.load("mnist/mnist") -# tansform into Tensorflow dataset +# transform into Tensorflow dataset ds = ds.to_tensorflow().batch(8) # Iterate over the data diff --git a/docs/source/simple.md b/docs/source/simple.md index 312210870f..1606fcff0c 100644 --- a/docs/source/simple.md +++ b/docs/source/simple.md @@ -22,7 +22,7 @@ mnist["data"][0:1000].compute() ## Train a model -Load the data and directly train your model using pytorch +Load the data and directly train your model using Pytorch ```python import hub diff --git a/docs/source/storage/tutorials.md b/docs/source/storage/tutorials.md index 60a577a1f4..060c77599f 100644 --- a/docs/source/storage/tutorials.md +++ b/docs/source/storage/tutorials.md @@ -71,7 +71,7 @@ ds["input", 1:3] = np.ones((2, 25, 25)) ``` ## Idea of chunking -Chunks are the most important part of Hub arrays. Imagine that you have a really large array stored in the cloud and want to access only some significantly smaller part of it. Let us say you have an array of 100000 images with shape ```(100000, 1024, 1024, 3)```. If we stored this array wholly without dividing into multiple chunks then in order to request only few images from it we would need to load the entire array into RAM which would be impossible and even if some computer would have that big RAM, downloading the whole array would take a lot of time. Instead we store the array in chunks and we only downlaod the chunks that contain the requested part of the array. 
+Chunks are the most important part of Hub arrays. Imagine that you have a really large array stored in the cloud and want to access only some significantly smaller part of it. Let us say you have an array of 100000 images with shape ```(100000, 1024, 1024, 3)```. If we stored this array wholly without dividing into multiple chunks then in order to request only few images from it we would need to load the entire array into RAM which would be impossible and even if some computer would have that big RAM, downloading the whole array would take a lot of time. Instead we store the array in chunks and we only download the chunks that contain the requested part of the array. ## How to choose a proper chunk size Choosing a proper chunk size is crucial for performance. The chunks must be much bigger and take longer time to download than the overhead of request to cloud ~1ms. Chunks also should be small enough to fit multiple chunks into RAM. Usually, we can have up to 1 chunk per thread. @@ -91,7 +91,7 @@ Compresslevel is a float number from 0 to 1. Where 1 is the fastest and 0 is the You can easily find about all of our supported compressors, their effectiveness, and performance in the internet. ## Integration with Pytorch and TensorFlow -Hub datasets can easily be transformed into Pytoch and Tensorflow formats. +Hub datasets can easily be transformed into Pytorch and Tensorflow formats. Pytorch: ```python datahub = hub.fs("./data/cache").connect() diff --git a/docs/source/tutorials/pytorch.md b/docs/source/tutorials/pytorch.md index 00961f90d6..cd8ce1efb5 100644 --- a/docs/source/tutorials/pytorch.md +++ b/docs/source/tutorials/pytorch.md @@ -1,6 +1,6 @@ # Pytorch Integration -In this tutorial we will retreive our dataset from the local cache and integrate it with `Pytorch` for further use. +In this tutorial we will retrieve our dataset from the local cache and integrate it with `Pytorch` for further use. For a detailed guide on dataset generation and storage see [this tutorial](samples.md). diff --git a/docs/source/why.md b/docs/source/why.md index cd38d4dfa7..51533c90f5 100644 --- a/docs/source/why.md +++ b/docs/source/why.md @@ -10,7 +10,7 @@ We realized that there are a few problems related with current workflow in deep 2. **Code dependency on local folder structure**. People use a folder structure to store images or videos. As a result, the data input pipeline has to take into consideration the raw folder structure which creates unnecessary & error-prone code dependency of the dataset folder structure. -3. **Managing preprocessing pipelines**. If you want to run some preprocessing, it would be ideal to save the preprocessed images as a local cache for training.But it’s usually hard to manage & version control the preprocessed images locally when there are multiple preprocessing pipelies and the dataset is very big. +3. **Managing preprocessing pipelines**. If you want to run some preprocessing, it would be ideal to save the preprocessed images as a local cache for training.But it’s usually hard to manage & version control the preprocessed images locally when there are multiple preprocessing pipelines and the dataset is very big. 4. **Visualization**. It's difficult to visualize the raw data or preprocessed dataset on servers. 
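To make the chunking discussion in `docs/source/storage/tutorials.md` concrete, here is a rough back-of-the-envelope sketch for the example above. The chunk shape is an assumed, illustrative value, not Hub's actual default:

```python
# Rough arithmetic for the (100000, 1024, 1024, 3) uint8 example (illustrative numbers only).
from math import prod

shape = (100_000, 1024, 1024, 3)   # ~315 GB of uint8 data, far too large for RAM
chunk_shape = (16, 1024, 1024, 3)  # assumed chunk layout, ~50 MB per chunk

full_gb = prod(shape) / 1e9
chunk_mb = prod(chunk_shape) / 1e6

# Requesting images 120..130 only downloads the chunks that overlap that slice.
chunks_fetched = 130 // chunk_shape[0] - 120 // chunk_shape[0] + 1

print(f"{full_gb:.0f} GB total, {chunk_mb:.0f} MB per chunk, {chunks_fetched} chunks fetched")
```

Each chunk is large enough that the ~1 ms request overhead is negligible, yet small enough that several chunks fit in RAM at once, which is the trade-off the tutorial describes.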
From 257ead7877a6951c92223aee672e216cc1dd2bd0 Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 16:44:00 +0530 Subject: [PATCH 09/16] remove upload.py --- examples/3D Object Dataset/upload.py | 100 --------------------------- 1 file changed, 100 deletions(-) delete mode 100644 examples/3D Object Dataset/upload.py diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py deleted file mode 100644 index d25605d60e..0000000000 --- a/examples/3D Object Dataset/upload.py +++ /dev/null @@ -1,100 +0,0 @@ -""" -Dataset Download Source: http://cvgl.stanford.edu/data2/3Ddataset.zip -Dataset format: Images(.bmp file) -Dataset Features: bicycle, car, cellphone, head, iron, monitor, mouse, shoe, stapler, toaster - -Folder Structure: -3Ddataset - -bicycle - -bicycle_1 - - Various Images in .bmp format - -bicycle_2 - -bicycle_3 - ... - -bicycle_10 - -car - -car_1 - -car_2 - ... - -car_10 - ... - Total 10 features -""" - -import os -import numpy as np -from PIL import Image -import torchvision.transforms as transforms -from hub import Transform, dataset -import pandas as pd - -NUM_FEATURES = 10 - - -class DatasetGenerator(Transform): - def meta(self): - # here we specify the attributes of return type - return { - "image_label": {"shape": (1,), "dtype": "int", "dtag": "text"}, - "named_image_label": {"shape": (1,), "dtype": "object", "dtag": "text"}, - "image": { - "shape": (1,), - "dtype": "object", - "chunksize": 100, - "dtag": "image", - }, - } - - def forward(self, image_info): - # we need to return a dictionary of numpy arrays from here - ds = {} - ds["image_label"] = np.empty(1, dtype="int") - ds["image_label"][0] = image_info["image_label"] - - ds["named_image_label"] = np.empty(1, dtype="object") - ds["named_image_label"][0] = image_info["named_image_label"] - - ds["image"] = np.empty(1, object) - ds["image"][0] = np.array(Image.open(image_info["image_path"]).convert("RGB")) - print("------------------------------------------------") - print(ds["named_image_label"][0] + " image loaded successfully") - return ds - - -def map_labels(labels_list): - dic = {labels_list[i]: i for i in range(1, NUM_FEATURES + 1)} - return dic - - -def load_dataset(base_path): - labels_list = os.listdir(base_path) - labels_dict = map_labels(labels_list) - image_info_list = [] - for label in labels_list: - for label_num in range(1, NUM_FEATURES + 1): - curr_path = base_path + "/" + label + "/" + label + "_" + str(label_num) - images_list = os.listdir(curr_path) - for image in images_list: - image_info = {} - if image.lower().startswith( - label - ): # all images' name starts with the feature name (observation) - image_info["image_path"] = curr_path + "/" + image - image_info["image_label"] = labels_dict[label] - image_info["named_image_label"] = label - image_info_list.append(image_info) - - # the generator iterates through the argument given, one by one and applies forward. This is done lazily. 
- ds = dataset.generate(DatasetGenerator(), image_info_list) - return ds - - -def main(): - base_path = "./3Ddataset" - # stores the dataset in username/datasetname - ds = load_dataset(base_path) - ds.store("ThreeDimensionalDataset") - - -if __name__ == "__main__": - main() From db5fb21c64d219a77f2c2eb75663eabdae555976 Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 21:37:37 +0530 Subject: [PATCH 10/16] add --- examples/3D Object Dataset/upload.py | 100 +++++++++++++++++++++++++++ 1 file changed, 100 insertions(+) create mode 100644 examples/3D Object Dataset/upload.py diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py new file mode 100644 index 0000000000..d25605d60e --- /dev/null +++ b/examples/3D Object Dataset/upload.py @@ -0,0 +1,100 @@ +""" +Dataset Download Source: http://cvgl.stanford.edu/data2/3Ddataset.zip +Dataset format: Images(.bmp file) +Dataset Features: bicycle, car, cellphone, head, iron, monitor, mouse, shoe, stapler, toaster + +Folder Structure: +3Ddataset + -bicycle + -bicycle_1 + - Various Images in .bmp format + -bicycle_2 + -bicycle_3 + ... + -bicycle_10 + -car + -car_1 + -car_2 + ... + -car_10 + ... + Total 10 features +""" + +import os +import numpy as np +from PIL import Image +import torchvision.transforms as transforms +from hub import Transform, dataset +import pandas as pd + +NUM_FEATURES = 10 + + +class DatasetGenerator(Transform): + def meta(self): + # here we specify the attributes of return type + return { + "image_label": {"shape": (1,), "dtype": "int", "dtag": "text"}, + "named_image_label": {"shape": (1,), "dtype": "object", "dtag": "text"}, + "image": { + "shape": (1,), + "dtype": "object", + "chunksize": 100, + "dtag": "image", + }, + } + + def forward(self, image_info): + # we need to return a dictionary of numpy arrays from here + ds = {} + ds["image_label"] = np.empty(1, dtype="int") + ds["image_label"][0] = image_info["image_label"] + + ds["named_image_label"] = np.empty(1, dtype="object") + ds["named_image_label"][0] = image_info["named_image_label"] + + ds["image"] = np.empty(1, object) + ds["image"][0] = np.array(Image.open(image_info["image_path"]).convert("RGB")) + print("------------------------------------------------") + print(ds["named_image_label"][0] + " image loaded successfully") + return ds + + +def map_labels(labels_list): + dic = {labels_list[i]: i for i in range(1, NUM_FEATURES + 1)} + return dic + + +def load_dataset(base_path): + labels_list = os.listdir(base_path) + labels_dict = map_labels(labels_list) + image_info_list = [] + for label in labels_list: + for label_num in range(1, NUM_FEATURES + 1): + curr_path = base_path + "/" + label + "/" + label + "_" + str(label_num) + images_list = os.listdir(curr_path) + for image in images_list: + image_info = {} + if image.lower().startswith( + label + ): # all images' name starts with the feature name (observation) + image_info["image_path"] = curr_path + "/" + image + image_info["image_label"] = labels_dict[label] + image_info["named_image_label"] = label + image_info_list.append(image_info) + + # the generator iterates through the argument given, one by one and applies forward. This is done lazily. 
+ ds = dataset.generate(DatasetGenerator(), image_info_list) + return ds + + +def main(): + base_path = "./3Ddataset" + # stores the dataset in username/datasetname + ds = load_dataset(base_path) + ds.store("ThreeDimensionalDataset") + + +if __name__ == "__main__": + main() From 2ee0609131178d16a3dc5d6e07a3f1a63a42fe97 Mon Sep 17 00:00:00 2001 From: sohamsshah Date: Thu, 29 Oct 2020 21:38:45 +0530 Subject: [PATCH 11/16] update upload.py --- examples/3D Object Dataset/upload.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py index d25605d60e..ded53e18c5 100644 --- a/examples/3D Object Dataset/upload.py +++ b/examples/3D Object Dataset/upload.py @@ -36,10 +36,10 @@ def meta(self): # here we specify the attributes of return type return { "image_label": {"shape": (1,), "dtype": "int", "dtag": "text"}, - "named_image_label": {"shape": (1,), "dtype": "object", "dtag": "text"}, + "named_image_label": {"shape": (1,), "dtype": "U25", "dtag": "text"}, "image": { "shape": (1,), - "dtype": "object", + "dtype": "uint32", "chunksize": 100, "dtag": "image", }, From 4beb7407bb4b696dc7fca2ae480ed37bb05bbf58 Mon Sep 17 00:00:00 2001 From: Anselm Hahn Date: Thu, 29 Oct 2020 20:50:55 +0100 Subject: [PATCH 12/16] Fixed Issue: Add a flake8 statistic #152 --- .circleci/config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4bfa368833..fb87b88c5b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -45,6 +45,11 @@ jobs: pip install pytest pip install -r requirements.txt pip install -e . + - run: + name: "Checking code style" + command: | + pip install flake8 + flake8 . --count --exit-zero --max-complexity=10 --statistics - run: name: "Running tests" command: | From aba6343988642b744f6b6fb6fa31dab6e882f450 Mon Sep 17 00:00:00 2001 From: Anselm Hahn Date: Thu, 29 Oct 2020 20:55:27 +0100 Subject: [PATCH 13/16] Cleanded the master branch --- .circleci/config.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.circleci/config.yml b/.circleci/config.yml index fb87b88c5b..4bfa368833 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -45,11 +45,6 @@ jobs: pip install pytest pip install -r requirements.txt pip install -e . - - run: - name: "Checking code style" - command: | - pip install flake8 - flake8 . --count --exit-zero --max-complexity=10 --statistics - run: name: "Running tests" command: | From 3a51fd436b8e109cc4a97d6f1683b81e9b80098b Mon Sep 17 00:00:00 2001 From: Anselm Hahn Date: Thu, 29 Oct 2020 20:56:55 +0100 Subject: [PATCH 14/16] Fixed Issue: Add a flake8 statistic #152 --- .circleci/config.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.circleci/config.yml b/.circleci/config.yml index 4bfa368833..fb87b88c5b 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -45,6 +45,11 @@ jobs: pip install pytest pip install -r requirements.txt pip install -e . + - run: + name: "Checking code style" + command: | + pip install flake8 + flake8 . 
--count --exit-zero --max-complexity=10 --statistics - run: name: "Running tests" command: | From fc742ca6ff2be9872dc08c3df3b20b2af6d6ff0b Mon Sep 17 00:00:00 2001 From: Anselm Hahn Date: Thu, 29 Oct 2020 21:01:21 +0100 Subject: [PATCH 15/16] Idea for Create a circle-ci shield #150 --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index a7e6743c53..1829bb898a 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,9 @@ PyPI version PyPI version + + CircleCI + codecov tweet From 9d36dc08d901150f7d1453c581a55fa5bf3bed00 Mon Sep 17 00:00:00 2001 From: Hugo van Kemenade Date: Fri, 30 Oct 2020 20:17:14 +0200 Subject: [PATCH 16/16] Fix Black formatting --- examples/coco/upload_coco2017.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/coco/upload_coco2017.py b/examples/coco/upload_coco2017.py index a01e7e7177..ad9205c655 100644 --- a/examples/coco/upload_coco2017.py +++ b/examples/coco/upload_coco2017.py @@ -43,7 +43,8 @@ def __call__(self, input): self._args.dataset_path, get_image_name(self._args, self._tag, input["image_id"]), ) - ), dtype=np.uint32 + ), + dtype=np.uint32, ) ds["segmentation"] = np.empty(1, np.uint32) ds["area"] = np.empty(1, np.uint32)
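A small note on the `"U25"` dtype that PATCH 11/16 introduces for `named_image_label`: NumPy's fixed-width unicode dtype silently truncates anything longer than the declared width, so the width must cover the longest label (the class names listed in the upload script's docstring are all well under 25 characters). A minimal, Hub-independent illustration:

```python
import numpy as np

labels = np.empty(1, dtype="U25")

labels[0] = "cellphone"   # the longest 3D-dataset class names fit easily
print(labels[0])          # cellphone

labels[0] = "x" * 30      # anything longer than 25 characters is silently truncated
print(len(labels[0]))     # 25
```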