Resolve merge conflict

Gyanachand1 · Oct 30, 2020 · 8a41279 · 8a41279
2 parents b56a9a3 + b2d6740
commit 8a41279
Show file tree

Hide file tree

Showing 12 changed files with 194 additions and 41 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -45,6 +45,11 @@ jobs:
             pip install pytest
             pip install -r requirements.txt
             pip install -e .
+      - run:
+          name: "Checking code style"
+          command: |
+            pip install flake8
+            flake8 . --count --exit-zero --max-complexity=10 --statistics
       - run:
           name: "Running tests"
           command: |

diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
@@ -0,0 +1,76 @@
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+In the interest of fostering an open and welcoming environment, we as
+contributors and maintainers pledge to making participation in our project and
+our community a harassment-free experience for everyone, regardless of age, body
+size, disability, ethnicity, sex characteristics, gender identity and expression,
+level of experience, education, socio-economic status, nationality, personal
+appearance, race, religion, or sexual identity and orientation.
+
+## Our Standards
+
+Examples of behavior that contributes to creating a positive environment
+include:
+
+* Using welcoming and inclusive language
+* Being respectful of differing viewpoints and experiences
+* Gracefully accepting constructive criticism
+* Focusing on what is best for the community
+* Showing empathy towards other community members
+
+Examples of unacceptable behavior by participants include:
+
+* The use of sexualized language or imagery and unwelcome sexual attention or
+ advances
+* Trolling, insulting/derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or electronic
+ address, without explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+ professional setting
+
+## Our Responsibilities
+
+Project maintainers are responsible for clarifying the standards of acceptable
+behavior and are expected to take appropriate and fair corrective action in
+response to any instances of unacceptable behavior.
+
+Project maintainers have the right and responsibility to remove, edit, or
+reject comments, commits, code, wiki edits, issues, and other contributions
+that are not aligned to this Code of Conduct, or to ban temporarily or
+permanently any contributor for other behaviors that they deem inappropriate,
+threatening, offensive, or harmful.
+
+## Scope
+
+This Code of Conduct applies both within project spaces and in public spaces
+when an individual is representing the project or its community. Examples of
+representing a project or community include using an official project e-mail
+address, posting via an official social media account, or acting as an appointed
+representative at an online or offline event. Representation of a project may be
+further defined and clarified by project maintainers.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported by contacting the project team at [INSERT CONTACT EMAIL ADDRESS]. All
+complaints will be reviewed and investigated and will result in a response that
+is deemed necessary and appropriate to the circumstances. The project team is
+obligated to maintain confidentiality with regard to the reporter of an incident.
+Further details of specific enforcement policies may be posted separately.
+
+Project maintainers who do not follow or enforce the Code of Conduct in good
+faith may face temporary or permanent repercussions as determined by other
+members of the project's leadership.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
+available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see
+https://www.contributor-covenant.org/faq
diff --git a/README.md b/README.md
@@ -9,6 +9,9 @@
     </a>
     <a href="https://pypi.org/project/hub/"><img src="https://badge.fury.io/py/hub.svg" alt="PyPI version" height="18"></a>
     <a href="https://pypi.org/project/hub/"><img src="https://img.shields.io/pypi/dm/hub.svg" alt="PyPI version" height="18"></a>
+    <a>
+    <img alt="CircleCI" src="https://img.shields.io/circleci/build/github/activeloopai/Hub?logo=circleci">
+    </a>
     <a href="https://codecov.io/gh/activeloopai/Hub/branch/master"><img src="https://codecov.io/gh/activeloopai/Hub/branch/master/graph/badge.svg" alt="codecov" height="18"></a>
     <a href="https://twitter.com/intent/tweet?text=The%20fastest%20way%20to%20access%20and%20manage%20PyTorch%20and%20Tensorflow%20datasets%20is%20open-source&url=https://activeloop.ai/&via=activeloopai&hashtags=opensource,pytorch,tensorflow,data,datascience,datapipelines,sqlforimages,activeloop"> 
         <img alt="tweet" src="https://img.shields.io/twitter/url/http/shields.io.svg?style=social">

diff --git a/docs/source/concepts/tensor.md b/docs/source/concepts/tensor.md
@@ -28,7 +28,7 @@ t = tensor.from_zeros(np.zeros((10, 512, 512)))
 
 ## Concat or Stack
 
-Concating or stacking tensors works as in other frameworks.
+Concatenating or stacking tensors works as in other frameworks.
 
 ```python
 from hub import tensor

diff --git a/docs/source/integrations/pytorch.md b/docs/source/integrations/pytorch.md
@@ -1,6 +1,6 @@
 # PyTorch
 
-Here is an example to transform the dataset into pytorch form.
+Here is an example to transform the dataset into Pytorch form.
 
 ```
 import torch
@@ -9,7 +9,7 @@ from hub import dataset
 # Load data
 ds = dataset.load("mnist/mnist")
 
-# Transform into pytorch
+# Transform into Pytorch
 ds = ds.to_pytorch(transform=None)
 ds = torch.utils.data.DataLoader(
     ds, batch_size=8, num_workers=8, collate_fn=ds.collate_fn

diff --git a/docs/source/integrations/tensorflow.md b/docs/source/integrations/tensorflow.md
@@ -1,14 +1,14 @@
 # Tensorflow
 
-Here is an example to transform the dataset into tensorflow form.
+Here is an example to transform the dataset into Tensorflow form.
 
 ```python
 from hub import dataset
 
 # Load data
 ds = dataset.load("mnist/mnist")
 
-# tansform into Tensorflow dataset
+# transform into Tensorflow dataset
 ds = ds.to_tensorflow().batch(8)
 
 # Iterate over the data

diff --git a/docs/source/simple.md b/docs/source/simple.md
@@ -22,7 +22,7 @@ mnist["data"][0:1000].compute()
 
 ## Train a model
 
-Load the data and directly train your model using pytorch
+Load the data and directly train your model using Pytorch
 
 ```python
 import hub

diff --git a/docs/source/storage/tutorials.md b/docs/source/storage/tutorials.md
@@ -71,7 +71,7 @@ ds["input", 1:3] =  np.ones((2, 25, 25))
 ```
 
 ## Idea of chunking 
-Chunks are the most important part of Hub arrays. Imagine that you have a really large array stored in the cloud and want to access only some significantly smaller part of it. Let us say you have an array of 100000 images with shape ```(100000, 1024, 1024, 3)```. If we stored this array wholly without dividing into multiple chunks then in order to request only few images from it we would need to load the entire array into RAM which would be impossible and even if some computer would have that big RAM, downloading the whole array would take a lot of time. Instead we store the array in chunks and we only downlaod the chunks that contain the requested part of the array.  
+Chunks are the most important part of Hub arrays. Imagine that you have a really large array stored in the cloud and want to access only a significantly smaller part of it. Let us say you have an array of 100000 images with shape ```(100000, 1024, 1024, 3)```. If we stored this array whole, without dividing it into multiple chunks, then to request only a few images from it we would need to load the entire array into RAM, which would be impossible; and even if some computer had that much RAM, downloading the whole array would take a lot of time. Instead, we store the array in chunks and download only the chunks that contain the requested part of the array.  
 
 ## How to choose a proper chunk size
 Choosing a proper chunk size is crucial for performance. Chunks must be big enough that downloading one takes much longer than the overhead of a request to the cloud (~1ms). Chunks should also be small enough that multiple chunks fit into RAM. Usually, we can have up to 1 chunk per thread. 
@@ -91,7 +91,7 @@ Compresslevel is a float number from 0 to 1. Where 1 is the fastest and 0 is the
 You can easily find information about all of our supported compressors, their effectiveness, and their performance on the internet.  
 
 ## Integration with Pytorch and TensorFlow
-Hub datasets can easily be transformed into Pytoch and Tensorflow formats.
+Hub datasets can easily be transformed into Pytorch and Tensorflow formats.
 Pytorch:
 ```python
     datahub = hub.fs("./data/cache").connect()

diff --git a/docs/source/tutorials/creds.md b/docs/source/tutorials/creds.md
diff --git a/docs/source/tutorials/pytorch.md b/docs/source/tutorials/pytorch.md
@@ -1,6 +1,6 @@
 # Pytorch Integration
 
-In this tutorial we will retreive our dataset from the local cache and integrate it with `Pytorch` for further use.
+In this tutorial we will retrieve our dataset from the local cache and integrate it with `Pytorch` for further use.
 
 For a detailed guide on dataset generation and storage see [this tutorial](samples.md).
 

diff --git a/docs/source/why.md b/docs/source/why.md
@@ -10,7 +10,7 @@ We realized that there are a few problems related with current workflow in deep
 2. **Code dependency on local folder structure**. People use a folder structure to store images or videos. As a result, the data input pipeline has to take into consideration the raw folder structure which creates unnecessary & error-prone code dependency of the dataset folder structure.
 
 
-3. **Managing preprocessing pipelines**. If you want to run some preprocessing, it would be ideal to save the preprocessed images as a local cache for training.But it’s usually hard to manage & version control the preprocessed images locally when there are multiple preprocessing pipelies and the dataset is very big.
+3. **Managing preprocessing pipelines**. If you want to run some preprocessing, it would be ideal to save the preprocessed images as a local cache for training. But it’s usually hard to manage & version control the preprocessed images locally when there are multiple preprocessing pipelines and the dataset is very big.
 
 
 4. **Visualization**. It's difficult to visualize the raw data or preprocessed dataset on servers.

diff --git a/examples/3D Object Dataset/upload.py b/examples/3D Object Dataset/upload.py
@@ -0,0 +1,100 @@
+"""
+Dataset Download Source: http://cvgl.stanford.edu/data2/3Ddataset.zip
+Dataset format: Images(.bmp file)
+Dataset Features: bicycle, car, cellphone, head, iron, monitor, mouse, shoe, stapler, toaster
+
+Folder Structure:
+3Ddataset
+  -bicycle
+    -bicycle_1
+      - Various Images in .bmp format
+    -bicycle_2
+    -bicycle_3
+    ...
+    -bicycle_10
+  -car
+    -car_1
+    -car_2
+    ...
+    -car_10
+  ...
+  Total 10 features
+"""
+
+import os
+import numpy as np
+from PIL import Image
+import torchvision.transforms as transforms
+from hub import Transform, dataset
+import pandas as pd
+
+NUM_FEATURES = 10
+
+
+class DatasetGenerator(Transform):
+    """Hub ``Transform`` that turns one image-info record into a dataset sample."""
+
+    def meta(self):
+        """Describe the shape, dtype and tag of every field ``forward`` returns."""
+        image_spec = {
+            "shape": (1,),
+            "dtype": "uint32",
+            "chunksize": 100,
+            "dtag": "image",
+        }
+        return {
+            "image_label": {"shape": (1,), "dtype": "int", "dtag": "text"},
+            "named_image_label": {"shape": (1,), "dtype": "U25", "dtag": "text"},
+            "image": image_spec,
+        }
+
+    def forward(self, image_info):
+        """Load one image from disk and wrap it and its labels in numpy arrays.
+
+        ``image_info`` is read for the keys ``image_label``,
+        ``named_image_label`` and ``image_path``. The result is a dictionary
+        of one-element numpy arrays using the same keys as ``meta``.
+        """
+        label_id = np.empty(1, dtype="int")
+        label_id[0] = image_info["image_label"]
+
+        label_name = np.empty(1, dtype="object")
+        label_name[0] = image_info["named_image_label"]
+
+        # a one-element object array lets the slot hold the whole pixel array
+        pixels = np.empty(1, object)
+        pixels[0] = np.array(Image.open(image_info["image_path"]).convert("RGB"))
+
+        print("------------------------------------------------")
+        print(label_name[0] + " image loaded successfully")
+        return {
+            "image_label": label_id,
+            "named_image_label": label_name,
+            "image": pixels,
+        }
+
+
+def map_labels(labels_list):
+    """Map every label name in ``labels_list`` to its integer position.
+
+    The previous version indexed the list with ``range(1, NUM_FEATURES + 1)``,
+    which skipped the first label and raised ``IndexError`` when the list
+    held exactly ``NUM_FEATURES`` names. Enumerating the list covers every
+    label and keeps each ``labels_list[i] -> i`` pair the old code produced.
+
+    Args:
+        labels_list: sequence of label names.
+
+    Returns:
+        dict mapping each label name to its index in ``labels_list``
+        (empty when the list is empty).
+    """
+    return {label: index for index, label in enumerate(labels_list)}
+
+
+def load_dataset(base_path):
+    """Collect every labelled image under ``base_path`` and build a dataset.
+
+    Expects the layout ``base_path/<label>/<label>_<n>/<image files>`` with
+    ``n`` running from 1 to ``NUM_FEATURES``.
+
+    Args:
+        base_path: root folder that holds one sub-folder per label.
+
+    Returns:
+        The object returned by ``dataset.generate`` for the collected records.
+
+    Raises:
+        OSError: if ``base_path`` or an expected ``<label>_<n>`` folder
+            cannot be listed.
+    """
+    # os.listdir returns entries in arbitrary order: sort so that label ids
+    # are reproducible between runs, and keep only directories so that stray
+    # files next to the label folders are ignored instead of crashing below.
+    labels_list = sorted(
+        entry
+        for entry in os.listdir(base_path)
+        if os.path.isdir(os.path.join(base_path, entry))
+    )
+    labels_dict = map_labels(labels_list)
+    image_info_list = []
+    for label in labels_list:
+        for label_num in range(1, NUM_FEATURES + 1):
+            curr_path = os.path.join(base_path, label, label + "_" + str(label_num))
+            for image in sorted(os.listdir(curr_path)):
+                # all images' name starts with the feature name (observation)
+                if not image.lower().startswith(label):
+                    continue
+                image_info_list.append(
+                    {
+                        "image_path": os.path.join(curr_path, image),
+                        "image_label": labels_dict[label],
+                        "named_image_label": label,
+                    }
+                )
+
+    # the generator iterates through the argument given, one by one and applies forward. This is done lazily.
+    return dataset.generate(DatasetGenerator(), image_info_list)
+
+
+def main():
+    """Build the dataset from ``./3Ddataset`` and store it as ``ThreeDimensionalDataset``."""
+    # stores the dataset in username/datasetname
+    load_dataset("./3Ddataset").store("ThreeDimensionalDataset")
+
+
+if __name__ == "__main__":
+    main()