Support for torchvision datasets in VISSL (take 2) (facebookresearch#180)

Summary: This PR is a revision of the first PR (facebookresearch#165) which was a draft to discuss the approach to integrate these datasets. Compared to the previous PR: 1. More pytorch datasets are included (CIFAR10, CIFAR100, STL10, MNIST) 2. The different datasets are proposed in benchmark/linear_image_classification/datasets. To combine the datasets with the standard linear evaluation benchmark, we can use the following hydra syntax: ``` python tools/run_distributed_engines.py config=benchmark/linear_image_classification/imagenet1k/eval_resnet_8gpu_transfer_in1k_linear config.MODEL.WEIGHTS_INIT.PARAMS_FILE=PATH +config/benchmark/linear_image_classification/datasets=cifar10 ``` Note that for MNIST, a wrapper dataset was needed to convert the images from 1 channel to 3 channels, and increase their size to 32x32 to at least match the downsampling by 32 of standard Resnet50. Pull Request resolved: facebookresearch#180 Reviewed By: prigoyal Differential Revision: D26453148 Pulled By: QuentinDuval fbshipit-source-id: 6ce341f1201641aa51364c2b31cdd40bac162ffb
yangsenwxy · Feb 16, 2021 · 9e53d28 · 9e53d28
1 parent 75d472b
commit 9e53d28
Show file tree

Hide file tree

Showing 13 changed files with 543 additions and 4 deletions.
diff --git a/configs/config/benchmark/linear_image_classification/datasets/cifar10.yaml b/configs/config/benchmark/linear_image_classification/datasets/cifar10.yaml
@@ -0,0 +1,47 @@
+# @package _global_
+config:  # overrides that plug CIFAR10 (read via torchvision) into the linear evaluation benchmark
+  DATA:
+    NUM_DATALOADER_WORKERS: 4
+    TRAIN:
+      DATA_SOURCES: [torchvision_dataset]  # source key documented in DATASET_SOURCE_MAP for TorchvisionDataset
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [CIFAR10]  # name of the matching dataset_catalog.json entry
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: RandomResizedCrop
+          size: 32
+        - name: RandomHorizontalFlip
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]  # NOTE(review): looks like the usual ImageNet statistics - confirm intended for CIFAR10
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/cifar10/
+    TEST:  # same source and normalization as TRAIN, without the random crop/flip
+      DATA_SOURCES: [torchvision_dataset]
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [CIFAR10]
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/cifar10/
+  METERS:
+    name: accuracy_list_meter
+    accuracy_list_meter:
+      num_meters: 1  # one meter, matching the single pooled feature ("res5") evaluated below
+      topk_values: [1]  # top-1 accuracy only
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
+      ]
+    HEAD:  # last entry of dims (10) is presumably the number of CIFAR10 classes
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 2048, "dims": [2048, 10]}],
+      ]
diff --git a/configs/config/benchmark/linear_image_classification/datasets/cifar100.yaml b/configs/config/benchmark/linear_image_classification/datasets/cifar100.yaml
@@ -0,0 +1,47 @@
+# @package _global_
+config:  # overrides that plug CIFAR100 (read via torchvision) into the linear evaluation benchmark
+  DATA:
+    NUM_DATALOADER_WORKERS: 4
+    TRAIN:
+      DATA_SOURCES: [torchvision_dataset]  # source key documented in DATASET_SOURCE_MAP for TorchvisionDataset
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [CIFAR100]  # name of the matching dataset_catalog.json entry
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: RandomResizedCrop
+          size: 32
+        - name: RandomHorizontalFlip
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]  # NOTE(review): looks like the usual ImageNet statistics - confirm intended for CIFAR100
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/cifar100/
+    TEST:  # same source and normalization as TRAIN, without the random crop/flip
+      DATA_SOURCES: [torchvision_dataset]
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [CIFAR100]
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/cifar100/
+  METERS:
+    name: accuracy_list_meter
+    accuracy_list_meter:
+      num_meters: 1  # one meter, matching the single pooled feature ("res5") evaluated below
+      topk_values: [1, 5]  # top-1 and top-5 accuracy (the 10-class configs report top-1 only)
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
+      ]
+    HEAD:  # last entry of dims (100) is presumably the number of CIFAR100 classes
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 2048, "dims": [2048, 100]}],
+      ]
diff --git a/configs/config/benchmark/linear_image_classification/datasets/mnist.yaml b/configs/config/benchmark/linear_image_classification/datasets/mnist.yaml
@@ -0,0 +1,53 @@
+# @package _global_
+config:  # overrides that plug MNIST (read via torchvision) into the linear evaluation benchmark
+  DATA:
+    NUM_DATALOADER_WORKERS: 4
+    TRAIN:
+      DATA_SOURCES: [torchvision_dataset]  # source key documented in DATASET_SOURCE_MAP for TorchvisionDataset
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [MNIST]  # name of the matching dataset_catalog.json entry
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: MNISTImgPil2RGB  # turns the 1-channel MNIST image into a 3-channel one
+          size: 32  # output canvas of 32x32, chosen to match the 32x downsampling of ResNet-50
+          box: [2, 2]  # offset at which the 28x28 digit is pasted, i.e. centred in the 32x32 canvas
+        - name: RandomResizedCrop
+          size: 32
+        - name: RandomHorizontalFlip  # NOTE(review): mirrors the digits - confirm this augmentation is wanted for MNIST
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]  # NOTE(review): looks like the usual ImageNet statistics - confirm intended for MNIST
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/mnist/
+    TEST:  # same RGB conversion and normalization as TRAIN, without the random crop/flip
+      DATA_SOURCES: [torchvision_dataset]
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [MNIST]
+      BATCHSIZE_PER_REPLICA: 256
+      TRANSFORMS:
+        - name: MNISTImgPil2RGB
+          size: 32
+          box: [2, 2]
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/mnist/
+  METERS:
+    name: accuracy_list_meter
+    accuracy_list_meter:
+      num_meters: 1  # one meter, matching the single pooled feature ("res5") evaluated below
+      topk_values: [1]  # top-1 accuracy only
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
+      ]
+    HEAD:  # last entry of dims (10) is presumably the number of MNIST classes
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 2048, "dims": [2048, 10]}],
+      ]
diff --git a/configs/config/benchmark/linear_image_classification/datasets/stl10.yaml b/configs/config/benchmark/linear_image_classification/datasets/stl10.yaml
@@ -0,0 +1,45 @@
+# @package _global_
+config:  # overrides that plug STL10 (read via torchvision) into the linear evaluation benchmark
+  DATA:
+    NUM_DATALOADER_WORKERS: 4
+    TRAIN:
+      DATA_SOURCES: [torchvision_dataset]  # source key documented in DATASET_SOURCE_MAP for TorchvisionDataset
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [STL10]  # NOTE(review): no BATCHSIZE_PER_REPLICA here, unlike the CIFAR/MNIST configs - confirm the inherited value is intended
+      TRANSFORMS:
+        - name: RandomResizedCrop
+          size: 32  # NOTE(review): TRAIN crops to 32x32 but TEST applies no resize - confirm, STL10 images are presumably larger than 32x32
+        - name: RandomHorizontalFlip
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]  # NOTE(review): looks like the usual ImageNet statistics - confirm intended for STL10
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/stl10/
+    TEST:  # same source and normalization as TRAIN, without the random crop/flip
+      DATA_SOURCES: [torchvision_dataset]
+      LABEL_SOURCES: [torchvision_dataset]
+      DATASET_NAMES: [STL10]  # NOTE(review): BATCHSIZE_PER_REPLICA is not set for TEST either
+      TRANSFORMS:
+        - name: ToTensor
+        - name: Normalize
+          mean: [0.485, 0.456, 0.406]
+          std: [0.229, 0.224, 0.225]
+      MMAP_MODE: True
+      COPY_TO_LOCAL_DISK: False
+      COPY_DESTINATION_DIR: /tmp/stl10/
+  METERS:
+    name: accuracy_list_meter
+    accuracy_list_meter:
+      num_meters: 1  # one meter, matching the single pooled feature ("res5") evaluated below
+      topk_values: [1]  # top-1 accuracy only
+  MODEL:
+    FEATURE_EVAL_SETTINGS:
+      LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+        ["res5", ["AdaptiveAvgPool2d", [[1, 1]]]],
+      ]
+    HEAD:  # last entry of dims (10) is presumably the number of STL10 classes
+      PARAMS: [
+        ["eval_mlp", {"in_channels": 2048, "dims": [2048, 10]}],
+      ]
diff --git a/configs/config/dataset_catalog.json b/configs/config/dataset_catalog.json
@@ -15,6 +15,22 @@
         "train": ["<img_path>", "<lbl_path>"],
         "val": ["<img_path>", "<lbl_path>"]
     },
+    "CIFAR10": {
+        "train": ["<path_to_pytorch_dataset>", "<unused>"],
+        "val": ["<path_to_pytorch_dataset>", "<unused>"]
+    },
+    "CIFAR100": {
+        "train": ["<path_to_pytorch_dataset>", "<unused>"],
+        "val": ["<path_to_pytorch_dataset>", "<unused>"]
+    },
+    "STL10": {
+        "train": ["<path_to_pytorch_dataset>", "<unused>"],
+        "val": ["<path_to_pytorch_dataset>", "<unused>"]
+    },
+    "MNIST": {
+        "train": ["<path_to_pytorch_dataset>", "<unused>"],
+        "val": ["<path_to_pytorch_dataset>", "<unused>"]
+    },
     "inaturalist2018_folder": {
         "train": ["<img_path>", "<lbl_path>"],
         "val": ["<img_path>", "<lbl_path>"]

diff --git a/docs/source/extend_modules/data_source.rst b/docs/source/extend_modules/data_source.rst
@@ -68,6 +68,7 @@ VISSL supports data loading from :code:`disk` as the default data source. If use
     DATASET_SOURCE_MAP = {
         "disk_filelist": DiskImageDataset,
         "disk_folder": DiskImageDataset,
+        "torchvision_dataset": TorchvisionDataset,
         "synthetic": SyntheticImageDataset,
         "my_data_source": MyNewSourceDataset,
     }

diff --git a/docs/source/vissl_modules/data.rst b/docs/source/vissl_modules/data.rst
@@ -11,16 +11,18 @@ To use a dataset in VISSL, the only requirements are:
 Reading data from several sources
 ------------------------------------------
 
-VISSL allows reading data from multiple sources (disk, etc) and in multiple formats (a folder path, a :code:`.npy` file).
+VISSL allows reading data from multiple sources (disk, etc) and in multiple formats (a folder path, a :code:`.npy` file, or torchvision datasets).
 The `GenericSSLDataset <https://github.com/facebookresearch/vissl/blob/master/vissl/data/ssl_dataset.py>`_ class is defined to support reading data from multiple data sources. For example: :code:`data = [dataset1, dataset2]` and the minibatches generated will have the corresponding data from each dataset.
 For this reason, we also support labels from multiple sources. For example :code:`targets = [dataset1 targets, dataset2 targets]`.
 
-Source of the data (:code:`disk_filelist` | :code:`disk_folder`):
+Source of the data (:code:`disk_filelist` | :code:`disk_folder` | :code:`torchvision_dataset`):
 
 - :code:`disk_folder`: this is simply the root folder path to the downloaded data.
 
 - :code:`disk_filelist`: These are numpy (or .pkl) files: (1) file containing images information (2) file containing corresponding labels for images. We provide `scripts <https://github.com/facebookresearch/vissl/blob/master/extra_scripts/README.md>`_ that can be used to prepare these two files for a dataset of choice.
 
+- :code:`torchvision_dataset`: the root folder path to the downloaded torchvision dataset. As of now, the supported datasets are: CIFAR-10, CIFAR-100, MNIST and STL-10.
+
 To use a dataset, VISSL takes following inputs in the configuration file for each dataset split (train, test):
 
 - :code:`DATASET_NAMES`: names of the datasets that are registered with :code:`VisslDatasetCatalog`. Registering dataset name is important. Example: :code:`DATASET_NAMES=[imagenet1k_folder, my_new_dataset_filelist]`
@@ -135,6 +137,88 @@ Expected dataset structure for COCO2014
             # image files that are mentioned in the corresponding json
 
 
+Expected dataset structure for CIFAR10
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The expected format is the exact same format used by torchvision, and the exact format obtained after either:
+
+- expanding the "CIFAR-10 python version" archive available at https://www.cs.toronto.edu/~kriz/cifar.html
+
+- instantiating the :code:`torchvision.datasets.CIFAR10` class with :code:`download=True`
+
+.. code-block::
+
+    cifar-10-batches-py/
+        batches.meta
+        data_batch_1
+        data_batch_2
+        data_batch_3
+        data_batch_4
+        data_batch_5
+        readme.html
+        test_batch
+
+
+Expected dataset structure for CIFAR100
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The expected format is the exact same format used by torchvision, and the exact format obtained after either:
+
+- expanding the "CIFAR-100 python version" archive available at https://www.cs.toronto.edu/~kriz/cifar.html
+
+- instantiating the :code:`torchvision.datasets.CIFAR100` class with :code:`download=True`
+
+.. code-block::
+
+    cifar-100-python/
+        meta
+        test
+        train
+
+
+Expected dataset structure for MNIST
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The expected format is the exact same format used by torchvision, and the exact format obtained after
+instantiating the :code:`torchvision.datasets.MNIST` class with the flag :code:`download=True`.
+
+.. code-block::
+
+    MNIST/
+        processed/
+            test.pt
+            training.pt
+        raw/
+            t10k-images-idx3-ubyte
+            t10k-images-idx3-ubyte.gz
+            t10k-labels-idx1-ubyte
+            t10k-labels-idx1-ubyte.gz
+            train-images-idx3-ubyte
+            train-images-idx3-ubyte.gz
+            train-labels-idx1-ubyte
+            train-labels-idx1-ubyte.gz
+
+
+Expected dataset structure for STL10
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The expected format is the exact same format used by torchvision, and the exact format obtained after either:
+
+- expanding the :code:`stl10_binary.tar.gz` archive available at https://cs.stanford.edu/~acoates/stl10/
+
+- instantiating the :code:`torchvision.datasets.STL10` class with :code:`download=True`
+
+.. code-block::
+
+    stl10_binary/
+        class_names.txt
+        fold_indices.txt
+        test_X.bin
+        test_y.bin
+        train_X.bin
+        train_y.bin
+        unlabeled_X.bin
+
 
 Dataloader
 ------------------------------------------

diff --git a/tests/test_transforms.py b/tests/test_transforms.py
@@ -5,8 +5,9 @@
 import numpy as np
 import torch
 from PIL import Image
-from torchvision.transforms import ToTensor
+from torchvision.transforms import Compose, ToTensor
 from vissl.data.ssl_transforms.img_pil_to_tensor import ImgToTensor
+from vissl.data.ssl_transforms.mnist_img_pil_to_rgb_mode import MNISTImgPil2RGB
 
 
 RAND_TENSOR = (torch.rand((224, 224, 3)) * 255).to(dtype=torch.uint8)
@@ -28,3 +29,28 @@ def test_to_tensor(self):
         c = ImgToTensor()(RAND_NUMPY)
         d = ToTensor()(RAND_NUMPY)
         self.assertTrue(torch.allclose(c, d))
+
+    def test_img_pil_to_rgb_mode(self):  # checks MNISTImgPil2RGB on a 28x28 single-channel image
+        one_channel_input = (torch.ones((28, 28)) * 255).to(dtype=torch.uint8)  # every pixel at 255
+        one_channel_input = Image.fromarray(one_channel_input.numpy(), mode="L")
+
+        # Empty config: the output must keep the 28x28 size and gain 3 channels
+        transform = Compose([MNISTImgPil2RGB.from_config({}), ToTensor()])
+        output = transform(one_channel_input)
+        assert output.shape == torch.Size([3, 28, 28])
+        assert (  # 3 channels of 28x28 values summing to 3 * 28 * 28
+            output.sum().item() == 28 * 28 * 3
+        ), "Background should be black, center is gray scale"
+
+        # With "size" and "box" set: size is tried both as an int and as a [32, 32] list
+        for size in [32, [32, 32]]:
+            transform = Compose(
+                [MNISTImgPil2RGB.from_config({"size": size, "box": [2, 2]}), ToTensor()]
+            )
+            output = transform(one_channel_input)
+            assert (  # same total as the 28x28 case, so the added border contributes nothing
+                output.sum().item() == 28 * 28 * 3
+            ), "Background should be black, center is gray scale"
+            assert (  # the region that excludes a 2-pixel border holds the whole sum
+                output[:, 2:-2, 2:-2].sum().item() == 28 * 28 * 3
+            ), "Paste should be in the middle"