Rework the tests to make sure that all GPU tests are run and that skipped tests do not trigger warnings upon integration (facebookresearch#356)

Summary:
Pull Request resolved: facebookresearch#356

Two main changes:
- Make sure that all GPU tests are run on CircleCI
- Make sure that skipped GPU tests do not trigger warnings

This required splitting test_losses.py into a CPU part and a new GPU-only part (test_losses_gpu.py), and updating some other tests as well.

Reviewed By: prigoyal

Differential Revision: D29557107

fbshipit-source-id: a46823097983d0b58bc6d20bab5fef03820dd07d
QuentinDuval authored and facebook-github-bot committed Jul 8, 2021
1 parent 243dd92 commit 9d961cd
Showing 6 changed files with 133 additions and 104 deletions.
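
A note on the mechanism before the diffs (a minimal illustration, not code from this commit): unittest.skipIf marks a test as skipped by raising unittest.SkipTest, and that skip record is what surfaced as a warning during integration; the new gpu_test decorator in vissl/utils/test_utils.py (last diff below) returns early instead, so the test is recorded as a plain pass.

import unittest

import torch


class SkipBehaviour(unittest.TestCase):
    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs")
    def test_old_style(self):
        pass  # reported as "skipped" on CPU-only machines, hence the warnings

    def test_new_style(self):
        if torch.cuda.device_count() < 2:
            return  # reported as a pass, nothing for the integration checks to flag
        self.assertTrue(torch.cuda.is_available())
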
11 changes: 7 additions & 4 deletions dev/run_quick_tests.sh
@@ -13,12 +13,15 @@ SRC_DIR=$(dirname "${SRC_DIR}")
# -----------------------------------------------------------------------------

TEST_LIST=(
+  "test_extract_cluster.py"
+  "test_extract_features.py"
+  "test_larc_fsdp.py"
+  "test_layer_memory_tracking.py"
+  "test_losses_gpu.py"
  "test_regnet_fsdp.py"
  "test_regnet_fsdp_integration.py"
  "test_state_checkpoint_conversion.py"
  "test_state_checkpointing.py"
-  "test_layer_memory_tracking.py"
-  "test_extract_features.py"
-  "test_extract_cluster.py"
)

echo "========================================================================"
@@ -28,7 +31,7 @@ echo "========================================================================"

pushd "${SRC_DIR}/tests"
for test_file in "${TEST_LIST[@]}"; do
-  python -m unittest $test_file || exit
+  python -m unittest "$test_file" || exit
done
popd

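Two small things to note in this file: the test list is now sorted alphabetically, with test_larc_fsdp.py and the new test_losses_gpu.py added to it; and $test_file is now quoted in the unittest invocation, standard shell hygiene that prevents word splitting and glob expansion if a file name ever contains special characters.
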
91 changes: 0 additions & 91 deletions tests/test_losses.py
@@ -8,8 +8,6 @@
from collections import namedtuple

import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
from classy_vision.generic.distributed_util import set_cpu_device
from parameterized import parameterized
from utils import ROOT_LOSS_CONFIGS, SSLHydraConfig
@@ -19,7 +17,6 @@
from vissl.losses.swav_loss import SwAVCriterion
from vissl.trainer.train_task import SelfSupervisionTask
from vissl.utils.hydra_config import convert_to_attrdict
-from vissl.utils.misc import find_free_tcp_port


logger = logging.getLogger("__name__")
@@ -100,42 +97,6 @@ def test_barlow_twins_backward(self):
        next_embeddings = embeddings - embeddings.grad  # gradient descent
        self.assertTrue(criterion(next_embeddings) < criterion(embeddings))

-    @staticmethod
-    def worker_fn(gpu_id: int, world_size: int, batch_size: int, port: int):
-        dist.init_process_group(
-            backend="nccl",
-            init_method=f"tcp://0.0.0.0:{port}",
-            world_size=world_size,
-            rank=gpu_id,
-        )
-        criterion = BarlowTwinsCriterion(
-            lambda_=0.0051, scale_loss=0.024, embedding_dim=EMBEDDING_DIM
-        )
-        embeddings = torch.randn(
-            (batch_size, EMBEDDING_DIM), dtype=torch.float32, requires_grad=True
-        ).cuda()
-        criterion(embeddings).backward()
-
-    def test_backward_world_size_1(self):
-        if torch.cuda.device_count() >= 1:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 1
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )
-
-    def test_backward_world_size_2(self):
-        if torch.cuda.device_count() >= 2:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 2
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )

class TestSimClrCriterion(unittest.TestCase):
"""
@@ -195,58 +156,6 @@ def test_simclr_backward(self):
next_embeddings = embeddings - embeddings.grad # gradient descent
self.assertTrue(criterion(next_embeddings) < criterion(embeddings))

-    @staticmethod
-    def worker_fn(gpu_id: int, world_size: int, batch_size: int, port: int):
-        dist.init_process_group(
-            backend="nccl",
-            init_method=f"tcp://0.0.0.0:{port}",
-            world_size=world_size,
-            rank=gpu_id,
-        )
-        embeddings = torch.full(
-            size=(batch_size, 3), fill_value=float(gpu_id), requires_grad=True
-        ).cuda(gpu_id)
-        gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
-        if world_size == 1:
-            assert gathered.equal(
-                torch.tensor(
-                    [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], device=f"cuda:{gpu_id}"
-                )
-            )
-        if world_size == 2:
-            assert gathered.equal(
-                torch.tensor(
-                    [
-                        [0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0],
-                        [1.0, 1.0, 1.0],
-                        [1.0, 1.0, 1.0],
-                    ],
-                    device=f"cuda:{gpu_id}",
-                )
-            )
-        assert gathered.requires_grad
-
-    def test_gather_embeddings_word_size_1(self):
-        if torch.cuda.device_count() >= 1:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 1
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )
-
-    def test_gather_embeddings_word_size_2(self):
-        if torch.cuda.device_count() >= 2:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 2
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )


class TestRootConfigsLossesBuild(unittest.TestCase):
@parameterized.expand(ROOT_LOSS_CONFIGS)
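The two GPU-only test groups deleted above are not lost: they reappear in the new tests/test_losses_gpu.py below, with the TCP rendezvous (find_free_tcp_port plus init_method=f"tcp://0.0.0.0:{port}") replaced by a temp-file based one. A plausible sketch of init_distributed_on_file, assuming the standard file:// rendezvous of torch.distributed (the real helper lives in vissl.utils.test_utils and may differ in detail):

import torch
import torch.distributed as dist


def init_distributed_on_file(world_size: int, gpu_id: int, sync_file: str):
    # Rendezvous through a shared file rather than a TCP port: no port
    # hunting, and a stale port cannot make the test flaky.
    torch.cuda.set_device(gpu_id)
    dist.init_process_group(
        backend="nccl",
        init_method=f"file://{sync_file}",
        world_size=world_size,
        rank=gpu_id,
    )
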
112 changes: 112 additions & 0 deletions tests/test_losses_gpu.py
@@ -0,0 +1,112 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import unittest

import torch
import torch.multiprocessing as mp
from vissl.losses.barlow_twins_loss import BarlowTwinsCriterion
from vissl.losses.simclr_info_nce_loss import SimclrInfoNCECriterion
from vissl.utils.test_utils import gpu_test, init_distributed_on_file, with_temp_files


class TestSimClrCriterionOnGpu(unittest.TestCase):
    """
    Specific tests on SimCLR going further than just doing a forward pass
    """

    @staticmethod
    def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
        init_distributed_on_file(
            world_size=world_size, gpu_id=gpu_id, sync_file=sync_file
        )
        embeddings = torch.full(
            size=(batch_size, 3), fill_value=float(gpu_id), requires_grad=True
        ).cuda(gpu_id)
        gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
        if world_size == 1:
            assert gathered.equal(
                torch.tensor(
                    [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], device=f"cuda:{gpu_id}"
                )
            )
        if world_size == 2:
            assert gathered.equal(
                torch.tensor(
                    [
                        [0.0, 0.0, 0.0],
                        [0.0, 0.0, 0.0],
                        [1.0, 1.0, 1.0],
                        [1.0, 1.0, 1.0],
                    ],
                    device=f"cuda:{gpu_id}",
                )
            )
        assert gathered.requires_grad

    @gpu_test(gpu_count=1)
    def test_gather_embeddings_word_size_1(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 1
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )

    @gpu_test(gpu_count=2)
    def test_gather_embeddings_word_size_2(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 2
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )


class TestBarlowTwinsCriterionOnGpu(unittest.TestCase):
    """
    Specific tests on Barlow Twins going further than just doing a forward pass
    """

    @staticmethod
    def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
        init_distributed_on_file(
            world_size=world_size, gpu_id=gpu_id, sync_file=sync_file
        )
        EMBEDDING_DIM = 128
        criterion = BarlowTwinsCriterion(
            lambda_=0.0051, scale_loss=0.024, embedding_dim=EMBEDDING_DIM
        )
        embeddings = torch.randn(
            (batch_size, EMBEDDING_DIM), dtype=torch.float32, requires_grad=True
        ).cuda()
        criterion(embeddings).backward()

    @gpu_test(gpu_count=1)
    def test_backward_world_size_1(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 1
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )

    @gpu_test(gpu_count=2)
    def test_backward_world_size_2(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 2
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )
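
The sync_file handed to each worker comes from with_temp_files. A hedged sketch of such a helper, assuming it yields a single path when count == 1 as the call sites above suggest (the real implementation in vissl.utils.test_utils may differ):

import os
import tempfile
from contextlib import contextmanager


@contextmanager
def with_temp_files(count: int):
    # Create `count` temporary files, yield one path (or a list of paths),
    # and remove the files on exit so repeated test runs stay clean.
    handles = [tempfile.mkstemp() for _ in range(count)]
    for fd, _path in handles:
        os.close(fd)  # the tests only need the paths
    paths = [path for _fd, path in handles]
    try:
        yield paths[0] if count == 1 else paths
    finally:
        for path in paths:
            os.remove(path)

With the @gpu_test decorators in place, running python -m unittest test_losses_gpu from the tests directory (exactly what dev/run_quick_tests.sh now does) passes cleanly on a CPU-only machine instead of reporting skips.
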
6 changes: 3 additions & 3 deletions tests/test_regnet_fsdp.py
@@ -16,7 +16,7 @@
from vissl.optimizers import *  # noqa
from vissl.utils.fsdp_utils import fsdp_wrapper
from vissl.utils.hydra_config import convert_to_attrdict
-from vissl.utils.test_utils import init_distributed_on_file, with_temp_files
+from vissl.utils.test_utils import gpu_test, init_distributed_on_file, with_temp_files


class TestRegnetFSDP(unittest.TestCase):
@@ -130,7 +130,7 @@ def run_pretraining(
            nprocs=2,
        )

-    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs to run the test")
+    @gpu_test(gpu_count=2)
    def test_regnet_fsdp_convergence_on_swav(self):
        """
        Run SWAV architecture with DDP or with FSDP with or without
@@ -164,7 +164,7 @@ def test_regnet_fsdp_convergence_on_swav(self):
        self.assertEqual(results[0], results[1], "DDP vs FSDP")
        self.assertEqual(results[1], results[2], "Activation checkpointing")

-    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs to run the test")
+    @gpu_test(gpu_count=2)
    def test_regnet_fsdp_convergence_on_swav_with_larc(self):
        """
        Run SWAV architecture with DDP or with FSDP with or without
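
Convergence tests like these two were a typical source of the integration warnings: with fewer than two GPUs, unittest.skipIf reported them as skipped, whereas @gpu_test(gpu_count=2) now turns them into silent no-ops locally while the two-GPU CircleCI jobs still run them in full.
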
2 changes: 0 additions & 2 deletions vissl/trainer/trainer_main.py
@@ -440,8 +440,6 @@ def _extract_split_features(
        for num, feat_name in enumerate(feat_names):
            feature = flat_features_list[num].cpu().numpy()
            targets = input_sample["target"]
-            logging.info(feat_name)
-            logging.info(str(feature))
            for idx in range(num_images):
                index = input_sample["inds"][idx]
                out_features[feat_name][index] = feature[idx]
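A side cleanup, independent of the GPU-test rework: the two deleted logging.info calls printed the name and the full string form of every extracted feature batch, which made feature-extraction logs needlessly verbose.
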
15 changes: 11 additions & 4 deletions vissl/utils/test_utils.py
@@ -3,12 +3,13 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

+import functools
import os
import re
import tempfile
from contextlib import contextmanager
from dataclasses import dataclass
-from typing import List, Tuple
+from typing import Callable, List, Tuple

import torch
import torch.distributed as dist
@@ -76,10 +77,16 @@ def gpu_test(gpu_count: int = 1):
    Annotation for GPU tests, skipping the test if the
    required amount of GPU is not available
    """
-    import unittest

-    message = f"Not enough GPUs to run the test: required {gpu_count}"
-    return unittest.skipIf(torch.cuda.device_count() < gpu_count, message)
+    def gpu_test_decorator(test_function: Callable):
+        @functools.wraps(test_function)
+        def wrapped_test(*args, **kwargs):
+            if torch.cuda.device_count() >= gpu_count:
+                return test_function(*args, **kwargs)
+
+        return wrapped_test
+
+    return gpu_test_decorator


def init_distributed_on_file(world_size: int, gpu_id: int, sync_file: str):
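
For call sites, usage looks like this hedged sketch (the test class and assertion are illustrative, not from this commit):

import unittest

import torch
from vissl.utils.test_utils import gpu_test


class MyGpuTest(unittest.TestCase):
    @gpu_test(gpu_count=2)
    def test_needs_two_gpus(self):
        # Only reached when at least two GPUs are visible; on smaller
        # machines the wrapper returns early and unittest records a pass.
        self.assertGreaterEqual(torch.cuda.device_count(), 2)

The trade-off is that a silently skipped test is indistinguishable from a passing one in the local report; that is acceptable here because the commit also ensures the two-GPU CircleCI jobs always exercise the full GPU test set.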
