Rework the tests to make sure that all GPU tests are run and that skipped tests do not trigger warnings upon integration (facebookresearch#356)

Summary:
Pull Request resolved: facebookresearch#356

Two main changes:
- Make sure that all GPU tests are run on CircleCI
- Make sure that skipped GPU tests do not trigger warnings

This required splitting test_losses.py into a CPU part and a new GPU-only part (test_losses_gpu.py), and updating some other tests as well.

Reviewed By: prigoyal

Differential Revision: D29557107

fbshipit-source-id: a46823097983d0b58bc6d20bab5fef03820dd07d
QuentinDuval authored and facebook-github-bot committed Jul 8, 2021
1 parent 243dd92 commit 9d961cd
Showing 6 changed files with 133 additions and 104 deletions.
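
A note on the mechanism before the diffs (a minimal illustration, not code from this commit): unittest.skipIf marks a test as skipped by raising unittest.SkipTest, and that skip record is what surfaced as a warning during integration; the new gpu_test decorator in vissl/utils/test_utils.py (last diff below) returns early instead, so the test is recorded as a plain pass.

import unittest

import torch


class SkipBehaviour(unittest.TestCase):
    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs")
    def test_old_style(self):
        pass  # reported as "skipped" on CPU-only machines, hence the warnings

    def test_new_style(self):
        if torch.cuda.device_count() < 2:
            return  # reported as a pass, nothing for the integration checks to flag
        self.assertTrue(torch.cuda.is_available())
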
11 changes: 7 additions & 4 deletions dev/run_quick_tests.sh
@@ -13,12 +13,15 @@ SRC_DIR=$(dirname "${SRC_DIR}")
# -----------------------------------------------------------------------------

TEST_LIST=(
+  "test_extract_cluster.py"
+  "test_extract_features.py"
+  "test_larc_fsdp.py"
+  "test_layer_memory_tracking.py"
+  "test_losses_gpu.py"
  "test_regnet_fsdp.py"
  "test_regnet_fsdp_integration.py"
  "test_state_checkpoint_conversion.py"
  "test_state_checkpointing.py"
-  "test_layer_memory_tracking.py"
-  "test_extract_features.py"
-  "test_extract_cluster.py"
)

echo "========================================================================"
@@ -28,7 +31,7 @@ echo "========================================================================"

pushd "${SRC_DIR}/tests"
for test_file in "${TEST_LIST[@]}"; do
-  python -m unittest $test_file || exit
+  python -m unittest "$test_file" || exit
done
popd

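Two small things to note in this file: the test list is now sorted alphabetically, with test_larc_fsdp.py and the new test_losses_gpu.py added to it; and $test_file is now quoted in the unittest invocation, standard shell hygiene that prevents word splitting and glob expansion if a file name ever contains special characters.
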
91 changes: 0 additions & 91 deletions tests/test_losses.py
@@ -8,8 +8,6 @@
from collections import namedtuple

import torch
-import torch.distributed as dist
-import torch.multiprocessing as mp
from classy_vision.generic.distributed_util import set_cpu_device
from parameterized import parameterized
from utils import ROOT_LOSS_CONFIGS, SSLHydraConfig
@@ -19,7 +17,6 @@
from vissl.losses.swav_loss import SwAVCriterion
from vissl.trainer.train_task import SelfSupervisionTask
from vissl.utils.hydra_config import convert_to_attrdict
-from vissl.utils.misc import find_free_tcp_port


logger = logging.getLogger("__name__")
@@ -100,42 +97,6 @@ def test_barlow_twins_backward(self):
        next_embeddings = embeddings - embeddings.grad  # gradient descent
        self.assertTrue(criterion(next_embeddings) < criterion(embeddings))

-    @staticmethod
-    def worker_fn(gpu_id: int, world_size: int, batch_size: int, port: int):
-        dist.init_process_group(
-            backend="nccl",
-            init_method=f"tcp://0.0.0.0:{port}",
-            world_size=world_size,
-            rank=gpu_id,
-        )
-        criterion = BarlowTwinsCriterion(
-            lambda_=0.0051, scale_loss=0.024, embedding_dim=EMBEDDING_DIM
-        )
-        embeddings = torch.randn(
-            (batch_size, EMBEDDING_DIM), dtype=torch.float32, requires_grad=True
-        ).cuda()
-        criterion(embeddings).backward()
-
-    def test_backward_world_size_1(self):
-        if torch.cuda.device_count() >= 1:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 1
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )
-
-    def test_backward_world_size_2(self):
-        if torch.cuda.device_count() >= 2:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 2
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )

class TestSimClrCriterion(unittest.TestCase):
"""
@@ -195,58 +156,6 @@ def test_simclr_backward(self):
next_embeddings = embeddings - embeddings.grad # gradient descent
self.assertTrue(criterion(next_embeddings) < criterion(embeddings))

-    @staticmethod
-    def worker_fn(gpu_id: int, world_size: int, batch_size: int, port: int):
-        dist.init_process_group(
-            backend="nccl",
-            init_method=f"tcp://0.0.0.0:{port}",
-            world_size=world_size,
-            rank=gpu_id,
-        )
-        embeddings = torch.full(
-            size=(batch_size, 3), fill_value=float(gpu_id), requires_grad=True
-        ).cuda(gpu_id)
-        gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
-        if world_size == 1:
-            assert gathered.equal(
-                torch.tensor(
-                    [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], device=f"cuda:{gpu_id}"
-                )
-            )
-        if world_size == 2:
-            assert gathered.equal(
-                torch.tensor(
-                    [
-                        [0.0, 0.0, 0.0],
-                        [0.0, 0.0, 0.0],
-                        [1.0, 1.0, 1.0],
-                        [1.0, 1.0, 1.0],
-                    ],
-                    device=f"cuda:{gpu_id}",
-                )
-            )
-        assert gathered.requires_grad
-
-    def test_gather_embeddings_word_size_1(self):
-        if torch.cuda.device_count() >= 1:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 1
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )
-
-    def test_gather_embeddings_word_size_2(self):
-        if torch.cuda.device_count() >= 2:
-            port = find_free_tcp_port()
-
-            WORLD_SIZE = 2
-            BATCH_SIZE = 2
-            mp.spawn(
-                self.worker_fn, args=(WORLD_SIZE, BATCH_SIZE, port), nprocs=WORLD_SIZE
-            )


class TestRootConfigsLossesBuild(unittest.TestCase):
@parameterized.expand(ROOT_LOSS_CONFIGS)
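The two GPU-only test groups deleted above are not lost: they reappear in the new tests/test_losses_gpu.py below, with the TCP rendezvous (find_free_tcp_port plus init_method=f"tcp://0.0.0.0:{port}") replaced by a temp-file based one. A plausible sketch of init_distributed_on_file, assuming the standard file:// rendezvous of torch.distributed (the real helper lives in vissl.utils.test_utils and may differ in detail):

import torch
import torch.distributed as dist


def init_distributed_on_file(world_size: int, gpu_id: int, sync_file: str):
    # Rendezvous through a shared file rather than a TCP port: no port
    # hunting, and a stale port cannot make the test flaky.
    torch.cuda.set_device(gpu_id)
    dist.init_process_group(
        backend="nccl",
        init_method=f"file://{sync_file}",
        world_size=world_size,
        rank=gpu_id,
    )
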
112 changes: 112 additions & 0 deletions tests/test_losses_gpu.py
@@ -0,0 +1,112 @@
# Copyright (c) Facebook, Inc. and its affiliates.

# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.


import unittest

import torch
import torch.multiprocessing as mp
from vissl.losses.barlow_twins_loss import BarlowTwinsCriterion
from vissl.losses.simclr_info_nce_loss import SimclrInfoNCECriterion
from vissl.utils.test_utils import gpu_test, init_distributed_on_file, with_temp_files


class TestSimClrCriterionOnGpu(unittest.TestCase):
    """
    Specific tests on SimCLR going further than just doing a forward pass
    """

    @staticmethod
    def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
        init_distributed_on_file(
            world_size=world_size, gpu_id=gpu_id, sync_file=sync_file
        )
        embeddings = torch.full(
            size=(batch_size, 3), fill_value=float(gpu_id), requires_grad=True
        ).cuda(gpu_id)
        gathered = SimclrInfoNCECriterion.gather_embeddings(embeddings)
        if world_size == 1:
            assert gathered.equal(
                torch.tensor(
                    [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], device=f"cuda:{gpu_id}"
                )
            )
        if world_size == 2:
            assert gathered.equal(
                torch.tensor(
                    [
                        [0.0, 0.0, 0.0],
                        [0.0, 0.0, 0.0],
                        [1.0, 1.0, 1.0],
                        [1.0, 1.0, 1.0],
                    ],
                    device=f"cuda:{gpu_id}",
                )
            )
        assert gathered.requires_grad

    @gpu_test(gpu_count=1)
    def test_gather_embeddings_word_size_1(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 1
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )

    @gpu_test(gpu_count=2)
    def test_gather_embeddings_word_size_2(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 2
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )


class TestBarlowTwinsCriterionOnGpu(unittest.TestCase):
    """
    Specific tests on Barlow Twins going further than just doing a forward pass
    """

    @staticmethod
    def worker_fn(gpu_id: int, world_size: int, batch_size: int, sync_file: str):
        init_distributed_on_file(
            world_size=world_size, gpu_id=gpu_id, sync_file=sync_file
        )
        EMBEDDING_DIM = 128
        criterion = BarlowTwinsCriterion(
            lambda_=0.0051, scale_loss=0.024, embedding_dim=EMBEDDING_DIM
        )
        embeddings = torch.randn(
            (batch_size, EMBEDDING_DIM), dtype=torch.float32, requires_grad=True
        ).cuda()
        criterion(embeddings).backward()

    @gpu_test(gpu_count=1)
    def test_backward_world_size_1(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 1
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )

    @gpu_test(gpu_count=2)
    def test_backward_world_size_2(self):
        with with_temp_files(count=1) as sync_file:
            WORLD_SIZE = 2
            BATCH_SIZE = 2
            mp.spawn(
                self.worker_fn,
                args=(WORLD_SIZE, BATCH_SIZE, sync_file),
                nprocs=WORLD_SIZE,
            )
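
The sync_file handed to each worker comes from with_temp_files. A hedged sketch of such a helper, assuming it yields a single path when count == 1 as the call sites above suggest (the real implementation in vissl.utils.test_utils may differ):

import os
import tempfile
from contextlib import contextmanager


@contextmanager
def with_temp_files(count: int):
    # Create `count` temporary files, yield one path (or a list of paths),
    # and remove the files on exit so repeated test runs stay clean.
    handles = [tempfile.mkstemp() for _ in range(count)]
    for fd, _path in handles:
        os.close(fd)  # the tests only need the paths
    paths = [path for _fd, path in handles]
    try:
        yield paths[0] if count == 1 else paths
    finally:
        for path in paths:
            os.remove(path)

With the @gpu_test decorators in place, running python -m unittest test_losses_gpu from the tests directory (exactly what dev/run_quick_tests.sh now does) passes cleanly on a CPU-only machine instead of reporting skips.
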
6 changes: 3 additions & 3 deletions tests/test_regnet_fsdp.py
@@ -16,7 +16,7 @@
from vissl.optimizers import *  # noqa
from vissl.utils.fsdp_utils import fsdp_wrapper
from vissl.utils.hydra_config import convert_to_attrdict
-from vissl.utils.test_utils import init_distributed_on_file, with_temp_files
+from vissl.utils.test_utils import gpu_test, init_distributed_on_file, with_temp_files


class TestRegnetFSDP(unittest.TestCase):
@@ -130,7 +130,7 @@ def run_pretraining(
            nprocs=2,
        )

-    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs to run the test")
+    @gpu_test(gpu_count=2)
    def test_regnet_fsdp_convergence_on_swav(self):
        """
        Run SWAV architecture with DDP or with FSDP with or without
@@ -164,7 +164,7 @@ def test_regnet_fsdp_convergence_on_swav(self):
        self.assertEqual(results[0], results[1], "DDP vs FSDP")
        self.assertEqual(results[1], results[2], "Activation checkpointing")

-    @unittest.skipIf(torch.cuda.device_count() < 2, "Not enough GPUs to run the test")
+    @gpu_test(gpu_count=2)
    def test_regnet_fsdp_convergence_on_swav_with_larc(self):
        """
        Run SWAV architecture with DDP or with FSDP with or without
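
Convergence tests like these two were a typical source of the integration warnings: with fewer than two GPUs, unittest.skipIf reported them as skipped, whereas @gpu_test(gpu_count=2) now turns them into silent no-ops locally while the two-GPU CircleCI jobs still run them in full.
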
2 changes: 0 additions & 2 deletions vissl/trainer/trainer_main.py
@@ -440,8 +440,6 @@ def _extract_split_features(
        for num, feat_name in enumerate(feat_names):
            feature = flat_features_list[num].cpu().numpy()
            targets = input_sample["target"]
-            logging.info(feat_name)
-            logging.info(str(feature))
            for idx in range(num_images):
                index = input_sample["inds"][idx]
                out_features[feat_name][index] = feature[idx]
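A side cleanup, independent of the GPU-test rework: the two deleted logging.info calls printed the name and the full string form of every extracted feature batch, which made feature-extraction logs needlessly verbose.
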
15 changes: 11 additions & 4 deletions vissl/utils/test_utils.py
@@ -3,12 +3,13 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

+import functools
import os
import re
import tempfile
from contextlib import contextmanager
from dataclasses import dataclass
-from typing import List, Tuple
+from typing import Callable, List, Tuple

import torch
import torch.distributed as dist
@@ -76,10 +77,16 @@ def gpu_test(gpu_count: int = 1):
    Annotation for GPU tests, skipping the test if the
    required amount of GPU is not available
    """
-    import unittest

-    message = f"Not enough GPUs to run the test: required {gpu_count}"
-    return unittest.skipIf(torch.cuda.device_count() < gpu_count, message)
+    def gpu_test_decorator(test_function: Callable):
+        @functools.wraps(test_function)
+        def wrapped_test(*args, **kwargs):
+            if torch.cuda.device_count() >= gpu_count:
+                return test_function(*args, **kwargs)
+
+        return wrapped_test
+
+    return gpu_test_decorator


def init_distributed_on_file(world_size: int, gpu_id: int, sync_file: str):
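
For call sites, usage looks like this hedged sketch (the test class and assertion are illustrative, not from this commit):

import unittest

import torch
from vissl.utils.test_utils import gpu_test


class MyGpuTest(unittest.TestCase):
    @gpu_test(gpu_count=2)
    def test_needs_two_gpus(self):
        # Only reached when at least two GPUs are visible; on smaller
        # machines the wrapper returns early and unittest records a pass.
        self.assertGreaterEqual(torch.cuda.device_count(), 2)

The trade-off is that a silently skipped test is indistinguishable from a passing one in the local report; that is acceptable here because the commit also ensures the two-GPU CircleCI jobs always exercise the full GPU test set.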
