added CI for unit testing (hpcaitech#69)
FrankLeeeee authored Dec 16, 2021
1 parent 45355a6 commit cd9c28e
Showing 68 changed files with 1,090 additions and 767 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,40 @@
name: Build
on:
pull_request:
types: [review_requested]
branches:
- "*"

jobs:
build:
name: Build and test Colossal-AI
runs-on: [self-hosted, gpu]
container:
image: nvcr.io/nvidia/pytorch:21.07-py3
options: --gpus all --rm --ipc=host -v /data/cifar-10:/data/cifar-10
timeout-minutes: 1200
if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
steps:
- name: Setup Environment
run: |
export https_proxy=http://172.17.0.1:7890 http_proxy=http://172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890
- name: Install dependencies
run: |
python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install -U pip setuptools wheel --user
pip install pytest tensorboard deepspeed apex
- uses: actions/checkout@v2
- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir --global-option="--cuda_ext" .
- name: Unit Testing
run: |
pytest tests
env:
DATA: /data/cifar-10






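The workflow mounts /data/cifar-10 into the container and exposes it to the test run through the DATA environment variable. As a rough sketch of how a test could consume that variable (the helper below is hypothetical and only assumes that the suite reads its dataset root from DATA):

import os
from pathlib import Path

from torchvision import transforms
from torchvision.datasets import CIFAR10


def build_cifar10(train: bool = True) -> CIFAR10:
    # Resolve the dataset root from the DATA variable set in build.yml;
    # fall back to ./data for local runs without the CI volume mount.
    root = Path(os.environ.get('DATA', './data'))
    return CIFAR10(root=root,
                   train=train,
                   download=not root.exists(),
                   transform=transforms.ToTensor())

Run locally with DATA=/path/to/cifar-10 pytest tests, mirroring the Unit Testing step above.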
2 changes: 1 addition & 1 deletion colossalai/trainer/hooks/_log_hook.py
@@ -5,7 +5,6 @@
import os.path as osp

import torch
from torch.utils.tensorboard import SummaryWriter
from typing import List
from decimal import Decimal
from colossalai.context import ParallelMode
@@ -100,6 +99,7 @@ def __init__(self,
priority: int = 10,
) -> None:
super().__init__(priority=priority)
from torch.utils.tensorboard import SummaryWriter

# create log dir
if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:
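Moving the SummaryWriter import from module level into the hook constructor makes tensorboard a soft dependency: importing the hooks package no longer fails when tensorboard is absent, and the cost is only paid when the hook is actually built. A minimal sketch of the same deferred-import pattern, with an illustrative class name rather than the real hook from this file:

class TensorboardWriterHook:
    def __init__(self, log_dir: str, priority: int = 10) -> None:
        # Import lazily so this module can be imported without tensorboard;
        # an ImportError only surfaces when the hook is constructed.
        from torch.utils.tensorboard import SummaryWriter
        self.priority = priority
        self.writer = SummaryWriter(log_dir=log_dir)

    def log_scalar(self, tag: str, value: float, step: int) -> None:
        self.writer.add_scalar(tag, value, step)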
9 changes: 5 additions & 4 deletions tests/test_context/test_2d_init.py
@@ -1,15 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from functools import partial
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp

from colossalai import launch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from functools import partial
from pathlib import Path

CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()

@@ -75,6 +75,7 @@ def init_2d(rank, world_size, backend, port, host):
check_2d_parallel_rank(rank)
check_pipeline_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -86,7 +87,7 @@ def test_2d_init():
test_fn = partial(init_2d,
world_size=world_size,
backend='gloo',
port='29500',
port='29900',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
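The context tests now release cached CUDA memory after gpc.destroy() and each test file gets its own rendezvous port (29900, 29901, 29902, …), so back-to-back test runs in one CI job neither collide on a port nor accumulate GPU memory. A condensed sketch of the launch pattern these tests share; the CONFIG dict and the per-rank check step are placeholders, not the real per-test configuration:

from functools import partial

import torch
import torch.multiprocessing as mp

from colossalai import launch
from colossalai.core import global_context as gpc

# placeholder config: 2 pipeline stages x 4-way 2D tensor parallelism = 8 ranks
CONFIG = dict(parallel=dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d')))


def run_worker(rank, world_size, port):
    # mp.spawn passes the process index as the first positional argument
    launch(config=CONFIG, rank=rank, world_size=world_size,
           host='localhost', port=port, backend='gloo')
    # ... per-rank checks would go here ...
    gpc.destroy()
    torch.cuda.empty_cache()  # free cached blocks before the next test starts


def test_parallel_init():
    world_size = 8
    run_func = partial(run_worker, world_size=world_size, port=29900)
    mp.spawn(run_func, nprocs=world_size)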
4 changes: 3 additions & 1 deletion tests/test_context/test_2p5d_init.py
@@ -5,6 +5,7 @@
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp

from colossalai.context.parallel_mode import ParallelMode
@@ -98,6 +99,7 @@ def init_2halfd(rank, world_size, backend, port, host):
check_tensor_parallel_rank(rank)
check_2p5d_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -109,7 +111,7 @@ def test_2halfd_init():
test_fn = partial(init_2halfd,
world_size=world_size,
backend='gloo',
port='29501',
port='29901',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
5 changes: 4 additions & 1 deletion tests/test_context/test_3d_init.py
@@ -5,8 +5,10 @@
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp


from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
@@ -90,6 +92,7 @@ def init_3d(rank, world_size, backend, port, host):
check_data_parallel_rank(rank)
check_pipeline_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -101,7 +104,7 @@ def test_3d_init():
test_fn = partial(init_3d,
world_size=world_size,
backend='gloo',
port='29502',
port='29902',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
5 changes: 3 additions & 2 deletions tests/test_data/test_data_parallel_sampler.py
@@ -6,7 +6,7 @@
from pathlib import Path

import pytest
import torch.cuda
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
@@ -49,7 +49,7 @@ def run_data_sampler(rank, world_size):
rank=rank,
world_size=world_size,
backend='gloo',
port='29503',
port='29903',
host='localhost'
)
colossalai.launch(**dist_args)
@@ -73,6 +73,7 @@ def run_data_sampler(rank, world_size):
if gpc.get_local_rank(ParallelMode.DATA) != 0:
assert not torch.equal(img,
img_to_compare), 'Same image was distributed across ranks but expected it to be different'
torch.cuda.empty_cache()


@pytest.mark.cpu
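An alternative to hand-picking a unique port per test file is to ask the operating system for a free one right before spawning workers, which removes the risk of two files reusing the same number. This is a generic sketch, not code from this commit:

import socket


def find_free_port() -> int:
    # Binding to port 0 lets the kernel choose an unused port; the socket is
    # closed immediately and the returned number handed to the launcher.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(('localhost', 0))
        return sock.getsockname()[1]

There is a small race between closing the socket and the launcher rebinding the port, which is usually acceptable for tests.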
6 changes: 3 additions & 3 deletions tests/test_data/test_deterministic_dataloader.py
@@ -6,7 +6,7 @@
from pathlib import Path

import pytest
import torch.cuda
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import transforms
@@ -52,11 +52,10 @@ def run_data_sampler(rank, world_size):
rank=rank,
world_size=world_size,
backend='gloo',
port='29499',
port='29904',
host='localhost'
)
colossalai.launch(**dist_args)
print('finished initialization')

dataset_cfg = gpc.config.train_data.dataset
dataloader_cfg = gpc.config.train_data.dataloader
@@ -88,6 +87,7 @@
# this would be False if a data parallel sampler had been given to the dataloader
assert torch.equal(img,
img_to_compare), 'Same image was distributed across ranks and expected it to be the same'
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -1,3 +1,4 @@
import pytest
from pathlib import Path
from colossalai.amp.amp_type import AMP_TYPE
from colossalai.context.parallel_mode import ParallelMode
@@ -34,7 +35,9 @@
)


def main():
@pytest.mark.dist
@pytest.mark.skip("This test requires more than 8 GPUs; invoke this test script manually using the provided test.sh")
def test_hybrid_parallel():
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch_from_slurm(config=CONFIG,
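This file's main() becomes a proper pytest test with a dist marker and an explicit skip, matching the cpu/dist markers used elsewhere in the suite. Recent pytest versions warn about unregistered markers, so a conftest.py along the following lines would typically accompany them; the marker descriptions here are assumptions, not taken from the repository:

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom markers used across the test suite so that
    # `pytest --strict-markers` and newer pytest releases do not warn.
    config.addinivalue_line('markers', 'cpu: tests that run without a GPU')
    config.addinivalue_line('markers',
                            'dist: multi-GPU tests that spawn distributed workers')

With the markers registered, the CI job's pytest step could also be narrowed, e.g. pytest tests -m "not dist", to exclude tests that need more GPUs than the runner provides.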
4 changes: 0 additions & 4 deletions tests/test_engine/test.sh

This file was deleted.

26 changes: 12 additions & 14 deletions tests/test_engine/test_engine/test_engine_apex_amp.py
@@ -8,16 +8,17 @@
import os.path as osp
from pathlib import Path
import torch.nn as nn
import torch.multiprocessing as mp

from torchvision import transforms
from torch.optim import Adam
from colossalai.core import global_context as gpc
from colossalai.amp import AMP_TYPE
from colossalai.logging import get_dist_logger
from colossalai.utils import report_memory_usage, get_dataloader
from colossalai.initialize import get_default_parser
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial


# Config
@@ -37,18 +38,15 @@
)


def run_no_pipeline():
parser = get_default_parser()
args = parser.parse_args()

def run_engine(rank, world_size):
# init dist env
colossalai.launch(
config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
rank=rank,
world_size=world_size,
host='localhost',
port=29910,
backend='nccl'
)

# build model
@@ -69,8 +67,6 @@ def run_no_pipeline():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)

# build optimizer
@@ -102,12 +98,14 @@ def run_no_pipeline():
gpc.destroy()
logger.info('Test engine finished')
report_memory_usage("After testing")
torch.cuda.empty_cache()


@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_engine():
run_no_pipeline()
world_size = 4
run_func = partial(run_engine, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
25 changes: 12 additions & 13 deletions tests/test_engine/test_engine/test_engine_naive_amp.py
@@ -5,6 +5,7 @@
import os.path as osp
from pathlib import Path
import torch.nn as nn
import torch.multiprocessing as mp

from torchvision import transforms
from torch.optim import Adam
@@ -15,6 +16,7 @@
from colossalai.initialize import get_default_parser
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial


# Config
@@ -36,18 +38,15 @@
)


def run_no_pipeline():
parser = get_default_parser()
args = parser.parse_args()

def run_engine(rank, world_size):
# init dist env
colossalai.launch(
config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
rank=rank,
world_size=world_size,
host='localhost',
port=29911,
backend='nccl'
)

# build model
@@ -68,8 +67,6 @@ def run_no_pipeline():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)

# build optimizer
@@ -101,12 +98,14 @@ def run_no_pipeline():
gpc.destroy()
logger.info('Test engine finished')
report_memory_usage("After testing")
torch.cuda.empty_cache()


@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_engine():
run_no_pipeline()
world_size = 4
run_func = partial(run_engine, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
