added CI for unit testing (hpcaitech#69)
FrankLeeeee authored Dec 16, 2021
1 parent 45355a6 commit cd9c28e
Showing 68 changed files with 1,090 additions and 767 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,40 @@
name: Build
on:
pull_request:
types: [review_requested]
branches:
- "*"

jobs:
build:
name: Build and test Colossal-AI
runs-on: [self-hosted, gpu]
container:
image: nvcr.io/nvidia/pytorch:21.07-py3
options: --gpus all --rm --ipc=host -v /data/cifar-10:/data/cifar-10
timeout-minutes: 1200
if: github.event.pull_request.draft == false && github.base_ref == 'main' && github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
steps:
- name: Setup Environment
run: |
export https_proxy=http://172.17.0.1:7890 http_proxy=http://172.17.0.1:7890 all_proxy=socks5://172.17.0.1:7890
- name: Install dependencies
run: |
python3 -m pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
python3 -m pip install -U pip setuptools wheel --user
pip install pytest tensorboard deepspeed apex
- uses: actions/checkout@v2
- name: Install Colossal-AI
run: |
pip install -v --no-cache-dir --global-option="--cuda_ext" .
- name: Unit Testing
run: |
pytest tests
env:
DATA: /data/cifar-10






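The workflow mounts /data/cifar-10 into the container and exposes it to the test run through the DATA environment variable. As a rough sketch of how a test could consume that variable (the helper below is hypothetical and only assumes that the suite reads its dataset root from DATA):

import os
from pathlib import Path

from torchvision import transforms
from torchvision.datasets import CIFAR10


def build_cifar10(train: bool = True) -> CIFAR10:
    # Resolve the dataset root from the DATA variable set in build.yml;
    # fall back to ./data for local runs without the CI volume mount.
    root = Path(os.environ.get('DATA', './data'))
    return CIFAR10(root=root,
                   train=train,
                   download=not root.exists(),
                   transform=transforms.ToTensor())

Run locally with DATA=/path/to/cifar-10 pytest tests, mirroring the Unit Testing step above.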
2 changes: 1 addition & 1 deletion colossalai/trainer/hooks/_log_hook.py
@@ -5,7 +5,6 @@
import os.path as osp

import torch
from torch.utils.tensorboard import SummaryWriter
from typing import List
from decimal import Decimal
from colossalai.context import ParallelMode
@@ -100,6 +99,7 @@ def __init__(self,
priority: int = 10,
) -> None:
super().__init__(priority=priority)
from torch.utils.tensorboard import SummaryWriter

# create log dir
if not gpc.is_initialized(ParallelMode.GLOBAL) or gpc.get_global_rank() == 0:
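Moving the SummaryWriter import from module level into the hook constructor makes tensorboard a soft dependency: importing the hooks package no longer fails when tensorboard is absent, and the cost is only paid when the hook is actually built. A minimal sketch of the same deferred-import pattern, with an illustrative class name rather than the real hook from this file:

class TensorboardWriterHook:
    def __init__(self, log_dir: str, priority: int = 10) -> None:
        # Import lazily so this module can be imported without tensorboard;
        # an ImportError only surfaces when the hook is constructed.
        from torch.utils.tensorboard import SummaryWriter
        self.priority = priority
        self.writer = SummaryWriter(log_dir=log_dir)

    def log_scalar(self, tag: str, value: float, step: int) -> None:
        self.writer.add_scalar(tag, value, step)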
9 changes: 5 additions & 4 deletions tests/test_context/test_2d_init.py
@@ -1,15 +1,15 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-

from functools import partial
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp

from colossalai import launch
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from functools import partial
from pathlib import Path

CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute()

@@ -75,6 +75,7 @@ def init_2d(rank, world_size, backend, port, host):
check_2d_parallel_rank(rank)
check_pipeline_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -86,7 +87,7 @@ def test_2d_init():
test_fn = partial(init_2d,
world_size=world_size,
backend='gloo',
port='29500',
port='29900',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
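The context tests now release cached CUDA memory after gpc.destroy() and each test file gets its own rendezvous port (29900, 29901, 29902, …), so back-to-back test runs in one CI job neither collide on a port nor accumulate GPU memory. A condensed sketch of the launch pattern these tests share; the CONFIG dict and the per-rank check step are placeholders, not the real per-test configuration:

from functools import partial

import torch
import torch.multiprocessing as mp

from colossalai import launch
from colossalai.core import global_context as gpc

# placeholder config: 2 pipeline stages x 4-way 2D tensor parallelism = 8 ranks
CONFIG = dict(parallel=dict(pipeline=dict(size=2), tensor=dict(size=4, mode='2d')))


def run_worker(rank, world_size, port):
    # mp.spawn passes the process index as the first positional argument
    launch(config=CONFIG, rank=rank, world_size=world_size,
           host='localhost', port=port, backend='gloo')
    # ... per-rank checks would go here ...
    gpc.destroy()
    torch.cuda.empty_cache()  # free cached blocks before the next test starts


def test_parallel_init():
    world_size = 8
    run_func = partial(run_worker, world_size=world_size, port=29900)
    mp.spawn(run_func, nprocs=world_size)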
4 changes: 3 additions & 1 deletion tests/test_context/test_2p5d_init.py
@@ -5,6 +5,7 @@
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp

from colossalai.context.parallel_mode import ParallelMode
@@ -98,6 +99,7 @@ def init_2halfd(rank, world_size, backend, port, host):
check_tensor_parallel_rank(rank)
check_2p5d_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -109,7 +111,7 @@ def test_2halfd_init():
test_fn = partial(init_2halfd,
world_size=world_size,
backend='gloo',
port='29501',
port='29901',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
5 changes: 4 additions & 1 deletion tests/test_context/test_3d_init.py
@@ -5,8 +5,10 @@
from pathlib import Path

import pytest
import torch
import torch.multiprocessing as mp


from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.initialize import launch
@@ -90,6 +92,7 @@ def init_3d(rank, world_size, backend, port, host):
check_data_parallel_rank(rank)
check_pipeline_parallel_rank(rank)
gpc.destroy()
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -101,7 +104,7 @@ def test_3d_init():
test_fn = partial(init_3d,
world_size=world_size,
backend='gloo',
port='29502',
port='29902',
host='localhost'
)
mp.spawn(test_fn, nprocs=world_size)
5 changes: 3 additions & 2 deletions tests/test_data/test_data_parallel_sampler.py
@@ -6,7 +6,7 @@
from pathlib import Path

import pytest
import torch.cuda
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.utils.data import DataLoader
@@ -49,7 +49,7 @@ def run_data_sampler(rank, world_size):
rank=rank,
world_size=world_size,
backend='gloo',
port='29503',
port='29903',
host='localhost'
)
colossalai.launch(**dist_args)
@@ -73,6 +73,7 @@ def run_data_sampler(rank, world_size):
if gpc.get_local_rank(ParallelMode.DATA) != 0:
assert not torch.equal(img,
img_to_compare), 'Same image was distributed across ranks but expected it to be different'
torch.cuda.empty_cache()


@pytest.mark.cpu
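An alternative to hand-picking a unique port per test file is to ask the operating system for a free one right before spawning workers, which removes the risk of two files reusing the same number. This is a generic sketch, not code from this commit:

import socket


def find_free_port() -> int:
    # Binding to port 0 lets the kernel choose an unused port; the socket is
    # closed immediately and the returned number handed to the launcher.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
        sock.bind(('localhost', 0))
        return sock.getsockname()[1]

There is a small race between closing the socket and the launcher rebinding the port, which is usually acceptable for tests.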
6 changes: 3 additions & 3 deletions tests/test_data/test_deterministic_dataloader.py
@@ -6,7 +6,7 @@
from pathlib import Path

import pytest
import torch.cuda
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torchvision import transforms
@@ -52,11 +52,10 @@ def run_data_sampler(rank, world_size):
rank=rank,
world_size=world_size,
backend='gloo',
port='29499',
port='29904',
host='localhost'
)
colossalai.launch(**dist_args)
print('finished initialization')

dataset_cfg = gpc.config.train_data.dataset
dataloader_cfg = gpc.config.train_data.dataloader
@@ -88,6 +87,7 @@
# this would be False if a data parallel sampler had been given to the dataloader
assert torch.equal(img,
img_to_compare), 'Same image was distributed across ranks and expected it to be the same'
torch.cuda.empty_cache()


@pytest.mark.cpu
@@ -1,3 +1,4 @@
import pytest
from pathlib import Path
from colossalai.amp.amp_type import AMP_TYPE
from colossalai.context.parallel_mode import ParallelMode
@@ -34,7 +35,9 @@
)


def main():
@pytest.mark.dist
@pytest.mark.skip("This test requires more than 8 GPUs; invoke this test script manually using the provided test.sh")
def test_hybrid_parallel():
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch_from_slurm(config=CONFIG,
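This file's main() becomes a proper pytest test with a dist marker and an explicit skip, matching the cpu/dist markers used elsewhere in the suite. Recent pytest versions warn about unregistered markers, so a conftest.py along the following lines would typically accompany them; the marker descriptions here are assumptions, not taken from the repository:

# conftest.py (sketch)
def pytest_configure(config):
    # Register the custom markers used across the test suite so that
    # `pytest --strict-markers` and newer pytest releases do not warn.
    config.addinivalue_line('markers', 'cpu: tests that run without a GPU')
    config.addinivalue_line('markers',
                            'dist: multi-GPU tests that spawn distributed workers')

With the markers registered, the CI job's pytest step could also be narrowed, e.g. pytest tests -m "not dist", to exclude tests that need more GPUs than the runner provides.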
4 changes: 0 additions & 4 deletions tests/test_engine/test.sh

This file was deleted.

26 changes: 12 additions & 14 deletions tests/test_engine/test_engine/test_engine_apex_amp.py
@@ -8,16 +8,17 @@
import os.path as osp
from pathlib import Path
import torch.nn as nn
import torch.multiprocessing as mp

from torchvision import transforms
from torch.optim import Adam
from colossalai.core import global_context as gpc
from colossalai.amp import AMP_TYPE
from colossalai.logging import get_dist_logger
from colossalai.utils import report_memory_usage, get_dataloader
from colossalai.initialize import get_default_parser
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial


# Config
@@ -37,18 +38,15 @@
)


def run_no_pipeline():
parser = get_default_parser()
args = parser.parse_args()

def run_engine(rank, world_size):
# init dist env
colossalai.launch(
config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
rank=rank,
world_size=world_size,
host='localhost',
port=29910,
backend='nccl'
)

# build model
@@ -69,8 +67,6 @@ def run_no_pipeline():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)

# build optimizer
@@ -102,12 +98,14 @@ def run_no_pipeline():
gpc.destroy()
logger.info('Test engine finished')
report_memory_usage("After testing")
torch.cuda.empty_cache()


@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_engine():
run_no_pipeline()
world_size = 4
run_func = partial(run_engine, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
25 changes: 12 additions & 13 deletions tests/test_engine/test_engine/test_engine_naive_amp.py
@@ -5,6 +5,7 @@
import os.path as osp
from pathlib import Path
import torch.nn as nn
import torch.multiprocessing as mp

from torchvision import transforms
from torch.optim import Adam
@@ -15,6 +16,7 @@
from colossalai.initialize import get_default_parser
from torchvision.models import resnet18
from torchvision.datasets import CIFAR10
from functools import partial


# Config
@@ -36,18 +38,15 @@
)


def run_no_pipeline():
parser = get_default_parser()
args = parser.parse_args()

def run_engine(rank, world_size):
# init dist env
colossalai.launch(
config=CONFIG,
rank=args.rank,
world_size=args.world_size,
host=args.host,
port=args.port,
backend=args.backend
rank=rank,
world_size=world_size,
host='localhost',
port=29911,
backend='nccl'
)

# build model
@@ -68,8 +67,6 @@ def run_no_pipeline():
train_dataloader = get_dataloader(dataset=train_dataset,
shuffle=True,
batch_size=BATCH_SIZE,
num_workers=1,
pin_memory=True,
drop_last=True)

# build optimizer
@@ -101,12 +98,14 @@ def run_no_pipeline():
gpc.destroy()
logger.info('Test engine finished')
report_memory_usage("After testing")
torch.cuda.empty_cache()


@pytest.mark.skip("This test should be invoked using the test.sh provided")
@pytest.mark.dist
def test_engine():
run_no_pipeline()
world_size = 4
run_func = partial(run_engine, world_size=world_size)
mp.spawn(run_func, nprocs=world_size)


if __name__ == '__main__':
