From fb603ace8dfecdb777807127d66c83b1b3b33521 Mon Sep 17 00:00:00 2001 From: zbian Date: Tue, 28 Dec 2021 12:11:47 +0800 Subject: [PATCH 1/3] optimized 1d layer apis; reorganized nn.layer modules; fixed tests --- benchmark/cifar/configs/vit_1d.py | 2 +- benchmark/cifar/train.py | 8 +- benchmark/imagenet100/configs/vit_1d.py | 2 +- benchmark/imagenet100/train.py | 8 +- benchmark/imagenet1k/configs/vit_1d.py | 2 +- benchmark/imagenet1k/train.py | 8 +- colossalai/constants.py | 4 + colossalai/context/parallel_context.py | 6 +- .../initializer_1d.py | 4 +- colossalai/nn/layer/__init__.py | 7 +- colossalai/nn/layer/colossalai_layer.py | 231 ------------------ .../nn/layer/colossalai_layer/__init__.py | 6 + .../nn/layer/colossalai_layer/dropout.py | 27 ++ .../nn/layer/colossalai_layer/embedding.py | 107 ++++++++ .../nn/layer/colossalai_layer/linear.py | 97 ++++++++ .../layer/colossalai_layer/normalization.py | 35 +++ colossalai/nn/layer/fused_bias_gelu.py | 35 --- colossalai/nn/layer/parallel_1d/__init__.py | 4 +- colossalai/nn/layer/parallel_1d/_utils.py | 13 +- colossalai/nn/layer/parallel_1d/layers.py | 184 +++++++++++++- colossalai/nn/layer/parallel_2d/layers.py | 4 +- colossalai/nn/layer/parallel_2p5d/layers.py | 4 +- colossalai/nn/layer/parallel_3d/layers.py | 4 +- colossalai/nn/layer/utils/__init__.py | 1 + .../{_common_utils.py => utils/common.py} | 7 +- colossalai/nn/layer/vanilla/layers.py | 2 +- colossalai/nn/loss/__init__.py | 6 +- colossalai/nn/metric/__init__.py | 6 +- model_zoo/vit/vit.py | 152 ++++-------- tests/test_comm/test_comm.py | 2 +- tests/test_context/test_2d_init.py | 2 +- tests/test_context/test_2p5d_init.py | 2 +- tests/test_context/test_3d_init.py | 2 +- .../test_cifar_with_data_pipeline_tensor.py | 8 +- .../test_engine/test_engine_apex_amp.py | 2 +- .../test_engine/test_engine_naive_amp.py | 2 +- .../test_engine/test_engine_no_amp.py | 5 +- .../test_engine/test_engine_torch_amp.py | 4 +- tests/test_layers/test_1d/test_1d.py | 2 +- tests/test_layers/test_2d/test_2d.py | 2 +- tests/test_layers/test_2p5d/test_2p5d.py | 2 +- tests/test_layers/test_3d/test_3d.py | 2 +- .../test_sequence/test_sequence.py | 4 +- tests/test_trainer/test_pipeline/test_p2p.py | 2 +- .../test_pipeline/test_partition.py | 2 +- .../test_pipeline/test_pipeline_schedule.py | 2 +- .../test_trainer_with_non_pipe_schedule.py | 2 +- .../test_trainer_with_pipe_schedule.py | 2 +- .../test_utils/test_gradient_accumluation.py | 5 +- .../test_zero_level_2.py | 2 +- .../test_zero_level_3.py | 2 +- .../test_vit_2d_level_2.py | 6 +- .../test_vit_2d_level_3.py | 6 +- 53 files changed, 589 insertions(+), 459 deletions(-) delete mode 100644 colossalai/nn/layer/colossalai_layer.py create mode 100644 colossalai/nn/layer/colossalai_layer/__init__.py create mode 100644 colossalai/nn/layer/colossalai_layer/dropout.py create mode 100644 colossalai/nn/layer/colossalai_layer/embedding.py create mode 100644 colossalai/nn/layer/colossalai_layer/linear.py create mode 100644 colossalai/nn/layer/colossalai_layer/normalization.py delete mode 100644 colossalai/nn/layer/fused_bias_gelu.py create mode 100644 colossalai/nn/layer/utils/__init__.py rename colossalai/nn/layer/{_common_utils.py => utils/common.py} (94%) diff --git a/benchmark/cifar/configs/vit_1d.py b/benchmark/cifar/configs/vit_1d.py index 34eb7d50a4da..1731abc1edb5 100644 --- a/benchmark/cifar/configs/vit_1d.py +++ b/benchmark/cifar/configs/vit_1d.py @@ -2,7 +2,7 @@ LEARNING_RATE = 2e-3 WEIGHT_DECAY = 3e-2 -TENSOR_PARALLEL_SIZE = 4 +TENSOR_PARALLEL_SIZE = 2 
TENSOR_PARALLEL_MODE = '1d' NUM_EPOCHS = 200 diff --git a/benchmark/cifar/train.py b/benchmark/cifar/train.py index 4a1d87758d11..4ffa22968cdd 100644 --- a/benchmark/cifar/train.py +++ b/benchmark/cifar/train.py @@ -72,13 +72,11 @@ def train_cifar(): os.mkdir(log_path) logger.log_to_file(log_path) - tp = gpc.config.parallel.tensor.mode - - model = vit_lite_depth7_patch4_32(tensor_parallel=tp) + model = vit_lite_depth7_patch4_32() train_dataloader, test_dataloader = build_cifar(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - criterion = CrossEntropyLoss(label_smoothing=0.1, tensor_parallel=tp) + criterion = CrossEntropyLoss(label_smoothing=0.1) optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) @@ -107,7 +105,7 @@ def train_cifar(): LogMetricByStepHook(), # LogTimingByEpochHook(timer=timer, logger=logger), # LogMemoryByEpochHook(logger=logger), - AccuracyHook(accuracy_func=Accuracy(tensor_parallel=tp)), + AccuracyHook(accuracy_func=Accuracy()), LossHook(), ThroughputHook(), LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False) diff --git a/benchmark/imagenet100/configs/vit_1d.py b/benchmark/imagenet100/configs/vit_1d.py index 07bb5fb66ae0..bd90e1e84264 100644 --- a/benchmark/imagenet100/configs/vit_1d.py +++ b/benchmark/imagenet100/configs/vit_1d.py @@ -4,7 +4,7 @@ LEARNING_RATE = 3e-3 WEIGHT_DECAY = 0.3 -TENSOR_PARALLEL_SIZE = 4 +TENSOR_PARALLEL_SIZE = 2 TENSOR_PARALLEL_MODE = '1d' NUM_EPOCHS = 300 diff --git a/benchmark/imagenet100/train.py b/benchmark/imagenet100/train.py index fece6d1a6626..58ad3b15e535 100644 --- a/benchmark/imagenet100/train.py +++ b/benchmark/imagenet100/train.py @@ -159,14 +159,12 @@ def train_imagenet(): os.mkdir(log_path) logger.log_to_file(log_path) - tp = gpc.config.parallel.tensor.mode - - model = vit_small_patch16_224(tensor_parallel=tp, num_classes=100, init_method='jax') + model = vit_small_patch16_224(num_classes=100, init_method='jax') train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size) test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - criterion = CrossEntropyLoss(label_smoothing=0.1, tensor_parallel=tp) + criterion = CrossEntropyLoss(label_smoothing=0.1) optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) @@ -192,7 +190,7 @@ def train_imagenet(): LogMetricByStepHook(), # LogTimingByEpochHook(timer=timer, logger=logger), # LogMemoryByEpochHook(logger=logger), - AccuracyHook(accuracy_func=Accuracy(tensor_parallel=tp)), + AccuracyHook(accuracy_func=Accuracy()), LossHook(), ThroughputHook(), LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True) diff --git a/benchmark/imagenet1k/configs/vit_1d.py b/benchmark/imagenet1k/configs/vit_1d.py index adddceb3a021..d447d10b1616 100644 --- a/benchmark/imagenet1k/configs/vit_1d.py +++ b/benchmark/imagenet1k/configs/vit_1d.py @@ -4,7 +4,7 @@ LEARNING_RATE = 3e-3 WEIGHT_DECAY = 0.3 -TENSOR_PARALLEL_SIZE = 4 +TENSOR_PARALLEL_SIZE = 2 TENSOR_PARALLEL_MODE = '1d' NUM_EPOCHS = 300 diff --git a/benchmark/imagenet1k/train.py b/benchmark/imagenet1k/train.py index 989dff2aa9b7..d9b9ade9956c 100644 --- a/benchmark/imagenet1k/train.py +++ b/benchmark/imagenet1k/train.py @@ -159,14 +159,12 @@ def train_imagenet(): os.mkdir(log_path) logger.log_to_file(log_path) - tp = gpc.config.parallel.tensor.mode - - model = vit_small_patch16_224(tensor_parallel=tp, num_classes=1000, init_method='jax') + model = 
vit_small_patch16_224(num_classes=1000, init_method='jax') train_dataloader = build_dali_train(gpc.config.BATCH_SIZE // gpc.data_parallel_size) test_dataloader = build_dali_test(gpc.config.BATCH_SIZE // gpc.data_parallel_size) - criterion = CrossEntropyLoss(label_smoothing=0.1, tensor_parallel=tp) + criterion = CrossEntropyLoss(label_smoothing=0.1) optimizer = torch.optim.AdamW(model.parameters(), lr=gpc.config.LEARNING_RATE, weight_decay=gpc.config.WEIGHT_DECAY) @@ -192,7 +190,7 @@ def train_imagenet(): LogMetricByStepHook(), # LogTimingByEpochHook(timer=timer, logger=logger), # LogMemoryByEpochHook(logger=logger), - AccuracyHook(accuracy_func=Accuracy(tensor_parallel=tp)), + AccuracyHook(accuracy_func=Accuracy()), LossHook(), ThroughputHook(), LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=True) diff --git a/colossalai/constants.py b/colossalai/constants.py index 874c53d7291f..58a94437ae5e 100644 --- a/colossalai/constants.py +++ b/colossalai/constants.py @@ -2,6 +2,7 @@ # -*- encoding: utf-8 -*- ALLOWED_MODES = [None, '1d', '2d', '2.5d', '3d', 'sequence'] +TENSOR_PARALLEL_MODE = 'tensor_parallel_mode' # intializer INITIALIZER_MAPPING = { @@ -16,6 +17,9 @@ 'sequence': 'Initializer_Sequence' } +# 1D parallel +PARALLEL_INPUT_1D = 'parallel_input_1d' + # 2D paralllel SUMMA_DIM = 'SUMMA_DIM' diff --git a/colossalai/context/parallel_context.py b/colossalai/context/parallel_context.py index f3ebb1eaa6b5..f76f4d60efe2 100644 --- a/colossalai/context/parallel_context.py +++ b/colossalai/context/parallel_context.py @@ -1,17 +1,18 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import os import random from typing import Union import numpy as np import torch import torch.distributed as dist - -from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING +from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING, TENSOR_PARALLEL_MODE from colossalai.context.config import Config from colossalai.logging import get_dist_logger from colossalai.registry import DIST_GROUP_INITIALIZER + from .parallel_mode import ParallelMode from .random import add_seed, get_seeds, set_mode @@ -386,6 +387,7 @@ def init_parallel_groups(self): if parallel_config is not None and 'tensor' in parallel_config and 'mode' in parallel_config['tensor']: tensor_parallel_mode = parallel_config['tensor']['mode'] assert tensor_parallel_mode in ALLOWED_MODES, f"mode in the parallel config must be set to one of {ALLOWED_MODES}" + os.environ[TENSOR_PARALLEL_MODE] = str(tensor_parallel_mode) self.check_sanity() pg_init = [] diff --git a/colossalai/context/process_group_initializer/initializer_1d.py b/colossalai/context/process_group_initializer/initializer_1d.py index 1b487aba1db2..edd60c085235 100644 --- a/colossalai/context/process_group_initializer/initializer_1d.py +++ b/colossalai/context/process_group_initializer/initializer_1d.py @@ -1,12 +1,13 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- - +import os import torch.distributed as dist from colossalai.context import Config from colossalai.registry import DIST_GROUP_INITIALIZER from .process_group_initializer import ProcessGroupInitializer from ..parallel_mode import ParallelMode +from colossalai.constants import PARALLEL_INPUT_1D @DIST_GROUP_INITIALIZER.register_module @@ -29,6 +30,7 @@ def init_dist_group(self): process_group = None group_world_size = None mode = ParallelMode.PARALLEL_1D + os.environ[PARALLEL_INPUT_1D] = '' for i in range(self.num_group): ranks = [i * self.tensor_parallel_size + j for j in range(self.tensor_parallel_size)] diff --git 
a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index a04dece9141f..108c8dd37460 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -1,3 +1,8 @@ from .colossalai_layer import * -from .fused_bias_gelu import bias_gelu_impl +from .parallel_1d import * +from .parallel_2d import * +from .parallel_2p5d import * +from .parallel_3d import * +from .parallel_sequence import * +from .vanilla import * from .wrapper import * diff --git a/colossalai/nn/layer/colossalai_layer.py b/colossalai/nn/layer/colossalai_layer.py deleted file mode 100644 index 3a185ae15c08..000000000000 --- a/colossalai/nn/layer/colossalai_layer.py +++ /dev/null @@ -1,231 +0,0 @@ -import math -from typing import Callable, Optional - -from colossalai.utils import get_current_device -from torch import dtype, nn -from torch.nn.modules.activation import * -from torch.nn.modules.adaptive import * -from torch.nn.modules.batchnorm import * -from torch.nn.modules.channelshuffle import * -from torch.nn.modules.conv import * -from torch.nn.modules.distance import * -from torch.nn.modules.dropout import * -from torch.nn.modules.flatten import * -from torch.nn.modules.fold import * -from torch.nn.modules.instancenorm import * -from torch.nn.modules.linear import * -from torch.nn.modules.normalization import * -from torch.nn.modules.padding import * -from torch.nn.modules.pixelshuffle import * -from torch.nn.modules.pooling import * -from torch.nn.modules.rnn import * -from torch.nn.modules.sparse import * -from torch.nn.modules.transformer import * -from torch.nn.modules.upsampling import * - -from .. import init as init - -from .vanilla import * -from .parallel_1d import * -from .parallel_2d import * -from .parallel_2p5d import * -from .parallel_3d import * -from .parallel_sequence import * - -_parallel_linear = {'1d_col': Linear1D_Col, '1d_row': Linear1D_Row, '2d': Linear2D, '2.5d': Linear2p5D, '3d': Linear3D} - -_parallel_classifier = { - None: VanillaClassifier, - '1d': VanillaClassifier, - '2d': Classifier2D, - '2.5d': Classifier2p5D, - '3d': Classifier3D -} - -_parallel_layernorm = {'2d': LayerNorm2D, '2.5d': LayerNorm2p5D, '3d': LayerNorm3D} - -_parallel_embedding = {'3d': Embedding3D} - -_parallel_patchembedding = { - None: VanillaPatchEmbedding, - '1d': VanillaPatchEmbedding, - '2d': PatchEmbedding2D, - '2.5d': PatchEmbedding2p5D, - '3d': PatchEmbedding3D -} - - -class Linear(nn.Module): - def __init__(self, - in_features: int, - out_features: int, - bias: bool = True, - dtype: dtype = None, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - tensor_parallel: Optional[str] = None, - **kwargs) -> None: - super().__init__() - if tensor_parallel is None: - self.layer = nn.Linear(in_features, out_features, bias=bias, device=get_current_device(), dtype=dtype) - weight_initializer(self.layer.weight, fan_in=in_features, fan_out=out_features) - if bias: - bias_initializer(self.layer.bias, fan_in=in_features) - else: - self.layer = _parallel_linear[tensor_parallel]( - in_features, - out_features, - bias=bias, - dtype=dtype, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - **kwargs, - ) - - @property - def weight(self): - return self.layer.weight - - @property - def bias(self): - return self.layer.bias - - def forward(self, *args): - return self.layer(*args) - - -class LayerNorm(nn.Module): - def __init__(self, normalized_shape: int, eps=1e-05, dtype=None, 
tensor_parallel: Optional[str] = None) -> None: - super().__init__() - if tensor_parallel in [None, '1d']: - self.norm = nn.LayerNorm(normalized_shape, eps=eps, device=get_current_device(), dtype=dtype) - else: - self.norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) - - @property - def weight(self): - return self.norm.weight - - @property - def bias(self): - return self.norm.bias - - def forward(self, *args): - return self.norm(*args) - - -class Embedding(nn.Module): - def __init__(self, - num_embeddings: int, - embedding_dim: int, - padding_idx: int = None, - dtype: dtype = None, - weight_initializer: Callable = init.normal_(), - tensor_parallel: Optional[str] = None, - *args, - **kwargs) -> None: - super().__init__() - if tensor_parallel in [None, '1d']: - self.embed = nn.Embedding(num_embeddings, - embedding_dim, - padding_idx=padding_idx, - device=get_current_device(), - dtype=dtype, - *args, - **kwargs) - weight_initializer(self.embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) - else: - self.embed = _parallel_embedding[tensor_parallel]( - num_embeddings, - embedding_dim, - padding_idx=padding_idx, - dtype=dtype, - weight_initializer=weight_initializer, - *args, - **kwargs, - ) - - @property - def weight(self): - return self.embed.weight - - def forward(self, *args): - return self.embed(*args) - - -class PatchEmbedding(nn.Module): - def __init__(self, - img_size: int, - patch_size: int, - in_chans: int, - embed_size: int, - dtype: dtype = None, - flatten: bool = True, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - position_embed_initializer: Callable = init.zeros_(), - tensor_parallel: Optional[str] = None) -> None: - super().__init__() - self.embed = _parallel_patchembedding[tensor_parallel]( - img_size, - patch_size, - in_chans, - embed_size, - dtype=dtype, - flatten=flatten, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - position_embed_initializer=position_embed_initializer, - ) - - @property - def weight(self): - return self.embed.weight - - @property - def bias(self): - return self.embed.bias - - @property - def pos_embed(self): - return self.embed.pos_embed - - @property - def cls_token(self): - return self.embed.cls_token - - def forward(self, *args): - return self.embed(*args) - - -class Classifier(nn.Module): - def __init__(self, - in_features: int, - num_classes: int, - weight: nn.Parameter = None, - bias: bool = True, - dtype: dtype = None, - weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), - bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), - tensor_parallel: Optional[str] = None) -> None: - super().__init__() - self.layer = _parallel_classifier[tensor_parallel]( - in_features, - num_classes, - weight=weight, - bias=bias, - dtype=dtype, - weight_initializer=weight_initializer, - bias_initializer=bias_initializer, - ) - - @property - def weight(self): - return self.layer.weight - - @property - def bias(self): - return self.layer.bias - - def forward(self, *args): - return self.layer(*args) diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/nn/layer/colossalai_layer/__init__.py new file mode 100644 index 000000000000..e9c78c576031 --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/__init__.py @@ -0,0 +1,6 @@ +from .dropout import Dropout +from .embedding import Embedding, PatchEmbedding +from .linear import Classifier, Linear +from 
.normalization import LayerNorm + +__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout'] diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/nn/layer/colossalai_layer/dropout.py new file mode 100644 index 000000000000..f923e1f27b5f --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/dropout.py @@ -0,0 +1,27 @@ +from typing import Optional + +import torch.nn as nn +from colossalai.context import ParallelMode, seed +from colossalai.utils import conditional_context + +from ... import init as init +from ..parallel_1d import * +from ..parallel_2d import * +from ..parallel_2p5d import * +from ..parallel_3d import * +from ..utils import get_tensor_parallel_mode +from ..vanilla import * + + +class Dropout(nn.Module): + def __init__(self, p: float = 0.5, inplace: bool = False) -> None: + super().__init__() + self.tensor_parallel = get_tensor_parallel_mode() + if self.tensor_parallel == '1d': + self.drop = Dropout1D(p, inplace) + else: + self.drop = nn.Dropout(p, inplace) + + def forward(self, *args): + with conditional_context(seed(ParallelMode.TENSOR), enable=self.tensor_parallel not in ['None', '1d']): + return self.drop(*args) diff --git a/colossalai/nn/layer/colossalai_layer/embedding.py b/colossalai/nn/layer/colossalai_layer/embedding.py new file mode 100644 index 000000000000..6a580a29d887 --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/embedding.py @@ -0,0 +1,107 @@ +import math +from typing import Callable, Optional + +from colossalai.utils import get_current_device +from torch import dtype, nn + +from ... import init as init +from ..parallel_1d import * +from ..parallel_2d import * +from ..parallel_2p5d import * +from ..parallel_3d import * +from ..utils import get_tensor_parallel_mode +from ..vanilla import * + +_parallel_embedding = {'1d': Embedding1D, '2d': Embedding2D, '2.5d': Embedding2p5D, '3d': Embedding3D} + +_parallel_patchembedding = { + 'None': VanillaPatchEmbedding, + '1d': VanillaPatchEmbedding, + '2d': PatchEmbedding2D, + '2.5d': PatchEmbedding2p5D, + '3d': PatchEmbedding3D +} + + +class Embedding(nn.Module): + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + dtype: dtype = None, + weight_initializer: Callable = init.normal_(), + *args, + **kwargs) -> None: + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel == 'None': + self.embed = nn.Embedding(num_embeddings, + embedding_dim, + padding_idx=padding_idx, + device=get_current_device(), + dtype=dtype, + *args, + **kwargs) + weight_initializer(self.embed.weight, fan_in=num_embeddings, fan_out=embedding_dim) + else: + self.embed = _parallel_embedding[tensor_parallel]( + num_embeddings, + embedding_dim, + padding_idx=padding_idx, + dtype=dtype, + weight_initializer=weight_initializer, + *args, + **kwargs, + ) + + @property + def weight(self): + return self.embed.weight + + def forward(self, *args): + return self.embed(*args) + + +class PatchEmbedding(nn.Module): + def __init__(self, + img_size: int, + patch_size: int, + in_chans: int, + embed_size: int, + dtype: dtype = None, + flatten: bool = True, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), + position_embed_initializer: Callable = init.zeros_()) -> None: + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + self.embed = _parallel_patchembedding[tensor_parallel]( + img_size, + patch_size, + in_chans, + embed_size, + 
dtype=dtype, + flatten=flatten, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + position_embed_initializer=position_embed_initializer, + ) + + @property + def weight(self): + return self.embed.weight + + @property + def bias(self): + return self.embed.bias + + @property + def pos_embed(self): + return self.embed.pos_embed + + @property + def cls_token(self): + return self.embed.cls_token + + def forward(self, *args): + return self.embed(*args) diff --git a/colossalai/nn/layer/colossalai_layer/linear.py b/colossalai/nn/layer/colossalai_layer/linear.py new file mode 100644 index 000000000000..7c78941a23bb --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/linear.py @@ -0,0 +1,97 @@ +import math +from typing import Callable, Optional + +from colossalai.nn.layer.parallel_1d.layers import Classifier1D +from colossalai.utils import get_current_device +from torch import dtype, nn + +from ... import init as init +from ..parallel_1d import * +from ..parallel_2d import * +from ..parallel_2p5d import * +from ..parallel_3d import * +from ..utils import get_tensor_parallel_mode +from ..vanilla import * + +_parallel_linear = {'1d': Linear1D, '2d': Linear2D, '2.5d': Linear2p5D, '3d': Linear3D} + +_parallel_classifier = { + 'None': VanillaClassifier, + '1d': Classifier1D, + '2d': Classifier2D, + '2.5d': Classifier2p5D, + '3d': Classifier3D +} + + +class Linear(nn.Module): + def __init__(self, + in_features: int, + out_features: int, + bias: bool = True, + dtype: dtype = None, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1), + **kwargs) -> None: + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel == 'None': + self.layer = nn.Linear(in_features, out_features, bias=bias, device=get_current_device(), dtype=dtype) + weight_initializer(self.layer.weight, fan_in=in_features, fan_out=out_features) + if bias: + bias_initializer(self.layer.bias, fan_in=in_features) + else: + self.layer = _parallel_linear[tensor_parallel]( + in_features, + out_features, + bias=bias, + dtype=dtype, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + **kwargs, + ) + + @property + def weight(self): + return self.layer.weight + + @property + def bias(self): + return self.layer.bias + + def forward(self, *args): + return self.layer(*args) + + +class Classifier(nn.Module): + def __init__( + self, + in_features: int, + num_classes: int, + weight: nn.Parameter = None, + bias: bool = True, + dtype: dtype = None, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1) + ) -> None: + super().__init__() + self.layer = _parallel_classifier[get_tensor_parallel_mode()]( + in_features, + num_classes, + weight=weight, + bias=bias, + dtype=dtype, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer, + ) + + @property + def weight(self): + return self.layer.weight + + @property + def bias(self): + return self.layer.bias + + def forward(self, *args): + return self.layer(*args) diff --git a/colossalai/nn/layer/colossalai_layer/normalization.py b/colossalai/nn/layer/colossalai_layer/normalization.py new file mode 100644 index 000000000000..f1dab93f9e2e --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/normalization.py @@ -0,0 +1,35 @@ +from typing import Optional + +from colossalai.utils import get_current_device +from torch import nn + +from ... 
import init as init +from ..parallel_1d import * +from ..parallel_2d import * +from ..parallel_2p5d import * +from ..parallel_3d import * +from ..utils import get_tensor_parallel_mode +from ..vanilla import * + +_parallel_layernorm = {'2d': LayerNorm2D, '2.5d': LayerNorm2p5D, '3d': LayerNorm3D} + + +class LayerNorm(nn.Module): + def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None: + super().__init__() + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel in ['None', '1d']: + self.norm = nn.LayerNorm(normalized_shape, eps=eps, device=get_current_device(), dtype=dtype) + else: + self.norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype) + + @property + def weight(self): + return self.norm.weight + + @property + def bias(self): + return self.norm.bias + + def forward(self, *args): + return self.norm(*args) diff --git a/colossalai/nn/layer/fused_bias_gelu.py b/colossalai/nn/layer/fused_bias_gelu.py deleted file mode 100644 index e920415349a0..000000000000 --- a/colossalai/nn/layer/fused_bias_gelu.py +++ /dev/null @@ -1,35 +0,0 @@ -# adapted from Megatron-LM -# https://github.com/NVIDIA/Megatron-LM/blob/b31e1296354e979722627a6c4dedafe19b51fa97/megatron/model/fused_bias_gelu.py - -import torch - -@torch.jit.script -def bias_gelu(bias, y): - x = bias + y - return x * 0.5 * (1.0 + torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x))) - -# gradient of tanh approximation of gelu -# gradient of actual gelu is: -# 0.5 * (1. + torch.erf(x * 0.70710678)) + 0.3989423 * x * torch.exp(-0.5 * x * x) -@torch.jit.script -def bias_gelu_back(g, bias, y): - x = bias + y - tanh_out = torch.tanh(0.79788456 * x * (1 + 0.044715 * x * x)) - # sqrt(2/pi) * 3 * 0.044715 -> 0.1070322243 - ff = 0.5 * x * ((1 - tanh_out * tanh_out) * (0.79788456 + 0.1070322243 * x * x)) + 0.5 * (1 + tanh_out) - return ff*g - -class GeLUFunction(torch.autograd.Function): - @staticmethod - # bias is an optional argument - def forward(ctx, input, bias): - ctx.save_for_backward(input, bias) - return bias_gelu(bias, input) - - @staticmethod - def backward(ctx, grad_output): - input, bias = ctx.saved_tensors - tmp = bias_gelu_back(grad_output, bias, input) - return tmp, tmp - -bias_gelu_impl = GeLUFunction.apply \ No newline at end of file diff --git a/colossalai/nn/layer/parallel_1d/__init__.py b/colossalai/nn/layer/parallel_1d/__init__.py index 8fcd82aab76f..6f2093a112bc 100644 --- a/colossalai/nn/layer/parallel_1d/__init__.py +++ b/colossalai/nn/layer/parallel_1d/__init__.py @@ -1,4 +1,4 @@ -from .layers import Linear1D_Col, Linear1D_Row +from .layers import Dropout1D, Embedding1D, Linear1D, Linear1D_Col, Linear1D_Row from .layers import MixedFusedLayerNorm1D as LayerNorm1D -__all__ = ['Linear1D_Col', 'Linear1D_Row', 'LayerNorm1D'] +__all__ = ['Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'LayerNorm1D', 'Embedding1D', 'Dropout1D'] diff --git a/colossalai/nn/layer/parallel_1d/_utils.py b/colossalai/nn/layer/parallel_1d/_utils.py index b8b7bcceba38..db589afe5cb2 100644 --- a/colossalai/nn/layer/parallel_1d/_utils.py +++ b/colossalai/nn/layer/parallel_1d/_utils.py @@ -1,12 +1,21 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +import os import torch import torch.distributed as dist - +from colossalai.constants import PARALLEL_INPUT_1D from colossalai.core import global_context as gpc -from .._common_utils import divide +from ..utils import divide + + +def set_parallel_input(input_parallel: bool): + os.environ[PARALLEL_INPUT_1D] = 'true' if input_parallel else '' + + +def 
get_parallel_input(): + return bool(os.environ[PARALLEL_INPUT_1D]) def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size, rank): diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 21764aca6b96..9cab1f71053d 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -6,21 +6,131 @@ from typing import Callable, Tuple import torch -import torch.distributed as dist import torch.nn.functional as F from colossalai.communication import broadcast from colossalai.context import ParallelMode, seed from colossalai.core import global_context as gpc from colossalai.nn import init as init from colossalai.registry import LAYERS -from colossalai.utils import get_current_device -from torch import Tensor +from colossalai.utils import get_current_device, conditional_context +from torch import Tensor, dtype from torch.nn.parameter import Parameter -from .._common_utils import divide, set_tensor_parallel_attribute_by_partition +from ..utils import divide, set_tensor_parallel_attribute_by_partition from ..base_layer import ParallelLayer from ._operation import FusedLayerNormAffineFunction1D -from ._utils import (gather_forward_split_backward, reduce_grad, reduce_input, split_forward_gather_backward) +from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_grad, reduce_input, set_parallel_input, + split_forward_gather_backward) + + +@LAYERS.register_module +class Linear1D(torch.nn.Module): + def __init__(self, + in_features: int, + out_features: int, + bias: bool = True, + dtype: torch.dtype = None, + gather_output: bool = False, + skip_bias_add: bool = False, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)): + super().__init__() + parallel_input = get_parallel_input() + if not parallel_input: + self.layer = Linear1D_Col(in_features, + out_features, + bias=bias, + dtype=dtype, + gather_output=gather_output, + skip_bias_add=skip_bias_add, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + set_parallel_input(True) + else: + self.layer = Linear1D_Row(in_features, + out_features, + bias=bias, + dtype=dtype, + parallel_input=parallel_input, + skip_bias_add=skip_bias_add, + weight_initializer=weight_initializer, + bias_initializer=bias_initializer) + set_parallel_input(False) + + @property + def weight(self): + return self.layer.weight + + @property + def bias(self): + return self.layer.bias + + def forward(self, *args): + return self.layer(*args) + + +@LAYERS.register_module +class Classifier1D(ParallelLayer): + """RowLinear with given weight""" + def __init__(self, + in_features: int, + num_classes: int, + weight: Parameter = None, + bias: bool = True, + dtype: dtype = None, + weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)), + bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)): + super().__init__() + self.in_features = in_features + self.num_classes = num_classes + self.parallel_input = get_parallel_input() + + # Divide the weight matrix along the last dimension. + self.input_size_per_partition = divide(in_features, gpc.tensor_parallel_size) + + # Parameters. + # Initialize weight. 
+ factory_kwargs = {'device': get_current_device(), 'dtype': dtype} + if weight is not None: + self.weight = weight + self.has_weight = False + else: + self.weight = Parameter(torch.empty(self.num_classes, self.input_size_per_partition, **factory_kwargs)) + self.has_weight = True + if bias: + self.bias = Parameter(torch.empty(self.num_classes, **factory_kwargs)) + else: + self.bias = None + with seed(ParallelMode.TENSOR): + self.reset_parameters(weight_initializer, bias_initializer) + self._set_tensor_parallel_attributes() + set_parallel_input(False) + + def reset_parameters(self, weight_initializer, bias_initializer) -> None: + fan_in, fan_out = self.in_features, self.num_classes + if self.has_weight: + weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out) + if self.bias is not None: + bias_initializer(self.bias, fan_in=fan_in) + broadcast(self.bias, gpc.get_ranks_in_group(ParallelMode.PARALLEL_1D)[0], ParallelMode.PARALLEL_1D) + + def _set_tensor_parallel_attributes(self): + if self.has_weight: + num_partition = gpc.get_world_size(ParallelMode.TENSOR) + set_tensor_parallel_attribute_by_partition(self.weight, num_partition) + + def forward(self, input_: Tensor) -> Tensor: + # Set up backprop all-reduce. + if self.parallel_input: + input_ = input_ + else: + input_ = split_forward_gather_backward(input_, ParallelMode.PARALLEL_1D, dim=-1) + + output_parallel = F.linear(input_, self.weight) + output = reduce_input(output_parallel, ParallelMode.PARALLEL_1D) + + output = output + self.bias + return output @LAYERS.register_module @@ -208,3 +318,67 @@ def reset_parameters(self): def forward(self, input): return FusedLayerNormAffineFunction1D.apply(input, self.weight, self.bias, self.normalized_shape, self.eps) + + +@LAYERS.register_module +class Embedding1D(ParallelLayer): + def __init__(self, + num_embeddings: int, + embedding_dim: int, + padding_idx: int = None, + dtype: dtype = None, + weight_initializer: Callable = init.normal_(), + *args, + **kwargs): + super().__init__() + + self.num_embeddings = num_embeddings + self.embed_dim = embedding_dim + embed_dim_per_partition = divide(embedding_dim, gpc.tensor_parallel_size) + + self.padding_idx = padding_idx + self.embed_args = args + self.embed_kwargs = kwargs + + self.weight = Parameter( + torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype)) + + self.reset_parameters(weight_initializer) + self._set_tensor_parallel_attributes() + set_parallel_input(False) + + def _set_tensor_parallel_attributes(self): + set_tensor_parallel_attribute_by_partition(self.weight, gpc.tensor_parallel_size) + + def reset_parameters(self, weight_initializer) -> None: + with seed(ParallelMode.TENSOR): + fan_in, fan_out = self.num_embeddings, self.embed_dim + weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out) + self._fill_padding_idx_with_zero() + + def _fill_padding_idx_with_zero(self) -> None: + if self.padding_idx is not None: + with torch.no_grad(): + self.weight[self.padding_idx].fill_(0) + + def forward(self, input_: Tensor) -> Tensor: + + output_parallel = F.embedding(input_, self.weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) + + output = gather_forward_split_backward(output_parallel, ParallelMode.PARALLEL_1D, dim=-1) + + return output + + +@LAYERS.register_module +class Dropout1D(ParallelLayer): + def __init__(self, p: float = 0.5, inplace: bool = False): + super().__init__() + self.parallel_input = get_parallel_input() + self.p = p + self.inplace = inplace + + def forward(self, 
input_: Tensor) -> Tensor: + with conditional_context(seed(ParallelMode.TENSOR), enable=self.parallel_input): + output = F.dropout(input_, self.p, self.training, self.inplace) + return output diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/nn/layer/parallel_2d/layers.py index 5b735aca5be2..3f67df382632 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/nn/layer/parallel_2d/layers.py @@ -13,9 +13,9 @@ from torch import Tensor, dtype from torch.nn import Parameter -from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple) +from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple from ..base_layer import ParallelLayer -from ._operation import (Matmul_AB_2D, add_bias_2d, all_gather_weight_2d, classifier_2d, layernorm_2d, split_batch_2d) +from ._operation import Matmul_AB_2D, add_bias_2d, all_gather_weight_2d, classifier_2d, layernorm_2d, split_batch_2d from ._utils import assert_summa_initialization, get_summa_dim_from_env diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index 963a1e8b2e1f..17706a74b8ad 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -13,11 +13,11 @@ from torch import Tensor, dtype from torch.nn import Parameter -from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple) +from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple from ..base_layer import ParallelLayer from ._operation import (Add_Bias_2p5D, Matmul_AB_2p5D, all_gather_weight_2p5d, classifier_2p5d, layernorm_2p5d, split_batch_2p5d) -from ._utils import (assert_tesseract_initialization, get_tesseract_dim_dep_from_env) +from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_env @LAYERS.register_module diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/nn/layer/parallel_3d/layers.py index 59b4498289d5..8f0a5058be67 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/nn/layer/parallel_3d/layers.py @@ -17,9 +17,9 @@ from torch import Tensor, dtype from torch.nn import Parameter -from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple) +from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple from ._operation import * -from ._utils import (get_depth_from_env, get_last_group, get_parallel_mode_from_env, swap_in_out_group) +from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_env, swap_in_out_group @LAYERS.register_module diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py new file mode 100644 index 000000000000..e791d9ecb2e9 --- /dev/null +++ b/colossalai/nn/layer/utils/__init__.py @@ -0,0 +1 @@ +from .common import * \ No newline at end of file diff --git a/colossalai/nn/layer/_common_utils.py b/colossalai/nn/layer/utils/common.py similarity index 94% rename from colossalai/nn/layer/_common_utils.py rename to colossalai/nn/layer/utils/common.py index d38e74f95b5e..83c3da7bc48b 100644 --- a/colossalai/nn/layer/_common_utils.py +++ b/colossalai/nn/layer/utils/common.py @@ -2,11 +2,12 @@ # -*- encoding: utf-8 -*- import collections.abc +import os from itertools import repeat import numpy as np import torch -from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS +from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_MODE from 
colossalai.utils import checkpoint from torch import Tensor, nn @@ -59,6 +60,10 @@ def set_tensor_parallel_attribute_by_partition(param, num_partitions): setattr(param, NUM_PARTITIONS, num_partitions) +def get_tensor_parallel_mode(): + return os.environ[TENSOR_PARALLEL_MODE] + + # From PyTorch internals diff --git a/colossalai/nn/layer/vanilla/layers.py b/colossalai/nn/layer/vanilla/layers.py index f19cca47544c..a89e5e1e9add 100644 --- a/colossalai/nn/layer/vanilla/layers.py +++ b/colossalai/nn/layer/vanilla/layers.py @@ -9,7 +9,7 @@ from torch import Tensor, dtype from torch import nn as nn -from .._common_utils import to_2tuple +from ..utils import to_2tuple def drop_path(x, drop_prob: float = 0., training: bool = False): diff --git a/colossalai/nn/loss/__init__.py b/colossalai/nn/loss/__init__.py index 58a9d625a647..65eef4a9e34b 100644 --- a/colossalai/nn/loss/__init__.py +++ b/colossalai/nn/loss/__init__.py @@ -2,6 +2,7 @@ from torch.nn.modules.loss import * from torch.nn.modules.loss import _Loss +from colossalai.nn.layer.utils import get_tensor_parallel_mode from .loss_2d import CrossEntropyLoss2D from .loss_2p5d import CrossEntropyLoss2p5D from .loss_3d import CrossEntropyLoss3D @@ -14,9 +15,10 @@ class CrossEntropyLoss(_Loss): - def __init__(self, reduction: bool = True, tensor_parallel: str = None, *args, **kwargs): + def __init__(self, reduction: bool = True, *args, **kwargs): super().__init__() - if tensor_parallel in [None, '1d']: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel in ['None', '1d']: reduction = 'mean' if reduction else 'none' self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs) else: diff --git a/colossalai/nn/metric/__init__.py b/colossalai/nn/metric/__init__.py index 036bcaa698d6..7ce17b08be99 100644 --- a/colossalai/nn/metric/__init__.py +++ b/colossalai/nn/metric/__init__.py @@ -4,6 +4,7 @@ from .accuracy_2d import Accuracy2D from .accuracy_2p5d import Accuracy2p5D from .accuracy_3d import Accuracy3D +from colossalai.nn.layer.utils import get_tensor_parallel_mode _parallel_accuracy = { '2d': Accuracy2D, @@ -13,9 +14,10 @@ class Accuracy(nn.Module): - def __init__(self, tensor_parallel: str = None): + def __init__(self): super().__init__() - if tensor_parallel in [None, '1d']: + tensor_parallel = get_tensor_parallel_mode() + if tensor_parallel in ['None', '1d']: self.acc = calc_acc else: self.acc = _parallel_accuracy[tensor_parallel]() diff --git a/model_zoo/vit/vit.py b/model_zoo/vit/vit.py index 4e3209f2c37b..450f334a4cc7 100644 --- a/model_zoo/vit/vit.py +++ b/model_zoo/vit/vit.py @@ -3,9 +3,8 @@ import torch from colossalai import nn as col_nn -from colossalai.context import ParallelMode, seed +from colossalai.nn.layer.utils import CheckpointModule from colossalai.registry import LAYERS, MODELS -from colossalai.utils import checkpoint from torch import dtype, nn __all__ = [ @@ -72,8 +71,7 @@ def __init__(self, dropout: float, dtype: dtype = None, flatten: bool = True, - init_method: str = 'torch', - tensor_parallel: str = None): + init_method: str = 'torch'): super().__init__() self.patch_embed = col_nn.PatchEmbedding(img_size, patch_size, @@ -81,19 +79,17 @@ def __init__(self, embedding_dim, dtype=dtype, flatten=flatten, - tensor_parallel=tensor_parallel, **_init_rules[init_method]['embed']) - self.dropout = nn.Dropout(dropout) + self.dropout = col_nn.Dropout(dropout) def forward(self, x): x = self.patch_embed(x) - with seed(ParallelMode.TENSOR): - x = self.dropout(x) + x = self.dropout(x) return x 
@LAYERS.register_module -class ViTSelfAttention(nn.Module): +class ViTSelfAttention(CheckpointModule): def __init__(self, dim: int, num_heads: int, @@ -102,27 +98,17 @@ def __init__(self, bias: bool = True, dtype: dtype = None, checkpoint: bool = False, - init_method: str = 'torch', - tensor_parallel: str = None): - super().__init__() + init_method: str = 'torch'): + super().__init__(checkpoint) self.attention_head_size = dim // num_heads - self.checkpoint = checkpoint - self.tensor_parallel = tensor_parallel - self.query_key_value = col_nn.Linear(dim, 3 * dim, dtype=dtype, bias=bias, - tensor_parallel='1d_col' if tensor_parallel == '1d' else tensor_parallel, **_init_rules[init_method]['transformer']) - self.attention_dropout = nn.Dropout(attention_dropout) - self.dense = col_nn.Linear(dim, - dim, - dtype=dtype, - bias=True, - tensor_parallel='1d_row' if tensor_parallel == '1d' else tensor_parallel, - **_init_rules[init_method]['transformer']) - self.dropout = nn.Dropout(dropout) + self.attention_dropout = col_nn.Dropout(attention_dropout) + self.dense = col_nn.Linear(dim, dim, dtype=dtype, bias=True, **_init_rules[init_method]['transformer']) + self.dropout = col_nn.Dropout(dropout) self.softmax = nn.Softmax(dim=-1) def _forward(self, x): @@ -138,8 +124,7 @@ def _forward(self, x): x = torch.matmul(q, k.transpose(-1, -2)) x = x / math.sqrt(self.attention_head_size) x = self.softmax(x) - with seed(ParallelMode.TENSOR): - x = self.attention_dropout(x) + x = self.attention_dropout(x) x = torch.matmul(x, v) x = x.transpose(1, 2) @@ -147,26 +132,13 @@ def _forward(self, x): x = x.reshape(new_context_layer_shape) x = self.dense(x) - if self.tensor_parallel == '1d': - x = self.dropout(x) - else: - with seed(ParallelMode.TENSOR): - x = self.dropout(x) + x = self.dropout(x) return x - def _checkpoint_forward(self, x): - return checkpoint(self._forward, x) - - def forward(self, x): - if self.checkpoint: - return self._checkpoint_forward(x) - else: - return self._forward(x) - @LAYERS.register_module -class ViTMLP(nn.Module): +class ViTMLP(CheckpointModule): def __init__(self, dim: int, mlp_ratio: int, @@ -175,50 +147,30 @@ def __init__(self, dtype: dtype = None, bias: bool = True, checkpoint: bool = False, - init_method: str = 'torch', - tensor_parallel: str = None): - super().__init__() - self.checkpoint = checkpoint - self.tensor_parallel = tensor_parallel - + init_method: str = 'torch'): + super().__init__(checkpoint) self.dense_1 = col_nn.Linear(dim, mlp_ratio * dim, dtype=dtype, bias=bias, - tensor_parallel='1d_col' if tensor_parallel == '1d' else tensor_parallel, **_init_rules[init_method]['transformer']) self.activation = activation + self.dropout_1 = col_nn.Dropout(dropout) self.dense_2 = col_nn.Linear(mlp_ratio * dim, dim, dtype=dtype, bias=bias, - tensor_parallel='1d_row' if tensor_parallel == '1d' else tensor_parallel, **_init_rules[init_method]['transformer']) - self.dropout = nn.Dropout(dropout) + self.dropout_2 = col_nn.Dropout(dropout) def _forward(self, x): x = self.dense_1(x) x = self.activation(x) - with seed(ParallelMode.TENSOR): - x = self.dropout(x) + x = self.dropout_1(x) x = self.dense_2(x) - if self.tensor_parallel == '1d': - x = self.dropout(x) - else: - with seed(ParallelMode.TENSOR): - x = self.dropout(x) - + x = self.dropout_2(x) return x - def _checkpoint_forward(self, x): - return checkpoint(self._forward, x) - - def forward(self, x): - if self.checkpoint: - return self._checkpoint_forward(x) - else: - return self._forward(x) - @LAYERS.register_module class 
ViTHead(nn.Module): @@ -228,19 +180,14 @@ def __init__(self, representation_size: int = None, dtype: dtype = None, bias: bool = True, - init_method: str = 'torch', - tensor_parallel: str = None): + init_method: str = 'torch'): super().__init__() if representation_size: - tensor_parallel_kwargs = {'tensor_parallel': '1d_col' if tensor_parallel == '1d' else tensor_parallel} - if tensor_parallel == '1d': - tensor_parallel_kwargs['gather_output'] = True self.representation = col_nn.Linear(dim, representation_size, bias=bias, dtype=dtype, - **_init_rules[init_method]['head'], - **tensor_parallel_kwargs) + **_init_rules[init_method]['head']) else: self.representation = None representation_size = dim @@ -249,7 +196,6 @@ def __init__(self, num_classes, dtype=dtype, bias=bias, - tensor_parallel=tensor_parallel, **_init_rules[init_method]['head']) def forward(self, x): @@ -273,10 +219,9 @@ def __init__(self, dtype: dtype = None, bias: bool = True, checkpoint: bool = False, - init_method: str = 'torch', - tensor_parallel: str = None): + init_method: str = 'torch'): super().__init__() - self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype, tensor_parallel=tensor_parallel) + self.norm1 = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype) self.attn = ViTSelfAttention(dim=dim, num_heads=num_heads, attention_dropout=attention_dropout, @@ -284,10 +229,9 @@ def __init__(self, bias=bias, dtype=dtype, checkpoint=checkpoint, - init_method=init_method, - tensor_parallel=tensor_parallel) + init_method=init_method) self.drop_path = col_nn.DropPath(drop_path) if drop_path > 0. else nn.Identity() - self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype, tensor_parallel=tensor_parallel) + self.norm2 = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype) self.mlp = ViTMLP(dim=dim, mlp_ratio=mlp_ratio, activation=activation, @@ -295,8 +239,7 @@ def __init__(self, dtype=dtype, bias=bias, checkpoint=checkpoint, - init_method=init_method, - tensor_parallel=tensor_parallel) + init_method=init_method) def forward(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) @@ -323,20 +266,16 @@ def __init__(self, dtype: dtype = None, bias: bool = True, checkpoint: bool = False, - init_method: str = 'torch', - tensor_parallel: str = None): + init_method: str = 'torch'): super().__init__() - embed = ViTEmbedding( - img_size=img_size, - patch_size=patch_size, - in_chans=in_chans, - embedding_dim=dim, - dropout=dropout, - dtype=dtype, - init_method=init_method, - tensor_parallel=tensor_parallel, - ) + embed = ViTEmbedding(img_size=img_size, + patch_size=patch_size, + in_chans=in_chans, + embedding_dim=dim, + dropout=dropout, + dtype=dtype, + init_method=init_method) # stochastic depth decay rule dpr = [x.item() for x in torch.linspace(0, drop_path, depth)] @@ -353,26 +292,17 @@ def __init__(self, bias=bias, checkpoint=checkpoint, init_method=init_method, - tensor_parallel=tensor_parallel, ) for i in range(depth) ] - norm = col_nn.LayerNorm( - normalized_shape=dim, - eps=1e-6, - dtype=dtype, - tensor_parallel=tensor_parallel, - ) + norm = col_nn.LayerNorm(normalized_shape=dim, eps=1e-6, dtype=dtype) - head = ViTHead( - dim=dim, - num_classes=num_classes, - representation_size=representation_size, - dtype=dtype, - bias=bias, - init_method=init_method, - tensor_parallel=tensor_parallel, - ) + head = ViTHead(dim=dim, + num_classes=num_classes, + representation_size=representation_size, + dtype=dtype, + bias=bias, + init_method=init_method) self.layers = nn.Sequential( 
embed, diff --git a/tests/test_comm/test_comm.py b/tests/test_comm/test_comm.py index e2f981af5757..a8bae5c085ce 100644 --- a/tests/test_comm/test_comm.py +++ b/tests/test_comm/test_comm.py @@ -50,7 +50,7 @@ def check_all_reduce(): def check_layer(rank, world_size): - launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=30010, backend='nccl') + launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=40011, backend='nccl') assert dist.get_rank() == gpc.get_global_rank() print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size())) diff --git a/tests/test_context/test_2d_init.py b/tests/test_context/test_2d_init.py index 3ad376750036..387a8b5076ae 100644 --- a/tests/test_context/test_2d_init.py +++ b/tests/test_context/test_2d_init.py @@ -87,7 +87,7 @@ def test_2d_init(): test_fn = partial(init_2d, world_size=world_size, backend='gloo', - port='29900', + port='39900', host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_context/test_2p5d_init.py b/tests/test_context/test_2p5d_init.py index 1ce5f8ff45a3..30fc5809236d 100644 --- a/tests/test_context/test_2p5d_init.py +++ b/tests/test_context/test_2p5d_init.py @@ -111,7 +111,7 @@ def test_2halfd_init(): test_fn = partial(init_2halfd, world_size=world_size, backend='gloo', - port='29901', + port='39901', host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_context/test_3d_init.py b/tests/test_context/test_3d_init.py index 5c66ab6a02ff..d4a881ece8c2 100644 --- a/tests/test_context/test_3d_init.py +++ b/tests/test_context/test_3d_init.py @@ -104,7 +104,7 @@ def test_3d_init(): test_fn = partial(init_3d, world_size=world_size, backend='gloo', - port='29902', + port='39902', host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py index 8fd8a6ae9244..28e7e5350bbe 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py @@ -28,11 +28,11 @@ def run_trainer(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=30000, backend='nccl') + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=40001, backend='nccl') logger = get_dist_logger() - model = vit_tiny_patch4_32(tensor_parallel='1d') + model = vit_tiny_patch4_32() pipe_model = build_pipeline_model(model.layers, num_chunks=1) # build dataloaders @@ -54,7 +54,7 @@ def run_trainer(rank, world_size): test_dataloader = get_dataloader(dataset=test_dataset, batch_size=BATCH_SIZE, pin_memory=True) # build criterion - criterion = CrossEntropyLoss(tensor_parallel='1d') + criterion = CrossEntropyLoss() # optimizer optimizer = torch.optim.Adam(pipe_model.parameters(), lr=0.001, weight_decay=0) @@ -78,7 +78,7 @@ def run_trainer(rank, world_size): hook_list = [ hooks.LossHook(), hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), - hooks.AccuracyHook(accuracy_func=Accuracy(tensor_parallel='1d')), + hooks.AccuracyHook(accuracy_func=Accuracy()), hooks.LogMetricByEpochHook(logger), ] diff --git a/tests/test_engine/test_engine/test_engine_apex_amp.py b/tests/test_engine/test_engine/test_engine_apex_amp.py index c8ee13de1d21..ec14ff5f16b8 100644 --- a/tests/test_engine/test_engine/test_engine_apex_amp.py +++ 
b/tests/test_engine/test_engine/test_engine_apex_amp.py @@ -45,7 +45,7 @@ def run_engine(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29910, + port=39910, backend='nccl' ) diff --git a/tests/test_engine/test_engine/test_engine_naive_amp.py b/tests/test_engine/test_engine/test_engine_naive_amp.py index e60b0bbe96e3..91645392fddf 100644 --- a/tests/test_engine/test_engine/test_engine_naive_amp.py +++ b/tests/test_engine/test_engine/test_engine_naive_amp.py @@ -45,7 +45,7 @@ def run_engine(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29911, + port=39911, backend='nccl' ) diff --git a/tests/test_engine/test_engine/test_engine_no_amp.py b/tests/test_engine/test_engine/test_engine_no_amp.py index 8bf0baaea6fb..6b4296f4f919 100644 --- a/tests/test_engine/test_engine/test_engine_no_amp.py +++ b/tests/test_engine/test_engine/test_engine_no_amp.py @@ -2,7 +2,6 @@ import os import pytest import torch -import os.path as osp from pathlib import Path import torch.nn as nn import torch.multiprocessing as mp @@ -10,10 +9,8 @@ from torchvision import transforms from torch.optim import Adam from colossalai.core import global_context as gpc -from colossalai.amp import AMP_TYPE from colossalai.logging import get_dist_logger from colossalai.utils import report_memory_usage, get_dataloader -from colossalai.initialize import get_default_parser from torchvision.models import resnet18 from torchvision.datasets import CIFAR10 from functools import partial @@ -42,7 +39,7 @@ def run_engine(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29912, + port=39912, backend='nccl' ) diff --git a/tests/test_engine/test_engine/test_engine_torch_amp.py b/tests/test_engine/test_engine/test_engine_torch_amp.py index 289a8f1b62d5..2e6776f1f046 100644 --- a/tests/test_engine/test_engine/test_engine_torch_amp.py +++ b/tests/test_engine/test_engine/test_engine_torch_amp.py @@ -2,7 +2,6 @@ import os import pytest import torch -import os.path as osp from pathlib import Path import torch.nn as nn import torch.multiprocessing as mp @@ -13,7 +12,6 @@ from colossalai.amp import AMP_TYPE from colossalai.logging import get_dist_logger from colossalai.utils import report_memory_usage, get_dataloader -from colossalai.initialize import get_default_parser from torchvision.models import resnet18 from torchvision.datasets import CIFAR10 from functools import partial @@ -43,7 +41,7 @@ def run_engine(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29913, + port=39913, backend='nccl' ) diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_layers/test_1d/test_1d.py index f0f977bea47a..bc7f09353d7b 100644 --- a/tests/test_layers/test_1d/test_1d.py +++ b/tests/test_layers/test_1d/test_1d.py @@ -26,7 +26,7 @@ def check_layer(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29920, + port=39920, backend='nccl') check_linear_col() diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_layers/test_2d/test_2d.py index 02b0a9cf13ae..31829b8b91b2 100644 --- a/tests/test_layers/test_2d/test_2d.py +++ b/tests/test_layers/test_2d/test_2d.py @@ -39,7 +39,7 @@ def check_layer_and_operation(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29921, + port=39921, backend='nccl') # check_operations() diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_layers/test_2p5d/test_2p5d.py index f3a180e4d1ab..a09fd1edd0e1 100644 --- 
a/tests/test_layers/test_2p5d/test_2p5d.py +++ b/tests/test_layers/test_2p5d/test_2p5d.py @@ -34,7 +34,7 @@ def check_layer_and_operation(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29922, + port=39922, backend='nccl') check_operations() diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_layers/test_3d/test_3d.py index 39e5d8e4516b..6aeec775fdf4 100644 --- a/tests/test_layers/test_3d/test_3d.py +++ b/tests/test_layers/test_3d/test_3d.py @@ -28,7 +28,7 @@ def check_layer(): def check_layer_and_operation(rank, world_size): - launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29923, backend='nccl') + launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39923, backend='nccl') check_layer() gpc.destroy() torch.cuda.empty_cache() diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_layers/test_sequence/test_sequence.py index 56148e6732ab..d76fc35badb4 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_layers/test_sequence/test_sequence.py @@ -4,7 +4,7 @@ import pytest import torch import torch.multiprocessing as mp -from colossalai.initialize import launch, get_default_parser +from colossalai.initialize import launch from colossalai.logging import get_dist_logger from checks_seq.check_layer_seq import * from functools import partial @@ -28,7 +28,7 @@ def run_check_sequence(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29924, + port=39924, backend='nccl') logger = get_dist_logger() logger.info('Distributed environment is initialzied.', ranks=[0]) diff --git a/tests/test_trainer/test_pipeline/test_p2p.py b/tests/test_trainer/test_pipeline/test_p2p.py index ce60955aeef6..4b2c00d6d087 100644 --- a/tests/test_trainer/test_pipeline/test_p2p.py +++ b/tests/test_trainer/test_pipeline/test_p2p.py @@ -129,7 +129,7 @@ def run_check(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29932, + port=39932, backend='nccl' ) logger = get_dist_logger() diff --git a/tests/test_trainer/test_pipeline/test_partition.py b/tests/test_trainer/test_pipeline/test_partition.py index 9f011c0e2b8e..c0ecd410bbd7 100644 --- a/tests/test_trainer/test_pipeline/test_partition.py +++ b/tests/test_trainer/test_pipeline/test_partition.py @@ -21,7 +21,7 @@ def run_partition(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29933, + port=39933, backend='nccl' ) logger = get_dist_logger() diff --git a/tests/test_trainer/test_pipeline/test_pipeline_schedule.py b/tests/test_trainer/test_pipeline/test_pipeline_schedule.py index be2f7ab30964..5b2c1986fb7c 100644 --- a/tests/test_trainer/test_pipeline/test_pipeline_schedule.py +++ b/tests/test_trainer/test_pipeline/test_pipeline_schedule.py @@ -35,7 +35,7 @@ def run_schedule(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29934, + port=39934, backend='nccl') # build model diff --git a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py index af4180ade2c2..b44e609d86b8 100644 --- a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py +++ b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py @@ -27,7 +27,7 @@ def run_trainer_no_pipeline(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29930, backend='nccl') + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', 
port=39930, backend='nccl') # build model model = resnet18(num_classes=10) diff --git a/tests/test_trainer/test_trainer_with_pipe_schedule.py b/tests/test_trainer/test_trainer_with_pipe_schedule.py index c6bb5ad155d7..d1e1a59101b7 100644 --- a/tests/test_trainer/test_trainer_with_pipe_schedule.py +++ b/tests/test_trainer/test_trainer_with_pipe_schedule.py @@ -26,7 +26,7 @@ def run_trainer_with_pipeline(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29931, backend='nccl') + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39931, backend='nccl') # build model model = resnet18(num_classes=10) diff --git a/tests/test_utils/test_gradient_accumluation.py b/tests/test_utils/test_gradient_accumluation.py index 6a709f7db622..1c8af272f8fd 100644 --- a/tests/test_utils/test_gradient_accumluation.py +++ b/tests/test_utils/test_gradient_accumluation.py @@ -11,8 +11,7 @@ from torch.optim import Adam from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.utils import report_memory_usage, get_dataloader -from colossalai.initialize import get_default_parser +from colossalai.utils import get_dataloader from torchvision.models import resnet18 from torchvision.datasets import CIFAR10 @@ -40,7 +39,7 @@ def run_no_pipeline(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29500, + port=39501, backend='nccl' ) diff --git a/tests/test_zero_data_parallel/test_zero_level_2.py b/tests/test_zero_data_parallel/test_zero_level_2.py index 5da2822551aa..de7779cab877 100644 --- a/tests/test_zero_data_parallel/test_zero_level_2.py +++ b/tests/test_zero_data_parallel/test_zero_level_2.py @@ -39,7 +39,7 @@ def run_dist(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29940, + port=39940, backend='nccl') # build model diff --git a/tests/test_zero_data_parallel/test_zero_level_3.py b/tests/test_zero_data_parallel/test_zero_level_3.py index f1fe45b2b3c3..d0f7adc8d9d2 100644 --- a/tests/test_zero_data_parallel/test_zero_level_3.py +++ b/tests/test_zero_data_parallel/test_zero_level_3.py @@ -51,7 +51,7 @@ def run_dist(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=29941, + port=39941, backend='nccl') # build model diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py index 58c1e98b9bb7..eeb487c33514 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py @@ -41,10 +41,10 @@ def train_epoch(engine, train_dataloader): def run_2d_parallel_vision_transformer_level_2(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29950, backend='nccl') + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39950, backend='nccl') # build model - model = vit_lite_depth7_patch4_32(tensor_parallel='2d') + model = vit_lite_depth7_patch4_32() # build dataloader# build dataloaders train_dataset = CIFAR10(root=Path(os.environ['DATA']), @@ -62,7 +62,7 @@ def run_2d_parallel_vision_transformer_level_2(rank, world_size): # build optimizer and loss optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - criterion = CrossEntropyLoss(tensor_parallel='2d') + criterion = CrossEntropyLoss() engine, train_dataloader, *args = colossalai.initialize(model=model, 
optimizer=optimizer, diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py index 0b08a58f2b8b..720e460e57e9 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py @@ -41,10 +41,10 @@ def train_epoch(engine, train_dataloader): def run_2d_parallel_vision_transformer_level_3(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=29951, backend='nccl') + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39951, backend='nccl') # build model - model = vit_lite_depth7_patch4_32(tensor_parallel='2d') + model = vit_lite_depth7_patch4_32() # build dataloader# build dataloaders train_dataset = CIFAR10(root=Path(os.environ['DATA']), @@ -62,7 +62,7 @@ def run_2d_parallel_vision_transformer_level_3(rank, world_size): # build optimizer and loss optimizer = torch.optim.Adam(model.parameters(), lr=0.001) - criterion = CrossEntropyLoss(tensor_parallel='2d') + criterion = CrossEntropyLoss() engine, train_dataloader, *args = colossalai.initialize(model=model, optimizer=optimizer, From 17087da88d16532c1972b4d9e9785aec3a9024fa Mon Sep 17 00:00:00 2001 From: BoxiangW <45734921+BoxiangW@users.noreply.github.com> Date: Tue, 28 Dec 2021 21:36:57 +0800 Subject: [PATCH 2/3] fixed 2.5d runtime issue --- colossalai/nn/layer/parallel_1d/layers.py | 4 +- .../nn/layer/parallel_2p5d/_operation.py | 183 +++++++++++++----- colossalai/nn/layer/parallel_2p5d/layers.py | 18 +- colossalai/utils/__init__.py | 38 ++-- colossalai/utils/common.py | 23 ++- tests/test_comm/test_comm.py | 9 +- tests/test_context/test_2d_init.py | 9 +- tests/test_context/test_2p5d_init.py | 4 +- tests/test_context/test_3d_init.py | 5 +- .../test_cifar_with_data_pipeline_tensor.py | 8 +- .../test_engine/test_engine_apex_amp.py | 28 ++- .../test_engine/test_engine_naive_amp.py | 29 ++- .../test_engine/test_engine_no_amp.py | 25 ++- .../test_engine/test_engine_torch_amp.py | 27 ++- tests/test_layers/test_1d/test_1d.py | 12 +- tests/test_layers/test_2d/test_2d.py | 13 +- tests/test_layers/test_2p5d/test_2p5d.py | 16 +- tests/test_layers/test_3d/test_3d.py | 7 +- .../test_sequence/test_sequence.py | 7 +- .../test_pipeline/resnet_config.py | 1 + tests/test_trainer/test_pipeline/test_p2p.py | 12 +- .../test_pipeline/test_partition.py | 9 +- .../test_pipeline/test_pipeline_schedule.py | 25 ++- .../test_trainer_with_non_pipe_schedule.py | 8 +- .../test_trainer_with_pipe_schedule.py | 8 +- .../test_utils/test_gradient_accumluation.py | 23 ++- .../test_zero_level_2.py | 18 +- .../test_zero_level_3.py | 18 +- .../test_vit_2d_level_2.py | 8 +- .../test_vit_2d_level_3.py | 8 +- 30 files changed, 349 insertions(+), 254 deletions(-) diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 9cab1f71053d..497feb4a9479 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -65,8 +65,8 @@ def weight(self): def bias(self): return self.layer.bias - def forward(self, *args): - return self.layer(*args) + def forward(self, input_: Tensor) -> Tensor: + return self.layer(input_) @LAYERS.register_module diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/nn/layer/parallel_2p5d/_operation.py index 5a38c5d37ec6..a4b062647328 100644 --- a/colossalai/nn/layer/parallel_2p5d/_operation.py +++ 
b/colossalai/nn/layer/parallel_2p5d/_operation.py @@ -120,30 +120,56 @@ def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple ctx.save_for_backward(A, B) A_shape = A.shape - A = A.reshape((-1, A_shape[-1])).contiguous() + A = A.reshape((-1, A_shape[-1])) B_shape = B.shape - B = B.reshape((-1, B_shape[-1])).contiguous() + B = B.reshape((-1, B_shape[-1])) C_shape = (A.shape[0], B.shape[-1]) C = torch.zeros(C_shape, dtype=A.dtype, device=get_current_device()) - A_list = [torch.empty_like(A) for _ in range(gpc.get_world_size(row_parallel_mode) - 1)] - B_list = [torch.empty_like(B) for _ in range(gpc.get_world_size(col_parallel_mode) - 1)] - A_list.insert(gpc.get_local_rank(row_parallel_mode), A) - B_list.insert(gpc.get_local_rank(col_parallel_mode), B) - op_a = dist.all_gather(A_list, A, group=gpc.get_group(row_parallel_mode), async_op=True) - op_a.wait() - op_b = dist.all_gather(B_list, B, group=gpc.get_group(col_parallel_mode), async_op=True) - for op in [op_a, op_b]: - op.wait() + # use circular buffer to store the communication tensor + # 2 is enough for all cases + A_list = [torch.empty_like(A) for _ in range(2)] + B_list = [torch.empty_like(B) for _ in range(2)] + + row_group = gpc.get_group(row_parallel_mode) + col_group = gpc.get_group(col_parallel_mode) + + src_a = tesseract_dim * row_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * tensor_parallel_size + src_b = col_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * tensor_parallel_size + + opa = [None] * 2 + opb = [None] * 2 + + A_list[0].copy_(A) + B_list[0].copy_(B) + opa[0] = dist.broadcast(A_list[0], src=src_a, group=row_group, async_op=True) + opb[0] = dist.broadcast(B_list[0], src=src_b, group=col_group, async_op=True) + cur = 0 for i in range(tesseract_dim): - src_a = i + tesseract_dim * row_rank - src_b = i + tesseract_dim * col_rank - src_a = src_a % tesseract_dim - src_b = src_b % tesseract_dim - A_temp = A_list[src_a] - B_temp = B_list[src_b] - torch.addmm(C, A_temp, B_temp, out=C) + if i != tesseract_dim - 1: + A_list[1 - cur].copy_(A) + opa[1 - cur] = dist.broadcast(A_list[1 - cur], + src=src_a + 1, + group=row_group, + async_op=True) + B_list[1 - cur].copy_(B) + opb[1 - cur] = dist.broadcast(B_list[1 - cur], + src=src_b + tesseract_dim, + group=col_group, + async_op=True) + + if opa[cur] is not None: + opa[cur].wait() + if opb[cur] is not None: + opb[cur].wait() + + torch.addmm(C, A_list[cur], B_list[cur], out=C) + cur = 1 - cur + src_a += 1 + src_b += tesseract_dim out = C.reshape(out_shape) if ctx: @@ -201,20 +227,55 @@ def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple C_shape = (A.shape[0], B.shape[0]) C = torch.empty(C_shape, dtype=A.dtype, device=get_current_device()) + # use circular buffer to store the communication tensor + # 2 is enough for all cases + B_list = [torch.empty_like(B) for _ in range(2)] + C_list = [torch.empty_like(C) for _ in range(2)] + + row_group = gpc.get_group(row_parallel_mode) + col_group = gpc.get_group(col_parallel_mode) + + src_b = col_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * tensor_parallel_size + src_c = tesseract_dim * row_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * 
tensor_parallel_size + + opb = [None] * 2 + opr = [None] * 2 + + B_list[0].copy_(B) + opb[0] = dist.broadcast(B_list[0], src=src_b, group=col_group, async_op=True) + cur = 0 + for i in range(tesseract_dim): - B_temp = B.clone() - src_b = col_rank + i * tesseract_dim + dep_rank * ( - tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size - dist.broadcast(B_temp, src=src_b, group=gpc.get_group(col_parallel_mode)) - C_temp = torch.matmul(A, B_temp.transpose(0, 1)) - src_c = i + row_rank * tesseract_dim + dep_rank * ( - tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size - dist.reduce(C_temp, dst=src_c, group=gpc.get_group(row_parallel_mode)) - if i == col_rank: - C = C_temp.clone() + if i != tesseract_dim - 1: + B_list[1 - cur].copy_(B) + opb[1 - cur] = dist.broadcast(B_list[1 - cur], + src=src_b + tesseract_dim, + group=col_group, + async_op=True) + + if opr[cur] is not None: + opr[cur].wait() + if i - 2 == col_rank: + C.copy_(C_list[cur]) + + if opb[cur] is not None: + opb[cur].wait() + + torch.matmul(A, B_list[cur].transpose(0, 1), out=C_list[cur]) + opr[cur] = dist.reduce(C_list[cur], dst=src_c, group=row_group, async_op=True) + cur = 1 - cur + src_b += tesseract_dim + src_c += 1 + + for op in opr: + op.wait() + if tesseract_dim - 2 == col_rank: + C.copy_(C_list[cur]) + if tesseract_dim - 1 == col_rank: + C.copy_(C_list[1 - cur]) out = C.reshape(out_shape) if ctx: @@ -272,20 +333,55 @@ def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple C_shape = (A.shape[-1], B.shape[-1]) C = torch.empty(C_shape, dtype=A.dtype, device=get_current_device()) + # use circular buffer to store the communication tensor + # 2 is enough for all cases + A_list = [torch.empty_like(A) for _ in range(2)] + C_list = [torch.empty_like(C) for _ in range(2)] + + row_group = gpc.get_group(row_parallel_mode) + col_group = gpc.get_group(col_parallel_mode) + + src_a = tesseract_dim * row_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * tensor_parallel_size + src_c = col_rank + tesseract_dim ** 2 * dep_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + pipeline_parallel_rank * tensor_parallel_size + + opa = [None] * 2 + opr = [None] * 2 + + A_list[0].copy_(A) + opa[0] = dist.broadcast(A_list[0], src=src_a, group=row_group, async_op=True) + cur = 0 + for i in range(tesseract_dim): - A_temp = A.clone() - src_a = i + row_rank * tesseract_dim + dep_rank * ( - tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size - dist.broadcast(A_temp, src=src_a, group=get_parallel_group(row_parallel_mode)) - C_temp = torch.matmul(A_temp.transpose(0, 1), B) - src_c = col_rank + i * tesseract_dim + dep_rank * ( - tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ - pipeline_parallel_rank * tensor_parallel_size - dist.reduce(C_temp, dst=src_c, group=get_parallel_group(col_parallel_mode)) - if i == row_rank: - C = C_temp.clone() + if i != tesseract_dim - 1: + A_list[1 - cur].copy_(A) + opa[1 - cur] = dist.broadcast(A_list[1 - cur], + src=src_a + 1, + group=row_group, + async_op=True) + + if opr[cur] is not None: + opr[cur].wait() + if i - 2 == row_rank: + C.copy_(C_list[cur]) + + if opa[cur] is not None: + 
opa[cur].wait() + + torch.matmul(A_list[cur].transpose(0, 1), B, out=C_list[cur]) + opr[cur] = dist.reduce(C_list[cur], dst=src_c, group=col_group, async_op=True) + cur = 1 - cur + src_a += 1 + src_c += tesseract_dim + + for op in opr: + op.wait() + if tesseract_dim - 2 == row_rank: + C.copy_(C_list[cur]) + if tesseract_dim - 1 == row_rank: + C.copy_(C_list[1 - cur]) out = C.reshape(out_shape) if ctx: @@ -333,8 +429,7 @@ def forward(ctx: Any, input: Tensor, bias: Tensor, output_size_per_partition: in bias_temp = bias.clone() else: bias_temp = torch.zeros(output_size_per_partition, dtype=bias.dtype, device=get_current_device()) - src_rank = col_rank + dep_rank * ( - tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ + src_rank = col_rank + dep_rank * tesseract_dim ** 2 + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \ pipeline_parallel_rank * tensor_parallel_size dist.broadcast(bias_temp, src=src_rank, group=get_parallel_group(col_parallel_mode)) diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index 17706a74b8ad..c2bef24d5af6 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -231,7 +231,7 @@ def __init__(self, self.num_patches = self.grid_size[0] * self.grid_size[1] self.flatten = flatten self.embed_size = embed_size - self.embed_size_per_partition = embed_size // (self.tesseract_dep * self.tesseract_dim**2) + self.embed_size_per_partition = embed_size // self.tesseract_dim**2 with seed(ParallelMode.TENSOR): self.weight = Parameter( @@ -251,10 +251,10 @@ def __init__(self, self._set_tensor_parallel_attribute() def _set_tensor_parallel_attribute(self): - set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2) - set_tensor_parallel_attribute_by_partition(self.bias, self.tesseract_dep * self.tesseract_dim**2) - set_tensor_parallel_attribute_by_partition(self.cls_token, self.tesseract_dep * self.tesseract_dim**2) - set_tensor_parallel_attribute_by_partition(self.pos_embed, self.tesseract_dep * self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.bias, self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.cls_token, self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.pos_embed, self.tesseract_dim**2) def reset_parameters(self, weight_initializer, bias_initializer, position_embed_initializer): with seed(ParallelMode.TENSOR): @@ -303,7 +303,7 @@ def __init__(self, self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env() self.num_embeddings = num_embeddings self.embed_dim = embedding_dim - embed_dim_per_partition = embedding_dim // (self.tesseract_dep * self.tesseract_dim**2) + embed_dim_per_partition = embedding_dim // self.tesseract_dim**2 self.padding_idx = padding_idx self.embed_args = args @@ -316,7 +316,7 @@ def __init__(self, self._set_tensor_parallel_attributes() def _set_tensor_parallel_attributes(self): - set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dim**2) def reset_parameters(self, weight_initializer) -> None: with seed(ParallelMode.TENSOR): @@ -359,7 +359,7 @@ def __init__(self, self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env() # partitioning dimension - 
self.input_size_per_partition = divide(self.in_features, self.tesseract_dep * self.tesseract_dim**2) + self.input_size_per_partition = divide(self.in_features, self.tesseract_dim**2) if weight is not None: self.weight = weight @@ -378,7 +378,7 @@ def __init__(self, def _set_tensor_parallel_attributes(self): if self.has_weight: - set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2) + set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dim**2) def reset_parameters(self, weight_initializer, bias_initializer) -> None: with seed(ParallelMode.TENSOR): diff --git a/colossalai/utils/__init__.py b/colossalai/utils/__init__.py index 7430ab100c1a..b346f1a57798 100644 --- a/colossalai/utils/__init__.py +++ b/colossalai/utils/__init__.py @@ -1,27 +1,19 @@ from .activation_checkpoint import checkpoint -from .common import (print_rank_0, sync_model_param_in_dp, is_dp_rank_0, - is_tp_rank_0, is_no_pp_or_last_stage, is_using_ddp, - is_using_pp, conditional_context, is_model_parallel_parameter, - clip_grad_norm_fp32, count_zeros_fp32, copy_tensor_parallel_attributes, - param_is_not_tensor_parallel_duplicate, switch_virtual_pipeline_parallel_rank) -from .cuda import get_current_device, synchronize, empty_cache, set_to_cuda +from .common import (clip_grad_norm_fp32, conditional_context, copy_tensor_parallel_attributes, count_zeros_fp32, + free_port, is_dp_rank_0, is_model_parallel_parameter, is_no_pp_or_last_stage, is_tp_rank_0, + is_using_ddp, is_using_pp, multi_tensor_applier, param_is_not_tensor_parallel_duplicate, + print_rank_0, switch_virtual_pipeline_parallel_rank, sync_model_param_in_dp) +from .cuda import empty_cache, get_current_device, set_to_cuda, synchronize +from .data_sampler import DataParallelSampler, get_dataloader +from .gradient_accumulation import accumulate_gradient from .memory import report_memory_usage from .timer import MultiTimer, Timer -from .multi_tensor_apply import multi_tensor_applier -from .gradient_accumulation import accumulate_gradient -from .data_sampler import DataParallelSampler, get_dataloader -__all__ = ['checkpoint', - 'print_rank_0', 'sync_model_param_in_dp', 'is_dp_rank_0', - 'is_tp_rank_0', 'is_no_pp_or_last_stage', 'is_using_ddp', - 'is_using_pp', 'conditional_context', 'is_model_parallel_parameter', - 'clip_grad_norm_fp32', 'count_zeros_fp32', 'copy_tensor_parallel_attributes', - 'param_is_not_tensor_parallel_duplicate', - 'get_current_device', 'synchronize', 'empty_cache', 'set_to_cuda', - 'report_memory_usage', - 'Timer', 'MultiTimer', - 'multi_tensor_applier', - 'accumulate_gradient', - 'DataParallelSampler', 'get_dataloader', - 'switch_virtual_pipeline_parallel_rank' - ] +__all__ = [ + 'checkpoint', 'free_port', 'print_rank_0', 'sync_model_param_in_dp', 'is_dp_rank_0', 'is_tp_rank_0', + 'is_no_pp_or_last_stage', 'is_using_ddp', 'is_using_pp', 'conditional_context', 'is_model_parallel_parameter', + 'clip_grad_norm_fp32', 'count_zeros_fp32', 'copy_tensor_parallel_attributes', + 'param_is_not_tensor_parallel_duplicate', 'get_current_device', 'synchronize', 'empty_cache', 'set_to_cuda', + 'report_memory_usage', 'Timer', 'MultiTimer', 'multi_tensor_applier', 'accumulate_gradient', 'DataParallelSampler', + 'get_dataloader', 'switch_virtual_pipeline_parallel_rank' +] diff --git a/colossalai/utils/common.py b/colossalai/utils/common.py index 610986d03db8..6e33181729b1 100644 --- a/colossalai/utils/common.py +++ b/colossalai/utils/common.py @@ -1,5 +1,7 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- 
+import random +import socket import torch from torch._six import inf @@ -9,16 +11,15 @@ except: pass -import torch.distributed as dist from contextlib import contextmanager -from colossalai.context.parallel_mode import ParallelMode -from colossalai.core import global_context as gpc -from .multi_tensor_apply import multi_tensor_applier -from colossalai.constants import IS_TENSOR_PARALLEL, TENSOR_PARALLEL_ATTRIBUTES, NUM_PARTITIONS + import torch.distributed as dist +from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_ATTRIBUTES from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc +from .multi_tensor_apply import multi_tensor_applier + def print_rank_0(msg: str, logger=None): '''Print messages and save logs(optional). This is executed only if you are the rank-0 gpu. @@ -33,6 +34,18 @@ def print_rank_0(msg: str, logger=None): logger.info(msg) +def free_port(): + while True: + try: + sock = socket.socket() + port = random.randint(20000, 65000) + sock.bind(('localhost', port)) + sock.close() + return port + except Exception: + continue + + def sync_model_param_in_dp(model): '''Make sure data parameters are consistent during Data Parallel Mode diff --git a/tests/test_comm/test_comm.py b/tests/test_comm/test_comm.py index a8bae5c085ce..4316e1a56ac5 100644 --- a/tests/test_comm/test_comm.py +++ b/tests/test_comm/test_comm.py @@ -1,4 +1,3 @@ -import time from functools import partial import pytest @@ -9,7 +8,7 @@ from colossalai.context import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch -from colossalai.utils import get_current_device +from colossalai.utils import free_port, get_current_device CONFIG = dict(parallel=dict(data=8, pipeline=1, tensor=dict(mode=None, size=1))) @@ -49,8 +48,8 @@ def check_all_reduce(): torch.cuda.synchronize() -def check_layer(rank, world_size): - launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=40011, backend='nccl') +def check_layer(rank, world_size, port): + launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') assert dist.get_rank() == gpc.get_global_rank() print('Rank {} / {}'.format(dist.get_rank(), dist.get_world_size())) @@ -66,7 +65,7 @@ def check_layer(rank, world_size): @pytest.mark.dist def test_comm(): world_size = 4 - run_func = partial(check_layer, world_size=world_size) + run_func = partial(check_layer, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_context/test_2d_init.py b/tests/test_context/test_2d_init.py index 387a8b5076ae..22826bf387a2 100644 --- a/tests/test_context/test_2d_init.py +++ b/tests/test_context/test_2d_init.py @@ -1,15 +1,16 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from functools import partial +from pathlib import Path + import pytest import torch import torch.multiprocessing as mp - from colossalai import launch from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from functools import partial -from pathlib import Path +from colossalai.utils import free_port CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2d_init.py').absolute() @@ -87,7 +88,7 @@ def test_2d_init(): test_fn = partial(init_2d, world_size=world_size, backend='gloo', - port='39900', + port=free_port(), host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_context/test_2p5d_init.py 
b/tests/test_context/test_2p5d_init.py index 30fc5809236d..3668c701e4d0 100644 --- a/tests/test_context/test_2p5d_init.py +++ b/tests/test_context/test_2p5d_init.py @@ -7,10 +7,10 @@ import pytest import torch import torch.multiprocessing as mp - from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch +from colossalai.utils import free_port CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_2p5d_init.py').absolute() @@ -111,7 +111,7 @@ def test_2halfd_init(): test_fn = partial(init_2halfd, world_size=world_size, backend='gloo', - port='39901', + port=free_port(), host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_context/test_3d_init.py b/tests/test_context/test_3d_init.py index d4a881ece8c2..c9395f8682a2 100644 --- a/tests/test_context/test_3d_init.py +++ b/tests/test_context/test_3d_init.py @@ -7,11 +7,10 @@ import pytest import torch import torch.multiprocessing as mp - - from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.initialize import launch +from colossalai.utils import free_port CONFIG_PATH = Path(__file__).parent.joinpath('configs/parallel_3d_init.py').absolute() @@ -104,7 +103,7 @@ def test_3d_init(): test_fn = partial(init_3d, world_size=world_size, backend='gloo', - port='39902', + port=free_port(), host='localhost' ) mp.spawn(test_fn, nprocs=world_size) diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py index 28e7e5350bbe..71f560e23736 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py @@ -13,7 +13,7 @@ from colossalai.nn import Accuracy, LinearWarmupLR from colossalai.nn.loss import CrossEntropyLoss from colossalai.trainer import Trainer, hooks -from colossalai.utils import MultiTimer, get_dataloader +from colossalai.utils import MultiTimer, free_port, get_dataloader from colossalai.utils.gradient_accumulation import GradAccumLrSchedulerByStep from model_zoo.vit import vit_tiny_patch4_32 from torchvision import transforms @@ -27,8 +27,8 @@ gradient_accumulation=2) -def run_trainer(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=40001, backend='nccl') +def run_trainer(rank, world_size, port): + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') logger = get_dist_logger() @@ -95,7 +95,7 @@ def run_trainer(rank, world_size): # @pytest.mark.skip("This test requires more than 8 GPUs, you should invoke this test script using test.sh provided manually") def test_hybrid_parallel(): world_size = 8 - run_func = partial(run_trainer, world_size=world_size) + run_func = partial(run_trainer, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_engine/test_engine/test_engine_apex_amp.py b/tests/test_engine/test_engine/test_engine_apex_amp.py index ec14ff5f16b8..164ae54bb714 100644 --- a/tests/test_engine/test_engine/test_engine_apex_amp.py +++ b/tests/test_engine/test_engine/test_engine_apex_amp.py @@ -1,25 +1,23 @@ # !/usr/bin/env python # -*- encoding: utf-8 -*- -import colossalai import os +from functools import partial +from pathlib import Path + +import 
colossalai import pytest import torch -import os.path as osp -from pathlib import Path -import torch.nn as nn import torch.multiprocessing as mp - -from torchvision import transforms -from torch.optim import Adam -from colossalai.core import global_context as gpc +import torch.nn as nn from colossalai.amp import AMP_TYPE +from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.utils import report_memory_usage, get_dataloader -from torchvision.models import resnet18 +from colossalai.utils import free_port, get_dataloader, report_memory_usage +from torch.optim import Adam +from torchvision import transforms from torchvision.datasets import CIFAR10 -from functools import partial - +from torchvision.models import resnet18 # Config BATCH_SIZE = 128 @@ -38,14 +36,14 @@ ) -def run_engine(rank, world_size): +def run_engine(rank, world_size, port): # init dist env colossalai.launch( config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39910, + port=port, backend='nccl' ) @@ -104,7 +102,7 @@ def run_engine(rank, world_size): @pytest.mark.dist def test_engine(): world_size = 4 - run_func = partial(run_engine, world_size=world_size) + run_func = partial(run_engine, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_engine/test_engine/test_engine_naive_amp.py b/tests/test_engine/test_engine/test_engine_naive_amp.py index 91645392fddf..95c620368317 100644 --- a/tests/test_engine/test_engine/test_engine_naive_amp.py +++ b/tests/test_engine/test_engine/test_engine_naive_amp.py @@ -1,23 +1,20 @@ -import colossalai import os +from functools import partial +from pathlib import Path + +import colossalai import pytest import torch -import os.path as osp -from pathlib import Path -import torch.nn as nn import torch.multiprocessing as mp - -from torchvision import transforms -from torch.optim import Adam -from colossalai.core import global_context as gpc +import torch.nn as nn from colossalai.amp import AMP_TYPE +from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.utils import report_memory_usage, get_dataloader -from colossalai.initialize import get_default_parser -from torchvision.models import resnet18 +from colossalai.utils import free_port, get_dataloader, report_memory_usage +from torch.optim import Adam +from torchvision import transforms from torchvision.datasets import CIFAR10 -from functools import partial - +from torchvision.models import resnet18 # Config BATCH_SIZE = 128 @@ -38,14 +35,14 @@ ) -def run_engine(rank, world_size): +def run_engine(rank, world_size, port): # init dist env colossalai.launch( config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39911, + port=port, backend='nccl' ) @@ -104,7 +101,7 @@ def run_engine(rank, world_size): @pytest.mark.dist def test_engine(): world_size = 4 - run_func = partial(run_engine, world_size=world_size) + run_func = partial(run_engine, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_engine/test_engine/test_engine_no_amp.py b/tests/test_engine/test_engine/test_engine_no_amp.py index 6b4296f4f919..13668e251274 100644 --- a/tests/test_engine/test_engine/test_engine_no_amp.py +++ b/tests/test_engine/test_engine/test_engine_no_amp.py @@ -1,20 +1,19 @@ -import colossalai import os +from functools import partial +from pathlib import Path + +import colossalai import pytest import torch -from 
pathlib import Path -import torch.nn as nn import torch.multiprocessing as mp - -from torchvision import transforms -from torch.optim import Adam +import torch.nn as nn from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.utils import report_memory_usage, get_dataloader -from torchvision.models import resnet18 +from colossalai.utils import free_port, get_dataloader, report_memory_usage +from torch.optim import Adam +from torchvision import transforms from torchvision.datasets import CIFAR10 -from functools import partial - +from torchvision.models import resnet18 # Config BATCH_SIZE = 128 @@ -32,14 +31,14 @@ ) -def run_engine(rank, world_size): +def run_engine(rank, world_size, port): # init dist env colossalai.launch( config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39912, + port=port, backend='nccl' ) @@ -98,7 +97,7 @@ def run_engine(rank, world_size): @pytest.mark.dist def test_engine(): world_size = 4 - run_func = partial(run_engine, world_size=world_size) + run_func = partial(run_engine, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_engine/test_engine/test_engine_torch_amp.py b/tests/test_engine/test_engine/test_engine_torch_amp.py index 2e6776f1f046..435df81dcdfb 100644 --- a/tests/test_engine/test_engine/test_engine_torch_amp.py +++ b/tests/test_engine/test_engine/test_engine_torch_amp.py @@ -1,21 +1,20 @@ -import colossalai import os +from functools import partial +from pathlib import Path + +import colossalai import pytest import torch -from pathlib import Path -import torch.nn as nn import torch.multiprocessing as mp - -from torchvision import transforms -from torch.optim import Adam -from colossalai.core import global_context as gpc +import torch.nn as nn from colossalai.amp import AMP_TYPE +from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger -from colossalai.utils import report_memory_usage, get_dataloader -from torchvision.models import resnet18 +from colossalai.utils import free_port, get_dataloader, report_memory_usage +from torch.optim import Adam +from torchvision import transforms from torchvision.datasets import CIFAR10 -from functools import partial - +from torchvision.models import resnet18 # Config BATCH_SIZE = 128 @@ -34,14 +33,14 @@ ) -def run_engine(rank, world_size): +def run_engine(rank, world_size, port): # init dist env colossalai.launch( config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39913, + port=port, backend='nccl' ) @@ -100,7 +99,7 @@ def run_engine(rank, world_size): @pytest.mark.dist def test_engine(): world_size = 4 - run_func = partial(run_engine, world_size=world_size) + run_func = partial(run_engine, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_layers/test_1d/test_1d.py b/tests/test_layers/test_1d/test_1d.py index bc7f09353d7b..f4120dc53c3d 100644 --- a/tests/test_layers/test_1d/test_1d.py +++ b/tests/test_layers/test_1d/test_1d.py @@ -1,13 +1,15 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from functools import partial + import pytest import torch import torch.multiprocessing as mp - from colossalai.core import global_context as gpc from colossalai.initialize import launch -from functools import partial +from colossalai.utils import free_port + from checks_1d.check_layer_1d import * CONFIG = dict( @@ -21,12 +23,12 @@ ) -def check_layer(rank, world_size): +def 
check_layer(rank, world_size, port): launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39920, + port=port, backend='nccl') check_linear_col() @@ -39,7 +41,7 @@ def check_layer(rank, world_size): @pytest.mark.dist def test_1d(): world_size = 4 - run_func = partial(check_layer, world_size=world_size) + run_func = partial(check_layer, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_layers/test_2d/test_2d.py b/tests/test_layers/test_2d/test_2d.py index 31829b8b91b2..83dc80a95b66 100644 --- a/tests/test_layers/test_2d/test_2d.py +++ b/tests/test_layers/test_2d/test_2d.py @@ -1,16 +1,17 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from functools import partial + import pytest import torch import torch.multiprocessing as mp - from colossalai.core import global_context as gpc from colossalai.initialize import launch +from colossalai.utils import free_port + from checks_2d.check_layer_2d import * from checks_2d.check_operation_2d import * -from functools import partial - CONFIG = dict( parallel=dict( @@ -34,12 +35,12 @@ def check_layer(): check_layernorm() check_classifier() -def check_layer_and_operation(rank, world_size): +def check_layer_and_operation(rank, world_size, port): launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39921, + port=port, backend='nccl') # check_operations() @@ -51,7 +52,7 @@ def check_layer_and_operation(rank, world_size): @pytest.mark.dist def test_2d(): world_size = 4 - run_func = partial(check_layer_and_operation, world_size=world_size) + run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_layers/test_2p5d/test_2p5d.py b/tests/test_layers/test_2p5d/test_2p5d.py index a09fd1edd0e1..4de4015bffe0 100644 --- a/tests/test_layers/test_2p5d/test_2p5d.py +++ b/tests/test_layers/test_2p5d/test_2p5d.py @@ -1,13 +1,15 @@ +from functools import partial + import pytest import torch import torch.multiprocessing as mp - from colossalai.core import global_context as gpc from colossalai.initialize import launch -from checks_2p5d.check_layer_2p5d import check_linear, check_layernorm, check_classifier -from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB -from functools import partial +from colossalai.utils import free_port +from checks_2p5d.check_layer_2p5d import (check_classifier, check_layernorm, + check_linear) +from checks_2p5d.check_operation_2p5d import check_AB, check_ABT, check_ATB CONFIG = dict( parallel=dict( @@ -29,12 +31,12 @@ def check_layer(): check_classifier() -def check_layer_and_operation(rank, world_size): +def check_layer_and_operation(rank, world_size, port): launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39922, + port=port, backend='nccl') check_operations() @@ -46,7 +48,7 @@ def check_layer_and_operation(rank, world_size): @pytest.mark.dist def test_2p5d(): world_size = 4 - run_func = partial(check_layer_and_operation, world_size=world_size) + run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_layers/test_3d/test_3d.py b/tests/test_layers/test_3d/test_3d.py index 6aeec775fdf4..73bdbb5bd9ac 100644 --- a/tests/test_layers/test_3d/test_3d.py +++ b/tests/test_layers/test_3d/test_3d.py @@ -7,6 +7,7 @@ import torch.multiprocessing as mp from colossalai.core import global_context as gpc from 
colossalai.initialize import launch +from colossalai.utils import free_port from checks_3d.check_layer_3d import * @@ -27,8 +28,8 @@ def check_layer(): # check_loss() -def check_layer_and_operation(rank, world_size): - launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39923, backend='nccl') +def check_layer_and_operation(rank, world_size, port): + launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') check_layer() gpc.destroy() torch.cuda.empty_cache() @@ -37,7 +38,7 @@ def check_layer_and_operation(rank, world_size): @pytest.mark.dist def test_3d(): world_size = 8 - run_func = partial(check_layer_and_operation, world_size=world_size) + run_func = partial(check_layer_and_operation, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_layers/test_sequence/test_sequence.py b/tests/test_layers/test_sequence/test_sequence.py index d76fc35badb4..1ee104eb20ae 100644 --- a/tests/test_layers/test_sequence/test_sequence.py +++ b/tests/test_layers/test_sequence/test_sequence.py @@ -8,6 +8,7 @@ from colossalai.logging import get_dist_logger from checks_seq.check_layer_seq import * from functools import partial +from colossalai.utils import free_port CONFIG = dict( @@ -22,13 +23,13 @@ def check_layer(): check_selfattention() -def run_check_sequence(rank, world_size): +def run_check_sequence(rank, world_size, port): # init dist launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39924, + port=port, backend='nccl') logger = get_dist_logger() logger.info('Distributed environment is initialzied.', ranks=[0]) @@ -41,7 +42,7 @@ def run_check_sequence(rank, world_size): @pytest.mark.dist def test_sequence(): world_size = 4 - run_func = partial(run_check_sequence, world_size=world_size) + run_func = partial(run_check_sequence, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_trainer/test_pipeline/resnet_config.py b/tests/test_trainer/test_pipeline/resnet_config.py index b0bcc38609d4..cbf7dd266192 100644 --- a/tests/test_trainer/test_pipeline/resnet_config.py +++ b/tests/test_trainer/test_pipeline/resnet_config.py @@ -1,4 +1,5 @@ import os +import model from pathlib import Path BATCH_SIZE = 128 diff --git a/tests/test_trainer/test_pipeline/test_p2p.py b/tests/test_trainer/test_pipeline/test_p2p.py index 4b2c00d6d087..283f49fa0932 100644 --- a/tests/test_trainer/test_pipeline/test_p2p.py +++ b/tests/test_trainer/test_pipeline/test_p2p.py @@ -1,11 +1,12 @@ #!/usr/bin/env python # -*- encoding: utf-8 -*- +from functools import partial + import pytest import torch import torch.distributed as dist import torch.multiprocessing as mp - from colossalai.communication import (recv_backward, recv_forward, recv_tensor_meta, send_backward, send_backward_recv_forward, send_forward, @@ -15,8 +16,7 @@ from colossalai.core import global_context as gpc from colossalai.initialize import launch from colossalai.logging import get_dist_logger -from colossalai.utils import get_current_device -from functools import partial +from colossalai.utils import free_port, get_current_device BATCH_SIZE = 16 SEQ_LENGTH = 64 @@ -123,13 +123,13 @@ def check_comm(size, rank, prev_rank, next_rank, up_group, down_group, logger): check_forward_backward(tensor, grad, rank, logger) -def run_check(rank, world_size): +def run_check(rank, world_size, port): launch( config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39932, + 
port=port, backend='nccl' ) logger = get_dist_logger() @@ -154,7 +154,7 @@ def run_check(rank, world_size): @pytest.mark.dist def test_p2p(): world_size = 4 - run_func = partial(run_check, world_size=world_size) + run_func = partial(run_check, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_trainer/test_pipeline/test_partition.py b/tests/test_trainer/test_pipeline/test_partition.py index c0ecd410bbd7..61e7e707b16b 100644 --- a/tests/test_trainer/test_pipeline/test_partition.py +++ b/tests/test_trainer/test_pipeline/test_partition.py @@ -3,25 +3,24 @@ import pytest import torch import torch.multiprocessing as mp -from torch.utils.data import DataLoader from colossalai.builder.pipeline import build_pipeline_model_from_cfg from colossalai.core import global_context from colossalai.initialize import launch from colossalai.logging import get_dist_logger from functools import partial -import model +from colossalai.utils import free_port DIR_PATH = osp.dirname(osp.realpath(__file__)) CONFIG_PATH = osp.join(DIR_PATH, 'resnet_config.py') -def run_partition(rank, world_size): +def run_partition(rank, world_size, port): launch(config=CONFIG_PATH, rank=rank, world_size=world_size, host='localhost', - port=39933, + port=port, backend='nccl' ) logger = get_dist_logger() @@ -40,7 +39,7 @@ def run_partition(rank, world_size): @pytest.mark.dist def test_partition(): world_size = 4 - run_func = partial(run_partition, world_size=world_size) + run_func = partial(run_partition, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_trainer/test_pipeline/test_pipeline_schedule.py b/tests/test_trainer/test_pipeline/test_pipeline_schedule.py index 5b2c1986fb7c..d3c876c9c8cf 100644 --- a/tests/test_trainer/test_pipeline/test_pipeline_schedule.py +++ b/tests/test_trainer/test_pipeline/test_pipeline_schedule.py @@ -1,26 +1,23 @@ # referenced from Megatron and used to testify communication -import colossalai import os import os.path as osp +from functools import partial +from pathlib import Path + +import colossalai import pytest import torch import torch.multiprocessing as mp -import model - from colossalai.builder import build_pipeline_model_from_cfg -from colossalai.communication import p2p as p2p_communication -from colossalai.communication.utils import send_tensor_meta, recv_tensor_meta -from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc -from colossalai.initialize import launch -from colossalai.utils import print_rank_0, get_current_device, get_dataloader from colossalai.engine.schedule import PipelineSchedule -from torchvision.datasets import CIFAR10 +from colossalai.initialize import launch +from colossalai.utils import free_port, get_dataloader, print_rank_0 from torchvision import transforms -from pathlib import Path -from functools import partial +from torchvision.datasets import CIFAR10 +import model BATCH_SIZE = 32 NUM_MICRO = 8 @@ -30,12 +27,12 @@ CONFIG_PATH = osp.join(DIR_PATH, './resnet_config.py') -def run_schedule(rank, world_size): +def run_schedule(rank, world_size, port): launch(config=CONFIG_PATH, rank=rank, world_size=world_size, host='localhost', - port=39934, + port=port, backend='nccl') # build model @@ -86,7 +83,7 @@ def run_schedule(rank, world_size): @pytest.mark.dist def test_pipeline_schedule(): world_size = 4 - run_func = partial(run_schedule, world_size=world_size) + run_func = partial(run_schedule, world_size=world_size, 
port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py index b44e609d86b8..599efd883aa1 100644 --- a/tests/test_trainer/test_trainer_with_non_pipe_schedule.py +++ b/tests/test_trainer/test_trainer_with_non_pipe_schedule.py @@ -11,7 +11,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.trainer import Trainer -from colossalai.utils import MultiTimer, get_dataloader +from colossalai.utils import MultiTimer, free_port, get_dataloader from torch.optim import Adam from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -26,8 +26,8 @@ fp16=dict(mode=AMP_TYPE.TORCH)) -def run_trainer_no_pipeline(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39930, backend='nccl') +def run_trainer_no_pipeline(rank, world_size, port): + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model model = resnet18(num_classes=10) @@ -88,7 +88,7 @@ def run_trainer_no_pipeline(rank, world_size): @pytest.mark.dist def test_trainer_no_pipeline(): world_size = 4 - run_func = partial(run_trainer_no_pipeline, world_size=world_size) + run_func = partial(run_trainer_no_pipeline, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_trainer/test_trainer_with_pipe_schedule.py b/tests/test_trainer/test_trainer_with_pipe_schedule.py index d1e1a59101b7..8dffc3fc748b 100644 --- a/tests/test_trainer/test_trainer_with_pipe_schedule.py +++ b/tests/test_trainer/test_trainer_with_pipe_schedule.py @@ -12,7 +12,7 @@ from colossalai.engine.schedule import PipelineSchedule from colossalai.logging import get_dist_logger from colossalai.trainer import Trainer -from colossalai.utils import MultiTimer, get_dataloader +from colossalai.utils import MultiTimer, free_port, get_dataloader from torch.optim import Adam from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -25,8 +25,8 @@ CONFIG = dict(parallel=dict(pipeline=2, ), ) -def run_trainer_with_pipeline(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39931, backend='nccl') +def run_trainer_with_pipeline(rank, world_size, port): + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model model = resnet18(num_classes=10) @@ -99,7 +99,7 @@ def forward(self, x): @pytest.mark.dist def test_trainer_with_pipeline(): world_size = 4 - run_func = partial(run_trainer_with_pipeline, world_size=world_size) + run_func = partial(run_trainer_with_pipeline, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_utils/test_gradient_accumluation.py b/tests/test_utils/test_gradient_accumluation.py index 1c8af272f8fd..c7471d77ccb9 100644 --- a/tests/test_utils/test_gradient_accumluation.py +++ b/tests/test_utils/test_gradient_accumluation.py @@ -1,20 +1,19 @@ -import colossalai import os +from functools import partial +from pathlib import Path + +import colossalai import pytest import torch import torch.multiprocessing as mp import torch.nn as nn - -from functools import partial -from pathlib import Path -from torchvision import transforms -from torch.optim import Adam from colossalai.core import global_context as gpc 
from colossalai.logging import get_dist_logger -from colossalai.utils import get_dataloader -from torchvision.models import resnet18 +from colossalai.utils import free_port, get_dataloader +from torch.optim import Adam +from torchvision import transforms from torchvision.datasets import CIFAR10 - +from torchvision.models import resnet18 # Config BATCH_SIZE = 16 @@ -31,7 +30,7 @@ ) -def run_no_pipeline(rank, world_size): +def run_no_pipeline(rank, world_size, port): # init dist env colossalai.launch( @@ -39,7 +38,7 @@ def run_no_pipeline(rank, world_size): rank=rank, world_size=world_size, host='localhost', - port=39501, + port=port, backend='nccl' ) @@ -109,7 +108,7 @@ def run_no_pipeline(rank, world_size): @pytest.mark.dist def test_engine(): world_size = 4 - func = partial(run_no_pipeline, world_size=world_size) + func = partial(run_no_pipeline, world_size=world_size, port=free_port()) mp.spawn(func, nprocs=world_size) diff --git a/tests/test_zero_data_parallel/test_zero_level_2.py b/tests/test_zero_data_parallel/test_zero_level_2.py index de7779cab877..9bdd1b12496b 100644 --- a/tests/test_zero_data_parallel/test_zero_level_2.py +++ b/tests/test_zero_data_parallel/test_zero_level_2.py @@ -2,18 +2,18 @@ # -*- encoding: utf-8 -*- import os -import pytest -import torch -import torch.multiprocessing as mp +from functools import partial from pathlib import Path import colossalai +import pytest +import torch +import torch.multiprocessing as mp from colossalai.core import global_context as gpc -from colossalai.utils import get_dataloader +from colossalai.utils import free_port, get_dataloader from torchvision import transforms -from torchvision.models import resnet18 from torchvision.datasets import CIFAR10 -from functools import partial +from torchvision.models import resnet18 BATCH_SIZE = 16 IMG_SIZE = 224 @@ -34,12 +34,12 @@ ) -def run_dist(rank, world_size): +def run_dist(rank, world_size, port): colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39940, + port=port, backend='nccl') # build model @@ -94,7 +94,7 @@ def run_dist(rank, world_size): @pytest.mark.dist def test_zero_level_2(): world_size = 4 - run_func = partial(run_dist, world_size=world_size) + run_func = partial(run_dist, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_zero_data_parallel/test_zero_level_3.py b/tests/test_zero_data_parallel/test_zero_level_3.py index d0f7adc8d9d2..2655210dbffa 100644 --- a/tests/test_zero_data_parallel/test_zero_level_3.py +++ b/tests/test_zero_data_parallel/test_zero_level_3.py @@ -2,18 +2,18 @@ # -*- encoding: utf-8 -*- import os -import pytest -import torch -import torch.multiprocessing as mp +from functools import partial from pathlib import Path import colossalai +import pytest +import torch +import torch.multiprocessing as mp from colossalai.core import global_context as gpc -from colossalai.utils import get_dataloader +from colossalai.utils import free_port, get_dataloader from torchvision import transforms -from torchvision.models import resnet18 from torchvision.datasets import CIFAR10 -from functools import partial +from torchvision.models import resnet18 BATCH_SIZE = 16 IMG_SIZE = 224 @@ -46,12 +46,12 @@ ) -def run_dist(rank, world_size): +def run_dist(rank, world_size, port): colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', - port=39941, + port=port, backend='nccl') # build model @@ -106,7 +106,7 @@ def run_dist(rank, world_size): @pytest.mark.dist def 
test_zero_level_3(): world_size = 4 - run_func = partial(run_dist, world_size=world_size) + run_func = partial(run_dist, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py index eeb487c33514..9d215f5aeb82 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_2.py @@ -13,7 +13,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.nn import CrossEntropyLoss -from colossalai.utils import get_dataloader +from colossalai.utils import free_port, get_dataloader from model_zoo.vit import vit_lite_depth7_patch4_32 from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -40,8 +40,8 @@ def train_epoch(engine, train_dataloader): return avg_loss -def run_2d_parallel_vision_transformer_level_2(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39950, backend='nccl') +def run_2d_parallel_vision_transformer_level_2(rank, world_size, port): + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model model = vit_lite_depth7_patch4_32() @@ -90,7 +90,7 @@ def run_2d_parallel_vision_transformer_level_2(rank, world_size): @pytest.mark.dist def test_2d_vit_zero_level_2(): world_size = 8 - run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size) + run_func = partial(run_2d_parallel_vision_transformer_level_2, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) diff --git a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py index 720e460e57e9..149fefb72ff9 100644 --- a/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py +++ b/tests/test_zero_tensor_parallel/test_vit_2d_level_3.py @@ -13,7 +13,7 @@ from colossalai.core import global_context as gpc from colossalai.logging import get_dist_logger from colossalai.nn import CrossEntropyLoss -from colossalai.utils import get_dataloader +from colossalai.utils import free_port, get_dataloader from model_zoo.vit import vit_lite_depth7_patch4_32 from torchvision import transforms from torchvision.datasets import CIFAR10 @@ -40,8 +40,8 @@ def train_epoch(engine, train_dataloader): return avg_loss -def run_2d_parallel_vision_transformer_level_3(rank, world_size): - colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=39951, backend='nccl') +def run_2d_parallel_vision_transformer_level_3(rank, world_size, port): + colossalai.launch(config=CONFIG, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl') # build model model = vit_lite_depth7_patch4_32() @@ -91,7 +91,7 @@ def run_2d_parallel_vision_transformer_level_3(rank, world_size): @pytest.mark.skip("Level 3 has unknown bug so skip this test for now") def test_3d_vit_zero_level_3(): world_size = 8 - run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size) + run_func = partial(run_2d_parallel_vision_transformer_level_3, world_size=world_size, port=free_port()) mp.spawn(run_func, nprocs=world_size) From 226614cd3b94d11bab4d7ecdacec3f2cdfb836ad Mon Sep 17 00:00:00 2001 From: zbian Date: Wed, 29 Dec 2021 15:58:12 +0800 Subject: [PATCH 3/3] reworked split batch, now called in 
trainer.schedule.load_batch --- colossalai/engine/schedule/_base_schedule.py | 8 +++- colossalai/nn/layer/__init__.py | 1 + .../nn/layer/colossalai_layer/__init__.py | 3 +- .../nn/layer/colossalai_layer/_utils.py | 19 ++++++++ .../nn/layer/colossalai_layer/dropout.py | 10 ++--- colossalai/nn/layer/parallel_1d/layers.py | 12 +++--- colossalai/nn/layer/parallel_2d/__init__.py | 4 +- colossalai/nn/layer/parallel_2d/_operation.py | 33 +++++++++----- colossalai/nn/layer/parallel_2d/layers.py | 6 +-- colossalai/nn/layer/parallel_2p5d/__init__.py | 4 +- .../nn/layer/parallel_2p5d/_operation.py | 43 +++++++++++-------- colossalai/nn/layer/parallel_2p5d/layers.py | 11 ++--- colossalai/nn/layer/parallel_3d/__init__.py | 4 +- colossalai/nn/layer/parallel_3d/_operation.py | 26 ++++++++--- colossalai/nn/layer/parallel_3d/layers.py | 4 -- colossalai/nn/layer/utils/__init__.py | 8 +++- colossalai/nn/layer/utils/common.py | 2 +- colossalai/nn/loss/loss_2d.py | 11 ++--- colossalai/nn/loss/loss_2p5d.py | 11 ++--- colossalai/nn/loss/loss_3d.py | 12 ++---- colossalai/nn/metric/accuracy_2d.py | 3 +- colossalai/nn/metric/accuracy_2p5d.py | 3 +- colossalai/nn/metric/accuracy_3d.py | 3 +- colossalai/trainer/hooks/_metric_hook.py | 21 +++++---- .../test_cifar_with_data_pipeline_tensor.py | 1 - 25 files changed, 149 insertions(+), 114 deletions(-) create mode 100644 colossalai/nn/layer/colossalai_layer/_utils.py diff --git a/colossalai/engine/schedule/_base_schedule.py b/colossalai/engine/schedule/_base_schedule.py index aceee4e6c756..411f1861b450 100644 --- a/colossalai/engine/schedule/_base_schedule.py +++ b/colossalai/engine/schedule/_base_schedule.py @@ -10,7 +10,7 @@ from .._base_engine import Engine from colossalai.logging import get_dist_logger from colossalai.utils import get_current_device - +from colossalai.nn.layer import split_batch class BaseSchedule(ABC): """A basic helper class to control the process of training or evaluation. 
@@ -59,7 +59,11 @@ def load_batch(self, data_iter): else: data, label = batch_data - data, label = self._to_list(data), self._to_list(label) + if isinstance(label, (tuple, list)): + self.batch_size = label[0].size(0) + else: + self.batch_size = label.size(0) + data, label = self._to_list(split_batch(data)), self._to_list(split_batch(label)) return self._move_to_device(data), self._move_to_device(label) def pre_processing(self, engine: Engine): diff --git a/colossalai/nn/layer/__init__.py b/colossalai/nn/layer/__init__.py index 108c8dd37460..86961dd933a7 100644 --- a/colossalai/nn/layer/__init__.py +++ b/colossalai/nn/layer/__init__.py @@ -4,5 +4,6 @@ from .parallel_2p5d import * from .parallel_3d import * from .parallel_sequence import * +from .utils import * from .vanilla import * from .wrapper import * diff --git a/colossalai/nn/layer/colossalai_layer/__init__.py b/colossalai/nn/layer/colossalai_layer/__init__.py index e9c78c576031..54ed567eb36f 100644 --- a/colossalai/nn/layer/colossalai_layer/__init__.py +++ b/colossalai/nn/layer/colossalai_layer/__init__.py @@ -1,6 +1,7 @@ +from ._utils import split_batch from .dropout import Dropout from .embedding import Embedding, PatchEmbedding from .linear import Classifier, Linear from .normalization import LayerNorm -__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout'] +__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'split_batch'] diff --git a/colossalai/nn/layer/colossalai_layer/_utils.py b/colossalai/nn/layer/colossalai_layer/_utils.py new file mode 100644 index 000000000000..8b996c860d61 --- /dev/null +++ b/colossalai/nn/layer/colossalai_layer/_utils.py @@ -0,0 +1,19 @@ +from torch import Tensor + +from ..parallel_2d._operation import split_tensor_2d +from ..parallel_2p5d._operation import split_tensor_2p5d +from ..parallel_3d._operation import split_tensor_3d +from ..utils import get_tensor_parallel_mode + +_parallel_split_batch = {'2d': split_tensor_2d, '2.5d': split_tensor_2p5d, '3d': split_tensor_3d} + + +def split_batch(input_) -> Tensor: + tensor_parallel_mode = get_tensor_parallel_mode() + if tensor_parallel_mode in _parallel_split_batch: + if isinstance(input_, (tuple, list)): + return tuple(map(_parallel_split_batch[tensor_parallel_mode], input_)) + else: + return _parallel_split_batch[tensor_parallel_mode](input_) + else: + return input_ diff --git a/colossalai/nn/layer/colossalai_layer/dropout.py b/colossalai/nn/layer/colossalai_layer/dropout.py index f923e1f27b5f..ff86e0745508 100644 --- a/colossalai/nn/layer/colossalai_layer/dropout.py +++ b/colossalai/nn/layer/colossalai_layer/dropout.py @@ -1,16 +1,11 @@ -from typing import Optional +from contextlib import nullcontext import torch.nn as nn from colossalai.context import ParallelMode, seed from colossalai.utils import conditional_context -from ... 
import init as init from ..parallel_1d import * -from ..parallel_2d import * -from ..parallel_2p5d import * -from ..parallel_3d import * from ..utils import get_tensor_parallel_mode -from ..vanilla import * class Dropout(nn.Module): @@ -23,5 +18,6 @@ def __init__(self, p: float = 0.5, inplace: bool = False) -> None: self.drop = nn.Dropout(p, inplace) def forward(self, *args): - with conditional_context(seed(ParallelMode.TENSOR), enable=self.tensor_parallel not in ['None', '1d']): + cm = nullcontext() if self.tensor_parallel in ['None', '1d'] else seed(ParallelMode.TENSOR) + with cm: return self.drop(*args) diff --git a/colossalai/nn/layer/parallel_1d/layers.py b/colossalai/nn/layer/parallel_1d/layers.py index 497feb4a9479..3a3fa6e00e0a 100644 --- a/colossalai/nn/layer/parallel_1d/layers.py +++ b/colossalai/nn/layer/parallel_1d/layers.py @@ -3,6 +3,7 @@ import math import numbers +from contextlib import nullcontext from typing import Callable, Tuple import torch @@ -12,12 +13,12 @@ from colossalai.core import global_context as gpc from colossalai.nn import init as init from colossalai.registry import LAYERS -from colossalai.utils import get_current_device, conditional_context +from colossalai.utils import get_current_device from torch import Tensor, dtype from torch.nn.parameter import Parameter -from ..utils import divide, set_tensor_parallel_attribute_by_partition from ..base_layer import ParallelLayer +from ..utils import divide, set_tensor_parallel_attribute_by_partition from ._operation import FusedLayerNormAffineFunction1D from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_grad, reduce_input, set_parallel_input, split_forward_gather_backward) @@ -45,7 +46,6 @@ def __init__(self, skip_bias_add=skip_bias_add, weight_initializer=weight_initializer, bias_initializer=bias_initializer) - set_parallel_input(True) else: self.layer = Linear1D_Row(in_features, out_features, @@ -55,7 +55,6 @@ def __init__(self, skip_bias_add=skip_bias_add, weight_initializer=weight_initializer, bias_initializer=bias_initializer) - set_parallel_input(False) @property def weight(self): @@ -187,6 +186,7 @@ def __init__(self, with seed(ParallelMode.TENSOR): self.reset_parameters(weight_initializer, bias_initializer) self._set_tensor_parallel_attributes() + set_parallel_input(True) def reset_parameters(self, weight_initializer, bias_initializer) -> None: fan_in, fan_out = self.in_features, self.out_features @@ -268,6 +268,7 @@ def __init__(self, with seed(ParallelMode.TENSOR): self.reset_parameters(weight_initializer, bias_initializer) self._set_tensor_parallel_attributes() + set_parallel_input(False) def reset_parameters(self, weight_initializer, bias_initializer) -> None: fan_in, fan_out = self.in_features, self.out_features @@ -379,6 +380,7 @@ def __init__(self, p: float = 0.5, inplace: bool = False): self.inplace = inplace def forward(self, input_: Tensor) -> Tensor: - with conditional_context(seed(ParallelMode.TENSOR), enable=self.parallel_input): + cm = nullcontext() if not self.parallel_input else seed(ParallelMode.TENSOR) + with cm: output = F.dropout(input_, self.p, self.training, self.inplace) return output diff --git a/colossalai/nn/layer/parallel_2d/__init__.py b/colossalai/nn/layer/parallel_2d/__init__.py index e54f3e7e41d7..2122a1bfed45 100644 --- a/colossalai/nn/layer/parallel_2d/__init__.py +++ b/colossalai/nn/layer/parallel_2d/__init__.py @@ -1,6 +1,6 @@ -from ._operation import reduce_by_batch_2d, split_batch_2d +from ._operation import reduce_by_batch_2d, split_tensor_2d 
from .layers import Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D __all__ = [ - 'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D', 'Embedding2D' + 'split_tensor_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D', 'Embedding2D' ] diff --git a/colossalai/nn/layer/parallel_2d/_operation.py b/colossalai/nn/layer/parallel_2d/_operation.py index 603b4dcfed9c..9955bcefec90 100644 --- a/colossalai/nn/layer/parallel_2d/_operation.py +++ b/colossalai/nn/layer/parallel_2d/_operation.py @@ -2,7 +2,7 @@ import torch import torch.distributed as dist -from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter) +from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter) from colossalai.context.parallel_mode import ParallelMode from colossalai.core import global_context as gpc from colossalai.utils import get_current_device @@ -595,7 +595,9 @@ def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]: return grad, None, None -def split_batch_2d(input_: Tensor, dim: int = 0) -> Tensor: +def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor: + if input_.size(dim) <= 1: + return input_ return torch.chunk(input_, gpc.get_world_size(ParallelMode.PARALLEL_2D_COL), dim=dim)[gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)].contiguous() @@ -603,17 +605,28 @@ def split_batch_2d(input_: Tensor, dim: int = 0) -> Tensor: class reduce_by_batch_2d(torch.autograd.Function): """All-reduce the input from the model parallel region.""" @staticmethod - def symbolic(graph, input_): - dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2D_COL)) - return input_ + def symbolic(graph, input_, reduce_mean: bool = False): + output = all_reduce(input_, ParallelMode.PARALLEL_2D_COL) + if reduce_mean: + reduce_size = gpc.get_world_size(ParallelMode.PARALLEL_2D_COL) + return output / reduce_size + return output @staticmethod @custom_fwd(cast_inputs=torch.float32) - def forward(ctx, input_): - dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2D_COL)) - return input_.clone() + def forward(ctx, input_, reduce_mean: bool = False): + output = all_reduce(input_, ParallelMode.PARALLEL_2D_COL) + ctx.reduce_mean = reduce_mean + if reduce_mean: + reduce_size = gpc.get_world_size(ParallelMode.PARALLEL_2D_COL) + ctx.reduce_size = reduce_size + return output.clone() / reduce_size + return output.clone() @staticmethod @custom_bwd - def backward(ctx, grad_output): - return grad_output + def backward(ctx, output_grad): + if ctx.reduce_mean: + return output_grad / ctx.reduce_size, None + else: + return output_grad, None diff --git a/colossalai/nn/layer/parallel_2d/layers.py b/colossalai/nn/layer/parallel_2d/layers.py index 3f67df382632..d113ec94c0c1 100644 --- a/colossalai/nn/layer/parallel_2d/layers.py +++ b/colossalai/nn/layer/parallel_2d/layers.py @@ -15,7 +15,7 @@ from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple from ..base_layer import ParallelLayer -from ._operation import Matmul_AB_2D, add_bias_2d, all_gather_weight_2d, classifier_2d, layernorm_2d, split_batch_2d +from ._operation import Matmul_AB_2D, add_bias_2d, all_gather_weight_2d, classifier_2d, layernorm_2d from ._utils import assert_summa_initialization, get_summa_dim_from_env @@ -257,8 +257,6 @@ def forward(self, input_: Tensor) -> Tensor: assert H == self.img_size[0] and W == self.img_size[1], \ f"Input image size ({H}*{W}) doesn't 
match model ({self.img_size[0]}*{self.img_size[1]})." - input_ = split_batch_2d(input_) - weight = all_gather_weight_2d.apply(self.weight, 0, self.summa_dim, ParallelMode.PARALLEL_2D_COL) bias = all_gather_weight_2d.apply(self.bias, 0, self.summa_dim, ParallelMode.PARALLEL_2D_COL) @@ -318,8 +316,6 @@ def _fill_padding_idx_with_zero(self) -> None: self.weight[self.padding_idx].fill_(0) def forward(self, input_: Tensor) -> Tensor: - input_ = split_batch_2d(input_) - weight = all_gather_weight_2d.apply(self.weight, -1, self.summa_dim, ParallelMode.PARALLEL_2D_COL) output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) diff --git a/colossalai/nn/layer/parallel_2p5d/__init__.py b/colossalai/nn/layer/parallel_2p5d/__init__.py index 5fc9666f86dd..202c948c5f05 100644 --- a/colossalai/nn/layer/parallel_2p5d/__init__.py +++ b/colossalai/nn/layer/parallel_2p5d/__init__.py @@ -1,7 +1,7 @@ -from ._operation import reduce_by_batch_2p5d, split_batch_2p5d +from ._operation import reduce_by_batch_2p5d, split_tensor_2p5d from .layers import Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D __all__ = [ - 'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D', + 'split_tensor_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D', 'Embedding2p5D' ] diff --git a/colossalai/nn/layer/parallel_2p5d/_operation.py b/colossalai/nn/layer/parallel_2p5d/_operation.py index a4b062647328..a1dbcd3cd9fe 100644 --- a/colossalai/nn/layer/parallel_2p5d/_operation.py +++ b/colossalai/nn/layer/parallel_2p5d/_operation.py @@ -22,7 +22,7 @@ def get_parallel_rank(parallel_mode: ParallelMode): return gpc.get_local_rank(parallel_mode) -def split_batch_2p5d(input_: Tensor, dim: int = 0) -> Tensor: +def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: return torch.chunk(input_, gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL), dim=dim)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)].contiguous() @@ -151,10 +151,7 @@ def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple for i in range(tesseract_dim): if i != tesseract_dim - 1: A_list[1 - cur].copy_(A) - opa[1 - cur] = dist.broadcast(A_list[1 - cur], - src=src_a + 1, - group=row_group, - async_op=True) + opa[1 - cur] = dist.broadcast(A_list[1 - cur], src=src_a + 1, group=row_group, async_op=True) B_list[1 - cur].copy_(B) opb[1 - cur] = dist.broadcast(B_list[1 - cur], src=src_b + tesseract_dim, @@ -356,10 +353,7 @@ def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple for i in range(tesseract_dim): if i != tesseract_dim - 1: A_list[1 - cur].copy_(A) - opa[1 - cur] = dist.broadcast(A_list[1 - cur], - src=src_a + 1, - group=row_group, - async_op=True) + opa[1 - cur] = dist.broadcast(A_list[1 - cur], src=src_a + 1, group=row_group, async_op=True) if opr[cur] is not None: opr[cur].wait() @@ -564,7 +558,9 @@ def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]: return grad, None, None -def split_batch_2p5d(input_: Tensor, dim: int = 0) -> Tensor: +def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor: + if input_.size(dim) <= 1: + return input_ return torch.chunk(input_, gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL), dim=dim)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)].contiguous() @@ -572,17 +568,28 @@ def split_batch_2p5d(input_: Tensor, dim: int = 0) -> Tensor: class reduce_by_batch_2p5d(torch.autograd.Function): 
"""All-reduce the input from the model parallel region.""" @staticmethod - def symbolic(graph, input_): - dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_COL)) - return input_ + def symbolic(graph, input_, reduce_mean: bool = False): + output = all_reduce(input_, ParallelMode.PARALLEL_2P5D_COL) + if reduce_mean: + reduce_size = gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL) + return output / reduce_size + return output @staticmethod @custom_fwd(cast_inputs=torch.float32) - def forward(ctx, input_): - dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_COL)) - return input_.clone() + def forward(ctx, input_, reduce_mean: bool = False): + output = all_reduce(input_, ParallelMode.PARALLEL_2P5D_COL) + ctx.reduce_mean = reduce_mean + if reduce_mean: + reduce_size = gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL) + ctx.reduce_size = reduce_size + return output.clone() / reduce_size + return output.clone() @staticmethod @custom_bwd - def backward(ctx, grad_output): - return grad_output + def backward(ctx, output_grad): + if ctx.reduce_mean: + return output_grad / ctx.reduce_size, None + else: + return output_grad, None diff --git a/colossalai/nn/layer/parallel_2p5d/layers.py b/colossalai/nn/layer/parallel_2p5d/layers.py index c2bef24d5af6..d7bd265bdbc3 100644 --- a/colossalai/nn/layer/parallel_2p5d/layers.py +++ b/colossalai/nn/layer/parallel_2p5d/layers.py @@ -13,11 +13,10 @@ from torch import Tensor, dtype from torch.nn import Parameter -from ..utils import divide, set_tensor_parallel_attribute_by_partition, to_2tuple from ..base_layer import ParallelLayer -from ._operation import (Add_Bias_2p5D, Matmul_AB_2p5D, all_gather_weight_2p5d, classifier_2p5d, layernorm_2p5d, - split_batch_2p5d) -from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_env +from ..utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple) +from ._operation import (Add_Bias_2p5D, Matmul_AB_2p5D, all_gather_weight_2p5d, classifier_2p5d, layernorm_2p5d) +from ._utils import (assert_tesseract_initialization, get_tesseract_dim_dep_from_env) @LAYERS.register_module @@ -269,8 +268,6 @@ def forward(self, input_: Tensor) -> Tensor: assert H == self.img_size[0] and W == self.img_size[1], \ f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." 
- input_ = split_batch_2p5d(input_) - weight = all_gather_weight_2p5d.apply(self.weight, 0, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL) bias = all_gather_weight_2p5d.apply(self.bias, 0, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL) @@ -330,8 +327,6 @@ def _fill_padding_idx_with_zero(self) -> None: self.weight[self.padding_idx].fill_(0) def forward(self, input_: Tensor) -> Tensor: - input_ = split_batch_2p5d(input_) - weight = all_gather_weight_2p5d.apply(self.weight, -1, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL) output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) diff --git a/colossalai/nn/layer/parallel_3d/__init__.py b/colossalai/nn/layer/parallel_3d/__init__.py index feb30d46216a..46eeacda171d 100644 --- a/colossalai/nn/layer/parallel_3d/__init__.py +++ b/colossalai/nn/layer/parallel_3d/__init__.py @@ -1,6 +1,6 @@ -from ._operation import reduce_by_batch_3d, split_batch_3d +from ._operation import reduce_by_batch_3d, split_tensor_3d from .layers import Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D __all__ = [ - 'reduce_by_batch_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', 'Classifier3D', 'Embedding3D' + 'reduce_by_batch_3d', 'split_tensor_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', 'Classifier3D', 'Embedding3D' ] diff --git a/colossalai/nn/layer/parallel_3d/_operation.py b/colossalai/nn/layer/parallel_3d/_operation.py index 5b3763c3a6bc..96ed775eca6e 100644 --- a/colossalai/nn/layer/parallel_3d/_operation.py +++ b/colossalai/nn/layer/parallel_3d/_operation.py @@ -175,10 +175,12 @@ def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]: return input_grad, weight_grad, bias_grad, None, None, None, None, None -def split_batch_3d(input_: Tensor, - input_parallel_mode: ParallelMode, - weight_parallel_mode: ParallelMode, - dim: int = 0) -> Tensor: +def split_tensor_3d(input_: Tensor, + dim: int = 0, + input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT, + weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor: + if input_.size(dim) <= 1: + return input_ output = torch.chunk(input_, gpc.get_world_size(weight_parallel_mode), dim=dim)[gpc.get_local_rank(weight_parallel_mode)].contiguous() output = torch.chunk(output, gpc.get_world_size(input_parallel_mode), @@ -189,15 +191,27 @@ def split_batch_3d(input_: Tensor, class reduce_by_batch_3d(torch.autograd.Function): @staticmethod @custom_fwd(cast_inputs=torch.float32) - def forward(ctx, input_: Tensor, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode) -> Tensor: + def forward(ctx, + input_: Tensor, + input_parallel_mode: ParallelMode, + weight_parallel_mode: ParallelMode, + reduce_mean: bool = False) -> Tensor: output = all_reduce(input_, input_parallel_mode) output = all_reduce(output, weight_parallel_mode) + ctx.reduce_mean = reduce_mean + if reduce_mean: + reduce_size = gpc.get_world_size(input_parallel_mode) * gpc.get_world_size(weight_parallel_mode) + ctx.reduce_size = reduce_size + return output.clone() / reduce_size return output.clone() @staticmethod @custom_bwd def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]: - return output_grad, None, None + if ctx.reduce_mean: + return output_grad / ctx.reduce_size, None, None, None + else: + return output_grad, None, None, None class broadcast_weight_3d_from_diagonal(torch.autograd.Function): diff --git a/colossalai/nn/layer/parallel_3d/layers.py b/colossalai/nn/layer/parallel_3d/layers.py index 
8f0a5058be67..4871d1443917 100644 --- a/colossalai/nn/layer/parallel_3d/layers.py +++ b/colossalai/nn/layer/parallel_3d/layers.py @@ -241,8 +241,6 @@ def reset_parameters(self, weight_initializer, bias_initializer, position_embed_ self.pos_embed.register_hook(self._sync_grad_hook) def forward(self, input_: Tensor) -> Tensor: - input_ = split_batch_3d(input_, self.input_parallel_mode, self.weight_parallel_mode) - weight = broadcast_weight_3d_from_diagonal.apply(self.weight, self.input_parallel_mode, self.weight_parallel_mode, self.output_parallel_mode) output = F.conv2d(input_, weight, self.bias, stride=self.patch_size) @@ -302,8 +300,6 @@ def _fill_padding_idx_with_zero(self) -> None: self.weight[self.padding_idx].fill_(0) def forward(self, input_: Tensor) -> Tensor: - input_ = split_batch_3d(input_, self.input_parallel_mode, self.weight_parallel_mode) - weight = broadcast_weight_3d_from_diagonal.apply(self.weight, self.input_parallel_mode, self.weight_parallel_mode, self.output_parallel_mode) output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs) diff --git a/colossalai/nn/layer/utils/__init__.py b/colossalai/nn/layer/utils/__init__.py index e791d9ecb2e9..7e999ee82149 100644 --- a/colossalai/nn/layer/utils/__init__.py +++ b/colossalai/nn/layer/utils/__init__.py @@ -1 +1,7 @@ -from .common import * \ No newline at end of file +from .common import (ACT2FN, CheckpointModule, _ntuple, divide, get_tensor_parallel_mode, + set_tensor_parallel_attribute_by_partition, set_tensor_parallel_attribute_by_size, to_2tuple) + +__all__ = [ + 'CheckpointModule', 'divide', 'ACT2FN', 'set_tensor_parallel_attribute_by_size', + 'set_tensor_parallel_attribute_by_partition', 'get_tensor_parallel_mode', '_ntuple', 'to_2tuple' +] diff --git a/colossalai/nn/layer/utils/common.py b/colossalai/nn/layer/utils/common.py index 83c3da7bc48b..734aa5bfa54a 100644 --- a/colossalai/nn/layer/utils/common.py +++ b/colossalai/nn/layer/utils/common.py @@ -7,7 +7,7 @@ import numpy as np import torch -from colossalai.constants import IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_MODE +from colossalai.constants import (IS_TENSOR_PARALLEL, NUM_PARTITIONS, TENSOR_PARALLEL_MODE) from colossalai.utils import checkpoint from torch import Tensor, nn diff --git a/colossalai/nn/loss/loss_2d.py b/colossalai/nn/loss/loss_2d.py index aeb798201c1e..7aef949f660f 100644 --- a/colossalai/nn/loss/loss_2d.py +++ b/colossalai/nn/loss/loss_2d.py @@ -1,4 +1,4 @@ -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization from colossalai.registry import LOSSES from torch.nn.functional import cross_entropy @@ -20,11 +20,8 @@ def __init__(self, reduction=True, *args, **kwargs): self.loss_kwargs = kwargs def forward(self, logits, targets): - batch_size = targets.size(0) - targets = split_batch_2d(targets) - loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs) + loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) if self.reduction_mean: - loss = loss.sum() - loss = reduce_by_batch_2d.apply(loss) - loss /= batch_size + loss = loss.mean() + loss = reduce_by_batch_2d.apply(loss, True) return loss diff --git a/colossalai/nn/loss/loss_2p5d.py b/colossalai/nn/loss/loss_2p5d.py index 4f11b71759d9..d7596d9245b6 100644 --- a/colossalai/nn/loss/loss_2p5d.py +++ 
b/colossalai/nn/loss/loss_2p5d.py @@ -1,4 +1,4 @@ -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization from colossalai.registry import LOSSES from torch.nn.functional import cross_entropy @@ -19,11 +19,8 @@ def __init__(self, reduction=True, *args, **kwargs): self.loss_kwargs = kwargs def forward(self, logits, targets): - batch_size = targets.size(0) - targets = split_batch_2p5d(targets) - loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs) + loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) if self.reduction_mean: - loss = loss.sum() - loss = reduce_by_batch_2p5d.apply(loss) - loss /= batch_size + loss = loss.mean() + loss = reduce_by_batch_2p5d.apply(loss, True) return loss diff --git a/colossalai/nn/loss/loss_3d.py b/colossalai/nn/loss/loss_3d.py index d5431dabc5f7..59b6ffeeb88d 100644 --- a/colossalai/nn/loss/loss_3d.py +++ b/colossalai/nn/loss/loss_3d.py @@ -1,11 +1,10 @@ from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_batch_3d +from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from colossalai.registry import LOSSES from torch.nn.functional import cross_entropy from torch.nn.modules.loss import _Loss - @LOSSES.register_module class CrossEntropyLoss3D(_Loss): """Cross entropy loss for 3D parallelism @@ -28,11 +27,8 @@ def __init__(self, reduction=True, *args, **kwargs): self.loss_kwargs = kwargs def forward(self, logits, targets): - batch_size = targets.size(0) - targets = split_batch_3d(targets, self.input_parallel_mode, self.weight_parallel_mode) - loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs) + loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs) if self.reduction_mean: - loss = loss.sum() - loss = reduce_by_batch_3d.apply(loss, self.input_parallel_mode, self.weight_parallel_mode) - loss /= batch_size + loss = loss.mean() + loss = reduce_by_batch_3d.apply(loss, self.input_parallel_mode, self.weight_parallel_mode, True) return loss diff --git a/colossalai/nn/metric/accuracy_2d.py b/colossalai/nn/metric/accuracy_2d.py index 1026a52e2272..cc207b02cee7 100644 --- a/colossalai/nn/metric/accuracy_2d.py +++ b/colossalai/nn/metric/accuracy_2d.py @@ -1,5 +1,5 @@ import torch -from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d +from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d from torch import nn from ._utils import calc_acc @@ -11,7 +11,6 @@ def __init__(self): def forward(self, logits, targets): with torch.no_grad(): - targets = split_batch_2d(targets) correct = calc_acc(logits, targets) correct = reduce_by_batch_2d.apply(correct) return correct diff --git a/colossalai/nn/metric/accuracy_2p5d.py b/colossalai/nn/metric/accuracy_2p5d.py index 98373cbfb922..90dc4af26845 100644 --- a/colossalai/nn/metric/accuracy_2p5d.py +++ b/colossalai/nn/metric/accuracy_2p5d.py @@ -1,5 +1,5 @@ import torch -from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d +from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d from torch import nn from ._utils import calc_acc @@ -11,7 +11,6 @@ def __init__(self): 
def forward(self, logits, targets): with torch.no_grad(): - targets = split_batch_2p5d(targets) correct = calc_acc(logits, targets) correct = reduce_by_batch_2p5d.apply(correct) return correct diff --git a/colossalai/nn/metric/accuracy_3d.py b/colossalai/nn/metric/accuracy_3d.py index f717b9fb2a69..576800510e9f 100644 --- a/colossalai/nn/metric/accuracy_3d.py +++ b/colossalai/nn/metric/accuracy_3d.py @@ -1,6 +1,6 @@ import torch from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D -from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_batch_3d +from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env from torch import nn @@ -15,7 +15,6 @@ def __init__(self): def forward(self, logits, targets): with torch.no_grad(): - targets = split_batch_3d(targets, self.input_parallel_mode, self.weight_parallel_mode) correct = calc_acc(logits, targets) correct = reduce_by_batch_3d.apply(correct, self.input_parallel_mode, self.weight_parallel_mode) return correct diff --git a/colossalai/trainer/hooks/_metric_hook.py b/colossalai/trainer/hooks/_metric_hook.py index bbf66a6fdb21..3489298807d2 100644 --- a/colossalai/trainer/hooks/_metric_hook.py +++ b/colossalai/trainer/hooks/_metric_hook.py @@ -173,7 +173,7 @@ def reset(self) -> None: self.accumulated_sum.zero_() self.accumulated_correct.zero_() - def update(self, logits, targets) -> None: + def update(self, logits, targets, batch_size) -> None: """Updates last step accuracy and accumulated accuracy with current logits and labels. It expects the output has logits and labels. @@ -187,7 +187,7 @@ def update(self, logits, targets) -> None: # update correct = self.acc(logits, targets) - self.last_step_sum.fill_(targets.size(0)) + self.last_step_sum.fill_(batch_size) self.last_step_correct.fill_(correct) self.accumulated_sum += self.last_step_sum self.accumulated_correct += self.last_step_correct @@ -296,7 +296,8 @@ def before_test(self, trainer): def after_test_iter(self, trainer, logits, targets, *args): if self._is_stage_to_compute: - self.metric.update(logits, targets) + batch_size = trainer.schedule.batch_size + self.metric.update(logits, targets, batch_size) class ThroughputMetric(Metric): @@ -313,10 +314,8 @@ def reset(self) -> None: self.last_step_num_samples.zero_() self.last_step_used_time.zero_() - def update(self, tensor, time) -> None: - if isinstance(tensor, (list, tuple)): - tensor = tensor[0] - self.last_step_num_samples.fill_(tensor.size(0)) + def update(self, num_samples, time) -> None: + self.last_step_num_samples.fill_(num_samples) self.last_step_used_time.fill_(time) self.accumulated_num_samples += self.last_step_num_samples self.accumulated_used_time += self.last_step_used_time @@ -354,11 +353,11 @@ def after_hook_is_attached(self, trainer): def before_train_epoch(self, trainer): self.metric.reset() - def after_train_iter(self, trainer, logits, targets, *args): - self.metric.update(targets, trainer._timer.get_timer('Train-step').get_elapsed_time()) + def after_train_iter(self, trainer, *args): + self.metric.update(trainer.schedule.batch_size, trainer._timer.get_timer('Train-step').get_elapsed_time()) def before_test(self, trainer): self.metric.reset() - def after_test_iter(self, trainer, logits, targets, *args): - self.metric.update(targets, trainer._timer.get_timer('Test-step').get_elapsed_time()) + def after_test_iter(self, trainer, *args): + self.metric.update(trainer.schedule.batch_size, 
trainer._timer.get_timer('Test-step').get_elapsed_time()) diff --git a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py index 71f560e23736..a472bf0ee090 100644 --- a/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py +++ b/tests/test_data_pipeline_tensor_parallel/test_cifar_with_data_pipeline_tensor.py @@ -78,7 +78,6 @@ def run_trainer(rank, world_size, port): hook_list = [ hooks.LossHook(), hooks.LRSchedulerHook(lr_scheduler=lr_scheduler, by_epoch=False), - hooks.AccuracyHook(accuracy_func=Accuracy()), hooks.LogMetricByEpochHook(logger), ]
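To summarize the loss and metric changes in this patch: the 2D/2.5D/3D cross-entropy losses now compute an unreduced per-sample loss on the local batch shard, take its mean, and pass reduce_mean=True to reduce_by_batch_* so that mean is averaged across the parallel group, replacing the old split-targets/sum/divide-by-batch-size pattern. The sketch below reproduces that averaging pattern with plain torch.distributed; it assumes a process group is already initialized, and ReduceByBatch / parallel_cross_entropy are illustrative names standing in for Colossal-AI's ParallelMode groups rather than the library's API.

import torch
import torch.distributed as dist
from torch.nn.functional import cross_entropy


class ReduceByBatch(torch.autograd.Function):
    """All-reduce a scalar over the group that shares the batch split; optionally average it."""

    @staticmethod
    def forward(ctx, input_, reduce_mean=False, group=None):
        output = input_.clone()
        dist.all_reduce(output, group=group)          # sum of local means across ranks
        ctx.reduce_mean = reduce_mean
        if reduce_mean:
            ctx.reduce_size = dist.get_world_size(group)
            return output / ctx.reduce_size           # mean of the local means
        return output

    @staticmethod
    def backward(ctx, grad_output):
        if ctx.reduce_mean:
            return grad_output / ctx.reduce_size, None, None
        return grad_output, None, None


def parallel_cross_entropy(logits, targets, group=None):
    # Per-sample loss on this rank's shard, local mean, then group-wide mean.
    loss = cross_entropy(logits, targets, reduction='none').mean()
    return ReduceByBatch.apply(loss, True, group)

Averaging the local means matches the old global sum divided by the global batch size only when every rank holds the same number of samples, which the even torch.chunk split in split_tensor_* is meant to guarantee; that is the design assumption behind switching reduction='sum' to reduction='none' plus reduce_mean=True.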