Update layer integration documentations (hpcaitech#108)
Update the documentations of layer integration

Update _log_hook.py

Update _operation.py
BoxiangW authored Jan 10, 2022
1 parent 3a61d78 commit 4a3d344
Showing 25 changed files with 1,019 additions and 97 deletions.
8 changes: 8 additions & 0 deletions colossalai/nn/layer/colossalai_layer/dropout.py
@@ -9,6 +9,14 @@


class Dropout(nn.Module):
"""
Dropout layer of colossalai
:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults to ``False``
:type inplace: bool, optional
"""
def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__()
self.tensor_parallel = get_tensor_parallel_mode()
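A minimal usage sketch of the Dropout wrapper documented above, assuming a ColossalAI parallel context has already been initialized so that get_tensor_parallel_mode() can resolve the tensor-parallel mode:

```python
import torch
from colossalai.nn.layer.colossalai_layer.dropout import Dropout

# Assumes colossalai has already been launched, so the global parallel
# context (and therefore the tensor-parallel mode) is available.
drop = Dropout(p=0.1)   # p defaults to 0.5, inplace defaults to False
x = torch.randn(4, 16)
y = drop(x)             # randomly zeroes elements with probability p during training
```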
36 changes: 36 additions & 0 deletions colossalai/nn/layer/colossalai_layer/embedding.py
@@ -24,6 +24,20 @@


class Embedding(nn.Module):
"""
Embedding for colossalai
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
@@ -63,6 +77,28 @@ def forward(self, *args):


class PatchEmbedding(nn.Module):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The initializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
"""
def __init__(self,
img_size: int,
patch_size: int,
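A hedged sketch of the two embedding modules documented above, using only the constructor arguments listed in the docstrings; the shapes in the comments follow the usual token/ViT conventions and an initialized parallel context is assumed:

```python
import torch
from colossalai.nn.layer.colossalai_layer.embedding import Embedding, PatchEmbedding

# Token embedding: maps integer ids to embedding_dim-sized vectors.
embed = Embedding(num_embeddings=30522, embedding_dim=768)
tokens = torch.randint(0, 30522, (2, 128))
token_vectors = embed(tokens)          # roughly (2, 128, 768)

# ViT-style patch embedding: splits a 224x224 RGB image into 16x16 patches.
patch_embed = PatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
images = torch.randn(2, 3, 224, 224)
patch_tokens = patch_embed(images)     # flattened patch tokens when flatten=True
```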
32 changes: 32 additions & 0 deletions colossalai/nn/layer/colossalai_layer/linear.py
@@ -25,6 +25,22 @@


class Linear(nn.Module):
"""
Linear layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
out_features: int,
@@ -64,6 +80,22 @@ def forward(self, *args):


class Classifier(nn.Module):
"""
Classifier layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of total classes for the dataset
:type num_classes: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(
self,
in_features: int,
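A short sketch of the Linear and Classifier wrappers documented above; both dispatch to a parallel-mode-specific implementation, so an initialized parallel context is assumed:

```python
import torch
from colossalai.nn.layer.colossalai_layer.linear import Linear, Classifier

proj = Linear(in_features=768, out_features=3072)   # bias=True by default
head = Classifier(in_features=768, num_classes=10)

x = torch.randn(8, 768)
hidden = proj(x)   # (8, 3072)
logits = head(x)   # (8, 10), one score per class
```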
13 changes: 13 additions & 0 deletions colossalai/nn/layer/colossalai_layer/normalization.py
@@ -15,6 +15,19 @@


class LayerNorm(nn.Module):
r"""
Layer Normalization for colossalai
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
super().__init__()
tensor_parallel = get_tensor_parallel_mode()
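A minimal sketch of the LayerNorm wrapper documented above, normalizing over a last dimension of size 768 (parallel context assumed to be initialized):

```python
import torch
from colossalai.nn.layer.colossalai_layer.normalization import LayerNorm

norm = LayerNorm(768, eps=1e-05)   # normalized_shape=768: normalize over the last dimension
x = torch.randn(4, 128, 768)
y = norm(x)                        # same shape; zero mean / unit variance over the last dim
```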
12 changes: 12 additions & 0 deletions colossalai/nn/layer/parallel_1d/_operation.py
@@ -7,6 +7,18 @@


class FusedLayerNormAffineFunction1D(torch.autograd.Function):
r"""
Layernorm
:param input: input matrix
:param weight: weight matrix
:param bias: bias matrix
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability
"""

@staticmethod
def forward(ctx, input, weight, bias, normalized_shape, eps):
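The forward signature shown in the hunk is forward(ctx, input, weight, bias, normalized_shape, eps), so the function is invoked through apply. A hedged sketch, assuming the fused layer-norm CUDA extension is built and that normalized_shape is accepted as a tuple:

```python
import torch
from colossalai.nn.layer.parallel_1d._operation import FusedLayerNormAffineFunction1D

hidden = 768
x = torch.randn(4, 128, hidden, device='cuda', requires_grad=True)
weight = torch.ones(hidden, device='cuda', requires_grad=True)
bias = torch.zeros(hidden, device='cuda', requires_grad=True)

# Passing normalized_shape as a tuple is an assumption; eps matches the documented parameter.
out = FusedLayerNormAffineFunction1D.apply(x, weight, bias, (hidden,), 1e-5)
out.sum().backward()   # gradients flow back to x, weight and bias through the fused kernel
```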
30 changes: 26 additions & 4 deletions colossalai/nn/layer/parallel_1d/_utils.py
@@ -76,7 +76,12 @@ def _gather(input_, parallel_mode, dim=-1):


class _ReduceGrad(torch.autograd.Function):
"""Pass the input to the model parallel region."""
"""
Pass the input to the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
"""
@staticmethod
def symbolic(graph, input_):
return input_
@@ -92,7 +97,12 @@ def backward(ctx, grad_output):


class _ReduceInput(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
"""
All-reduce the input from the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
"""
@staticmethod
def symbolic(graph, input_):
return _reduce(input_)
@@ -107,7 +117,13 @@ def backward(ctx, grad_output):


class _SplitForwardGatherBackward(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
"""
Split the input and keep only the corresponding chunk for the rank.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
"""
@staticmethod
def symbolic(graph, input_):
return _split(input_)
@@ -124,7 +140,13 @@ def backward(ctx, grad_output):


class _GatherForwardSplitBackward(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
"""
Gather the input from the model parallel region and concatenate.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
"""
@staticmethod
def symbolic(graph, input_):
return _gather(input_)
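These four functions implement the usual 1D tensor-parallel communication pairs: an all-reduce in one direction is an identity in the other, and a split in the forward pass becomes a gather in the backward pass (and vice versa). A hedged sketch of calling one of them directly; the positional order of apply is taken from the documented parameters and, like the use of ParallelMode.PARALLEL_1D, is an assumption:

```python
import torch
from colossalai.context import ParallelMode
from colossalai.nn.layer.parallel_1d._utils import _GatherForwardSplitBackward

# Each rank holds a shard of the last dimension; the forward pass gathers the shards
# into the full tensor and the backward pass splits the gradient back to the shard.
local_shard = torch.randn(4, 128)   # this rank's slice of the last dimension
full_tensor = _GatherForwardSplitBackward.apply(local_shard, ParallelMode.PARALLEL_1D, -1)
```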
75 changes: 70 additions & 5 deletions colossalai/nn/layer/parallel_1d/layers.py
@@ -26,6 +26,24 @@

@LAYERS.register_module
class Linear1D(torch.nn.Module):
"""
Linear layer for 1D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
out_features: int,
@@ -70,8 +88,24 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class Classifier1D(ParallelLayer):
"""RowLinear with given weight"""

"""RowLinear with given weight
Classifier of 1D parallelism
:param in_features: size of input features
:type in_features: int
:param num_classes: number of classes in the dataset
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
num_classes: int,
Expand Down Expand Up @@ -144,7 +178,7 @@ class Linear1D_Col(ParallelLayer):
:type in_features: int
:param output_size: second dimension of matrix A.
:type output_size: int
:param bias: If true, add bias, defaults to True
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
@@ -228,7 +262,7 @@ class Linear1D_Row(ParallelLayer):
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
@@ -303,7 +337,16 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class MixedFusedLayerNorm1D(torch.nn.Module):
""" Experimental
r"""
Layer Normalization for 1D parallelism
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
"""

def __init__(self, normalized_shape, eps=1e-5):
@@ -327,6 +370,20 @@ def forward(self, input):

@LAYERS.register_module
class Embedding1D(ParallelLayer):
"""
Embedding for 1D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
@@ -377,6 +434,14 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class Dropout1D(ParallelLayer):
"""
Dropout layer of 1D parallelism
:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults to ``False``
:type inplace: bool, optional
"""
def __init__(self, p: float = 0.5, inplace: bool = False):
super().__init__()
self.parallel_input = get_parallel_input()
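A hedged end-to-end sketch combining the 1D layers documented in this file; it assumes colossalai has been launched with a 1D tensor-parallel configuration (e.g. parallel=dict(tensor=dict(mode='1d', size=world_size))) and uses only the constructor arguments listed in the docstrings:

```python
import torch
from colossalai.nn.layer.parallel_1d.layers import Embedding1D, Linear1D, Dropout1D

# A tiny tensor-parallel block: embedding -> projection -> dropout.
embed = Embedding1D(num_embeddings=30522, embedding_dim=768)
proj = Linear1D(in_features=768, out_features=3072)   # skip_bias_add left at its default
drop = Dropout1D(p=0.1)

tokens = torch.randint(0, 30522, (2, 128), device='cuda')
hidden = drop(proj(embed(tokens)))   # (2, 128, 3072); sharding follows the 1D parallel mode
```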