Update layer integration documentations (hpcaitech#108)
Update the documentations of layer integration

Update _log_hook.py

Update _operation.py
BoxiangW authored Jan 10, 2022
1 parent 3a61d78 commit 4a3d344
Showing 25 changed files with 1,019 additions and 97 deletions.
8 changes: 8 additions & 0 deletions colossalai/nn/layer/colossalai_layer/dropout.py
@@ -9,6 +9,14 @@


class Dropout(nn.Module):
"""
Dropout layer of colossalai
:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults to ``False``
:type inplace: bool, optional
"""
def __init__(self, p: float = 0.5, inplace: bool = False) -> None:
super().__init__()
self.tensor_parallel = get_tensor_parallel_mode()
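A minimal usage sketch of the Dropout wrapper documented above, assuming a ColossalAI parallel context has already been initialized so that get_tensor_parallel_mode() can resolve the tensor-parallel mode:

```python
import torch
from colossalai.nn.layer.colossalai_layer.dropout import Dropout

# Assumes colossalai has already been launched, so the global parallel
# context (and therefore the tensor-parallel mode) is available.
drop = Dropout(p=0.1)   # p defaults to 0.5, inplace defaults to False
x = torch.randn(4, 16)
y = drop(x)             # randomly zeroes elements with probability p during training
```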
36 changes: 36 additions & 0 deletions colossalai/nn/layer/colossalai_layer/embedding.py
@@ -24,6 +24,20 @@


class Embedding(nn.Module):
"""
Embedding for colossalai
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
@@ -63,6 +77,28 @@ def forward(self, *args):


class PatchEmbedding(nn.Module):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The initializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
"""
def __init__(self,
img_size: int,
patch_size: int,
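A hedged sketch of the two embedding modules documented above, using only the constructor arguments listed in the docstrings; the shapes in the comments follow the usual token/ViT conventions and an initialized parallel context is assumed:

```python
import torch
from colossalai.nn.layer.colossalai_layer.embedding import Embedding, PatchEmbedding

# Token embedding: maps integer ids to embedding_dim-sized vectors.
embed = Embedding(num_embeddings=30522, embedding_dim=768)
tokens = torch.randint(0, 30522, (2, 128))
token_vectors = embed(tokens)          # roughly (2, 128, 768)

# ViT-style patch embedding: splits a 224x224 RGB image into 16x16 patches.
patch_embed = PatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
images = torch.randn(2, 3, 224, 224)
patch_tokens = patch_embed(images)     # flattened patch tokens when flatten=True
```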
32 changes: 32 additions & 0 deletions colossalai/nn/layer/colossalai_layer/linear.py
@@ -25,6 +25,22 @@


class Linear(nn.Module):
"""
Linear layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
out_features: int,
@@ -64,6 +80,22 @@ def forward(self, *args):


class Classifier(nn.Module):
"""
Classifier layer of colossalai
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of total classes for the dataset
:type num_classes: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(
self,
in_features: int,
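A short sketch of the Linear and Classifier wrappers documented above; both dispatch to a parallel-mode-specific implementation, so an initialized parallel context is assumed:

```python
import torch
from colossalai.nn.layer.colossalai_layer.linear import Linear, Classifier

proj = Linear(in_features=768, out_features=3072)   # bias=True by default
head = Classifier(in_features=768, num_classes=10)

x = torch.randn(8, 768)
hidden = proj(x)   # (8, 3072)
logits = head(x)   # (8, 10), one score per class
```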
13 changes: 13 additions & 0 deletions colossalai/nn/layer/colossalai_layer/normalization.py
@@ -15,6 +15,19 @@


class LayerNorm(nn.Module):
r"""
Layer Normalization for colossalai
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
super().__init__()
tensor_parallel = get_tensor_parallel_mode()
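A minimal sketch of the LayerNorm wrapper documented above, normalizing over a last dimension of size 768 (parallel context assumed to be initialized):

```python
import torch
from colossalai.nn.layer.colossalai_layer.normalization import LayerNorm

norm = LayerNorm(768, eps=1e-05)   # normalized_shape=768: normalize over the last dimension
x = torch.randn(4, 128, 768)
y = norm(x)                        # same shape; zero mean / unit variance over the last dim
```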
12 changes: 12 additions & 0 deletions colossalai/nn/layer/parallel_1d/_operation.py
@@ -7,6 +7,18 @@


class FusedLayerNormAffineFunction1D(torch.autograd.Function):
r"""
Layernorm
:param input: input matrix
:param weight: weight matrix
:param bias: bias matrix
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:param eps: a value added to the denominator for numerical stability
"""

@staticmethod
def forward(ctx, input, weight, bias, normalized_shape, eps):
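The forward signature shown in the hunk is forward(ctx, input, weight, bias, normalized_shape, eps), so the function is invoked through apply. A hedged sketch, assuming the fused layer-norm CUDA extension is built and that normalized_shape is accepted as a tuple:

```python
import torch
from colossalai.nn.layer.parallel_1d._operation import FusedLayerNormAffineFunction1D

hidden = 768
x = torch.randn(4, 128, hidden, device='cuda', requires_grad=True)
weight = torch.ones(hidden, device='cuda', requires_grad=True)
bias = torch.zeros(hidden, device='cuda', requires_grad=True)

# Passing normalized_shape as a tuple is an assumption; eps matches the documented parameter.
out = FusedLayerNormAffineFunction1D.apply(x, weight, bias, (hidden,), 1e-5)
out.sum().backward()   # gradients flow back to x, weight and bias through the fused kernel
```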
30 changes: 26 additions & 4 deletions colossalai/nn/layer/parallel_1d/_utils.py
@@ -76,7 +76,12 @@ def _gather(input_, parallel_mode, dim=-1):


class _ReduceGrad(torch.autograd.Function):
"""Pass the input to the model parallel region."""
"""
Pass the input to the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
"""
@staticmethod
def symbolic(graph, input_):
return input_
@@ -92,7 +97,12 @@ def backward(ctx, grad_output):


class _ReduceInput(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
"""
All-reduce the input from the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
"""
@staticmethod
def symbolic(graph, input_):
return _reduce(input_)
@@ -107,7 +117,13 @@ def backward(ctx, grad_output):


class _SplitForwardGatherBackward(torch.autograd.Function):
"""Split the input and keep only the corresponding chuck to the rank."""
"""
Split the input and keep only the corresponding chunk for the rank.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
"""
@staticmethod
def symbolic(graph, input_):
return _split(input_)
@@ -124,7 +140,13 @@ def backward(ctx, grad_output):


class _GatherForwardSplitBackward(torch.autograd.Function):
"""Gather the input from model parallel region and concatinate."""
"""
Gather the input from the model parallel region and concatenate.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
"""
@staticmethod
def symbolic(graph, input_):
return _gather(input_)
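These four functions implement the usual 1D tensor-parallel communication pairs: an all-reduce in one direction is an identity in the other, and a split in the forward pass becomes a gather in the backward pass (and vice versa). A hedged sketch of calling one of them directly; the positional order of apply is taken from the documented parameters and, like the use of ParallelMode.PARALLEL_1D, is an assumption:

```python
import torch
from colossalai.context import ParallelMode
from colossalai.nn.layer.parallel_1d._utils import _GatherForwardSplitBackward

# Each rank holds a shard of the last dimension; the forward pass gathers the shards
# into the full tensor and the backward pass splits the gradient back to the shard.
local_shard = torch.randn(4, 128)   # this rank's slice of the last dimension
full_tensor = _GatherForwardSplitBackward.apply(local_shard, ParallelMode.PARALLEL_1D, -1)
```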
75 changes: 70 additions & 5 deletions colossalai/nn/layer/parallel_1d/layers.py
@@ -26,6 +26,24 @@

@LAYERS.register_module
class Linear1D(torch.nn.Module):
"""
Linear layer for 1D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
:type skip_bias_add: bool, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
out_features: int,
@@ -70,8 +88,24 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class Classifier1D(ParallelLayer):
"""RowLinear with given weight"""

"""RowLinear with given weight
Classifier of 1D parallelism
:param in_features: size of input features
:type in_features: int
:param num_classes: number of classes in the dataset
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The initializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
"""
def __init__(self,
in_features: int,
num_classes: int,
Expand Down Expand Up @@ -144,7 +178,7 @@ class Linear1D_Col(ParallelLayer):
:type in_features: int
:param output_size: second dimension of matrix A.
:type output_size: int
:param bias: If true, add bias, defaults to True
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
@@ -228,7 +262,7 @@ class Linear1D_Row(ParallelLayer):
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
@@ -303,7 +337,16 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class MixedFusedLayerNorm1D(torch.nn.Module):
""" Experimental
r"""
Layer Normalization for 1D parallelism
:param normalized_shape: input shape from an expected input
of size :math:`[* \times \text{normalized\_shape}[0] \times \text{normalized\_shape}[1] \times \ldots \times \text{normalized\_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
:type normalized_shape: int
:param eps: a value added to the denominator for numerical stability, defaults to 1e-05
:type eps: float, optional
"""

def __init__(self, normalized_shape, eps=1e-5):
@@ -327,6 +370,20 @@ def forward(self, input):

@LAYERS.register_module
class Embedding1D(ParallelLayer):
"""
Embedding for 1D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The initializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
@@ -377,6 +434,14 @@ def forward(self, input_: Tensor) -> Tensor:

@LAYERS.register_module
class Dropout1D(ParallelLayer):
"""
Dropout layer of 1D parallelism
:param p: dropout rate, defaults to 0.5
:type p: float, optional
:param inplace: If set to ``True``, will do this operation in-place, defaults to ``False``
:type inplace: bool, optional
"""
def __init__(self, p: float = 0.5, inplace: bool = False):
super().__init__()
self.parallel_input = get_parallel_input()
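A hedged end-to-end sketch combining the 1D layers documented in this file; it assumes colossalai has been launched with a 1D tensor-parallel configuration (e.g. parallel=dict(tensor=dict(mode='1d', size=world_size))) and uses only the constructor arguments listed in the docstrings:

```python
import torch
from colossalai.nn.layer.parallel_1d.layers import Embedding1D, Linear1D, Dropout1D

# A tiny tensor-parallel block: embedding -> projection -> dropout.
embed = Embedding1D(num_embeddings=30522, embedding_dim=768)
proj = Linear1D(in_features=768, out_features=3072)   # skip_bias_add left at its default
drop = Dropout1D(p=0.1)

tokens = torch.randint(0, 30522, (2, 128), device='cuda')
hidden = drop(proj(embed(tokens)))   # (2, 128, 3072); sharding follows the 1D parallel mode
```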