update markdown docs (english) (hpcaitech#60)
FrankLeeeee authored Dec 10, 2021
1 parent da01c23 commit 9a04665
Showing 10 changed files with 339 additions and 372 deletions.
63 changes: 49 additions & 14 deletions README.md
@@ -42,21 +42,56 @@ pip install -v --no-cache-dir --global-option="--cuda_ext" .

```python
# ---- removed in this commit: the old Trainer-based quick start ----
import colossalai
from colossalai.trainer import Trainer
from colossalai.core import global_context as gpc

engine, train_dataloader, test_dataloader = colossalai.initialize()

trainer = Trainer(engine=engine,
                  verbose=True)
trainer.fit(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    epochs=gpc.config.num_epochs,
    hooks_cfg=gpc.config.hooks,
    display_progress=True,
    test_interval=5
)

# ---- added in this commit: the new launch + engine quick start ----
import colossalai
from colossalai.utils import get_dataloader


# my_config can be a path to a config file or a dictionary object
# 'localhost' is only for single-node training; you need to specify
# the node name if using multiple nodes
colossalai.launch(
    config=my_config,
    rank=rank,
    world_size=world_size,
    backend='nccl',
    port=29500,
    host='localhost'
)

# build your model
model = ...

# build your dataset, the dataloader will have a distributed data
# sampler by default
train_dataset = ...
train_dataloader = get_dataloader(dataset=train_dataset,
                                  shuffle=True)


# build your optimizer
optimizer = ...

# build your loss function
criterion = ...

# initialize colossalai to get a ready-to-run engine
engine, train_dataloader, _, _ = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_dataloader
)

# start training
engine.train()
for epoch in range(NUM_EPOCHS):
    for data, label in train_dataloader:
        engine.zero_grad()
        output = engine(data)
        loss = engine.criterion(output, label)
        engine.backward(loss)
        engine.step()

```
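For reference, here is a minimal sketch of what `my_config` might contain when passed as a dictionary; the keys shown (`BATCH_SIZE`, `NUM_EPOCHS`) are illustrative placeholders rather than required fields, and a path to a Python config file defining the same variables at module level would work equally well.

```python
# hypothetical config: keys are user-defined and the values are placeholders;
# entries become accessible through gpc.config after launch/initialize
my_config = dict(
    BATCH_SIZE=128,
    NUM_EPOCHS=10,
)
```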

### Write a Simple 2D Parallel Model
4 changes: 3 additions & 1 deletion colossalai/trainer/hooks/__init__.py
@@ -1,13 +1,15 @@
from ._base_hook import BaseHook
from ._checkpoint_hook import SaveCheckpointHook, LoadCheckpointHook
from ._metric_hook import LossHook, Accuracy2DHook, AccuracyHook, MetricHook
from ._metric_hook import (LossHook, Accuracy2DHook, AccuracyHook, MetricHook,
                           Accuracy1DHook, Accuracy2p5DHook, Accuracy3DHook)
from ._log_hook import LogMetricByEpochHook, TensorboardHook, LogTimingByEpochHook, LogMemoryByEpochHook
from ._lr_scheduler_hook import LRSchedulerHook

__all__ = [
'BaseHook', 'MetricHook',
'LoadCheckpointHook', 'SaveCheckpointHook',
'LossHook', 'AccuracyHook', 'Accuracy2DHook',
'Accuracy1DHook', 'Accuracy2p5DHook', 'Accuracy3DHook',
'LogMetricByEpochHook', 'TensorboardHook', 'LogTimingByEpochHook', 'LogMemoryByEpochHook',
'LRSchedulerHook'
]
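As a point of reference, the hooks exported here are typically referenced by name in a hooks config consumed by the `Trainer` (the removed README snippet above passes `gpc.config.hooks` as `hooks_cfg`); the sketch below only illustrates that pattern and is not part of this commit.

```python
# hypothetical hooks config; the dict(type=...) registry pattern mirrors
# the gradient_handlers example shown further down in this commit
hooks = [
    dict(type='LossHook'),
    dict(type='Accuracy2DHook'),
    dict(type='LogMetricByEpochHook'),
]
```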
15 changes: 14 additions & 1 deletion colossalai/utils/data_sampler/data_parallel_sampler.py
@@ -108,7 +108,14 @@ def set_epoch(self, epoch: int) -> None:
self.epoch = epoch


def get_dataloader(dataset, shuffle=False, seed=1024, add_sampler=True, **kwargs):
def get_dataloader(dataset,
                   shuffle=False,
                   seed=1024,
                   add_sampler=True,
                   drop_last=False,
                   pin_memory=False,
                   num_workers=0,
                   **kwargs):
'''Set up a deterministic dataloader (also configures seed workers, samplers, and whether to shuffle or not)
.. note: when pipeline parallel is enabled, shuffle cannot be True
@@ -141,9 +148,15 @@ def seed_worker(worker_id):
        return DataLoader(dataset,
                          worker_init_fn=seed_worker,
                          shuffle=shuffle,
                          drop_last=drop_last,
                          pin_memory=pin_memory,
                          num_workers=num_workers,
                          **_kwargs)
    else:
        return DataLoader(dataset,
                          sampler=sampler,
                          worker_init_fn=seed_worker,
                          drop_last=drop_last,
                          pin_memory=pin_memory,
                          num_workers=num_workers,
                          **_kwargs)
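A brief sketch of how the extended signature might be called (the dataset and the argument values are placeholders; extra keyword arguments such as `batch_size` are forwarded to `DataLoader`):

```python
from colossalai.utils import get_dataloader

# illustrative call only; my_dataset is a placeholder
train_loader = get_dataloader(dataset=my_dataset,
                              shuffle=True,
                              drop_last=True,     # drop the last incomplete batch
                              pin_memory=True,    # page-locked memory for faster host-to-device copies
                              num_workers=4,      # worker subprocesses for loading
                              batch_size=64)      # forwarded to torch's DataLoader via **kwargs
```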
5 changes: 2 additions & 3 deletions docs/add_your_parallel.md
@@ -103,7 +103,7 @@ class YourGradientHandler(BaseGradientHandler):
Afterwards, you can specify the gradient handler you want to use in your configuration file.

```python
dist_initializer = [
gradient_handlers = [
dict(type='YourGradientHandler'),
]
```
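For illustration, here is a rough sketch of what such a handler class could look like; the import path, the registry decorator, the `handle_gradient` method name, and the `self._model` attribute are assumptions about the surrounding API rather than something this diff shows.

```python
import torch.distributed as dist

from colossalai.registry import GRADIENT_HANDLER
from colossalai.engine.gradient_handler import BaseGradientHandler


@GRADIENT_HANDLER.register_module
class YourGradientHandler(BaseGradientHandler):
    """Hypothetical handler: all-reduce every gradient over the default process group."""

    def handle_gradient(self):
        # self._model is assumed to hold the wrapped model; check the
        # BaseGradientHandler definition for the attribute it actually exposes
        for param in self._model.parameters():
            if param.grad is not None:
                dist.all_reduce(param.grad)
```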
@@ -112,5 +112,4 @@ dist_initializer = [

Schedule entails how to execute a forward and backward pass. Currently, Colossal-AI provides pipeline and non-pipeline
schedules. If you want to modify how the forward and backward passes are executed, you can
inherit `colossalai.engine.BaseSchedule` and implement your idea. You can also add your schedule to the engine before
training.
inherit `colossalai.engine.schedule.BaseSchedule` and implement the `forward_backward_step` function.
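To make the new wording concrete, here is a rough sketch of such a subclass; the argument list of `forward_backward_step` is an assumption based on the engine usage shown in the README diff above, not the verified signature.

```python
from colossalai.engine.schedule import BaseSchedule


class MySchedule(BaseSchedule):
    """Hypothetical non-pipeline schedule: one forward and one backward pass per call."""

    def forward_backward_step(self, engine, data_iter, forward_only=False, return_loss=True):
        # assumed signature; check BaseSchedule for the parameters it actually defines
        data, label = next(data_iter)
        output = engine(data)
        loss = engine.criterion(output, label) if return_loss else None
        if not forward_only:
            engine.backward(loss)
        return output, label, loss
```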
36 changes: 25 additions & 11 deletions docs/amp.md
@@ -3,25 +3,39 @@
In Colossal-AI, we have incorporated different implementations of mixed precision training:
1. torch.cuda.amp
2. apex.amp
3. tensor-parallel amp
3. naive amp

The first two rely on the original implementation of [PyTorch](https://pytorch.org/docs/stable/amp.html)
(version 1.6 and above) and [Nvidia Apex](https://github.com/NVIDIA/apex). However, these two methods are not compatible
with tensor parallelism. This is because tensors are split across devices in tensor parallelism; thus, it is required
to communicate among different processes to check if `inf` or `nan` occurs in the whole model weights. For the mixed
precision training with tensor parallelism, we adapted this feature from [Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
(version 1.6 and above) and [Nvidia Apex](https://github.com/NVIDIA/apex). The last method is similar to Apex's O2 optimization level.

Among these methods, apex.amp is not compatible with tensor parallelism. This is because tensors are split across devices
in tensor parallelism; thus, it is required to communicate among different processes to check if `inf` or `nan` occurs in the
whole model's weights. **We modified the torch amp implementation so that it is now compatible with tensor parallelism.**

To use mixed precision training, you can easily specify the `fp16` field in the config file to be True. Currently, PyTorch and
Apex amp cannot be guaranteed to work with tensor and pipeline parallelism, thus, only the last one is recommended if you
are using hybrid parallelism.
Apex amp cannot be guaranteed to work with tensor and pipeline parallelism. We recommend using torch amp as it generally
gives better accuracy than naive amp.

The AMP module is designed to be completely modular and can be used independently of other colossalai modules.
If you wish to use only AMP in your code base without `colossalai.initialize`, you can use `colossalai.amp.convert_to_amp`.

```python
from colossalai.amp import AMP_TYPE

# example of using torch amp
model, optimizer, criterion = colossalai.amp.convert_to_amp(model,
                                                             optimizer,
                                                             criterion,
                                                             AMP_TYPE.TORCH)
```

## PyTorch AMP

PyTorch provides mixed precision training in version 1.6 and above. It provides an easy way to cast data to `fp16` format
while keeping some operations such as reductions in `fp32`. You can configure the gradient scaler in the config file.

```python
from colossalai.engine import AMP_TYPE
from colossalai.amp import AMP_TYPE

fp16=dict(
mode=AMP_TYPE.TORCH,
@@ -43,7 +57,7 @@ will keep batch normalization in `fp32`.
The following code block shows a config file for Apex AMP.

```python
from colossalai.engine import AMP_TYPE
from colossalai.amp import AMP_TYPE

fp16 = dict(
mode=AMP_TYPE.APEX,
@@ -71,10 +85,10 @@ and pipeline parallelism.
The following code block shows a config file for this mode.

```python
from colossalai.engine import AMP_TYPE
from colossalai.amp import AMP_TYPE

fp16 = dict(
mode=AMP_TYPE.PARALLEL,
mode=AMP_TYPE.NAIVE,
# below are the default values
clip_grad=0,
log_num_zeros_in_grad=False,