Commit 569357f: add pytorch hooks (hpcaitech#179)
* add pytorch hooks
fix hpcaitech#175

* remove licenses in src code

* add gpu memory tracer

* replacing print with logger in ophooks.
feifeibear authored Jan 25, 2022
1 parent 708404d commit 569357f
Showing 9 changed files with 480 additions and 135 deletions.
12 changes: 7 additions & 5 deletions colossalai/builder/__init__.py
@@ -1,10 +1,12 @@
from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_layer,
build_loss, build_hooks, build_dataset, build_transform, build_data_sampler,
build_gradient_handler)
from .builder import (build_schedule, build_lr_scheduler, build_model,
build_optimizer, build_layer, build_loss, build_hooks,
build_dataset, build_transform, build_data_sampler,
build_gradient_handler, build_ophooks)
from .pipeline import build_pipeline_model, build_pipeline_model_from_cfg

__all__ = [
'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer',
'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler',
'build_gradient_handler', 'build_pipeline_model', 'build_pipeline_model_from_cfg'
'build_layer', 'build_loss', 'build_hooks', 'build_dataset',
'build_transform', 'build_data_sampler', 'build_gradient_handler',
'build_pipeline_model', 'build_pipeline_model_from_cfg', 'build_ophooks'
]
32 changes: 23 additions & 9 deletions colossalai/builder/builder.py
@@ -27,7 +27,7 @@ def build_from_registry(config, registry: Registry):
"""Returns an object constructed from `config`, the type of the object
is specified by `registry`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param registry: A registry specifying the type of the return object
@@ -50,7 +50,8 @@ def build_from_registry(config, registry: Registry):
obj = registry.get_module(mod_type)(**config_)
except Exception as e:
print(
f'An error occurred when building {mod_type} from registry {registry.name}', flush=True)
f'An error occurred when building {mod_type} from registry {registry.name}',
flush=True)
raise e

return obj
@@ -69,7 +70,7 @@ def build_layer(config):


def build_loss(config):
"""Returns a loss function object of :class:`torch.autograd.Function` constructed
"""Returns a loss function object of :class:`torch.autograd.Function` constructed
from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
@@ -94,7 +95,7 @@ def build_model(config):


def build_dataset(config):
"""Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
"""Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
@@ -107,13 +108,13 @@ def build_dataset(config):


def build_optimizer(config, model):
"""Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
"""Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
'model' and 'params'.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param model: A model containing parameters for the optimizer
:type model: :class:`nn.Module`
:return: An object of :class:`torch.optim.Optimizer`
:rtype: :class:`torch.optim.Optimizer`
@@ -159,6 +160,19 @@ def build_hooks(config, trainer):
return build_from_registry(config_, HOOKS)


def build_ophooks(config):
"""Returns a hook object of :class:`BaseOpHook` constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`colossalai.trainer.hooks.BaseOpHook`
:rtype: :class:`colossalai.trainer.hooks.BaseOpHook`
"""
config_ = config.copy()
return build_from_registry(config_, OPHOOKS)


def build_transform(config):
"""Returns a transformation object of :class:`torchvision.transforms` constructed
from `config`.
@@ -191,10 +205,10 @@ def build_data_sampler(config, dataset):


def build_lr_scheduler(config, optimizer):
"""Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
"""Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:param optimizer: An optimizer object containing parameters for the learning rate
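
For orientation, here is a minimal usage sketch of the new build_ophooks helper. It is not part of this commit; the hook name and its argument-free construction are assumptions about how MemTracerOpHook is registered in the OPHOOKS registry.

# Hypothetical sketch: build an op hook from a config dict.
# Assumes MemTracerOpHook is registered under OPHOOKS and needs no required
# constructor arguments; adjust the config to match the real signature.
from colossalai.builder import build_ophooks

mem_tracer = build_ophooks(dict(type='MemTracerOpHook'))
# mem_tracer is now a BaseOpHook instance that can be passed to the Engine
# through its new ophook_list argument.
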
22 changes: 15 additions & 7 deletions colossalai/engine/_base_engine.py
@@ -1,18 +1,18 @@
#!/usr/bin/env python
# -*- encoding: utf-8 -*-


from typing import List
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer

from colossalai.logging import get_dist_logger
from torch import Tensor
from colossalai.engine.ophooks import register_ophooks_recursively, BaseOpHook


class Engine:
"""Basic engine class for training and evaluation. It runs a specific process method
"""Basic engine class for training and evaluation. It runs a specific process method
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls a iteration in training.
@@ -29,15 +29,14 @@ class Engine:
:param verbose: whether to display log info
:type verbose: bool
"""

def __init__(self,
model: Module,
optimizer: Optimizer,
criterion: _Loss,
gradient_handlers: List = None,
clip_grad_norm: float = 0.0,
verbose: bool = True
):
ophook_list: List[BaseOpHook] = [],
verbose: bool = True):
self._model = model
self._optimizer = optimizer
self._criterion = criterion
@@ -54,6 +53,9 @@ def __init__(self,
else:
self._gradient_handlers = []

self._ophook_list = ophook_list
register_ophooks_recursively(self._model, self._ophook_list)

@property
def model(self):
"""Model attached to the engine"""
@@ -87,7 +89,10 @@ def backward(self, loss: Tensor):
:param loss: Loss value computed by a loss function
:type loss: :class:`torch.Tensor`
"""
return self.optimizer.backward(loss)
ret = self.optimizer.backward(loss)
for ophook in self._ophook_list:
ophook.post_iter()
return ret

def backward_by_grad(self, tensor, grad):
"""Start backward propagation given the gradient of the output tensor
@@ -97,7 +102,10 @@ def backward_by_grad(self, tensor, grad):
:param grad: Gradient passed back to the output
:type grad: :class:`torch.Tensor`
"""
return self.optimizer.backward_by_grad(tensor, grad)
ret = self.optimizer.backward_by_grad(tensor, grad)
for ophook in self._ophook_list:
ophook.post_iter()
return ret

def calc_loss(self, *args, **kwargs):
"""Compute the loss value
115 changes: 115 additions & 0 deletions colossalai/engine/ophooks/__init__.py
@@ -0,0 +1,115 @@
from ._base_ophook import BaseOpHook
from ._memtracer_ophook import MemTracerOpHook
import torch
from typing import List

__all__ = ["BaseOpHook", "MemTracerOpHook", "register_ophooks_recursively"]


# Apply a torch.autograd.Function that calls backward_function to every tensor in outputs.
def _apply_to_tensors_only(module, functional, backward_function, outputs):
if type(outputs) is tuple:
touched_outputs = []
for output in outputs:
touched_output = _apply_to_tensors_only(module, functional,
backward_function, output)
touched_outputs.append(touched_output)
return tuple(touched_outputs)
elif type(outputs) is torch.Tensor:
return functional.apply(module, backward_function, outputs)
else:
return outputs


class PreBackwardFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, module, pre_backward_function, outputs):
ctx.module = module
ctx.pre_backward_function = pre_backward_function
module.applied_pre_backward = False
outputs = outputs.detach()
return outputs

@staticmethod
def backward(ctx, *args):
ctx.pre_backward_function(ctx.module)
return (None, None) + args


class PostBackwardFunction(torch.autograd.Function):
@staticmethod
def forward(ctx, module, pre_backward_function, output):
ctx.module = module
output = output.detach()
ctx.pre_backward_function = pre_backward_function
return output

@staticmethod
def backward(ctx, *args):
"""
Args:
    *args: activation gradients from the next layer.
Returns:
    Gradients of the input activations.
"""
ctx.pre_backward_function(ctx.module)
return (None, None) + args


def register_ophooks_recursively(module: torch.nn.Module,
ophook_list: List[BaseOpHook] = None,
name: str = ""):
r"""Recursilvely register pre/post hooks for all submodules in the module in FWD and BWD."""
assert isinstance(module, torch.nn.Module)
has_children = False
for child_name, child in module.named_children():
register_ophooks_recursively(child, ophook_list, name + child_name)
has_children = True

# Early return for modules that have no parameters or buffers of their own
# (i.e., none outside of their children).
if (len(list(module.named_parameters(recurse=False))) == 0
and len(list(module.named_buffers(recurse=False))) == 0):
return

# Return if the module has children; hooks are only registered on leaf modules.
if has_children:
return

if ophook_list is not None:
for hook in ophook_list:
assert (isinstance(hook, BaseOpHook))

def _pre_forward_module_hook(submodule, *args):
for hook in ophook_list:
assert isinstance(submodule, torch.nn.Module)
hook.pre_fwd_exec(submodule, *args)

def _post_forward_module_hook(submodule, *args):
for hook in ophook_list:
assert isinstance(submodule, torch.nn.Module)
hook.post_fwd_exec(submodule, *args)

def _pre_backward_module_hook(submodule, inputs, output):
def _run_before_backward_function(submodule):
for hook in ophook_list:
assert isinstance(submodule, torch.nn.Module)
hook.pre_bwd_exec(submodule, inputs, output)

return _apply_to_tensors_only(submodule, PreBackwardFunction,
_run_before_backward_function, output)

def _post_backward_module_hook(submodule, inputs):
def _run_after_backward_function(submodule):
for hook in ophook_list:
assert isinstance(submodule, torch.nn.Module)
hook.post_bwd_exec(submodule, inputs)

return _apply_to_tensors_only(submodule, PostBackwardFunction,
_run_after_backward_function, inputs)

module.register_forward_pre_hook(_pre_forward_module_hook)
module.register_forward_hook(_post_forward_module_hook)

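# NOTE: the backward-side hooks are also attached during the forward pass. The
# forward hook below wraps the module outputs in PreBackwardFunction, whose
# backward() calls pre_bwd_exec just before this module's gradients are computed,
# while the forward pre-hook wraps the inputs in PostBackwardFunction, whose
# backward() calls post_bwd_exec just after they are computed.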
module.register_forward_hook(_pre_backward_module_hook)
module.register_forward_pre_hook(_post_backward_module_hook)
29 changes: 29 additions & 0 deletions colossalai/engine/ophooks/_base_ophook.py
@@ -0,0 +1,29 @@
from abc import ABC, abstractmethod
import torch


class BaseOpHook(ABC):
"""This class allows users to add customized operations
before and after the execution of a PyTorch submodule"""
def __init__(self):
pass

@abstractmethod
def pre_fwd_exec(self, module: torch.nn.Module, *args):
pass

@abstractmethod
def post_fwd_exec(self, module: torch.nn.Module, *args):
pass

@abstractmethod
def pre_bwd_exec(self, module: torch.nn.Module, input, output):
pass

@abstractmethod
def post_bwd_exec(self, module: torch.nn.Module, input):
pass

@abstractmethod
def post_iter(self):
pass
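
To show how these pieces fit together, here is a small, self-contained sketch (not part of this commit) that defines a toy op hook, registers it with register_ophooks_recursively, and runs one forward/backward pass. The PrintOpHook class and the tiny model are illustrative assumptions, not code from the repository.

import torch
from colossalai.engine.ophooks import BaseOpHook, register_ophooks_recursively


class PrintOpHook(BaseOpHook):
    """Toy hook that reports which submodule is executing (illustrative only)."""
    def pre_fwd_exec(self, module: torch.nn.Module, *args):
        print(f'before forward of {module.__class__.__name__}')

    def post_fwd_exec(self, module: torch.nn.Module, *args):
        print(f'after forward of {module.__class__.__name__}')

    def pre_bwd_exec(self, module: torch.nn.Module, input, output):
        print(f'before backward of {module.__class__.__name__}')

    def post_bwd_exec(self, module: torch.nn.Module, input):
        print(f'after backward of {module.__class__.__name__}')

    def post_iter(self):
        print('iteration finished')


# Hooks are only attached to leaf modules that own parameters or buffers,
# so the two Linear layers get hooks while the ReLU is skipped.
model = torch.nn.Sequential(torch.nn.Linear(4, 4), torch.nn.ReLU(), torch.nn.Linear(4, 1))
hook = PrintOpHook()
register_ophooks_recursively(model, [hook])

loss = model(torch.randn(2, 4)).sum()
loss.backward()   # the pre/post backward hooks fire during this call
hook.post_iter()  # the Engine now calls this for every hook after backward()
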