[profiler] add MemProfiler (hpcaitech#356)
* add memory trainer hook
* fix bug
* add memory trainer hook
* fix import bug
* fix import bug
* add trainer hook
* fix hpcaitech#370 git log bug
* modify `to_tensorboard` function to support better output
* remove useless output
* change the name of `MemProfiler`
* complete memory profiler
* replace error with warning
* finish trainer hook
* modify interface of MemProfiler
* modify `__init__.py` in profiler
* remove unnecessary pass statement
* add usage to doc string
* add usage to trainer hook
* new location to store temp data file
Jie Zhu authored on Mar 29, 2022
1 parent fb841dd · commit 73d3661
Showing 8 changed files with 136 additions and 13 deletions.
New file (trainer hook, `MemTraceHook`):

@@ -0,0 +1,44 @@
```python
from colossalai.registry import HOOKS
from torch import Tensor
from colossalai.trainer.hooks import BaseHook
from colossalai.utils.memory_tracer import AsyncMemoryMonitor


@HOOKS.register_module
class MemTraceHook(BaseHook):
    """Saves memory stats and passes them to trainer states.

    This hook records memory usage info and stores it in ``trainer.states``.
    Use it like any other trainer hook and fetch the data from
    ``trainer.states['metrics'][mode]``.
    """

    def __init__(self, priority: int = 0) -> None:
        super().__init__(priority=priority)
        self._memory_monitor = AsyncMemoryMonitor()

    def after_hook_is_attached(self, trainer):
        # Initialize the metric entries for both modes
        trainer.states['metrics']['train'] = self._memory_monitor.state_dict
        trainer.states['metrics']['test'] = self._memory_monitor.state_dict

    def before_train_iter(self, trainer):
        self._memory_monitor.start()
        return super().before_train_iter(trainer)

    def after_train_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
        self._memory_monitor.finish()
        trainer.states['metrics']['train'] = self._memory_monitor.state_dict
        trainer.states['metrics']['test'] = self._memory_monitor.state_dict
        return super().after_train_iter(trainer, output, label, loss)

    def before_test_iter(self, trainer):
        self._memory_monitor.start()
        return super().before_test_iter(trainer)

    def after_test_iter(self, trainer, output: Tensor, label: Tensor, loss: Tensor):
        self._memory_monitor.finish()
        trainer.states['metrics']['train'] = self._memory_monitor.state_dict
        trainer.states['metrics']['test'] = self._memory_monitor.state_dict
        return super().after_test_iter(trainer, output, label, loss)
```
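For context, here is a minimal usage sketch for the new hook. It is hypothetical: the `engine`, `train_loader`, and the hook's import path are assumptions, since the diff does not show where the file lives, and the exact `Trainer.fit` arguments may differ across versions.

```python
# Hypothetical sketch: attaching MemTraceHook to a ColossalAI Trainer.
# Engine/dataloader construction is elided; the import path is assumed.
from colossalai.trainer import Trainer
from colossalai.trainer.hooks import MemTraceHook  # assumed export location

trainer = Trainer(engine=engine)
trainer.fit(
    train_dataloader=train_loader,
    epochs=1,
    hooks=[MemTraceHook(priority=0)],  # records memory stats each iteration
)

# The hook mirrors the monitor's state into trainer.states under both modes
print(trainer.states['metrics']['train'])
```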
Updated `__init__.py` in the profiler package:

@@ -1,3 +1,6 @@
```diff
 from .comm_profiler import CommProfiler
 from .pcie_profiler import PcieProfiler
-from .prof_utils import ProfilerContext
+from .prof_utils import ProfilerContext, BaseProfiler
+from .mem_profiler import MemProfiler
+
+__all__ = ['BaseProfiler', 'CommProfiler', 'PcieProfiler', 'MemProfiler', 'ProfilerContext']
```
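With the new re-export in place, the profiler can be imported straight from the package, e.g.:

```python
from colossalai.utils.profiler import BaseProfiler, MemProfiler, ProfilerContext
```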
New file `mem_profiler.py` (the `MemProfiler` wrapper):

@@ -0,0 +1,50 @@
```python
from pathlib import Path
from colossalai.engine import Engine
from torch.utils.tensorboard import SummaryWriter
from colossalai.engine.ophooks import MemTracerOpHook
from colossalai.utils.profiler import BaseProfiler


class MemProfiler(BaseProfiler):
    """Wrapper of MemTracerOpHook, used to show GPU memory usage through each iteration.

    To use this profiler, you need to pass an ``engine`` instance; the usage is
    the same as for CommProfiler::

        mm_prof = MemProfiler(engine)
        with ProfilerContext([mm_prof]) as prof:
            writer = SummaryWriter("mem")
            engine.train()
            ...
            prof.to_file("./log")
            prof.to_tensorboard(writer)
    """

    def __init__(self, engine: Engine, warmup: int = 50, refreshrate: int = 10) -> None:
        super().__init__(profiler_name="MemoryProfiler", priority=0)
        self._mem_tracer = MemTracerOpHook(warmup=warmup, refreshrate=refreshrate)
        self._engine = engine

    def enable(self) -> None:
        # Register the memory-tracing op hook on the engine
        self._engine.add_hook(self._mem_tracer)

    def disable(self) -> None:
        self._engine.remove_hook(self._mem_tracer)

    def to_tensorboard(self, writer: SummaryWriter) -> None:
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        # enumerate yields (index, value): log each memory sample against its step
        for i, info in enumerate(stats):
            writer.add_scalar("memory_usage/GPU", info, i)

    def to_file(self, data_file: Path) -> None:
        self._mem_tracer.save_results(data_file)

    def show(self) -> None:
        stats = self._mem_tracer.async_mem_monitor.state_dict['mem_stats']
        print(stats)
```
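Spelling the docstring example out as a runnable sketch (hedged: `engine` is assumed to be an already-initialized ColossalAI engine, and the training loop itself is elided):

```python
# Sketch of the documented MemProfiler flow; `engine` is assumed to come from
# colossalai.initialize(...) and the actual training iterations are elided.
from torch.utils.tensorboard import SummaryWriter
from colossalai.utils.profiler import MemProfiler, ProfilerContext

mm_prof = MemProfiler(engine)
with ProfilerContext([mm_prof]) as prof:
    writer = SummaryWriter("mem")    # TensorBoard log directory
    engine.train()                   # put the engine into training mode
    # ... run forward/backward iterations under the profiler ...
    prof.to_file("./log")            # persist raw memory samples
    prof.to_tensorboard(writer)      # one scalar per sample under "memory_usage/GPU"
```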