forked from hpcaitech/ColossalAI
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[utils] add synchronized cuda memory monitor (hpcaitech#740)
Showing 4 changed files with 149 additions and 110 deletions.
@@ -1,4 +1,4 @@
-from .async_memtracer import AsyncMemoryMonitor
+from .memory_monitor import AsyncMemoryMonitor, SyncCudaMemoryMonitor
 from .memstats_collector import MemStatsCollector
 
-__all__ = ['AsyncMemoryMonitor', 'MemStatsCollector']
+__all__ = ['AsyncMemoryMonitor', 'SyncCudaMemoryMonitor', 'MemStatsCollector']
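The re-export above makes the new monitor available from the package itself. A minimal sketch, assuming the package resolves to `colossalai.utils.memory_tracer` (the directory prefix is truncated in the rename below):

# Assumed import path, inferred from the truncated directory names in this diff.
from colossalai.utils.memory_tracer import AsyncMemoryMonitor, SyncCudaMemoryMonitor, MemStatsCollector

monitor = SyncCudaMemoryMonitor()    # now exported alongside the existing AsyncMemoryMonitor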
245 changes: 142 additions & 103 deletions
...ai/utils/memory_tracer/async_memtracer.py → ...lai/utils/memory_tracer/memory_monitor.py
@@ -1,103 +1,142 @@
Removed (the old async_memtracer.py):

from concurrent.futures import ThreadPoolExecutor
from time import sleep, time
import pickle

import torch

from colossalai.utils.memory import colo_device_memory_used
from colossalai.utils import get_current_device


class AsyncMemoryMonitor:
    """
    An Async Memory Monitor running during computing. It samples the memory usage of the current GPU
    at an interval of `1/(10**power)` sec.
    The idea comes from Runtime Memory Tracer of PatrickStar
    `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
    Usage::
        async_mem_monitor = AsyncMemoryMonitor()
        input = torch.randn(2, 20).cuda()
        OP1 = torch.nn.Linear(20, 30).cuda()
        OP2 = torch.nn.Linear(30, 40).cuda()
        async_mem_monitor.start()
        output = OP1(input)
        async_mem_monitor.finish()
        async_mem_monitor.start()
        output = OP2(output)
        async_mem_monitor.finish()
        async_mem_monitor.save('log.pkl')
    Args:
        power (int, optional): the power of the time interval. Defaults to 10.
    .. _PatrickStar\: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
        https://arxiv.org/abs/2108.05818
    """

    def __init__(self, power: int = 10):
        self.keep_measuring = False

        current_device = get_current_device()

        def _set_cuda_device():
            torch.cuda.set_device(current_device)

        self.executor = ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)
        self.monitor_thread = None
        self.interval = 1 / (10**power)
        self.time_stamps = []
        self.mem_stats = []

    def __len__(self):
        return len(self.mem_stats)

    def set_interval(self, power: int):
        self.clear()
        self.interval = 1 / (10**power)

    def is_measuring(self):
        return self.keep_measuring

    def start(self):
        self.keep_measuring = True
        self.monitor_thread = self.executor.submit(self._measure_usage)

    def finish(self):
        if self.keep_measuring is False:
            return 0
        self.keep_measuring = False
        max_usage = self.monitor_thread.result()
        self.monitor_thread = None
        self.time_stamps.append(time())
        self.mem_stats.append(max_usage)
        return max_usage

    def _measure_usage(self):
        max_usage = 0
        while self.keep_measuring:
            max_usage = max(
                max_usage,
                colo_device_memory_used(get_current_device()),
            )
            sleep(self.interval)
        return max_usage

    @property
    def state_dict(self):
        return {
            "time_stamps": self.time_stamps,
            "mem_stats": self.mem_stats,
        }

    def save(self, filename):
        with open(filename, "wb") as f:
            pickle.dump(self.state_dict(), f)

    def clear(self):
        self.mem_stats.clear()
        self.time_stamps.clear()
Added (the new memory_monitor.py):

from abc import abstractmethod
from concurrent.futures import ThreadPoolExecutor
from time import sleep, time
import json

import torch

from colossalai.utils.memory import colo_device_memory_used
from colossalai.utils import get_current_device


class MemoryMonitor:
    """Base class for all types of memory monitor.
    All monitors should have a list called `time_stamps` and a list called `mem_stats`.
    """

    def __init__(self):
        self.time_stamps = []
        self.mem_stats = []

    def __len__(self):
        return len(self.mem_stats)

    @abstractmethod
    def start(self):
        pass

    @abstractmethod
    def finish(self):
        pass

    def state_dict(self):
        return {
            "time_stamps": self.time_stamps,
            "mem_stats": self.mem_stats,
        }

    def save(self, filename):
        with open(filename, "w") as f:
            json.dump(self.state_dict(), f)

    def clear(self):
        self.mem_stats.clear()
        self.time_stamps.clear()


class AsyncMemoryMonitor(MemoryMonitor):
    """
    An Async Memory Monitor running during computing. It samples the memory usage of the current GPU
    at an interval of `1/(10**power)` sec.
    The idea comes from Runtime Memory Tracer of PatrickStar
    `PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management`_
    Usage::
        async_mem_monitor = AsyncMemoryMonitor()
        input = torch.randn(2, 20).cuda()
        OP1 = torch.nn.Linear(20, 30).cuda()
        OP2 = torch.nn.Linear(30, 40).cuda()
        async_mem_monitor.start()
        output = OP1(input)
        async_mem_monitor.finish()
        async_mem_monitor.start()
        output = OP2(output)
        async_mem_monitor.finish()
        async_mem_monitor.save('log.pkl')
    Args:
        power (int, optional): the power of the time interval. Defaults to 10.
    .. _PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management:
        https://arxiv.org/abs/2108.05818
    """

    def __init__(self, power: int = 10):
        super().__init__()
        self.keep_measuring = False

        current_device = get_current_device()

        def _set_cuda_device():
            torch.cuda.set_device(current_device)

        self.executor = ThreadPoolExecutor(max_workers=1, initializer=_set_cuda_device)
        self.monitor_thread = None
        self.interval = 1 / (10**power)

    def set_interval(self, power: int):
        self.clear()
        self.interval = 1 / (10**power)

    def is_measuring(self):
        return self.keep_measuring

    def start(self):
        self.keep_measuring = True
        self.monitor_thread = self.executor.submit(self._measure_usage)

    def finish(self):
        if self.keep_measuring is False:
            return 0

        self.keep_measuring = False
        max_usage = self.monitor_thread.result()

        self.monitor_thread = None
        self.time_stamps.append(time())
        self.mem_stats.append(max_usage)
        return max_usage

    def _measure_usage(self):
        max_usage = 0
        while self.keep_measuring:
            max_usage = max(
                max_usage,
                colo_device_memory_used(get_current_device()),
            )
            sleep(self.interval)
        return max_usage


class SyncCudaMemoryMonitor(MemoryMonitor):
    """
    A synchronized CUDA memory monitor.
    It only records the maximum allocated CUDA memory from the start point to the finish point.
    """

    def __init__(self, power: int = 10):
        super().__init__()

    def start(self):
        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

    def finish(self):
        torch.cuda.synchronize()
        self.time_stamps.append(time())
        max_usage = torch.cuda.max_memory_allocated()
        self.mem_stats.append(max_usage)
        return max_usage
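To show how the new class is meant to be driven, here is a minimal usage sketch. It is not part of the commit: the import path, the `layer`/`x` names, and the output filename are illustrative assumptions, and a CUDA-capable device is required.

import torch
from colossalai.utils.memory_tracer import SyncCudaMemoryMonitor    # assumed package path

monitor = SyncCudaMemoryMonitor()

x = torch.randn(2, 20).cuda()
layer = torch.nn.Linear(20, 30).cuda()

monitor.start()                  # torch.cuda.synchronize() + reset_peak_memory_stats()
y = layer(x)
peak_bytes = monitor.finish()    # synchronize again, then read torch.cuda.max_memory_allocated()

print(f"peak allocated: {peak_bytes / 1024**2:.2f} MiB")
monitor.save("mem_stats.json")   # base-class save() writes state_dict() as JSON

The design trade-off between the two monitors: SyncCudaMemoryMonitor reads the CUDA caching allocator's exact peak counter between explicit synchronization points, while AsyncMemoryMonitor samples colo_device_memory_used from a background thread at a 1/(10**power)-second interval, so very short-lived peaks between samples can be missed.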