Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[refactor] moving memtracer to gemini #801

Merged
merged 17 commits into from
Apr 19, 2022
Merged
Prev Previous commit
Next Next commit
polish
  • Loading branch information
feifeibear committed Apr 18, 2022
commit 731ab393949275ef2589dec716fa06abf852ae7c
15 changes: 9 additions & 6 deletions colossalai/trainer/hooks/_metric_hook.py
Original file line number Diff line number Diff line change
Expand Up @@ -324,15 +324,15 @@ class ThroughputMetric(Metric):
epoch_only (bool): Whether the metric is only read for the full epoch.
"""

def __init__(self, epoch_only: bool, ignored_steps: int = 0, tflops_per_step: int = 0):
def __init__(self, epoch_only: bool, ignored_steps: int = 0, tflop_per_step: int = 0):
super().__init__(epoch_only=epoch_only)
self.ignored_steps = ignored_steps
self.cur_steps = 0
self.accumulated_num_samples = torch.zeros(1, device=get_current_device())
self.accumulated_used_time = torch.zeros(1, device=get_current_device())
self.last_step_num_samples = torch.zeros(1, device=get_current_device())
self.last_step_used_time = torch.zeros(1, device=get_current_device())
self._tflops_per_step = tflops_per_step
self._tflop_per_step = tflop_per_step

def reset(self) -> None:
# self.cur_steps = 0
Expand All @@ -354,8 +354,8 @@ def get_last_step_value(self) -> str:
gpc.get_world_size(ParallelMode.DATA)
self.last_step_num_samples = all_reduce(self.last_step_num_samples, ParallelMode.DATA)
samplePerSec = _format_number(self.last_step_num_samples / (self.last_step_used_time + 1e-12).item())
if self._tflops_per_step > 0:
tflops = _format_number(self._tflops_per_step / (self.last_step_used_time.item() + 1e-12))
if self._tflop_per_step > 0:
tflops = _format_number(self._tflop_per_step / (self.last_step_used_time.item() + 1e-12))
return f"{samplePerSec} samplePerSec, {tflops} Tflops"
else:
return f"{samplePerSec} samplePerSec"
Expand All @@ -382,14 +382,17 @@ class ThroughputHook(MetricHook):
depend on the hooks order in the hook list.
"""

def __init__(self, ignored_steps: int = 0, priority: int = 10):
def __init__(self, ignored_steps: int = 0, priority: int = 10, tflop_per_step: int = 0):
super().__init__(priority)
self.ignored_steps = ignored_steps
self._tflop_per_step = tflop_per_step

def after_hook_is_attached(self, trainer):
self._check_metric_states_initialization(trainer)
if self._is_stage_to_compute:
self.metric = ThroughputMetric(epoch_only=True, ignored_steps=self.ignored_steps)
self.metric = ThroughputMetric(epoch_only=True,
ignored_steps=self.ignored_steps,
tflop_per_step=self._tflop_per_step)

# register the metric
trainer.states['metrics']['train']['Throughput'] = self.metric
Expand Down