Skip to content

Commit

Permalink
Fixed port-in-use issue in distributed training
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Fan committed Jan 5, 2025
1 parent 76ff2ef commit bc6642b
Showing 1 changed file with 27 additions and 3 deletions.
30 changes: 27 additions & 3 deletions examples/machine_translation_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import numpy as np
import matplotlib.pyplot as plt
import math
import socket

from datetime import datetime
from torch.utils.data import DataLoader
Expand All @@ -23,6 +24,8 @@

import os
import warnings
import signal
import sys

# Suppress all Python warnings (the filter below is not limited to PyTorch)
warnings.filterwarnings("ignore")
Expand Down Expand Up @@ -107,18 +110,30 @@ def __len__(self):
return len(self.encodings_en["input_ids"])


def find_free_port() -> int:
    """Return a TCP port number that was free at the time of the call.

    A temporary socket is bound to port 0 so the operating system picks an
    unused port; that port number is read back and the socket is closed.

    Note:
        The port is released before this function returns, so another
        process may claim it before the caller binds it. Treat the result
        as a best-effort suggestion, not a reservation.

    Returns:
        The port number chosen by the operating system.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        # Port 0 asks the OS to assign any currently unused port.
        s.bind(('', 0))
        return s.getsockname()[1]


def setup_distributed():
    """Initialize the NCCL process group from environment variables.

    Reads ``RANK``, ``WORLD_SIZE`` and ``LOCAL_RANK`` from the environment,
    initializes the default process group with the ``env://`` init method,
    and selects the CUDA device matching ``LOCAL_RANK``.

    If ``MASTER_PORT`` is not set, a free port is chosen automatically, but
    only for single-process runs. With more than one process every rank
    would pick a *different* random port and the ranks could never
    rendezvous, so that case is reported as an error instead.

    Raises:
        KeyError: If ``RANK``, ``WORLD_SIZE`` or ``LOCAL_RANK`` is missing.
        RuntimeError: If ``MASTER_PORT`` is unset and ``WORLD_SIZE`` > 1.
    """
    rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])
    local_rank = int(os.environ["LOCAL_RANK"])

    # Only set MASTER_PORT if not already set
    if 'MASTER_PORT' not in os.environ:
        if world_size > 1:
            # Each rank runs this function independently; a per-process
            # random port cannot be agreed upon by the other ranks.
            raise RuntimeError(
                "MASTER_PORT is not set. It must be provided by the launcher "
                "(the same value on every rank) when WORLD_SIZE > 1."
            )
        os.environ['MASTER_PORT'] = str(find_free_port())

    dist.init_process_group(
        backend="nccl",
        init_method="env://",
        rank=rank,
        world_size=world_size,
    )
    torch.cuda.set_device(local_rank)

    print(f"Initialized process {rank} out of {world_size} on device {local_rank} with MASTER_PORT={os.environ['MASTER_PORT']}")


def cleanup_distributed():
Expand Down Expand Up @@ -261,6 +276,15 @@ def generate_text(
return tgt_ids


def signal_handler(sig, frame):
    """Tear down the distributed process group and exit on a signal.

    Args:
        sig: Number of the signal being handled (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        SystemExit: Always, with exit status 0.
    """
    print('Terminating...')
    # The signal may arrive before the process group has been initialized or
    # after it has already been destroyed; destroying it unconditionally
    # would raise inside the handler instead of exiting.
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()
    # NOTE(review): exit status 0 reports success to the launcher even though
    # the run was interrupted -- confirm this is intended.
    sys.exit(0)

# Install signal_handler for both SIGINT and SIGTERM. These calls run at
# module import time, before main() is defined or invoked.
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)


@record
def main():

Expand Down

0 comments on commit bc6642b

Please sign in to comment.