-
Notifications
You must be signed in to change notification settings - Fork 3.4k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
ref: adding compute environments (1/n) (#3837)
* ref: adding compute environments (1/n) * ref: adding compute environments (1/n) * ref: adding compute environments (1/n)
- app/0.6.3
- app/0.6.2
- app/0.6.1
- app/0.6.0
- app/0.6.0rc0
- app/0.5.7
- app/0.5.6
- app/0.5.5
- app/0.5.4
- app/0.5.3
- app/0.5.2
- App/0.7.0
- 2.5.0
- 2.5.0rc0
- 2.5.0.post0
- 2.4.0
- 2.3.3
- 2.3.2
- 2.3.1
- 2.3.0
- 2.2.5
- 2.2.4
- 2.2.3
- 2.2.2
- 2.2.1
- 2.2.0
- 2.2.0.rc0
- 2.2.0.post0
- 2.1.4
- 2.1.3
- 2.1.2
- 2.1.1
- 2.1.0
- 2.1.0.rc1
- 2.1.0.rc0
- 2.0.9
- 2.0.9.post0
- 2.0.8
- 2.0.7
- 2.0.6
- 2.0.5
- 2.0.4
- 2.0.3
- 2.0.2
- 2.0.1
- 2.0.1.post0
- 2.0.0
- 2.0.0rc0
- 1.9.5
- 1.9.4
- 1.9.3
- 1.9.2
- 1.9.1
- 1.9.0
- 1.9.0rc0
- 1.8.6
- 1.8.5
- 1.8.5.post0
- 1.8.4
- 1.8.4.post0
- 1.8.3
- 1.8.3.post2
- 1.8.3.post1
- 1.8.3.post0
- 1.8.2
- 1.8.1
- 1.8.0
- 1.8.0rc2
- 1.8.0rc1
- 1.8.0rc0
- 1.8.0.post1
- 1.7.7
- 1.7.6
- 1.7.5
- 1.7.4
- 1.7.3
- 1.7.2
- 1.7.1
- 1.7.0
- 1.7.0rc1
- 1.7.0rc0
- 1.6.5
- 1.6.5.post0
- 1.6.4
- 1.6.3
- 1.6.2
- 1.6.1
- 1.6.0
- 1.6.0rc1
- 1.6.0rc0
- 1.5.10
- 1.5.10.post0
- 1.5.9
- 1.5.8
- 1.5.7
- 1.5.6
- 1.5.5
- 1.5.4
- 1.5.3
- 1.5.2
- 1.5.1
- 1.5.0
- 1.5.0rc1
- 1.5.0rc0
- 1.4.9
- 1.4.8
- 1.4.7
- 1.4.6
- 1.4.5
- 1.4.4
- 1.4.3
- 1.4.2
- 1.4.1
- 1.4.0
- 1.4.0rc2
- 1.4.0rc1
- 1.4.0rc0
- 1.3.8
- 1.3.7
- 1.3.7post0
- 1.3.6
- 1.3.5
- 1.3.4
- 1.3.3
- 1.3.2
- 1.3.1
- 1.3.0
- 1.3.0rc3
- 1.3.0rc2
- 1.3.0rc1
- 1.3.0rc0
- 1.2.10
- 1.2.9
- 1.2.8
- 1.2.7
- 1.2.6
- 1.2.5
- 1.2.4
- 1.2.3
- 1.2.2
- 1.2.1
- 1.2.0
- 1.2.0rc2
- 1.2.0rc1
- 1.2.0rc0
- 1.1.8
- 1.1.7
- 1.1.6
- 1.1.5
- 1.1.4
- 1.1.3
- 1.1.2
- 1.1.2rc1
- 1.1.1
- 1.1.0
- 1.1.0rc2
- 1.1.0rc1
- 1.0.8
- 1.0.7
- 1.0.6
- 1.0.5
- 1.0.4
- 1.0.3
- 1.0.2
- 1.0.1
- 1.0.0
- 0.10.0
1 parent
a3503ce
commit 093535d
Showing
5 changed files
with
117 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment | ||
from pytorch_lightning.cluster_environments.slurm_environment import SLURMEnvironment | ||
from pytorch_lightning.cluster_environments.torchelastic_environment import TorchElasticEnvironment |
13 changes: 13 additions & 0 deletions
13
pytorch_lightning/cluster_environments/cluster_environment.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
class ClusterEnvironment:
    """Base class describing how processes in a cluster find each other.

    The base implementation only remembers the world size handed to the
    constructor. ``master_address`` and ``master_port`` are placeholders
    that return ``None``; the concrete environments in this package
    override them.
    """

    def __init__(self, world_size):
        """Store ``world_size`` so that :meth:`world_size` can return it."""
        self._world_size = world_size

    def master_address(self):
        """Return the address of the master node (``None`` in the base class)."""
        return None

    def master_port(self):
        """Return the port of the master node (``None`` in the base class)."""
        return None

    def world_size(self):
        """Return the world size that was passed to the constructor."""
        return self._world_size
66 changes: 66 additions & 0 deletions
66
pytorch_lightning/cluster_environments/slurm_environment.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import os | ||
import re | ||
from pytorch_lightning import _logger as log | ||
from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment | ||
|
||
|
||
class SLURMEnvironment(ClusterEnvironment):
    """Cluster environment that derives its settings from SLURM variables.

    Reads ``SLURM_NODELIST`` and ``SLURM_JOB_ID`` from the process
    environment and exports ``MASTER_ADDR`` / ``MASTER_PORT`` as a side
    effect of the corresponding accessor methods.
    """

    def __init__(self, world_size):
        """Store ``world_size`` via the base class constructor."""
        super().__init__(world_size)

    def master_address(self):
        """Resolve the root node address, export it and return it.

        The first entry of ``SLURM_NODELIST`` is used. When the variable is
        missing or empty, ``"127.0.0.1"`` is used instead. The resolved
        value is written to ``os.environ["MASTER_ADDR"]``.

        Returns:
            The resolved root node address as a string.
        """
        # An unset or empty node list cannot name a host, so fall back to
        # localhost in both cases.
        node_list = os.environ.get("SLURM_NODELIST") or "127.0.0.1"
        root_node = node_list.split(" ")[0]

        root_node = self._resolve_root_node_address(root_node)
        os.environ["MASTER_ADDR"] = root_node
        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
        return root_node

    def master_port(self):
        """Return the master port as an ``int``, exporting it if needed.

        If ``MASTER_PORT`` is already set in the environment, that value
        wins. Otherwise the port is derived from the last four characters
        of ``SLURM_JOB_ID`` plus 15000, so that every process of the same
        job computes the same port. ``12910`` is used when the job id is
        missing or not numeric. The derived port is written to
        ``os.environ["MASTER_PORT"]``.

        Returns:
            The port number as an ``int``.

        Raises:
            ValueError: if a user-provided ``MASTER_PORT`` is not an integer.
        """
        try:
            # Use the last 4 characters of the job id; the offset keeps the
            # resulting port in the 10k+ range.
            default_port = int(os.environ["SLURM_JOB_ID"][-4:]) + 15000
        except (KeyError, ValueError):
            default_port = 12910

        if "MASTER_PORT" in os.environ:
            # The user passed a port in; convert it so the return type is
            # the same on both paths.
            default_port = int(os.environ["MASTER_PORT"])
        else:
            os.environ["MASTER_PORT"] = str(default_port)

        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

        return default_port

    def world_size(self):
        """Return the world size that was passed to the constructor."""
        return self._world_size

    def _resolve_root_node_address(self, root_node):
        """Reduce a SLURM hostlist expression to its first host name.

        Examples of the mapping performed here::

            "node7"            -> "node7"
            "node1,node2"      -> "node1"
            "node[3-5,9]"      -> "node3"
            "node[001-004],x2" -> "node001"

        Args:
            root_node: a host name or hostlist expression.

        Returns:
            The first host name; bracket ranges are collapsed to their
            first number, and anything after the bracket is dropped.
        """
        # Isolate the first comma-separated entry. A '[' in that entry means
        # the first comma (if any) lies inside a bracket range rather than
        # between hosts.
        first_entry = root_node.split(',', maxsplit=1)[0]
        if '[' not in first_entry:
            return first_entry

        name, numbers = root_node.split('[', maxsplit=1)
        number = numbers.split(',', maxsplit=1)[0]
        if '-' in number:
            number = number.split('-')[0]

        # Strip the closing bracket and any other non-digit residue.
        number = re.sub('[^0-9]', '', number)
        return name + number
34 changes: 34 additions & 0 deletions
34
pytorch_lightning/cluster_environments/torchelastic_environment.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
import os | ||
from pytorch_lightning import _logger as log | ||
from pytorch_lightning.utilities import rank_zero_warn | ||
from pytorch_lightning.cluster_environments.cluster_environment import ClusterEnvironment | ||
|
||
|
||
class TorchElasticEnvironment(ClusterEnvironment):
    """Cluster environment that reads its settings from environment variables.

    ``MASTER_ADDR``, ``MASTER_PORT`` and ``WORLD_SIZE`` are looked up in
    ``os.environ``. Missing address/port values are replaced with defaults
    (with a warning) and written back to the environment.
    """

    def __init__(self, world_size=None):
        """Store ``world_size`` via the base class constructor.

        The stored value is not consulted by :meth:`world_size` in this
        class, hence the argument is optional.
        """
        super().__init__(world_size)

    def master_address(self):
        """Return ``MASTER_ADDR``, defaulting it to ``"127.0.0.1"`` if unset.

        When the variable is missing a warning is issued and the default is
        written to ``os.environ["MASTER_ADDR"]``.
        """
        if "MASTER_ADDR" not in os.environ:
            rank_zero_warn(
                "MASTER_ADDR environment variable is not defined. Set as localhost"
            )
            os.environ["MASTER_ADDR"] = "127.0.0.1"
        log.debug(f"MASTER_ADDR: {os.environ['MASTER_ADDR']}")
        return os.environ["MASTER_ADDR"]

    def master_port(self):
        """Return ``MASTER_PORT`` as an ``int``, defaulting it to 12910 if unset.

        When the variable is missing a warning is issued and the default is
        written to ``os.environ["MASTER_PORT"]``.

        Raises:
            ValueError: if ``MASTER_PORT`` is set to a non-integer value.
        """
        if "MASTER_PORT" not in os.environ:
            rank_zero_warn(
                "MASTER_PORT environment variable is not defined. Set as 12910"
            )
            os.environ["MASTER_PORT"] = "12910"
        log.debug(f"MASTER_PORT: {os.environ['MASTER_PORT']}")

        # Environment values are strings; convert so the port has the same
        # type as the one SLURMEnvironment.master_port derives.
        return int(os.environ["MASTER_PORT"])

    def world_size(self):
        """Return ``WORLD_SIZE`` as an ``int``, or ``None`` when it is unset.

        Raises:
            ValueError: if ``WORLD_SIZE`` is set to a non-integer value.
        """
        world_size = os.environ.get('WORLD_SIZE')
        return int(world_size) if world_size is not None else None