Auto infer optimizer when using zero and convergence test of zero2 + pytorch amp + activation checkpointing (facebookresearch#178)

Summary:
Pull Request resolved: facebookresearch#178

Auto-infers the ZeRO optimizer settings: when ZeRO is enabled, the user-specified optimizer options are moved under "base_optimizer" and the optimizer name is set to "zero".

Users now just need to set `use_zero=True`.
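
For illustration, a minimal sketch of what this inference step does to the optimizer config, using a plain Python dict in place of VISSL's AttrDict. The key names mirror the hydra_config.py diff below; the concrete values are made up for the example.

```python
# Sketch of the ZeRO auto-inference, with a plain dict standing in for
# VISSL's AttrDict. Key names mirror the diff below; values are illustrative.
optimizer_cfg = {
    "name": "sgd",
    "use_zero": True,            # the only flag users now have to set
    "weight_decay": 1e-4,        # hypothetical optimizer-specific setting
    "param_schedulers": {},      # schedule/regularization settings stay top-level
    "regularize_bn": False,
    "regularize_bias": True,
    "num_epochs": 100,
    "head_optimizer_params": {},
}

if optimizer_cfg["use_zero"]:
    # Copy everything under "base_optimizer" before renaming the wrapper ...
    base = dict(optimizer_cfg)
    # ... then drop the keys that the top-level "zero" wrapper keeps for itself.
    for key in (
        "param_schedulers",
        "regularize_bn",
        "regularize_bias",
        "num_epochs",
        "use_zero",
        "head_optimizer_params",
    ):
        base.pop(key, None)
    optimizer_cfg["base_optimizer"] = base
    optimizer_cfg["name"] = "zero"

# Result: optimizer_cfg["name"] == "zero" and
#         optimizer_cfg["base_optimizer"]["name"] == "sgd"
```

With this in place, enabling optimizer state sharding is a one-flag change (e.g. something like a `config.OPTIMIZER.use_zero=True` override), and the base SGD settings remain visible under `base_optimizer`.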

Reviewed By: min-xu-ai

Differential Revision: D26236748

fbshipit-source-id: 326db1040a457391735632071fe718d6c6071947
prigoyal authored and facebook-github-bot committed Feb 9, 2021
1 parent 95b4857 commit 9b2349d
Showing 3 changed files with 21 additions and 7 deletions.
4 changes: 3 additions & 1 deletion vissl/config/defaults.yaml
@@ -537,7 +537,7 @@ config:
num_prototypes: [3000] # automatically inferred from model HEAD settings
temp_hard_assignment_iters: 0
# for dumping the debugging info in case loss becomes NaN
- output_dir: "" # automatically inferred and set to checkpoint dir
+ output_dir: "." # automatically inferred and set to checkpoint dir
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
@@ -593,6 +593,8 @@ config:
# ----------------------------------------------------------------------------------- #
OPTIMIZER:
name: "sgd"
+ # whether to shard optimizer state as per ZeRO https://arxiv.org/abs/1910.02054
+ use_zero: False
use_larc: False # supported for SGD only for now
larc_config:
clip: False
2 changes: 1 addition & 1 deletion vissl/utils/checkpoint.py
@@ -45,7 +45,7 @@ def get_checkpoint_folder(config: AttrDict):
makedir(odir)
assert PathManager.exists(
config.CHECKPOINT.DIR
- ), "Please specify config.CHECKPOINT.DIR parameter. It should not be None."
+ ), f"Please specify config.CHECKPOINT.DIR parameter. Invalid: {config.CHECKPOINT.DIR}"
return odir


22 changes: 17 additions & 5 deletions vissl/utils/hydra_config.py
@@ -189,7 +189,7 @@ def resolve_linear_schedule(cfg, param_schedulers):

def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
"""
- Scale learning rate value for different Learning rate types. See assert_learning_rate()
+ Scale learning rate value for different Learning rate types. See infer_learning_rate()
for how the scaled LR is calculated.
Values changed for learning rate schedules:
@@ -258,7 +258,7 @@ def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
return param_schedulers


- def assert_learning_rate(cfg):
+ def infer_learning_rate(cfg):
"""
1) Assert the Learning rate here. LR is scaled as per https://arxiv.org/abs/1706.02677.
to turn this automatic scaling off,
@@ -325,7 +325,7 @@ def assert_learning_rate(cfg):
return cfg


- def assert_losses(cfg):
+ def infer_losses_config(cfg):
"""
Infer settings for various self-supervised losses. Takes care of setting various loss
parameters correctly like world size, batch size per gpu, effective global batch size,
@@ -441,8 +441,8 @@ def assert_hydra_conf(cfg):
LABEL_TYPE to "standard" (also vissl default), otherwise if no label is specified, we
set the LABEL_TYPE to "sample_index".
"""
- cfg = assert_losses(cfg)
- cfg = assert_learning_rate(cfg)
+ cfg = infer_losses_config(cfg)
+ cfg = infer_learning_rate(cfg)

# in case of linear evaluation, we often evaluate several layers at a time. For each
# layer, there's a separate accuracy meter. In such case, we want to output the layer
@@ -499,3 +499,15 @@ def assert_hydra_conf(cfg):
url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
)
cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

+ # if we use a zero optimizer, we nest the optimizer related settings under the
+ # base_optimizer.
+ if cfg.OPTIMIZER.use_zero:
+     cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
+     cfg.OPTIMIZER.name = "zero"
+     del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
+     del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
+     del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
+     del cfg.OPTIMIZER.base_optimizer["num_epochs"]
+     del cfg.OPTIMIZER.base_optimizer["use_zero"]
+     del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]
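
For context, a hedged sketch of how the nested `base_optimizer` settings could be consumed to build a ZeRO-sharded optimizer. fairscale's `OSS` wrapper is used here only as a stand-in for whatever VISSL actually instantiates for the "zero" optimizer; the helper below and every config key other than `name` and `base_optimizer` are assumptions.

```python
# Illustrative only: wrap a regular torch optimizer with fairscale's OSS so the
# optimizer state is sharded across ranks (ZeRO, https://arxiv.org/abs/1910.02054).
# Requires torch.distributed to be initialized; this is not VISSL's actual wiring.
import torch
from fairscale.optim import OSS


def build_zero_optimizer(model: torch.nn.Module, optimizer_cfg: dict) -> OSS:
    assert optimizer_cfg["name"] == "zero"
    base = optimizer_cfg["base_optimizer"]   # e.g. {"name": "sgd", "momentum": 0.9, ...}
    return OSS(
        params=model.parameters(),
        optim=torch.optim.SGD,                        # the wrapped per-rank base optimizer
        lr=0.1,                                       # in VISSL the LR is driven by the
                                                      # top-level param_schedulers
        momentum=base.get("momentum", 0.9),           # hypothetical keys, shown only to
        weight_decay=base.get("weight_decay", 1e-4),  # illustrate the nesting
    )
```

The deletions in the diff above are what keep this split clean: schedule-level settings (`param_schedulers`, `num_epochs`, the regularization flags, `head_optimizer_params`) stay on the top-level "zero" entry, while `base_optimizer` holds only the hyperparameters of the wrapped optimizer.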
