Auto infer optimizer when using zero and convergence test of zero2 + pytorch amp + activation checkpointing (facebookresearch#178)

Summary:
Pull Request resolved: facebookresearch#178

Auto-infers the ZeRO optimizer settings: when ZeRO is enabled, the user-specified optimizer options are moved under "base_optimizer" and the optimizer name is set to "zero".

Users now just need to set `use_zero=True`.
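
For illustration, a minimal sketch of what this inference step does to the optimizer config, using a plain Python dict in place of VISSL's AttrDict. The key names mirror the hydra_config.py diff below; the concrete values are made up for the example.

```python
# Sketch of the ZeRO auto-inference, with a plain dict standing in for
# VISSL's AttrDict. Key names mirror the diff below; values are illustrative.
optimizer_cfg = {
    "name": "sgd",
    "use_zero": True,            # the only flag users now have to set
    "weight_decay": 1e-4,        # hypothetical optimizer-specific setting
    "param_schedulers": {},      # schedule/regularization settings stay top-level
    "regularize_bn": False,
    "regularize_bias": True,
    "num_epochs": 100,
    "head_optimizer_params": {},
}

if optimizer_cfg["use_zero"]:
    # Copy everything under "base_optimizer" before renaming the wrapper ...
    base = dict(optimizer_cfg)
    # ... then drop the keys that the top-level "zero" wrapper keeps for itself.
    for key in (
        "param_schedulers",
        "regularize_bn",
        "regularize_bias",
        "num_epochs",
        "use_zero",
        "head_optimizer_params",
    ):
        base.pop(key, None)
    optimizer_cfg["base_optimizer"] = base
    optimizer_cfg["name"] = "zero"

# Result: optimizer_cfg["name"] == "zero" and
#         optimizer_cfg["base_optimizer"]["name"] == "sgd"
```

With this in place, enabling optimizer state sharding is a one-flag change (e.g. something like a `config.OPTIMIZER.use_zero=True` override), and the base SGD settings remain visible under `base_optimizer`.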

Reviewed By: min-xu-ai

Differential Revision: D26236748

fbshipit-source-id: 326db1040a457391735632071fe718d6c6071947
prigoyal authored and facebook-github-bot committed Feb 9, 2021
1 parent 95b4857 commit 9b2349d
Showing 3 changed files with 21 additions and 7 deletions.
4 changes: 3 additions & 1 deletion vissl/config/defaults.yaml
@@ -537,7 +537,7 @@ config:
num_prototypes: [3000] # automatically inferred from model HEAD settings
temp_hard_assignment_iters: 0
# for dumping the debugging info in case loss becomes NaN
- output_dir: "" # automatically inferred and set to checkpoint dir
+ output_dir: "." # automatically inferred and set to checkpoint dir
queue:
queue_length: 0 # automatically adjusted to ensure queue_length % global batch size = 0
start_iter: 0
@@ -593,6 +593,8 @@ config:
# ----------------------------------------------------------------------------------- #
OPTIMIZER:
name: "sgd"
+ # whether to shard optimizer state as per ZeRO https://arxiv.org/abs/1910.02054
+ use_zero: False
use_larc: False # supported for SGD only for now
larc_config:
clip: False
2 changes: 1 addition & 1 deletion vissl/utils/checkpoint.py
@@ -45,7 +45,7 @@ def get_checkpoint_folder(config: AttrDict):
makedir(odir)
assert PathManager.exists(
config.CHECKPOINT.DIR
- ), "Please specify config.CHECKPOINT.DIR parameter. It should not be None."
+ ), f"Please specify config.CHECKPOINT.DIR parameter. Invalid: {config.CHECKPOINT.DIR}"
return odir


22 changes: 17 additions & 5 deletions vissl/utils/hydra_config.py
@@ -189,7 +189,7 @@ def resolve_linear_schedule(cfg, param_schedulers):

def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
"""
- Scale learning rate value for different Learning rate types. See assert_learning_rate()
+ Scale learning rate value for different Learning rate types. See infer_learning_rate()
for how the scaled LR is calculated.
Values changed for learning rate schedules:
@@ -258,7 +258,7 @@ def get_scaled_lr_scheduler(cfg, param_schedulers, scaled_lr):
return param_schedulers


- def assert_learning_rate(cfg):
+ def infer_learning_rate(cfg):
"""
1) Assert the Learning rate here. LR is scaled as per https://arxiv.org/abs/1706.02677.
to turn this automatic scaling off,
@@ -325,7 +325,7 @@ def assert_learning_rate(cfg):
return cfg


- def assert_losses(cfg):
+ def infer_losses_config(cfg):
"""
Infer settings for various self-supervised losses. Takes care of setting various loss
parameters correctly like world size, batch size per gpu, effective global batch size,
@@ -441,8 +441,8 @@ def assert_hydra_conf(cfg):
LABEL_TYPE to "standard" (also vissl default), otherwise if no label is specified, we
set the LABEL_TYPE to "sample_index".
"""
- cfg = assert_losses(cfg)
- cfg = assert_learning_rate(cfg)
+ cfg = infer_losses_config(cfg)
+ cfg = infer_learning_rate(cfg)

# in case of linear evaluation, we often evaluate several layers at a time. For each
# layer, there's a separate accuracy meter. In such case, we want to output the layer
@@ -499,3 +499,15 @@ def assert_hydra_conf(cfg):
url=cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE, cache_dir=cache_dir
)
cfg.MODEL.WEIGHTS_INIT.PARAMS_FILE = cached_url_path

+ # if we use a zero optimizer, we nest the optimizer related settings under the
+ # base_optimizer.
+ if cfg.OPTIMIZER.use_zero:
+     cfg.OPTIMIZER["base_optimizer"] = cfg.OPTIMIZER.copy()
+     cfg.OPTIMIZER.name = "zero"
+     del cfg.OPTIMIZER.base_optimizer["param_schedulers"]
+     del cfg.OPTIMIZER.base_optimizer["regularize_bn"]
+     del cfg.OPTIMIZER.base_optimizer["regularize_bias"]
+     del cfg.OPTIMIZER.base_optimizer["num_epochs"]
+     del cfg.OPTIMIZER.base_optimizer["use_zero"]
+     del cfg.OPTIMIZER.base_optimizer["head_optimizer_params"]
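
For context, a hedged sketch of how the nested `base_optimizer` settings could be consumed to build a ZeRO-sharded optimizer. fairscale's `OSS` wrapper is used here only as a stand-in for whatever VISSL actually instantiates for the "zero" optimizer; the helper below and every config key other than `name` and `base_optimizer` are assumptions.

```python
# Illustrative only: wrap a regular torch optimizer with fairscale's OSS so the
# optimizer state is sharded across ranks (ZeRO, https://arxiv.org/abs/1910.02054).
# Requires torch.distributed to be initialized; this is not VISSL's actual wiring.
import torch
from fairscale.optim import OSS


def build_zero_optimizer(model: torch.nn.Module, optimizer_cfg: dict) -> OSS:
    assert optimizer_cfg["name"] == "zero"
    base = optimizer_cfg["base_optimizer"]   # e.g. {"name": "sgd", "momentum": 0.9, ...}
    return OSS(
        params=model.parameters(),
        optim=torch.optim.SGD,                        # the wrapped per-rank base optimizer
        lr=0.1,                                       # in VISSL the LR is driven by the
                                                      # top-level param_schedulers
        momentum=base.get("momentum", 0.9),           # hypothetical keys, shown only to
        weight_decay=base.get("weight_decay", 1e-4),  # illustrate the nesting
    )
```

The deletions in the diff above are what keep this split clean: schedule-level settings (`param_schedulers`, `num_epochs`, the regularization flags, `head_optimizer_params`) stay on the top-level "zero" entry, while `base_optimizer` holds only the hyperparameters of the wrapped optimizer.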
