Upgrade of fairscale (facebookresearch#231)
Summary:
Upgrading to the latest version of fairscale:

- [x] Upgraded PyTorch to 1.8.1, the minimal version needed for the latest fairscale
- [x] Fixed issues with FSDP parameter paths having changed, by making our code insensitive to them (see the sketch below)
- [x] Added a unit test to check that we can load checkpoints from previous fairscale versions
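For context, here is the core of the path normalization this commit introduces (mirroring `_clean_param_path` in `vissl/hooks/state_update_hooks.py` below). FSDP injects wrapper prefixes such as `_fsdp_wrapped_module.` and `_fpw_module.` into `named_parameters()`, and their nesting changed between fairscale versions:

```python
# Strip FSDP wrapper prefixes and normalize to the DDP-style "module." prefix,
# so config keys match regardless of the fairscale version in use.
def clean_param_path(param_name: str) -> str:
    for wrapper_prefix in ("_fsdp_wrapped_module.", "_fpw_module."):
        param_name = param_name.replace(wrapper_prefix, "")
    if not param_name.startswith("module."):
        param_name = f"module.{param_name}"
    return param_name

# Old and new fairscale naming schemes map to the same key:
old = "_fsdp_wrapped_module.heads.0._fsdp_wrapped_module.prototypes0._fsdp_wrapped_module.weight"
new = "_fsdp_wrapped_module.heads.0._fsdp_wrapped_module._fpw_module.prototypes0._fsdp_wrapped_module.weight"
assert clean_param_path(old) == clean_param_path(new) == "module.heads.0.prototypes0.weight"
```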

X-link: fairinternal/ssl_scaling#231

Reviewed By: iseessel

Differential Revision: D34730366

Pulled By: QuentinDuval

fbshipit-source-id: 44759cc388fe7fc1eeca4f570a31ee3ad78e5fe5
QuentinDuval authored and facebook-github-bot committed Mar 15, 2022
1 parent cfbad8e commit c5068d7
Showing 8 changed files with 105 additions and 31 deletions.
5 changes: 3 additions & 2 deletions .circleci/config.yml
@@ -60,7 +60,7 @@ install_fairscale: &install_fairscale
working_directory: ~/
command: |
pip uninstall -y fairscale
-pip install fairscale@https://github.com/facebookresearch/fairscale/tarball/df7db85cef7f9c30a5b821007754b96eb1f977b6
+pip install fairscale==0.4.6
install_classy_vision: &install_classy_vision
@@ -112,7 +112,8 @@ install_vissl_dep: &install_vissl_dep
name: Install Dependencies
working_directory: ~/vissl
command: |
-pip install --progress-bar off torch==1.7.1 torchvision==0.8.2 opencv-python==3.4.2.17
+pip install --progress-bar off torch==1.8.1+cu102 torchvision==0.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html
+pip install --progress-bar off opencv-python==3.4.2.17
pip install --progress-bar off -r requirements.txt
# Update this since classy_vision seems to need it.
pip install --progress-bar off --upgrade iopath
18 changes: 9 additions & 9 deletions INSTALL.md
@@ -33,14 +33,14 @@ The following instructions assume that you have desired CUDA version installed a
If you don't have anaconda, [run this bash script to install conda](https://github.com/facebookresearch/vissl/blob/main/docker/common/install_conda.sh).

```bash
-conda create -n vissl_env python=3.7
+conda create -n vissl_env python=3.8
source activate vissl_env
```

#### Step 2: Install PyTorch (conda)

```bash
-conda install pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=10.2 -c pytorch
+conda install pytorch==1.8.1 torchvision==0.9.1 cudatoolkit=10.2 -c pytorch
```
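After installing, a quick sanity check that the right build was picked up (standard PyTorch API):

```python
import torch

print(torch.__version__)           # expect 1.8.1
print(torch.version.cuda)          # expect 10.2
print(torch.cuda.is_available())   # True on a machine with a working GPU setup
```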

#### Step 3: Install APEX (conda)
@@ -67,7 +67,7 @@ pip uninstall -y classy_vision
pip install classy-vision@https://github.com/facebookresearch/ClassyVision/tarball/4785d5ee19d3bcedd5b28c1eb51ea1f59188b54d
# update fairscale install to commit stable for vissl.
pip uninstall -y fairscale
-pip install fairscale@https://github.com/facebookresearch/fairscale/tarball/df7db85cef7f9c30a5b821007754b96eb1f977b6
+pip install fairscale==0.4.6
# install vissl dev mode (e stands for editable)
pip install -e ".[dev]"
# verify installation
@@ -85,13 +85,13 @@ python3 -m venv ~/venv
#### Step 2: Install PyTorch (pip)

```bash
-pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+pip install torch==1.8.1+cu102 torchvision==0.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html
```

#### Step 3: Install APEX (pip)

```bash
-pip install -f https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/py37_cu101_pyt171/download.html apex
+pip install -f https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/py38_cu102_pyt181/download.html apex
```

#### Step 4: Install VISSL
@@ -106,7 +106,7 @@ This assumes you have CUDA 10.2.
```bash
conda create -n vissl python=3.8
conda activate vissl
-conda install -c pytorch pytorch=1.7.1 torchvision cudatoolkit=10.2
+conda install pytorch==1.8.1 torchvision==0.9.1 cudatoolkit=10.2 -c pytorch
conda install -c vissl -c iopath -c conda-forge -c pytorch -c defaults apex vissl
```

@@ -127,14 +127,14 @@ python3 -m venv ~/venv

#### Step 2: Install PyTorch, OpenCV and APEX (pip)

-- We use PyTorch=1.5.1 with CUDA 10.1 in the following instruction (user can chose their desired version).
+- We use PyTorch=1.8.1 with CUDA 10.2 in the following instructions (you can choose your desired version).
- There are several ways to install opencv; one possibility is shown below.
- For APEX, we provide a pre-built binary that includes APEX's optimized C++/CUDA extensions.

```bash
-pip install torch==1.5.1+cu101 torchvision==0.6.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+pip install torch==1.8.1+cu102 torchvision==0.9.1+cu102 -f https://download.pytorch.org/whl/torch_stable.html
pip install opencv-python
-pip install -f https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/py38_cu101_pyt151/download.html apex
+pip install -f https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/py38_cu102_pyt181/download.html apex
```

Note that, for the APEX install, you need to get the versions of CUDA, PyTorch, and Python right in the URL. We provide APEX wheels for all possible combinations of Python, PyTorch, and CUDA; select the right wheel if you desire a different combination.
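The wheel directory name encodes those versions. A small illustration (the helper is hypothetical, not part of VISSL; the URL pattern matches the one above):

```python
# Hypothetical helper: build the APEX wheel index URL from the
# Python/CUDA/PyTorch versions, following the py38_cu102_pyt181 pattern.
def apex_wheel_index(python: str, cuda: str, pytorch: str) -> str:
    combo = "py{}_cu{}_pyt{}".format(
        python.replace(".", ""), cuda.replace(".", ""), pytorch.replace(".", "")
    )
    return f"https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/{combo}/download.html"

assert apex_wheel_index("3.8", "10.2", "1.8.1") == (
    "https://dl.fbaipublicfiles.com/vissl/packaging/apexwheels/py38_cu102_pyt181/download.html"
)
```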
5 changes: 0 additions & 5 deletions configs/config/pretrain/swav/swav_8node_resnet.yaml
@@ -55,11 +55,6 @@ config:
]
TEMP_FROZEN_PARAMS_ITER_MAP: [
['module.heads.0.prototypes0.weight', 313],
-# TODO (Min): FSDP need to return the original param name from named_parameters().
-# Configuration for flatten_parameters = True
-['_fsdp_wrapped_module.heads.0._fsdp_wrapped_module._fpw_module.prototypes0._fsdp_wrapped_module.weight', 313],
-# Configuration for flatten_parameters = False
-['_fsdp_wrapped_module.heads.0._fsdp_wrapped_module.prototypes0._fsdp_wrapped_module.weight', 313]
]
SYNC_BN_CONFIG:
CONVERT_BN_TO_SYNC_BN: True
2 changes: 0 additions & 2 deletions configs/config/test/integration_test/quick_swav_2crops.yaml
@@ -57,8 +57,6 @@ config:
]
TEMP_FROZEN_PARAMS_ITER_MAP: [
['module.heads.0.prototypes0.weight', 313],
-# TODO (Min): FSDP need to return the original param name from named_parameters().
-['_fsdp_wrapped_module.heads.0._fsdp_wrapped_module._fpw_module.prototypes0._fsdp_wrapped_module.weight', 313]
]
SYNC_BN_CONFIG:
CONVERT_BN_TO_SYNC_BN: True
28 changes: 22 additions & 6 deletions tests/test_layer_memory_tracking.py
@@ -139,12 +139,28 @@ def _layer_memory_tracking_worker(gpu_id: int, sync_file: str, world_size: int):
if t.all_gathered > 0
]
assert all_gathered_traces == [
("_fsdp_wrapped_module.0", 440, 440),
("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 880),
("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440, 880),
("_fsdp_wrapped_module.4._fsdp_wrapped_module._fpw_module", 440, 0),
("_fsdp_wrapped_module.2._fsdp_wrapped_module", 440, 0),
]
("_fsdp_wrapped_module._fpw_module.0", 440, 440),
(
"_fsdp_wrapped_module._fpw_module.2._fsdp_wrapped_module._fpw_module",
440,
880,
),
(
"_fsdp_wrapped_module._fpw_module.4._fsdp_wrapped_module._fpw_module",
440,
880,
),
(
"_fsdp_wrapped_module._fpw_module.4._fsdp_wrapped_module._fpw_module",
440,
0,
),
(
"_fsdp_wrapped_module._fpw_module.2._fsdp_wrapped_module._fpw_module",
440,
0,
),
], f"Expected {all_gathered_traces}"

@gpu_test(gpu_count=2)
def test_memory_tracking_fsdp(self):
55 changes: 55 additions & 0 deletions tests/test_regnet_fsdp_10b.py
@@ -14,6 +14,13 @@


class TestRegnet10B(unittest.TestCase):
"""
Integrations tests that should be run on 8 GPUs nodes.
Tests that the RegNet10B trained for SEER still works:
https://arxiv.org/abs/2202.08360
"""

@staticmethod
def _create_10B_pretrain_config(num_gpus: int, num_steps: int, batch_size: int):
data_limit = num_steps * batch_size * num_gpus
@@ -50,3 +57,51 @@ def test_regnet_10b_swav_pretraining(self):
losses = results.get_losses()
print(losses)
self.assertEqual(len(losses), 2)

@staticmethod
def _create_10B_evaluation_config(
num_gpus: int, num_steps: int, batch_size: int, path_to_sliced_checkpoint: str
):
data_limit = num_steps * batch_size * num_gpus
cfg = compose_hydra_configuration(
[
"config=benchmark/linear_image_classification/clevr_count/eval_resnet_8gpu_transfer_clevr_count_linear",
"+config/benchmark/linear_image_classification/clevr_count/models=regnet10B",
f"config.MODEL.WEIGHTS_INIT.PARAMS_FILE={path_to_sliced_checkpoint}",
"config.MODEL.AMP_PARAMS.USE_AMP=True",
"config.MODEL.AMP_PARAMS.AMP_TYPE=pytorch",
"config.OPTIMIZER.num_epochs=1",
"config.LOG_FREQUENCY=1",
# Testing on fake images
"config.DATA.TRAIN.DATA_SOURCES=[synthetic]",
"config.DATA.TRAIN.LABEL_SOURCES=[synthetic]",
"config.DATA.TRAIN.RANDOM_SYNTHETIC_IMAGES=True",
"config.DATA.TRAIN.USE_DEBUGGING_SAMPLER=True",
"config.DATA.TEST.DATA_SOURCES=[synthetic]",
"config.DATA.TEST.LABEL_SOURCES=[synthetic]",
"config.DATA.TEST.RANDOM_SYNTHETIC_IMAGES=True",
"config.DATA.TEST.USE_DEBUGGING_SAMPLER=True",
# Disable overlapping communication and computation for the test
"config.MODEL.FSDP_CONFIG.FORCE_SYNC_CUDA=True",
# Testing on 8 V100 32GB GPUs only
f"config.DATA.TRAIN.BATCHSIZE_PER_REPLICA={batch_size}",
f"config.DATA.TRAIN.DATA_LIMIT={data_limit}",
"config.DISTRIBUTED.NUM_NODES=1",
f"config.DISTRIBUTED.NUM_PROC_PER_NODE={num_gpus}",
"config.DISTRIBUTED.RUN_ID=auto",
]
)
args, config = convert_to_attrdict(cfg)
return config

@gpu_test(gpu_count=8)
def test_regnet_10b_evaluation(self):
with in_temporary_directory():
cp_path = "/checkpoint/qduval/vissl/seer/regnet10B_sliced/model_iteration124500_sliced.torch"
config = self._create_10B_evaluation_config(
num_gpus=8, num_steps=2, batch_size=4, path_to_sliced_checkpoint=cp_path
)
results = run_integration_test(config)
losses = results.get_losses()
print(losses)
self.assertGreater(len(losses), 0)
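For the evaluation test above, the synthetic data sizing works out as follows (worked directly from the parameters passed in the call):

```python
num_gpus, num_steps, batch_size = 8, 2, 4
data_limit = num_steps * batch_size * num_gpus  # 64 synthetic images in total
assert data_limit == 64
# Each GPU consumes batch_size images per step, so one epoch over the
# limited dataset yields exactly num_steps training steps.
assert data_limit // (batch_size * num_gpus) == num_steps
```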
19 changes: 13 additions & 6 deletions vissl/hooks/state_update_hooks.py
@@ -247,21 +247,28 @@ def on_backward(self, task: "tasks.ClassyTask") -> None:
)
return

-world_size = (
-task.config.DISTRIBUTED.NUM_NODES
-* task.config.DISTRIBUTED.NUM_PROC_PER_NODE
-)
-match_param_prefix = "module." if world_size == 1 else ""
num_matched_named_params = 0
for name, p in task.model.named_parameters():
-match_param_name = f"{match_param_prefix}{name}"
+match_param_name = self._clean_param_path(name)
if (
match_param_name in map_params_to_iters
) and task.iteration < map_params_to_iters[match_param_name]:
num_matched_named_params += 1
p.grad = None

# TODO (Min): we need to check the exact target number.
assert num_matched_named_params > 0, (
f"Didn't find expected number of layers: "
f"{num_matched_named_params} vs. {len(map_params_to_iters)}"
)

@staticmethod
def _clean_param_path(param_name: str) -> str:
# Remove FSDP path artifacts
paths_to_remove = ["_fsdp_wrapped_module.", "_fpw_module."]
for path_to_remove in paths_to_remove:
param_name = param_name.replace(path_to_remove, "")
# Add the "module." (DDP) prefix if missing
if not param_name.startswith("module."):
param_name = f"module.{param_name}"
return param_name
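For context, a minimal standalone sketch (hypothetical training loop, not VISSL code) of the freezing mechanism the hook above implements: clearing `.grad` before `optimizer.step()` leaves a parameter untouched until its configured iteration:

```python
import torch

model = torch.nn.Linear(4, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
# Mirrors TEMP_FROZEN_PARAMS_ITER_MAP: keep "weight" frozen for 3 iterations.
frozen_until_iter = {"module.weight": 3}

for iteration in range(5):
    optimizer.zero_grad()
    loss = model(torch.randn(2, 4)).sum()
    loss.backward()
    for name, p in model.named_parameters():
        clean_name = f"module.{name}"  # DDP-style prefix, as in _clean_param_path
        if iteration < frozen_until_iter.get(clean_name, 0):
            p.grad = None  # SGD skips parameters whose grad is None
    optimizer.step()
```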
4 changes: 3 additions & 1 deletion vissl/utils/fsdp_utils.py
@@ -106,7 +106,9 @@ class _BigConvAutoWrapPolicy:
def __init__(self, threshold: int):
self.threshold = threshold

-def __call__(self, module: nn.Module, recurse: bool, unwrapped_params: int):
+def __call__(
+    self, module: nn.Module, recurse: bool, unwrapped_params: int, **kwargs
+):
is_large = unwrapped_params >= self.threshold
force_leaf_modules = default_auto_wrap_policy.FORCE_LEAF_MODULES
if recurse:
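The added `**kwargs` is defensive: a newer fairscale release may pass extra keyword arguments to auto-wrap policies, and absorbing them avoids a `TypeError`. A standalone sketch of the pattern (generic Python, not fairscale code; the extra argument name is hypothetical):

```python
# A policy callable that accepts **kwargs tolerates arguments added by newer callers.
class Policy:
    def __call__(self, module, recurse: bool, unwrapped_params: int, **kwargs) -> bool:
        return unwrapped_params >= 100

policy = Policy()
policy(module=None, recurse=True, unwrapped_params=50)                 # older caller
policy(module=None, recurse=True, unwrapped_params=50, new_flag=True)  # newer caller, no TypeError
```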
