Rename DeepSpeedPlugin to DeepSpeedStrategy (Lightning-AI#11194)
kaushikb11 authored Dec 21, 2021
1 parent 17aceaf commit 283bdec
Showing 18 changed files with 102 additions and 94 deletions.
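For users, the change is a rename of the same strategy class. Below is a minimal sketch of what code looks like after this commit, assuming DeepSpeed is installed and a multi-GPU machine is available; the constructor arguments mirror the documentation examples changed in the diff and are illustrative only:

    from pytorch_lightning import Trainer
    # Name used prior to this commit: DeepSpeedPlugin
    # Name introduced by this commit:
    from pytorch_lightning.plugins import DeepSpeedStrategy

    # Same constructor arguments as before; only the class name changes.
    trainer = Trainer(
        gpus=4,
        strategy=DeepSpeedStrategy(stage=2, offload_optimizer=True),
        precision=16,
    )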
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -137,7 +137,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).


- Renamed the `TrainingTypePlugin` to `Strategy` ([#11120](https://github.com/PyTorchLightning/pytorch-lightning/pull/11120))
-* Renamed the `DDPPlugin` to `DDPStrategy` ([#11142](https://github.com/PyTorchLightning/pytorch-lightning/pull/11142))
+* Renamed the `DDPPlugin` to `DDPStrategy` ([#11142](https://github.com/PyTorchLightning/pytorch-lightning/pull/11142))
+* Renamed the `DeepSpeedPlugin` to `DeepSpeedStrategy` ([#11194](https://github.com/PyTorchLightning/pytorch-lightning/pull/11194))


- Marked the `ResultCollection`, `ResultMetric`, and `ResultMetricCollection` classes as protected ([#11130](https://github.com/PyTorchLightning/pytorch-lightning/pull/11130))
38 changes: 19 additions & 19 deletions docs/source/advanced/advanced_gpu.rst
@@ -291,7 +291,7 @@ Below we show an example of running `ZeRO-Offload <https://www.deepspeed.ai/tuto
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
model = MyModel()
trainer = Trainer(gpus=4, strategy="deepspeed_stage_2_offload", precision=16)
@@ -310,12 +310,12 @@ You can also modify the ZeRO-Offload parameters via the plugin as below.
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
model = MyModel()
trainer = Trainer(
gpus=4,
-strategy=DeepSpeedPlugin(offload_optimizer=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8),
+strategy=DeepSpeedStrategy(offload_optimizer=True, allgather_bucket_size=5e8, reduce_bucket_size=5e8),
precision=16,
)
trainer.fit(model)
@@ -335,7 +335,7 @@ For even more speed benefit, DeepSpeed offers an optimized CPU version of ADAM c
import pytorch_lightning
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
from deepspeed.ops.adam import DeepSpeedCPUAdam
@@ -379,7 +379,7 @@ Also please have a look at our :ref:`deepspeed-zero-stage-3-tips` which contains
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
from deepspeed.ops.adam import FusedAdam
@@ -403,7 +403,7 @@ You can also use the Lightning Trainer to run predict or evaluate with DeepSpeed
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
class MyModel(pl.LightningModule):
@@ -429,7 +429,7 @@ This reduces the time taken to initialize very large models, as well as ensure w
import torch.nn as nn
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
from deepspeed.ops.adam import FusedAdam
@@ -467,7 +467,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
# Enable CPU Offloading
model = MyModel()
Expand All @@ -478,7 +478,7 @@ DeepSpeed ZeRO Stage 3 Offloads optimizer state, gradients to the host CPU to re
model = MyModel()
trainer = Trainer(
gpus=4,
-strategy=DeepSpeedPlugin(
+strategy=DeepSpeedStrategy(
stage=3,
offload_optimizer=True,
offload_parameters=True,
@@ -496,7 +496,7 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
# Enable CPU Offloading
model = MyModel()
Expand All @@ -507,7 +507,7 @@ Additionally, DeepSpeed supports offloading to NVMe drives for even larger model
model = MyModel()
trainer = Trainer(
gpus=4,
-strategy=DeepSpeedPlugin(
+strategy=DeepSpeedStrategy(
stage=3,
offload_optimizer=True,
offload_parameters=True,
@@ -541,7 +541,7 @@ This saves memory when training larger models, however requires using a checkpoi
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
import deepspeed
@@ -564,7 +564,7 @@ This saves memory when training larger models, however requires using a checkpoi
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
import deepspeed
@@ -589,7 +589,7 @@ This saves memory when training larger models, however requires using a checkpoi
# Enable CPU Activation Checkpointing
trainer = Trainer(
gpus=4,
-strategy=DeepSpeedPlugin(
+strategy=DeepSpeedStrategy(
stage=3,
offload_optimizer=True, # Enable CPU Offloading
cpu_checkpointing=True, # (Optional) offload activations to CPU
@@ -609,7 +609,7 @@ Here is some helpful information when setting up DeepSpeed ZeRO Stage 3 with Lig
* If you're using Adam or AdamW, ensure to use FusedAdam or DeepSpeedCPUAdam (for CPU Offloading) rather than the default torch optimizers as they come with large speed benefits
* Treat your GPU/CPU memory as one large pool. In some cases, you may not want to offload certain things (like activations) to provide even more space to offload model parameters
* When offloading to the CPU, make sure to bump up the batch size as GPU memory will be freed
-* We also support sharded checkpointing. By passing ``save_full_weights=False`` to the ``DeepSpeedPlugin``, we'll save shards of the model which allows you to save extremely large models. However to load the model and run test/validation/predict you must use the Trainer object.
+* We also support sharded checkpointing. By passing ``save_full_weights=False`` to the ``DeepSpeedStrategy``, we'll save shards of the model which allows you to save extremely large models. However to load the model and run test/validation/predict you must use the Trainer object.

.. _deepspeed-zero-stage-3-single-file:

@@ -644,7 +644,7 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
deepspeed_config = {
"zero_allow_untested_optimizer": True,
@@ -678,7 +678,7 @@ In some cases you may want to define your own DeepSpeed Config, to access all pa
}
model = MyModel()
-trainer = Trainer(gpus=4, strategy=DeepSpeedPlugin(deepspeed_config), precision=16)
+trainer = Trainer(gpus=4, strategy=DeepSpeedStrategy(deepspeed_config), precision=16)
trainer.fit(model)
@@ -687,10 +687,10 @@ We support taking the config as a json formatted file:
.. code-block:: python
from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
model = MyModel()
-trainer = Trainer(gpus=4, strategy=DeepSpeedPlugin("/path/to/deepspeed_config.json"), precision=16)
+trainer = Trainer(gpus=4, strategy=DeepSpeedStrategy("/path/to/deepspeed_config.json"), precision=16)
trainer.fit(model)
2 changes: 1 addition & 1 deletion docs/source/api_references.rst
@@ -155,7 +155,7 @@ Training Type Plugins
DDPShardedPlugin
DDPSpawnShardedPlugin
DDPSpawnPlugin
-DeepSpeedPlugin
+DeepSpeedStrategy
HorovodStrategy
SingleTPUPlugin
TPUSpawnPlugin
2 changes: 1 addition & 1 deletion docs/source/common/checkpointing.rst
@@ -383,4 +383,4 @@ Custom Checkpoint IO Plugin
.. note::

-Some ``TrainingTypePlugins`` like ``DeepSpeedPlugin`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable.
+Some ``TrainingTypePlugins`` like ``DeepSpeedStrategy`` do not support custom ``CheckpointIO`` as checkpointing logic is not modifiable.
2 changes: 1 addition & 1 deletion docs/source/extensions/plugins.rst
@@ -114,7 +114,7 @@ Training Type Plugins
DDPShardedPlugin
DDPSpawnShardedPlugin
DDPSpawnPlugin
-DeepSpeedPlugin
+DeepSpeedStrategy
HorovodStrategy
SingleTPUPlugin
TPUSpawnPlugin
2 changes: 1 addition & 1 deletion docs/source/guides/speed.rst
@@ -45,7 +45,7 @@ Lightning supports a variety of plugins to further speed up distributed GPU trai

* :class:`~pytorch_lightning.plugins.training_type.DDPStrategy`
* :class:`~pytorch_lightning.plugins.training_type.DDPShardedPlugin`
-* :class:`~pytorch_lightning.plugins.training_type.DeepSpeedPlugin`
+* :class:`~pytorch_lightning.plugins.training_type.DeepSpeedStrategy`

.. code-block:: python
4 changes: 2 additions & 2 deletions docs/source/starter/lightning_lite.rst
@@ -389,9 +389,9 @@ Additionally, you can pass in your custom training type strategy by configuring

.. code-block:: python
-from pytorch_lightning.plugins import DeepSpeedPlugin
+from pytorch_lightning.plugins import DeepSpeedStrategy
-lite = Lite(strategy=DeepSpeedPlugin(stage=2), accelerator="gpu", devices=2)
+lite = Lite(strategy=DeepSpeedStrategy(stage=2), accelerator="gpu", devices=2)
Support for Horovod and Fully Sharded training strategies are coming soon.
4 changes: 2 additions & 2 deletions pytorch_lightning/lite/lite.py
@@ -26,7 +26,7 @@

from pytorch_lightning.accelerators.accelerator import Accelerator
from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer
-from pytorch_lightning.plugins import DDPSpawnPlugin, DeepSpeedPlugin, PLUGIN_INPUT, Strategy, TPUSpawnPlugin
+from pytorch_lightning.plugins import DDPSpawnPlugin, DeepSpeedStrategy, PLUGIN_INPUT, Strategy, TPUSpawnPlugin
from pytorch_lightning.plugins.training_type.training_type_plugin import TBroadcast
from pytorch_lightning.trainer.connectors.accelerator_connector import AcceleratorConnector
from pytorch_lightning.utilities import _AcceleratorType, _StrategyType, move_data_to_device
@@ -257,7 +257,7 @@ def backward(self, tensor: Tensor, *args: Any, model: Optional[_LiteModule] = No
model as argument here.
"""
module = model.module if model is not None else model
-if isinstance(self._strategy, DeepSpeedPlugin):
+if isinstance(self._strategy, DeepSpeedStrategy):
if model is None:
if self._models_setup == 0:
raise MisconfigurationException(
4 changes: 2 additions & 2 deletions pytorch_lightning/plugins/__init__.py
@@ -22,7 +22,7 @@
from pytorch_lightning.plugins.training_type.ddp import DDPStrategy
from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin
-from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin
+from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedStrategy
from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin
from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedStrategy
from pytorch_lightning.plugins.training_type.horovod import HorovodStrategy
@@ -48,7 +48,7 @@
"DDPStrategy",
"DDPSpawnPlugin",
"DDPFullyShardedStrategy",
-"DeepSpeedPlugin",
+"DeepSpeedStrategy",
"DeepSpeedPrecisionPlugin",
"DoublePrecisionPlugin",
"HorovodStrategy",
2 changes: 1 addition & 1 deletion pytorch_lightning/plugins/training_type/__init__.py
@@ -1,7 +1,7 @@
from pytorch_lightning.plugins.training_type.ddp import DDPStrategy # noqa: F401
from pytorch_lightning.plugins.training_type.ddp2 import DDP2Plugin # noqa: F401
from pytorch_lightning.plugins.training_type.ddp_spawn import DDPSpawnPlugin # noqa: F401
-from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedPlugin # noqa: F401
+from pytorch_lightning.plugins.training_type.deepspeed import DeepSpeedStrategy # noqa: F401
from pytorch_lightning.plugins.training_type.dp import DataParallelPlugin # noqa: F401
from pytorch_lightning.plugins.training_type.fully_sharded import DDPFullyShardedStrategy # noqa: F401
from pytorch_lightning.plugins.training_type.horovod import HorovodStrategy # noqa: F401
8 changes: 4 additions & 4 deletions pytorch_lightning/plugins/training_type/deepspeed.py
@@ -82,7 +82,7 @@ def _move_float_tensors_to_half(self, batch: Any):
return batch


-class DeepSpeedPlugin(DDPStrategy):
+class DeepSpeedStrategy(DDPStrategy):
distributed_backend = _StrategyType.DEEPSPEED
DEEPSPEED_ENV_VAR = "PL_DEEPSPEED_CONFIG_PATH"

@@ -136,7 +136,7 @@ def __init__(
billion parameter models. `For more information: https://pytorch-
lightning.readthedocs.io/en/latest/advanced/multi_gpu.html#deepspeed`.
-.. warning:: ``DeepSpeedPlugin`` is in beta and subject to change.
+.. warning:: ``DeepSpeedStrategy`` is in beta and subject to change.
Defaults have been set to enable ZeRO-Offload and some have been taken from the link below.
These defaults have been set generally, but may require tuning for optimum performance based on your model size.
@@ -619,7 +619,7 @@ def _auto_select_batch_size(self):
deepspeed.utils.logging.logger.warning(
"Tried to infer the batch size for internal deepspeed logging from the `train_dataloader()`. "
"To ensure DeepSpeed logging remains correct, please manually pass the plugin with the "
-"batch size, `Trainer(strategy=DeepSpeedPlugin(logging_batch_size_per_gpu=batch_size))`."
+"batch size, `Trainer(strategy=DeepSpeedStrategy(logging_batch_size_per_gpu=batch_size))`."
)
return batch_size

@@ -755,7 +755,7 @@ def load_checkpoint(self, checkpoint_path: _PATH) -> Dict[str, Any]:
if client_state is None:
raise MisconfigurationException(
"DeepSpeed was unable to load the checkpoint. Ensure you passed in a DeepSpeed compatible checkpoint "
-"or a single checkpoint file with `Trainer(strategy=DeepSpeedPlugin(load_full_weights=True))`."
+"or a single checkpoint file with `Trainer(strategy=DeepSpeedStrategy(load_full_weights=True))`."
)
return client_state

6 changes: 3 additions & 3 deletions pytorch_lightning/trainer/connectors/accelerator_connector.py
@@ -34,8 +34,8 @@
DDPSpawnPlugin,
DDPSpawnShardedPlugin,
DDPStrategy,
-DeepSpeedPlugin,
DeepSpeedPrecisionPlugin,
+DeepSpeedStrategy,
DoublePrecisionPlugin,
FullyShardedNativeMixedPrecisionPlugin,
HorovodStrategy,
@@ -638,7 +638,7 @@ def select_precision_plugin(self) -> PrecisionPlugin:
)
return TPUBf16PrecisionPlugin()

-if self._distrib_type == _StrategyType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedPlugin):
+if self._distrib_type == _StrategyType.DEEPSPEED or isinstance(self._training_type_plugin, DeepSpeedStrategy):
return DeepSpeedPrecisionPlugin(self.precision, self.amp_type, self.amp_level)

if self.precision == 32:
@@ -702,7 +702,7 @@ def select_training_type_plugin(self) -> Strategy:
elif self.use_ddp2:
plugin = DDP2Plugin(parallel_devices=self.parallel_devices, cluster_environment=self.cluster_environment)
elif self.use_ddp and self.use_deepspeed:
-plugin = DeepSpeedPlugin(
+plugin = DeepSpeedStrategy(
cluster_environment=self.select_cluster_environment(), parallel_devices=self.parallel_devices
)
elif self.use_ddp:
6 changes: 3 additions & 3 deletions tests/accelerators/test_accelerator_connector.py
@@ -31,7 +31,7 @@
DDPSpawnPlugin,
DDPSpawnShardedPlugin,
DDPStrategy,
-DeepSpeedPlugin,
+DeepSpeedStrategy,
ParallelPlugin,
PrecisionPlugin,
SingleDevicePlugin,
@@ -420,7 +420,7 @@ def test_plugin_accelerator_choice(accelerator: Optional[str], plugin: str):
("ddp_spawn", DDPSpawnPlugin),
("ddp_sharded", DDPShardedPlugin),
("ddp_sharded_spawn", DDPSpawnShardedPlugin),
-pytest.param("deepspeed", DeepSpeedPlugin, marks=RunIf(deepspeed=True)),
+pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
],
)
@mock.patch("torch.cuda.is_available", return_value=True)
@@ -631,7 +631,7 @@ def test_strategy_choice_cpu_plugin(tmpdir, plugin):
("dp", DataParallelPlugin),
("ddp_sharded", DDPShardedPlugin),
("ddp_sharded_spawn", DDPSpawnShardedPlugin),
-pytest.param("deepspeed", DeepSpeedPlugin, marks=RunIf(deepspeed=True)),
+pytest.param("deepspeed", DeepSpeedStrategy, marks=RunIf(deepspeed=True)),
],
)
def test_strategy_choice_gpu_str(tmpdir, strategy, plugin):
4 changes: 2 additions & 2 deletions tests/lite/test_lite.py
@@ -25,7 +25,7 @@

from pytorch_lightning.lite import LightningLite
from pytorch_lightning.lite.wrappers import _LiteDataLoader, _LiteModule, _LiteOptimizer
-from pytorch_lightning.plugins import DeepSpeedPlugin, PrecisionPlugin, Strategy
+from pytorch_lightning.plugins import DeepSpeedStrategy, PrecisionPlugin, Strategy
from pytorch_lightning.utilities import _StrategyType
from pytorch_lightning.utilities.exceptions import MisconfigurationException
from pytorch_lightning.utilities.seed import pl_worker_init_function
@@ -443,4 +443,4 @@ def run(self):
assert self.broadcast(True)
assert self.is_global_zero == (self.local_rank == 0)

-Lite(strategy=DeepSpeedPlugin(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
+Lite(strategy=DeepSpeedStrategy(stage=3, logging_batch_size_per_gpu=1), devices=2, accelerator="gpu").run()
4 changes: 2 additions & 2 deletions tests/plugins/test_cluster_integration.py
@@ -18,7 +18,7 @@
import torch

from pytorch_lightning import Trainer
-from pytorch_lightning.plugins import DDP2Plugin, DDPShardedPlugin, DDPStrategy, DeepSpeedPlugin
+from pytorch_lightning.plugins import DDP2Plugin, DDPShardedPlugin, DDPStrategy, DeepSpeedStrategy
from pytorch_lightning.plugins.environments import LightningEnvironment, SLURMEnvironment, TorchElasticEnvironment
from pytorch_lightning.utilities import rank_zero_only
from tests.helpers.runif import RunIf
@@ -56,7 +56,7 @@ def environment_combinations():

@pytest.mark.parametrize(
"plugin_cls",
-[DDPStrategy, DDPShardedPlugin, DDP2Plugin, pytest.param(DeepSpeedPlugin, marks=RunIf(deepspeed=True))],
+[DDPStrategy, DDPShardedPlugin, DDP2Plugin, pytest.param(DeepSpeedStrategy, marks=RunIf(deepspeed=True))],
)
def test_ranks_available_manual_plugin_selection(plugin_cls):
"""Test that the rank information is readily available after Trainer initialization."""