[Feature] Make advantages compatible with Terminated, Truncated, Done #1581

Merged
merged 199 commits on Oct 2, 2023

Commits (199)
9e5f303
init
vmoens Sep 15, 2023
03b6089
fix
vmoens Sep 15, 2023
512b596
amend
vmoens Sep 17, 2023
3fa8ac0
amend
vmoens Sep 17, 2023
162aa6e
amend
vmoens Sep 17, 2023
2c5cddc
amend
vmoens Sep 17, 2023
c703a02
amend
vmoens Sep 17, 2023
2b78f49
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 17, 2023
822f42a
amend
vmoens Sep 18, 2023
db8e0c1
amend
vmoens Sep 18, 2023
9c7e1a5
amend
vmoens Sep 18, 2023
99bcc4a
amend
vmoens Sep 18, 2023
9b0069e
lint
vmoens Sep 18, 2023
ac43a7e
fix step counter
vmoens Sep 18, 2023
a822407
amend
vmoens Sep 18, 2023
6cec6e1
amend
vmoens Sep 18, 2023
0612e09
amend
vmoens Sep 18, 2023
02902db
Update torchrl/envs/gym_like.py
vmoens Sep 19, 2023
7e22c55
Update torchrl/envs/gym_like.py
vmoens Sep 19, 2023
53927bc
Update torchrl/envs/gym_like.py
vmoens Sep 19, 2023
43ff66a
amend
vmoens Sep 19, 2023
77dcd09
rollout
vmoens Sep 19, 2023
f404245
fix
vmoens Sep 19, 2023
a79b57c
fix
vmoens Sep 19, 2023
149781f
fix
vmoens Sep 19, 2023
71a05b1
fix
vmoens Sep 19, 2023
87bad08
remove prints
vmoens Sep 19, 2023
92c3814
amend
vmoens Sep 19, 2023
42d4c40
amend
vmoens Sep 19, 2023
aec627c
amend
vmoens Sep 19, 2023
b2303bd
amend
vmoens Sep 19, 2023
f6a497b
amend
vmoens Sep 19, 2023
aac630f
amend
vmoens Sep 19, 2023
606ee3a
lint and fixes
vmoens Sep 19, 2023
6ba0d38
amend
vmoens Sep 19, 2023
76b3f0c
amend
vmoens Sep 19, 2023
035c274
amend
vmoens Sep 19, 2023
8bd932f
amend
vmoens Sep 19, 2023
c789e50
amend
vmoens Sep 19, 2023
3e93f13
amend
vmoens Sep 19, 2023
cba97b1
amend
vmoens Sep 19, 2023
7ec7c78
amend
vmoens Sep 19, 2023
dd4c45e
amend
vmoens Sep 19, 2023
0ea0716
fix robohive
vmoens Sep 19, 2023
16d688e
amend
vmoens Sep 20, 2023
268dbd7
amend
vmoens Sep 20, 2023
d77d1cd
amend
vmoens Sep 20, 2023
15bd9fa
amend
vmoens Sep 20, 2023
1b656f7
amend
vmoens Sep 20, 2023
2f13c95
amend
vmoens Sep 20, 2023
aa5de06
amend
vmoens Sep 20, 2023
284262f
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 20, 2023
afcc527
amend
vmoens Sep 20, 2023
9eae41d
amend
vmoens Sep 20, 2023
9210f3b
amend
vmoens Sep 20, 2023
b31b2f0
amend
vmoens Sep 21, 2023
4e8acc0
init
vmoens Sep 21, 2023
c8579f9
init
vmoens Sep 21, 2023
d95989c
Merge branch 'fix_dreamer_tests' into threads_mp
vmoens Sep 21, 2023
e22c318
Merge branch 'terminal_truncated' into myo_threaded
vmoens Sep 21, 2023
a5e9ce3
prints
vmoens Sep 21, 2023
b6e83d5
amend
vmoens Sep 21, 2023
acf89e6
amend
vmoens Sep 21, 2023
16fba2e
amend
vmoens Sep 21, 2023
697c523
amend
vmoens Sep 21, 2023
369492d
fix
vmoens Sep 21, 2023
c50263c
Merge branch 'main' into terminal_truncated
vmoens Sep 21, 2023
de82499
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 21, 2023
e14dcd1
fix
vmoens Sep 21, 2023
4a8424c
amend
vmoens Sep 21, 2023
5056b9e
amend
vmoens Sep 21, 2023
ce26e13
amend
vmoens Sep 21, 2023
5a95850
amend
vmoens Sep 21, 2023
5e38d70
Update torchrl/collectors/collectors.py
vmoens Sep 21, 2023
9eb1c98
amend
vmoens Sep 21, 2023
bccbf67
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 21, 2023
a285780
amend
vmoens Sep 21, 2023
40a8e83
Merge branch 'terminal_truncated' into myo_threaded
vmoens Sep 21, 2023
d8f9505
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 21, 2023
e1eba40
Merge remote-tracking branch 'origin/main' into threads_mp
vmoens Sep 21, 2023
9f58a8d
lint
vmoens Sep 21, 2023
1c4f35f
amend
vmoens Sep 21, 2023
93bd2e6
Merge branch 'threads_mp' into terminal_truncated
vmoens Sep 21, 2023
f6e09e3
amend
vmoens Sep 21, 2023
2cd07c1
amend
vmoens Sep 22, 2023
acf6118
amend
vmoens Sep 22, 2023
bb52ce1
tests
vmoens Sep 22, 2023
0d0bc3c
Merge branch 'main' into terminal_truncated
vmoens Sep 22, 2023
3ef139b
amend
vmoens Sep 22, 2023
0d3ba02
amend
vmoens Sep 22, 2023
0b32209
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 22, 2023
0638e13
amend
vmoens Sep 22, 2023
a3bd3e1
tmp
vmoens Sep 22, 2023
a8ed5e5
amend
vmoens Sep 22, 2023
54db14e
amend
vmoens Sep 22, 2023
0122fac
amend
vmoens Sep 23, 2023
b18e6e7
amend
vmoens Sep 23, 2023
2f99f16
amend
vmoens Sep 23, 2023
6aa0c9e
amend
vmoens Sep 23, 2023
2b42070
amend
vmoens Sep 23, 2023
7279b12
amend
vmoens Sep 23, 2023
f5ab14d
amend
vmoens Sep 23, 2023
4a9b6b9
amend
vmoens Sep 23, 2023
696324b
amend
vmoens Sep 23, 2023
8890911
Update docs/source/reference/envs.rst
vmoens Sep 24, 2023
e24c2f3
add doc
vmoens Sep 24, 2023
c029f12
amend
vmoens Sep 24, 2023
b37129d
amend
vmoens Sep 24, 2023
9afc783
amend
vmoens Sep 24, 2023
f65622a
amend
vmoens Sep 24, 2023
989eecf
fix VIP
vmoens Sep 24, 2023
77559e0
lint
vmoens Sep 24, 2023
117e41e
osx_skips
vmoens Sep 24, 2023
19fdc33
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 24, 2023
8326674
amend
vmoens Sep 24, 2023
90da3fd
refactor tests
vmoens Sep 25, 2023
f4ddb92
Refactoring: terminated, truncated, done
vmoens Sep 25, 2023
7988118
amend
vmoens Sep 25, 2023
85d5035
let _step return partial done in batched envs
vmoens Sep 25, 2023
b4fa338
fix mocking classes
vmoens Sep 25, 2023
648792d
more fixes
vmoens Sep 25, 2023
9777332
fix step count in equivalence test
vmoens Sep 25, 2023
9063b5b
fix transforms
vmoens Sep 25, 2023
f47e49d
fix transforms
vmoens Sep 25, 2023
c238e9b
fix transformed env
vmoens Sep 25, 2023
502a59b
amend
vmoens Sep 25, 2023
73af141
amend
vmoens Sep 25, 2023
983e246
amend
vmoens Sep 26, 2023
555bca9
amend
vmoens Sep 26, 2023
85f58de
remove calls to done_key
vmoens Sep 26, 2023
6fc39d0
fix step counter
vmoens Sep 26, 2023
175b701
vec envs
vmoens Sep 26, 2023
92e7826
vec envs
vmoens Sep 26, 2023
cd6eaea
amend
vmoens Sep 26, 2023
23e139b
amend
vmoens Sep 26, 2023
677c408
amend
vmoens Sep 26, 2023
3c79081
d4rl
vmoens Sep 26, 2023
54e75b0
d4rl unsqueeze
vmoens Sep 26, 2023
244e3d3
amend
vmoens Sep 26, 2023
09abb71
amend
vmoens Sep 26, 2023
384db24
minor
vmoens Sep 26, 2023
3693cac
amend
vmoens Sep 26, 2023
ca54133
amend
vmoens Sep 26, 2023
0fdc522
amend
vmoens Sep 26, 2023
fdad78f
test_terminated_or_truncated_spec
vmoens Sep 26, 2023
f454e11
more fixes
vmoens Sep 26, 2023
88cee59
--capture no
vmoens Sep 26, 2023
57ccb63
attempt to limit collector idle time
vmoens Sep 26, 2023
e5b0d23
lint
vmoens Sep 26, 2023
dfe726f
amend
vmoens Sep 26, 2023
4f6ce90
amend
vmoens Sep 26, 2023
b16b939
amend
vmoens Sep 26, 2023
daaaddd
amend
vmoens Sep 26, 2023
cd4811f
amend
vmoens Sep 26, 2023
c0f3137
fixes
vmoens Sep 26, 2023
8f9d8fe
fix r3m, vip and vc1
vmoens Sep 27, 2023
4fdf437
fix robohive, d4rl
vmoens Sep 27, 2023
4f44579
amend
vmoens Sep 27, 2023
7787b28
amend
vmoens Sep 27, 2023
5c964a9
amend
vmoens Sep 27, 2023
03001bc
amend
vmoens Sep 27, 2023
04bbee9
lint
vmoens Sep 27, 2023
6ac830e
adapt tests
vmoens Sep 27, 2023
6fbe8c0
amend
vmoens Sep 27, 2023
28fefb6
lint
vmoens Sep 27, 2023
8256e2f
fix gym 0.19
vmoens Sep 27, 2023
1619b09
missing deps
vmoens Sep 27, 2023
23926b2
fix gym truncated
vmoens Sep 27, 2023
e3b8253
fix gym truncated (bis)
vmoens Sep 27, 2023
2a4a1b6
amend
vmoens Sep 27, 2023
7f4c38b
amend
vmoens Sep 27, 2023
2e54626
Merge branch 'main' into terminal_truncated
vmoens Sep 27, 2023
977488e
Merge remote-tracking branch 'origin/main' into terminal_truncated
vmoens Sep 27, 2023
4f932e0
amend
vmoens Sep 27, 2023
5e6a0f0
amend
vmoens Sep 27, 2023
411f097
lint
vmoens Sep 27, 2023
39ed4c3
final (?)
vmoens Sep 28, 2023
e56f9b7
Merge branch 'terminal_truncated' into truncated_vals
vmoens Sep 28, 2023
aa3707d
amend
vmoens Sep 28, 2023
85bc738
amend
vmoens Sep 28, 2023
21ea856
addressing review
vmoens Sep 28, 2023
71f2d86
Merge branch 'terminal_truncated' into truncated_vals
vmoens Sep 28, 2023
448728b
amend
vmoens Sep 28, 2023
f0ee4dd
more fixes
vmoens Sep 28, 2023
7906387
amend
vmoens Sep 28, 2023
72c1240
cloning dones
vmoens Sep 28, 2023
2dde687
Merge branch 'terminal_truncated' into truncated_vals
vmoens Sep 28, 2023
0d65bfa
amend
vmoens Sep 28, 2023
234b680
amend
vmoens Sep 28, 2023
2c7ffb0
amend
vmoens Sep 29, 2023
1bdccfc
Merge branch 'terminal_truncated' into truncated_vals
vmoens Sep 29, 2023
2f5b598
amend
vmoens Sep 29, 2023
b21f0ae
amend
vmoens Sep 29, 2023
1aeed2a
Merge remote-tracking branch 'origin/main' into truncated_vals
vmoens Oct 2, 2023
e635ba2
amend
vmoens Oct 2, 2023
fef45e6
fix marl examples
vmoens Oct 2, 2023
5d5b740
amend
vmoens Oct 2, 2023
42a25da
amend
vmoens Oct 2, 2023
4a8cc84
amend
vmoens Oct 2, 2023
fix marl examples
vmoens committed Oct 2, 2023
commit fef45e6ebc5ee6bc5a10b880ae863a44261514ac
13 changes: 6 additions & 7 deletions examples/multiagent/iql.py
@@ -21,6 +21,7 @@
 from torchrl.modules.models.multiagent import MultiAgentMLP
 from torchrl.objectives import DQNLoss, SoftUpdate, ValueEstimators
 from utils.logging import init_logging, log_evaluation, log_training
+from utils.utils import DoneTransform


 def rendering_callback(env, td):
@@ -118,13 +119,18 @@ def train(cfg: "DictConfig"):  # noqa: F821
         sampler=SamplerWithoutReplacement(),
         batch_size=cfg.train.minibatch_size,
     )
+    replay_buffer.append_transform(
+        DoneTransform(reward_key=env.reward_key, done_keys=env.done_keys)
+    )

     loss_module = DQNLoss(qnet, delay_value=True)
     loss_module.set_keys(
         action_value=("agents", "action_value"),
         action=env.action_key,
         value=("agents", "chosen_action_value"),
         reward=env.reward_key,
+        done="done_expand",
+        terminated="terminated_expand",
     )
     loss_module.make_value_estimator(ValueEstimators.TD0, gamma=cfg.loss.gamma)
     target_net_updater = SoftUpdate(loss_module, eps=1 - cfg.loss.tau)
@@ -144,13 +150,6 @@ def train(cfg: "DictConfig"):  # noqa: F821

         sampling_time = time.time() - sampling_start

-        tensordict_data.set(
-            ("next", "done"),
-            tensordict_data.get(("next", "done"))
-            .unsqueeze(-1)
-            .expand(tensordict_data.get(("next", env.reward_key)).shape),
-        )  # We need to expand the done to match the reward shape
-
         current_frames = tensordict_data.numel()
         total_frames += current_frames
         data_view = tensordict_data.reshape(-1)
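The deleted block above expanded the env-level done flag inline so it matched the per-agent reward shape; DoneTransform (added below in utils/utils.py) now performs the same expansion for every done key. A minimal sketch of that shape manipulation, with hypothetical shapes (8 envs, 100 steps, 4 agents, scalar reward):

import torch

# Hypothetical shapes, for illustration only.
reward = torch.zeros(8, 100, 4, 1)               # per-agent ("next", ("agents", "reward"))
done = torch.zeros(8, 100, 1, dtype=torch.bool)  # env-level ("next", "done")

# Broadcast the env-level done flag across the agent dimension so it can be
# consumed alongside the per-agent reward by the loss/value estimator:
done_expand = done.unsqueeze(-1).expand(reward.shape)
assert done_expand.shape == torch.Size([8, 100, 4, 1])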
13 changes: 6 additions & 7 deletions examples/multiagent/maddpg_iddpg.py
@@ -26,6 +26,7 @@
 from torchrl.modules.models.multiagent import MultiAgentMLP
 from torchrl.objectives import DDPGLoss, SoftUpdate, ValueEstimators
 from utils.logging import init_logging, log_evaluation, log_training
+from utils.utils import DoneTransform


 def rendering_callback(env, td):
@@ -140,10 +141,15 @@ def train(cfg: "DictConfig"):  # noqa: F821
         sampler=SamplerWithoutReplacement(),
         batch_size=cfg.train.minibatch_size,
     )
+    replay_buffer.append_transform(
+        DoneTransform(reward_key=env.reward_key, done_keys=env.done_keys)
+    )

     loss_module = DDPGLoss(
         actor_network=policy, value_network=value_module, delay_value=True
     )
+    loss_module.set_keys(done="done_expand", terminated="terminated_expand")
+
     loss_module.set_keys(
         state_action_value=("agents", "state_action_value"),
         reward=env.reward_key,
@@ -170,13 +176,6 @@ def train(cfg: "DictConfig"):  # noqa: F821

         sampling_time = time.time() - sampling_start

-        tensordict_data.set(
-            ("next", "done"),
-            tensordict_data.get(("next", "done"))
-            .unsqueeze(-1)
-            .expand(tensordict_data.get(("next", env.reward_key)).shape),
-        )  # We need to expand the done to match the reward shape
-
         current_frames = tensordict_data.numel()
         total_frames += current_frames
         data_view = tensordict_data.reshape(-1)
17 changes: 8 additions & 9 deletions examples/multiagent/mappo_ippo.py
@@ -22,7 +22,7 @@
 from torchrl.modules.models.multiagent import MultiAgentMLP
 from torchrl.objectives import ClipPPOLoss, ValueEstimators
 from utils.logging import init_logging, log_evaluation, log_training
-
+from utils.utils import DoneTransform

 def rendering_callback(env, td):
     env.frames.append(env.render(mode="rgb_array", agent_index_focus=None))
@@ -126,6 +126,7 @@ def train(cfg: "DictConfig"):  # noqa: F821
         storing_device=cfg.train.device,
         frames_per_batch=cfg.collector.frames_per_batch,
         total_frames=cfg.collector.total_frames,
+        postproc=DoneTransform(reward_key=env.reward_key, done_keys=env.done_keys),
     )

     replay_buffer = TensorDictReplayBuffer(
@@ -142,7 +143,12 @@ def train(cfg: "DictConfig"):  # noqa: F821
         entropy_coef=cfg.loss.entropy_eps,
         normalize_advantage=False,
     )
-    loss_module.set_keys(reward=env.reward_key, action=env.action_key)
+    loss_module.set_keys(
+        reward=env.reward_key,
+        action=env.action_key,
+        done="done_expand",
+        terminated="terminated_expand",
+    )
     loss_module.make_value_estimator(
         ValueEstimators.GAE, gamma=cfg.loss.gamma, lmbda=cfg.loss.lmbda
     )
@@ -165,13 +171,6 @@ def train(cfg: "DictConfig"):  # noqa: F821

         sampling_time = time.time() - sampling_start

-        tensordict_data.set(
-            ("next", "done"),
-            tensordict_data.get(("next", "done"))
-            .unsqueeze(-1)
-            .expand(tensordict_data.get(("next", env.reward_key)).shape),
-        )  # We need to expand the done to match the reward shape
-
         with torch.no_grad():
             loss_module.value_estimator(
                 tensordict_data,
15 changes: 8 additions & 7 deletions examples/multiagent/sac.py
@@ -23,6 +23,7 @@
 from torchrl.modules.models.multiagent import MultiAgentMLP
 from torchrl.objectives import DiscreteSACLoss, SACLoss, SoftUpdate, ValueEstimators
 from utils.logging import init_logging, log_evaluation, log_training
+from utils.utils import DoneTransform


 def rendering_callback(env, td):
@@ -186,6 +187,9 @@ def train(cfg: "DictConfig"):  # noqa: F821
         sampler=SamplerWithoutReplacement(),
         batch_size=cfg.train.minibatch_size,
     )
+    replay_buffer.append_transform(
+        DoneTransform(reward_key=env.reward_key, done_keys=env.done_keys)
+    )

     if cfg.env.continuous_actions:
         loss_module = SACLoss(
@@ -198,6 +202,8 @@ def train(cfg: "DictConfig"):  # noqa: F821
             state_action_value=("agents", "state_action_value"),
             action=env.action_key,
             reward=env.reward_key,
+            done="done_expand",
+            terminated="terminated_expand",
         )
     else:
         loss_module = DiscreteSACLoss(
@@ -211,6 +217,8 @@ def train(cfg: "DictConfig"):  # noqa: F821
             action_value=("agents", "action_value"),
             action=env.action_key,
             reward=env.reward_key,
+            done="done_expand",
+            terminated="terminated_expand",
         )

     loss_module.make_value_estimator(ValueEstimators.TD0, gamma=cfg.loss.gamma)
@@ -235,13 +243,6 @@ def train(cfg: "DictConfig"):  # noqa: F821

         sampling_time = time.time() - sampling_start

-        tensordict_data.set(
-            ("next", "done"),
-            tensordict_data.get(("next", "done"))
-            .unsqueeze(-1)
-            .expand(tensordict_data.get(("next", env.reward_key)).shape),
-        )  # We need to expand the done to match the reward shape
-
         current_frames = tensordict_data.numel()
         total_frames += current_frames
         data_view = tensordict_data.reshape(-1)
32 changes: 32 additions & 0 deletions examples/multiagent/utils/utils.py
@@ -0,0 +1,32 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
from torchrl.envs import Transform


def append_suffix(key, suffix):
    # Append a suffix to a key, recursing into the last element of nested
    # tuple keys: "done" -> "done_expand", ("agents", "done") -> ("agents", "done_expand").
    if isinstance(key, str):
        return key + suffix
    return key[:-1] + (append_suffix(key[-1], suffix),)


class DoneTransform(Transform):
    """Expands the 'done' entries (incl. terminated) to match the reward shape.

    Can be appended to a replay buffer or a collector.
    """

    def __init__(self, reward_key, done_keys):
        super().__init__()
        self.reward_key = reward_key
        self.done_keys = done_keys

    def forward(self, tensordict):
        for done_key in self.done_keys:
            tensordict.set(
                ("next", append_suffix(done_key, "_expand")),
                tensordict.get(("next", done_key))
                .unsqueeze(-1)
                .expand(tensordict.get(("next", self.reward_key)).shape),
            )
        return tensordict
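A rough usage sketch (keys and buffer arguments here are illustrative, not taken verbatim from one of the example scripts): the transform can be appended to a replay buffer, or passed to a collector via postproc as in mappo_ippo.py above.

from torchrl.data import TensorDictReplayBuffer
from torchrl.data.replay_buffers.samplers import SamplerWithoutReplacement

# Illustrative keys; the example scripts use env.reward_key and env.done_keys.
rb = TensorDictReplayBuffer(sampler=SamplerWithoutReplacement(), batch_size=64)
rb.append_transform(
    DoneTransform(reward_key=("agents", "reward"), done_keys=["done", "terminated"])
)
# The loss module is then pointed at the expanded entries:
# loss_module.set_keys(done="done_expand", terminated="terminated_expand")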
17 changes: 10 additions & 7 deletions torchrl/objectives/value/advantages.py
@@ -242,9 +242,9 @@ def forward(
             and ``"next"`` tensordict state as returned by the environment)
             necessary to compute the value estimates and the TDEstimate.
             The data passed to this module should be structured as
-            :obj:`[*B, T, F]` where :obj:`B` are
+            :obj:`[*B, T, *F]` where :obj:`B` are
             the batch size, :obj:`T` the time dimension and :obj:`F` the
-            feature dimension(s).
+            feature dimension(s). The tensordict must have shape ``[*B, T]``.
         params (TensorDictBase, optional): A nested TensorDict containing the params
             to be passed to the functional value network module.
         target_params (TensorDictBase, optional): A nested TensorDict containing the
@@ -500,9 +500,9 @@ def forward(
             tensordict state as returned by the environment) necessary to
             compute the value estimates and the TDEstimate.
             The data passed to this module should be structured as
-            :obj:`[*B, T, F]` where :obj:`B` are
+            :obj:`[*B, T, *F]` where :obj:`B` are
             the batch size, :obj:`T` the time dimension and :obj:`F` the
-            feature dimension(s).
+            feature dimension(s). The tensordict must have shape ``[*B, T]``.
         params (TensorDictBase, optional): A nested TensorDict containing the params
             to be passed to the functional value network module.
         target_params (TensorDictBase, optional): A nested TensorDict containing the
@@ -701,8 +701,9 @@ def forward(
             ``("next", "done")``, ``("next", "terminated")``,
             and ``"next"`` tensordict state as returned by the environment)
             necessary to compute the value estimates and the TDEstimate.
-            The data passed to this module should be structured as :obj:`[*B, T, F]` where :obj:`B` are
+            The data passed to this module should be structured as :obj:`[*B, T, *F]` where :obj:`B` are
             the batch size, :obj:`T` the time dimension and :obj:`F` the feature dimension(s).
+            The tensordict must have shape ``[*B, T]``.
         params (TensorDictBase, optional): A nested TensorDict containing the params
             to be passed to the functional value network module.
         target_params (TensorDictBase, optional): A nested TensorDict containing the
@@ -910,8 +911,9 @@ def forward(
             ``("next", "done")``, ``("next", "terminated")``,
             and ``"next"`` tensordict state as returned by the environment)
             necessary to compute the value estimates and the TDLambdaEstimate.
-            The data passed to this module should be structured as :obj:`[*B, T, F]` where :obj:`B` are
+            The data passed to this module should be structured as :obj:`[*B, T, *F]` where :obj:`B` are
             the batch size, :obj:`T` the time dimension and :obj:`F` the feature dimension(s).
+            The tensordict must have shape ``[*B, T]``.
         params (TensorDictBase, optional): A nested TensorDict containing the params
             to be passed to the functional value network module.
         target_params (TensorDictBase, optional): A nested TensorDict containing the
@@ -1150,8 +1152,9 @@ def forward(
             ``("next", "done")``, ``("next", "terminated")``,
             and ``"next"`` tensordict state as returned by the environment)
             necessary to compute the value estimates and the GAE.
-            The data passed to this module should be structured as :obj:`[*B, T, F]` where :obj:`B` are
+            The data passed to this module should be structured as :obj:`[*B, T, *F]` where :obj:`B` are
             the batch size, :obj:`T` the time dimension and :obj:`F` the feature dimension(s).
+            The tensordict must have shape ``[*B, T]``.
         params (TensorDictBase, optional): A nested TensorDict containing the params
             to be passed to the functional value network module.
         target_params (TensorDictBase, optional): A nested TensorDict containing the
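A minimal sketch of the documented layout (the shapes and the toy value network are hypothetical): the tensordict carries batch and time in its batch size ``[*B, T]``, while feature dimensions live only in the leaf tensors.

import torch
from tensordict import TensorDict
from tensordict.nn import TensorDictModule
from torchrl.objectives.value import GAE

B, T = 4, 10  # batch of 4 trajectories, 10 time steps
data = TensorDict(
    {
        "observation": torch.randn(B, T, 3),
        "next": {
            "observation": torch.randn(B, T, 3),
            "reward": torch.randn(B, T, 1),
            "done": torch.zeros(B, T, 1, dtype=torch.bool),
            "terminated": torch.zeros(B, T, 1, dtype=torch.bool),
        },
    },
    batch_size=[B, T],  # [*B, T]: the feature dims (3 and 1) stay out of the batch size
)
value_net = TensorDictModule(
    torch.nn.Linear(3, 1), in_keys=["observation"], out_keys=["state_value"]
)
gae = GAE(gamma=0.99, lmbda=0.95, value_network=value_net)
gae(data)  # writes "advantage" and "value_target" into data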
2 changes: 1 addition & 1 deletion torchrl/objectives/value/functional.py
@@ -521,7 +521,7 @@ def td1_return_estimate(
     gamma = gamma * not_terminated
     g = next_state_value[..., -1, :]
     for i in reversed(range(T)):
-        # if not done and not terminated, get the bootstrapped value
+        # if not done (and hence not terminated), get the bootstrapped value
         # if done but not terminated, get next_val
         # if terminated, take nothing (gamma = 0)
         dnt = done_but_not_terminated[..., i, :]
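The distinction drawn in these comments can be illustrated with a one-step backup (a hedged sketch, not torchrl's implementation): only a terminated step zeroes the bootstrap term, while a truncated step (done but not terminated) still bootstraps from the next state's value.

import torch

def one_step_return(reward, next_value, terminated, gamma=0.99):
    # terminated -> no bootstrapping (discount effectively 0);
    # truncated (done but not terminated) -> keep bootstrapping from next_value.
    return reward + gamma * (~terminated).float() * next_value

reward = torch.tensor([1.0, 1.0])
next_value = torch.tensor([2.0, 2.0])
terminated = torch.tensor([False, True])
print(one_step_return(reward, next_value, terminated))
# tensor([2.9800, 1.0000])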