Skip to content

Commit

Permalink
[BugFix] Fix R2Go once more (#2089)
Browse files — browse the repository at this point in the history
  • Loading branch information
vmoens authored Apr 18, 2024
1 parent 61c42e4 commit acf168e
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 8 deletions.
12 changes: 12 additions & 0 deletions test/test_cost.py
Original file line number Diff line number Diff line change
Expand Up @@ -13259,6 +13259,18 @@ def test_reward2go(self):
r = torch.stack([r, -r], -1)
torch.testing.assert_close(reward2go(reward, done, 0.9), r)

reward = torch.zeros(4, 1)
reward[3, 0] = 1
done = torch.zeros(4, 1, dtype=bool)
done[3, :] = True
r = torch.ones(4)
r[1:] = 0.9
reward = reward.expand(2, 4, 1)
done = done.expand(2, 4, 1)
r = torch.cumprod(r, 0).flip(0).unsqueeze(-1).expand(2, 4, 1)
r2go = reward2go(reward, done, 0.9)
torch.testing.assert_close(r2go, r)

def test_timedimtranspose_single(self):
@_transpose_time
def fun(a, b, time_dim=-2):
Expand Down
25 changes: 17 additions & 8 deletions torchrl/objectives/value/functional.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
# LICENSE file in the root directory of this source tree.
from __future__ import annotations

import functools
import math

import warnings
Expand Down Expand Up @@ -1362,13 +1363,19 @@ def reward2go(
raise ValueError(
f"reward and done must share the same shape, got {reward.shape} and {done.shape}"
)
# flatten if needed
if reward.ndim > 2:
# we know time dim is at -2, let's put it at -3
rflip = reward.transpose(-2, -3)
rflip_shape = rflip.shape[-2:]
r2go = reward2go(
rflip.flatten(-2, -1), done.transpose(-2, -3).flatten(-2, -1), gamma=gamma
).unflatten(-1, rflip_shape)
return r2go.transpose(-2, -3)

# place time at dim -1
reward = reward.transpose(-2, -1)
done = done.transpose(-2, -1)
# flatten if needed
if reward.ndim > 2:
reward = reward.flatten(0, -2)
done = done.flatten(0, -2)

num_per_traj = _get_num_per_traj(done)
td0_flat = _split_and_pad_sequence(reward, num_per_traj)
Expand All @@ -1379,8 +1386,10 @@ def reward2go(
cumsum = cumsum.reshape_as(reward)
cumsum = cumsum.transpose(-2, -1)
if cumsum.shape != shape:
raise RuntimeError(
f"Wrong shape for output reward2go: {cumsum.shape} when {shape} was expected."
)
# cumsum = cumsum.view(shape)
try:
cumsum = cumsum.reshape(shape)
except RuntimeError:
raise RuntimeError(
f"Wrong shape for output reward2go: {cumsum.shape} when {shape} was expected."
)
return cumsum

1 comment on commit acf168e

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark 'GPU Benchmark Results'.
The benchmark result of this commit is worse than the previous benchmark result, exceeding the alert threshold of 2.

Benchmark suite Current: acf168e Previous: 61c42e4 Ratio
benchmarks/test_replaybuffer_benchmark.py::test_rb_populate[TensorDictPrioritizedReplayBuffer-LazyTensorStorage-None-400] 260.64145498970356 iter/sec (stddev: 0.015830092931712923) 649.9179601884035 iter/sec (stddev: 0.00030697103123672015) 2.49

This comment was automatically generated by workflow using github-action-benchmark.

CC: @vmoens

Please sign in to comment.