final (?)

pytorch · vmoens · Sep 29, 2023 · Sep 15, 2023 · Sep 15, 2023 · Sep 17, 2023
commit 39ed4c3f9827f2adff72aa7f3f2a6fec05edc174
diff --git a/.github/unittest/linux_libs/scripts_gym/batch_scripts.sh b/.github/unittest/linux_libs/scripts_gym/batch_scripts.sh
@@ -51,7 +51,7 @@ do
 
   echo "Testing gym version: ${GYM_VERSION}"
   # handling https://github.com/openai/gym/issues/3202
-  pip install wheel==0.38.4
+  pip3 install wheel==0.38.4
   pip3 install gym==$GYM_VERSION
   $DIR/run_test.sh
 
@@ -69,6 +69,7 @@ do
   conda activate ./cloned_env
 
   echo "Testing gym version: ${GYM_VERSION}"
+  pip3 install wheel==0.38.4
   pip3 install 'gym[atari]'==$GYM_VERSION
   pip3 install ale-py==0.7
   $DIR/run_test.sh

diff --git a/docs/source/reference/envs.rst b/docs/source/reference/envs.rst
@@ -85,22 +85,17 @@ delivery and the ``"next"`` entry is gathered by the :func:`~.utils.step_mdp`
 function.
 
 .. note::
-
-  The Gym(nasium) API recently shifted to a splitting of the ``"done"`` state
-  into a ``termination`` (the env is done and results should not be trusted)
-  and ``truncation`` (an external limit on the number of steps is reached) flags.
-  In TorchRL, ``"done"`` strictly refers to ``termination | truncation``.
+  In general, all TorchRL environments have a ``"done"`` and a ``"terminated"``
+  entry in their output tensordict. If they are not present by design,
+  the :class:`~.EnvBase` metaclass will ensure that every done or terminated
+  entry is flanked with its dual.
+  In TorchRL, ``"done"`` strictly refers to the union of all the end-of-trajectory
+  signals and should be interpreted as "the last step of a trajectory" or
+  equivalently "a signal indicating the need to reset".
   If the environment provides it (eg, Gymnasium), the truncation entry is also
   written in the :meth:`EnvBase.step` output under a ``"truncated"`` entry.
-  If the environment carries a single value, it will interpreted as a ``"done"``
+  If the environment carries a single value, it will be interpreted as a ``"terminated"``
   signal by default.
-  Some classes in TorchRL may require a ``"terminated"`` signal (eg, value functions).
-  If none is available, they will fall back on ``"done"`` instead.
-  The caveat of this choice is that adding a truncation transform (eg, :class:`.StepCounter`)
-  will override the content of the ``"done"`` signal. If this is a problem
-  a :class:`~.RenameTransform` should be used to move or copy the ``"done"``
-  entry (for instance to ``"terminated"``).
-
   By default, TorchRL's collectors and rollout methods will be looking for the ``"done"``
   entry to assess if the environment should be reset.
 

diff --git a/test/test_transforms.py b/test/test_transforms.py
@@ -1534,10 +1534,12 @@ def test_transform_compose(self, max_steps, device, batch, reset_workers):
             _reset = torch.randn(done.shape, device=device) < 0
             td.set("_reset", _reset)
             td.set("done", _reset)
+            td.set("terminated", _reset)
+            td.set(("next", "terminated"), done)
             td.set(("next", "done"), done)
         td.set("step_count", torch.zeros(*batch, 1, dtype=torch.int))
         step_counter[0]._step_count_keys = ["step_count"]
-        step_counter[0]._terminated_keys = ["completed"]
+        step_counter[0]._terminated_keys = ["terminated"]
         step_counter[0]._truncated_keys = ["truncated"]
         step_counter[0]._reset_keys = ["_reset"]
         step_counter[0]._done_keys = ["done"]
@@ -1554,6 +1556,7 @@ def test_transform_compose(self, max_steps, device, batch, reset_workers):
             )
             td = step_mdp(td)
             td["next", "done"] = done
+            td["next", "terminated"] = done
             if max_steps is None:
                 break
 
@@ -1592,11 +1595,14 @@ def test_transform_no_env(self, max_steps, device, batch, reset_workers):
         while not _reset.any() and reset_workers:
             _reset = torch.randn(done.shape, device=device) < 0
             td.set("_reset", _reset)
+            td.set("terminated", _reset)
+            td.set(("next", "terminated"), done)
             td.set("done", _reset)
             td.set(("next", "done"), done)
         td.set("step_count", torch.zeros(*batch, 1, dtype=torch.int))
         step_counter._step_count_keys = ["step_count"]
         step_counter._done_keys = ["done"]
+        step_counter._terminated_keys = ["terminated"]
         step_counter._truncated_keys = ["truncated"]
         step_counter._reset_keys = ["_reset"]
         step_counter._completed_keys = ["completed"]
@@ -1613,6 +1619,7 @@ def test_transform_no_env(self, max_steps, device, batch, reset_workers):
             )
             td = step_mdp(td)
             td["next", "done"] = done
+            td["next", "terminated"] = done
             if max_steps is None:
                 break
 

diff --git a/torchrl/collectors/collectors.py b/torchrl/collectors/collectors.py
@@ -883,12 +883,6 @@ def rollout(self) -> TensorDictBase:
                             self._tensordict_out.ndim - 1,
                             out=self._tensordict_out,
                         )
-                except KeyError:
-                    print("\n\n err during stack")
-                    print("tensordict list", tensordicts)
-                    print("dest", self._tensordict_out)
-                    print("env", self.env)
-                    raise
         return self._tensordict_out
 
     def reset(self, index=None, **kwargs) -> None:

diff --git a/torchrl/envs/common.py b/torchrl/envs/common.py
@@ -653,7 +653,7 @@ def action_spec(self, value: TensorSpec) -> None:
             self.input_spec.lock_()
 
     @property
-    def full_action_spec(self):
+    def full_action_spec(self) -> CompositeSpec:
         """The full action spec.
 
         ``full_action_spec`` is a :class:`~torchrl.data.CompositeSpec`` instance
@@ -678,6 +678,10 @@ def full_action_spec(self):
         """
         return self.input_spec["full_action_spec"]
 
+    @full_action_spec.setter
+    def full_action_spec(self, spec: CompositeSpec) -> None:
+        self.action_spec = spec
+
     # Reward spec
     def _get_reward_keys(self):
         keys = self.output_spec["full_reward_spec"].keys(True, True)
@@ -846,7 +850,7 @@ def reward_spec(self, value: TensorSpec) -> None:
             self.output_spec.lock_()
 
     @property
-    def full_reward_spec(self):
+    def full_reward_spec(self) -> CompositeSpec:
         """The full reward spec.
 
         ``full_reward_spec`` is a :class:`~torchrl.data.CompositeSpec`` instance
@@ -872,6 +876,10 @@ def full_reward_spec(self):
         """
         return self.output_spec["full_reward_spec"]
 
+    @full_reward_spec.setter
+    def full_reward_spec(self, spec: CompositeSpec) -> None:
+        self.reward_spec = spec
+
     # done spec
     def _get_done_keys(self):
         if "full_done_spec" not in self.output_spec.keys():
@@ -914,7 +922,7 @@ def done_key(self):
         return self.done_keys[0]
 
     @property
-    def full_done_spec(self):
+    def full_done_spec(self) -> CompositeSpec:
         """The full done spec.
 
         ``full_done_spec`` is a :class:`~torchrl.data.CompositeSpec`` instance
@@ -944,6 +952,10 @@ def full_done_spec(self):
         """
         return self.output_spec["full_done_spec"]
 
+    @full_done_spec.setter
+    def full_done_spec(self, spec: CompositeSpec) -> None:
+        self.done_spec = spec
+
     # Done spec: done specs belong to output_spec
     @property
     def done_spec(self) -> TensorSpec:
@@ -1177,7 +1189,13 @@ def observation_spec(self, value: TensorSpec) -> None:
         finally:
             self.output_spec.lock_()
 
-    full_observation_spec = observation_spec
+    @property
+    def full_observation_spec(self) -> CompositeSpec:
+        return self.observation_spec
+
+    @full_observation_spec.setter
+    def full_observation_spec(self, spec: CompositeSpec):
+        self.observation_spec = spec
 
     # state spec: state specs belong to input_spec
     @property
@@ -1246,7 +1264,7 @@ def state_spec(self, value: CompositeSpec) -> None:
             self.input_spec.lock_()
 
     @property
-    def full_state_spec(self):
+    def full_state_spec(self) -> CompositeSpec:
         """The full state spec.
 
         ``full_state_spec`` is a :class:`~torchrl.data.CompositeSpec`` instance
@@ -1272,6 +1290,10 @@ def full_state_spec(self):
         """
         return self.state_spec
 
+    @full_state_spec.setter
+    def full_state_spec(self, spec: CompositeSpec) -> None:
+        self.state_spec = spec
+
     def step(self, tensordict: TensorDictBase) -> TensorDictBase:
         """Makes a step in the environment.
 
@@ -1770,6 +1792,7 @@ def policy(td):
             tensordicts.append(tensordict.clone(False))
 
             if i == max_steps - 1:
+                # we don't truncate here, as one could potentially continue the run
                 break
             tensordict = step_mdp(
                 tensordict,
@@ -1810,6 +1833,8 @@ def reset_keys(self) -> List[NestedKey]:
         settings. They are structured as ``(*prefix, "_reset")`` where ``prefix`` is
         a (possibly empty) tuple of strings pointing to a tensordict location
         where a done state can be found.
+
+        The value of ``reset_keys`` is cached.
         """
         reset_keys = self.__dict__.get("_reset_keys", None)
         if reset_keys is not None:
@@ -1844,6 +1869,8 @@ def done_keys_groups(self):
         inner lists contain the done keys (eg, done and truncated) that can
         be read to determine a reset when it is absent.
 
+        The value of ``done_keys_groups`` is cached.
+
         """
         done_keys_sorted = self.__dict__.get("_done_keys_groups", None)
         if done_keys_sorted is not None:

diff --git a/torchrl/envs/gym_like.py b/torchrl/envs/gym_like.py
@@ -135,23 +135,23 @@ def read_action(self, action):
 
     def read_done(
         self,
-        terminated: bool,
+        terminated: bool | None = None,
         truncated: bool | None = None,
         done: bool | None = None,
     ) -> Tuple[bool | np.ndarray, bool | np.ndarray, bool | np.ndarray, bool]:
         """Done state reader.
 
         In torchrl, a `"done"` signal means that a trajectory has reach its end,
         either because it has been interrupted or because it is terminated.
-        Truncated means the trajectory has been interrupted early.
-        Terminated means the task is finished.
+        Truncated means the episode has been interrupted early.
+        Terminated means the task is finished, the episode is completed.
 
         Args:
-            terminated (np.ndarray, boolean or other format): completion state obtained
-                from the environment.
-                ``"terminated"`` equates to ``"termination"`` in gymnasium: the signal that
-                the environment has reached the end of the game, any data coming
-                after this should be considered as nonsensical.
+            terminated (np.ndarray, boolean or other format): completion state
+                obtained from the environment.
+                ``"terminated"`` equates to ``"termination"`` in gymnasium:
+                the signal that the environment has reached the end of the
+                episode, any data coming after this should be considered as nonsensical.
                 Defaults to ``None``.
             truncated (bool or None): early truncation signal.
                 Defaults to ``None``.
@@ -315,26 +315,25 @@ def _output_transform(
         """A method to read the output of the env step.
 
         Must return a tuple: (obs, reward, terminated, truncated, done, info).
-        If only one end-of-trajectory is passed, it is interpreted as ``"done"``
-        (unspecified end-of-traj).
+        If only one end-of-trajectory is passed, it is interpreted as ``"truncated"``.
+        An attempt to retrieve ``"truncated"`` from the info dict is also undertaken.
         If 2 are passed (like in gymnasium), we interpret them as ``"terminated",
         "truncated"`` (``"truncated"`` meaning that the trajectory has been
         interrupted early), and ``"done"`` is the union of the two,
         ie. the unspecified end-of-trajectory signal.
 
         These three concepts have different usage:
 
-          - ``"terminated"`` means that one should not pay attention to the
+          - ``"terminated"`` indicates the final stage of a Markov Decision
+            Process. It means that one should not pay attention to the
             upcoming observations (eg., in value functions) as they should be
             regarded as not valid.
-            This is a "game-over" situation, the result of the action is the
-            end of the game (win or loose).
           - ``"truncated"`` means that the environment has reached a stage where
             we decided to stop the collection for some reason but the next
             observation should not be discarded. If it were not for this
             arbitrary decision, the collection could have proceeded further.
           - ``"done"`` is either one or the other. It is to be interpreted as
-            "a reset should be called at the next step".
+            "a reset should be called before the next step is undertaken".
 
         """
         ...

diff --git a/torchrl/envs/transforms/transforms.py b/torchrl/envs/transforms/transforms.py
@@ -4424,6 +4424,21 @@ def done_keys(self):
         self.__dict__["_done_keys"] = done_keys
         return done_keys
 
+    @property
+    def terminated_keys(self):
+        terminated_keys = self.__dict__.get("_terminated_keys", None)
+        if terminated_keys is None:
+            # make the default terminated keys
+            terminated_keys = []
+            for (terminated_key, *_) in self.parent.done_keys_groups:
+                if isinstance(terminated_key, str):
+                    key = "terminated"
+                else:
+                    key = (*terminated_key[:-1], "terminated")
+                terminated_keys.append(key)
+        self.__dict__["_terminated_keys"] = terminated_keys
+        return terminated_keys
+
     @property
     def step_count_keys(self):
         step_count_keys = self.__dict__.get("_step_count_keys", None)
@@ -4495,11 +4510,11 @@ def reset(self, tensordict: TensorDictBase) -> TensorDictBase:
     def _step(
         self, tensordict: TensorDictBase, next_tensordict: TensorDictBase
     ) -> TensorDictBase:
-        for step_count_key, truncated_key, done_key, done_list_sorted in zip(
+        for step_count_key, truncated_key, done_key, terminated_key in zip(
             self.step_count_keys,
             self.truncated_keys,
             self.done_keys,
-            self.done_keys_groups,
+            self.terminated_keys,
         ):
             step_count = tensordict.get(step_count_key)
             next_step_count = step_count + 1
@@ -4508,10 +4523,9 @@ def _step(
                 truncated = next_step_count >= self.max_steps
                 if self.update_done:
                     done = next_tensordict.get(done_key, None)
-                    if done is None:
-                        done = False
-                        for done in done_list_sorted:
-                            done = done | next_tensordict.get(done_key, default=False)
+                    terminated = next_tensordict.get(terminated_key, None)
+                    if terminated is not None:
+                        truncated = truncated & ~terminated
                     done = truncated | done  # we assume no done after reset
                     next_tensordict.set(done_key, done)
                 next_tensordict.set(truncated_key, truncated)
@@ -5734,8 +5748,8 @@ def _step(
     ) -> TensorDictBase:
         # save the final info
         done = False
-        # TODO: check if there's a done, and if there is, get it
         for done_key in self.done_keys:
+            # we assume dones can be broadcast
             done = done | next_tensordict.get(done_key)
         if done is False:
             raise RuntimeError(
@@ -5808,16 +5822,16 @@ def done_keys(self) -> List[NestedKey]:
         keys = self.__dict__.get("_done_keys", None)
         if keys is None:
             keys = self.parent.done_keys
-            self._done_keys = keys
-            expected_done_keys = {"done", "truncated", "terminated"}
-            # put this check for now. We can consider relaxing that later
-            # and allow nested values, though they will still need to be unique.
-            for done_key in keys:
-                if done_key not in expected_done_keys:
-                    raise RuntimeError(
-                        f"VecGymEnvTransform only supports the following "
-                        f"done keys: {expected_done_keys}, but it got {done_key}."
-                    )
+            # we just want the "done" key
+            _done_keys = []
+            for key in keys:
+                if not isinstance(key, tuple):
+                    key = (key,)
+                if key[-1] == "done":
+                    _done_keys.append(unravel_key(key))
+            if not len(_done_keys):
+                raise RuntimeError("Could not find a 'done' key in the env specs.")
+            self._done_keys = _done_keys
         return keys
 
     @property