[Refactor] the usage of tensordict keys in loss modules #1175

Merged
merged 43 commits on May 31, 2023
Changes from 1 commit
Commits
43 commits
83dc591
[Refactor] the usage of tensordict keys in loss modules
Blonck May 22, 2023
09ced18
Add more loss modules
Blonck May 22, 2023
bc04cae
Add more loss modules
Blonck May 23, 2023
75c8ea1
Refactor remaining loss modules
Blonck May 23, 2023
5a74a16
Remove unnecessary tests
Blonck May 23, 2023
32725b4
tensordict_keys dict is no longer overwritten by child classes
Blonck May 23, 2023
ab94848
Merge branch 'main' into refactor_loss_keys
Blonck May 23, 2023
802fe48
Harmonize key name for "state_value"
Blonck May 23, 2023
c6186fc
Polish refactoring
Blonck May 23, 2023
b694e8c
Merge branch 'main' into refactor_loss_keys
Blonck May 23, 2023
9150b74
Apply suggestions from code review
Blonck May 23, 2023
bcd8a28
Use abstract staticmethod to provide default values
Blonck May 23, 2023
6f10920
Merge branch 'main' into refactor_loss_keys
Blonck May 23, 2023
67941df
Merge branch 'main' and rename tensordict_keys to loss_keys
Blonck May 24, 2023
7f3e129
Use simple set_keys on all loss modules
Blonck May 24, 2023
427c1e8
Implement tensor_keys via _AcceptedKeys dataclass
Blonck May 24, 2023
66fb949
Extended _AcceptedKeys to all loss modules
Blonck May 25, 2023
526ab36
Refactor unit test for tensordict keys
Blonck May 25, 2023
08e20da
Merge branch 'main' into refactor_loss_key_advanced
Blonck May 25, 2023
0d476ca
WIP
Blonck May 25, 2023
9bb616a
Fix .in_keys of ValueEstimatorBase
Blonck May 25, 2023
5d00ca0
Move tensordict key logic to base class
Blonck May 25, 2023
4db47e5
Fix make_value_estimator of a2c.py
Blonck May 25, 2023
6b422f9
Remove '_key' from keynames in ppo.py + polish
Blonck May 26, 2023
317755d
Remove '_key' from keynames in ddpg.py + polish
Blonck May 26, 2023
fe9fba0
Fix documentation in advantages.py
Blonck May 26, 2023
34091e0
Remove '_key' from keynames in dqn.py + polish
Blonck May 26, 2023
4baa5dc
Remove '_key' from keynames in dreamer.py + polish
Blonck May 26, 2023
4595546
Remove '_key' from keynames in iql.py and redq.py + polish
Blonck May 26, 2023
8ae6ad9
Remove tensor_keys from advantage ctor
Blonck May 26, 2023
a15e220
Add documentation to a2c.py
Blonck May 26, 2023
f1187f3
Change documentation of loss modules
Blonck May 26, 2023
3e09c58
Add unit test for advantages tensordict keys
Blonck May 26, 2023
e52a3f2
Merge branch 'main' into refactor_loss_key_advanced
Blonck May 26, 2023
2dc81c9
Improve wording of docstrings
Blonck May 26, 2023
655c28d
Apply suggestions from code review
Blonck May 28, 2023
226d4d3
Merge branch 'pytorch:main' into refactor_loss_keys
Blonck May 28, 2023
75d33c6
Apply code review changes
Blonck May 28, 2023
4320db6
Merge branch 'main' into refactor_loss_keys_github
Blonck May 30, 2023
cf4cd09
Change line breaking in docstrings for _AcceptedKeys
Blonck May 30, 2023
81c0413
LossModule is no longer an abstract base class.
Blonck May 31, 2023
6e753a4
Merge branch 'main' into refactor_loss_keys_github
Blonck May 31, 2023
cc784a1
Merge branch 'main' into refactor_loss_keys
vmoens May 31, 2023
Remove '_key' from keynames in iql.py and redq.py + polish
Blonck committed May 26, 2023
commit 459554617b7a210e15138b3990de74835f81ab22
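As an orienting note (not part of the diff): this commit applies the same rename pattern as the earlier ppo/ddpg/dqn commits, dropping the '_key' suffix from the accessor names on tensor_keys while leaving the default key values untouched. A rough sketch, assuming loss_fn is one of the refactored loss modules and td a tensordict that has already been passed through it, as in the tests below:

    # New accessor name after this commit; the stored default value "td_error"
    # is unchanged, only the attribute on tensor_keys loses its "_key" suffix.
    assert loss_fn.tensor_keys.priority in td.keys()
    # The pre-refactor spelling, now removed:
    # assert loss_fn.tensor_keys.priority_key in td.keys()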
36 changes: 18 additions & 18 deletions test/test_cost.py
@@ -1124,7 +1124,7 @@ def test_td3_batcher(

with _check_td_steady(ms_td):
loss_ms = loss_fn(ms_td)
assert loss_fn.tensor_keys.priority_key in ms_td.keys()
assert loss_fn.tensor_keys.priority in ms_td.keys()

with torch.no_grad():
torch.manual_seed(0) # log-prob is computed with a random action
@@ -2192,7 +2192,7 @@ def test_redq(self, delay_qvalue, num_qvalue, device, td_est):
loss = loss_fn(td)

# check td is left untouched
assert loss_fn.tensor_keys.priority_key in td.keys()
assert loss_fn.tensor_keys.priority in td.keys()

# check that losses are independent
for k in loss.keys():
@@ -2317,7 +2317,7 @@ def test_redq_shared(self, delay_qvalue, num_qvalue, device):
loss_fn.zero_grad()

# check td is left untouched
assert loss_fn.tensor_keys.priority_key in td.keys()
assert loss_fn.tensor_keys.priority in td.keys()

sum([item for _, item in loss.items()]).backward()
named_parameters = list(loss_fn.named_parameters())
@@ -2440,7 +2440,7 @@ def test_redq_batcher(self, n, delay_qvalue, num_qvalue, device, gamma=0.9):

with _check_td_steady(ms_td):
loss_ms = loss_fn(ms_td)
assert loss_fn.tensor_keys.priority_key in ms_td.keys()
assert loss_fn.tensor_keys.priority in ms_td.keys()

with torch.no_grad():
torch.manual_seed(0) # log-prob is computed with a random action
@@ -2514,13 +2514,13 @@ def test_redq_tensordict_keys(self, td_est):
)

default_keys = {
"priority_key": "td_error",
"action_key": "action",
"value_key": "state_value",
"sample_log_prob_key": "sample_log_prob",
"state_action_value_key": "state_action_value",
"priority": "td_error",
"action": "action",
"value": "state_value",
"sample_log_prob": "sample_log_prob",
"state_action_value": "state_action_value",
}
key_mapping = {"value_key": "value_key"}
key_mapping = {"value": "value_key"}
self.tensordict_keys_test(
loss_fn,
default_keys=default_keys,
@@ -4150,7 +4150,7 @@ def test_iql(

with _check_td_steady(td):
loss = loss_fn(td)
assert loss_fn.tensor_keys.priority_key in td.keys()
assert loss_fn.tensor_keys.priority in td.keys()

# check that losses are independent
for k in loss.keys():
@@ -4271,7 +4271,7 @@ def test_iql_batcher(
np.random.seed(0)
with _check_td_steady(ms_td):
loss_ms = loss_fn(ms_td)
assert loss_fn.tensor_keys.priority_key in ms_td.keys()
assert loss_fn.tensor_keys.priority in ms_td.keys()

with torch.no_grad():
torch.manual_seed(0) # log-prob is computed with a random action
@@ -4338,13 +4338,13 @@ def test_iql_tensordict_keys(self, td_est):
)

default_keys = {
"priority_key": "td_error",
"log_prob_key": "_log_prob",
"action_key": "action",
"state_action_value_key": "state_action_value",
"value_key": "state_value",
"priority": "td_error",
"log_prob": "_log_prob",
"action": "action",
"state_action_value": "state_action_value",
"value": "state_value",
}
key_mapping = {"value_key": "value_key"}
key_mapping = {"value": "value_key"}
self.tensordict_keys_test(
loss_fn,
default_keys=default_keys,
61 changes: 41 additions & 20 deletions torchrl/objectives/iql.py
@@ -61,11 +61,32 @@ class IQLLoss(LossModule):

@dataclass
class _AcceptedKeys:
priority_key: NestedKey = "td_error"
log_prob_key: NestedKey = "_log_prob"
action_key: NestedKey = "action"
state_action_value_key: NestedKey = "state_action_value"
value_key: NestedKey = "state_value"
"""Stores default values for all configurable tensordict keys.

This class is used to define and store which tensordict keys are configurable
via `.set_keys(key_name=key_value)` and their default values.

Attributes:
------------
value : NestedKey
The input tensordict key where the state value is expected.
Will be used for the underlying value estimator. Defaults to ``"state_value"``.
action : NestedKey
The input tensordict key where the action is expected. Defaults to ``"action"``.
log_prob : NestedKey
The input tensordict key where the log probability is expected. Defaults to ``"_log_prob"``.
priority : NestedKey
The input tensordict key where the target priority is written to. Defaults to ``"td_error"``.
state_action_value : NestedKey
The input tensordict key where the state action value is expected.
Will be used for the underlying value estimator as value key. Defaults to ``"state_action_value"``.
"""

value: NestedKey = "state_value"
action: NestedKey = "action"
log_prob: NestedKey = "_log_prob"
priority: NestedKey = "td_error"
state_action_value: NestedKey = "state_action_value"

default_keys = _AcceptedKeys()
default_value_estimator = ValueEstimators.TD0
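Usage note (not part of the diff): with _AcceptedKeys in place, tensordict keys are remapped after construction through set_keys rather than through the deprecated *_key constructor arguments. A minimal sketch, assuming actor, qvalue and value are TensorDictModule-compatible networks already built for IQL:

    from torchrl.objectives import IQLLoss

    loss_fn = IQLLoss(actor_network=actor, qvalue_network=qvalue, value_network=value)
    # Defaults come from _AcceptedKeys, e.g. loss_fn.tensor_keys.value == "state_value".
    loss_fn.set_keys(value="my_state_value", priority="my_td_error")
    # Any value estimator created afterwards picks up the remapped value key
    # through _forward_value_estimator_keys / make_value_estimator.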
@@ -86,7 +107,7 @@ def __init__(
if not _has_functorch:
raise ImportError("Failed to import functorch.") from FUNCTORCH_ERROR
super().__init__()
self._set_deprecated_ctor_keys(priority_key=priority_key)
self._set_deprecated_ctor_keys(priority=priority_key)

# IQL parameter
self.temperature = temperature
@@ -143,7 +164,7 @@ def loss_value_diff(diff, expectile=0.8):
def _forward_value_estimator_keys(self, **kwargs) -> None:
if self._value_estimator is not None:
self._value_estimator.set_keys(
value_key=self._tensor_keys.value_key,
value_key=self._tensor_keys.value,
)

def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
@@ -161,7 +182,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
loss_qvalue, priority = self._loss_qvalue(td_device)
loss_value = self._loss_value(td_device)

tensordict_reshape.set(self.tensor_keys.priority_key, priority)
tensordict_reshape.set(self.tensor_keys.priority, priority)
if (loss_actor.shape != loss_qvalue.shape) or (
loss_value is not None and loss_actor.shape != loss_value.shape
):
@@ -174,7 +195,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
"loss_actor": loss_actor.mean(),
"loss_qvalue": loss_qvalue.mean(),
"loss_value": loss_value.mean(),
"entropy": -td_device.get(self.tensor_keys.log_prob_key).mean().detach(),
"entropy": -td_device.get(self.tensor_keys.log_prob).mean().detach(),
}

return TensorDict(
@@ -189,14 +210,14 @@ def _loss_actor(self, tensordict: TensorDictBase) -> Tensor:
params=self.actor_network_params,
)

log_prob = dist.log_prob(tensordict[self.tensor_keys.action_key])
log_prob = dist.log_prob(tensordict[self.tensor_keys.action])

# Min Q value
td_q = tensordict.select(*self.qvalue_network.in_keys)
td_q = vmap(self.qvalue_network, (None, 0))(
td_q, self.target_qvalue_network_params
)
min_q = td_q.get(self.tensor_keys.state_action_value_key).min(0)[0].squeeze(-1)
min_q = td_q.get(self.tensor_keys.state_action_value).min(0)[0].squeeze(-1)

if log_prob.shape != min_q.shape:
raise RuntimeError(
@@ -209,15 +230,15 @@ def _loss_actor(self, tensordict: TensorDictBase) -> Tensor:
td_copy,
params=self.value_network_params,
)
value = td_copy.get(self.tensor_keys.value_key).squeeze(
value = td_copy.get(self.tensor_keys.value).squeeze(
-1
) # assert has no gradient

exp_a = torch.exp((min_q - value) * self.temperature)
exp_a = torch.min(exp_a, torch.FloatTensor([100.0]).to(self.device))

# write log_prob in tensordict for alpha loss
tensordict.set(self.tensor_keys.log_prob_key, log_prob.detach())
tensordict.set(self.tensor_keys.log_prob, log_prob.detach())
return -(exp_a * log_prob).mean()

def _loss_value(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]:
@@ -226,21 +247,21 @@ def _loss_value(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]:
td_q = vmap(self.qvalue_network, (None, 0))(
td_q, self.target_qvalue_network_params
)
min_q = td_q.get(self.tensor_keys.state_action_value_key).min(0)[0].squeeze(-1)
min_q = td_q.get(self.tensor_keys.state_action_value).min(0)[0].squeeze(-1)
# state value
td_copy = tensordict.select(*self.value_network.in_keys)
self.value_network(
td_copy,
params=self.value_network_params,
)
value = td_copy.get(self.tensor_keys.value_key).squeeze(-1)
value = td_copy.get(self.tensor_keys.value).squeeze(-1)
value_loss = self.loss_value_diff(min_q - value, self.expectile).mean()
return value_loss

def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]:
obs_keys = self.actor_network.in_keys
# TODO (refactor key usage): what to do with dynamically generated keys
tensordict = tensordict.select("next", *obs_keys, self.tensor_keys.action_key)
tensordict = tensordict.select("next", *obs_keys, self.tensor_keys.action)

target_value = self.value_estimator.value_estimate(
tensordict, target_params=self.target_value_network_params
@@ -249,9 +270,9 @@ def _loss_qvalue(self, tensordict: TensorDictBase) -> Tuple[Tensor, Tensor]:
tensordict.select(*self.qvalue_network.in_keys),
self.qvalue_network_params,
)
pred_val = tensordict_expand.get(
self.tensor_keys.state_action_value_key
).squeeze(-1)
pred_val = tensordict_expand.get(self.tensor_keys.state_action_value).squeeze(
-1
)
td_error = abs(pred_val - target_value)
loss_qval = (
distance_loss(
@@ -276,7 +297,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
hp.update(hyperparams)
tensor_keys = {
"value_target_key": "value_target",
"value_key": self.tensor_keys.value_key,
"value_key": self.tensor_keys.value,
}
if value_type is ValueEstimators.TD1:
self._value_estimator = TD1Estimator(
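Continuing the sketch above (same assumptions), the remapped key is what the value estimator ends up using:

    from torchrl.objectives import ValueEstimators

    loss_fn.set_keys(value="my_state_value")
    loss_fn.make_value_estimator(ValueEstimators.TD1, gamma=0.99)
    # The TD1 estimator now reads "my_state_value" as its value key and writes its
    # target to the default "value_target" entry, per the tensor_keys dict above.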
52 changes: 36 additions & 16 deletions torchrl/objectives/redq.py
@@ -82,11 +82,31 @@ class REDQLoss(LossModule):

@dataclass
class _AcceptedKeys:
priority_key: NestedKey = "td_error"
action_key: NestedKey = "action"
value_key: NestedKey = "state_value"
sample_log_prob_key: NestedKey = "sample_log_prob"
state_action_value_key: NestedKey = "state_action_value"
"""Stores default values for all configurable tensordict keys.

This class is used to define and store which tensordict keys are configurable
via `.set_keys(key_name=key_value)` and their default values.

Attributes:
------------
value : NestedKey
The input tensordict key where the state value is expected.
Will be used for the underlying value estimator. Defaults to ``"state_value"``.
action : NestedKey
The input tensordict key where the action is expected. Defaults to ``"action"``.
sample_log_prob : NestedKey
The input tensordict key where the sample log probability is expected. Defaults to ``"sample_log_prob"``.
priority : NestedKey
The input tensordict key where the target priority is written to. Defaults to ``"td_error"``.
state_action_value : NestedKey
The input tensordict key where the state action value is expected. Defaults to ``"state_action_value"``.
"""

action: NestedKey = "action"
value: NestedKey = "state_value"
sample_log_prob: NestedKey = "sample_log_prob"
priority: NestedKey = "td_error"
state_action_value: NestedKey = "state_action_value"

default_keys = _AcceptedKeys()
delay_actor: bool = False
@@ -169,7 +189,7 @@ def __init__(
"action tensor in the actor network."
)
target_entropy = -float(
np.prod(actor_network.spec[self.tensor_keys.action_key].shape)
np.prod(actor_network.spec[self.tensor_keys.action].shape)
)
self.register_buffer(
"target_entropy", torch.tensor(target_entropy, device=device)
@@ -182,7 +202,7 @@ def _forward_value_estimator_keys(self, **kwargs) -> None:
def _forward_value_estimator_keys(self, **kwargs) -> None:
if self._value_estimator is not None:
self._value_estimator.set_keys(
value_key=self._tensor_keys.value_key,
value_key=self._tensor_keys.value,
)

@property
@@ -195,7 +215,7 @@ def alpha(self):
def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
obs_keys = self.actor_network.in_keys
tensordict_select = tensordict.clone(False).select(
"next", *obs_keys, self.tensor_keys.action_key
"next", *obs_keys, self.tensor_keys.action
)
selected_models_idx = torch.randperm(self.num_qvalue_nets)[
: self.sub_sample_len
@@ -227,18 +247,18 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
actor_params,
)
if isinstance(self.actor_network, TensorDictSequential):
sample_key = self.tensor_keys.action_key
sample_key = self.tensor_keys.action
tensordict_actor_dist = self.actor_network.build_dist_from_params(
td_params
)
else:
sample_key = self.tensor_keys.action_key
sample_key = self.tensor_keys.action
tensordict_actor_dist = self.actor_network.build_dist_from_params(
td_params
)
tensordict_actor.set(sample_key, tensordict_actor_dist.rsample())
tensordict_actor.set(
self.tensor_keys.sample_log_prob_key,
self.tensor_keys.sample_log_prob,
tensordict_actor_dist.log_prob(tensordict_actor.get(sample_key)),
)

@@ -277,7 +297,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
)

state_action_value = tensordict_qval.get(
self.tensor_keys.state_action_value_key
self.tensor_keys.state_action_value
).squeeze(-1)
(
state_action_value_actor,
@@ -288,7 +308,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
dim=0,
)
sample_log_prob = tensordict_actor.get(
self.tensor_keys.sample_log_prob_key
self.tensor_keys.sample_log_prob
).squeeze(-1)
(
action_log_prob_actor,
@@ -305,7 +325,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
next_state_value = next_state_value.min(0)[0]

tensordict_select.set(
("next", self.tensor_keys.value_key), next_state_value.unsqueeze(-1)
("next", self.tensor_keys.value), next_state_value.unsqueeze(-1)
)
target_value = self.value_estimator.value_estimate(tensordict_select).squeeze(
-1
@@ -319,7 +339,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
loss_function=self.loss_function,
).mean(0)

tensordict.set(self.tensor_keys.priority_key, td_error.detach().max(0)[0])
tensordict.set(self.tensor_keys.priority, td_error.detach().max(0)[0])

loss_alpha = self._loss_alpha(sample_log_prob)
if not loss_qval.shape == loss_actor.shape:
@@ -364,7 +384,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
if hasattr(self, "gamma"):
hp["gamma"] = self.gamma
hp.update(hyperparams)
tensor_keys = {"value_key": self.tensor_keys.value_key}
tensor_keys = {"value_key": self.tensor_keys.value}
# we do not need a value network bc the next state value is already passed
if value_type == ValueEstimators.TD1:
self._value_estimator = TD1Estimator(