[Refactor] Refactor functional calls in losses (pytorch#1707)
vmoens authored Nov 24, 2023
1 parent fa149e4 commit bc7595f
Showing 24 changed files with 273 additions and 449 deletions.
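
At a glance, the refactor replaces the deprecated make_functional/get_functional calls from tensordict.nn with the TensorDict.from_module / to_module API across losses, value estimators, transforms, tests and docstrings. A minimal sketch of the migration pattern, assuming a toy TensorDictModule (the module and keys below are illustrative, not taken from the diff):

    import torch
    from tensordict import TensorDict
    from tensordict.nn import TensorDictModule
    from torch import nn

    module = TensorDictModule(nn.Linear(4, 2), in_keys=["obs"], out_keys=["out"])
    td = TensorDict({"obs": torch.randn(3, 4)}, [3])

    # Old, deprecated functional API:
    #     params = make_functional(module)
    #     module(td, params=params)

    # New API: extract the parameters as a TensorDict and swap them in for the call.
    params = TensorDict.from_module(module)
    with params.to_module(module):
        module(td)
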
10 changes: 8 additions & 2 deletions test/assets/generate.py
@@ -36,14 +36,20 @@ def get_minibatch():
batch_size=16,
block_size=33,
tensorclass_type=PromptData,
dataset_name="test/datasets_mini/openai_summarize_tldr",
dataset_name="CarperAI/openai_summarize_tldr",
device="cpu",
infinite=False,
prefetch=0,
split="train",
from_disk=True,
from_disk=False,
root_dir=tmpdir,
)
for data in dl:
data = data.clone().memmap_("test/datasets_mini/tldr_batch/")
break
print("done")


if __name__ == "__main__":
# generate_small_dataset()
get_minibatch()
25 changes: 18 additions & 7 deletions test/test_cost.py
@@ -47,7 +47,7 @@
get_default_devices,
)
from mocking_classes import ContinuousActionConvMockEnv
from tensordict.nn import get_functional, NormalParamExtractor, TensorDictModule
from tensordict.nn import NormalParamExtractor, TensorDictModule
from tensordict.nn.utils import Buffer

# from torchrl.data.postprocs.utils import expand_as_right
@@ -4967,6 +4967,18 @@ def test_cql(
else:
raise NotImplementedError(k)
loss_fn.zero_grad()
assert all(
(p.grad is None) or (p.grad == 0).all()
for p in loss_fn.actor_network_params.values(
include_nested=True, leaves_only=True
)
)
assert all(
(p.grad is None) or (p.grad == 0).all()
for p in loss_fn.qvalue_network_params.values(
include_nested=True, leaves_only=True
)
)

sum([item for _, item in loss.items()]).backward()
named_parameters = list(loss_fn.named_parameters())
@@ -6500,6 +6512,8 @@ def test_a2c(self, device, gradient_mode, advantage, td_est):
assert ("critic" not in name) or ("target_" in name)

value.zero_grad()
for n, p in loss_fn.named_parameters():
assert p.grad is None or p.grad.norm() == 0, n
loss_objective.backward()
named_parameters = loss_fn.named_parameters()
for name, p in named_parameters:
@@ -6900,20 +6914,20 @@ def test_reinforce_value_net(self, advantage, gradient_mode, delay_value, td_est
advantage = GAE(
gamma=gamma,
lmbda=0.9,
value_network=get_functional(value_net),
value_network=value_net,
differentiable=gradient_mode,
)
elif advantage == "td":
advantage = TD1Estimator(
gamma=gamma,
value_network=get_functional(value_net),
value_network=value_net,
differentiable=gradient_mode,
)
elif advantage == "td_lambda":
advantage = TDLambdaEstimator(
gamma=0.9,
lmbda=0.9,
value_network=get_functional(value_net),
value_network=value_net,
differentiable=gradient_mode,
)
elif advantage is None:
@@ -9829,9 +9843,6 @@ def test_tdlambda_tensor_gamma(self, device, gamma, lmbda, N, T, has_done):
next_state_value = torch.randn(*N, T, 1, device=device)

gamma_tensor = torch.full((*N, T, 1), gamma, device=device)
# if len(N) == 2:
# print(terminated[4, 0, -10:])
# print(done[4, 0, -10:])
v1 = vec_td_lambda_advantage_estimate(
gamma,
lmbda,
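
In the value-estimator tests above, the explicit get_functional wrapping is dropped: GAE, TD1Estimator and TDLambdaEstimator now accept the stateful value network directly and handle the functional call internally. A small sketch under that assumption, with an illustrative toy value network:

    import torch
    from tensordict.nn import TensorDictModule
    from torch import nn
    from torchrl.objectives.value import GAE

    value_net = TensorDictModule(
        nn.Linear(3, 1), in_keys=["observation"], out_keys=["state_value"]
    )
    # Previously: value_network=get_functional(value_net); the module is now passed as-is.
    advantage = GAE(gamma=0.99, lmbda=0.9, value_network=value_net, differentiable=False)
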
1 change: 1 addition & 0 deletions torchrl/data/rlhf/dataset.py
@@ -137,6 +137,7 @@ def load(self):
data_dir = root_dir / str(Path(self.dataset_name).name).split("-")[0]
data_dir_total = data_dir / split / str(max_length)
# search for data
print(data_dir_total)
if os.path.exists(data_dir_total):
dataset = TensorDict.load_memmap(data_dir_total)
return dataset
25 changes: 9 additions & 16 deletions torchrl/envs/transforms/rlhf.py
@@ -5,18 +5,13 @@
from copy import copy, deepcopy

import torch
from tensordict import TensorDictBase, unravel_key
from tensordict.nn import (
make_functional,
ProbabilisticTensorDictModule,
repopulate_module,
TensorDictParams,
)
from tensordict import TensorDict, TensorDictBase, unravel_key
from tensordict.nn import ProbabilisticTensorDictModule, TensorDictParams
from tensordict.utils import is_seq_of_nested_key
from torch import nn
from torchrl.data.tensor_specs import CompositeSpec, UnboundedContinuousTensorSpec
from torchrl.envs.transforms.transforms import Transform
from torchrl.envs.transforms.utils import _set_missing_tolerance
from torchrl.envs.transforms.utils import _set_missing_tolerance, _stateless_param


class KLRewardTransform(Transform):
@@ -116,11 +111,10 @@ def __init__(
self.in_keys = self.in_keys + actor.in_keys

# check that the model has parameters
params = make_functional(
actor, keep_params=False, funs_to_decorate=["forward", "get_dist"]
)
self.functional_actor = deepcopy(actor)
repopulate_module(actor, params)
params = TensorDict.from_module(actor)
with params.apply(_stateless_param).to_module(actor):
# copy a stateless actor
self.__dict__["functional_actor"] = deepcopy(actor)
# we need to register these params as buffer to have `to` and similar
# methods work properly

@@ -170,9 +164,8 @@ def _call(self, tensordict: TensorDictBase) -> TensorDictBase:
if self.out_keys[0] != ("reward",) and self.parent is not None:
tensordict.set(self.out_keys[0], self.parent.reward_spec.zero())
return tensordict
dist = self.functional_actor.get_dist(
tensordict.clone(False), params=self.frozen_params
)
with self.frozen_params.to_module(self.functional_actor):
dist = self.functional_actor.get_dist(tensordict.clone(False))
# get the log_prob given the original model
log_prob = dist.log_prob(action)
reward_key = self.in_keys[0]
9 changes: 9 additions & 0 deletions torchrl/envs/transforms/utils.py
@@ -5,6 +5,7 @@


import torch
from torch import nn


def check_finite(tensor: torch.Tensor):
@@ -59,3 +60,11 @@ def _get_reset(reset_key, tensordict):
if _reset.ndim > parent_td.ndim:
_reset = _reset.flatten(parent_td.ndim, -1).any(-1)
return _reset


def _stateless_param(param):
is_param = isinstance(param, nn.Parameter)
param = param.data.to("meta")
if is_param:
return nn.Parameter(param, requires_grad=False)
return param
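
The new _stateless_param helper returns a weight-free copy of each parameter on the "meta" device; KLRewardTransform (above) uses it so that the deep-copied actor carries only the module structure, while the real frozen weights are swapped in at call time via to_module. A rough usage sketch, assuming an arbitrary nn.Linear actor (illustrative only):

    from copy import deepcopy

    from tensordict import TensorDict
    from torch import nn
    from torchrl.envs.transforms.utils import _stateless_param

    actor = nn.Linear(4, 4)
    params = TensorDict.from_module(actor)

    # Swap meta-device placeholders into the module before copying, so the copy
    # holds no actual weights.
    with params.apply(_stateless_param).to_module(actor):
        stateless_copy = deepcopy(actor)
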
21 changes: 10 additions & 11 deletions torchrl/modules/tensordict_module/actors.py
@@ -183,7 +183,7 @@ class ProbabilisticActor(SafeProbabilisticTensorDictSequential):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn import TensorDictModule, make_functional
>>> from tensordict.nn import TensorDictModule
>>> from torchrl.data import BoundedTensorSpec
>>> from torchrl.modules import ProbabilisticActor, NormalParamWrapper, TanhNormal
>>> td = TensorDict({"observation": torch.randn(3, 4)}, [3,])
@@ -197,8 +197,9 @@ class ProbabilisticActor(SafeProbabilisticTensorDictSequential):
... in_keys=["loc", "scale"],
... distribution_class=TanhNormal,
... )
>>> params = make_functional(td_module)
>>> td = td_module(td, params=params)
>>> params = TensorDict.from_module(td_module)
>>> with params.to_module(td_module):
... td = td_module(td)
>>> td
TensorDict(
fields={
@@ -319,7 +320,6 @@ class ValueOperator(TensorDictModule):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn import make_functional
>>> from torch import nn
>>> from torchrl.data import UnboundedContinuousTensorSpec
>>> from torchrl.modules import ValueOperator
@@ -334,8 +334,9 @@ class ValueOperator(TensorDictModule):
>>> td_module = ValueOperator(
... in_keys=["observation", "action"], module=module
... )
>>> params = make_functional(td_module)
>>> td = td_module(td, params=params)
>>> params = TensorDict.from_module(td_module)
>>> with params.to_module(td_module):
... td = td_module(td)
>>> print(td)
TensorDict(
fields={
@@ -792,7 +793,6 @@ class QValueHook:
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn.functional_modules import make_functional
>>> from torch import nn
>>> from torchrl.data import OneHotDiscreteTensorSpec
>>> from torchrl.modules.tensordict_module.actors import QValueHook, Actor
@@ -878,7 +878,6 @@ class DistributionalQValueHook(QValueHook):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn.functional_modules import make_functional
>>> from torch import nn
>>> from torchrl.data import OneHotDiscreteTensorSpec
>>> from torchrl.modules.tensordict_module.actors import DistributionalQValueHook, Actor
@@ -893,12 +892,13 @@ class DistributionalQValueHook(QValueHook):
... return self.linear(x).view(-1, nbins, 4).log_softmax(-2)
...
>>> module = CustomDistributionalQval()
>>> params = make_functional(module)
>>> params = TensorDict.from_module(module)
>>> action_spec = OneHotDiscreteTensorSpec(4)
>>> hook = DistributionalQValueHook("one_hot", support = torch.arange(nbins))
>>> module.register_forward_hook(hook)
>>> qvalue_actor = Actor(module=module, spec=action_spec, out_keys=["action", "action_value"])
>>> qvalue_actor(td, params=params)
>>> with params.to_module(module):
... qvalue_actor(td)
>>> print(td)
TensorDict(
fields={
@@ -992,7 +992,6 @@ class QValueActor(SafeSequential):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn.functional_modules import make_functional
>>> from torch import nn
>>> from torchrl.data import OneHotDiscreteTensorSpec
>>> from torchrl.modules.tensordict_module.actors import QValueActor
6 changes: 3 additions & 3 deletions torchrl/modules/tensordict_module/common.py
@@ -138,7 +138,6 @@ class SafeModule(TensorDictModule):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn.functional_modules import make_functional
>>> from torchrl.data import UnboundedContinuousTensorSpec
>>> from torchrl.modules import TensorDictModule
>>> td = TensorDict({"input": torch.randn(3, 4), "hidden": torch.randn(3, 8)}, [3,])
@@ -150,8 +149,9 @@ class SafeModule(TensorDictModule):
... in_keys=["input", "hidden"],
... out_keys=["output"],
... )
>>> params = make_functional(td_fmodule)
>>> td_functional = td_fmodule(td.clone(), params=params)
>>> params = TensorDict.from_module(td_fmodule)
>>> with params.to_module(td_fmodule):
... td_functional = td_fmodule(td.clone())
>>> print(td_functional)
TensorDict(
fields={
6 changes: 3 additions & 3 deletions torchrl/modules/tensordict_module/sequence.py
@@ -33,7 +33,6 @@ class SafeSequential(TensorDictSequential, SafeModule):
Examples:
>>> import torch
>>> from tensordict import TensorDict
>>> from tensordict.nn.functional_modules import make_functional
>>> from torchrl.data import CompositeSpec, UnboundedContinuousTensorSpec
>>> from torchrl.modules import TanhNormal, SafeSequential, TensorDictModule, NormalParamWrapper
>>> from torchrl.modules.tensordict_module import SafeProbabilisticModule
@@ -58,8 +57,9 @@ class SafeSequential(TensorDictSequential, SafeModule):
... out_keys=["output"],
... )
>>> td_module = SafeSequential(td_module1, td_module2)
>>> params = make_functional(td_module)
>>> td_module(td, params=params)
>>> params = TensorDict.from_module(td_module)
>>> with params.to_module(td_module):
... td_module(td)
>>> print(td)
TensorDict(
fields={
14 changes: 8 additions & 6 deletions torchrl/objectives/a2c.py
@@ -327,8 +327,8 @@ def _log_probs(
f"tensordict stored {self.tensor_keys.action} require grad."
)
tensordict_clone = tensordict.select(*self.actor.in_keys).clone()

dist = self.actor.get_dist(tensordict_clone, params=self.actor_params)
with self.actor_params.to_module(self.actor):
dist = self.actor.get_dist(tensordict_clone)
log_prob = dist.log_prob(action)
log_prob = log_prob.unsqueeze(-1)
return log_prob, dist
@@ -339,10 +339,10 @@ def loss_critic(self, tensordict: TensorDictBase) -> torch.Tensor:
# overhead that we could easily reduce.
target_return = tensordict.get(self.tensor_keys.value_target)
tensordict_select = tensordict.select(*self.critic.in_keys)
state_value = self.critic(
tensordict_select,
params=self.critic_params,
).get(self.tensor_keys.value)
with self.critic_params.to_module(self.critic):
state_value = self.critic(
tensordict_select,
).get(self.tensor_keys.value)
loss_value = distance_loss(
target_return,
state_value,
@@ -374,6 +374,7 @@ def forward(self, tensordict: TensorDictBase) -> TensorDictBase:
target_params=self.target_critic_params,
)
advantage = tensordict.get(self.tensor_keys.advantage)
assert not advantage.requires_grad
log_probs, dist = self._log_probs(tensordict)
loss = -(log_probs * advantage)
td_out = TensorDict({"loss_objective": loss.mean()}, [])
@@ -392,6 +393,7 @@ def make_value_estimator(self, value_type: ValueEstimators = None, **hyperparams
self.value_type = value_type
hp = dict(default_value_kwargs(value_type))
hp.update(hyperparams)

if hasattr(self, "gamma"):
hp["gamma"] = self.gamma
if value_type == ValueEstimators.TD1: