[Algorithm] Online Decision transformer #1149

Merged 136 commits on Aug 30, 2023
Changes from 1 commit (of 136 commits)
7d004a0
set struc
BY571 Mar 31, 2023
520b8fb
architecture test
BY571 Apr 4, 2023
4859735
Merge branch 'main' into decision_transformer
BY571 Apr 14, 2023
d19a3e0
Merge branch 'main' into decision_transformer
BY571 Apr 21, 2023
18d3035
update dt transforms
BY571 Apr 21, 2023
d521fa2
update padding
BY571 Apr 21, 2023
c123fe0
take off outputhead
BY571 Apr 24, 2023
cfcc073
update target and testscript
BY571 Apr 26, 2023
2c314c5
Merge branch 'main' into decision_transformer
BY571 Apr 26, 2023
e377ae8
add r2g
BY571 Apr 26, 2023
8b69d6a
update context mask
BY571 Apr 28, 2023
72dc7c8
Merge branch 'main' into decision_transformer
BY571 May 2, 2023
7b9d029
add offline example script first tests
BY571 May 2, 2023
0672e1c
Merge branch 'main' into decision_transformer
BY571 May 4, 2023
e2fb927
Update objective loss
BY571 May 4, 2023
2c657cc
Merge branch 'main' into decision_transformer
BY571 May 5, 2023
69b0974
updates
BY571 May 5, 2023
a5e5da7
add objective
BY571 May 11, 2023
34fc6e8
fix
BY571 May 11, 2023
0200e29
small fixes
BY571 May 12, 2023
001413c
Merge branch 'main' into decision_transformer
BY571 May 12, 2023
9470797
update DT loss docstring
BY571 May 12, 2023
6b8185d
update dt inference wrapper docstring with example
BY571 May 12, 2023
76e3a27
add odt cost tests
BY571 May 12, 2023
247cfd6
Merge branch 'main' into decision_transformer
BY571 May 18, 2023
082a75e
try to add inverse catframes
BY571 May 18, 2023
2b636a6
as_inverse add to catframes
BY571 May 19, 2023
b1788f5
make dt / odt split
BY571 May 22, 2023
c6e3229
add dt odt script
BY571 May 22, 2023
aaa09dd
add dt config
BY571 May 23, 2023
45cbd61
split config
BY571 Jun 1, 2023
54e2b98
merge main and update
BY571 Jun 2, 2023
86ddc44
fix
BY571 Jun 2, 2023
112e800
Merge branch 'main' into decision_transformer
BY571 Jun 2, 2023
170ab13
fix
BY571 Jun 2, 2023
d5177cd
Merge branch 'decision_transformer' of https://github.com/BY571/rl in…
BY571 Jun 2, 2023
1fcbf0e
description catframes
BY571 Jun 2, 2023
165459d
add dt test
BY571 Jun 2, 2023
50f0aa8
add cfg to logger
BY571 Jun 2, 2023
3cc456e
take off detach
BY571 Jun 8, 2023
e890264
Merge branch 'main' into decision_transformer
BY571 Jun 12, 2023
0497449
add loss to docs
BY571 Jun 12, 2023
b24a7f8
update proof_env creation
BY571 Jun 12, 2023
8e04add
move batch to device
BY571 Jun 12, 2023
2ad7af5
remove gpt2model and import directly from hf
BY571 Jun 12, 2023
aeccb22
update docstring
BY571 Jun 12, 2023
2414e9b
update actor docstring
BY571 Jun 12, 2023
b03f3fe
add dispach, in-out-keys
BY571 Jun 12, 2023
e5c4575
update inference actor inputs
BY571 Jun 12, 2023
a5213ce
add inference wrapper to docs
BY571 Jun 12, 2023
6a6b18e
Merge branch 'main' into decision_transformer
BY571 Jun 26, 2023
1f9f885
fix _data
BY571 Jun 27, 2023
792d35c
extract lamb opti
BY571 Jun 27, 2023
0d9fa42
add DT args and example in docstring
BY571 Jun 27, 2023
83642c7
update constant target return and reduction
BY571 Jun 27, 2023
39dda00
fixes for target return transform
BY571 Jun 27, 2023
c5c71e6
update add transformers installed check
BY571 Jun 27, 2023
ca36a0f
update docstring actor DT
BY571 Jun 27, 2023
9c0dfbb
add docstring for modules and examples
BY571 Jun 27, 2023
ddb284e
udpate config
BY571 Jun 27, 2023
cf5de9a
take off unsqueeze in models
BY571 Jun 28, 2023
c3d0ffa
add loss function to config
BY571 Jun 29, 2023
e4ea278
add loss function to config
BY571 Jun 29, 2023
d2c1b08
update loss module
BY571 Jun 29, 2023
a62a647
udpate DT actor docstring
BY571 Jun 29, 2023
2009060
add default transformer config
BY571 Jun 29, 2023
40522c1
merge main
BY571 Jun 29, 2023
623d79a
Merge branch 'main' into decision_transformer
vmoens Jul 3, 2023
77630bd
amend
vmoens Jul 3, 2023
f2defcb
doc
vmoens Jul 3, 2023
f891bd2
tests
vmoens Jul 3, 2023
cf5bc01
lint
vmoens Jul 3, 2023
6d4b591
fix
vmoens Jul 3, 2023
7c0df55
amend
vmoens Jul 3, 2023
d3a3d77
amend
vmoens Jul 4, 2023
8730244
Merge remote-tracking branch 'origin/main' into decision_transformer
vmoens Jul 6, 2023
b1c73da
amend
vmoens Jul 6, 2023
2ec7b0f
fix tests
vmoens Jul 6, 2023
8b8f7b1
fix tests
vmoens Jul 6, 2023
a00aae4
Merge remote-tracking branch 'origin/main' into decision_transformer
vmoens Jul 6, 2023
f49d07d
mesalib glew glfw libosmesa6-dev
vmoens Jul 6, 2023
ff4c34a
libosmesa6-dev
vmoens Jul 6, 2023
40024ed
patchelf
vmoens Jul 6, 2023
dffe5fc
temp hiding
vmoens Jul 6, 2023
2657662
Merge branch 'main' into decision_transformer
vmoens Jul 7, 2023
81d9b34
amend
vmoens Jul 7, 2023
c75eb39
amend
vmoens Jul 7, 2023
540d82b
amend
vmoens Jul 7, 2023
091a119
amend
vmoens Jul 7, 2023
87866a7
amend
vmoens Jul 7, 2023
f0606de
Merge branch 'main' into decision_transformer
vmoens Jul 7, 2023
24c129f
amend
vmoens Jul 7, 2023
ad8d412
empty
vmoens Jul 7, 2023
4a64716
fix wandb
vmoens Jul 7, 2023
f00359d
Merge remote-tracking branch 'origin/main' into decision_transformer_ssh
vmoens Jul 7, 2023
edaa7b5
lint
vmoens Jul 7, 2023
8abb8f3
amend
vmoens Jul 7, 2023
dfcff63
amend
vmoens Jul 7, 2023
395456c
amend
vmoens Jul 7, 2023
d58675e
amend
vmoens Jul 7, 2023
244e429
amend
vmoens Jul 7, 2023
c9338b3
amend
vmoens Jul 7, 2023
cdadf46
Added list of D4RL datasets
MateuszGuzek Jul 10, 2023
4bada6f
Merge remote-tracking branch 'origin/main' into decision_transformer_ssh
vmoens Jul 10, 2023
311d00d
minor
vmoens Jul 10, 2023
587cff6
amend
vmoens Jul 10, 2023
98688b6
Merge branch 'd4rl_direct_download' into decision_transformer_ssh
vmoens Jul 10, 2023
7342c83
amend
vmoens Jul 10, 2023
18c6b00
amend
vmoens Jul 10, 2023
c4c02e6
revert d4rl
vmoens Jul 10, 2023
d67a822
amend
vmoens Jul 11, 2023
29a1067
amend
vmoens Jul 11, 2023
0b8d564
amend
vmoens Jul 11, 2023
3988ebf
Merge remote-tracking branch 'origin/main' into decision_transformer
vmoens Jul 11, 2023
b08d3d4
fix
vmoens Jul 11, 2023
4e57244
Merge remote-tracking branch 'origin/main' into decision_transformer
vmoens Jul 11, 2023
a522db0
fix reward scale, reduce target return config
BY571 Jul 12, 2023
17a86d7
Merge branch 'decision_transformer' of https://github.com/BY571/rl in…
BY571 Jul 12, 2023
aefbf61
amend
vmoens Jul 13, 2023
9afb0a7
Merge branch 'decision_transformer' of https://github.com/BY571/rl in…
vmoens Jul 13, 2023
1c7cbbf
amend
vmoens Jul 13, 2023
11d8779
zero padding, fix obs loc, std for normalization
BY571 Jul 26, 2023
094808a
Merge branch 'decision_transformer' of https://github.com/BY571/rl in…
BY571 Jul 26, 2023
b383339
Merge branch 'main' into decision_transformer
vmoens Jul 28, 2023
3ff2fc6
temp - SerialEnv
vmoens Jul 30, 2023
c43a02d
merge main into branch
BY571 Aug 2, 2023
9135fa7
fix obs norm, fix action context
BY571 Aug 4, 2023
c3a67c8
update buffer transforms to not use catframes
BY571 Aug 7, 2023
ca505eb
test dist, small fixes
BY571 Aug 14, 2023
b260785
update utils
BY571 Aug 22, 2023
a820015
update and fixes
BY571 Aug 23, 2023
a29c3b4
Merge branch 'main' into decision_transformer
BY571 Aug 23, 2023
6220c04
pull changes
BY571 Aug 23, 2023
17093b7
running examples
vmoens Aug 26, 2023
a717c8e
update header, docs and delete dtwrapper
BY571 Aug 28, 2023
3846a21
Merge branch 'decision_transformer' of https://github.com/BY571/rl in…
BY571 Aug 28, 2023
update header, docs and delete dtwrapper
BY571 committed Aug 28, 2023
commit a717c8e1411ca3c5b14ef735b089eed51d26f685
51 changes: 4 additions & 47 deletions examples/decision_transformer/lamb.py
@@ -1,51 +1,8 @@
""" PyTorch Lamb optimizer w/ behaviour similar to NVIDIA FusedLamb
This optimizer code was adapted from the following (starting with latest)
* https://github.com/HabanaAI/Model-References/blob/2b435114fe8e31f159b1d3063b8280ae37af7423/PyTorch/nlp/bert/pretraining/lamb.py
* https://github.com/NVIDIA/DeepLearningExamples/blob/master/PyTorch/LanguageModeling/Transformer-XL/pytorch/lamb.py
* https://github.com/cybertronai/pytorch-lamb
Use FusedLamb if you can (GPU). The reason for including this variant of Lamb is to have a version that is
similar in behaviour to APEX FusedLamb if you aren't using NVIDIA GPUs or cannot install/use APEX.
In addition to some cleanup, this Lamb impl has been modified to support PyTorch XLA and has been tested on TPU.
Original copyrights for above sources are below.
Modifications Copyright 2021 Ross Wightman
"""
# Copyright (c) 2021, Habana Labs Ltd. All rights reserved.

# Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# MIT License
#
# Copyright (c) 2019 cybertronai
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# Lamb optimizer directly copied from https://github.com/facebookresearch/online-dt
import math

import torch
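
For reference, a minimal usage sketch of the trimmed-down optimizer follows. It assumes lamb.py still exposes a Lamb class with a torch.optim.Optimizer-style constructor (as in the timm and online-dt implementations it was copied from); the model below is a placeholder, not part of the example script.

# Hedged usage sketch: `Lamb` and its constructor signature are assumed to match
# the timm/online-dt implementation this file was copied from.
import torch
import torch.nn as nn

from lamb import Lamb  # local module in examples/decision_transformer/

model = nn.Linear(17, 6)  # placeholder network
optimizer = Lamb(model.parameters(), lr=1.0e-4, weight_decay=5.0e-4)

loss = model(torch.randn(8, 17)).pow(2).mean()
loss.backward()
optimizer.step()
optimizer.zero_grad()
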
4 changes: 2 additions & 2 deletions examples/decision_transformer/odt_config.yaml
@@ -35,12 +35,12 @@ replay_buffer:
buffer_prefetch: 64
capacity: 1_000_000
buffer_scratch_dir: "/tmp/"
device: cpu
device: cuda:0
prefetch: 3

# Optimization
optim:
device: cpu
device: cuda:0
lr: 1.0e-4
weight_decay: 5.0e-4
batch_size: 256
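
The replay-buffer and optimizer devices now default to cuda:0 instead of cpu. Below is a hedged sketch of how such fields are typically read from the Hydra/OmegaConf config; the exact wiring lives in examples/decision_transformer/utils.py and may differ.

# Assumed consumption pattern for the two device fields changed above.
import torch
from omegaconf import OmegaConf

cfg = OmegaConf.load("examples/decision_transformer/odt_config.yaml")

optim_device = torch.device(cfg.optim.device)            # "cuda:0" after this change
buffer_device = torch.device(cfg.replay_buffer.device)   # buffer storage/sampling device

# Sampled batches would then be moved to the optimizer device before the update,
# e.g. batch = replay_buffer.sample().to(optim_device)
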
4 changes: 2 additions & 2 deletions examples/decision_transformer/online_dt.py
@@ -13,7 +13,7 @@

from torchrl.envs.libs.gym import set_gym_backend

# from torchrl.envs.utils import ExplorationType, set_exploration_type
from torchrl.envs.utils import ExplorationType, set_exploration_type
from torchrl.modules.tensordict_module import DecisionTransformerInferenceWrapper

from utils import (
@@ -81,7 +81,7 @@ def main(cfg: "DictConfig"):  # noqa: F821
scheduler.step()

# evaluation
with torch.no_grad():
with torch.no_grad(), set_exploration_type(ExplorationType.MODE):
inference_policy.eval()
if i % pretrain_log_interval == 0:
eval_td = test_env.rollout(
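
The evaluation rollout is now wrapped in set_exploration_type(ExplorationType.MODE), so the inference policy returns the mode of its action distribution instead of sampling during evaluation. A sketch of the pattern follows; names such as test_env, inference_policy, and the horizon are placeholders for objects built earlier in online_dt.py.

# Deterministic-evaluation sketch; `test_env` and `inference_policy` refer to
# objects defined elsewhere in online_dt.py.
import torch
from torchrl.envs.utils import ExplorationType, set_exploration_type

with torch.no_grad(), set_exploration_type(ExplorationType.MODE):
    inference_policy.eval()
    eval_td = test_env.rollout(
        max_steps=1000,
        policy=inference_policy,
        auto_cast_to_device=True,
    )
    eval_reward = eval_td["next", "reward"].sum(-2).mean().item()
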
5 changes: 5 additions & 0 deletions examples/decision_transformer/utils.py
@@ -1,3 +1,8 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import torch.nn

import torch.optim
137 changes: 12 additions & 125 deletions torchrl/modules/models/decision_transformer.py
@@ -8,127 +8,12 @@

import importlib
from dataclasses import dataclass
from typing import Any, Optional
from typing import Any

import torch
import torch.nn as nn

_has_transformers = importlib.util.find_spec("transformers") is not None
import transformers
from transformers.models.gpt2.modeling_gpt2 import (
BaseModelOutputWithPastAndCrossAttentions,
GPT2Model,
)


class ModifiedGPT2Model(GPT2Model):
"""Wrapper around the GPT2Model from transformers.

This class is a modified version of the GPT2Model from transformers
as for the Decision Transformer we dont need the wpe layer.

"""

def __init__(self, config):
super(ModifiedGPT2Model, self).__init__(config)

# Remove the wpe layer
del self.wpe

def forward(
self,
inputs_embeds: Optional[torch.FloatTensor] = None,
):
input_shape = inputs_embeds.size()[:-1]

output_attentions = self.config.output_attentions
output_hidden_states = self.config.output_hidden_states
use_cache = self.config.use_cache
return_dict = self.config.use_return_dict

head_mask = self.get_head_mask(None, self.config.n_layer)

hidden_states = inputs_embeds

hidden_states = self.drop(hidden_states)

output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)

presents = () if use_cache else None
all_self_attentions = () if output_attentions else None
all_cross_attentions = (
() if output_attentions and self.config.add_cross_attention else None
)
all_hidden_states = () if output_hidden_states else None
past_key_values = tuple([None] * len(self.h))
for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
# Model parallel
if self.model_parallel:
torch.cuda.set_device(hidden_states.device)
# Ensure layer_past is on same device as hidden_states (might not be correct)
if layer_past is not None:
layer_past = tuple(
past_state.to(hidden_states.device) for past_state in layer_past
)
if isinstance(head_mask, torch.Tensor):
head_mask = head_mask.to(hidden_states.device)
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

outputs = block(
hidden_states,
layer_past=layer_past,
head_mask=head_mask[i],
use_cache=use_cache,
output_attentions=output_attentions,
)

hidden_states = outputs[0]
if use_cache is True:
presents = presents + (outputs[1],)

if output_attentions:
all_self_attentions = all_self_attentions + (
outputs[2 if use_cache else 1],
)
if self.config.add_cross_attention:
all_cross_attentions = all_cross_attentions + (
outputs[3 if use_cache else 2],
)

# Model Parallel: If it's the last layer for that device, put things on the next device
if self.model_parallel:
for k, v in self.device_map.items():
if i == v[-1] and "cuda:" + str(k) != self.last_device:
hidden_states = hidden_states.to("cuda:" + str(k + 1))

hidden_states = self.ln_f(hidden_states)

hidden_states = hidden_states.view(output_shape)
# Add last hidden state
if output_hidden_states:
all_hidden_states = all_hidden_states + (hidden_states,)

if not return_dict:
return tuple(
v
for v in [
hidden_states,
presents,
all_hidden_states,
all_self_attentions,
all_cross_attentions,
]
if v is not None
)

return BaseModelOutputWithPastAndCrossAttentions(
last_hidden_state=hidden_states,
past_key_values=presents,
hidden_states=all_hidden_states,
attentions=all_self_attentions,
cross_attentions=all_cross_attentions,
)


class DecisionTransformer(nn.Module):
@@ -138,14 +23,14 @@ class DecisionTransformer(nn.Module):

The transformer utilizes a default config to create the GPT2 model if the user does not provide a specific config.
default_config = {
"n_embd": 256,
"n_layer": 4,
"n_head": 4,
"n_inner": 1024,
"activation": "relu",
"n_positions": 1024,
"resid_pdrop": 0.1,
"attn_pdrop": 0.1,
... "n_embd": 256,
... "n_layer": 4,
... "n_head": 4,
... "n_inner": 1024,
... "activation": "relu",
... "n_positions": 1024,
... "resid_pdrop": 0.1,
... "attn_pdrop": 0.1,
}

Args:
@@ -210,6 +95,8 @@ def __init__(
raise ImportError(
"transformers is not installed. Please install it with `pip install transformers`."
)
import transformers
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

if config is None:
config = self.default_config()
@@ -240,7 +127,7 @@ def __init__(
self.action_dim = action_dim
self.hidden_size = config["n_embd"]

self.transformer = ModifiedGPT2Model(config=gpt_config)
self.transformer = GPT2Model(config=gpt_config)

self.embed_return = torch.nn.Linear(1, self.hidden_size)
self.embed_state = torch.nn.Linear(self.state_dim, self.hidden_size)
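
With this commit the custom ModifiedGPT2Model wrapper is removed and the stock transformers GPT2Model is used directly, imported lazily inside __init__ so that transformers remains an optional dependency. Below is a hedged sketch of the resulting backbone construction using the default config values from the docstring above; mapping the "activation" key to GPT2Config's activation_function argument is an assumption, not taken from the diff.

# Sketch only: config values mirror DecisionTransformer.default_config; vocab_size
# is set to 1 because the backbone is fed embeddings rather than token ids.
import torch
import transformers
from transformers.models.gpt2.modeling_gpt2 import GPT2Model

gpt_config = transformers.GPT2Config(
    n_embd=256,
    n_layer=4,
    n_head=4,
    n_inner=1024,
    activation_function="relu",
    n_positions=1024,
    resid_pdrop=0.1,
    attn_pdrop=0.1,
    vocab_size=1,
)
backbone = GPT2Model(config=gpt_config)

# The Decision Transformer interleaves (return-to-go, state, action) embeddings
# along the sequence dimension before calling the backbone.
context_length = 20
tokens = torch.randn(2, 3 * context_length, 256)  # (batch, 3 * context, n_embd)
hidden = backbone(inputs_embeds=tokens).last_hidden_state
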
16 changes: 8 additions & 8 deletions torchrl/modules/models/models.py
@@ -1289,14 +1289,14 @@ def __init__(
transformer_config["n_embd"], action_dim, device=device
)

# def weight_init(m):
# """Custom weight init for Conv2D and Linear layers."""
# if isinstance(m, torch.nn.Linear):
# nn.init.orthogonal_(m.weight.data)
# if hasattr(m.bias, "data"):
# m.bias.data.fill_(0.0)

# self.apply(weight_init)
def weight_init(m):
"""Custom weight init for Conv2D and Linear layers."""
if isinstance(m, torch.nn.Linear):
nn.init.orthogonal_(m.weight.data)
if hasattr(m.bias, "data"):
m.bias.data.fill_(0.0)

self.action_layer.apply(weight_init)

def forward(
self,
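
The previously commented-out initializer is re-enabled here, but it is now applied only to the action head rather than to the whole module via self.apply. A standalone sketch of the behaviour, with placeholder sizes:

# Standalone sketch of the re-enabled init: orthogonal weights, zero bias,
# applied to a single Linear action head as in the actor module above.
import torch
import torch.nn as nn

def weight_init(m):
    """Custom weight init for Conv2D and Linear layers."""
    if isinstance(m, torch.nn.Linear):
        nn.init.orthogonal_(m.weight.data)
        if hasattr(m.bias, "data"):
            m.bias.data.fill_(0.0)

action_layer = nn.Linear(256, 6)  # placeholder (n_embd, action_dim)
action_layer.apply(weight_init)
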