accelerate integration #58
Commit message:

- added comments on example
- fixes CI test
- rewards should be a list of tensors
- clearer error messages
- remove build model method
- refactor log stats method

Co-authored-by: Leandro von Werra <lvwerra@users.noreply.github.com>
```diff
@@ -1,3 +1,17 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import torch
 import time
 from tqdm import tqdm

@@ -7,12 +21,29 @@
 from transformers import pipeline, AutoTokenizer
 from datasets import load_dataset

-from trl import PPOTrainer
+from trl import PPOTrainer, AutoModelForCausalLMWithValueHead
 from trl.trainer import LengthSampler

+########################################################################
+# This is a fully working, simple example of how to use trl with accelerate.
+#
+# This example fine-tunes a GPT2 model on the IMDB dataset using PPO
+# (proximal policy optimization) in any of the following settings
+# (with the same script):
+#   - single CPU or single GPU
+#   - multiple GPUs (using PyTorch distributed mode)
+#   - multiple GPUs (using DeepSpeed ZeRO-Offload stages 1 & 2)
+#   - fp16 (mixed precision) or fp32 (normal precision)
+#
+# To run it in each of these various modes, first initialize the
+# accelerate configuration with `accelerate config`.
+#
+########################################################################

+# We first define the configuration of the experiment: the model, the dataset,
+# the training parameters, and the PPO parameters.
 config = {
     "model_name": "lvwerra/gpt2-imdb",
     # "model_name": "facebook/opt-350m",
     "dataset_name": "imdb",
     "cls_model_name": "lvwerra/distilbert-imdb",
     "steps": 20000,
```
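The banner above says to initialize the accelerate configuration first; as a usage sketch, running the script in any of the listed modes would then look like the following (the script filename here is hypothetical):

```bash
# Answer the interactive prompts once to choose CPU/GPU, distributed mode,
# DeepSpeed, and mixed precision.
accelerate config

# Launch the example with the saved configuration (hypothetical filename).
accelerate launch ppo_sentiment_example.py
```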
```diff
@@ -34,12 +65,17 @@
     "vf_coef": .1,
 }

+# We then define the arguments to pass to the sentiment analysis pipeline.
+# We set `return_all_scores` to True to get the score for each class label.
 sent_kwargs = {
     "return_all_scores": True,
     "function_to_apply": "none",
     "batch_size": config["forward_batch_size"]
 }
```

> **Review comment** (on `sent_kwargs`): Same here re data classes
```diff
+# Below is an example function to build the dataset. In our case, we use the
+# IMDB dataset from the `datasets` library. One should customize this function
+# to train the model on one's own dataset.
 def build_dataset(config):
     """
     Build dataset for training. This builds the dataset from `load_dataset`, one should

@@ -61,7 +97,6 @@ def build_dataset(config):
     ds = ds.rename_columns({'text': 'review', 'label': 'sentiment'})
     ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)

     input_size = LengthSampler(config["txt_in_min_len"], config["txt_in_max_len"])

     def tokenize(sample):

@@ -77,20 +112,36 @@ def collater(data):
     dataloader = torch.utils.data.DataLoader(ds, batch_size=config['batch_size'], collate_fn=collater)
     return dataloader

+# We retrieve the dataloader by calling the `build_dataset` function.
 dataloader = build_dataset(config)
-ppo_trainer = PPOTrainer(dataloader, **config)
-dataloader = ppo_trainer.dataloader

+# Now let's build the model, the reference model, and the tokenizer.
+model = AutoModelForCausalLMWithValueHead.from_pretrained(config["model_name"])
+ref_model = AutoModelForCausalLMWithValueHead.from_pretrained(config["model_name"])
+tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
+tokenizer.pad_token = tokenizer.eos_token

-tokenizer = ppo_trainer.tokenizer
+# We then build the PPOTrainer, passing the model, the reference model, and the tokenizer.
+ppo_trainer = PPOTrainer(model, ref_model, tokenizer, dataloader, **config)

+# The PPOTrainer has a `dataloader` attribute, which we can use to get the dataloader -
+# this step is important in a distributed setting, as the dataloader needs to be
+# converted to a distributed dataloader.
+dataloader = ppo_trainer.dataloader

+# We then build the sentiment analysis pipeline, passing the model name and the
+# sentiment analysis pipeline arguments. Let's also make sure to set the device
+# to the same device as the PPOTrainer.
 device = ppo_trainer.accelerator.device
 if device.index is None:
     # single GPU - maybe introduce this hack inside PPOTrainer?
     device = 0
 sentiment_pipe = pipeline("sentiment-analysis", "lvwerra/distilbert-imdb", device=device)

+# We then define the arguments to pass to the `generate` function. These arguments
+# are passed to the `generate` function of the PPOTrainer, which is a wrapper around
+# the `generate` function of the trained model.
 gen_kwargs = {
     "min_length": -1,
     "top_k": 0.0,
```

> **Review comment** (on `gen_kwargs`): Might be cleaner to use the new
>
> **Reply:** Agreed!
>
> **Reply:** Yes, let's not commit to very new features, otherwise we need very hard
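The hunk cuts off before the end of the dict, but as a hedged usage sketch, these kwargs would typically flow into generation as below; `query_tensor`, standing in for one tokenized query from the batch, is an assumption and is not shown in this diff:

```python
# Minimal sketch: generate one response for a single tokenized query.
# `query_tensor` is assumed to be a 1-D LongTensor of input token ids;
# the diff's own comment says PPOTrainer exposes a `generate` wrapper.
response_tensor = ppo_trainer.generate(query_tensor, **gen_kwargs)
response_text = tokenizer.decode(response_tensor.squeeze())
```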
```diff
@@ -116,13 +167,13 @@ def collater(data):
     t = time.time()
     texts = [q + r for q, r in zip(batch['query'], batch['response'])]
     pipe_outputs = sentiment_pipe(texts, **sent_kwargs)
-    rewards = torch.tensor([output[1]["score"] for output in pipe_outputs]).to(device)
+    rewards = [torch.tensor(output[1]["score"]).to(device) for output in pipe_outputs]
     timing['time/get_sentiment_preds'] = time.time() - t

     #### Run PPO step
     t = time.time()
     stats = ppo_trainer.step(query_tensors, response_tensors, rewards)
-    ppo_trainer.log_stats(stats, timing, batch, rewards, t0, logs)
+    ppo_trainer.log_stats(stats, batch, rewards, logs, timing, t0)
     # Log the timing of the whole optimization step.
     timing['time/optimization'] = time.time() - t
```

> **Review comment** (on the `texts` line): with the remove columns method inside the trainer the
>
> **Reply:** The queries are kept here: https://github.com/younesbelkada/trl/blob/d2c363fe4018c74df829ed6c067fad50ecaaf479/trl/trainer/ppo_trainer.py#L152 but maybe we can change that, wdyt?
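For context on the reward change above: with `return_all_scores=True` the pipeline returns, for each input text, a list with one dict per class label, and the PPO step now expects the rewards as a plain Python list of scalar tensors rather than one stacked tensor. A minimal sketch of that mapping, assuming `lvwerra/distilbert-imdb` orders its labels as NEGATIVE then POSITIVE (scores illustrative):

```python
import torch

# Hypothetical pipeline output for two texts; `function_to_apply="none"`
# means the scores are raw logits rather than softmax probabilities.
pipe_outputs = [
    [{"label": "NEGATIVE", "score": -1.2}, {"label": "POSITIVE", "score": 2.3}],
    [{"label": "NEGATIVE", "score": 0.4}, {"label": "POSITIVE", "score": -0.7}],
]

# output[1]["score"] picks the POSITIVE logit as the reward for each sample.
rewards = [torch.tensor(output[1]["score"]) for output in pipe_outputs]

assert isinstance(rewards, list)
assert all(r.dim() == 0 for r in rewards)  # one scalar tensor per sample
```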
> **Review comment:** Suggestion: use data classes instead of dicts for the config (easier to refactor in the future), like we do in transformers: https://github.com/huggingface/transformers/blob/bbcd961897aa6cc439ef4cca5cef6db4283c5b76/examples/pytorch/text-classification/run_glue.py#L70
>
> **Reply:** Added a simple dataclass for now: 747d5f0. Maybe we can refactor it as it is done in transformers in a follow-up PR!
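
A minimal sketch of what such a dataclass-based config could look like, using field names from the dict in this example; this is an illustration, not the actual dataclass added in 747d5f0:

```python
from dataclasses import dataclass

@dataclass
class ScriptConfig:
    # Field names mirror the config dict used in the example script.
    model_name: str = "lvwerra/gpt2-imdb"
    dataset_name: str = "imdb"
    cls_model_name: str = "lvwerra/distilbert-imdb"
    steps: int = 20000
    vf_coef: float = 0.1
    # `forward_batch_size` appears in the script but its value is not shown
    # in the hunks above; 16 here is an assumed placeholder.
    forward_batch_size: int = 16

config = ScriptConfig()
print(config.model_name)  # "lvwerra/gpt2-imdb"
```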