Standardize dataset_num_proc usage (#1925)
* uniform dataset_num_proc

* num_proc in shuffle

* Update examples/datasets/anthropic_hh.py

Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>

* Update examples/scripts/ppo.py

Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>

* Update examples/scripts/ppo.py

Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>

---------

Co-authored-by: Quentin Gallouédec <quentin.gallouedec@huggingface.co>
Co-authored-by: lewtun <lewis.c.tunstall@gmail.com>
3 people authored Aug 13, 2024
1 parent a9a7565 commit 54f806b
Showing 25 changed files with 94 additions and 59 deletions.
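The same edit repeats across the example scripts below: drop the hard-coded `multiprocessing.cpu_count()` / `num_proc=8` worker counts, expose a `dataset_num_proc` option that defaults to `None`, and thread it into the scripts' `map` and `filter` calls. A minimal standalone sketch of the resulting pattern (the `imdb` dataset and the toy `process` function are illustrative, not from the commit):

```python
from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset
from transformers import HfArgumentParser


@dataclass
class ScriptArguments:
    dataset_num_proc: Optional[int] = field(
        default=None, metadata={"help": "The number of workers to use for dataset processing"}
    )


if __name__ == "__main__":
    args = HfArgumentParser(ScriptArguments).parse_args_into_dataclasses()[0]
    ds = load_dataset("imdb", split="train")

    def process(row):
        # toy transform standing in for the scripts' formatting logic
        row["text"] = row["text"].strip()
        return row

    # None (the default) keeps processing single-process; an integer fans the
    # work out over that many workers, replacing the hard-coded values removed below.
    ds = ds.map(process, num_proc=args.dataset_num_proc)
    ds = ds.filter(lambda x: len(x["text"]) > 0, num_proc=args.dataset_num_proc)
```

Since `HfArgumentParser` turns each dataclass field into a CLI flag, the option surfaces as `--dataset_num_proc`, so callers choose a worker count per run rather than per script.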
6 changes: 4 additions & 2 deletions examples/datasets/anthropic_hh.py
@@ -1,4 +1,3 @@
-import multiprocessing
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
@@ -32,6 +31,9 @@ class ScriptArguments:
         default=True, metadata={"help": "Update the main revision of the repository"}
     )
     push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the dataset to the Hugging Face Hub"})
+    dataset_num_proc: Optional[int] = field(
+        default=None, metadata={"help": "The number of workers to use for dataset processing"}
+    )


 # GPT-4 generated 😄 Define a function to process the input and extract the dialogue into structured format
@@ -79,8 +81,8 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=1 if args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=args.dataset_num_proc,
     )
     if args.push_to_hub:
         revisions = ["main"] if args.update_main_revision else []
6 changes: 5 additions & 1 deletion examples/datasets/sentiment_descriptiveness.py
@@ -41,6 +41,9 @@ class ScriptArguments:
     )
     push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the dataset to the Hugging Face Hub"})
     task: str = field(default="sentiment", metadata={"help": "The task of the dataset"})
+    dataset_num_proc: Optional[int] = field(
+        default=None, metadata={"help": "The number of workers to use to tokenize the data"}
+    )


 task_to_filename = {
@@ -106,7 +109,7 @@ def filter(row):
         return True

     print("=== Before filtering ===", ds)
-    ds = ds.filter(filter, load_from_cache_file=False)
+    ds = ds.filter(filter, load_from_cache_file=False, num_proc=args.dataset_num_proc)
     print("=== After filtering ===", ds)

     # here we simply take the preferred sample as the chosen one and the first non-preferred sample as the rejected one
@@ -147,6 +150,7 @@ def process(row):
         process,
         batched=True,
         load_from_cache_file=False,
+        num_proc=args.dataset_num_proc,
     )
     for key in ds:  # reorder columns
         ds[key] = ds[key].select_columns(["prompt", "chosen", "rejected"])
16 changes: 11 additions & 5 deletions examples/datasets/tldr_preference.py
@@ -1,4 +1,3 @@
-import multiprocessing
 import sys
 from dataclasses import dataclass, field
 from typing import Optional
@@ -35,6 +34,9 @@ class ScriptArguments:
         default=True, metadata={"help": "Update the main revision of the repository"}
     )
     push_to_hub: Optional[bool] = field(default=False, metadata={"help": "Push the dataset to the Hugging Face Hub"})
+    dataset_num_proc: Optional[int] = field(
+        default=None, metadata={"help": "The number of workers to use to tokenize the data"}
+    )


 if __name__ == "__main__":
@@ -53,8 +55,12 @@ class ScriptArguments:
             ds[key] = ds[key].select(range(50))
     cnndm_batches = ["batch0_cnndm", "cnndm0", "cnndm2"]
     if not args.debug:
-        ds["validation_cnndm"] = ds["validation"].filter(lambda x: x["batch"] in cnndm_batches)
-        ds["validation"] = ds["validation"].filter(lambda x: x["batch"] not in cnndm_batches)
+        ds["validation_cnndm"] = ds["validation"].filter(
+            lambda x: x["batch"] in cnndm_batches, num_proc=args.dataset_num_proc
+        )
+        ds["validation"] = ds["validation"].filter(
+            lambda x: x["batch"] not in cnndm_batches, num_proc=args.dataset_num_proc
+        )

     tldr_format_str = "SUBREDDIT: r/{subreddit}\n\nTITLE: {title}\n\nPOST: {post}\n\nTL;DR:"
     cnndm_format_str = "Article:\n{article}\n\nTL;DR:"
@@ -72,8 +78,8 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=1 if args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=args.dataset_num_proc,
     )
     for key in ds:  # reorder columns
         ds[key] = ds[key].select_columns(
@@ -141,8 +147,8 @@ def sft_process(row):

     sft_ds = sft_ds.map(
         sft_process,
-        num_proc=1 if args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=args.dataset_num_proc,
     )
     for key in sft_ds:  # reorder columns
         sft_ds[key] = sft_ds[key].select_columns(["prompt", "messages", "id", "subreddit", "title", "post", "summary"])
6 changes: 4 additions & 2 deletions examples/datasets/tokenize_ds.py
@@ -1,4 +1,3 @@
-import multiprocessing
 from dataclasses import dataclass, field
 from typing import Optional

@@ -19,6 +18,9 @@ class ScriptArguments:
         default="trl-internal-testing/hh-rlhf-helpful-base-trl-style", metadata={"help": "The dataset to load"}
     )
     model: str = field(default="gpt2", metadata={"help": "The model to use for tokenization"})
+    dataset_num_proc: Optional[int] = field(
+        default=None, metadata={"help": "The number of workers to use to tokenize the data"}
+    )


 if __name__ == "__main__":
@@ -38,7 +40,7 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=1 if args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=args.dataset_num_proc,
     )
     print(ds["train"][0]["chosen"])
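The `default=None` used throughout relies on `datasets` semantics: `num_proc=None` keeps `map` and `filter` single-process, so the old debug path (`num_proc=1`) is effectively the new default and parallelism is opt-in. A quick check of that behavior (toy data, not from the commit):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["alpha", "beta", "gamma", "delta"]})

ds_single = ds.map(lambda row: {"text": row["text"].upper()}, num_proc=None)  # one process
ds_multi = ds.map(lambda row: {"text": row["text"].upper()}, num_proc=2)  # two workers
assert ds_single["text"] == ds_multi["text"]
```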
@@ -198,7 +198,8 @@ def preprocess_function(examples):
     remove_columns=original_columns,
 )
 train_dataset = train_dataset.filter(
-    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
+    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length,
+    num_proc=num_proc,
 )

 eval_dataset = eval_dataset.map(
@@ -208,7 +209,8 @@ def preprocess_function(examples):
     remove_columns=original_columns,
 )
 eval_dataset = eval_dataset.filter(
-    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length
+    lambda x: len(x["input_ids_j"]) <= script_args.max_length and len(x["input_ids_k"]) <= script_args.max_length,
+    num_proc=num_proc,
 )

@@ -154,7 +154,7 @@ def preprocess_function(examples):
         num_proc=num_proc,
         remove_columns=original_columns,
     )
-    ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False)
+    ds = ds.filter(lambda x: len(x["input_ids"]) < 512, batched=False, num_proc=num_proc)

     ds.set_format(type="torch")
     return ds
@@ -167,14 +167,16 @@ def return_prompt_and_responses(samples) -> Dict[str, str]:
     train_dataset = get_stack_exchange_paired(data_dir="data/rl", sanity_check=script_args.sanity_check)
     train_dataset = train_dataset.filter(
         lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
-        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
+        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length,
+        num_proc=script_args.num_proc,
     )

     # 3. Load evaluation dataset
     eval_dataset = get_stack_exchange_paired(data_dir="data/evaluation", sanity_check=True)
     eval_dataset = eval_dataset.filter(
         lambda x: len(x["prompt"]) + len(x["chosen"]) <= script_args.max_length
-        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length
+        and len(x["prompt"]) + len(x["rejected"]) <= script_args.max_length,
+        num_proc=script_args.num_proc,
     )

     # 4. initialize training arguments:
22 changes: 12 additions & 10 deletions examples/scripts/bco.py
@@ -55,7 +55,7 @@
 import logging
 from dataclasses import dataclass
 from functools import partial
-from typing import Literal
+from typing import Literal, Optional

 import torch
 import torch.nn.functional as F
@@ -76,7 +76,7 @@ class ScriptArguments:
     llm_name: Literal["gpt-3.5-turbo", "llama-2-7b-chat", "llama-2-70b-chat"] = "gpt-3.5-turbo"


-def build_helpfulness_dataset(llm_name: str) -> Dataset:
+def build_helpfulness_dataset(llm_name: str, num_proc: Optional[int] = None) -> Dataset:
     """
     Filter `llm_name` completions and binarize given their helpfulness score.
     If helpfulness score is 5, it is desirable. Otherwise, it is undesirable.
@@ -100,34 +100,36 @@ def get_model_response(example, llm_name: str):

     dataset = load_dataset("openbmb/UltraFeedback")["train"]

-    ds = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=8)
-    ds = ds.filter(lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=8)
+    ds = dataset.filter(lambda example: llm_name in example["models"], batched=False, num_proc=num_proc)
+    ds = ds.filter(
+        lambda example: len(example["models"]) == len(example["completions"]), batched=False, num_proc=num_proc
+    )

     METRIC = "helpfulness"

     ds = ds.map(
         get_model_rating,
         batched=False,
-        num_proc=8,
         fn_kwargs={"metric": METRIC, "llm_name": llm_name},
+        num_proc=num_proc,
     )

     ds = ds.map(
         get_model_response,
         batched=False,
-        num_proc=8,
         fn_kwargs={"llm_name": llm_name},
+        num_proc=num_proc,
     )

     ds = ds.select_columns(["source", "instruction", "response", "helpfulness"])

     ds = ds.rename_columns({"instruction": "prompt", "response": "completion"})
-    ds = ds.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=8)
+    ds = ds.map(lambda example: {"label": example["helpfulness"] >= 5}, batched=False, num_proc=num_proc)

     ds = ds.map(
         lambda example: {"prompt": [{"role": "user", "content": example["prompt"]}]},
         batched=False,
-        num_proc=8,
+        num_proc=num_proc,
     )
     dataset = ds.train_test_split(test_size=0.05, seed=42)

@@ -182,7 +184,7 @@ def mean_pooling(model_output, attention_mask):
         model, tokenizer = setup_chat_format(model, tokenizer)

     # Load the dataset
-    dataset = build_helpfulness_dataset(script_args.llm_name)
+    dataset = build_helpfulness_dataset(script_args.llm_name, num_proc=bco_args.dataset_num_proc)

     # Apply chat template
     def format_dataset(example):
@@ -192,7 +194,7 @@ def format_dataset(example):
         return example

     with PartialState().local_main_process_first():
-        formatted_dataset = dataset.map(format_dataset, batched=False, num_proc=8)
+        formatted_dataset = dataset.map(format_dataset, batched=False, num_proc=bco_args.dataset_num_proc)

     accelerator = Accelerator()
     embedding_model = AutoModel.from_pretrained(
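The last bco.py hunk keeps its `map` inside `PartialState().local_main_process_first()`. Under `accelerate`, the local main process runs the body first, so with a cache-backed dataset the transform is computed (and cached) once per node and the remaining ranks replay it from the cache, which is what keeps multi-worker `num_proc` preprocessing safe in multi-GPU runs. A reduced sketch of the guard (toy in-memory data; `format_row` is illustrative):

```python
from accelerate import PartialState
from datasets import Dataset

dataset = Dataset.from_dict({"prompt": [" hi ", " hello ", " hey ", " yo "]})

def format_row(example):
    example["prompt"] = example["prompt"].strip()
    return example

# The local main process maps first; other ranks wait, then run the same map
# (hitting the datasets cache when one exists).
with PartialState().local_main_process_first():
    dataset = dataset.map(format_row, batched=False, num_proc=2)
```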
3 changes: 1 addition & 2 deletions examples/scripts/cpo.py
@@ -52,7 +52,6 @@
     --lora_alpha=16
 """

-import multiprocessing
 from dataclasses import dataclass, field

 from datasets import load_dataset
@@ -102,8 +101,8 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=1 if cpo_args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=cpo_args.dataset_num_proc,
     )
     train_dataset = ds["train"]
     eval_dataset = ds["test"]
2 changes: 1 addition & 1 deletion examples/scripts/dpo.py
@@ -161,8 +161,8 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=training_args.dataset_num_proc,
     )
     train_dataset = ds[args.dataset_train_split]
     eval_dataset = ds[args.dataset_test_split]
2 changes: 1 addition & 1 deletion examples/scripts/kto.py
@@ -102,7 +102,7 @@ def format_dataset(example):
         example["completion"] = tokenizer.apply_chat_template(example["completion"], tokenize=False)
         return example

-    formatted_dataset = dataset.map(format_dataset)
+    formatted_dataset = dataset.map(format_dataset, num_proc=kto_args.dataset_num_proc)

     # Initialize the KTO trainer
     kto_trainer = KTOTrainer(
8 changes: 4 additions & 4 deletions examples/scripts/online_dpo.py
@@ -59,7 +59,7 @@ class ScriptArguments:
     max_length: int = 512


-def prepare_dataset(dataset, tokenizer, dataset_text_field):
+def prepare_dataset(dataset, tokenizer, dataset_text_field, num_proc):
     """pre-tokenize the dataset before training; only collate during training"""

     def tokenize(element):
@@ -73,7 +73,7 @@ def tokenize(element):
         tokenize,
         remove_columns=dataset.column_names,
         batched=True,
-        num_proc=4,  # multiprocessing.cpu_count(),
+        num_proc=num_proc,
         load_from_cache_file=False,
     )

@@ -105,11 +105,11 @@ def tokenize(element):
         for key in raw_datasets:
             raw_datasets[key] = raw_datasets[key].select(range(1024))
     train_dataset = raw_datasets[args.dataset_train_split]
-    train_dataset = prepare_dataset(train_dataset, tokenizer, args.dataset_text_field)
+    train_dataset = prepare_dataset(train_dataset, tokenizer, args.dataset_text_field, config.dataset_num_proc)

     if args.dataset_test_split is not None:
         eval_dataset = raw_datasets[args.dataset_test_split]
-        eval_dataset = prepare_dataset(eval_dataset, tokenizer, args.dataset_text_field)
+        eval_dataset = prepare_dataset(eval_dataset, tokenizer, args.dataset_text_field, config.dataset_num_proc)
     else:
         eval_dataset = None
     ################
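online_dpo.py shows the commit's other idiom: rather than reading the config inside the helper, `prepare_dataset` now takes the worker count as a parameter and the call sites pass `config.dataset_num_proc`. A reduced sketch of that shape (the names and toy `tokenize` body are illustrative):

```python
from typing import Optional

from datasets import Dataset


def prepare_dataset(dataset: Dataset, num_proc: Optional[int] = None) -> Dataset:
    """Pre-process up front; only collate during training."""

    def tokenize(element):
        return {"n_chars": [len(t) for t in element["text"]]}

    return dataset.map(tokenize, batched=True, num_proc=num_proc)


ds = prepare_dataset(Dataset.from_dict({"text": ["a", "bb", "ccc", "dddd"]}), num_proc=2)
```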
3 changes: 1 addition & 2 deletions examples/scripts/orpo.py
@@ -52,7 +52,6 @@
     --lora_alpha=16
 """

-import multiprocessing
 from dataclasses import dataclass, field

 from datasets import load_dataset
@@ -103,8 +102,8 @@ def process(row):

     ds = ds.map(
         process,
-        num_proc=1 if orpo_args.debug else multiprocessing.cpu_count(),
         load_from_cache_file=False,
+        num_proc=orpo_args.dataset_num_proc,
     )
     train_dataset = ds["train"]
     eval_dataset = ds["test"]
4 changes: 2 additions & 2 deletions examples/scripts/ppo.py
@@ -75,7 +75,7 @@ def build_dataset(config, query_dataset, input_min_text_length=2, input_max_text
     # load imdb with datasets
     ds = load_dataset(query_dataset, split="train")
     ds = ds.rename_columns({"text": "review"})
-    ds = ds.filter(lambda x: len(x["review"]) > 200, batched=False)
+    ds = ds.filter(lambda x: len(x["review"]) > 200, num_proc=args.dataset_num_proc)

     input_size = LengthSampler(input_min_text_length, input_max_text_length)

@@ -84,7 +84,7 @@ def tokenize(sample):
         sample["query"] = tokenizer.decode(sample["input_ids"])
         return sample

-    ds = ds.map(tokenize, batched=False)
+    ds = ds.map(tokenize, num_proc=args.dataset_num_proc)
     ds.set_format(type="torch")
     return ds

4 changes: 2 additions & 2 deletions examples/scripts/ppo/ppo.py
@@ -91,10 +91,10 @@ def tokenize(element):

         return dataset.map(
             tokenize,
-            remove_columns=dataset.column_names,
             batched=True,
-            num_proc=4,  # multiprocessing.cpu_count(),
+            remove_columns=dataset.column_names,
             load_from_cache_file=False,
+            num_proc=config.dataset_num_proc,
         )

     ################
7 changes: 3 additions & 4 deletions examples/scripts/ppo/ppo_tldr.py
@@ -1,4 +1,3 @@
-import multiprocessing
 import shutil

 from datasets import load_dataset
@@ -98,15 +97,15 @@ def tokenize(element):
         return dataset.map(
             tokenize,
             remove_columns=dataset.column_names,
-            num_proc=1 if config.sanity_check else multiprocessing.cpu_count(),
             load_from_cache_file=not config.sanity_check,
+            num_proc=config.dataset_num_proc,
         )

     train_dataset = prepare_dataset(train_dataset, tokenizer)
     eval_dataset = prepare_dataset(eval_dataset, tokenizer)
     # filtering
-    train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512)
-    eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512)
+    train_dataset = train_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc)
+    eval_dataset = eval_dataset.filter(lambda x: x["lengths"] <= 512, num_proc=config.dataset_num_proc)
     assert train_dataset[0]["input_ids"][-1] != tokenizer.eos_token_id, "The last token should not be an EOS token"
     ################
     # Training