From 1b214d1f7d87f21ea2f8be8299ddc7e2bf592413 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Thu, 15 Aug 2024 20:47:53 -0700 Subject: [PATCH 01/41] stash initial changes for now --- open_instruct/merge_models.py | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 open_instruct/merge_models.py diff --git a/open_instruct/merge_models.py b/open_instruct/merge_models.py new file mode 100644 index 000000000..13ec4a918 --- /dev/null +++ b/open_instruct/merge_models.py @@ -0,0 +1,82 @@ +import copy +import subprocess +import yaml +from datetime import datetime +import argparse +import re +import shlex + +def load_yaml(file_path): + with open(file_path, 'r') as f: + return yaml.load(f, Loader=yaml.FullLoader) + +def main(): + parser = argparse.ArgumentParser(description="Run experiment with Beaker config") + # TODO: new, need to complete + parser.add_argument("--merge_method", type=str, default="linear", help="Merge method to use") + + + # TODO: old, need to prune + parser.add_argument("--default_beaker_config", default="configs/beaker_configs/default_finetune.yaml", + help="Path to the default Beaker config file") + parser.add_argument("--config", default=None, + help="Path to an additional config file to override default settings") + # parser.add_argument("--wandb_api_key", required=False, help="Weights & Biases API key") + parser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use") + parser.add_argument("--priority", type=str, default="high", help="Priority of the job") + parser.add_argument("--preemptible", type=bool, default=True, help="Whether to use preemptible instances") + parser.add_argument("--workspace", type=str, default="ai2/tulu-3-dev", help="Beaker workspace to use.") + + # Structure: + # 1. Parse model inputs + # 2. Build merge config + # 3. Launch merge + # wait for it to complete successfully + # 4. If --run_evals, require model name (check at beginning) + # 5. Launch evals + +""" +Mergekit Options: + -v, --verbose Verbose logging + --allow-crimes / --no-allow-crimes + Allow mixing architectures [default: no- + allow-crimes] + --transformers-cache TEXT Override storage path for downloaded models + --lora-merge-cache TEXT Path to store merged LORA models + --cuda / --no-cuda Perform matrix arithmetic on GPU [default: + no-cuda] + --low-cpu-memory / --no-low-cpu-memory + Store results and intermediate values on + GPU. Useful if VRAM > RAM [default: no-low- + cpu-memory] + --out-shard-size SIZE Number of parameters per output shard + [default: 5B] + --copy-tokenizer / --no-copy-tokenizer + Copy a tokenizer to the output [default: + copy-tokenizer] + --clone-tensors / --no-clone-tensors + Clone tensors before saving, to allow + multiple occurrences of the same layer + [default: no-clone-tensors] + --trust-remote-code / --no-trust-remote-code + Trust remote code from huggingface repos + (danger) [default: no-trust-remote-code] + --random-seed INTEGER Seed for reproducible use of randomized + merge methods + --lazy-unpickle / --no-lazy-unpickle + Experimental lazy unpickler for lower memory + usage [default: no-lazy-unpickle] + --write-model-card / --no-write-model-card + Output README.md containing details of the + merge [default: write-model-card] + --safe-serialization / --no-safe-serialization + Save output in safetensors. Do this, don't + poison the world with more pickled models. 
+ [default: safe-serialization] + --quiet / --no-quiet Suppress progress bars and other non- + essential output [default: no-quiet] + --read-to-gpu / --no-read-to-gpu + Read model weights directly to GPU + [default: no-read-to-gpu] + --help Show this message and exit. + """ \ No newline at end of file From 6178897ace45289327f85f688921321cad0a1dd7 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Thu, 15 Aug 2024 20:48:02 -0700 Subject: [PATCH 02/41] stash --- configs/beaker_configs/default_merge.yaml | 54 +++++++++++++++++++ .../merge_configs/default_linear_merge.yaml | 0 oe-eval-internal | 1 + open_instruct/submit_merge_job.py | 0 4 files changed, 55 insertions(+) create mode 100644 configs/beaker_configs/default_merge.yaml create mode 100644 configs/merge_configs/default_linear_merge.yaml create mode 160000 oe-eval-internal create mode 100644 open_instruct/submit_merge_job.py diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml new file mode 100644 index 000000000..506602ddc --- /dev/null +++ b/configs/beaker_configs/default_merge.yaml @@ -0,0 +1,54 @@ +version: v2 +description: open-instruct-merge-models +budget: ai2/oe-adapt +tasks: + - name: open-instruct-merge-models + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['PYTHONPATH="/stage:$PYTHONPATH" python + open_instruct/merge_models.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --tokenizer_name /hf_llama_models + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. + --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: HF_TOKEN + result: + path: /output + resources: + gpuCount: 1 + context: + cluster: ai2/allennlp-cirrascale + priority: low + preemptible: false \ No newline at end of file diff --git a/configs/merge_configs/default_linear_merge.yaml b/configs/merge_configs/default_linear_merge.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/oe-eval-internal b/oe-eval-internal new file mode 160000 index 000000000..a997d6e0e --- /dev/null +++ b/oe-eval-internal @@ -0,0 +1 @@ +Subproject commit a997d6e0e57920807a5dabeaefe269b318be2316 diff --git a/open_instruct/submit_merge_job.py b/open_instruct/submit_merge_job.py new file mode 100644 index 000000000..e69de29bb From cc5670f88a21bd0f78fd1bf1bc5353672e97983c Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 16:25:43 -0700 Subject: [PATCH 03/41] . 
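Swap the placeholder finetune command for a templated mergekit run and
add a submission script. The Beaker task argument now carries a
`rawconfig` placeholder; scripts/submit_merge_job.py loads a user merge
config, rewrites it into mergekit's schema using the per-method base
configs, and substitutes the result into the command so the job writes
it to /output/config.yaml and runs `mergekit-yaml` on it.

A minimal sketch of that substitution step (the path and the loaded
config stand in for the values the script actually builds):

    import yaml

    # Load a user-facing merge config; for the sake of the sketch,
    # pretend it is already in mergekit's schema.
    with open("configs/merge_configs/example_linear_merge_config.yaml") as f:
        merge_config = yaml.safe_load(f)

    base_command = ("echo rawconfig > /output/config.yaml; "
                    "mergekit-yaml /output/config.yaml /output --cuda")
    # Same trick the script uses: str(dict) dropped in for the placeholder.
    rendered = base_command.replace("rawconfig", f'"{merge_config}"')
    print(rendered)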
--- configs/beaker_configs/default_merge.yaml | 20 +---- .../base_configs/default_linear_merge.yaml | 10 +++ .../default_task_arithmetic_merge.yaml | 14 +++ .../merge_configs/default_linear_merge.yaml | 0 .../example_linear_merge_config.yaml | 11 +++ configs/merge_configs/my-merge-config.yaml | 11 +++ oe-eval-internal | 2 +- open_instruct/merge_models.py | 82 ------------------ open_instruct/submit_merge_job.py | 0 scripts/submit_merge_job.py | 86 +++++++++++++++++++ 10 files changed, 134 insertions(+), 102 deletions(-) create mode 100644 configs/merge_configs/base_configs/default_linear_merge.yaml create mode 100644 configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml delete mode 100644 configs/merge_configs/default_linear_merge.yaml create mode 100644 configs/merge_configs/example_linear_merge_config.yaml create mode 100644 configs/merge_configs/my-merge-config.yaml delete mode 100644 open_instruct/merge_models.py delete mode 100644 open_instruct/submit_merge_job.py create mode 100644 scripts/submit_merge_job.py diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml index 506602ddc..e42d15379 100644 --- a/configs/beaker_configs/default_merge.yaml +++ b/configs/beaker_configs/default_merge.yaml @@ -8,25 +8,7 @@ tasks: command: [ '/bin/sh', '-c' ] - arguments: ['PYTHONPATH="/stage:$PYTHONPATH" python - open_instruct/merge_models.py - --model_name_or_path /hf_llama_models - --use_flash_attn - --tokenizer_name /hf_llama_models - --max_seq_length 2048 - --preprocessing_num_workers 16 - --per_device_train_batch_size 2 - --gradient_accumulation_steps 16 - --learning_rate 2e-5 - --lr_scheduler_type linear - --warmup_ratio 0.03 - --weight_decay 0. - --num_train_epochs 2 - --output_dir /output/ - --with_tracking - --report_to tensorboard - --logging_steps 1 - '] + arguments: ['echo rawconfig > /output/config.yaml; mergekit-yaml /output/config.yaml /output --cuda'] envVars: - name: CUDA_DEVICE_ORDER value: PCI_BUS_ID diff --git a/configs/merge_configs/base_configs/default_linear_merge.yaml b/configs/merge_configs/base_configs/default_linear_merge.yaml new file mode 100644 index 000000000..3557bc440 --- /dev/null +++ b/configs/merge_configs/base_configs/default_linear_merge.yaml @@ -0,0 +1,10 @@ +models: + - model: /model-one + parameters: + weight: 1.0 + - model: /model-two + parameters: + weight: 1.0 +normalize: true +merge_method: linear +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml new file mode 100644 index 000000000..e2f55159a --- /dev/null +++ b/configs/merge_configs/base_configs/default_task_arithmetic_merge.yaml @@ -0,0 +1,14 @@ +models: + # no parameters necessary for base model + - model: /base-model + - model: /model-one + parameters: + weight: 0.70 + normalize: False + - model: /model-2 + parameters: + weight: 0.30 + normalize: False +merge_method: task_arithmetic +base_model: /base-model +dtype: bfloat16 \ No newline at end of file diff --git a/configs/merge_configs/default_linear_merge.yaml b/configs/merge_configs/default_linear_merge.yaml deleted file mode 100644 index e69de29bb..000000000 diff --git a/configs/merge_configs/example_linear_merge_config.yaml b/configs/merge_configs/example_linear_merge_config.yaml new file mode 100644 index 000000000..8f902c28f --- /dev/null +++ b/configs/merge_configs/example_linear_merge_config.yaml @@ -0,0 +1,11 @@ +merge_method: linear 
+normalize: true +models: + - name: name + location: beaker + path: jacobm/beaker-dataset + weight: 0.5 + - name: name2 + location: huggingface + path: allenai/llama-3-tulu-2 + weight: 0.5 \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml new file mode 100644 index 000000000..8f8b73946 --- /dev/null +++ b/configs/merge_configs/my-merge-config.yaml @@ -0,0 +1,11 @@ +merge_method: linear +normalize: true +models: + - name: llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_dpo_norm_beta5_uf_1ep + location: beaker + path: hamishivi/llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_dpo_norm_beta5_uf_1ep + weight: 0.5 + - name: llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_simpo_2048len_ultrafeedback + location: beaker + path: hamishivi/01J542ZEE4JY6JNQ8K1TMK2S8H + weight: 0.5 \ No newline at end of file diff --git a/oe-eval-internal b/oe-eval-internal index a997d6e0e..876d8b0c1 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit a997d6e0e57920807a5dabeaefe269b318be2316 +Subproject commit 876d8b0c1906114e49f768bd8aa14bfd575099c6 diff --git a/open_instruct/merge_models.py b/open_instruct/merge_models.py deleted file mode 100644 index 13ec4a918..000000000 --- a/open_instruct/merge_models.py +++ /dev/null @@ -1,82 +0,0 @@ -import copy -import subprocess -import yaml -from datetime import datetime -import argparse -import re -import shlex - -def load_yaml(file_path): - with open(file_path, 'r') as f: - return yaml.load(f, Loader=yaml.FullLoader) - -def main(): - parser = argparse.ArgumentParser(description="Run experiment with Beaker config") - # TODO: new, need to complete - parser.add_argument("--merge_method", type=str, default="linear", help="Merge method to use") - - - # TODO: old, need to prune - parser.add_argument("--default_beaker_config", default="configs/beaker_configs/default_finetune.yaml", - help="Path to the default Beaker config file") - parser.add_argument("--config", default=None, - help="Path to an additional config file to override default settings") - # parser.add_argument("--wandb_api_key", required=False, help="Weights & Biases API key") - parser.add_argument("--cluster", type=str, default="ai2/allennlp-cirrascale", help="Beaker cluster to use") - parser.add_argument("--priority", type=str, default="high", help="Priority of the job") - parser.add_argument("--preemptible", type=bool, default=True, help="Whether to use preemptible instances") - parser.add_argument("--workspace", type=str, default="ai2/tulu-3-dev", help="Beaker workspace to use.") - - # Structure: - # 1. Parse model inputs - # 2. Build merge config - # 3. Launch merge - # wait for it to complete successfully - # 4. If --run_evals, require model name (check at beginning) - # 5. Launch evals - -""" -Mergekit Options: - -v, --verbose Verbose logging - --allow-crimes / --no-allow-crimes - Allow mixing architectures [default: no- - allow-crimes] - --transformers-cache TEXT Override storage path for downloaded models - --lora-merge-cache TEXT Path to store merged LORA models - --cuda / --no-cuda Perform matrix arithmetic on GPU [default: - no-cuda] - --low-cpu-memory / --no-low-cpu-memory - Store results and intermediate values on - GPU. 
Useful if VRAM > RAM [default: no-low- - cpu-memory] - --out-shard-size SIZE Number of parameters per output shard - [default: 5B] - --copy-tokenizer / --no-copy-tokenizer - Copy a tokenizer to the output [default: - copy-tokenizer] - --clone-tensors / --no-clone-tensors - Clone tensors before saving, to allow - multiple occurrences of the same layer - [default: no-clone-tensors] - --trust-remote-code / --no-trust-remote-code - Trust remote code from huggingface repos - (danger) [default: no-trust-remote-code] - --random-seed INTEGER Seed for reproducible use of randomized - merge methods - --lazy-unpickle / --no-lazy-unpickle - Experimental lazy unpickler for lower memory - usage [default: no-lazy-unpickle] - --write-model-card / --no-write-model-card - Output README.md containing details of the - merge [default: write-model-card] - --safe-serialization / --no-safe-serialization - Save output in safetensors. Do this, don't - poison the world with more pickled models. - [default: safe-serialization] - --quiet / --no-quiet Suppress progress bars and other non- - essential output [default: no-quiet] - --read-to-gpu / --no-read-to-gpu - Read model weights directly to GPU - [default: no-read-to-gpu] - --help Show this message and exit. - """ \ No newline at end of file diff --git a/open_instruct/submit_merge_job.py b/open_instruct/submit_merge_job.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py new file mode 100644 index 000000000..6aa92cb54 --- /dev/null +++ b/scripts/submit_merge_job.py @@ -0,0 +1,86 @@ +import copy +import subprocess +import yaml +import re +import itertools +from datetime import date +import argparse +import os + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--workspace", type=str, default="oe-adapt-general") + parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") + parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") + parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") + parser.add_argument("--cluster", nargs='+', default=["ai2/allennlp-cirrascale", "ai2/general-cirrascale", "ai2/general-cirrascale-a100-80g-ib", "ai2/mosaic-cirrascale-a100", "ai2/s2-cirrascale-l40"]) + parser.add_argument("--priority", type=str, default="low") + parser.add_argument("--preemptible", action="store_true", default=False, help="for using preemtipble jobs (required on some instances)") + args = parser.parse_args() + + today = date.today().strftime("%m%d%Y") + + with open(args.merge_config, 'r') as f: + default_yaml = f.read() + mergeConfig = yaml.load(default_yaml, Loader=yaml.FullLoader) + + assert mergeConfig["merge_method"] in ["linear", "task_arithmetic"], f"merging method {mergeConfig['merge_method']} not supported" + + with open(f"configs/merge_configs/base_configs/default_{mergeConfig['merge_method']}_merge.yaml", 'r') as f: + merge_yaml = f.read() + baseConfig = yaml.load(merge_yaml, Loader=yaml.FullLoader) + + baseConfig["normalize"] = mergeConfig["normalize"] + baseConfig["models"] = [] + beakerDatasets = [] + for elem in mergeConfig["models"]: + # - model: /model-one + # parameters: + # weight: 1.0 + + # - name: name + # location: beaker + # path: jacobm/beaker-dataset + # weight: 0.5 + if elem["location"] == "beaker": + model_data = { + "model": f"/{elem['name']}", + "parameters": {"weight": 
float(elem["weight"])} + } + # beakerConfig['datasets'][1]['source']['beaker'] = model_info[1] + # - mountPath: /hf_llama_models + # source: + # beaker: Yizhongw03/hf_llama_model_7B + beakerDatasets.append({ + "mountPath": f"/{elem['name']}", + "source": {"beaker": elem["path"]} + }) + # mount datasets + elif elem["location"] in ["huggingface", "nfs"]: # todo: support weka + pass + baseConfig["models"].append(model_data) + + with open(args.beaker_config, 'r') as f: + beaker_yaml = f.read() + beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader) + + if len(beakerDatasets) > 0: + beakerConfig["tasks"][0]["datasets"] = beakerDatasets + base_command = beakerConfig["tasks"][0]["arguments"][0] + beakerConfig["tasks"][0]["arguments"][0] = base_command.replace("rawconfig", f'"{str(baseConfig)}"') + + experiment_name = f"open_instruct_merge_models_{today}" + beakerConfig["description"] = experiment_name + # if configs/beaker_configs/auto_created doesn't exist, create it with os + if not os.path.exists("configs/beaker_configs/auto_created"): + os.makedirs("configs/beaker_configs/auto_created") + fn = "configs/beaker_configs/auto_created/{}.yaml".format(experiment_name) + os.makedirs(os.path.dirname(fn), exist_ok=True) + with open(fn, "w") as file: + yaml.dump(beakerConfig, file, default_flow_style=True) + + cmd = "beaker experiment create {} --workspace ai2/{}".format(fn, args.workspace) + subprocess.Popen(cmd, shell=True) + +if __name__ == "__main__": + main() \ No newline at end of file From d3ccf4a8c9e2381a6aacaee0cd0e91187a4fa675 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 16:26:13 -0700 Subject: [PATCH 04/41] . --- scripts/submit_merge_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 6aa92cb54..0ca81a444 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -9,7 +9,7 @@ def main(): parser = argparse.ArgumentParser() - parser.add_argument("--workspace", type=str, default="oe-adapt-general") + parser.add_argument("--workspace", type=str, default="tulu-3-dev") parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") From f9e0319ef1e6327087dffc0f5f6440831a155346 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 16:26:42 -0700 Subject: [PATCH 05/41] . --- configs/merge_configs/my-merge-config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 8f8b73946..9f556f12d 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -7,5 +7,5 @@ models: weight: 0.5 - name: llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_simpo_2048len_ultrafeedback location: beaker - path: hamishivi/01J542ZEE4JY6JNQ8K1TMK2S8H + path: 01J542ZEE4JY6JNQ8K1TMK2S8H weight: 0.5 \ No newline at end of file From 4c161e1c1eb1919e5dd9bb90cfa67d954eca763d Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 16:27:56 -0700 Subject: [PATCH 06/41] . 
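Install mergekit into the training image so the merge job's
`mergekit-yaml` entry point exists at runtime. A hypothetical smoke
test for the built image (not part of this change):

    # Verify mergekit's CLI entry point landed on PATH.
    import shutil
    assert shutil.which("mergekit-yaml") is not None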
--- Dockerfile | 1 + 1 file changed, 1 insertion(+) diff --git a/Dockerfile b/Dockerfile index 651218625..69a5478e1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -52,6 +52,7 @@ RUN pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url h RUN pip install packaging RUN pip install flash-attn==2.5.8 --no-build-isolation RUN pip install -r requirements.txt +RUN pip install git+https://github.com/arcee-ai/mergekit.git # NLTK download RUN python -m nltk.downloader punkt From f282a3c48412c687e1db2c14ad00e6a7d95f4692 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 22:05:59 -0700 Subject: [PATCH 07/41] . --- scripts/submit_merge_job.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 0ca81a444..3e6f7afac 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -63,6 +63,8 @@ def main(): with open(args.beaker_config, 'r') as f: beaker_yaml = f.read() beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader) + + beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image if len(beakerDatasets) > 0: beakerConfig["tasks"][0]["datasets"] = beakerDatasets From 488b71b84f687a309b2f471f1f980b6a7f843c92 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 22:21:12 -0700 Subject: [PATCH 08/41] . --- configs/merge_configs/my-merge-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 9f556f12d..e66b670c4 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_dpo_norm_beta5_uf_1ep + - name: llama_3_8b-rs_30566_v3.2 location: beaker - path: hamishivi/llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_dpo_norm_beta5_uf_1ep + path: jacobm/llama_3_8b-rs_30566_v3.2 weight: 0.5 - - name: llama-3-8b-tulu_v2_numina-WEIGHTED_MERGE-_simpo_2048len_ultrafeedback + - name: llama_3_8b-rs_30566_numina location: beaker - path: 01J542ZEE4JY6JNQ8K1TMK2S8H + path: jacobm/llama_3_8b-rs_30566_numina weight: 0.5 \ No newline at end of file From 623e68d27fb0044f494e2ec0240ce3300647ccda Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 17 Aug 2024 22:37:51 -0700 Subject: [PATCH 09/41] . --- configs/merge_configs/my-merge-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index e66b670c4..9aa5f4960 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -4,8 +4,8 @@ models: - name: llama_3_8b-rs_30566_v3.2 location: beaker path: jacobm/llama_3_8b-rs_30566_v3.2 - weight: 0.5 + weight: 0.6 - name: llama_3_8b-rs_30566_numina location: beaker path: jacobm/llama_3_8b-rs_30566_numina - weight: 0.5 \ No newline at end of file + weight: 0.4 \ No newline at end of file From 758b2a98a2d3bef7a232c603124f98d85678bc9d Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:39:32 -0700 Subject: [PATCH 10/41] . 
--- configs/merge_configs/my-merge-config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 9aa5f4960..0b78d62ff 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: llama_3_8b-rs_30566_v3.2 + - name: llama_3_8b-rs_2860_v3.2 location: beaker - path: jacobm/llama_3_8b-rs_30566_v3.2 - weight: 0.6 - - name: llama_3_8b-rs_30566_numina + path: jacobm/llama_3_8b-rs_2860_v3.2 + weight: 0.5 + - name: llama_3_8b-rs_2860_numina location: beaker - path: jacobm/llama_3_8b-rs_30566_numina - weight: 0.4 \ No newline at end of file + path: jacobm/llama_3_8b-rs_2860_numina + weight: 0.5 \ No newline at end of file From 8b680120322907fbe02aada5e78238f298945d21 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:39:43 -0700 Subject: [PATCH 11/41] fix --- configs/merge_configs/my-merge-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 0b78d62ff..11709482e 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: llama_3_8b-rs_2860_v3.2 + - name: L38B-rs_2860_v3.2 location: beaker - path: jacobm/llama_3_8b-rs_2860_v3.2 + path: jacobm/L38B-rs_2860_v3.2 weight: 0.5 - - name: llama_3_8b-rs_2860_numina + - name: L38B-rs_2860_numina location: beaker - path: jacobm/llama_3_8b-rs_2860_numina + path: jacobm/L38B-rs_2860_numina weight: 0.5 \ No newline at end of file From 09d683406adf9f4fd8bb2124cf5787da7e13f109 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:45:02 -0700 Subject: [PATCH 12/41] . --- configs/merge_configs/my-merge-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 11709482e..07c0c3078 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -4,8 +4,8 @@ models: - name: L38B-rs_2860_v3.2 location: beaker path: jacobm/L38B-rs_2860_v3.2 - weight: 0.5 + weight: 0.6 - name: L38B-rs_2860_numina location: beaker path: jacobm/L38B-rs_2860_numina - weight: 0.5 \ No newline at end of file + weight: 0.4 \ No newline at end of file From baaa375dd2389e204b84e003b94098affd617eea Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:53:02 -0700 Subject: [PATCH 13/41] . 
--- configs/merge_configs/my-merge-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 07c0c3078..60ec42fa4 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: L38B-rs_2860_v3.2 + - name: L38B-rs_2860_v3.2_more_rs location: beaker - path: jacobm/L38B-rs_2860_v3.2 - weight: 0.6 + path: jacobm/L38B-rs_2860_v3.2_more_rs + weight: 0.5 - name: L38B-rs_2860_numina location: beaker path: jacobm/L38B-rs_2860_numina - weight: 0.4 \ No newline at end of file + weight: 0.5 \ No newline at end of file From e0b9a84e6c4654cf98238f2b42a80c2874c622a6 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:53:32 -0700 Subject: [PATCH 14/41] . --- configs/merge_configs/my-merge-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 60ec42fa4..6644af806 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -4,8 +4,8 @@ models: - name: L38B-rs_2860_v3.2_more_rs location: beaker path: jacobm/L38B-rs_2860_v3.2_more_rs - weight: 0.5 + weight: 0.6 - name: L38B-rs_2860_numina location: beaker path: jacobm/L38B-rs_2860_numina - weight: 0.5 \ No newline at end of file + weight: 0.4 \ No newline at end of file From 88b16569946e17405ce26e3d0c2227cd48cb5388 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:53:54 -0700 Subject: [PATCH 15/41] . --- configs/merge_configs/my-merge-config.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 6644af806..84288cc3e 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: L38B-rs_2860_v3.2_more_rs + - name: llama_3_8b-rs_30566_v3.2_more_rs location: beaker - path: jacobm/L38B-rs_2860_v3.2_more_rs - weight: 0.6 - - name: L38B-rs_2860_numina + path: jacobm/llama_3_8b-rs_30566_v3.2_more_rs + weight: 0.5 + - name: llama_3_8b-rs_30566_numina location: beaker - path: jacobm/L38B-rs_2860_numina - weight: 0.4 \ No newline at end of file + path: jacobm/llama_3_8b-rs_30566_numina + weight: 0.5 \ No newline at end of file From 32739a4b265a837256913f2f725131a3445ae058 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 09:54:06 -0700 Subject: [PATCH 16/41] . --- configs/merge_configs/my-merge-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 84288cc3e..e14a1539e 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -4,8 +4,8 @@ models: - name: llama_3_8b-rs_30566_v3.2_more_rs location: beaker path: jacobm/llama_3_8b-rs_30566_v3.2_more_rs - weight: 0.5 + weight: 0.6 - name: llama_3_8b-rs_30566_numina location: beaker path: jacobm/llama_3_8b-rs_30566_numina - weight: 0.5 \ No newline at end of file + weight: 0.4 \ No newline at end of file From f516abcc63cea420a422b2cbbf0b28cf428ff43a Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 18 Aug 2024 18:28:35 -0700 Subject: [PATCH 17/41] . 
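Point my-merge-config at the dpo_norm_beta5_uf_1ep checkpoints and push
the cluster/priority/preemptible CLI flags into the task context. One
caveat: --cluster is parsed with nargs='+', so args.cluster is a list,
while `context.cluster` elsewhere in this repo's Beaker configs is a
single cluster name; that mismatch is likely why the next few commits
toggle these overrides, and scheduling across several clusters is what
the later switch to a `constraints` block in default_merge.yaml
addresses. A minimal illustration (values are the argparse defaults):

    # args.cluster == ["ai2/allennlp-cirrascale", "ai2/general-cirrascale", ...]
    # -- a list, where the context field holds one name:
    beakerConfig['tasks'][0]['context']['cluster'] = args.cluster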
---
 configs/merge_configs/my-merge-config.yaml | 8 ++++----
 scripts/submit_merge_job.py                | 3 +++
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml
index e14a1539e..48c2ce533 100644
--- a/configs/merge_configs/my-merge-config.yaml
+++ b/configs/merge_configs/my-merge-config.yaml
@@ -1,11 +1,11 @@
 merge_method: linear
 normalize: true
 models:
-  - name: llama_3_8b-rs_30566_v3.2_more_rs
+  - name: L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep
     location: beaker
-    path: jacobm/llama_3_8b-rs_30566_v3.2_more_rs
+    path: jacobm/L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep
     weight: 0.6
-  - name: llama_3_8b-rs_30566_numina
+  - name: L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep
     location: beaker
-    path: jacobm/llama_3_8b-rs_30566_numina
+    path: jacobm/L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep
     weight: 0.4
\ No newline at end of file
diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py
index 3e6f7afac..cd3b27869 100644
--- a/scripts/submit_merge_job.py
+++ b/scripts/submit_merge_job.py
@@ -65,6 +65,9 @@ def main():
     beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader)
 
     beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image
+    beakerConfig['tasks'][0]['context']['cluster'] = args.cluster
+    beakerConfig['tasks'][0]['context']['priority'] = args.priority
+    beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
 
     if len(beakerDatasets) > 0:
         beakerConfig["tasks"][0]["datasets"] = beakerDatasets

From f97c4d2567d62af7989e945f24e4e815638b84bc Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Sun, 18 Aug 2024 18:29:58 -0700
Subject: [PATCH 18/41] .

---
 scripts/submit_merge_job.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py
index cd3b27869..70079aa21 100644
--- a/scripts/submit_merge_job.py
+++ b/scripts/submit_merge_job.py
@@ -66,8 +66,8 @@ def main():
 
     beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image
     beakerConfig['tasks'][0]['context']['cluster'] = args.cluster
-    beakerConfig['tasks'][0]['context']['priority'] = args.priority
-    beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
+    # beakerConfig['tasks'][0]['context']['priority'] = args.priority
+    # beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
 
     if len(beakerDatasets) > 0:
         beakerConfig["tasks"][0]["datasets"] = beakerDatasets

From ef11e70ac3f3bcb7c6c791faef944b77a04f767d Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Sun, 18 Aug 2024 18:30:20 -0700
Subject: [PATCH 19/41] .
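Re-enable the context overrides and print the resolved Beaker config
before submission as a debugging aid. Roughly the kind of check the
added print enables (field shown is an example):

    # e.g. inspect the task context that was just set:
    print(beakerConfig['tasks'][0]['context'])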
---
 scripts/submit_merge_job.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py
index 70079aa21..77dd1273c 100644
--- a/scripts/submit_merge_job.py
+++ b/scripts/submit_merge_job.py
@@ -66,8 +66,10 @@ def main():
 
     beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image
     beakerConfig['tasks'][0]['context']['cluster'] = args.cluster
-    # beakerConfig['tasks'][0]['context']['priority'] = args.priority
-    # beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
+    beakerConfig['tasks'][0]['context']['priority'] = args.priority
+    beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
+
+    print(beakerConfig)
 
     if len(beakerDatasets) > 0:
         beakerConfig["tasks"][0]["datasets"] = beakerDatasets

From 8250980b15ae0bb9f71cfd043064bb1135ca42a7 Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Sun, 18 Aug 2024 18:31:19 -0700
Subject: [PATCH 20/41] .

---
 scripts/submit_merge_job.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py
index 77dd1273c..3a203900d 100644
--- a/scripts/submit_merge_job.py
+++ b/scripts/submit_merge_job.py
@@ -65,9 +65,9 @@ def main():
     beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader)
 
     beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image
-    beakerConfig['tasks'][0]['context']['cluster'] = args.cluster
-    beakerConfig['tasks'][0]['context']['priority'] = args.priority
-    beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
+    # beakerConfig['tasks'][0]['context']['cluster'] = args.cluster
+    # beakerConfig['tasks'][0]['context']['priority'] = args.priority
+    # beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto
 
     print(beakerConfig)

From b704fde55eeba144eebaa8f26fb1f98498fd399a Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Sun, 18 Aug 2024 18:31:56 -0700
Subject: [PATCH 21/41] .

---
 configs/merge_configs/my-merge-config.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml
index 48c2ce533..14f48bf3b 100644
--- a/configs/merge_configs/my-merge-config.yaml
+++ b/configs/merge_configs/my-merge-config.yaml
@@ -4,8 +4,8 @@ models:
   - name: L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep
     location: beaker
     path: jacobm/L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep
-    weight: 0.6
+    weight: 0.5
   - name: L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep
     location: beaker
     path: jacobm/L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep
-    weight: 0.4
+    weight: 0.5
\ No newline at end of file

From 8404b6c569e580525efe550bc5507bffb59a56af Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Wed, 21 Aug 2024 12:09:23 -0700
Subject: [PATCH 22/41] .
--- configs/merge_configs/my-merge-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 14f48bf3b..0280679fa 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep + - name: llama_3_8b-rs_2860_v3.2_ccn location: beaker - path: jacobm/L38B-rs_2860_v3.2-dpo_norm_beta5_uf_1ep + path: jacobm/llama_3_8b-rs_2860_v3.2_ccn weight: 0.5 - - name: L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep + - name: llama_3_8b-rs_2860_numina_ccn location: beaker - path: jacobm/L38B-rs_2860_numina-dpo_norm_beta5_uf_1ep + path: jacobm/llama_3_8b-rs_2860_numina_ccn weight: 0.5 \ No newline at end of file From 691143b701434fa672336fc6830540f966122007 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Wed, 21 Aug 2024 12:10:16 -0700 Subject: [PATCH 23/41] . --- configs/merge_configs/my-merge-config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 0280679fa..89a5ec0d7 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -4,8 +4,8 @@ models: - name: llama_3_8b-rs_2860_v3.2_ccn location: beaker path: jacobm/llama_3_8b-rs_2860_v3.2_ccn - weight: 0.5 + weight: 0.6 - name: llama_3_8b-rs_2860_numina_ccn location: beaker path: jacobm/llama_3_8b-rs_2860_numina_ccn - weight: 0.5 \ No newline at end of file + weight: 0.4 \ No newline at end of file From bbe7648c6f5d722bc4695a1f4ff893b13e67c05d Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Mon, 9 Sep 2024 20:44:42 -0700 Subject: [PATCH 24/41] add --- configs/merge_configs/my-merge-config.yaml | 4 +- .../sft/train-math-only-model.yaml | 179 ++++++++++++++++++ oe-eval-internal | 2 +- 3 files changed, 182 insertions(+), 3 deletions(-) create mode 100644 configs/train_configs/sft/train-math-only-model.yaml diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 89a5ec0d7..26beab035 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,9 +1,9 @@ merge_method: linear normalize: true models: - - name: llama_3_8b-rs_2860_v3.2_ccn + - name: valpy_dpo_mix_uf_wc_regen_da_sftmix_v4.23_09052024165835 location: beaker - path: jacobm/llama_3_8b-rs_2860_v3.2_ccn + path: 01J72A0A1VAT17049F6FHXS7AH weight: 0.6 - name: llama_3_8b-rs_2860_numina_ccn location: beaker diff --git a/configs/train_configs/sft/train-math-only-model.yaml b/configs/train_configs/sft/train-math-only-model.yaml new file mode 100644 index 000000000..86a600f9d --- /dev/null +++ b/configs/train_configs/sft/train-math-only-model.yaml @@ -0,0 +1,179 @@ +model_name_or_path: meta-llama/Meta-Llama-3-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Meta-Llama-3-8B +use_slow_tokenizer: true +dataset_mixer: + # ------------------------------------------------------ + # no_robot dataset, human written, for general chat. + # Total: 9500 + # Pro: created by scale ai with high cost, should be high quality. + # Con: small, not diverse enough, may not be in consistent style. + HuggingFaceH4/no_robots: 9500 + # ------------------------------------------------------ + # OpenAssistant dataset, human written, for general chat. 
+  # Here, only the highest rated paths are extracted.
+  # Total: 7708
+  # Pro: created and reviewed by human volunteers, has multi-turn chat.
+  # Con: small, still has some noise, the writing quality may not be as good/careful as paid workers, style consistency.
+  # TODO: need to check if this version corresponds to the highest rated paths.
+  allenai/openassistant-guanaco-reformatted: 7708
+  # ------------------------------------------------------
+  # LIMA dataset, human written, for general chat.
+  # Some instances were filtered in building Tulu 2, probably due to some identity keywords.
+  # Total: 1018
+  # Pro: created by researchers at Meta, aiming for diversity and high quality.
+  # Con: small, they were created quite early so might not consider some of the latest answering styles of chatbots.
+  # natolambert/tulu-v2-sft-mixture-lima: 1018
+  # ------------------------------------------------------
+  # Aya dataset, human written, for general chat (multilingual).
+  # Total: 202362
+  # Pro: created by ..., aiming for very diverse languages ().
+  # Con: answers may not be in the perfect style.
+  # ai2-adapt-dev/aya_dataset-reformat: 202362
+  # ------------------------------------------------------
+  # Tulu hard-coded examples, human written, for identity-related questions.
+  # Total: 14
+  # Pro: necessary to make Tulu aware of itself and its builders.
+  # Con: small, low coverage of possible questions from users.
+  # TODO: we should later find ways to replicate this multiple times.
+  ai2-adapt-dev/tulu_hard_coded_examples: 14
+  # ------------------------------------------------------
+  # CoT subset in FLAN v2, human (researchers) converted from existing datasets, for reasoning.
+  # Here, we use the subset processed in Tulu v2.
+  # Total: 48747
+  # Pro: researchers converted from 9 chain-of-thought datasets about arithmetic, multi-hop reasoning, nli.
+  # Con: limited in the task type, written early, may have inconsistent styles compared to today's chatbots.
+  # natolambert/tulu-v2-sft-mixture-cot: 49747
+  # ------------------------------------------------------
+  # SciRIFF dataset, human (researchers) converted from existing datasets, for scientific literature understanding.
+  # Here, we use the subset extracted by the author in building allenai/SciRIFF-train-mix.
+  # Total: 35357
+  # Pro: researchers converted from existing datasets for 54 scientific literature understanding tasks
+  # Con: limited in the task type, may have inconsistent styles compared to today's chatbots.
+  # TODO: need to ablate and compare with the one in tulu 2 mixture natolambert/tulu-v2-sft-mixture-science
+  # natolambert/tulu-v2-sft-mixture-science: 7468 # original data slightly different
+  # ai2-adapt-dev/SciRIFF-train-mix-science: 10000
+  # ------------------------------------------------------
+  # SlimOrca dataset, gpt4 generated, for general chat.
+  # Total: 517982
+  # Pro: pairing FLAN v2 inputs with system prompts, and regenerating the outputs using GPT4, potentially in a better style.
+  # Con: GPT4 responses may contain errors, which may be mitigated by the filtering in SlimOrca
+  # TODO: need to ablate and compare with the 300K one Faeze created. may benefit from regeneration.
+  # ai2-adapt-dev/slim-orca-300k: 100000
+  ai2-adapt-dev/SlimOrca-reformat: 100000
+  # ------------------------------------------------------
+  # WizardLM evol instruct dataset, gpt4 generated, for general chat.
+  # Total: 196000
+  # Pro: the approach deepens the complexity of gpt4-generated data
+  # Con: GPT4 generations have errors, and may also inherit the biases/styles in GPT4
+  # TODO: need to ablate.
+  WizardLMTeam/WizardLM_evol_instruct_V2_196k: 30000
+  # ------------------------------------------------------
+  # WildChat dataset, real user queries + gpt4 responses, for general chat.
+  # Total: 254663 (1M if including those interacting with gpt 3.5)
+  # Pro: real user queries, may contain diverse challenging scenarios, as well as unsafe prompts. Multi-turn.
+  # Con: user queries are usually not that well-formatted, and contain a lot of noise.
+  # ai2-adapt-dev/WildChat-1M-Full-GPT4-Only: 254663
+  # ------------------------------------------------------
+  # ShareGPT dataset, real user shared queries + gpt4 responses, for general chat.
+  # Total: 114046
+  # Pro: user shared queries usually contain interesting phenomena. Multi-turn.
+  # Con: unclear licensing, and the responses were generated using an earlier version of GPT4.
+  # TODO: need to ablate. May benefit from regeneration.
+  # Vtuber-plan/sharegpt-cleaned: 114046
+  # ------------------------------------------------------
+  # Daring-Anteater, a mix of existing datasets, for general chat.
+  # Total: 99532
+  # Pro: a good mix of precise_instruction_following / json_format_following / complex instructions.
+  # Con: the constraint following part is too small.
+  # TODO: need to ablate if excluding the main chat subset is helpful.
+  # TODO: data needs to be reformatted to consider the system prompt.
+  ai2-adapt-dev/Daring-Anteater-reformat: 99532
+  # ------------------------------------------------------
+  # MetaMathQA dataset, augmented using gpt4, for math capability.
+  # Total: 395000
+  # Pro: augmented towards GSM/MATH, so good performance on these two benchmarks (probably similar questions too)
+  # Con: may be too targeted for the two benchmarks and fail to generalize to other math problems in different styles.
+  # ai2-adapt-dev/metamath-qa-reformat: 100000
+  # ------------------------------------------------------
+  # WebInstruct dataset, extracted & rewritten using gpt4, (mainly) for math/science related questions
+  # Here, we are using their released subset.
+  # Total: 2335220
+  # Pro: the generation benefits from GPT4 answering style & the correctness of grounding to web documents.
+  # Con: may be biased by the response styles in the three websites (MathStackExchange, ScienceStackExchange, Socratic);
+  #      the question answering styles are also not diverse enough, with different instruction constraints;
+  #      the answers may still have some errors (10% based on the paper)
+  # TODO: need to ablate the effect.
+  # ai2-adapt-dev/WebInstructSub-reformat: 100000
+  # ------------------------------------------------------
+  # Codefeedback Filtered Instruction, a mix of existing datasets, for coding
+  # The data mix includes:
+  #   Magicoder-OSS-Instruct
+  #   Python code subset of ShareGPT
+  #   Magicoder-Evol-Instruct
+  #   Evol-Instruct-Code
+  # Total: 156526
+  # Pro: a decent mix of existing coding prompts
+  # Con: curated mainly for the prompts in building the real CodeFeedback, so responses may be low quality (e.g., ShareGPT)
+  # TODO: change to individual datasets and ablate the effect. may benefit from regeneration.
+  # m-a-p/CodeFeedback-Filtered-Instruction: 156526
+  # ------------------------------------------------------
+  # Codefeedback dataset, a mix of existing datasets + feedback interaction generation, for coding
+  # Total: 66383
+  # Pro: single-turn packing + interaction simulation seems to create a good coding model that takes feedback over multiple turns.
+  # Con: not sure how diverse the feedback is and how well it can generalize
+  # TODO: need to ablate. need to change code to downweight the intermediate responses with errors!!!
+  # m-a-p/Code-Feedback: 66383
+  # ------------------------------------------------------
+  # Table-GPT dataset, converted & synthesized, for table understanding and operations
+  # Total: 13222
+  # Pro: a special dataset that contains 14 table-related tasks for enhancing table capabilities
+  # Con: task types are limited. The tables may not be big enough. Response styles may be inconsistent.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/Table-GPT-All-train: 3000
+  # ------------------------------------------------------
+  # Coconot dataset, generated by gpt4, for non-compliance
+  # Total: 11477
+  # Pro: a special dataset for a comprehensive list of non-compliance behaviors of models.
+  # Con: the generated queries may only reflect simple cases.
+  # TODO: need to ablate.
+  # ai2-adapt-dev/coconot-sft-reformat: 11477
+  # ------------------------------------------------------
+  # NuminaMATH-TIR, extracted and generated by gpt4, for tool-integrated reasoning for math
+  # Total: 72441
+  # Pro: generally high-quality dataset with mined prompts from web corpus, verified tool-integrated reasoning trajectories.
+  # Con: mainly for solving math in a specific format, not in a consistent format with the general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  AI-MO/NuminaMath-TIR: 72441
+  # AI-MO/NuminaMath-CoT: 859000
+  # ------------------------------------------------------
+  # Xlam function calling dataset, synthesized and verified, for tool use
+  # Total: 60000
+  # Pro: a special dataset for enhancing function calling capability, good performance on BFCL
+  # Con: responses only contain the function calling and arguments, not in a consistent style with the general chat.
+  # TODO: need to ablate. need to rewrite!!!
+  # Salesforce/xlam-function-calling-60k: 60000
+  # ------------------------------------------------------
+  # Lmsys chatbot arena data, human queries for challenging models, for general chat.
+  # Total: 1000000
+  # Pro: real human interactions with models, with reasonable challenges.
+  # Con: may not reflect the real challenges in actual use of AI models. The interactions include those with weak models.
+  # TODO: need to ablate. need to regenerate (the last step)!! the intermediate low-quality responses need to be downweighted.
+ # lmsys/lmsys-chat-1m: 1000000 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 # best LR so far +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ \ No newline at end of file diff --git a/oe-eval-internal b/oe-eval-internal index 876d8b0c1..98b95efe8 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit 876d8b0c1906114e49f768bd8aa14bfd575099c6 +Subproject commit 98b95efe8523cda8c48f80197191a4db108657fe From 45cabbb294e31c83897587be3361e54222c2e203 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:11:27 -0700 Subject: [PATCH 25/41] . --- configs/merge_configs/my-merge-config.yaml | 12 ++++++------ oe-eval-internal | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 26beab035..74b62197a 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: valpy_dpo_mix_uf_wc_regen_da_sftmix_v4.23_09052024165835 + - name: deepseek-math-7b-math-attempt-3 location: beaker - path: 01J72A0A1VAT17049F6FHXS7AH - weight: 0.6 - - name: llama_3_8b-rs_2860_numina_ccn + path: jacobm/deepseek-math-7b-math-attempt-3 + weight: 0.5 + - name: deepseek-coder-7b-math-attempt-3 location: beaker - path: jacobm/llama_3_8b-rs_2860_numina_ccn - weight: 0.4 \ No newline at end of file + path: jacobm/deepseek-coder-7b-math-attempt-3 + weight: 0.5 \ No newline at end of file diff --git a/oe-eval-internal b/oe-eval-internal index 98b95efe8..2fc04da13 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit 98b95efe8523cda8c48f80197191a4db108657fe +Subproject commit 2fc04da1388f8adfcd2b56459cdd2b8b9d66b9c8 From ace26b052f5b76019ce47fedc603fa923903cc0c Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:13:55 -0700 Subject: [PATCH 26/41] . 
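Refresh the default cluster list: drop the a100/mosaic entries and add
ai2/pluto-cirrascale. Because the flag is nargs='+', several clusters
can still be passed in one invocation; a hypothetical example:

    # python scripts/submit_merge_job.py \
    #     --merge_config configs/merge_configs/my-merge-config.yaml \
    #     --cluster ai2/pluto-cirrascale ai2/s2-cirrascale-l40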
--- scripts/submit_merge_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 3a203900d..219cd8c02 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -13,7 +13,7 @@ def main(): parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") - parser.add_argument("--cluster", nargs='+', default=["ai2/allennlp-cirrascale", "ai2/general-cirrascale", "ai2/general-cirrascale-a100-80g-ib", "ai2/mosaic-cirrascale-a100", "ai2/s2-cirrascale-l40"]) + parser.add_argument("--cluster", nargs='+', default=["ai2/allennlp-cirrascale", "ai2/general-cirrascale", "ai2/pluto-cirrascale", "ai2/s2-cirrascale-l40"]) parser.add_argument("--priority", type=str, default="low") parser.add_argument("--preemptible", action="store_true", default=False, help="for using preemtipble jobs (required on some instances)") args = parser.parse_args() From 7e7e1c1562d3d6dff0490af2a0390d75053f022a Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:19:39 -0700 Subject: [PATCH 27/41] test --- configs/merge_configs/my-merge-config.yaml | 12 ++++++------ scripts/submit_merge_job.py | 6 +++++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 74b62197a..f5385d9b4 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: deepseek-math-7b-math-attempt-3 - location: beaker - path: jacobm/deepseek-math-7b-math-attempt-3 + - name: deepseek-coder-7b-base-v1.5 + location: huggingface + path: deepseek-ai/deepseek-coder-7b-base-v1.5 weight: 0.5 - - name: deepseek-coder-7b-math-attempt-3 - location: beaker - path: jacobm/deepseek-coder-7b-math-attempt-3 + - name: deepseek-math-7b-base + location: huggingface + path: deepseek-ai/deepseek-math-7b-base weight: 0.5 \ No newline at end of file diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 219cd8c02..1851241f2 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -56,8 +56,12 @@ def main(): "source": {"beaker": elem["path"]} }) # mount datasets - elif elem["location"] in ["huggingface", "nfs"]: # todo: support weka + elif elem["location"] in ["huggingface", "nfs"]: + pass # don't need to do anything + elif elem["location"] == "weka": # verify the only available cluster(s) have weka pass + else: + print(f"Unsupported location: {elem['location']}") baseConfig["models"].append(model_data) with open(args.beaker_config, 'r') as f: From 29a7a9530fb2deba93b979ecda68f6ad12e0eafc Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:20:36 -0700 Subject: [PATCH 28/41] , --- scripts/submit_merge_job.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 1851241f2..8ee501510 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -57,7 +57,10 @@ def main(): }) # mount datasets elif elem["location"] in ["huggingface", "nfs"]: - pass # don't need to do anything + model_data = { + "model": elem['location'], + 
"parameters": {"weight": float(elem["weight"])} + } elif elem["location"] == "weka": # verify the only available cluster(s) have weka pass else: From 9b29228c531af0ec55345707b4b4b5ce8b867e53 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:21:01 -0700 Subject: [PATCH 29/41] fix --- scripts/submit_merge_job.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index 8ee501510..d906e603f 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -58,7 +58,7 @@ def main(): # mount datasets elif elem["location"] in ["huggingface", "nfs"]: model_data = { - "model": elem['location'], + "model": elem['path'], "parameters": {"weight": float(elem["weight"])} } elif elem["location"] == "weka": # verify the only available cluster(s) have weka From d07a8192045166b39e944553db6da23c0afca059 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Fri, 20 Sep 2024 10:21:59 -0700 Subject: [PATCH 30/41] test --- configs/merge_configs/my-merge-config.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index f5385d9b4..b4ad468f6 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,11 @@ merge_method: linear normalize: true models: - - name: deepseek-coder-7b-base-v1.5 + - name: deepseek-math-7b-instruct location: huggingface - path: deepseek-ai/deepseek-coder-7b-base-v1.5 + path: deepseek-ai/deepseek-math-7b-instruct weight: 0.5 - - name: deepseek-math-7b-base + - name: deepseek-coder-7b-instruct-v1.5 location: huggingface - path: deepseek-ai/deepseek-math-7b-base + path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 weight: 0.5 \ No newline at end of file From 47fb938a56ee9c652fc65986098c713d1a9bd7e5 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 29 Oct 2024 10:51:18 -0700 Subject: [PATCH 31/41] push new commits --- configs/beaker_configs/default_merge.yaml | 10 ++- configs/merge_configs/my-merge-config.yaml | 26 ++++--- .../my-task-arithmetic-config.yaml | 22 ++++++ configs/train_configs/dpo/my-test-dpo.yaml | 29 ++++++++ ...8b_preview_mix_v3.8-noncommercial-wip.yaml | 68 +++++++++++++++++++ scripts/submit_merge_job.py | 48 +++++++++---- 6 files changed, 180 insertions(+), 23 deletions(-) create mode 100644 configs/merge_configs/my-task-arithmetic-config.yaml create mode 100644 configs/train_configs/dpo/my-test-dpo.yaml create mode 100644 configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml index e42d15379..6344e5c5f 100644 --- a/configs/beaker_configs/default_merge.yaml +++ b/configs/beaker_configs/default_merge.yaml @@ -8,7 +8,7 @@ tasks: command: [ '/bin/sh', '-c' ] - arguments: ['echo rawconfig > /output/config.yaml; mergekit-yaml /output/config.yaml /output --cuda'] + arguments: ['mkdir {OUTPUT_DIR}; echo {RAW_CONFIG} > {OUTPUT_DIR}/config.yaml; mergekit-yaml {OUTPUT_DIR}/config.yaml {OUTPUT_DIR} --cuda'] envVars: - name: CUDA_DEVICE_ORDER value: PCI_BUS_ID @@ -31,6 +31,10 @@ tasks: resources: gpuCount: 1 context: - cluster: ai2/allennlp-cirrascale priority: low - preemptible: false \ No newline at end of file + preemptible: true + constraints: + cluster: + - ai2/neptune-cirrascale + - ai2/saturn-cirrascale + - ai2/jupiter-cirrascale-2 \ No newline at end of file diff --git 
a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index b4ad468f6..5e62d939d 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,11 +1,21 @@ merge_method: linear normalize: true models: - - name: deepseek-math-7b-instruct - location: huggingface - path: deepseek-ai/deepseek-math-7b-instruct - weight: 0.5 - - name: deepseek-coder-7b-instruct-v1.5 - location: huggingface - path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 - weight: 0.5 \ No newline at end of file + # - name: deepseek-math-7b-instruct + # location: huggingface + # path: deepseek-ai/deepseek-math-7b-instruct + # weight: 0.5 + # - name: deepseek-coder-7b-instruct-v1.5 + # location: huggingface + # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 + # weight: 0.5 + - name: L3.1-8B-v3.8-nc-final + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ + wekaBucket: "oe-adapt-default" + weight: 0.9 + - name: L3.1-8B-v3.8-math_subset + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-math_subset/ + wekaBucket: "oe-adapt-default" + weight: 0.4 \ No newline at end of file diff --git a/configs/merge_configs/my-task-arithmetic-config.yaml b/configs/merge_configs/my-task-arithmetic-config.yaml new file mode 100644 index 000000000..d91e75d30 --- /dev/null +++ b/configs/merge_configs/my-task-arithmetic-config.yaml @@ -0,0 +1,22 @@ +merge_method: task_arithmetic +base_model: ai2-adapt-dev/llama-3.1-8b-resized +normalize: true +models: + # - name: deepseek-math-7b-instruct + # location: huggingface + # path: deepseek-ai/deepseek-math-7b-instruct + # weight: 0.5 + # - name: deepseek-coder-7b-instruct-v1.5 + # location: huggingface + # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 + # weight: 0.5 + - name: L3.1-8B-v3.8-nc-final + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-8B-v3.8-math_subset + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-math_subset/ + wekaBucket: "oe-adapt-default" + weight: 0.46 \ No newline at end of file diff --git a/configs/train_configs/dpo/my-test-dpo.yaml b/configs/train_configs/dpo/my-test-dpo.yaml new file mode 100644 index 000000000..b3d1d88e7 --- /dev/null +++ b/configs/train_configs/dpo/my-test-dpo.yaml @@ -0,0 +1,29 @@ +model_name_or_path: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +# dataset_name: ai2-adapt-dev/tulu3.4-sft-replica-50k +# dataset_config_name: gpt4-prefs-on-policy +dataset_mixer: + ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0 + ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0 +tokenizer_name: /model +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 1 +gradient_accumulation_steps: 16 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml new file mode 
100644 index 000000000..9660404c3 --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml @@ -0,0 +1,68 @@ +model_name_or_path: meta-llama/Llama-3.1-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-8B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.8 nc mix file + # /oe-adapt-default/jacobm/tulu-3-dev/data/tulu_v3.8_preview_nc.jsonl: 1.0 + + # # General datasets: + ai2-adapt-dev/oasst1_converted: 7132 # all + ai2-adapt-dev/flan_v2_converted: 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 240 # all + ai2-adapt-dev/no_robots_converted: 9500 # all + ai2-adapt-dev/wildchat_gpt4_converted: 100000 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960 # all + # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980 # all + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # AI-MO/NuminaMath-TIR: 72441 # all + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 29980 # all + + # Safety datasets: + ai2-adapt-dev/coconot_converted: 10983 # all + ai2-adapt-dev/processed-wildjailbreak: 50000 + ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + + # Specialty datasets: + ai2-adapt-dev/sciriff_converted: 10000 + ai2-adapt-dev/table_gpt_converted: 5000 + ai2-adapt-dev/aya_dataset_converted: 100000 + + # # need to split for preferences: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 + # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 1 node +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/scripts/submit_merge_job.py b/scripts/submit_merge_job.py index d906e603f..061dc03f8 100644 --- a/scripts/submit_merge_job.py +++ b/scripts/submit_merge_job.py @@ -13,17 +13,17 @@ def main(): parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_config", type=str, default="configs/beaker_configs/default_merge.yaml") parser.add_argument("--merge_config", type=str, default="configs/merge_configs/example_linear_merge_config.yaml") - parser.add_argument("--cluster", nargs='+', default=["ai2/allennlp-cirrascale", "ai2/general-cirrascale", "ai2/pluto-cirrascale", "ai2/s2-cirrascale-l40"]) - parser.add_argument("--priority", type=str, default="low") - parser.add_argument("--preemptible", action="store_true", default=False, help="for using preemtipble jobs (required on some instances)") + parser.add_argument("--cluster", nargs='+', 
default=["ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2"]) + parser.add_argument("--priority", type=str, default="high") + parser.add_argument("--preemptible", action="store_true", default=True, help="for using preemtipble jobs (required on some instances)") + parser.add_argument("--output_dir", type=str, default="/output") args = parser.parse_args() - today = date.today().strftime("%m%d%Y") - with open(args.merge_config, 'r') as f: default_yaml = f.read() mergeConfig = yaml.load(default_yaml, Loader=yaml.FullLoader) + # TODO: support SLERP assert mergeConfig["merge_method"] in ["linear", "task_arithmetic"], f"merging method {mergeConfig['merge_method']} not supported" with open(f"configs/merge_configs/base_configs/default_{mergeConfig['merge_method']}_merge.yaml", 'r') as f: @@ -32,7 +32,15 @@ def main(): baseConfig["normalize"] = mergeConfig["normalize"] baseConfig["models"] = [] + + if mergeConfig["merge_method"] == "task_arithmetic": + baseConfig["models"].append({ + "model": mergeConfig["base_model"] + }) + baseConfig["base_model"] = mergeConfig["base_model"] + beakerDatasets = [] + wekaBuckets = set() for elem in mergeConfig["models"]: # - model: /model-one # parameters: @@ -47,6 +55,8 @@ def main(): "model": f"/{elem['name']}", "parameters": {"weight": float(elem["weight"])} } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] # beakerConfig['datasets'][1]['source']['beaker'] = model_info[1] # - mountPath: /hf_llama_models # source: @@ -61,8 +71,21 @@ def main(): "model": elem['path'], "parameters": {"weight": float(elem["weight"])} } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] elif elem["location"] == "weka": # verify the only available cluster(s) have weka - pass + if elem["wekaBucket"] not in wekaBuckets: + beakerDatasets.append({ + "mountPath": f"/{elem['wekaBucket']}", + "source": {"weka": elem["wekaBucket"]} + }) + wekaBuckets.add(elem["wekaBucket"]) + model_data = { + "model": elem["path"], + "parameters": {"weight": float(elem["weight"])} + } + if mergeConfig["merge_method"] == "task_arithmetic": + model_data["parameters"]["normalize"] = mergeConfig["normalize"] else: print(f"Unsupported location: {elem['location']}") baseConfig["models"].append(model_data) @@ -72,18 +95,19 @@ def main(): beakerConfig = yaml.load(beaker_yaml, Loader=yaml.FullLoader) beakerConfig['tasks'][0]['image']['beaker'] = args.beaker_image - # beakerConfig['tasks'][0]['context']['cluster'] = args.cluster - # beakerConfig['tasks'][0]['context']['priority'] = args.priority - # beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True requried for Jupiter/Pluto + # TODO: fix these + beakerConfig['tasks'][0]['constraints']['cluster'] = args.cluster + beakerConfig['tasks'][0]['context']['priority'] = args.priority + beakerConfig['tasks'][0]['context']['preemptible'] = args.preemptible # True required for Jupiter/Pluto print(beakerConfig) if len(beakerDatasets) > 0: beakerConfig["tasks"][0]["datasets"] = beakerDatasets - base_command = beakerConfig["tasks"][0]["arguments"][0] - beakerConfig["tasks"][0]["arguments"][0] = base_command.replace("rawconfig", f'"{str(baseConfig)}"') + base_command = beakerConfig["tasks"][0]["arguments"][0].replace("{OUTPUT_DIR}", args.output_dir) + beakerConfig["tasks"][0]["arguments"][0] = base_command.replace("{RAW_CONFIG}", f'"{str(baseConfig)}"') - experiment_name = 
f"open_instruct_merge_models_{today}" + experiment_name = f"open_instruct_merge_models" beakerConfig["description"] = experiment_name # if configs/beaker_configs/auto_created doesn't exist, create it with os if not os.path.exists("configs/beaker_configs/auto_created"): From 9a13d8b2b50142b638bb3040c97f2e603c5ab42c Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 29 Oct 2024 15:50:28 -0700 Subject: [PATCH 32/41] changes to support weka (rough draft for now) --- configs/beaker_configs/default_eval.yaml | 10 +++++----- oe-eval-internal | 2 +- scripts/eval/oe-eval.sh | 2 +- scripts/submit_eval_jobs.py | 16 +++++++++++----- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/configs/beaker_configs/default_eval.yaml b/configs/beaker_configs/default_eval.yaml index 3b4553ed6..3553ccc60 100644 --- a/configs/beaker_configs/default_eval.yaml +++ b/configs/beaker_configs/default_eval.yaml @@ -35,16 +35,16 @@ tasks: - name: WANDB_DISABLED value: true - name: OPENAI_API_KEY - secret: openai_api_key + secret: jacobm_OPENAI_API_KEY - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default - mountPath: /data/ source: beaker: hamishivi/open-instruct-eval-data - - mountPath: /model - source: - beaker: 01GVYXDGJC6DV0JW9JZ16YM07G - mountPath: /net/nfs.cirrascale source: hostPath: /net/nfs.cirrascale diff --git a/oe-eval-internal b/oe-eval-internal index 2fc04da13..15589625f 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit 2fc04da1388f8adfcd2b56459cdd2b8b9d66b9c8 +Subproject commit 15589625f17ead946fae8f462904484a06277f32 diff --git a/scripts/eval/oe-eval.sh b/scripts/eval/oe-eval.sh index ed605b685..c4019be10 100755 --- a/scripts/eval/oe-eval.sh +++ b/scripts/eval/oe-eval.sh @@ -117,5 +117,5 @@ for TASK in "${TASKS[@]}"; do GPU_COUNT=$GPU_COUNT fi - python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-results" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=openai_api_key"}' ${REVISION_ARG} --beaker-retries 2 + python oe-eval-internal/oe_eval/launch.py --model "$MODEL_NAME" --beaker-workspace "ai2/tulu-3-dev" --beaker-budget ai2/oe-adapt --task "$TASK" $MODEL_TYPE --batch-size "$BATCH_SIZE" --model-args "{\"model_path\":\"${MODEL_LOCATION}\", \"max_length\": ${MAX_LENGTH}}" ${HF_UPLOAD_ARG} --gpus "$GPU_COUNT" --gantry-args '{"env-secret": "OPENAI_API_KEY=jacobm_OPENAI_API_KEY", "weka": "oe-adapt-default:/oe-adapt-default"}' ${REVISION_ARG} --beaker-retries 2 --beaker-priority "urgent" --gantry-secret-hf-write "jacobm_HF_TOKEN" done diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index 9c64320cf..a3f2274fd 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -76,11 +76,11 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): parser.add_argument("--beaker_image", type=str, default="nathanl/open_instruct_auto", help="If given, use this Beaker image.") parser.add_argument("--beaker_subfolder", type=str, default=None) parser.add_argument("--cluster", nargs='+', default=[ - "ai2/allennlp-cirrascale", - "ai2/general-cirrascale", - "ai2/s2-cirrascale-l40", + # "ai2/allennlp-cirrascale", + # "ai2/general-cirrascale", + # "ai2/s2-cirrascale-l40", 
"ai2/allennlp-elara-cirrascale", - "ai2/pluto-cirrascale", + # "ai2/pluto-cirrascale", "ai2/neptune-cirrascale", "ai2/saturn-cirrascale", "ai2/jupiter-cirrascale-2", @@ -462,9 +462,11 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]} --hf_revision {args.hf_revision}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] elif model_info[1].startswith("/"): # if it's a local model, load it from the local directory - assert nfs_available, "NFS is required for path-based models." # to be safe. + # assert nfs_available, "NFS is required for path-based models." # to be safe. task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", f"--model_name_or_path {model_info[1]}")] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", f"--tokenizer_name_or_path {model_info[1]}")] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] @@ -582,6 +584,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): ## model location munging: if beaker, use beaker://. If hf, just name if model_info[0].startswith("hf-"): oe_eval_cmd += f" --model-location {model_info[1]}" + elif "weka" in model_info[1]: + oe_eval_cmd += f" --model-location {model_info[1]}" else: oe_eval_cmd += f" --model-location beaker://{model_info[1]}" if args.hf_revision: @@ -625,6 +629,8 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): assert nfs_available, "NFS is required for path-based models." # to be safe. 
task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1])] task_spec['arguments'] = [task_spec['arguments'][0].replace("--tokenizer_name_or_path /model", "--tokenizer_name_or_path "+model_info[1])] + elif model_info[1].startswith("weka"): + task_spec['arguments'] = [task_spec['arguments'][0].replace("--model_name_or_path /model", "--model_name_or_path "+model_info[1].split(':/')[1])] else: # if it's a beaker model, mount the beaker dataset to `/model` task_spec['datasets'][1]['source']['beaker'] = model_info[1] From 5aa6267e2c16552512dbd7dd7e32c07924a23b80 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 29 Oct 2024 20:46:44 -0700 Subject: [PATCH 33/41] changes --- configs/beaker_configs/default_eval.yaml | 3 +++ configs/beaker_configs/default_finetune_multinode.yaml | 4 ++-- scripts/submit_eval_jobs.py | 1 + 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/configs/beaker_configs/default_eval.yaml b/configs/beaker_configs/default_eval.yaml index 3553ccc60..ba9f55731 100644 --- a/configs/beaker_configs/default_eval.yaml +++ b/configs/beaker_configs/default_eval.yaml @@ -42,6 +42,9 @@ tasks: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + - mountPath: /model + source: + beaker: 01GVYXDGJC6DV0JW9JZ16YM07G - mountPath: /data/ source: beaker: hamishivi/open-instruct-eval-data diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index 03ed976af..9dd376f69 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -51,7 +51,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -61,7 +61,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: diff --git a/scripts/submit_eval_jobs.py b/scripts/submit_eval_jobs.py index a3f2274fd..388e8f2a0 100755 --- a/scripts/submit_eval_jobs.py +++ b/scripts/submit_eval_jobs.py @@ -643,6 +643,7 @@ def adjust_gpus(task_spec, experiment_group, model_name, gpu_multiplier): # add gpu information. 
# we just assume you want to use all the gpus for one task at a time + task_spec['resources']['gpuCount'] = 8 num_gpus = task_spec['resources']['gpuCount'] task_spec["arguments"][0]+= f" --min_gpus_per_task {num_gpus}" From f4bbe023de513ec2e1f54a1d87a9af5d561bcae0 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Wed, 30 Oct 2024 09:02:29 -0700 Subject: [PATCH 34/41] update merge configs --- configs/beaker_configs/default_merge.yaml | 4 ++-- configs/merge_configs/my-merge-config.yaml | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/configs/beaker_configs/default_merge.yaml b/configs/beaker_configs/default_merge.yaml index 6344e5c5f..446acee54 100644 --- a/configs/beaker_configs/default_merge.yaml +++ b/configs/beaker_configs/default_merge.yaml @@ -15,7 +15,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -25,7 +25,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN result: path: /output resources: diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 5e62d939d..2f6e8720f 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -13,9 +13,14 @@ models: location: weka path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ wekaBucket: "oe-adapt-default" - weight: 0.9 - - name: L3.1-8B-v3.8-math_subset + weight: 1.0 + - name: L3.1-8B-v3.8-nc-2 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-math_subset/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-2/ wekaBucket: "oe-adapt-default" - weight: 0.4 \ No newline at end of file + weight: 1.0 + - name: L3.1-8B-v3.8-nc-3 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-3/ + wekaBucket: "oe-adapt-default" + weight: 1.0 \ No newline at end of file From 6377335317556dfd4a8a2fde22752be112240f46 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Thu, 31 Oct 2024 18:22:39 -0700 Subject: [PATCH 35/41] committing changes --- configs/beaker_configs/default_dpo.yaml | 4 +- configs/beaker_configs/default_finetune.yaml | 4 +- .../default_finetune_multinode_augusta.yaml | 128 ++++++++++++++++++ configs/merge_configs/my-merge-config.yaml | 18 +-- configs/train_configs/dpo/my-test-dpo.yaml | 4 +- ...u3_70b_preview_mix_v3.9-noncommercial.yaml | 60 ++++++++ ...8b_preview_mix_v3.8-noncommercial-wip.yaml | 46 ++++--- ...lu3_8b_preview_mix_v3.8-noncommercial.yaml | 2 +- ...lu3_8b_preview_mix_v3.9-noncommercial.yaml | 59 ++++++++ scripts/filter-v3.8-data.py | 111 +++++++++++++++ scripts/submit_finetune_job.py | 3 + 11 files changed, 401 insertions(+), 38 deletions(-) create mode 100644 configs/beaker_configs/default_finetune_multinode_augusta.yaml create mode 100644 configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml create mode 100644 configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml create mode 100644 scripts/filter-v3.8-data.py diff --git a/configs/beaker_configs/default_dpo.yaml b/configs/beaker_configs/default_dpo.yaml index 87685b5fe..694ca8e33 100644 --- a/configs/beaker_configs/default_dpo.yaml +++ b/configs/beaker_configs/default_dpo.yaml @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: 
WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,7 +47,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: diff --git a/configs/beaker_configs/default_finetune.yaml b/configs/beaker_configs/default_finetune.yaml index bd5e05c06..6d2e2b549 100644 --- a/configs/beaker_configs/default_finetune.yaml +++ b/configs/beaker_configs/default_finetune.yaml @@ -37,7 +37,7 @@ tasks: - name: TRANSFORMERS_CACHE value: ./cache/ - name: WANDB_API_KEY - secret: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY - name: WANDB_PROJECT value: open-instruct - name: WANDB_WATCH @@ -47,7 +47,7 @@ tasks: - name: WANDB_DISABLED value: true - name: HF_TOKEN - secret: HF_TOKEN + secret: jacobm_HF_TOKEN datasets: - mountPath: /oe-adapt-default source: diff --git a/configs/beaker_configs/default_finetune_multinode_augusta.yaml b/configs/beaker_configs/default_finetune_multinode_augusta.yaml new file mode 100644 index 000000000..3766bdc4b --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_augusta.yaml @@ -0,0 +1,128 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && export LD_LIBRARY_PATH=/var/lib/tcpxo/lib64:${LD_LIBRARY_PATH} && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + - name: NCCL_CROSS_NIC + value: 0 + - name: NCCL_ALGO + value: Ring,Tree + - name: NCCL_PROTO + value: Simple + - name: NCCL_MIN_NCHANNELS + value: 4 + - name: NCCL_P2P_NET_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_PCI_CHUNKSIZE + value: 524288 + - name: NCCL_P2P_NVL_CHUNKSIZE + value: 1048576 + - name: NCCL_FASTRAK_NUM_FLOWS + value: 2 + - name: NCCL_FASTRAK_ENABLE_CONTROL_CHANNEL + value: 0 + - name: NCCL_BUFFSIZE + value: 8388608 + - name: NCCL_FASTRAK_USE_SNAP + value: 1 + - name: CUDA_VISIBLE_DEVICES + value: 0,1,2,3,4,5,6,7 + - name: NCCL_NET_GDR_LEVEL + value: PIX + - name: NCCL_FASTRAK_ENABLE_HOTPATH_LOGGING + value: 0 + - name: NCCL_TUNER_PLUGIN + value: libnccl-tuner.so + - name: NCCL_TUNER_CONFIG_PATH + value: /var/lib/tcpxo/lib64/a3plus_tuner_config.textproto + - name: NCCL_SHIMNET_GUEST_CONFIG_CHECKER_CONFIG_FILE + value: /var/lib/tcpxo/lib64/a3plus_guest_config.textproto + - name: NCCL_FASTRAK_PLUGIN_ACCEPT_TIMEOUT_MS + value: 600000 + - name: NCCL_NVLS_ENABLE + value: 0 + - name: NCCL_DEBUG + value: WARN + - name: NCCL_FASTRAK_CTRL_DEV + value: enp0s12 + - name: NCCL_FASTRAK_IFNAME + value: enp6s0,enp7s0,enp13s0,enp14s0,enp134s0,enp135s0,enp141s0,enp142s0 + - name: NCCL_SOCKET_IFNAME + value: enp0s12 + - name: NCCL_USE_SNAP + value: 1 + - name: NCCL_FASTRAK_USE_LLCM + value: 1 + - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY + value: /dev/aperture_devices + + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 2f6e8720f..f558a29af 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -9,18 +9,18 @@ models: # location: huggingface # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 # weight: 0.5 - - name: L3.1-8B-v3.8-nc-final + - name: L3.1-8B-v3.9-nc-1 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-final/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1/ wekaBucket: "oe-adapt-default" weight: 1.0 - - name: L3.1-8B-v3.8-nc-2 + - name: L3.1-8B-v3.9-nc-2 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-2/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-2/ wekaBucket: "oe-adapt-default" weight: 1.0 - - name: L3.1-8B-v3.8-nc-3 - location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.8-nc-3/ - wekaBucket: "oe-adapt-default" - weight: 1.0 \ No newline at end of file + # - name: L3.1-8B-v3.9-nc-3 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-3/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 \ No newline at end of file diff --git a/configs/train_configs/dpo/my-test-dpo.yaml 
b/configs/train_configs/dpo/my-test-dpo.yaml index b3d1d88e7..edac5a2f8 100644 --- a/configs/train_configs/dpo/my-test-dpo.yaml +++ b/configs/train_configs/dpo/my-test-dpo.yaml @@ -1,4 +1,5 @@ -model_name_or_path: /model +model_name_or_path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1 +tokenizer_name: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1 model_revision: main use_flash_attn: true gradient_checkpointing: true @@ -7,7 +8,6 @@ gradient_checkpointing: true dataset_mixer: ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0 ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0 -tokenizer_name: /model use_slow_tokenizer: true max_seq_length: 2048 preprocessing_num_workers: 16 diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml new file mode 100644 index 000000000..adfa95280 --- /dev/null +++ b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -0,0 +1,60 @@ +model_name_or_path: meta-llama/Llama-3.1-70B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-70B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ +gradient_checkpointing: true diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml index 9660404c3..22a574f4f 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial-wip.yaml @@ -6,42 +6,44 @@ 
use_slow_tokenizer: true dataset_mixer: # Static v3.8 nc mix file # /oe-adapt-default/jacobm/tulu-3-dev/data/tulu_v3.8_preview_nc.jsonl: 1.0 + allenai/tulu-v.3.8-mix-preview-noncommercial: 1.0 - # # General datasets: - ai2-adapt-dev/oasst1_converted: 7132 # all - ai2-adapt-dev/flan_v2_converted: 89982 # all - ai2-adapt-dev/tulu_hard_coded_repeated_10: 240 # all - ai2-adapt-dev/no_robots_converted: 9500 # all - ai2-adapt-dev/wildchat_gpt4_converted: 100000 - # # Math datasets: + # # # General datasets: + # ai2-adapt-dev/oasst1_converted: 7132 # all + # ai2-adapt-dev/flan_v2_converted: 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 240 # all + # ai2-adapt-dev/no_robots_converted: 9500 # all + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + + # # # Math datasets: # ai2-adapt-dev/personahub_math_v5_regen_149960: 149960 # all # ai2-adapt-dev/personahub_grade_math_v1_49980: 49980 # all # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 # AI-MO/NuminaMath-TIR: 72441 # all # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 - # Coding datasets: - ai2-adapt-dev/personahub_code_v2_34999: 34999 # all - ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 107276 # all + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 107276 # all - # IF datasets: - ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 29980 # all + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 29980 # all - # Safety datasets: - ai2-adapt-dev/coconot_converted: 10983 # all - ai2-adapt-dev/processed-wildjailbreak: 50000 - ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 10983 # all + # ai2-adapt-dev/processed-wildjailbreak: 50000 + # ai2-adapt-dev/synthetic-finalresp-wildguarmixtrain: 50000 - # Specialty datasets: - ai2-adapt-dev/sciriff_converted: 10000 - ai2-adapt-dev/table_gpt_converted: 5000 - ai2-adapt-dev/aya_dataset_converted: 100000 + # # Specialty datasets: + # ai2-adapt-dev/sciriff_converted: 10000 + # ai2-adapt-dev/table_gpt_converted: 5000 + # ai2-adapt-dev/aya_dataset_converted: 100000 # # need to split for preferences: # ai2-adapt-dev/wildchat_gpt4_converted: 100000 - # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts - # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations # ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 # ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 # ai2-adapt-dev/processed-wildjailbreak: 50000 diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml index 834dc99ff..fd1f81ee2 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.8-noncommercial.yaml @@ -41,7 +41,7 @@ dataset_mixer: max_seq_length: 4096 # need to increase to 8k preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml new file 
mode 100644 index 000000000..e604d0790 --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -0,0 +1,59 @@ +model_name_or_path: meta-llama/Llama-3.1-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-8B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/scripts/filter-v3.8-data.py b/scripts/filter-v3.8-data.py new file mode 100644 index 000000000..0d81a3881 --- /dev/null +++ b/scripts/filter-v3.8-data.py @@ -0,0 +1,111 @@ +from datasets import load_dataset + +full_ds = load_dataset("allenai/tulu-v.3.8-mix-preview-noncommercial") + +conversations = set() +prompts = set() + +for elem in full_ds["train"]: + conv = "" + prompt = elem["messages"][0]["content"] + prompts.add(prompt) + for msg in elem["messages"]: + conv += msg["content"] + conversations.add(conv) + + ### Not using anymore: + # ai2-adapt-dev/wildchat_gpt4_converted: 100000 + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_prompts + # # ai2-adapt-dev/tulu_v3.8_unused_wildchat_conversations + +seed = 42 + +### splitting: + +# wildchat_gpt4_converted_safety_decontaminated: 100000 +wildchat_ds = load_dataset("ai2-adapt-dev/wildchat_gpt4_converted_safety_decontaminated").shuffle(seed) +wildchat_ds_to_use = wildchat_ds["train"].select(range(100000)) +wildchat_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_100k") +wildchat_ds_to_not_use = wildchat_ds["train"].select(range(100000, len(wildchat_ds["train"]))) +wildchat_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildchat_unused") + +del wildchat_ds +del wildchat_ds_to_use +del wildchat_ds_to_not_use + +# 
ai2-adapt-dev/open_math_2_gsm8k_converted: 50000 +openmath2_gsm8k_ds = load_dataset("ai2-adapt-dev/open_math_2_gsm8k_converted").shuffle(seed) +openmath2_gsm8k_to_use = openmath2_gsm8k_ds["train"].select(range(50000)) +openmath2_gsm8k_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k") +openmath2_gsm8k_to_not_use = openmath2_gsm8k_ds["train"].select(range(50000, len(openmath2_gsm8k_ds["train"]))) +openmath2_gsm8k_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_unused") + +del openmath2_gsm8k_ds +del openmath2_gsm8k_to_use +del openmath2_gsm8k_to_not_use + +# ai2-adapt-dev/personahub_math_interm_algebra_50000: 20000 +p_math_alg_ds = load_dataset("ai2-adapt-dev/personahub_math_interm_algebra_50000").shuffle(seed) +p_math_alg_ds_to_use = p_math_alg_ds["train"].select(range(20000)) +p_math_alg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k") +p_math_alg_ds_to_not_use = p_math_alg_ds["train"].select(range(20000, len(p_math_alg_ds["train"]))) +p_math_alg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_unused") + +del p_math_alg_ds +del p_math_alg_ds_to_use +del p_math_alg_ds_to_not_use + +# ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated: 50000 +wjb_ds = load_dataset("ai2-adapt-dev/processed_wildjailbreak_safety_decontaminated").shuffle(seed) +wjb_ds_to_use = wjb_ds["train"].select(range(50000)) +wjb_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k") +wjb_ds_to_not_use = wjb_ds["train"].select(range(50000, len(wjb_ds["train"]))) +wjb_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_unused") + +del wjb_ds +del wjb_ds_to_use +del wjb_ds_to_not_use + +# ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated: 50000 +wg_ds = load_dataset("ai2-adapt-dev/synthetic_finalresp_wildguardmixtrain_safety_decontaminated").shuffle(seed) +wg_ds_to_use = wg_ds["train"].select(range(50000)) +wg_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k") +wg_ds_to_not_use = wg_ds["train"].select(range(50000, len(wg_ds["train"]))) +wg_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_unused") + +del wg_ds +del wg_ds_to_use +del wg_ds_to_not_use + +# ai2-adapt-dev/sciriff_converted: 10000 +sciriff_ds = load_dataset("ai2-adapt-dev/sciriff_converted").shuffle(seed) +sciriff_ds_to_use = sciriff_ds["train"].select(range(10000)) +sciriff_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_10k") +sciriff_ds_to_not_use = sciriff_ds["train"].select(range(10000, len(sciriff_ds["train"]))) +sciriff_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_sciriff_unused") + +del sciriff_ds +del sciriff_ds_to_use +del sciriff_ds_to_not_use + +# ai2-adapt-dev/table_gpt_converted: 5000 +table_gpt_ds = load_dataset("ai2-adapt-dev/table_gpt_converted").shuffle(seed) +table_gpt_ds_to_use = table_gpt_ds["train"].select(range(5000)) +table_gpt_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_5k") +table_gpt_ds_to_not_use = table_gpt_ds["train"].select(range(5000, len(table_gpt_ds["train"]))) +table_gpt_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_table_gpt_unused") + +del table_gpt_ds +del table_gpt_ds_to_use +del table_gpt_ds_to_not_use + +# ai2-adapt-dev/aya_dataset_converted: 100000 +aya_ds = load_dataset("ai2-adapt-dev/aya_dataset_converted").shuffle(seed) +aya_ds_to_use = aya_ds["train"].select(range(100000)) 
+aya_ds_to_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_100k") +aya_ds_to_not_use = aya_ds["train"].select(range(100000, len(aya_ds["train"]))) +aya_ds_to_not_use.push_to_hub("ai2-adapt-dev/tulu_v3.9_aya_unused") + +del aya_ds +del aya_ds_to_use +del aya_ds_to_not_use diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index 7b7e7609f..c55eb184a 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -199,6 +199,9 @@ def parse_args(args): }, ] + if "google" in args.cluster: + d["tasks"][0]["datasets"].pop(0) + # WANDB settings for env in d['tasks'][0]['envVars']: if env['name'] == "WANDB_DISABLED": From c5b9c0fcf08fb461309e0f3414c000c1de7b1649 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sat, 2 Nov 2024 22:56:11 -0700 Subject: [PATCH 36/41] update --- configs/beaker_configs/default_finetune.yaml | 3 + .../default_finetune_multinode.yaml | 3 + configs/train_configs/dpo/my-test-dpo.yaml | 6 +- .../sft/peteish_1124_preview_mix_v3.9.yaml | 59 +++++++++++++++++++ ...u3_70b_preview_mix_v3.9-noncommercial.yaml | 4 +- ...review_mix_v3.9-noncommercial-augusta.yaml | 59 +++++++++++++++++++ ...lu3_8b_preview_mix_v3.9-noncommercial.yaml | 2 +- scripts/submit_finetune_job.py | 2 +- 8 files changed, 132 insertions(+), 6 deletions(-) create mode 100644 configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml create mode 100644 configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml diff --git a/configs/beaker_configs/default_finetune.yaml b/configs/beaker_configs/default_finetune.yaml index 6d2e2b549..d10680de5 100644 --- a/configs/beaker_configs/default_finetune.yaml +++ b/configs/beaker_configs/default_finetune.yaml @@ -52,6 +52,9 @@ tasks: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index 9dd376f69..380cc0c76 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -66,6 +66,9 @@ tasks: - mountPath: /oe-adapt-default source: weka: oe-adapt-default + - mountPath: /model + source: + beaker: jacobm/llama-3.1-8b result: path: /output resources: diff --git a/configs/train_configs/dpo/my-test-dpo.yaml b/configs/train_configs/dpo/my-test-dpo.yaml index edac5a2f8..543b50c53 100644 --- a/configs/train_configs/dpo/my-test-dpo.yaml +++ b/configs/train_configs/dpo/my-test-dpo.yaml @@ -1,5 +1,5 @@ -model_name_or_path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1 -tokenizer_name: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1 +model_name_or_path: /model +tokenizer_name: /model model_revision: main use_flash_attn: true gradient_checkpointing: true @@ -8,6 +8,8 @@ gradient_checkpointing: true dataset_mixer: ai2-adapt-dev/tulu3.4-sft-replica-50k-gpt4-prefs-on-policy: 1.0 ai2-adapt-dev/personahub_if_pref_data_manualseed_v2_19890: 1.0 + ai2-adapt-dev/helpsteer2-uf-pipeline-regen: 1.0 + allenai/ultrafeedback_binarized_cleaned_train: 1.0 use_slow_tokenizer: true max_seq_length: 2048 preprocessing_num_workers: 16 diff --git a/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml new file mode 100644 index 000000000..e56ed1398 --- /dev/null +++ 
b/configs/train_configs/sft/peteish_1124_preview_mix_v3.9.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +model_revision: main +use_flash_attn: true +tokenizer_name: /oe-training-default/ai2-llm/checkpoints/OLMo-medium/peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-se-flan/step11931-hf +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml index adfa95280..68bb81199 100644 --- a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -19,7 +19,7 @@ dataset_mixer: # # Math datasets: # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 @@ -44,7 +44,7 @@ dataset_mixer: max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes +gradient_accumulation_steps: 1 # effective batch size 128 with 8 nodes learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git 
a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml new file mode 100644 index 000000000..ebb18fee4 --- /dev/null +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial-augusta.yaml @@ -0,0 +1,59 @@ +model_name_or_path: /model +model_revision: main +use_flash_attn: true +tokenizer_name: /model +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 nc mix file + # WIP + + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # Safety datasets: + ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 # need to increase to 8k +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index e604d0790..c57e4747a 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -19,7 +19,7 @@ dataset_mixer: # # Math datasets: # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # 49980 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 diff --git a/scripts/submit_finetune_job.py b/scripts/submit_finetune_job.py index c55eb184a..9ab0330de 100644 --- a/scripts/submit_finetune_job.py +++ b/scripts/submit_finetune_job.py @@ -166,7 +166,7 @@ def parse_args(args): d['tasks'][0]['arguments'][0] = new_arguments # name and description - exp_name = f"open_instruct_finetune_{model_name}_{now}" + exp_name = f"open_instruct_finetune_{model_name}_{now}"[:128] d['description'] = exp_name d['tasks'][0]['name'] 
= exp_name From 67d05a47a2ac23da8a3e9e821a77fb2a35be711f Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 5 Nov 2024 08:42:20 -1000 Subject: [PATCH 37/41] update --- configs/merge_configs/my-merge-config.yaml | 36 ++++++++++--------- ...u3_70b_preview_mix_v3.9-noncommercial.yaml | 3 +- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index f558a29af..37819fb0d 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,26 +1,28 @@ merge_method: linear normalize: true models: - # - name: deepseek-math-7b-instruct - # location: huggingface - # path: deepseek-ai/deepseek-math-7b-instruct - # weight: 0.5 - # - name: deepseek-coder-7b-instruct-v1.5 - # location: huggingface - # path: deepseek-ai/deepseek-coder-7b-instruct-v1.5 - # weight: 0.5 - - name: L3.1-8B-v3.9-nc-1 + - name: L3.1-8B-v3.9-nc-fixed-2 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-1/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ wekaBucket: "oe-adapt-default" weight: 1.0 - - name: L3.1-8B-v3.9-nc-2 + - name: L3.1-8B-v3.9-nc-fixed-3 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-2/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-3/ wekaBucket: "oe-adapt-default" weight: 1.0 - # - name: L3.1-8B-v3.9-nc-3 - # location: weka - # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-3/ - # wekaBucket: "oe-adapt-default" - # weight: 1.0 \ No newline at end of file + - name: L3.1-8B-v3.9-nc-fixed-1 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-1/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-8B-v3.9-nc-fixed-5 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-5/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-8B-v3.9-nc-fixed-4 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-4/ + wekaBucket: "oe-adapt-default" + weight: 1.0 \ No newline at end of file diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml index 68bb81199..3eba17fbe 100644 --- a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -55,6 +55,7 @@ with_tracking: true report_to: - wandb logging_steps: 1 -checkpointing_steps: epoch dataset_mix_dir: /output/ +checkpointing_steps: 1000 +keep_last_n_checkpoints: 20 gradient_checkpointing: true From 205c2f62408ab45678851f4fcd50ff2bb65b6458 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 5 Nov 2024 08:43:31 -1000 Subject: [PATCH 38/41] final configs --- .../sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml | 5 +---- .../sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml | 5 +---- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml index 3eba17fbe..0f2f595ff 100644 --- a/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml +++ 
b/configs/train_configs/sft/tulu3_70b_preview_mix_v3.9-noncommercial.yaml @@ -4,9 +4,6 @@ use_flash_attn: true tokenizer_name: meta-llama/Llama-3.1-70B use_slow_tokenizer: true dataset_mixer: - # Static v3.9 nc mix file - # WIP - # Static v3.9 huggingface dataset allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 @@ -44,7 +41,7 @@ dataset_mixer: max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 1 # effective batch size 128 with 8 nodes +gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes learning_rate: 2.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index c57e4747a..12b737ec3 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -4,9 +4,6 @@ use_flash_attn: true tokenizer_name: meta-llama/Llama-3.1-8B use_slow_tokenizer: true dataset_mixer: - # Static v3.9 nc mix file - # WIP - # Static v3.9 huggingface dataset allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 @@ -41,7 +38,7 @@ dataset_mixer: # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 -max_seq_length: 4096 # need to increase to 8k +max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes From bc2aec83b56fb3198b8692f3b0194c3a6bb07d53 Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Tue, 12 Nov 2024 19:00:32 -0500 Subject: [PATCH 39/41] update --- .../default_finetune_multinode.yaml | 2 +- configs/merge_configs/70b-soup.yaml | 18 ++++++ configs/merge_configs/my-merge-config.yaml | 55 ++++++++++-------- .../olmo_7b_preview_mix_v3.9-no-safety.yaml | 56 +++++++++++++++++++ ...lu3_8b_preview_mix_v3.9-noncommercial.yaml | 54 +++++++++--------- oe-eval-internal | 2 +- 6 files changed, 135 insertions(+), 52 deletions(-) create mode 100644 configs/merge_configs/70b-soup.yaml create mode 100644 configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index 380cc0c76..aa8751559 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -15,7 +15,7 @@ tasks: '/bin/sh', '-c' ] arguments: [' - unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + unset CUDA_LAUNCH_BLOCKING && pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch --mixed_precision bf16 --num_machines 4 --num_processes 32 diff --git a/configs/merge_configs/70b-soup.yaml b/configs/merge_configs/70b-soup.yaml new file mode 100644 index 000000000..6ba42cf09 --- /dev/null +++ b/configs/merge_configs/70b-soup.yaml @@ -0,0 +1,18 @@ +merge_method: linear +normalize: true +models: + # - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2 + location: weka + path:
/oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-2/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3 + location: weka + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3/ + wekaBucket: "oe-adapt-default" + weight: 1.0 \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index 37819fb0d..b77fec8f9 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,28 +1,37 @@ merge_method: linear normalize: true models: - - name: L3.1-8B-v3.9-nc-fixed-2 + - name: llama-3.1-8b-resized + location: huggingface + path: ai2-adapt-dev/llama-3.1-8b-resized + weight: 0.5 + - name: L3.1-8B-v3.9-nc-fixed-soup-best_2 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ + path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-best_2/ wekaBucket: "oe-adapt-default" - weight: 1.0 - - name: L3.1-8B-v3.9-nc-fixed-3 - location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-3/ - wekaBucket: "oe-adapt-default" - weight: 1.0 - - name: L3.1-8B-v3.9-nc-fixed-1 - location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-1/ - wekaBucket: "oe-adapt-default" - weight: 1.0 - - name: L3.1-8B-v3.9-nc-fixed-5 - location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-5/ - wekaBucket: "oe-adapt-default" - weight: 1.0 - - name: L3.1-8B-v3.9-nc-fixed-4 - location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-4/ - wekaBucket: "oe-adapt-default" - weight: 1.0 \ No newline at end of file + weight: 0.5 + # - name: L3.1-8B-v3.9-nc-fixed-2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-3 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-3/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-1 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-1/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-5 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-5/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 + # - name: L3.1-8B-v3.9-nc-fixed-4 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-4/ + # wekaBucket: "oe-adapt-default" + # weight: 1.0 \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml new file mode 100644 index 000000000..2d76adf22 --- /dev/null +++ b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml @@ -0,0 +1,56 @@ +model_name_or_path: meta-llama/Llama-3.1-8B +model_revision: main +use_flash_attn: true +tokenizer_name: meta-llama/Llama-3.1-8B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + # allenai/tulu-v.3.9-mix-preview-noncommercial: 0.05 + + # General datasets: + 
ai2-adapt-dev/oasst1_converted: 0.05 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 0.05 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 0.05 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 0.05 + + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 0.05 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 0.05 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 0.05 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 0.05 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 0.05 + + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 0.05 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 0.05 # 107276 # all + + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 0.05 # 29980 # all + + # Safety datasets: + # ai2-adapt-dev/coconot_converted: 0.05 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 0.05 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 0.05 + + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 0.05 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 0.05 + ai2-adapt-dev/tulu_v3.9_aya_100k: 0.05 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index 12b737ec3..8b4fb0fab 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -5,43 +5,43 @@ tokenizer_name: meta-llama/Llama-3.1-8B use_slow_tokenizer: true dataset_mixer: # Static v3.9 huggingface dataset - allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + # allenai/tulu-v.3.9-mix-preview-noncommercial: 0.05 - # # General datasets: - # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all - # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all - # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all - # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all - # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + # General datasets: + ai2-adapt-dev/oasst1_converted: 0.05 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 0.05 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 0.05 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 0.05 - # # Math datasets: - # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all - # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 - # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 0.05 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 0.05 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 0.05 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 0.05 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 0.05 - # # Coding datasets: - # 
ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all - # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 0.05 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 0.05 # 107276 # all - # # IF datasets: - # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 0.05 # 29980 # all - # # Safety datasets: - # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all - # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 - # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + # Safety datasets: + ai2-adapt-dev/coconot_converted: 0.05 # 10983 # all + ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 0.05 + ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 0.05 - # # Specialty datasets: - # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 - # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 - # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + # Specialty datasets: + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 0.05 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 0.05 + ai2-adapt-dev/tulu_v3.9_aya_100k: 0.05 max_seq_length: 4096 preprocessing_num_workers: 128 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 2 # effective batch size 128 with 4 nodes +gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/oe-eval-internal b/oe-eval-internal index 15589625f..7936aa511 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit 15589625f17ead946fae8f462904484a06277f32 +Subproject commit 7936aa51128ae671bcae47aa50422e7f98c2fe39 From a0fc16f3f97dbf2ba3cfc93130e0f8a8477b5d1a Mon Sep 17 00:00:00 2001 From: jacob-morrison Date: Sun, 17 Nov 2024 12:27:30 -0800 Subject: [PATCH 40/41] update my branch with garbo --- configs/beaker_configs/default_dpo.yaml | 2 +- .../default_finetune_multinode.yaml | 8 +- .../default_finetune_multinode_olmo.yaml | 78 +++++ .../default_finetune_multinode_olmoe.yaml | 78 +++++ .../beaker_configs/default_finetune_olmo.yaml | 65 ++++ configs/merge_configs/my-merge-config.yaml | 25 +- configs/train_configs/dpo/olmoe_dpo_test.yaml | 37 ++ .../olmo_7b_preview_mix_v3.9-no-safety.yaml | 46 +-- configs/train_configs/sft/olmoe_v3.9.yaml | 52 +++ ...2.5_7b_preview_mix_v3.9-noncommercial.yaml | 56 +++ ...lu3_8b_preview_mix_v3.9-noncommercial.yaml | 54 +-- downsampling.pdf | Bin 0 -> 17087 bytes oe-eval-internal | 2 +- scripts/plot-downsampling.py | 219 ++++++++++++ scripts/table-script.py | 331 ++++++++++++++++++ 15 files changed, 990 insertions(+), 63 deletions(-) create mode 100644 configs/beaker_configs/default_finetune_multinode_olmo.yaml create mode 100644 configs/beaker_configs/default_finetune_multinode_olmoe.yaml create mode 100644 configs/beaker_configs/default_finetune_olmo.yaml create mode 100644 configs/train_configs/dpo/olmoe_dpo_test.yaml create mode 100644 configs/train_configs/sft/olmoe_v3.9.yaml create mode 100644 configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml create mode 100644 downsampling.pdf create mode 100644 scripts/plot-downsampling.py create mode 100644 scripts/table-script.py diff --git a/configs/beaker_configs/default_dpo.yaml b/configs/beaker_configs/default_dpo.yaml index 694ca8e33..08eacd1a7 100644 --- 
a/configs/beaker_configs/default_dpo.yaml +++ b/configs/beaker_configs/default_dpo.yaml @@ -8,7 +8,7 @@ tasks: command: [ '/bin/sh', '-c' ] - arguments: ['PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + arguments: ['pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch --mixed_precision bf16 --num_machines 1 --num_processes 4 diff --git a/configs/beaker_configs/default_finetune_multinode.yaml b/configs/beaker_configs/default_finetune_multinode.yaml index aa8751559..7b10fa8c7 100644 --- a/configs/beaker_configs/default_finetune_multinode.yaml +++ b/configs/beaker_configs/default_finetune_multinode.yaml @@ -15,7 +15,7 @@ tasks: '/bin/sh', '-c' ] arguments: [' - unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && accelerate launch + unset CUDA_LAUNCH_BLOCKING && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch --mixed_precision bf16 --num_machines 4 --num_processes 32 @@ -66,9 +66,9 @@ tasks: - mountPath: /oe-adapt-default source: weka: oe-adapt-default - - mountPath: /model - source: - beaker: jacobm/llama-3.1-8b + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b result: path: /output resources: diff --git a/configs/beaker_configs/default_finetune_multinode_olmo.yaml b/configs/beaker_configs/default_finetune_multinode_olmo.yaml new file mode 100644 index 000000000..b42800cee --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmo.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_multinode_olmoe.yaml b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml new file mode 100644 index 000000000..771dc41b7 --- /dev/null +++ b/configs/beaker_configs/default_finetune_multinode_olmoe.yaml @@ -0,0 +1,78 @@ +version: v2 +description: open-instruct-finetune-multinode +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune-multinode + replicas: 4 + leaderSelection: true + hostNetworking: true + propagateFailure: true + propagatePreemption: true + synchronizedStartTimeout: 60m + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: [' + unset CUDA_LAUNCH_BLOCKING && pip install --upgrade transformers && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 4 + --num_processes 32 + --machine_rank $BEAKER_REPLICA_RANK + --main_process_ip $BEAKER_LEADER_REPLICA_HOSTNAME + --main_process_port 29400 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + --deepspeed_multinode_launcher standard + open_instruct/finetune.py + --model_name_or_path meta-llama/Meta-Llama-3-8B + --tokenizer_name meta-llama/Meta-Llama-3-8B + --use_slow_tokenizer + --use_flash_attn + --max_seq_length 4096 + --preprocessing_num_workers 16 + --per_device_train_batch_size 1 + --gradient_accumulation_steps 4 + --learning_rate 5e-6 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + --reduce_loss sum + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + # - mountPath: /model + # source: + # beaker: jacobm/llama-3.1-8b + result: + path: /output + resources: + gpuCount: 8 + context: + priority: normal + preemptible: true \ No newline at end of file diff --git a/configs/beaker_configs/default_finetune_olmo.yaml b/configs/beaker_configs/default_finetune_olmo.yaml new file mode 100644 index 000000000..7a3236d17 --- /dev/null +++ b/configs/beaker_configs/default_finetune_olmo.yaml @@ -0,0 +1,65 @@ +version: v2 +description: open-instruct-finetune +budget: ai2/oe-adapt +tasks: + - name: open-instruct-finetune + image: + beaker: nathanl/open_instruct_auto + command: [ + '/bin/sh', '-c' + ] + arguments: ['pip install git+https://github.com/vwxyzjn/transformers.git@olmo1124_classification && PYTHONPATH="/stage:$PYTHONPATH" accelerate launch + --mixed_precision bf16 + --num_machines 1 + --num_processes 4 + --use_deepspeed + --deepspeed_config_file configs/ds_configs/stage3_no_offloading_accelerate.conf + open_instruct/finetune.py + --model_name_or_path /hf_llama_models + --use_flash_attn + --max_seq_length 2048 + --preprocessing_num_workers 16 + --per_device_train_batch_size 2 + --gradient_accumulation_steps 16 + --learning_rate 2e-5 + --lr_scheduler_type linear + --warmup_ratio 0.03 + --weight_decay 0. 
+ --num_train_epochs 2 + --output_dir /output/ + --with_tracking + --report_to tensorboard + --logging_steps 1 + '] + envVars: + - name: CUDA_DEVICE_ORDER + value: PCI_BUS_ID + - name: TRANSFORMERS_CACHE + value: ./cache/ + - name: WANDB_API_KEY + secret: jacobm_WANDB_API_KEY + - name: WANDB_PROJECT + value: open-instruct + - name: WANDB_WATCH + value: false + - name: WANDB_LOG_MODEL + value: false + - name: WANDB_DISABLED + value: true + - name: HF_TOKEN + secret: jacobm_HF_TOKEN + datasets: + - mountPath: /oe-adapt-default + source: + weka: oe-adapt-default + - mountPath: /oe-training-default + source: + weka: oe-training-default + result: + path: /output + resources: + gpuCount: 4 + context: + cluster: ai2/allennlp-cirrascale + priority: high + preemptible: false \ No newline at end of file diff --git a/configs/merge_configs/my-merge-config.yaml b/configs/merge_configs/my-merge-config.yaml index b77fec8f9..3c7246a6d 100644 --- a/configs/merge_configs/my-merge-config.yaml +++ b/configs/merge_configs/my-merge-config.yaml @@ -1,15 +1,26 @@ merge_method: linear normalize: true models: - - name: llama-3.1-8b-resized - location: huggingface - path: ai2-adapt-dev/llama-3.1-8b-resized - weight: 0.5 - - name: L3.1-8B-v3.9-nc-fixed-soup-best_2 + # - name: llama-3.1-8b-resized + # location: huggingface + # path: ai2-adapt-dev/llama-3.1-8b-resized + # weight: 0.5 + # - name: L3.1-8B-v3.9-nc-fixed-soup-best_2 + # location: weka + # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-best_2/ + # wekaBucket: "oe-adapt-default" + # weight: 0.5 + + - name: gsm_math_if_valpy_best_overall_avg_8b_beta0.05-step_200 + location: weka + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_overall_avg_8b_beta0.05_checkpoints/step_200/ + wekaBucket: "oe-adapt-default" + weight: 1.0 + - name: gsm_math_if_valpy_best_and_if_avg_8b_beta0.05-step_200 location: weka - path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-best_2/ + path: /oe-adapt-default/hamishi/model_checkpoints/gsm_math_if_valpy_best_and_if_avg_8b_beta0.05_checkpoints/step_200/ wekaBucket: "oe-adapt-default" - weight: 0.5 + weight: 1.0 # - name: L3.1-8B-v3.9-nc-fixed-2 # location: weka # path: /oe-adapt-default/jacobm/tulu-3-dev/checkpoints/base_models/L3.1-8B-v3.9-nc-fixed-2/ diff --git a/configs/train_configs/dpo/olmoe_dpo_test.yaml b/configs/train_configs/dpo/olmoe_dpo_test.yaml new file mode 100644 index 000000000..bed7f3037 --- /dev/null +++ b/configs/train_configs/dpo/olmoe_dpo_test.yaml @@ -0,0 +1,37 @@ +model_name_or_path: /model +tokenizer_name: /model +model_revision: main +use_flash_attn: true +gradient_checkpointing: true +dataset_mixer: + # ai2-adapt-dev/sft_v3.9_used_off_policy: 1.0 + # ai2-adapt-dev/sft_v3.9_used_on_policy_large_70b_ckpt: 1.0 + # ai2-adapt-dev/DaringAnteater-prefs-RM-filter-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/WildChat-prefs-280824-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + # ai2-adapt-dev/Llama-3.1-if_taxonomy_tulu-uf-pipeline-regen-v3.9_large_70b_ckpt: 1.0 + ai2-adapt-dev/wildchat_v3.9_unused_off_policy: 1.0 + + ai2-adapt-dev/sft_v3.9_used_p0_olmoe-1b-7b: 1.0 + ai2-adapt-dev/sft_v3.9_used_p1_olmoe-1b-7b: 1.0 + ai2-adapt-dev/daring_anteater_olmoe-1b-7b: 1.0 + ai2-adapt-dev/wildchat-prefs-280824_olmoe-1b-7b: 1.0 + ai2-adapt-dev/llama3.1-if_taxonomy_tulu_olmoe-1b-7b: 1.0 +use_slow_tokenizer: true +max_seq_length: 2048 +preprocessing_num_workers: 16 +per_device_train_batch_size: 2 
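+# effective batch size here: 2 per device x 8 GPUs x 8 accumulation steps = 128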
+gradient_accumulation_steps: 8 # designed for 8 GPUs, so batch size 128 +learning_rate: 5.0e-7 +lr_scheduler_type: linear +warmup_ratio: 0.1 +weight_decay: 0.0 +num_train_epochs: 1 +output_dir: /output +with_tracking: true +report_to: + - wandb +logging_steps: 1 +use_lora: false +dpo_loss_type: dpo_norm +dpo_beta: 5 +checkpointing_steps: 1000 \ No newline at end of file diff --git a/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml index 2d76adf22..9fe931118 100644 --- a/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml +++ b/configs/train_configs/sft/olmo_7b_preview_mix_v3.9-no-safety.yaml @@ -1,47 +1,47 @@ -model_name_or_path: meta-llama/Llama-3.1-8B -model_revision: main +model_name_or_path: allenai/open_instruct_dev +model_revision: peteish7-anneal-from-928646-50B-nowup-moremath-dclm07-fw2-olmo_1124 use_flash_attn: true -tokenizer_name: meta-llama/Llama-3.1-8B +tokenizer_name: allenai/open_instruct_dev use_slow_tokenizer: true dataset_mixer: # Static v3.9 huggingface dataset - # allenai/tulu-v.3.9-mix-preview-noncommercial: 0.05 + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 # General datasets: - ai2-adapt-dev/oasst1_converted: 0.05 # 7132 # all - ai2-adapt-dev/flan_v2_converted: 0.05 # 89982 # all + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all - ai2-adapt-dev/no_robots_converted: 0.05 # 9500 # all - ai2-adapt-dev/tulu_v3.9_wildchat_100k: 0.05 + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 # Math datasets: - ai2-adapt-dev/personahub_math_v5_regen_149960: 0.05 # 149960 # all - allenai/tulu-3-sft-personas-math-grade: 0.05 # 49980 # all - ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 0.05 - ai2-adapt-dev/numinamath_tir_math_decontaminated: 0.05 - ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 0.05 + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 # Coding datasets: - ai2-adapt-dev/personahub_code_v2_34999: 0.05 # 34999 # all - ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 0.05 # 107276 # all + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all # IF datasets: - ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 0.05 # 29980 # all + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all # Safety datasets: - # ai2-adapt-dev/coconot_converted: 0.05 # 10983 # all - # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 0.05 - # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 0.05 + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 # Specialty datasets: - ai2-adapt-dev/tulu_v3.9_sciriff_10k: 0.05 - ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 0.05 - ai2-adapt-dev/tulu_v3.9_aya_100k: 0.05 + ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 max_seq_length: 4096 preprocessing_num_workers: 128 
per_device_train_batch_size: 1 # note, this is set up for 8 GPUs -gradient_accumulation_steps: 16 # effective batch size 128 with 4 nodes +gradient_accumulation_steps: 4 # effective batch size 128 with 4 nodes learning_rate: 5.0e-06 lr_scheduler_type: linear warmup_ratio: 0.03 diff --git a/configs/train_configs/sft/olmoe_v3.9.yaml b/configs/train_configs/sft/olmoe_v3.9.yaml new file mode 100644 index 000000000..c4b61014c --- /dev/null +++ b/configs/train_configs/sft/olmoe_v3.9.yaml @@ -0,0 +1,52 @@ +model_name_or_path: allenai/OLMoE-1B-7B-0924 +model_revision: main +tokenizer_name: allenai/OLMoE-1B-7B-0924 +use_slow_tokenizer: true +dataset_mixer: + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 +max_seq_length: 4096 # Note, reduced from 8192 to fit on one GPU with DeepSpeed Stage3 +preprocessing_num_workers: 128 +per_device_train_batch_size: 2 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 1 # effective batch size 128 with 4 nodes +learning_rate: 2.0e-05 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +dataset_mix_dir: /output/ +checkpointing_steps: epoch +# keep_last_n_checkpoints: 1 +# load_balancing_loss: false # TODO: set to false +# load_balancing_weight: 0.5 +add_bos: true \ No newline at end of file diff --git a/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml new file mode 100644 index 000000000..c5c8b488d --- /dev/null +++ b/configs/train_configs/sft/qwen2.5_7b_preview_mix_v3.9-noncommercial.yaml @@ -0,0 +1,56 @@ +model_name_or_path: Qwen/Qwen2.5-Math-7B +model_revision: main +use_flash_attn: true +tokenizer_name: Qwen/Qwen2.5-Math-7B +use_slow_tokenizer: true +dataset_mixer: + # Static v3.9 huggingface dataset + allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + + # # General datasets: + # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + + # # Math datasets: + # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + # 
ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + + # # Coding datasets: + # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + + # # IF datasets: + # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + + # # Safety datasets: + # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all + # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0 + # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0 + + # # Specialty datasets: + # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0 + # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0 + # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0 + +max_seq_length: 4096 +preprocessing_num_workers: 128 +per_device_train_batch_size: 1 # note, this is set up for 8 GPUs +gradient_accumulation_steps: 4 # effective batch size 128 with 8 nodes +learning_rate: 5.0e-06 +lr_scheduler_type: linear +warmup_ratio: 0.03 +weight_decay: 0.0 +num_train_epochs: 2 +output_dir: /output/ +with_tracking: true +report_to: + - wandb +logging_steps: 1 +checkpointing_steps: epoch +dataset_mix_dir: /output/ diff --git a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml index 5f72adf2f..84a523f4e 100644 --- a/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml +++ b/configs/train_configs/sft/tulu3_8b_preview_mix_v3.9-noncommercial.yaml @@ -5,43 +5,43 @@ tokenizer_name: meta-llama/Llama-3.1-8B use_slow_tokenizer: true dataset_mixer: # Static v3.9 huggingface dataset - allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 + # allenai/tulu-v.3.9-mix-preview-noncommercial: 1.0 - # # General datasets: - # ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all - # ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all - # ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all - # ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all - # ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 + # General datasets: + ai2-adapt-dev/oasst1_converted: 1.0 # 7132 # all + ai2-adapt-dev/flan_v2_converted: 1.0 # 89982 # all + ai2-adapt-dev/tulu_hard_coded_repeated_10: 1.0 # 240 # all + ai2-adapt-dev/no_robots_converted: 1.0 # 9500 # all + ai2-adapt-dev/tulu_v3.9_wildchat_100k: 1.0 - # # Math datasets: - # ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all - # allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all - # ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 - # ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 - # ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 + # Math datasets: + ai2-adapt-dev/personahub_math_v5_regen_149960: 1.0 # 149960 # all + allenai/tulu-3-sft-personas-math-grade: 1.0 # 49980 # all + ai2-adapt-dev/tulu_v3.9_open_math_2_gsm8k_50k: 1.0 + ai2-adapt-dev/numinamath_tir_math_decontaminated: 1.0 + ai2-adapt-dev/tulu_v3.9_personahub_math_interm_algebra_20k: 1.0 - # # Coding datasets: - # ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all - # ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all + # Coding datasets: + ai2-adapt-dev/personahub_code_v2_34999: 1.0 # 34999 # all + ai2-adapt-dev/evol_codealpaca_heval_decontaminated: 1.0 # 107276 # all - # # IF datasets: - # ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all + # IF datasets: + ai2-adapt-dev/personahub_ifdata_manual_seed_v3_29980: 1.0 # 29980 # all - # # 
Safety datasets:
-  # ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all
-  # ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0
-  # ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0
+  # Safety datasets:
+  ai2-adapt-dev/coconot_converted: 1.0 # 10983 # all
+  ai2-adapt-dev/tulu_v3.9_wildjailbreak_decontaminated_50k: 1.0
+  ai2-adapt-dev/tulu_v3.9_synthetic_finalresp_wildguardmixtrain_decontaminated_50k: 1.0
 
-  # # Specialty datasets:
-  # ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0
-  # ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0
-  # ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0
+  # Specialty datasets:
+  ai2-adapt-dev/tulu_v3.9_sciriff_10k: 1.0
+  ai2-adapt-dev/tulu_v3.9_table_gpt_5k: 1.0
+  ai2-adapt-dev/tulu_v3.9_aya_100k: 1.0
 
 max_seq_length: 4096
 preprocessing_num_workers: 128
 per_device_train_batch_size: 1 # note, this is set up for 8 GPUs
-gradient_accumulation_steps: 2 # effective batch size 128 with 8 nodes
+gradient_accumulation_steps: 8 # effective batch size 128 with 8 nodes
 learning_rate: 5.0e-06
 lr_scheduler_type: linear
 warmup_ratio: 0.03
diff --git a/downsampling.pdf b/downsampling.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..8002b7a48adedee93a38a2a68242c4f33868fd05
GIT binary patch
literal 17087
[base85-encoded binary data omitted]
HcmV?d00001 diff --git a/oe-eval-internal b/oe-eval-internal index 7936aa511..4c104ac6b 160000 --- a/oe-eval-internal +++ b/oe-eval-internal @@ -1 +1 @@ -Subproject commit 7936aa51128ae671bcae47aa50422e7f98c2fe39 +Subproject commit 4c104ac6b4fd05d1d0f83d3d2e6a46eb77efc592 diff --git a/scripts/plot-downsampling.py b/scripts/plot-downsampling.py new file mode 100644 index 000000000..60278ff38 --- /dev/null +++ b/scripts/plot-downsampling.py @@ -0,0 +1,219 @@ +benchmark_data = { + "Avg.": { + "eval_setting": "", + "sft_5": 56.6, + "sft_10": 57.0, + "sft_25": 57.7, + "sft_50": 58.1, + "sft_75": 58.6, + "sft_full": 59.1 + }, + "MMLU": { + "eval_setting": "5 shot", + "sft_5": 64.1, + "sft_10": 63.9, + "sft_25": 63.4, + "sft_50": 62.3, + "sft_75": 62.1, + "sft_full": 62.1 + }, + "TruthfulQA": { + "eval_setting": "6 shot", + "sft_5": 51.0, + "sft_10": 50.4, + "sft_25": 49.9, + "sft_50": 48.9, + "sft_75": 46.4, + "sft_full": 46.8 + }, + "PopQA": { + "eval_setting": "15 shot", + "sft_5": 30.8, + "sft_10": 30.8, + "sft_25": 29.8, + "sft_50": 30.1, + "sft_75": 29.6, + "sft_full": 29.3 + }, + "BigBenchHard": { + "eval_setting": "3 shot, CoT", + "sft_5": 67.5, + "sft_10": 68.2, + "sft_25": 68.5, + "sft_50": 67.6, + "sft_75": 69.7, + "sft_full": 68.8 + }, + "HumanEval": { + "eval_setting": "pass@10", + "sft_5": 81.5, + "sft_10": 81.5, + "sft_25": 81.4, + "sft_50": 84.4, + "sft_75": 86.7, + "sft_full": 86.2 + }, + "HumanEval+": { + "eval_setting": "pass@10", + "sft_5": 76.1, + "sft_10": 77.4, + "sft_25": 75.5, + "sft_50": 78.3, + "sft_75": 79.5, + "sft_full": 81.4 + }, + "GSM8K": { + "eval_setting": "8 shot, CoT", + "sft_5": 66.0, + "sft_10": 66.3, + "sft_25": 72.1, + "sft_50": 73.8, + "sft_75": 74.4, + "sft_full": 76.2 + }, + "DROP": { + "eval_setting": "3 shot", + "sft_5": 60.7, + "sft_10": 60.7, + "sft_25": 59.4, + "sft_50": 60.7, + "sft_75": 59.9, + "sft_full": 61.3 + }, + "MATH": { + "eval_setting": "4 shot CoT, Flex", + "sft_5": 29.3, + "sft_10": 28.7, + "sft_25": 30.0, + "sft_50": 30.9, + "sft_75": 31.7, + "sft_full": 31.5 + }, + "IFEval": { + "eval_setting": "Strict", + "sft_5": 65.4, + "sft_10": 68.6, + "sft_25": 70.6, + "sft_50": 68.2, + "sft_75": 70.6, + "sft_full": 72.8 + }, + "AlpacaEval 2": { + "eval_setting": "LC % win", + "sft_5": 11.1, + "sft_10": 10.2, + "sft_25": 11.7, + "sft_50": 13.3, + "sft_75": 12.4, + "sft_full": 12.4 + }, + "Safety": { + "eval_setting": "", + "sft_5": 75.3, + "sft_10": 77.8, + "sft_25": 79.9, + "sft_50": 79.1, + "sft_75": 80.2, + "sft_full": 80.2 + } +} + +import matplotlib.pyplot as plt +import numpy as np + +# Create x-axis values (SFT percentages) +x_values = [5, 10, 25, 50, 75, 100] # 100 represents full SFT + +# Create figure and axis with a larger size +plt.figure(figsize=(12, 8)) + +# Color palette for different lines +colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data))) + +# Plot each benchmark +for (benchmark, data), color in zip(benchmark_data.items(), colors): + if benchmark != "Avg.": # Skip the average for now + y_values = [ + data["sft_5"], + data["sft_10"], + data["sft_25"], + data["sft_50"], + data["sft_75"], + data["sft_full"] + ] + plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2) + +# Add the average line with higher emphasis +avg_values = [ + benchmark_data["Avg."]["sft_5"], + benchmark_data["Avg."]["sft_10"], + benchmark_data["Avg."]["sft_25"], + benchmark_data["Avg."]["sft_50"], + benchmark_data["Avg."]["sft_75"], + benchmark_data["Avg."]["sft_full"] +] +plt.plot(x_values, avg_values, 'k--', 
label='Average', linewidth=3, marker='s') + +# Customize the plot +plt.xlabel('SFT Percentage', fontsize=12) +plt.ylabel('Performance', fontsize=12) +plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14) +plt.grid(True, linestyle='--', alpha=0.7) +plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10) + +# Set x-axis ticks +plt.xticks(x_values) + +# Adjust layout to prevent label cutoff +plt.tight_layout() + +# Show the plot +plt.show() + +# Optional: Create a second plot focusing on specific benchmarks of interest +plt.figure(figsize=(12, 8)) + +# Define specific benchmarks and their colors +plot_config = { + 'Avg.': '#0a3235', # Black for average + 'TruthfulQA': '#b11bE8', # Coral red + 'HumanEval+': '#f0529c', # Turquoise + 'Safety': '#105257', # Light blue + 'GSM8K': '#0fcb8c' # Sage green +} + +# Plot each benchmark with its specified color +for benchmark, color in plot_config.items(): + data = benchmark_data[benchmark] + y_values = [ + data["sft_5"], + data["sft_10"], + data["sft_25"], + data["sft_50"], + data["sft_75"], + data["sft_full"] + ] + # Make average line dashed and thicker + if benchmark == 'Avg.': + plt.plot(x_values, y_values, '--', marker='s', label=benchmark, + color=color, linewidth=3) + else: + plt.plot(x_values, y_values, marker='o', label=benchmark, + color=color, linewidth=2) + +# Customize the focused plot +plt.xlabel('SFT Percentage', fontsize=12) +plt.ylabel('Performance', fontsize=12) +# plt.title('Selected Benchmark Performance Trends', fontsize=14) +plt.grid(True, linestyle='--', alpha=0.7) +plt.legend(fontsize=10) +plt.xticks(x_values) + +# Adjust layout +plt.tight_layout() + +# Show the plot +# plt.show() + +plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300) +plt.close() \ No newline at end of file diff --git a/scripts/table-script.py b/scripts/table-script.py new file mode 100644 index 000000000..ff199b035 --- /dev/null +++ b/scripts/table-script.py @@ -0,0 +1,331 @@ +import pandas as pd +import argparse +import sys + +""" +Examples: + +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models ppo_ray_β_0.03__3__1730357435 Meta-Llama-3.1-8B-Instruct hf-ministral_8b_instruct_2410 hf-qwen2_5_7b_instruct valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj hf-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct + +8B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models Meta-Llama-3.1-8B-Instruct hf-google_gemma-2-9b-it hf-NousResearch-Hermes-3-Llama-3.1-8B hf-qwen2_5_7b_instruct hf-ministral_8b_instruct_2410 L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525 dpo_tune___model__42__1729311739 ppo_ray_β_0.03__3__1730357435 + +70B +python -m plots.table-tulu3-90 --csv-path leaderboard/exported_results.csv --models hf-Meta-Llama-3.1-70B-Instruct hf-qwen2_5_72b_instruct hf-NousResearch-Hermes-3-Llama-3.1-70B hf-llama_3_1_nemotron_70B_instruct_hf L3.1-70B-v3.8-lr_2e-6-2_epochs 70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118 L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7 + +Merging example: +python table-tulu3.py --csv-path ~/Downloads/exported_results_4.csv --models L3.1-8B-v3.8-nc-soup L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817 L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671 L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678 +""" + +model_label_conversion = { + # llamas + "Meta-Llama-3.1-8B-Instruct": "Llama 3.1 8B Instruct", + "hf-Llama-3.1-70B-Instruct": "Llama 3.1 70B Instruct", + "hf-Meta-Llama-3.1-70B-Instruct": 
"Llama 3.1 70B Instruct", + # + "hf-llama-3-tulu-2-8b": "Tulu 2 SFT", + "hf-llama-3-tulu-2-dpo-8b": "Tulu 2 + DPO", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu 3 SFT", + "L3.1-8B-v3.8-wip-persona_code_v3-2-pif_dpo___model__42__1729725103": "Tulu 3 + DPO", + "ljrmvalue_lj_gsm_data_step_300": "Tulu 3 + RL", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama_3_tulu_2_dpo_70b": "Tulu 2 + DPO 70B", + "L3.1-70B-v3.7-nc": "Tulu 3 70B SFT", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-ministral_8b_instruct_2410": "Ministral 8B", + "hf-magpielm_8b_chat_v0_1": "Magpie 8B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + "L3.1-8B-v3.8-nc-soup-pif_dpo-soup": "Tulu 3 + Merging + DPO", + "L3.1-8B-v3.8-nc-soup": "Tulu 3 SFT Merge", + "L3.1-8B-v3.9-nc-3__meta-llama_Llama-3.1-8B__456__1730332817": "Seed 1", + "L3.1-8B-v3.9-nc-2__meta-llama_Llama-3.1-8B__123__1730333671": "Seed 2", + "L3.1-8B-v3.9-nc__meta-llama_Llama-3.1-8B__42__1730330678": "Seed 3", + # random SFT mixes + "fae_llama3_sftmix_v3.4_personahub_if_v1__meta-llama_Meta-Llama-3-8B__42__1728059424": "Tulu v3.4 SFT", + "sft_preview_mix_v3.5.10__meta-llama_Llama-3.1-8B__42__1729148912": "Tulu v3.6 SFT", + "L3.18B-v3.7-c__meta-llama_Llama-3.1-8B__42__1729454073": "Tulu v3.7 SFT", + "L3.1-8B-v3.8-nc-final__meta-llama_Llama-3.1-8B__42__1729991287": "Tulu v3.8 SFT", + "L3.1-8B-v3.8-nc-soup": "Tulu v3.8 SFT + Merging", + "hf-llama_3_tulu_2_70b": "Tulu 2 SFT 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 3 DPO 70B", + "L3.1-70B-v3.8-lr_2e-6-2_epochs": "Tulu 3 SFT 70B", + # 7b rivals + "hf-qwen2_5_7b_instruct": "Qwen 2.5 7B Instruct", + "hf-ministral_8b_instruct_2410": "Ministral 8B Instruct", + "hf-google_gemma-2-9b-it": "Gemma 2 9B", + "hf-gemma_2_9b_it_simpo": "Gemma 2 9B SimPO", + # 70b rivalsqw + "hf-llama_3_1_nemotron_70b_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-llama_3_1_nemotron_70B_instruct_hf": "Nemotron Llama 3.1 70B", + "hf-qwen2_5_72b_instruct": "Qwen 2.5 72B", + # LMSYS version compare + "L3.18B-math-mix-final-nc__meta-llama_Llama-3.1-8B__42__1729284525": "Tulu 3 SFT", + "dpo_tune___model__42__1729311739": "Tulu 3 DPO", + "ppo_ray_β_0.03__3__1730357435": "Tulu 3 8B", + # 70b fine tunes + "L3.1-70B-v3.8-lr_2e-6-2_epochs-pif_dpo-5e-7": "Tulu 70B DPO", + "70B_ppo_ray_β_0.07_lr_1e-7__3__1730258118": "Tulu 70B RL", + "valpy_dpo_70b_hslj_uflj_dalj_wciflj_iftaxlj_wcunusedlj": "Tulu 3 70B", + "hf-NousResearch-Hermes-3-Llama-3.1-8B": "Hermes 3 8B", + "hf-llama-3-tulu-2-8b": "Tulu 2 8B SFT", + "L3.1-8B-v3.9-nc-fixed-2__meta-llama_Llama-3.1-8B__123__1730531285": "Tulu 3 8B SFT", + "hf-NousResearch-Hermes-3-Llama-3.1-70B": "Hermes 3 70B", + "hf-llama-3-tulu-2-70b": "Tulu 2 70B SFT", + "L3.1-70B-v3.9-nc-2e-6-2_ep-fixed-3__meta-llama_Llama-3.1-70B__456__1731059165": "Tulu 3 70B SFT", + "L3.1-8B-v3.9-nc-no-safety__meta-llama_Llama-3.1-8B__42__1731562927": "Tulu 3 8B SFT w/o Safety", + "L3.1-8B-v3.9-nc-no-wc__meta-llama_Llama-3.1-8B__42__1731562946": "Tulu 3 8B SFT w/o WildChat", + "L3.1-8B-v3.9-nc-no-synthetic__meta-llama_Llama-3.1-8B__42__1731613382": "Tulu 3 8B SFT w/o Synthetic Data (ours)", + "L3.1-8B-v3.9-nc-no-math__meta-llama_Llama-3.1-8B__42__1731562937": "Tulu 3 8B SFT w/o Mathematics", + "hf-RLHFlow-LLaMA3-SFT-v2": "RLHFlow SFT V2", + "hf-MAmmoTH2-8B": "MAmmoTH2 8B", + + # downsampling + "L3.1-8B-v3.9-nc-downsample-0.05__meta-llama_Llama-3.1-8B__42__1731214637": "Tulu 3 8B SFT (5\%)", + 
"L3.1-8B-v3.9-nc-downsample-0.10__meta-llama_Llama-3.1-8B__42__1731214619": "Tulu 3 8B SFT (10\%)", + "L3.1-8B-v3.9-nc-downsample-0.25__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (25\%)", + "L3.1-8B-v3.9-nc-downsample-0.50__meta-llama_Llama-3.1-8B__42__1731214572": "Tulu 3 8B SFT (50\%)", + "L3.1-8B-v3.9-nc-downsample-0.75__meta-llama_Llama-3.1-8B__42__1731214576": "Tulu 3 8B SFT (75\%)", +} + +# Metric keys definition +metric_keys = { + "MMLU": "mmlu:mc::tulu", + "TruthfulQA": "truthfulqa", + "PopQA": "popqa", + "BigBenchHard": "bbh:cot::tulu", + "HumanEval": "codex_humaneval", + "HumanEval+": "codex_humanevalplus", + "GSM8K": "gsm8k", + "DROP": "drop", + "MATH": "math::flex", + "IFEval": "ifeval", + "AlpacaEval 2": "alpaca_eval", + "Safety": "overall_oe_safety_average", +} + +eval_settings = { + "MMLU": "5 shot", + "TruthfulQA": "6 shot", + "PopQA": "15 shot", + "BigBenchHard": "3 shot, CoT", + "HumanEval": "pass@10", + "HumanEval+": "pass@10", + "GSM8K": "8 shot, CoT", + "DROP": "3 shot", + "MATH": "4 shot CoT, Flex", + "IFEval": "Strict", + "AlpacaEval 2": "LC \% win", + "Safety": "", +} + +# Change this to change the table size +AVERAGE_KEYS = [ + "alpaca_eval", + "bbh:cot::tulu", + "codex_humaneval", + "codex_humanevalplus", + "drop", + "gsm8k", + "ifeval", + "math::flex", + "mmlu:mc::tulu", + "popqa", + "truthfulqa", + "overall_oe_safety_average", +] + + +def parse_args(): + """Parse command line arguments.""" + parser = argparse.ArgumentParser( + description="Create a table of model performance metrics.", + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + + # Required arguments + parser.add_argument( + "--csv-path", required=True, help="Path to the CSV file containing the results" + ) + parser.add_argument( + "--models", + nargs="+", + required=True, + help="List of model names to generate table for", + ) + parser.add_argument( + "--markdown", + action="store_true", + help="Output in Markdown format instead of LaTeX", + ) + parser.add_argument( + "--extra_cols", + type=int, + default=0, + help="Number of extra columns to add to the table", + ) + + return parser.parse_args() + + +def format_value(value, markdown=False): + """Format a numeric value for table output.""" + if pd.isna(value): + return "N/A" + try: + return f"{float(value):.1f}" + except: + return "N/A" + + +def create_performance_table_rows(csv_path, model_names, markdown=False, extra_cols=0): + """ + Create performance table rows for the specified models. 
+ + Parameters: + csv_path (str): Path to the CSV file containing the results + model_names (list): List of model names to generate table for + markdown (bool): Whether to output in Markdown format + extra_cols (int): Number of extra columns to add to the table + """ + + try: + all_data = {} + df = pd.read_csv(csv_path) + rows = [] + + for model_name in model_names: + model_data = df[df["Model"] == model_name] + if len(model_data) == 0: + print(f"Warning: Model '{model_name}' not found in CSV file") + continue + + # Get pretty model name from conversion dictionary + pretty_name = model_label_conversion.get(model_name, model_name) + + # Replace "Tulu" with "\modelname" for LaTeX output only + if not markdown: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + + all_data[pretty_name] = {} + + # Calculate average + for key in AVERAGE_KEYS: + model_data[key] = model_data[key].apply( + lambda x: float(x) if x != "nan" else None + ) + average = model_data[AVERAGE_KEYS].mean(axis=1).iloc[0] + all_data[pretty_name]["Avg."] = format_value(average, markdown) + + # add all the eval scores + for metric_name, metric_key in metric_keys.items(): + value = model_data[metric_key].iloc[0] + all_data[pretty_name][metric_name] = format_value(value, markdown) + + for metric_name in ["Avg."] + list(metric_keys.keys()): + values = [metric_name] + if metric_name == "Avg.": + values.append("") + else: + values.append(f"\\small{{{eval_settings[metric_name]}}}") + for pretty_name in all_data.keys(): + values.append(all_data[pretty_name][metric_name]) + + values = ["-1" if i == "N/A" else i for i in values] + numbers = [float(v) for v in values[2:]] + max_index = numbers.index(max(numbers)) + 2 + values[max_index] = f"\\textbf{{{values[max_index]}}}" + + if markdown: + # Markdown table row with pretty name + r = f"| | {' | '.join(values)} |" + r += " |" * extra_cols + rows.append(r) + else: + # LaTeX table row with pretty name + r = f"{' & '.join(values)}" + r += " &" * extra_cols + r += " \\\\" + rows.append(r) + if metric_name == "Avg.": + rows.append("\\midrule") + + return rows + + except FileNotFoundError: + print(f"Error: Could not find CSV file at {csv_path}") + sys.exit(1) + except pd.errors.EmptyDataError: + print(f"Error: CSV file at {csv_path} is empty") + sys.exit(1) + + +def create_latex_table(model_names, extra_cols): + """Return the LaTeX table header.""" + header = """\\begin{table}[] +\\centering +\\setlength\\tabcolsep{5pt} +\\adjustbox{max width=\\linewidth}{ +""" + column_spec = "ll" + for model_name in model_names: + if "Tulu" in model_label_conversion[model_name]: + # P is defined via \newcolumntype{P}{>{\columncolor{ai2pink}}c} + column_spec += "l" + else: + column_spec += "c" + column_spec += "c" * extra_cols + + header += ( + """\\begin{NiceTabular}{@{}""" + + column_spec + + """@{}} +\\toprule +""" + ) + header += """\\textbf{Benchmark} & \\textbf{Eval Setting}""" + for model_name in model_names: + pretty_name = model_label_conversion.get(model_name, model_name) + if "Tulu" in pretty_name: + pretty_name = pretty_name.replace("Tulu ", "\\modelname~") + pretty_name = f"\\textbf{{{pretty_name}}}" + header += " & \\rotatebox{90}{" + pretty_name + "}" + for i in range(extra_cols): + header += " & " + header += """\\\\\\midrule""" + return header + + +def create_latex_footer(): + """Return the LaTeX table footer.""" + return """\\bottomrule +\\end{NiceTabular}} +\\vspace{3pt} +\\caption{TODO} +\\label{tab:TODO} +\\end{table}""" + + +def main(): + """Main function to run the script.""" 
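+    # Rows are wrapped in a LaTeX table header/footer unless --markdown is set.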
+    args = parse_args()
+
+    rows = create_performance_table_rows(
+        csv_path=args.csv_path,
+        model_names=args.models,
+        markdown=args.markdown,
+        extra_cols=args.extra_cols,
+    )
+
+    if not args.markdown:
+        print(create_latex_table(model_names=args.models, extra_cols=args.extra_cols))
+
+    for row in rows:
+        print(row)
+
+    if not args.markdown:
+        print(create_latex_footer())
+
+
+if __name__ == "__main__":
+    main()

From 9c0e76924d750ed5fc3c6f601bfe0d1112ee3088 Mon Sep 17 00:00:00 2001
From: jacob-morrison
Date: Thu, 21 Nov 2024 10:40:33 -0800
Subject: [PATCH 41/41] dumping changes, not necessary for release

---
 downsampling_bars.pdf        | Bin 0 -> 13628 bytes
 scripts/plot-downsampling.py | 268 +++++++++++++++++++++++------------
 scripts/plot-versions-sft.py | 177 +++++++++++++++++++++++
 tulu_version_bars.pdf        | Bin 0 -> 14040 bytes
 4 files changed, 356 insertions(+), 89 deletions(-)
 create mode 100644 downsampling_bars.pdf
 create mode 100644 scripts/plot-versions-sft.py
 create mode 100644 tulu_version_bars.pdf

diff --git a/downsampling_bars.pdf b/downsampling_bars.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..972ef4112a981d34261363e7d5e46b36d1d5e989
GIT binary patch
literal 13628
(binary PDF payload omitted)
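Before patch 41's plotting changes begin, a note on the table script above: its main() pulls csv_path, models, markdown, and extra_cols off parse_args(), which is defined earlier in that script and not shown in this patch series. A minimal sketch of what such a parser could look like, assuming argparse; the flag spellings here are guesses inferred from the attributes main() reads, not the actual definition:

import argparse

def parse_args():
    # Hypothetical reconstruction: flag names are inferred from the
    # attributes main() reads, not copied from the real parser.
    parser = argparse.ArgumentParser(
        description="Render benchmark results as a LaTeX or Markdown table"
    )
    parser.add_argument("--csv_path", required=True, help="CSV of results, one row per model")
    parser.add_argument("--models", nargs="+", required=True, help="Model names, in column order")
    parser.add_argument("--markdown", action="store_true", help="Emit Markdown instead of LaTeX")
    parser.add_argument("--extra_cols", type=int, default=0, help="Empty trailing columns to append")
    return parser.parse_args()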
diff --git a/scripts/plot-downsampling.py b/scripts/plot-downsampling.py
index 60278ff38..bc7b04b65 100644
--- a/scripts/plot-downsampling.py
+++ b/scripts/plot-downsampling.py
@@ -1,12 +1,12 @@
 benchmark_data = {
     "Avg.": {
         "eval_setting": "",
-        "sft_5": 56.6,
-        "sft_10": 57.0,
-        "sft_25": 57.7,
-        "sft_50": 58.1,
-        "sft_75": 58.6,
-        "sft_full": 59.1
+        "sft_5": 57.69,
+        "sft_10": 58.06,
+        "sft_25": 58.64,
+        "sft_50": 59.18,
+        "sft_75": 59.57,
+        "sft_full": 60.08
     },
     "MMLU": {
         "eval_setting": "5 shot",
@@ -35,6 +35,7 @@
         "sft_75": 29.6,
         "sft_full": 29.3
     },
+    # TODO: BBH IS NOT UP TO DATE!!!
"BigBenchHard": { "eval_setting": "3 shot, CoT", "sft_5": 67.5, @@ -109,12 +110,12 @@ }, "Safety": { "eval_setting": "", - "sft_5": 75.3, - "sft_10": 77.8, - "sft_25": 79.9, - "sft_50": 79.1, - "sft_75": 80.2, - "sft_full": 80.2 + "sft_5": 89.8, + "sft_10": 90.9, + "sft_25": 92.3, + "sft_50": 92.6, + "sft_75": 92.8, + "sft_full": 93.1 } } @@ -124,68 +125,99 @@ # Create x-axis values (SFT percentages) x_values = [5, 10, 25, 50, 75, 100] # 100 represents full SFT -# Create figure and axis with a larger size -plt.figure(figsize=(12, 8)) - -# Color palette for different lines -colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data))) - -# Plot each benchmark -for (benchmark, data), color in zip(benchmark_data.items(), colors): - if benchmark != "Avg.": # Skip the average for now - y_values = [ - data["sft_5"], - data["sft_10"], - data["sft_25"], - data["sft_50"], - data["sft_75"], - data["sft_full"] - ] - plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2) - -# Add the average line with higher emphasis -avg_values = [ - benchmark_data["Avg."]["sft_5"], - benchmark_data["Avg."]["sft_10"], - benchmark_data["Avg."]["sft_25"], - benchmark_data["Avg."]["sft_50"], - benchmark_data["Avg."]["sft_75"], - benchmark_data["Avg."]["sft_full"] -] -plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s') +# # Create figure and axis with a larger size +# plt.figure(figsize=(12, 8)) -# Customize the plot -plt.xlabel('SFT Percentage', fontsize=12) -plt.ylabel('Performance', fontsize=12) -plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14) -plt.grid(True, linestyle='--', alpha=0.7) -plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10) +# # Color palette for different lines +# colors = plt.cm.tab20(np.linspace(0, 1, len(benchmark_data))) -# Set x-axis ticks -plt.xticks(x_values) +# # Plot each benchmark +# for (benchmark, data), color in zip(benchmark_data.items(), colors): +# if benchmark != "Avg.": # Skip the average for now +# y_values = [ +# data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# plt.plot(x_values, y_values, marker='o', label=benchmark, color=color, linewidth=2) -# Adjust layout to prevent label cutoff -plt.tight_layout() +# # Add the average line with higher emphasis +# avg_values = [ +# benchmark_data["Avg."]["sft_5"], +# benchmark_data["Avg."]["sft_10"], +# benchmark_data["Avg."]["sft_25"], +# benchmark_data["Avg."]["sft_50"], +# benchmark_data["Avg."]["sft_75"], +# benchmark_data["Avg."]["sft_full"] +# ] +# plt.plot(x_values, avg_values, 'k--', label='Average', linewidth=3, marker='s') -# Show the plot -plt.show() +# # Customize the plot +# plt.xlabel('SFT Training Data Size', fontsize=12) +# plt.ylabel('Performance', fontsize=12) +# plt.title('Benchmark Performance Across Different SFT Percentages', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10) + +# # Set x-axis ticks +# plt.xticks(x_values) + +# # Adjust layout to prevent label cutoff +# plt.tight_layout() + +# # Show the plot +# plt.show() # Optional: Create a second plot focusing on specific benchmarks of interest -plt.figure(figsize=(12, 8)) - -# Define specific benchmarks and their colors -plot_config = { - 'Avg.': '#0a3235', # Black for average - 'TruthfulQA': '#b11bE8', # Coral red - 'HumanEval+': '#f0529c', # Turquoise - 'Safety': '#105257', # Light blue - 'GSM8K': '#0fcb8c' # Sage 
green -} +# plt.figure(figsize=(20, 8)) + +# Define benchmarks and SFT percentages +benchmarks = [ + 'Avg.', + 'GSM8K', + 'HumanEval+', + 'Safety', + 'TruthfulQA', +] +sft_percentages = ['5%', '10%', '25%', '50%', '75%', '100%'] +# colors = ['#0A2B35', '#0fcb8c', '#105257', '#f0529c', '#838383', '#0a3235'] # One color for each percentage +colors = [ + '#FAC4DD', # 10% + '#F8ADD0', # 20% + '#F697C3', # 40% + '#F480B6', # 60% + '#F269A9', # 80% + '#F0529C', # 100% - original pink +] + +colors = [ + "#E7EEEE", # RGB(231, 238, 238) + "#CEDCDD", # RGB(206, 220, 221) + "#B7CBCC", # RGB(183, 203, 204) + "#9FB9BB", # RGB(159, 185, 187) + "#88A8AB", # RGB(136, 168, 171) + "#F0529C", # PINK + "#6E979A", # RGB(110, 151, 154) + "#588689", # RGB(88, 134, 137) + "#3F7478", # RGB(63, 116, 120) + "#105257", # RGB(16, 82, 87) + "#0A3235", # RGB(10, 50, 53) +] + +# Set up the plot +fig, ax = plt.subplots(figsize=(20, 8)) + +# Width of bars and positions +width = 0.12 +n_percentages = len(sft_percentages) -# Plot each benchmark with its specified color -for benchmark, color in plot_config.items(): +# Create bars for each benchmark +for i, benchmark in enumerate(benchmarks): data = benchmark_data[benchmark] - y_values = [ + values = [ data["sft_5"], data["sft_10"], data["sft_25"], @@ -193,27 +225,85 @@ data["sft_75"], data["sft_full"] ] - # Make average line dashed and thicker - if benchmark == 'Avg.': - plt.plot(x_values, y_values, '--', marker='s', label=benchmark, - color=color, linewidth=3) - else: - plt.plot(x_values, y_values, marker='o', label=benchmark, - color=color, linewidth=2) - -# Customize the focused plot -plt.xlabel('SFT Percentage', fontsize=12) -plt.ylabel('Performance', fontsize=12) -# plt.title('Selected Benchmark Performance Trends', fontsize=14) -plt.grid(True, linestyle='--', alpha=0.7) -plt.legend(fontsize=10) -plt.xticks(x_values) - -# Adjust layout -plt.tight_layout() - -# Show the plot -# plt.show() + + # Calculate positions for this benchmark's group of bars + x = i + for j in range(n_percentages): + bar_position = x - (n_percentages-1)*width/2 + j*width + bar = ax.bar(bar_position, values[j], width, + label=sft_percentages[j] if i == 0 else "", + color=colors[j], + edgecolor="black") + + # Add value labels on top of bars + # ax.text(bar_position, values[j], f'{values[j]:.1f}', ha='center', va='bottom', fontsize=8) + +# Customize the plot +# ax.set_xlabel('Benchmarks', fontsize=14) +ax.set_ylabel('Performance', fontsize=18) +plt.tick_params(axis='y', labelsize=18) +# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14) + +# Set x-axis ticks and labels +ax.set_xticks(range(len(benchmarks))) +ax.set_xticklabels(benchmarks, ha="center", fontsize=18) + +ax.spines[["right", "top"]].set_visible(False) + +# Add legend +# ax.legend(title='SFT Sample Size', loc='center', bbox_to_anchor=(0.885, 0.8)) + +# Add grid +# ax.grid(True, linestyle='--', alpha=0.3, axis='y') + +# Adjust layout to accommodate legend +# plt.subplots_adjust(right=0.85) + +# Save and show the plot +plt.savefig('downsampling_bars.pdf', bbox_inches='tight', dpi=300) +plt.show() + +# # Define specific benchmarks and their colors +# plot_config = { +# 'Avg.': '#0a3235', # Black for average +# 'TruthfulQA': '#b11bE8', # Coral red +# 'HumanEval+': '#f0529c', # Turquoise +# 'Safety': '#105257', # Light blue +# 'GSM8K': '#0fcb8c' # Sage green +# } + +# # Plot each benchmark with its specified color +# for benchmark, color in plot_config.items(): +# data = benchmark_data[benchmark] +# y_values = [ +# 
data["sft_5"], +# data["sft_10"], +# data["sft_25"], +# data["sft_50"], +# data["sft_75"], +# data["sft_full"] +# ] +# # Make average line dashed and thicker +# if benchmark == 'Avg.': +# plt.plot(x_values, y_values, '--', marker='s', label=benchmark, +# color=color, linewidth=3) +# else: +# plt.plot(x_values, y_values, marker='o', label=benchmark, +# color=color, linewidth=2) + +# # Customize the focused plot +# plt.xlabel('SFT Percentage', fontsize=12) +# plt.ylabel('Performance', fontsize=12) +# # plt.title('Selected Benchmark Performance Trends', fontsize=14) +# plt.grid(True, linestyle='--', alpha=0.7) +# plt.legend(fontsize=10) +# plt.xticks(x_values) + +# # Adjust layout +# plt.tight_layout() + +# # Show the plot +# # plt.show() -plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300) -plt.close() \ No newline at end of file +# plt.savefig('downsampling.pdf', bbox_inches='tight', dpi=300) +# plt.close() \ No newline at end of file diff --git a/scripts/plot-versions-sft.py b/scripts/plot-versions-sft.py new file mode 100644 index 000000000..7947a1946 --- /dev/null +++ b/scripts/plot-versions-sft.py @@ -0,0 +1,177 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np + +# Create dictionary with all models +data = { + # "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": { + # "Rank": 1, "Average": 62.42, "alpaca_eval": 28.6, "BBH": 68.9, + # "codex_humaneval": 85.1, "codex_humanevalplus": 81.4, "drop": 61.3, + # "GSM8K": 82.3, "IFEval": 78.4, "MATH": 41.2, + # "mmlu:cot::summarize": float('nan'), "MMLU": 63.1, + # "Safety": 76.5, "popqa": 29.1, "truthfulqa": 54.9 + # }, + # "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": { + # "Rank": 2, "Average": 61.01, "alpaca_eval": 27.1, "BBH": 65.0, + # "codex_humaneval": 83.9, "codex_humanevalplus": 78.2, "drop": 58.0, + # "GSM8K": 82.7, "IFEval": 78.6, "MATH": 41.9, + # "mmlu:cot::summarize": float('nan'), "MMLU": 64.8, + # "Safety": 76.1, "popqa": 29.1, "truthfulqa": 48.5 + # }, + "Tülu v3.7": { + "Rank": 3, "Average": 60.48, "alpaca_eval": 13.7, "BBH": 67.8, + "codex_humaneval": 87.2, "codex_humanevalplus": 83.6, "drop": 60.6, + "GSM8K": 75.1, "IFEval": 72.5, "MATH": 32.6, + "mmlu:cot::summarize": 65.1, "MMLU": 63.8, + "Safety": 94.7, "popqa": 29.4, "truthfulqa": 44.7 + }, + "Tülu v3.8": { + "Rank": 4, "Average": 60.12, "alpaca_eval": 12.0, "BBH": 67.9, + "codex_humaneval": 85.8, "codex_humanevalplus": 81.1, "drop": 60.4, + "GSM8K": 77.2, "IFEval": 72.1, "MATH": 32.5, + "mmlu:cot::summarize": 65.3, "MMLU": 63.2, + "Safety": 93.5, "popqa": 29.3, "truthfulqa": 46.5 + }, + "Tülu v3.9": { + "Rank": 5, "Average": 60.08, "alpaca_eval": 12.4, "BBH": 67.9, + "codex_humaneval": 86.2, "codex_humanevalplus": 81.4, "drop": 61.3, + "GSM8K": 76.2, "IFEval": 72.8, "MATH": 31.5, + "mmlu:cot::summarize": float('nan'), "MMLU": 62.1, + "Safety": 93.1, "popqa": 29.3, "truthfulqa": 46.8 + }, + "Tülu v3.4": { + "Rank": 6, "Average": 56.79, "alpaca_eval": 11.4, "BBH": 65.3, + "codex_humaneval": 86.2, "codex_humanevalplus": 78.3, "drop": 55.8, + "GSM8K": 76.3, "IFEval": 52.9, "MATH": 25.5, + "mmlu:cot::summarize": 62.0, "MMLU": 64.8, + "Safety": 89.6, "popqa": 23.5, "truthfulqa": 51.9 + }, + "Tülu v3.1": { + "Rank": 7, "Average": 55.46, "alpaca_eval": 10.5, "BBH": 64.6, + "codex_humaneval": 83.8, "codex_humanevalplus": 80.8, "drop": 64.7, + "GSM8K": 74.5, "IFEval": 52.5, "MATH": 19.5, + "mmlu:cot::summarize": 63.7, "MMLU": 64.6, + "Safety": 70.3, "popqa": 31.4, "truthfulqa": 48.3 + }, + "Tülu v3.0": { 
+        "Rank": 8, "Average": 55.18, "alpaca_eval": 11.3, "BBH": 63.3,
+        "codex_humaneval": 85.4, "codex_humanevalplus": 81.2, "drop": 62.5,
+        "GSM8K": 72.9, "IFEval": 48.8, "MATH": 24.2,
+        "mmlu:cot::summarize": 62.8, "MMLU": 65.1,
+        "Safety": 68.0, "popqa": 31.2, "truthfulqa": 48.2
+    },
+    # "Tülu v3.2": {
+    #     "Rank": 9, "Average": 55.05, "alpaca_eval": 12.1, "BBH": 66.5,
+    #     "codex_humaneval": 84.2, "codex_humanevalplus": 79.7, "drop": 63.1,
+    #     "GSM8K": 73.1, "IFEval": 49.7, "MATH": 19.0,
+    #     "mmlu:cot::summarize": 63.7, "MMLU": 64.1,
+    #     "Safety": 68.9, "popqa": 31.6, "truthfulqa": 49.2
+    # },
+    # "hf-llama-3-tulu-2-dpo-8b": {
+    #     "Rank": 10, "Average": 49.49, "alpaca_eval": 14.1, "BBH": 57.3,
+    #     "codex_humaneval": 69.2, "codex_humanevalplus": 67.7, "drop": 58.3,
+    #     "GSM8K": 63.6, "IFEval": 48.8, "MATH": 13.5,
+    #     "mmlu:cot::summarize": float('nan'), "MMLU": 61.8,
+    #     "Safety": 57.9, "popqa": 24.6, "truthfulqa": 59.8
+    # },
+    "Tülu v2.0": {
+        "Rank": 11, "Average": 48.30, "alpaca_eval": 8.9, "BBH": 57.1,
+        "codex_humaneval": 66.9, "codex_humanevalplus": 63.1, "drop": 61.7,
+        "GSM8K": 60.4, "IFEval": 42.3, "MATH": 14.0,
+        "mmlu:cot::summarize": float('nan'), "MMLU": 61.8,
+        "Safety": 70.7, "popqa": 23.3, "truthfulqa": 49.4
+    }
+}
+
+# Replace this dictionary with your preferred hex colors for each model
+colors = {
+    "Tülu v2.0": "#F7C8E2",
+    "Tülu v3.0": "#E7EEEE", # RGB(231, 238, 238)
+    "Tülu v3.1": "#CEDCDD", # RGB(206, 220, 221)
+    "Tülu v3.4": "#9FB9BB",
+    "Tülu v3.7": "#88A8AB",
+    # "Tülu v3.2": "#000080",
+    "Tülu v3.8": "#6E979A",
+    # "Tülu v3.7": "#588689",
+    # "Tülu v3.8": "#3F7478",
+    "Tülu v3.9": "#F0529C",
+    "fae_dpo_on_L3.1-8B-v3.9-nc-fixed-2_add_shp___model__42__1730847906": "#00FF00",
+    "L3.1-8B-v3.9-nc-fixed-2-pif_uf_hs_dpo___model__42__1730613882": "#FF0000",
+    "hf-llama-3-tulu-2-dpo-8b": "#808000",
+}
+
+    # "#B7CBCC", # RGB(183, 203, 204)
+    # "#9FB9BB", # RGB(159, 185, 187)
+    # "#88A8AB", # RGB(136, 168, 171)
+    # "#6E979A", # RGB(110, 151, 154)
+    # "#588689", # RGB(88, 134, 137)
+    # "#3F7478", # RGB(63, 116, 120)
+    # "#105257", # RGB(16, 82, 87)
+    # "#0A3235", # RGB(10, 50, 53)
+    # "#F0529C", # PINK
+
+# Convert dictionary to DataFrame
+df = pd.DataFrame.from_dict(data, orient='index')
+
+# Metrics to plot: a hand-picked subset of the columns
+# metrics = [col for col in df.columns if col not in ['Rank']]
+
+metrics = [
+    "Average",
+    "BBH",
+    "GSM8K",
+    "IFEval",
+    "MATH",
+    "MMLU",
+    "Safety",
+]
+
+# Set up the plot
+fig, ax = plt.subplots(figsize=(15, 8))
+
+# Create the grouped bar chart
+# plt.figure(figsize=(20, 10))
+
+# Set the width of each bar and positions of the bars
+width = 0.08  # Reduced width to accommodate more bars
+x = np.arange(len(metrics))
+
+# Create bars for each model
+for i, (model, model_data) in enumerate(sorted(df.iterrows())):
+    plt.bar(x + i*width,
+            model_data[metrics],
+            width,
+            label=model.split('___')[0] if '___' in model else model,
+            color=colors[model],
+            edgecolor="black")
+
+# Customize the plot
+# plt.xlabel('Metrics', fontsize=12)
+# plt.ylabel('Score', fontsize=12)
+# plt.title('Model Performance Comparison Across Different Metrics', fontsize=14)
+
+# Customize the plot
+# ax.set_xlabel('Benchmarks', fontsize=14)
+ax.set_ylabel('Performance', fontsize=18)
+plt.tick_params(axis='y', labelsize=18)
+# ax.set_title('Performance by Benchmark and SFT Percentage', fontsize=14)
+
+# Set x-axis ticks and labels, centered under each group of bars
+ax.set_xticks(x + width * (len(df) - 1) / 2)
+ax.set_xticklabels(metrics, ha="center", fontsize=18)
+
+ax.spines[["right", "top"]].set_visible(False)
+
+# 
Add legend
+
+# plt.legend(bbox_to_anchor=(0.6, 0.75), loc='upper left')
+# plt.grid(True, alpha=0.3)
+
+# Adjust layout to prevent label cutoff
+plt.tight_layout()
+
+# Save and show the plot
+plt.savefig('tulu_version_bars.pdf', bbox_inches='tight', dpi=300)
+plt.show()
\ No newline at end of file
diff --git a/tulu_version_bars.pdf b/tulu_version_bars.pdf
new file mode 100644
index 0000000000000000000000000000000000000000..2f5f0809b895e48b7f786775f3a97e34349d891d
GIT binary patch
literal 14040
(binary PDF payload omitted)
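Finally, since plot-versions-sft.py hard-codes a precomputed "Average" alongside the per-benchmark scores, the two can drift apart when numbers are updated by hand. A quick hedged consistency check, assuming the average is meant to be the unweighted mean of the numeric benchmark columns (the script itself does not say how it was computed); the toy dict below stands in for the script's real data dict:

import pandas as pd

# Toy stand-in for the `data` dict in scripts/plot-versions-sft.py.
data = {
    "model A": {"Rank": 1, "Average": 70.0, "MMLU": 65.0, "GSM8K": 75.0},
    "model B": {"Rank": 2, "Average": 60.0, "MMLU": 55.0, "GSM8K": 64.0},
}

df = pd.DataFrame.from_dict(data, orient="index")
score_cols = [c for c in df.columns if c not in ("Rank", "Average")]
recomputed = df[score_cols].mean(axis=1)   # NaNs (missing evals) are skipped
print((recomputed - df["Average"]).abs())  # per-model disagreement; ~0 if consistent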