add LLMs Trainer
HarderThenHarder committed Aug 2, 2023
1 parent cf9eb46 commit 52d146d
Showing 38 changed files with 15,504 additions and 0 deletions.
Binary file added LLM/LLMsTrainer/assets/sampler_viewer.jpeg
18 changes: 18 additions & 0 deletions LLM/LLMsTrainer/configs/accelerate_configs/ds_stage1.yaml
@@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 1
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions LLM/LLMsTrainer/configs/accelerate_configs/ds_stage2.yaml
@@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 5
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions LLM/LLMsTrainer/configs/accelerate_configs/ds_stage3.yaml
@@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions LLM/LLMsTrainer/configs/accelerate_configs/ds_stage3_offload.yaml
@@ -0,0 +1,18 @@
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
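These four Accelerate configs differ only in the ZeRO stage (plus its init flag), CPU offload targets, and process count; the training code consumes them unchanged through the standard Accelerate API. Below is a minimal sketch of that pattern, under the assumption that the repo's `main` training function follows it; the model, optimizer, and data are placeholders, not taken from this commit.

# Sketch only: placeholder model/data; the YAML config supplies DeepSpeed,
# mixed precision, and process count at launch time.
import torch
from torch.utils.data import DataLoader, TensorDataset
from accelerate import Accelerator

def main():
    accelerator = Accelerator(gradient_accumulation_steps=30)
    model = torch.nn.Linear(16, 1)                                   # placeholder model
    optimizer = torch.optim.AdamW(model.parameters(), lr=5.0e-5)
    loader = DataLoader(TensorDataset(torch.randn(64, 16), torch.randn(64, 1)), batch_size=4)
    model, optimizer, loader = accelerator.prepare(model, optimizer, loader)

    for inputs, targets in loader:
        with accelerator.accumulate(model):
            loss = torch.nn.functional.mse_loss(model(inputs), targets)
            accelerator.backward(loss)                               # ZeRO/bf16 handled by Accelerate
            optimizer.step()
            optimizer.zero_grad()

if __name__ == "__main__":
    # launched e.g. with: accelerate launch --config_file configs/accelerate_configs/ds_stage2.yaml <train_script>.py
    main()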
45 changes: 45 additions & 0 deletions LLM/LLMsTrainer/configs/pretrain_configs/llama.yaml
@@ -0,0 +1,45 @@
data:
  mode: "pretrain"
  data:
    MNBVC_news: "data/pretrain_data/MNBVC_news/*.jsonl.zst"
    MNBVC_qa: "data/pretrain_data/MNBVC_qa/*.jsonl.zst"
    MNBVC_wiki: "data/pretrain_data/MNBVC_wiki/*.jsonl.zst"
  sample_policy_file: "configs/sample_policy/pretrain/MNBVC.json"
  pad_to_max: false
  sequence_sample_mode: "none"
  concat_multiple_sequence: true
  num_sequences: 10
  seq_length: 2048
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  split_by_shard: false
train:
  train_batch_size: 1
  num_training_steps: 10000
  num_warmup_steps: 100
  initializer_range: 1.0e-2
  lr: 5.0e-5
  weight_decay: 1.0e-1
  resize_model_vocab_size: false
  ckpt: 'openlm-research/open_llama_7b_v2'
  train_num_workers: 8
  gradient_accumulation_steps: 30
  prefetch_factor: 100
  train_and_eval: true
  gradient_checkpointing_enable: true
  use_lora: false
  target_modules: ['q_proj', 'v_proj']
  save_total_limit: 3
  img_log_dir: "log/pretrain/open_llama_7b_v2"
  img_log_name: "open_llama_7b_v2 test"
eval:
  eval_methods: ["single_choice_eval", "generation_eval"]
  single_choice_dataset:
    single_choice_file: eval_data/knowledge/knowledge_and_reasoning.jsonl
  generation_dataset:
    general_test: eval_data/pretrain/generation_test.jsonl
  genration_eval_save_path: "eval_while_training/pretrain/open_llama_7b_v2"
# global step
log_interval: 10
eval_interval: 50
save_interval: 100
work_dir: "checkpoints/pretrain/open_llama_7b_v2"
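With concat_multiple_sequence: true, the pretrain dataloader packs sequences: roughly num_sequences tokenized documents are concatenated and re-cut into fixed seq_length blocks, which is why pad_to_max can stay false. A rough illustration of the idea follows; it is an assumption about the packing logic, not the repo's actual dataloader.

from typing import Iterable, Iterator, List

def pack_sequences(token_streams: Iterable[List[int]],
                   num_sequences: int = 10,
                   seq_length: int = 2048) -> Iterator[List[int]]:
    """Concatenate num_sequences tokenized documents, then cut into seq_length blocks."""
    buffer: List[int] = []
    count = 0
    for tokens in token_streams:
        buffer.extend(tokens)
        count += 1
        if count == num_sequences:
            # Re-cut the concatenated buffer into fixed-length training blocks (no padding needed).
            for i in range(0, len(buffer) - seq_length + 1, seq_length):
                yield buffer[i:i + seq_length]
            buffer, count = [], 0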
31 changes: 31 additions & 0 deletions LLM/LLMsTrainer/configs/reward_model_configs/llama.yaml
@@ -0,0 +1,31 @@
data:
  reward_model_datasets:
    sentiment_comments: "data/reward_model_data/sentiment_comments.jsonl"
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  seq_length: 2048
  batch_size: 1
  dataset_map_num_proc: 16
train:
  num_training_epochs: 1
  initializer_range: 1.0e-2
  lr: 1.0e-6
  min_lr: 1.0e-7
  weight_decay: 1.0e-1
  ckpt: 'openlm-research/open_llama_7b_v2'
  gradient_accumulation_steps: 1
  train_and_eval: true
  gradient_checkpointing_enable: true
  downscale_weight: true
  save_total_limit: 1
  img_log_dir: "log/reward_model/llama7b"
  img_log_name: "sentiment_comments"
eval:
  test_reward_model_acc_files:
    sentiment_comments_test: "eval_data/reward_model/sentiment_comments_test.jsonl"
  save_delta_scores: false
  delta_scores_save_path: "checkpoints/reward_model/llama7b/sentiment_comments"
# global step
log_interval: 10
eval_interval: 10
save_interval: 2000
work_dir: "checkpoints/reward_model/llama7b/sentiment_comments"
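The eval block measures how often the reward model scores the preferred answer in a pair above the rejected one, and the delta scores it can optionally save are the per-pair score differences. The usual pairwise formulation is sketched below as an assumption; the repo's exact loss code is not part of this commit.

import torch
import torch.nn.functional as F

def pairwise_reward_loss(chosen_scores: torch.Tensor, rejected_scores: torch.Tensor) -> torch.Tensor:
    """Standard pairwise ranking loss: -log sigmoid(score_chosen - score_rejected)."""
    delta = chosen_scores - rejected_scores            # the "delta scores"
    return -F.logsigmoid(delta).mean()

def pairwise_accuracy(chosen_scores: torch.Tensor, rejected_scores: torch.Tensor) -> torch.Tensor:
    """Fraction of pairs where the chosen answer outscores the rejected one."""
    return (chosen_scores > rejected_scores).float().mean()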
5 changes: 5 additions & 0 deletions LLM/LLMsTrainer/configs/sample_policy/pretrain/MNBVC.json
@@ -0,0 +1,5 @@
{
    "MNBVC_news": 0.502,
    "MNBVC_qa": 0.8001,
    "MNBVC_wiki": 0.1
}
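The weights in this sample policy do not have to sum to 1; a sampler can normalize them into probabilities and then draw which corpus to read from next (the sampler behavior is visualized in assets/sampler_viewer.jpeg). A small illustration of how such a policy could be applied; this is a sketch, not the repo's implementation.

import json
import random

def load_sample_policy(path: str) -> dict:
    """Load the per-dataset weights and normalize them into sampling probabilities."""
    with open(path, "r", encoding="utf-8") as f:
        weights = json.load(f)
    total = sum(weights.values())
    return {name: w / total for name, w in weights.items()}

policy = load_sample_policy("configs/sample_policy/pretrain/MNBVC.json")
names, probs = zip(*policy.items())
next_source = random.choices(names, weights=probs, k=1)[0]   # pick the next corpus to read from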
40 changes: 40 additions & 0 deletions LLM/LLMsTrainer/configs/sft_configs/llama.yaml
@@ -0,0 +1,40 @@
data:
  mode: "sft"
  data:
    sharegpt: "data/sft_data/sharegpt/*.jsonl.zst"
  pad_to_max: true
  sequence_sample_mode: "truncation"
  concat_multiple_sequence: false
  num_sequences: 10
  seq_length: 2048
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  split_by_shard: false
train:
  train_batch_size: 1
  num_training_steps: 50000
  num_warmup_steps: 1000
  initializer_range: 1.0e-2
  lr: 1.0e-5
  weight_decay: 1.0e-1
  resize_model_vocab_size: true
  ckpt: 'openlm-research/open_llama_7b_v2'
  train_num_workers: 8
  gradient_accumulation_steps: 30
  prefetch_factor: 100
  train_and_eval: true
  gradient_checkpointing_enable: true
  use_lora: false
  target_modules: ['q_proj', 'v_proj']
  save_total_limit: 3
  img_log_dir: "log/sft/open_llama_7b_v2"
  img_log_name: "ShareGPT OpenLlama7b-v2"
eval:
  eval_methods: ["generation_eval"]
  generation_dataset:
    share_gpt_test: "eval_data/sft/share_gpt_test.jsonl"
  genration_eval_save_path: "eval_while_training/sft/sharegpt"
# global step
log_interval: 10
eval_interval: 100
save_interval: 100
work_dir: "checkpoints/sft/ShareGPT"
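Unlike the pretrain config, the SFT data settings truncate each conversation to seq_length and pad it out to the full length (pad_to_max: true, sequence_sample_mode: "truncation"). A rough sketch of that preprocessing; the pad token id here is an assumption, and the repo may pad differently.

from typing import List

def truncate_and_pad(token_ids: List[int], seq_length: int = 2048, pad_token_id: int = 0) -> List[int]:
    """Cut a tokenized SFT example down to seq_length, then right-pad it to the full length."""
    token_ids = token_ids[:seq_length]                          # sequence_sample_mode: "truncation"
    padding = [pad_token_id] * (seq_length - len(token_ids))    # pad_to_max: true
    return token_ids + padding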
108 changes: 108 additions & 0 deletions LLM/LLMsTrainer/data/compress_data.py
@@ -0,0 +1,108 @@
#!/usr/bin/env python3
"""
==== No Bugs in code, just some Random Unexpected FEATURES ====
┌─────────────────────────────────────────────────────────────┐
│┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐│
││Esc│!1 │@2 │#3 │$4 │%5 │^6 │&7 │*8 │(9 │)0 │_- │+= │|\ │`~ ││
│├───┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴───┤│
││ Tab │ Q │ W │ E │ R │ T │ Y │ U │ I │ O │ P │{[ │}] │ BS ││
│├─────┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴─────┤│
││ Ctrl │ A │ S │ D │ F │ G │ H │ J │ K │ L │: ;│" '│ Enter ││
│├──────┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴────┬───┤│
││ Shift │ Z │ X │ C │ V │ B │ N │ M │< ,│> .│? /│Shift │Fn ││
│└─────┬──┴┬──┴──┬┴───┴───┴───┴───┴───┴──┬┴───┴┬──┴┬─────┴───┘│
│ │Fn │ Alt │ Space │ Alt │Win│ HHKB │
│ └───┴─────┴───────────────────────┴─────┴───┘ │
└─────────────────────────────────────────────────────────────┘
Dataset compression.
Author: pankeyu
Date: 2023/05/23
"""
import os
import json
import zstandard as zstd

SHARD_SIZE = 10  # number of samples stored in each shard file


def compress(
    input_file: str,
    write_path: str
):
    """
    Compress the data into .zst shards so that it can be read as a stream.
    """
    print(f'processed {input_file}...')

    path_name = os.path.dirname(write_path)
    if not os.path.exists(path_name):
        os.makedirs(path_name)

    total_num, file_num, log_interval = 0, 1, 10000
    wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
    with open(input_file, "r") as f:
        for line in f:
            line = json.loads(line)
            if total_num % SHARD_SIZE == 0 and total_num > 0:
                # Roll over to a new shard every SHARD_SIZE samples.
                file_num += 1
                wfp.close()
                wfp = zstd.open(write_path.format(file_num), "wb", encoding="utf-8")
            wfp.write(json.dumps(line).encode("utf-8"))
            wfp.write("\n".encode("utf-8"))
            total_num += 1
            if not total_num % log_interval:
                print(f'\rProcessed: {total_num} samples...', end='')
    wfp.close()
    print("total line: {}\ntotal files: {}".format(total_num, file_num))


def batch_compress_pretrain_data():
    """
    Batch-compress the pretrain datasets.
    """
    source_path = 'shuffled_data/pretrain'  # source data files
    target_path = 'pretrain_data'           # output path for the compressed shards

    files = [
        'MNBVC_news',
        'MNBVC_qa',
        'MNBVC_wiki'
    ]

    compress_file = []
    for file in files:
        compress_file.append({
            'input_file': f'{source_path}/{file}.jsonl',
            'write_path': f'{target_path}/{file}/part-{{}}.jsonl.zst'
        })

    for file in compress_file:
        compress(file['input_file'], file['write_path'])


def batch_compress_sft_data():
    """
    Batch-compress the SFT datasets.
    """
    source_path = 'shuffled_data/sft'
    target_path = 'sft_data'

    files = [
        'sharegpt'
    ]

    compress_file = []
    for file in files:
        compress_file.append({
            'input_file': f'{source_path}/{file}.jsonl',
            'write_path': f'{target_path}/{file}/part-{{}}.jsonl.zst'
        })

    for file in compress_file:
        compress(file['input_file'], file['write_path'])


if __name__ == '__main__':
    # batch_compress_pretrain_data()
    batch_compress_sft_data()
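The shards written by compress() can be read back as a stream without unpacking them to disk first. A small reading counterpart using the same zstandard package is sketched below; this helper is not part of the commit.

import glob
import json
import zstandard as zstd

def read_zst_jsonl(pattern: str):
    """Stream samples back out of the .jsonl.zst shards written by compress()."""
    for path in sorted(glob.glob(pattern)):
        with zstd.open(path, "rt", encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)

# e.g.: for sample in read_zst_jsonl("sft_data/sharegpt/part-*.jsonl.zst"): ...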