forked from HarderThenHarder/transformers_tasks
Commit 52d146d (1 parent: cf9eb46)
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing 38 changed files with 15,504 additions and 0 deletions.
18 changes: 18 additions & 0 deletions
[accelerate config: DeepSpeed ZeRO stage 1 (file path not shown)]
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 1
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions
[accelerate config: DeepSpeed ZeRO stage 2 (file path not shown)]
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: false
  zero_stage: 2
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 5
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions
[accelerate config: DeepSpeed ZeRO stage 3, no offload (file path not shown)]
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: none
  offload_param_device: none
  zero3_init_flag: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
18 changes: 18 additions & 0 deletions
LLM/LLMsTrainer/configs/accelerate_configs/ds_stage3_offload.yaml
compute_environment: LOCAL_MACHINE
deepspeed_config:
  deepspeed_multinode_launcher: standard
  gradient_clipping: 1.0
  offload_optimizer_device: cpu
  offload_param_device: cpu
  zero3_init_flag: true
  zero_stage: 3
distributed_type: DEEPSPEED
fsdp_config: {}
machine_rank: 0
main_training_function: main
mixed_precision: bf16
num_machines: 1
num_processes: 8
rdzv_backend: static
same_network: true
use_cpu: false
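These four configs differ only in zero_stage, zero3_init_flag, the offload devices, and num_processes; everything else is shared. Files like these are passed to the launcher, typically as accelerate launch --config_file <config>.yaml train.py (train.py is a placeholder script name here). As a quick sanity check on the nesting above, a minimal sketch, assuming PyYAML is installed, that loads the one file whose path is visible in this commit and prints the fields that vary:

import yaml  # PyYAML, assumed installed

path = "LLM/LLMsTrainer/configs/accelerate_configs/ds_stage3_offload.yaml"

with open(path) as f:
    cfg = yaml.safe_load(f)

ds = cfg["deepspeed_config"]
print("zero_stage:", ds["zero_stage"])                        # 3
print("optimizer offload:", ds["offload_optimizer_device"])   # cpu
print("param offload:", ds["offload_param_device"])           # cpu
print("num_processes:", cfg["num_processes"])                 # 8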
45 changes: 45 additions & 0 deletions
[pretraining config for open_llama_7b_v2 (file path not shown)]
data:
  mode: "pretrain"
  data:
    MNBVC_news: "data/pretrain_data/MNBVC_news/*.jsonl.zst"
    MNBVC_qa: "data/pretrain_data/MNBVC_qa/*.jsonl.zst"
    MNBVC_wiki: "data/pretrain_data/MNBVC_wiki/*.jsonl.zst"
  sample_policy_file: "configs/sample_policy/pretrain/MNBVC.json"
  pad_to_max: false
  sequence_sample_mode: "none"
  concat_multiple_sequence: true
  num_sequences: 10
  seq_length: 2048
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  split_by_shard: false
train:
  train_batch_size: 1
  num_training_steps: 10000
  num_warmup_steps: 100
  initializer_range: 1.0e-2
  lr: 5.0e-5
  weight_decay: 1.0e-1
  resize_model_vocab_size: false
  ckpt: 'openlm-research/open_llama_7b_v2'
  train_num_workers: 8
  gradient_accumulation_steps: 30
  prefetch_factor: 100
  train_and_eval: true
  gradient_checkpointing_enable: true
  use_lora: false
  target_modules: ['q_proj', 'v_proj']
  save_total_limit: 3
  img_log_dir: "log/pretrain/open_llama_7b_v2"
  img_log_name: "open_llama_7b_v2 test"
eval:
  eval_methods: ["single_choice_eval", "generation_eval"]
  single_choice_dataset:
    single_choice_file: eval_data/knowledge/knowledge_and_reasoning.jsonl
  generation_dataset:
    general_test: eval_data/pretrain/generation_test.jsonl
  genration_eval_save_path: "eval_while_training/pretrain/open_llama_7b_v2"

# global step
log_interval: 10
eval_interval: 50
save_interval: 100
work_dir: "checkpoints/pretrain/open_llama_7b_v2"
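With concat_multiple_sequence: true, num_sequences: 10, and seq_length: 2048, the loader packs several tokenized documents together into fixed-length training sequences rather than padding each one. The repo's actual data pipeline is not shown in this commit; a minimal sketch of the idea, with a hypothetical pack_sequences helper:

from typing import Iterable, Iterator, List

def pack_sequences(
    token_streams: Iterable[List[int]],
    num_sequences: int = 10,
    seq_length: int = 2048,
) -> Iterator[List[int]]:
    """Concatenate `num_sequences` tokenized documents at a time,
    then slice the combined buffer into fixed `seq_length` chunks."""
    batch, buffer = [], []
    for tokens in token_streams:
        batch.append(tokens)
        if len(batch) < num_sequences:
            continue
        for t in batch:          # concat_multiple_sequence: true
            buffer.extend(t)
        batch = []
        while len(buffer) >= seq_length:
            yield buffer[:seq_length]
            buffer = buffer[seq_length:]
    # leftover tokens in `buffer`/`batch` are simply dropped in this sketch

Note that with train_batch_size: 1 and gradient_accumulation_steps: 30, each optimizer step still consumes 30 packed sequences of 2,048 tokens per process.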
31 changes: 31 additions & 0 deletions
[reward model training config (file path not shown)]
data:
  reward_model_datasets:
    sentiment_comments: "data/reward_model_data/sentiment_comments.jsonl"
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  seq_length: 2048
  batch_size: 1
  dataset_map_num_proc: 16
train:
  num_training_epochs: 1
  initializer_range: 1.0e-2
  lr: 1.0e-6
  min_lr: 1.0e-7
  weight_decay: 1.0e-1
  ckpt: 'openlm-research/open_llama_7b_v2'
  gradient_accumulation_steps: 1
  train_and_eval: true
  gradient_checkpointing_enable: true
  downscale_weight: true
  save_total_limit: 1
  img_log_dir: "log/reward_model/llama7b"
  img_log_name: "sentiment_comments"
eval:
  test_reward_model_acc_files:
    sentiment_comments_test: "eval_data/reward_model/sentiment_comments_test.jsonl"
  save_delta_scores: false
  delta_scores_save_path: "checkpoints/reward_model/llama7b/sentiment_comments"

# global step
log_interval: 10
eval_interval: 10
save_interval: 2000
work_dir: "checkpoints/reward_model/llama7b/sentiment_comments"
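The eval block measures accuracy on held-out preference pairs and can optionally dump score gaps (save_delta_scores / delta_scores_save_path). The repo's exact objective is not shown in this commit; reward models of this kind are usually trained with a pairwise ranking loss on (chosen, rejected) pairs, and "delta" below simply means the chosen-minus-rejected score gap, which is plausibly, though not certainly, what save_delta_scores refers to. A minimal PyTorch sketch of that standard objective:

import torch
import torch.nn.functional as F

def pairwise_rm_loss(chosen_scores: torch.Tensor,
                     rejected_scores: torch.Tensor) -> torch.Tensor:
    """Bradley-Terry style loss: push the preferred response's
    scalar reward above the rejected one's."""
    delta = chosen_scores - rejected_scores   # the "delta scores"
    return -F.logsigmoid(delta).mean()

# toy usage
chosen = torch.tensor([1.2, 0.3])
rejected = torch.tensor([0.7, 0.9])
print(pairwise_rm_loss(chosen, rejected))     # scalar loss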
5 changes: 5 additions & 0 deletions
[sample policy JSON, evidently the configs/sample_policy/pretrain/MNBVC.json referenced by the pretrain config above]
{
  "MNBVC_news": 0.502,
  "MNBVC_qa": 0.8001,
  "MNBVC_wiki": 0.1
}
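The keys match the dataset names in the pretrain config's data block, so these values are most naturally read as per-dataset sampling weights; note they do not sum to 1, so they would be relative weights (this interpretation is an assumption, not confirmed by the commit). A minimal sketch of drawing the next training sample under that reading:

import json
import random

with open("configs/sample_policy/pretrain/MNBVC.json") as f:
    policy = json.load(f)

names = list(policy)                      # ["MNBVC_news", "MNBVC_qa", "MNBVC_wiki"]
weights = [policy[n] for n in names]      # random.choices normalizes these

# pick which dataset the next training sample comes from
choice = random.choices(names, weights=weights, k=1)[0]
print(choice)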
40 changes: 40 additions & 0 deletions
[SFT config for open_llama_7b_v2 on ShareGPT (file path not shown)]
data:
  mode: "sft"
  data:
    sharegpt: "data/sft_data/sharegpt/*.jsonl.zst"
  pad_to_max: true
  sequence_sample_mode: "truncation"
  concat_multiple_sequence: false
  num_sequences: 10
  seq_length: 2048
  tokenizer_path: "openlm-research/open_llama_7b_v2"
  split_by_shard: false
train:
  train_batch_size: 1
  num_training_steps: 50000
  num_warmup_steps: 1000
  initializer_range: 1.0e-2
  lr: 1.0e-5
  weight_decay: 1.0e-1
  resize_model_vocab_size: true
  ckpt: 'openlm-research/open_llama_7b_v2'
  train_num_workers: 8
  gradient_accumulation_steps: 30
  prefetch_factor: 100
  train_and_eval: true
  gradient_checkpointing_enable: true
  use_lora: false
  target_modules: ['q_proj', 'v_proj']
  save_total_limit: 3
  img_log_dir: "log/sft/open_llama_7b_v2"
  img_log_name: "ShareGPT OpenLlama7b-v2"
eval:
  eval_methods: ["generation_eval"]
  generation_dataset:
    share_gpt_test: "eval_data/sft/share_gpt_test.jsonl"
  genration_eval_save_path: "eval_while_training/sft/sharegpt"

# global step
log_interval: 10
eval_interval: 100
save_interval: 100
work_dir: "checkpoints/sft/ShareGPT"
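Unlike the pretrain config, SFT sets pad_to_max: true with sequence_sample_mode: "truncation" and concat_multiple_sequence: false, so each example is shaped individually to exactly seq_length tokens. A minimal sketch of that shaping step (pad_id is a hypothetical placeholder, not the repo's actual pad token):

from typing import List

def shape_example(token_ids: List[int],
                  seq_length: int = 2048,
                  pad_id: int = 0) -> List[int]:
    """Truncate over-long examples, right-pad short ones to seq_length."""
    token_ids = token_ids[:seq_length]           # sequence_sample_mode: "truncation"
    padding = [pad_id] * (seq_length - len(token_ids))
    return token_ids + padding                   # pad_to_max: true

assert len(shape_example(list(range(3000)))) == 2048
assert len(shape_example([1, 2, 3])) == 2048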
108 changes: 108 additions & 0 deletions
[Python utility: compress datasets into .jsonl.zst shards (file path not shown)]
#!/usr/bin/env python3
"""
==== No Bugs in code, just some Random Unexpected FEATURES ====
┌─────────────────────────────────────────────────────────────┐
│┌───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┬───┐│
││Esc│!1 │@2 │#3 │$4 │%5 │^6 │&7 │*8 │(9 │)0 │_- │+= │|\ │`~ ││
│├───┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴───┤│
││ Tab │ Q │ W │ E │ R │ T │ Y │ U │ I │ O │ P │{[ │}] │ BS ││
│├─────┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴┬──┴─────┤│
││ Ctrl │ A │ S │ D │ F │ G │ H │ J │ K │ L │: ;│" '│ Enter ││
│├──────┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴─┬─┴────┬───┤│
││ Shift │ Z │ X │ C │ V │ B │ N │ M │< ,│> .│? /│Shift │Fn ││
│└─────┬──┴┬──┴──┬┴───┴───┴───┴───┴───┴──┬┴───┴┬──┴┬─────┴───┘│
│ │Fn │ Alt │ Space │ Alt │Win│ HHKB │
│ └───┴─────┴───────────────────────┴─────┴───┘ │
└─────────────────────────────────────────────────────────────┘
Dataset compression.

Author: pankeyu
Date: 2023/05/23
"""
import os
import json

import zstandard as zstd

SHARD_SIZE = 10  # number of samples stored per shard file


def compress(
    input_file: str,
    write_path: str
):
    """
    Compress the data into .zst files so it can be read as a stream.
    """
    print(f'processed {input_file}...')

    path_name = os.path.dirname(write_path)
    if not os.path.exists(path_name):
        os.makedirs(path_name)

    total_num, file_num, log_interval = 0, 1, 10000
    # binary mode writes raw bytes, so no `encoding` argument here
    wfp = zstd.open(write_path.format(file_num), "wb")
    with open(input_file, "r") as f:
        for line in f:
            line = json.loads(line)  # parse to validate each JSON line
            if total_num % SHARD_SIZE == 0 and total_num > 0:
                # current shard is full: roll over to the next one
                file_num += 1
                wfp.close()
                wfp = zstd.open(write_path.format(file_num), "wb")
            # ensure_ascii=False keeps non-ASCII (e.g. Chinese) text readable
            wfp.write(json.dumps(line, ensure_ascii=False).encode("utf-8"))
            wfp.write("\n".encode("utf-8"))
            total_num += 1
            if not total_num % log_interval:
                print(f'\rProcessed: {total_num} samples...', end='')
    wfp.close()
    print("total lines: {}\ntotal files: {}".format(total_num, file_num))


def batch_compress_pretrain_data():
    """
    Batch-compress the pretraining data.
    """
    source_path = 'shuffled_data/pretrain'  # source data files
    target_path = 'pretrain_data'           # output dir for compressed shards

    files = [
        'MNBVC_news',
        'MNBVC_qa',
        'MNBVC_wiki'
    ]

    compress_file = []
    for file in files:
        compress_file.append({
            'input_file': f'{source_path}/{file}.jsonl',
            'write_path': f'{target_path}/{file}/part-{{}}.jsonl.zst'
        })

    for file in compress_file:
        compress(file['input_file'], file['write_path'])


def batch_compress_sft_data():
    """
    Batch-compress the SFT data.
    """
    source_path = 'shuffled_data/sft'
    target_path = 'sft_data'

    files = [
        'sharegpt'
    ]

    compress_file = []
    for file in files:
        compress_file.append({
            'input_file': f'{source_path}/{file}.jsonl',
            'write_path': f'{target_path}/{file}/part-{{}}.jsonl.zst'
        })

    for file in compress_file:
        compress(file['input_file'], file['write_path'])


if __name__ == '__main__':
    # batch_compress_pretrain_data()
    batch_compress_sft_data()
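The shards written above can be consumed the same way, line by line, without decompressing to disk first, which is the "streaming reads" the docstring mentions. A small read-back sketch using the same zstandard API (the glob pattern follows the write_path layout above):

import glob
import json

import zstandard as zstd

def read_shards(pattern: str):
    """Stream JSON lines back out of the .jsonl.zst shards."""
    for path in sorted(glob.glob(pattern)):
        with zstd.open(path, "rt", encoding="utf-8") as f:
            for line in f:
                yield json.loads(line)

for sample in read_shards("sft_data/sharegpt/part-*.jsonl.zst"):
    print(sample)
    break  # just show the first sample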