Skip to content

Commit

Permalink
ADLR/megatron-lm!2266 - ci: Move REPEATS to launcher level
Browse files Browse the repository at this point in the history
ko3n1g committed Oct 26, 2024
1 parent d7e82d9 commit ef6cba6
Showing 156 changed files with 130 additions and 303 deletions.
12 changes: 12 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -17,6 +17,8 @@ workflow:
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: mr
# Per-scope launcher settings (mr): repetitions per test and per-test timeout in seconds.
# Values must be bare integers: they are passed to click options declared with type=int.
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 1800
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
@@ -26,6 +28,8 @@ workflow:
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: nightly
# Per-scope launcher settings (nightly): repetitions per test and per-test timeout in seconds.
# Values must be bare integers: they are passed to click options declared with type=int.
FUNCTIONAL_TEST_REPEAT: 5
FUNCTIONAL_TEST_TIME_LIMIT: 1800
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
@@ -35,6 +39,8 @@ workflow:
UNIT_TEST_TIMEOUT: 75
FUNCTIONAL_TEST: "yes"
FUNCTIONAL_TEST_SCOPE: weekly
# Per-scope launcher settings (weekly): a single repetition with a longer per-test timeout in seconds.
# Values must be bare integers: they are passed to click options declared with type=int.
FUNCTIONAL_TEST_REPEAT: 1
FUNCTIONAL_TEST_TIME_LIMIT: 9000
FUNCTIONAL_TEST_CLUSTER_A100: ""
FUNCTIONAL_TEST_CLUSTER_H100: ""
PUBLISH: "no"
@@ -82,6 +88,12 @@ variables:
- "pre-release"
- "release"
description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
# Passed to generate_jet_trigger_job.py as --n-repeat by the functional:configure job.
FUNCTIONAL_TEST_REPEAT:
value: "5"
description: "Number of repetitions per test"
# Passed to generate_jet_trigger_job.py as --time-limit by the functional:configure job.
FUNCTIONAL_TEST_TIME_LIMIT:
value: "1800"
description: "Timeout in seconds per test"
FUNCTIONAL_TEST_CLUSTER_A100:
value: "dgxa100_dracooci"
options:
4 changes: 4 additions & 0 deletions .gitlab/stages/02.functional-tests.yml
Original file line number Diff line number Diff line change
@@ -57,6 +57,8 @@ functional:configure:
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment dev \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${CI_MCORE_LTS_IMAGE} \
@@ -68,6 +70,8 @@ functional:configure:
python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
--scope $FUNCTIONAL_TEST_SCOPE \
--environment lts \
--n-repeat "$FUNCTIONAL_TEST_REPEAT" \
--time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
--a100-cluster $A100_CLUSTER \
--h100-cluster $H100_CLUSTER \
--container-image ${CI_MCORE_LTS_IMAGE} \
3 changes: 3 additions & 0 deletions tests/functional_tests/jet_recipes/bert.yaml
Original file line number Diff line number Diff line change
@@ -24,6 +24,7 @@ spec:
"TRAINING_SCRIPT_PATH=pretrain_bert.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
"N_REPEAT={n_repeat}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
# - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
@@ -43,6 +45,7 @@ products:
- bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
- environment: [lts]
scope: [nightly]
n_repeat: [5]
time_limit: [3600]
test_case:
- bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2
2 changes: 2 additions & 0 deletions tests/functional_tests/jet_recipes/gpt-nemo.yaml
Original file line number Diff line number Diff line change
@@ -24,13 +24,15 @@ spec:
"TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
"TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
"N_REPEAT={n_repeat}"
)
bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [dev]
scope: [mr]
n_repeat: [5]
test_case:
- gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
- gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G
4 changes: 4 additions & 0 deletions tests/functional_tests/jet_recipes/gpt.yaml
Original file line number Diff line number Diff line change
@@ -23,6 +23,7 @@ spec:
"TRAINING_SCRIPT_PATH=pretrain_gpt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
"N_REPEAT={n_repeat}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
scope: [mr]
platforms: [dgx_a100]
time_limit: [1800]
n_repeat: [5]
test_case:
- gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
@@ -103,6 +105,7 @@ products:
scope: [nightly]
platforms: [dgx_a100]
time_limit: [3600]
n_repeat: [5]
test_case:
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
- gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
@@ -136,6 +139,7 @@ products:
scope: [mr]
platforms: [dgx_a100]
time_limit: [1800]
n_repeat: [5]
test_case:
- gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
- gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G
2 changes: 2 additions & 0 deletions tests/functional_tests/jet_recipes/multimodal-llava.yaml
Original file line number Diff line number Diff line change
@@ -24,13 +24,15 @@ spec:
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
"N_REPEAT={n_repeat}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
products:
- environment: [lts, dev]
scope: [mr]
n_repeat: [5]
test_case:
- multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
- multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G
4 changes: 4 additions & 0 deletions tests/functional_tests/jet_recipes/t5.yaml
Original file line number Diff line number Diff line change
@@ -24,6 +24,7 @@ spec:
"TRAINING_SCRIPT_PATH=pretrain_t5.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
"N_REPEAT={n_repeat}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
- environment: [lts, dev]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
@@ -41,13 +43,15 @@ products:
- environment: [lts]
scope: [mr]
time_limit: [1800]
n_repeat: [5]
test_case:
- t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
- t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
- t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
- environment: [lts]
scope: [weekly]
time_limit: [9000]
n_repeat: [1]
test_case:
- t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
- t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1
4 changes: 4 additions & 0 deletions tests/functional_tests/python_test_utils/jet/common.py
Original file line number Diff line number Diff line change
@@ -134,6 +134,8 @@ def filter_by_model(

def load_workloads(
container_tag: str,
n_repeat: int = 1,
time_limit: int = 1800,
environment: Optional[str] = None,
scope: Optional[str] = None,
model: Optional[str] = None,
@@ -171,4 +173,6 @@ def load_workloads(
container_image = container_image or build_workload.spec.source.image
build_workload.spec.source.image = f"{container_image}:{container_tag}"
workloads.append(build_workload)
workload.spec.n_repeat = n_repeat
workload.spec.time_limit = time_limit
return workloads
Original file line number Diff line number Diff line change
@@ -12,6 +12,8 @@
@click.command()
@click.option("--scope", required=True, type=str, help="Test scope")
@click.option("--environment", required=True, type=str, help="LTS or dev features")
@click.option("--n-repeat", required=False, default=1, type=int)
@click.option("--time-limit", required=False, default=1, type=int)
@click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
@click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
@click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
@@ -29,6 +31,8 @@
def main(
scope: str,
environment: str,
n_repeat: int,
time_limit: int,
a100_cluster: str,
h100_cluster: str,
output_path: str,
@@ -63,6 +67,8 @@ def main(
"python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py",
f"--model {test_case.spec.model}",
f"--environment {test_case.spec.environment}",
f"--n-repeat {n_repeat}",
f"--time-limit {time_limit}",
f"--test-case {test_case.spec.test_case}",
f"--container-tag {container_tag}",
f"--cluster {cluster}",
Original file line number Diff line number Diff line change
@@ -42,6 +42,8 @@ def sigterm_handler(_signo, _stack_frame):
def launch_and_wait_for_completion(
test_case: str,
environment: str,
n_repeat: int,
time_limit: int,
container_image: str,
container_tag: str,
cluster: str,
@@ -54,6 +56,8 @@ def launch_and_wait_for_completion(
).workloads.submit(
workloads=common.load_workloads(
test_case=test_case,
n_repeat=n_repeat,
time_limit=time_limit,
container_image=container_image,
container_tag=container_tag,
environment=environment,
@@ -142,6 +146,8 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
@click.option(
"--environment", required=True, type=click.Choice(['dev', 'lts']), help="Pytorch LTS or DEV"
)
@click.option("--n-repeat", required=False, default=1, type=int)
@click.option("--time-limit", required=False, default=1800, type=int)
@click.option(
"--account",
required=False,
@@ -165,6 +171,8 @@ def main(
model: str,
test_case: str,
environment: str,
n_repeat: int,
time_limit: int,
account: str,
cluster: str,
container_tag: str,
@@ -195,6 +203,8 @@ def main(
pipeline = launch_and_wait_for_completion(
test_case=test_case,
environment=environment,
n_repeat=n_repeat,
time_limit=time_limit,
container_image=container_image,
container_tag=container_tag,
cluster=cluster,
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
@@ -43,4 +42,4 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--ckpt-format: torch
TEST_TYPE: regular
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
@@ -44,4 +43,4 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--ckpt-format: torch
TEST_TYPE: regular
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -5,7 +5,6 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
@@ -43,4 +42,4 @@ MODEL_ARGS:
--data-cache-path: ${DATA_CACHE_PATH}
--bf16: true
--ckpt-format: torch
TEST_TYPE: regular
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -6,7 +6,6 @@ ENV_VARS:
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
NVTE_APPLY_QK_LAYER_SCALING: 1
N_REPEATS: 5
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
@@ -46,4 +45,4 @@ MODEL_ARGS:
--fp16: true
--apply-query-key-layer-scaling: true
--ckpt-format: torch
TEST_TYPE: regular
TEST_TYPE: regular
Loading
Oops, something went wrong.

0 comments on commit ef6cba6

Please sign in to comment.