ADLR/megatron-lm!2266 - ci: Move REPEATS to launcher level

NVIDIA · Oct 26, 2024 · ef6cba6 · ef6cba6
1 parent d7e82d9
commit ef6cba6
Showing 156 changed files with 130 additions and 303 deletions.
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
@@ -17,6 +17,8 @@ workflow:
         UNIT_TEST_TIMEOUT: 75
         FUNCTIONAL_TEST: "yes"
         FUNCTIONAL_TEST_SCOPE: mr
+        FUNCTIONAL_TEST_REPEAT: 5  # repetitions per test; forwarded to --n-repeat (int)
+        FUNCTIONAL_TEST_TIME_LIMIT: 1800  # seconds per test; forwarded to --time-limit (int), so no trailing comma
         FUNCTIONAL_TEST_CLUSTER_A100: ""
         FUNCTIONAL_TEST_CLUSTER_H100: ""
         PUBLISH: "no"
@@ -26,6 +28,8 @@ workflow:
         UNIT_TEST_TIMEOUT: 75
         FUNCTIONAL_TEST: "yes"
         FUNCTIONAL_TEST_SCOPE: nightly
+        FUNCTIONAL_TEST_REPEAT: 5  # repetitions per test; forwarded to --n-repeat (int)
+        FUNCTIONAL_TEST_TIME_LIMIT: 1800  # seconds per test; forwarded to --time-limit (int), so no trailing comma
         FUNCTIONAL_TEST_CLUSTER_A100: ""
         FUNCTIONAL_TEST_CLUSTER_H100: ""
         PUBLISH: "no"
@@ -35,6 +39,8 @@ workflow:
         UNIT_TEST_TIMEOUT: 75
         FUNCTIONAL_TEST: "yes"
         FUNCTIONAL_TEST_SCOPE: weekly
+        FUNCTIONAL_TEST_REPEAT: 1  # repetitions per test; forwarded to --n-repeat (int), so no trailing comma
+        FUNCTIONAL_TEST_TIME_LIMIT: 9000  # seconds per test; forwarded to --time-limit (int), so no trailing comma
         FUNCTIONAL_TEST_CLUSTER_A100: ""
         FUNCTIONAL_TEST_CLUSTER_H100: ""
         PUBLISH: "no"
@@ -82,6 +88,12 @@ variables:
       - "pre-release"
       - "release"
     description: "Testsuite to run (only for FUNCTIONAL_TEST=yes)"
+  FUNCTIONAL_TEST_REPEAT:
+    value: "5"
+    description: "Number of repetitions per test"
+  FUNCTIONAL_TEST_TIME_LIMIT:
+    value: "1800"
+    description: "Timeout in seconds per test"
   FUNCTIONAL_TEST_CLUSTER_A100:
     value: "dgxa100_dracooci"
     options:

diff --git a/.gitlab/stages/02.functional-tests.yml b/.gitlab/stages/02.functional-tests.yml
@@ -57,6 +57,8 @@ functional:configure:
       python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
         --scope $FUNCTIONAL_TEST_SCOPE \
         --environment dev \
+        --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
+        --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
         --a100-cluster $A100_CLUSTER \
         --h100-cluster $H100_CLUSTER \
         --container-image ${CI_MCORE_LTS_IMAGE} \
@@ -68,6 +70,8 @@ functional:configure:
       python tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py \
         --scope $FUNCTIONAL_TEST_SCOPE \
         --environment lts \
+        --n-repeat "$FUNCTIONAL_TEST_REPEAT" \
+        --time-limit "$FUNCTIONAL_TEST_TIME_LIMIT" \
         --a100-cluster $A100_CLUSTER \
         --h100-cluster $H100_CLUSTER \
         --container-image ${CI_MCORE_LTS_IMAGE} \

diff --git a/tests/functional_tests/jet_recipes/bert.yaml b/tests/functional_tests/jet_recipes/bert.yaml
@@ -24,6 +24,7 @@ spec:
         "TRAINING_SCRIPT_PATH=pretrain_bert.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
+        "N_REPEAT={n_repeat}"
     )
 
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
   - environment: [lts, dev]
     scope: [mr]
     time_limit: [1800]
+    n_repeat: [5]
     test_case: 
     - bert_mr_mcore_tp2_pp2_dgx_a100_1N8G
     # - bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G
@@ -43,6 +45,7 @@ products:
     - bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G
   - environment: [lts]
     scope: [nightly]
+    n_repeat: [5]
     time_limit: [3600]
     test_case:
     - bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2

diff --git a/tests/functional_tests/jet_recipes/gpt-nemo.yaml b/tests/functional_tests/jet_recipes/gpt-nemo.yaml
@@ -24,13 +24,15 @@ spec:
         "TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
         "TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
+        "N_REPEAT={n_repeat}"
     )
 
     bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
 
 products:
   - environment: [dev]
     scope: [mr]
+    n_repeat: [5]
     test_case:
     - gpt3-nemo_126m_mr_mbs1_gbs8_mcore_te_tp2_pp4_vp3_seq_par_overlap_p2p_dgx_a100_1N8G
     - gpt3-nemo_126m_mr_mbs4_gbs64_mcore_te_tp1_pp1_dgx_a100_1N8G

diff --git a/tests/functional_tests/jet_recipes/gpt.yaml b/tests/functional_tests/jet_recipes/gpt.yaml
@@ -23,6 +23,7 @@ spec:
         "TRAINING_SCRIPT_PATH=pretrain_gpt.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
+        "N_REPEAT={n_repeat}"
     )
 
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
     scope: [mr]
     platforms: [dgx_a100]
     time_limit: [1800]
+    n_repeat: [5]
     test_case:
     - gpt3_mr_mcore_te_tp1_pp1_dist_optimizer_no_mmap_bin_files_dgx_a100_1N8G
     - gpt3_mr_mcore_te_tp1_pp1_resume_torch_dist_dist_optimizer_dgx_a100_1N8G
@@ -103,6 +105,7 @@ products:
     scope: [nightly]
     platforms: [dgx_a100]
     time_limit: [3600]
+    n_repeat: [5]
     test_case:
     - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_dist_optimizer_overlap_grad_reduce_param_gather
     - gpt3_345m_nightly_dgx_a100_1N8G_mcore_tp1_pp1_resume_torch_dist_dist_optimizer_overlap_grad_reduce_param_gather
@@ -136,6 +139,7 @@ products:
     scope: [mr]
     platforms: [dgx_a100]
     time_limit: [1800]
+    n_repeat: [5]
     test_case:
     - gpt3_mr_mcore_te_tp2_pp2_defer_embedding_wgrad_compute_dgx_a100_1N8G
     - gpt3_mr_mcore_te_tp1_pp4_vp1_dist_optimizer_overlap_grad_reduce_param_gather_dgx_a100_1N8G

diff --git a/tests/functional_tests/jet_recipes/multimodal-llava.yaml b/tests/functional_tests/jet_recipes/multimodal-llava.yaml
@@ -24,13 +24,15 @@ spec:
         "TRAINING_SCRIPT_PATH=pretrain_vlm.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
+        "N_REPEAT={n_repeat}"
     )
 
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
 
 products:
   - environment: [lts, dev]
     scope: [mr]
+    n_repeat: [5]
     test_case:
     - multimodal_llava_mr_mcore_te_tp1_pp1_dgx_a100_1N8G
     - multimodal_llava_mr_mcore_te_tp2_pp3_dgx_a100_1N8G

diff --git a/tests/functional_tests/jet_recipes/t5.yaml b/tests/functional_tests/jet_recipes/t5.yaml
@@ -24,6 +24,7 @@ spec:
         "TRAINING_SCRIPT_PATH=pretrain_t5.py"
         "TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
         "GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values_{environment}.json"
+        "N_REPEAT={n_repeat}"
     )
 
     bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
@@ -32,6 +33,7 @@ products:
   - environment: [lts, dev]
     scope: [mr]
     time_limit: [1800]
+    n_repeat: [5]
     test_case:
     - t5_220m_mr_mcore_te_tp4_pp1_dgx_a100_1N8G
     - t5_220m_mr_mcore_te_tp4_pp1_resume_torch_dist_dgx_a100_1N8G
@@ -41,13 +43,15 @@ products:
   - environment: [lts]
     scope: [mr]
     time_limit: [1800]
+    n_repeat: [5]
     test_case:
     - t5_220m_mr_mcore_tp4_pp1_dgx_a100_1N8G
     - t5_220m_mr_mcore_tp2_pp2_dgx_a100_1N8G
     - t5_220m_mr_mcore_te_tp2_pp2_dgx_a100_1N8G
   - environment: [lts]
     scope: [weekly]
     time_limit: [9000]
+    n_repeat: [1]
     test_case:
     - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp1_pp1_vp1_resume_torch
     - t5_220m_weekly_dgx_a100_1N8G_mcore_te_tp2_pp1_vp1

diff --git a/tests/functional_tests/python_test_utils/jet/common.py b/tests/functional_tests/python_test_utils/jet/common.py
@@ -134,6 +134,8 @@ def filter_by_model(
 
 def load_workloads(
     container_tag: str,
+    n_repeat: int = 1,
+    time_limit: int = 1800,
     environment: Optional[str] = None,
     scope: Optional[str] = None,
     model: Optional[str] = None,
@@ -171,4 +173,6 @@ def load_workloads(
                 container_image = container_image or build_workload.spec.source.image
                 build_workload.spec.source.image = f"{container_image}:{container_tag}"
                 workloads.append(build_workload)
+        workload.spec.n_repeat = n_repeat
+        workload.spec.time_limit = time_limit
     return workloads
diff --git a/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py b/tests/functional_tests/python_test_utils/jet/generate_jet_trigger_job.py
@@ -12,6 +12,8 @@
 @click.command()
 @click.option("--scope", required=True, type=str, help="Test scope")
 @click.option("--environment", required=True, type=str, help="LTS or dev features")
+@click.option("--n-repeat", required=False, default=1, type=int, help="Number of repetitions per test")
+@click.option("--time-limit", required=False, default=1800, type=int, help="Timeout in seconds per test")
 @click.option("--a100-cluster", required=True, type=str, help="A100 Cluster to run on")
 @click.option("--h100-cluster", required=True, type=str, help="H100 Cluster to run on")
 @click.option("--output-path", required=True, type=str, help="Path to write GitLab job to")
@@ -29,6 +31,8 @@
 def main(
     scope: str,
     environment: str,
+    n_repeat: int,
+    time_limit: int,
     a100_cluster: str,
     h100_cluster: str,
     output_path: str,
@@ -63,6 +67,8 @@ def main(
             "python tests/functional_tests/python_test_utils/jet/launch_jet_workload.py",
             f"--model {test_case.spec.model}",
             f"--environment {test_case.spec.environment}",
+            f"--n-repeat {n_repeat}",
+            f"--time-limit {time_limit}",
             f"--test-case {test_case.spec.test_case}",
             f"--container-tag {container_tag}",
             f"--cluster {cluster}",

diff --git a/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py b/tests/functional_tests/python_test_utils/jet/launch_jet_workload.py
@@ -42,6 +42,8 @@ def sigterm_handler(_signo, _stack_frame):
 def launch_and_wait_for_completion(
     test_case: str,
     environment: str,
+    n_repeat: int,
+    time_limit: int,
     container_image: str,
     container_tag: str,
     cluster: str,
@@ -54,6 +56,8 @@ def launch_and_wait_for_completion(
     ).workloads.submit(
         workloads=common.load_workloads(
             test_case=test_case,
+            n_repeat=n_repeat,
+            time_limit=time_limit,
             container_image=container_image,
             container_tag=container_tag,
             environment=environment,
@@ -142,6 +146,8 @@ def parse_finished_training(logs: List[str]) -> Optional[bool]:
 @click.option(
     "--environment", required=True, type=click.Choice(['dev', 'lts']), help="Pytorch LTS or DEV"
 )
+@click.option("--n-repeat", required=False, default=1, type=int)
+@click.option("--time-limit", required=False, default=1800, type=int)
 @click.option(
     "--account",
     required=False,
@@ -165,6 +171,8 @@ def main(
     model: str,
     test_case: str,
     environment: str,
+    n_repeat: int,
+    time_limit: int,
     account: str,
     cluster: str,
     container_tag: str,
@@ -195,6 +203,8 @@ def main(
         pipeline = launch_and_wait_for_completion(
             test_case=test_case,
             environment=environment,
+            n_repeat=n_repeat,
+            time_limit=time_limit,
             container_image=container_image,
             container_tag=container_tag,
             cluster=cluster,

diff --git a/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...al_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml b/...al_tests/test_cases/bert/bert_mr_mcore_tp2_pp2_local_spec_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...s/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml b/...s/test_cases/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...s/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml b/...s/bert/bert_mr_mcore_tp2_pp2_resume_torch_dist_local_spec_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...al_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/...al_tests/test_cases/bert/bert_mr_tp1_pp4_vp2_resume_torch_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_mr_tp2_pp2_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...tional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml b/...tional_tests/test_cases/bert/bert_mr_tp2_pp2_resume_torch_dgx_a100_1N8G/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024

diff --git a/...nctional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml b/...nctional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp2/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024
@@ -43,4 +42,4 @@ MODEL_ARGS:
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --ckpt-format: torch
-TEST_TYPE: regular
+TEST_TYPE: regular
diff --git a/...onal_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml b/...onal_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp1_pp4_vp2/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024
@@ -44,4 +43,4 @@ MODEL_ARGS:
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --ckpt-format: torch
-TEST_TYPE: regular
+TEST_TYPE: regular
diff --git a/...nctional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml b/...nctional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_mcore_tp4_pp1/model_config.yaml
@@ -5,7 +5,6 @@ ENV_VARS:
   NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024
@@ -43,4 +42,4 @@ MODEL_ARGS:
   --data-cache-path: ${DATA_CACHE_PATH}
   --bf16: true
   --ckpt-format: torch
-TEST_TYPE: regular
+TEST_TYPE: regular
diff --git a/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml b/tests/functional_tests/test_cases/bert/bert_nightly_dgx_a100_1N8G_tp1_pp2/model_config.yaml
@@ -6,7 +6,6 @@ ENV_VARS:
   NCCL_ALGO: Tree
   CUBLAS_WORKSPACE_CONFIG: :4096:8
   NVTE_APPLY_QK_LAYER_SCALING: 1
-  N_REPEATS: 5
 MODEL_ARGS:
   --num-layers: 24
   --hidden-size: 1024
@@ -46,4 +45,4 @@ MODEL_ARGS:
   --fp16: true
   --apply-query-key-layer-scaling: true
   --ckpt-format: torch
-TEST_TYPE: regular
+TEST_TYPE: regular