Skip to content

Commit

Permalink
ADLR/megatron-lm!1924 - ci: Converge tests and release
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Aug 30, 2024
1 parent 86e2927 commit 455e914
Show file tree
Hide file tree
Showing 18 changed files with 359 additions and 108 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ build
slurm*
logs
.vscode
local/
local/
.gitmodules
9 changes: 9 additions & 0 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ variables:
- "yes"
- "no"
description: To run a convergence test
CONVERGENCE_TEST_SCOPE:
value: "release"
options:
- "release"
- "pre-release"
description: "Test suite to run (only for CONVERGENCE_TEST=yes)"
CONVERGENCE_TEST_RUN_NAME:
value: "pre-release-$$CI_PIPELINE_ID"
description: "Run directory of convergence test"
PUBLISH:
value: "no"
options:
Expand Down
5 changes: 1 addition & 4 deletions .gitlab/stages/01.tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ build_image:
script:
- |
set -x
env
eval "IMAGE=\$$IMAGE"
docker system prune -a --filter "until=24h" -f || true
Expand Down Expand Up @@ -63,10 +64,6 @@ build_image:
docker push ${IMAGE}:buildcache
fi
if [[ $CI_COMMIT_BRANCH == core_r* ]]; then
docker tag ${IMAGE}:${CI_PIPELINE_ID} ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
docker push ${IMAGE}:v${CI_COMMIT_BRANCH#core_r}-${CI_PIPELINE_ID}
fi
retry:
max: 2

Expand Down
94 changes: 65 additions & 29 deletions .gitlab/stages/03.convergence-tests.yml
Original file line number Diff line number Diff line change
@@ -1,50 +1,86 @@
convergence-test:
release-test:
rules:
- if: $CONVERGENCE_TEST == "yes"
- when: never
- if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "release"
stage: convergence_tests
needs: [build_image]
tags:
- ${TAG}
timeout: 7d
parallel:
matrix:
- SETTINGS: RELEASE_BERT
TAG: mcore-ssh-node-A
- SETTINGS: RELEASE_GPT
- MODEL: bert
VARIANT: bert_release
TAG: mcore-ssh-node-B
- SETTINGS: RELEASE_MOE
- MODEL: gpt
VARIANT: gpt3_15b_8t_release
TAG: mcore-ssh-node-B
- MODEL: mixtral
VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release
TAG: mcore-ssh-node-B
before_script: |
python -m venv local/venv
source local/venv/bin/activate
pip install jet-api --upgrade $JET_INDEX_URLS
script:
- |
env
set -x
export MCORE_RELEASE_NUM=${CI_COMMIT_BRANCH#core_r}
export IMAGE_TAG=v${MCORE_RELEASE_NUM}-${CI_PIPELINE_ID}
export WANDB_API_KEY=${WANDB_API_KEY}
export GITLAB_TOKEN=${PAT}
# Take the release number from megatron.core.__version__.
MCORE_RELEASE_NUM=$(python -c "from megatron import core; print(core.__version__)")
export IMAGE_TAG=v$MCORE_RELEASE_NUM-${CI_PIPELINE_ID}
export RUN_NAME=release-testing/mcore-v$MCORE_RELEASE_NUM/$MODEL/$VARIANT
# Braces are required here: `_` is a valid identifier character, so unbraced
# `$MCORE_RELEASE_NUM_` and `$MODEL_` would name different (unset) variables
# and the experiment name would collapse to `v$VARIANT`.
export WANDB_EXPERIMENT=v${MCORE_RELEASE_NUM}_${MODEL}_${VARIANT}
# Mark the existing CI variable for export so child processes see it.
export WANDB_API_KEY
bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh
artifacts:
paths:
- ./golden_values.json

pre-release-test:
rules:
- if: $CONVERGENCE_TEST == "yes" && $CONVERGENCE_TEST_SCOPE == "pre-release"
stage: convergence_tests
needs: [build_image]
tags:
- ${TAG}
timeout: 7d
parallel:
matrix:
- MODEL: bert
VARIANT: bert_release
TAG: mcore-ssh-node-B
- MODEL: gpt
VARIANT: gpt3_15b_8t_release_sm
TAG: mcore-ssh-node-B
- MODEL: mixtral
VARIANT: mixtral_8x7b_alltoall_tp2pp4ep4_release_sm
TAG: mcore-ssh-node-B
variables:
GIT_SUBMODULE_STRATEGY: normal
before_script:
- python -m venv local/venv
- source local/venv/bin/activate
- pip install jet-api --upgrade $JET_INDEX_URLS
script:
- |
env
set -x
export IMAGE_TAG=${CI_PIPELINE_ID}
export WANDB_API_KEY
CONVERGENCE_TEST_RUN_NAME=$(eval echo $CONVERGENCE_TEST_RUN_NAME)
SETTINGS_ID=$(curl \
--request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets" \
--header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE" \
| jq --arg TITLE "$SETTINGS" '
.[]
| select(.title == $TITLE)
| .id
' \
| tr -d '"')
SETTINGS=$(curl \
--request GET "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/snippets/${DATA_BLEND_ID}/raw" \
--header "PRIVATE-TOKEN: $PROJECT_ACCESS_TOKEN_MCORE"
)
echo "$SETTINGS" > settings.txt
source settings.sh
if [[ -z $CONVERGENCE_TEST_RUN_NAME ]]; then
# Abort instead of continuing: an empty run name would make RUN_NAME start
# with "/" and every model/variant would share the same run directory.
echo Please assign a CONVERGENCE_TEST_RUN_NAME
exit 1
fi
yq '.MODEL_ARGS."--data-path" = env(DATA_PATH)' -i $TRAINING_PARAMS_PATH
export RUN_NAME=$CONVERGENCE_TEST_RUN_NAME/$MODEL/$VARIANT
# Braces are required here: `_` is a valid identifier character, so unbraced
# `$CONVERGENCE_TEST_RUN_NAME_` and `$MODEL_` would name different (unset)
# variables and the experiment name would collapse to `$VARIANT`.
export WANDB_EXPERIMENT=${CONVERGENCE_TEST_RUN_NAME}_${MODEL}_${VARIANT}
env
bash tests/functional_tests/shell_test_utils/run_ci_test_locally.sh
bash ./tests/functional_tests/local_recipes/$MODEL/$VARIANT.sh
artifacts:
paths:
- ./golden_values.json
2 changes: 1 addition & 1 deletion .gitlab/stages/04.publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ create-gh-release:
entrypoint: [""]
script:
- |
RELEASE_NUMBER=${CI_COMMIT_BRANCH#core_r}
RELEASE_NUMBER=$(python -c "from megatron import core; print(core.__version__)")
NAME="NVIDIA Megatron Core $RELEASE_NUMBER"
# Print the CHANGELOG.md section whose "## " heading starts with $NAME, up to
# the next "## " heading. The name is passed with -v rather than spliced into
# the program text: $NAME contains spaces, so an unquoted splice word-splits
# and hands awk a truncated program. index() also matches the version's dots
# literally instead of as regex wildcards.
CHANGELOG=$(awk -v name="$NAME" 'index($0, "## " name) == 1 {flag=1; next} /^## /{flag=0} flag' CHANGELOG.md)
CHANGELOG=$(echo "$CHANGELOG" | sed '/./!d')
Expand Down
3 changes: 1 addition & 2 deletions tests/functional_tests/jet_recipes/bert.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ spec:
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_bert.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
"TEST_CASE_PATH=./tests/functional_tests/test_cases/{model}/{test_case}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
Expand Down
3 changes: 1 addition & 2 deletions tests/functional_tests/jet_recipes/gpt-nemo.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ spec:
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=/opt/NeMo/examples/nlp/language_modeling/megatron_gpt_pretraining.py"
"TRAINING_PARAMS_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
"TEST_CASE_PATH=/opt/megatron-lm/tests/functional_tests/test_cases/{model}/{test_case}"
)
bash /opt/megatron-lm/tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
Expand Down
3 changes: 1 addition & 2 deletions tests/functional_tests/jet_recipes/gpt.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ spec:
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_gpt.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
"TEST_CASE_PATH=./tests/functional_tests/test_cases/{model}/{test_case}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
Expand Down
3 changes: 1 addition & 2 deletions tests/functional_tests/jet_recipes/multimodal-llava.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ spec:
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_vlm.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
"TEST_CASE_PATH=./tests/functional_tests/test_cases/{model}/{test_case}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
Expand Down
3 changes: 1 addition & 2 deletions tests/functional_tests/jet_recipes/t5.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,7 @@ spec:
"TENSORBOARD_PATH={assets_dir}/tensorboard"
"CHECKPOINT_PATH=/workspace/checkpoints"
"TRAINING_SCRIPT_PATH=pretrain_t5.py"
"TRAINING_PARAMS_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/model_config.yaml"
"GOLDEN_VALUES_PATH=./tests/functional_tests/test_cases/{model}/{test_case}/golden_values.json"
"TEST_CASE_PATH=./tests/functional_tests/test_cases/{model}/{test_case}"
)
bash ./tests/functional_tests/shell_test_utils/run_ci_test.sh ${{ARGUMENTS[@]}}
Expand Down
27 changes: 15 additions & 12 deletions tests/functional_tests/shell_test_utils/_run_training.sh
Original file line number Diff line number Diff line change
Expand Up @@ -47,12 +47,27 @@ if [[ "$SCRIPT" != null ]]; then
eval "$SCRIPT"
fi;

# Pull env vars to export
ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH)
for ARGUMENT in $ENV_VARS; do
KEY=$(echo $ARGUMENT | cut -f1 -d=)

KEY_LENGTH=${#KEY}
VALUE="${ARGUMENT:$KEY_LENGTH+1}"

export "$KEY"="$VALUE"
echo "$KEY=$VALUE"
done

# Exit earlier to leave time for properly saving checkpoint
if [[ $(echo "$TRAINING_SCRIPT_PATH" | tr '[:upper:]' '[:lower:]') == *nemo* ]]; then
PARAMS=""
TRAINING_PARAMS_FROM_CONFIG=$(yq '... comments="" | .MODEL_ARGS | to_entries | .[] | with(select(.value == "true"); .value = "") | [.key + "=" + .value] | join("")' $TRAINING_PARAMS_PATH | tr '\n' ' ')

else
# If this is a second run (of checkpoint-resume), we might want to use a
# different model configuration than during first time. So if key `MODEL_ARGS_2`
# exists we use it, otherwise we use the same as for the first run.
if [[ $RUN_NUMBER -eq 2 && $(yq 'has("MODEL_ARGS_2")' $TRAINING_PARAMS_PATH) == true ]]; then
export KEY="MODEL_ARGS_2"
else
Expand All @@ -66,18 +81,6 @@ fi
# Extract training params
PARAMS="$PARAMS $TRAINING_PARAMS_FROM_CONFIG"

# Pull env vars to export
ENV_VARS=$(yq '... comments="" | .ENV_VARS | to_entries | .[] | [.key + "=" + .value] | join(" ")' $TRAINING_PARAMS_PATH)
for ARGUMENT in $ENV_VARS; do
KEY=$(echo $ARGUMENT | cut -f1 -d=)

KEY_LENGTH=${#KEY}
VALUE="${ARGUMENT:$KEY_LENGTH+1}"

export "$KEY"="$VALUE"
echo "$KEY=$VALUE"
done

# Set PYTHONPATH
export PYTHONPATH="$(pwd):${PYTHONPATH:-}"
export WANDB_API_KEY="${WANDB_API_KEY:-}"
Expand Down
11 changes: 8 additions & 3 deletions tests/functional_tests/shell_test_utils/run_ci_test.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/bin/bash

set -euxo pipefail
set -exo pipefail

echo "------ARGUMENTS LIST --------"
for ARGUMENT in "$@"; do
Expand All @@ -17,7 +17,7 @@ echo "---------------------------------"
# Check that mandatory vars are set
MANDATORY_VARS=(
"TRAINING_SCRIPT_PATH"
"TRAINING_PARAMS_PATH"
"TEST_CASE_PATH"
"OUTPUT_PATH"
"TENSORBOARD_PATH"
"CHECKPOINT_PATH"
Expand All @@ -31,6 +31,9 @@ for mandatory_var in "${MANDATORY_VARS[@]}"; do
fi
done

export TRAINING_PARAMS_PATH=$TEST_CASE_PATH/model_config.yaml
export GOLDEN_VALUES_PATH=$TEST_CASE_PATH/golden_values.json

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
ROOT_DIR=$(realpath $SCRIPT_DIR/../../../)

Expand All @@ -46,7 +49,9 @@ N_REPEATS=$(cat $TRAINING_PARAMS_PATH \

for i in $(seq 1 $N_REPEATS);
do
rm -rf $CHECKPOINT_PATH/*
# Only wipe checkpoints between repeats; the first iteration keeps whatever
# is already in the checkpoint directory.
if [[ $i -gt 1 ]]; then
# `:?` aborts if CHECKPOINT_PATH is unset or empty, so this can never expand
# to `rm -rf /*`; the quotes keep paths with spaces intact.
rm -rf "${CHECKPOINT_PATH:?}"/*
fi

# Training
export RUN_NUMBER=1
Expand Down
Loading

0 comments on commit 455e914

Please sign in to comment.