Skip to content

Commit

Permalink
ADLR/megatron-lm!1902 - ci: Cleanup jobs
Browse files Browse the repository at this point in the history
  • Loading branch information
ko3n1g committed Aug 9, 2024
1 parent db0e623 commit db5c60a
Showing 18 changed files with 39 additions and 31 deletions.
30 changes: 14 additions & 16 deletions .gitlab-ci.yml
Original file line number Diff line number Diff line change
@@ -217,9 +217,8 @@ label_merge_request:
- |
source labels
curl --header "PRIVATE-TOKEN: ${PROJECT_ACCESS_TOKEN_MCORE}" --url "https://${GITLAB_ENDPOINT}/api/v4/projects/${CI_PROJECT_ID}/merge_requests/${CI_MERGE_REQUEST_IID}" --data-urlencode "add_labels=$LABELS" -X PUT
only:
refs:
- merge_requests
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'

check_milestone:
stage: .pre
@@ -235,6 +234,8 @@ check_milestone:
echo Please assign a Milestone to this MR!
exit 1
fi
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'

build_image:
tags:
@@ -311,7 +312,7 @@ unit_tests:
parallel:
matrix:
- TAG: latest
- TAG: 9229390b3ef365694d323b0cd8d5e86f86268b05
- TAG: a2628239fc6427a9b5238a0bc46d24a259e7c5b8
tags:
- 8xL40S
rules:
@@ -390,26 +391,23 @@ copyright:
- when: always
interruptible: true

secret_detection_check:
extends: secret_detection # Is from the template - Secret-Detection.gitlab-ci.yml
secret_detection:
stage: test
variables:
GIT_DEPTH: 0
SECRET_DETECTION_LOG_OPTIONS: ${CI_MERGE_REQUEST_DIFF_BASE_SHA}..${CI_COMMIT_SHA}
tags:
- mcore-docker-node-small
rules: # This is required because the template sets rules do not work for us.
- when: always
before_script: # JQ to parse the parse JSON report generated
- apk add jq
allow_failure: false
rules:
- if: '$CI_PIPELINE_SOURCE == "merge_request_event"'
script:
- !reference [secret_detection, script] # Source the script from the template
- echo "Secret detection Report can be downloaded from the Merge Request"
- echo -e "\n\n\n\n\n############# Printing Secret Detection Report#####################################################"
- echo -e "#############Looks for the vulnerabilities JSON section##################################################### \n\n\n\n\n"
- cat gl-secret-detection-report.json | jq '.'
# Parse to find vulnerabilities JSON key
- apk add jq
- /analyzer run
- |
if [[ $(cat gl-secret-detection-report.json | jq '.vulnerabilities | length > 0') == true ]]; then
echo "Atleast one vulnerability has been found"
cat gl-secret-detection-report.json | jq '.'
exit 1
fi
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -22,3 +22,4 @@ skip_string_normalization = true
# recongized by future versions, disallows to reformat code with incompatible versions
# Matches NeMO version so people working on both codebases don't need two different version of black installed
required_version = "24"
skip_magic_trailing_comma = true
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
NVTE_APPLY_QK_LAYER_SCALING: 1
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -3,6 +3,7 @@ ENV_VARS:
NVTE_ALLOW_NONDETERMINISTIC_ALGO: 0
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
NVTE_APPLY_QK_LAYER_SCALING: 1
MODEL_ARGS:
--num-layers: 24
--hidden-size: 1024
Original file line number Diff line number Diff line change
@@ -47,6 +47,6 @@ MODEL_ARGS:
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--bf16: true
--apply-query-key-layer-scaling: true
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ MODEL_ARGS:
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 2:
--pipeline-model-parallel-size: 2
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ MODEL_ARGS:
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 2:
--pipeline-model-parallel-size: 2
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-checkpoint-opt_param-scheduler: true
Original file line number Diff line number Diff line change
@@ -38,12 +38,12 @@ MODEL_ARGS:
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 4:
--pipeline-model-parallel-size: 4
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--bf16: true
--apply-query-key-layer-scaling: true
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ MODEL_ARGS:
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 1
--pipeline-model-parallel-size: 4:
--pipeline-model-parallel-size: 4
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-checkpoint-opt_param-scheduler: true
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"lm loss": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [10.81916, 10.86661, 10.85683, 10.80678, 10.7112, 10.63712, 10.16253, 10.27882, 10.18795, 9.88907]}, "num-zeros": {"start_step": 0, "end_step": 50, "step_interval": 5, "values": [12923.0, 15794.0, 16416.0, 15771.0, 14114.0, 15096.0, 12918.0, 15842.0, 16657.0, 17467.0]}, "iteration_timing_avg": 0.340485}
Original file line number Diff line number Diff line change
@@ -48,6 +48,6 @@ MODEL_ARGS:
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--bf16: true
--apply-query-key-layer-scaling: true
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -44,6 +44,6 @@ MODEL_ARGS:
--use-mcore-models: true
--ckpt-format: torch_dist
--data-cache-path: ${DATA_CACHE_PATH}
--fp16: true
--bf16: true
--apply-query-key-layer-scaling: true
TEST_TYPE: regular
Original file line number Diff line number Diff line change
@@ -38,7 +38,7 @@ MODEL_ARGS:
--eval-iters: 10
--transformer-impl: local
--tensor-model-parallel-size: 4
--pipeline-model-parallel-size: 1:
--pipeline-model-parallel-size: 1
--deterministic-mode: true
--no-gradient-accumulation-fusion: true
--use-checkpoint-opt_param-scheduler: true
Original file line number Diff line number Diff line change
@@ -4,6 +4,9 @@ ENV_VARS:
NCCL_ALGO: Tree
CUBLAS_WORKSPACE_CONFIG: :4096:8
SKIP_PYTEST: 1
BEFORE_SCRIPT:
pip uninstall -y transformer_engine
pip uninstall -y Apex ## TODO: remove once Apex dependency has been removed completely
MODEL_ARGS:
--num-layers: 12
--hidden-size: 512
2 changes: 2 additions & 0 deletions tests/unit_tests/data/test_gpt_dataset.py
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@
import random

import numpy
import pytest
import torch

from megatron.core.datasets.blended_megatron_dataset_builder import BlendedMegatronDatasetBuilder
@@ -25,6 +26,7 @@ def sample_N(dataset, N, randomize):
return samples


@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
def test_mock_gpt_dataset():
if torch.distributed.is_available():
Utils.initialize_distributed()
10 changes: 4 additions & 6 deletions tests/unit_tests/dist_checkpointing/test_fully_parallel.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved.
from pathlib import Path
from typing import Dict

import numpy as np
import pytest
import torch

@@ -22,7 +20,6 @@
FullyParallelLoadStrategyWrapper,
FullyParallelSaveStrategyWrapper,
_sharded_tensor_shard_id,
_ShardId,
)
from tests.unit_tests.dist_checkpointing import TempNamedDir
from tests.unit_tests.test_utilities import Utils
@@ -191,7 +188,7 @@ def test_save_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):
)
assert expected_key_to_saving_ranks == key_to_saving_rank

for k, sh_ten in state_dict.items():
for _, sh_ten in state_dict.items():
if (
_sharded_tensor_shard_id(sh_ten)
in save_strategy.cached_distribution.shards_in_this_group
@@ -231,7 +228,8 @@ def test_load_distribution(self, parallelization_along_dp, tmp_path_dist_ckpt):
'keyE': [6], # second largest tensor
}
else:
# When loading, expected key distribution is the same across TP, because every replica needs to be loaded
# When loading, expected key distribution is the same across TP, because every replica
# needs to be loaded
expected_key_to_saving_ranks = {
# everyone must load (disjoint shards, coverage == 1):
'keyB': list(
@@ -312,7 +310,7 @@ def _get_empty_tensor_for_exchange(self, *args, **kwargs) -> torch.Tensor:
mem_alloc_start = torch.cuda.memory_allocated()

with TempNamedDir(tmp_path_dist_ckpt / 'mock_dir') as ckpt_dir_A:
loaded_state_dict = load_strategy.load(sharded_state_dict, ckpt_dir_A)
_ = load_strategy.load(sharded_state_dict, ckpt_dir_A)

# Each rank is expected to do 7 * 10 empty allocations
assert len(mem_alloc) == 7 * 10
1 change: 1 addition & 0 deletions tests/unit_tests/dist_checkpointing/test_nonpersistent.py
Original file line number Diff line number Diff line change
@@ -29,6 +29,7 @@ def teardown_method(self, method):
Utils.destroy_model_parallel()

@pytest.mark.parametrize(('tp,pp'), [(2, 4)])
@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
def test_basic_save_load_scenarios(self, tmp_path_dist_ckpt, tp, pp):
Utils.initialize_model_parallel(tp, pp)
num_floating_point_operations_so_far = 0
2 changes: 2 additions & 0 deletions tests/unit_tests/dist_checkpointing/test_optimizer.py
Original file line number Diff line number Diff line change
@@ -255,6 +255,7 @@ def test_dp_sharding(self, tmp_path_dist_ckpt, tp_pp, src_dp, dest_dp, use_fpsl,
('src_tp_pp', 'dest_tp_pp', 'use_glu'),
[((2, 2), (2, 4), False), ((1, 8), (4, 1), True), ((2, 4), (4, 2), False)],
)
@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
def test_finetune_doesnt_load_optimizer(
self, tmp_path_dist_ckpt, src_tp_pp, dest_tp_pp, use_glu
):
@@ -327,6 +328,7 @@ def test_finetune_doesnt_load_optimizer(
assert not diffs[0] and not diffs[1] and diffs[2]
assert not any(diff(optimizer.state_dict(), optim_unloaded_state_dict))

@pytest.mark.skip(reason="Tests are flaky and need to be debugged")
def test_can_load_deprecated_bucket_space_format(self, tmp_path_dist_ckpt):
# sync=True to make sure other ranks wait for rank 0 to finish creating directory.
tp = 4

0 comments on commit db5c60a

Please sign in to comment.