Skip to content

Commit

Permalink
CI: D2L Infra 2.0 with Github Actions #2481 [skip ci] (#2481)
Browse files Browse the repository at this point in the history
* MXNet: Fix pandas 2.0 needs kwarg for sep arg

* PyTorch: Refactor deprecated torch.eig -> torch.linalg.eig

* PyTorch: torch>2.0 deprecations force using indexing arg in torch.meshgrid

* Bug: Pandas 2.0 errors out for all inputs.values returning object array

* CI: Add container entrypoint d2l_job.sh script

* CI: Add Docker Images & Docker Build Workflow

* CI: Add actions, workflows and workflow scripts

* CI: Remove Jenkins

* remove torch specific packages as hard requirements

* CI: Streamline git setup

* Fix modified files timesync based caching on PRs

* CI: Double down on security by splitting push and release permissions
  • Loading branch information
AnirudhDagar authored May 5, 2023
1 parent 3864d18 commit b21c4d3
Show file tree
Hide file tree
Showing 28 changed files with 1,285 additions and 119 deletions.
59 changes: 59 additions & 0 deletions .github/actions/setup_env_vars/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: "Setup Env Vars"
# Composite action that derives common CI variables (SHORT_SHA, PR_NUMBER,
# TARGET_BRANCH, REPO_NAME, LANG, TASK, JOB_NAME) and exports them via
# $GITHUB_ENV for later workflow steps.

runs:
  using: "composite"
  steps:
    - name: Get Commit SHA (For Push Event)
      if: ${{ github.event_name == 'push' }}
      shell: bash
      # Context values are passed through `env` so the shell script only
      # expands plain environment variables (script-injection hardening).
      env:
        SHA: ${{ github.sha }}
        git_repo_full: ${{ github.repository }}
        pr_number: ""
      run: |
        short_sha=$(git rev-parse --short "$SHA")
        echo "SHORT_SHA=$short_sha" >> $GITHUB_ENV
        echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
        # For a push event the target branch is the pushed ref's last segment
        target_branch=${GITHUB_REF##*/}
        echo "TARGET_BRANCH=$target_branch" >> $GITHUB_ENV
        repo_name=${git_repo_full##*/}
        echo "REPO_NAME=$repo_name" >> $GITHUB_ENV
        # Repo names look like "d2l-<lang>" (e.g. "d2l-en" -> LANG=en)
        IFS='-' read -ra name_parts <<< "$repo_name"
        echo "LANG=${name_parts[1]}" >> $GITHUB_ENV
        task="${repo_name}-${target_branch}"
        echo "TASK=$task" >> $GITHUB_ENV
        job_name=${repo_name}/${target_branch}
        echo "JOB_NAME=$job_name" >> $GITHUB_ENV
    - name: Get Commit SHA (For Pull Request)
      if: ${{ github.event_name == 'pull_request_target' }}
      shell: bash
      env:
        SHA: ${{ github.event.pull_request.head.sha }}
        target_branch: ${{ github.event.pull_request.base.ref }}
        git_repo_full: ${{ github.event.pull_request.base.repo.full_name }}
        pr_number: PR-${{ github.event.number }}
      run: |
        short_sha=$(git rev-parse --short "$SHA")
        echo "SHORT_SHA=$short_sha" >> $GITHUB_ENV
        echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
        echo "TARGET_BRANCH=$target_branch" >> $GITHUB_ENV
        repo_name=${git_repo_full##*/}
        echo "REPO_NAME=$repo_name" >> $GITHUB_ENV
        IFS='-' read -ra name_parts <<< "$repo_name"
        echo "LANG=${name_parts[1]}" >> $GITHUB_ENV
        task="${repo_name}-${target_branch}"
        echo "TASK=$task" >> $GITHUB_ENV
        # Reuse $pr_number ("PR-<number>", defined in env above) instead of
        # re-interpolating the GitHub expression inside the script.
        job_name=${repo_name}/${pr_number}/${short_sha}
        echo "JOB_NAME=$job_name" >> $GITHUB_ENV
91 changes: 91 additions & 0 deletions .github/actions/submit-job/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
name: "Submit Job to AWS Batch"
# Composite action that submits a CI job to AWS Batch via ci/submit-job.py.
# Job-def permission level is selected by suffixing the job type
# ("-push" / "-release") and, for PRs, by whether the actor has write access.
inputs:
  job-type:
    required: true
  job-name:
    required: true
  work-dir:
    required: false
    default: .
  command:
    required: true

runs:
  using: "composite"
  steps:
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v2
      with:
        role-to-assume: arn:aws:iam::650140442593:role/D2L_CI_Batch
        role-duration-seconds: 14400  # this requires changing max session duration to 4hrs in AWS Console for D2L_CI_Batch
        aws-region: us-west-2

    - name: Install dependencies
      shell: bash
      run: |
        pip install boto3
    # Determine whether the triggering actor has write permission; used below
    # to decide if a PR's scripts may run with the --safe-to-use-script flag.
    - name: Check for Actor Permissions
      id: check
      continue-on-error: true
      uses: prince-chrismc/check-actor-permissions-action@v2
      with:
        github_token: ${{ github.token }}
        permission: write

    # NOTE: inputs (notably `command`) are passed via `env` rather than being
    # interpolated directly into the run script — direct interpolation is a
    # script-injection vector, especially under pull_request_target.
    - name: Submit Job (For Push on development branches)
      if: ${{ github.event_name == 'push' && github.ref != 'refs/heads/release' && github.ref != 'refs/heads/classic' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
      run: |
        echo "Start submitting job for a Push Event on a Development Branch"
        # Add "-push" for all these jobs to use elevated push level job-def permissions
        python ./ci/submit-job.py --job-type "${JOB_TYPE}-push" \
          --name "${JOB_NAME}-${GITHUB_REF}" \
          --source-ref "${GITHUB_REF}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${GITHUB_REPOSITORY}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    - name: Submit Job (For Push on Release/Classic)
      if: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/release' || github.ref == 'refs/heads/classic') }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
      run: |
        echo "Start submitting job for a Push Event on Release/Classic Branch"
        # Add "-release" for all these jobs to use elevated release level job-def permissions
        python ./ci/submit-job.py --job-type "${JOB_TYPE}-release" \
          --name "${JOB_NAME}-${GITHUB_REF}" \
          --source-ref "${GITHUB_REF}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${GITHUB_REPOSITORY}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    - name: Submit Job (For Pull Request Safe Scripts)
      if: ${{ github.event_name == 'pull_request_target' && steps.check.outputs.permitted == 'true' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
        PR_NUM: ${{ github.event.number }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
      run: |
        echo "Start submitting job"
        python ./ci/submit-job.py --job-type "${JOB_TYPE}" \
          --name "${JOB_NAME}-PR#${PR_NUM}" \
          --source-ref "${HEAD_SHA}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${HEAD_REPO}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    # Same as above, but without --safe-to-use-script: the PR author lacks
    # write permission, so their scripts run with reduced trust.
    - name: Submit Job (For Pull Request Not Safe Scripts)
      if: ${{ github.event_name == 'pull_request_target' && steps.check.outputs.permitted != 'true' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
        PR_NUM: ${{ github.event.number }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
      run: |
        echo "Start submitting job"
        python ./ci/submit-job.py --job-type "${JOB_TYPE}" \
          --name "${JOB_NAME}-PR#${PR_NUM}" \
          --source-ref "${HEAD_SHA}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${HEAD_REPO}" \
          --command "${COMMAND}" \
          --wait
47 changes: 47 additions & 0 deletions .github/workflow_scripts/build_and_deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

# Build the D2L website (HTML + PDFs) and deploy it, using an S3-backed
# build cache keyed on "<repo>-<branch>". Release/classic jobs deploy to
# the public buckets; everything else goes to the preview bucket.

set -ex

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master
JOB_NAME="$3"      # Eg. 'd2l-en/master' or 'd2l-en/PR-2453/21be1a4'
LANG="$4"          # Eg. 'en','zh' etc.

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Restore the build cache; variables are quoted so repo/branch/job names
# cannot be word-split or glob-expanded
aws s3 sync "s3://preview.d2l.ai/ci_cache/${REPO_NAME}-${TARGET_BRANCH}/_build" _build --delete --quiet --exclude 'eval*/data/*'

# Build D2L Website
./.github/workflow_scripts/build_html.sh "$TARGET_BRANCH" "$JOB_NAME"

# Build PDFs
d2lbook build pdf
d2lbook build pdf --tab mxnet


# Check if the JOB_NAME is either "$REPO_NAME/release" or "$REPO_NAME/classic"
if [[ "$JOB_NAME" == "$REPO_NAME/release" || "$JOB_NAME" == "$REPO_NAME/classic" ]]; then

    # Setup D2L Bot so the release deployment can push as d2l-bot
    source "$(dirname "$0")/setup_git.sh"
    setup_git

    # Run d2lbook release deployment
    if [[ "$JOB_NAME" == *"/classic" ]]; then
        # Use classic s3 bucket for classic release
        LANG="classic"
    fi
    d2lbook build pkg
    d2lbook deploy html pdf pkg colab sagemaker slides --s3 "s3://${LANG}.d2l.ai/"

else
    # Run d2lbook preview deployment
    d2lbook deploy html pdf --s3 "s3://preview.d2l.ai/${JOB_NAME}/"
fi

# Store the (possibly updated) build cache back to S3
aws s3 sync _build "s3://preview.d2l.ai/ci_cache/${REPO_NAME}-${TARGET_BRANCH}/_build" --acl public-read --quiet --exclude 'eval*/data/*'
File renamed without changes.
34 changes: 34 additions & 0 deletions .github/workflow_scripts/build_jax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Evaluate the JAX notebooks with an S3-backed build cache. The eval step is
# allowed to fail so partial results are still cached; the script then exits
# non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the JAX eval cache from S3
echo "Retrieving jax build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_jax/ _build/eval_jax/ --delete --quiet --exclude 'data/*'

export XLA_PYTHON_CLIENT_MEM_FRACTION=.70
export TF_CPP_MIN_LOG_LEVEL=3
export TF_FORCE_GPU_ALLOW_GROWTH=true
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab jax || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload jax build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
38 changes: 38 additions & 0 deletions .github/workflow_scripts/build_mxnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

# Evaluate the MXNet notebooks with an S3-backed build cache. The eval step is
# allowed to fail so partial results are still cached; the script then exits
# non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the MXNet eval cache from S3
echo "Retrieving mxnet build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_mxnet/ _build/eval_mxnet/ --delete --quiet --exclude 'data/*'

# MXNet training for the following notebooks is slow in the container;
# Setting NTHREADS=4 below seems to fix the issue:
# 1. chapter_multilayer-perceptrons/dropout.md
# 2. chapter_multilayer-perceptrons/mlp-implementation.md
# 3. chapter_linear-classification/softmax-regression-concise.md
# 4. chapter_linear-classification/softmax-regression-scratch.md
export MXNET_CPU_WORKER_NTHREADS=4
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab mxnet || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload mxnet build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
35 changes: 35 additions & 0 deletions .github/workflow_scripts/build_pytorch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# Evaluate the PyTorch notebooks (the default tab) and build slides, with an
# S3-backed build cache. The eval step is allowed to fail so partial results
# are still cached; the script then exits non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the PyTorch eval and slides caches from S3
echo "Retrieving pytorch build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval _build/eval --delete --quiet --exclude 'data/*'
echo "Retrieving pytorch slides cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/slides _build/slides --delete --quiet --exclude 'data/*'

# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval || ((ss=1))
d2lbook build slides --tab pytorch

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload pytorch build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Exit with a non-zero status if evaluation failed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
33 changes: 33 additions & 0 deletions .github/workflow_scripts/build_tf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

# Evaluate the TensorFlow notebooks with an S3-backed build cache. The eval
# step is allowed to fail so partial results are still cached; the script
# then exits non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the TensorFlow eval cache from S3
echo "Retrieving tensorflow build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_tensorflow/ _build/eval_tensorflow/ --delete --quiet --exclude 'data/*'

export TF_CPP_MIN_LOG_LEVEL=3
export TF_FORCE_GPU_ALLOW_GROWTH=true
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab tensorflow || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload tensorflow build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
18 changes: 18 additions & 0 deletions .github/workflow_scripts/setup_git.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Configure git and SSH so CI can push as the "d2l-bot" GitHub account.
# The deploy key is fetched from AWS Secrets Manager at runtime and is kept
# out of the build log by disabling command echoing first.
function setup_git {
    # Turn off logging so the secret never appears in the CI log
    set +x
    mkdir -p "$HOME/.ssh"
    # SSH refuses keys/config in a world-accessible directory
    chmod 700 "$HOME/.ssh"
    # Pre-trust github.com's host key so the first push is non-interactive.
    # (ssh-keyscan does not read stdin, so no `echo yes |` pipe is needed.)
    ssh-keyscan -t rsa github.com >> "$HOME/.ssh/known_hosts"

    # Retrieve the SSH key securely from AWS Secrets Manager
    GIT_SSH_KEY=$(aws secretsmanager get-secret-value --secret-id d2l_bot_github --query SecretString --output text --region us-west-2)

    # Write the SSH key to a file readable only by the current user
    echo "$GIT_SSH_KEY" > "$HOME/.ssh/id_rsa"
    chmod 600 "$HOME/.ssh/id_rsa"

    git config --global user.name "d2l-bot"
    git config --global user.email "100248899+d2l-bot@users.noreply.github.com"

    echo "Successfully Configured Bot"
}
Loading

0 comments on commit b21c4d3

Please sign in to comment.