Skip to content

Commit

Permalink
CI: D2L Infra 2.0 with Github Actions #2481 [skip ci] (#2481)
Browse files Browse the repository at this point in the history
* MXNet: Fix pandas 2.0 needs kwarg for sep arg

* PyTorch: Refactor deprecated torch.eig -> torch.linalg.eig

* PyTorch: torch>2.0 deprecations force using indexing arg in torch.meshgrid

* Bug: Pandas 2.0 errors out for all inputs.values returning object array

* CI: Add container entrypoint d2l_job.sh script

* CI: Add Docker Images & Docker Build Workflow

* CI: Add actions, workflows and workflow scripts

* CI: Remove Jenkins

* remove torch specific packages as hard requirements

* CI: Streamline git setup

* Fix modified files timesync based caching on PRs

* CI: Double down on security by splitting push and release permissions
  • Loading branch information
AnirudhDagar authored May 5, 2023
1 parent 3864d18 commit b21c4d3
Show file tree
Hide file tree
Showing 28 changed files with 1,285 additions and 119 deletions.
59 changes: 59 additions & 0 deletions .github/actions/setup_env_vars/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
name: "Setup Env Vars"
# Composite action that derives common CI variables (SHORT_SHA, PR_NUMBER,
# TARGET_BRANCH, REPO_NAME, LANG, TASK, JOB_NAME) and exports them via
# $GITHUB_ENV for later workflow steps.

runs:
  using: "composite"
  steps:
    - name: Get Commit SHA (For Push Event)
      if: ${{ github.event_name == 'push' }}
      shell: bash
      # Context values are passed through `env` so the shell script only
      # expands plain environment variables (script-injection hardening).
      env:
        SHA: ${{ github.sha }}
        git_repo_full: ${{ github.repository }}
        pr_number: ""
      run: |
        short_sha=$(git rev-parse --short "$SHA")
        echo "SHORT_SHA=$short_sha" >> $GITHUB_ENV
        echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
        # For a push event the target branch is the pushed ref's last segment
        target_branch=${GITHUB_REF##*/}
        echo "TARGET_BRANCH=$target_branch" >> $GITHUB_ENV
        repo_name=${git_repo_full##*/}
        echo "REPO_NAME=$repo_name" >> $GITHUB_ENV
        # Repo names look like "d2l-<lang>" (e.g. "d2l-en" -> LANG=en)
        IFS='-' read -ra name_parts <<< "$repo_name"
        echo "LANG=${name_parts[1]}" >> $GITHUB_ENV
        task="${repo_name}-${target_branch}"
        echo "TASK=$task" >> $GITHUB_ENV
        job_name=${repo_name}/${target_branch}
        echo "JOB_NAME=$job_name" >> $GITHUB_ENV
    - name: Get Commit SHA (For Pull Request)
      if: ${{ github.event_name == 'pull_request_target' }}
      shell: bash
      env:
        SHA: ${{ github.event.pull_request.head.sha }}
        target_branch: ${{ github.event.pull_request.base.ref }}
        git_repo_full: ${{ github.event.pull_request.base.repo.full_name }}
        pr_number: PR-${{ github.event.number }}
      run: |
        short_sha=$(git rev-parse --short "$SHA")
        echo "SHORT_SHA=$short_sha" >> $GITHUB_ENV
        echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV
        echo "TARGET_BRANCH=$target_branch" >> $GITHUB_ENV
        repo_name=${git_repo_full##*/}
        echo "REPO_NAME=$repo_name" >> $GITHUB_ENV
        IFS='-' read -ra name_parts <<< "$repo_name"
        echo "LANG=${name_parts[1]}" >> $GITHUB_ENV
        task="${repo_name}-${target_branch}"
        echo "TASK=$task" >> $GITHUB_ENV
        # Reuse $pr_number ("PR-<number>", defined in env above) instead of
        # re-interpolating the GitHub expression inside the script.
        job_name=${repo_name}/${pr_number}/${short_sha}
        echo "JOB_NAME=$job_name" >> $GITHUB_ENV
91 changes: 91 additions & 0 deletions .github/actions/submit-job/action.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
name: "Submit Job to AWS Batch"
# Composite action that submits a CI job to AWS Batch via ci/submit-job.py.
# Job-def permission level is selected by suffixing the job type
# ("-push" / "-release") and, for PRs, by whether the actor has write access.
inputs:
  job-type:
    required: true
  job-name:
    required: true
  work-dir:
    required: false
    default: .
  command:
    required: true

runs:
  using: "composite"
  steps:
    - name: Configure AWS Credentials
      uses: aws-actions/configure-aws-credentials@v2
      with:
        role-to-assume: arn:aws:iam::650140442593:role/D2L_CI_Batch
        role-duration-seconds: 14400  # this requires changing max session duration to 4hrs in AWS Console for D2L_CI_Batch
        aws-region: us-west-2

    - name: Install dependencies
      shell: bash
      run: |
        pip install boto3
    # Determine whether the triggering actor has write permission; used below
    # to decide if a PR's scripts may run with the --safe-to-use-script flag.
    - name: Check for Actor Permissions
      id: check
      continue-on-error: true
      uses: prince-chrismc/check-actor-permissions-action@v2
      with:
        github_token: ${{ github.token }}
        permission: write

    # NOTE: inputs (notably `command`) are passed via `env` rather than being
    # interpolated directly into the run script — direct interpolation is a
    # script-injection vector, especially under pull_request_target.
    - name: Submit Job (For Push on development branches)
      if: ${{ github.event_name == 'push' && github.ref != 'refs/heads/release' && github.ref != 'refs/heads/classic' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
      run: |
        echo "Start submitting job for a Push Event on a Development Branch"
        # Add "-push" for all these jobs to use elevated push level job-def permissions
        python ./ci/submit-job.py --job-type "${JOB_TYPE}-push" \
          --name "${JOB_NAME}-${GITHUB_REF}" \
          --source-ref "${GITHUB_REF}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${GITHUB_REPOSITORY}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    - name: Submit Job (For Push on Release/Classic)
      if: ${{ github.event_name == 'push' && (github.ref == 'refs/heads/release' || github.ref == 'refs/heads/classic') }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
      run: |
        echo "Start submitting job for a Push Event on Release/Classic Branch"
        # Add "-release" for all these jobs to use elevated release level job-def permissions
        python ./ci/submit-job.py --job-type "${JOB_TYPE}-release" \
          --name "${JOB_NAME}-${GITHUB_REF}" \
          --source-ref "${GITHUB_REF}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${GITHUB_REPOSITORY}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    - name: Submit Job (For Pull Request Safe Scripts)
      if: ${{ github.event_name == 'pull_request_target' && steps.check.outputs.permitted == 'true' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
        PR_NUM: ${{ github.event.number }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
      run: |
        echo "Start submitting job"
        python ./ci/submit-job.py --job-type "${JOB_TYPE}" \
          --name "${JOB_NAME}-PR#${PR_NUM}" \
          --source-ref "${HEAD_SHA}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${HEAD_REPO}" \
          --command "${COMMAND}" \
          --safe-to-use-script \
          --wait
    # Same as above, but without --safe-to-use-script: the PR author lacks
    # write permission, so their scripts run with reduced trust.
    - name: Submit Job (For Pull Request Not Safe Scripts)
      if: ${{ github.event_name == 'pull_request_target' && steps.check.outputs.permitted != 'true' }}
      shell: bash
      env:
        JOB_TYPE: ${{ inputs.job-type }}
        JOB_NAME: ${{ inputs.job-name }}
        WORK_DIR: ${{ inputs.work-dir }}
        COMMAND: ${{ inputs.command }}
        PR_NUM: ${{ github.event.number }}
        HEAD_SHA: ${{ github.event.pull_request.head.sha }}
        HEAD_REPO: ${{ github.event.pull_request.head.repo.full_name }}
      run: |
        echo "Start submitting job"
        python ./ci/submit-job.py --job-type "${JOB_TYPE}" \
          --name "${JOB_NAME}-PR#${PR_NUM}" \
          --source-ref "${HEAD_SHA}" \
          --work-dir "${WORK_DIR}" \
          --remote "https://github.com/${HEAD_REPO}" \
          --command "${COMMAND}" \
          --wait
47 changes: 47 additions & 0 deletions .github/workflow_scripts/build_and_deploy.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/bin/bash

# Build the D2L website (HTML + PDFs) and deploy it, using an S3-backed
# build cache keyed on "<repo>-<branch>". Release/classic jobs deploy to
# the public buckets; everything else goes to the preview bucket.

set -ex

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master
JOB_NAME="$3"      # Eg. 'd2l-en/master' or 'd2l-en/PR-2453/21be1a4'
LANG="$4"          # Eg. 'en','zh' etc.

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Restore the build cache; variables are quoted so repo/branch/job names
# cannot be word-split or glob-expanded
aws s3 sync "s3://preview.d2l.ai/ci_cache/${REPO_NAME}-${TARGET_BRANCH}/_build" _build --delete --quiet --exclude 'eval*/data/*'

# Build D2L Website
./.github/workflow_scripts/build_html.sh "$TARGET_BRANCH" "$JOB_NAME"

# Build PDFs
d2lbook build pdf
d2lbook build pdf --tab mxnet


# Check if the JOB_NAME is either "$REPO_NAME/release" or "$REPO_NAME/classic"
if [[ "$JOB_NAME" == "$REPO_NAME/release" || "$JOB_NAME" == "$REPO_NAME/classic" ]]; then

    # Setup D2L Bot so the release deployment can push as d2l-bot
    source "$(dirname "$0")/setup_git.sh"
    setup_git

    # Run d2lbook release deployment
    if [[ "$JOB_NAME" == *"/classic" ]]; then
        # Use classic s3 bucket for classic release
        LANG="classic"
    fi
    d2lbook build pkg
    d2lbook deploy html pdf pkg colab sagemaker slides --s3 "s3://${LANG}.d2l.ai/"

else
    # Run d2lbook preview deployment
    d2lbook deploy html pdf --s3 "s3://preview.d2l.ai/${JOB_NAME}/"
fi

# Store the (possibly updated) build cache back to S3
aws s3 sync _build "s3://preview.d2l.ai/ci_cache/${REPO_NAME}-${TARGET_BRANCH}/_build" --acl public-read --quiet --exclude 'eval*/data/*'
File renamed without changes.
34 changes: 34 additions & 0 deletions .github/workflow_scripts/build_jax.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#!/bin/bash

# Evaluate the JAX notebooks with an S3-backed build cache. The eval step is
# allowed to fail so partial results are still cached; the script then exits
# non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the JAX eval cache from S3
echo "Retrieving jax build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_jax/ _build/eval_jax/ --delete --quiet --exclude 'data/*'

export XLA_PYTHON_CLIENT_MEM_FRACTION=.70
export TF_CPP_MIN_LOG_LEVEL=3
export TF_FORCE_GPU_ALLOW_GROWTH=true
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab jax || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload jax build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
38 changes: 38 additions & 0 deletions .github/workflow_scripts/build_mxnet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#!/bin/bash

# Evaluate the MXNet notebooks with an S3-backed build cache. The eval step is
# allowed to fail so partial results are still cached; the script then exits
# non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the MXNet eval cache from S3
echo "Retrieving mxnet build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_mxnet/ _build/eval_mxnet/ --delete --quiet --exclude 'data/*'

# MXNet training for the following notebooks is slow in the container;
# Setting NTHREADS=4 below seems to fix the issue:
# 1. chapter_multilayer-perceptrons/dropout.md
# 2. chapter_multilayer-perceptrons/mlp-implementation.md
# 3. chapter_linear-classification/softmax-regression-concise.md
# 4. chapter_linear-classification/softmax-regression-scratch.md
export MXNET_CPU_WORKER_NTHREADS=4
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab mxnet || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload mxnet build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
35 changes: 35 additions & 0 deletions .github/workflow_scripts/build_pytorch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/bin/bash

# Evaluate the PyTorch notebooks (the default tab) and build slides, with an
# S3-backed build cache. The eval step is allowed to fail so partial results
# are still cached; the script then exits non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the PyTorch eval and slides caches from S3
echo "Retrieving pytorch build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval _build/eval --delete --quiet --exclude 'data/*'
echo "Retrieving pytorch slides cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/slides _build/slides --delete --quiet --exclude 'data/*'

# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval || ((ss=1))
d2lbook build slides --tab pytorch

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload pytorch build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Exit with a non-zero status if evaluation failed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
33 changes: 33 additions & 0 deletions .github/workflow_scripts/build_tf.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash

# Evaluate the TensorFlow notebooks with an S3-backed build cache. The eval
# step is allowed to fail so partial results are still cached; the script
# then exits non-zero if evaluation failed.

set -ex

# Used to capture status exit of build eval command
ss=0

REPO_NAME="$1"     # Eg. 'd2l-en'
TARGET_BRANCH="$2" # Eg. 'master' ; if PR raised to master

pip3 install .
# -p: do not fail under `set -e` when the directory already exists (reruns)
mkdir -p _build

# Sanity checks on notebook outputs and tab annotations
d2lbook build outputcheck tabcheck

# Restore the TensorFlow eval cache from S3
echo "Retrieving tensorflow build cache"
aws s3 sync s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build/eval_tensorflow/ _build/eval_tensorflow/ --delete --quiet --exclude 'data/*'

export TF_CPP_MIN_LOG_LEVEL=3
export TF_FORCE_GPU_ALLOW_GROWTH=true
# Continue the script even if some notebooks in build fail to
# make sure that cache is copied to s3 for the successful notebooks
d2lbook build eval --tab tensorflow || ((ss=1))

# Store the updated cache back to S3 (runs even if eval failed above)
echo "Upload tensorflow build cache to s3"
aws s3 sync _build s3://preview.d2l.ai/ci_cache/"$REPO_NAME"-"$TARGET_BRANCH"/_build --acl public-read --quiet

# Propagate the eval failure after the cache upload completed
if [ "$ss" -ne 0 ]; then
    exit 1
fi
18 changes: 18 additions & 0 deletions .github/workflow_scripts/setup_git.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Configure git and SSH so CI can push as the "d2l-bot" GitHub account.
# The deploy key is fetched from AWS Secrets Manager at runtime and is kept
# out of the build log by disabling command echoing first.
function setup_git {
    # Turn off logging so the secret never appears in the CI log
    set +x
    mkdir -p "$HOME/.ssh"
    # SSH refuses keys/config in a world-accessible directory
    chmod 700 "$HOME/.ssh"
    # Pre-trust github.com's host key so the first push is non-interactive.
    # (ssh-keyscan does not read stdin, so no `echo yes |` pipe is needed.)
    ssh-keyscan -t rsa github.com >> "$HOME/.ssh/known_hosts"

    # Retrieve the SSH key securely from AWS Secrets Manager
    GIT_SSH_KEY=$(aws secretsmanager get-secret-value --secret-id d2l_bot_github --query SecretString --output text --region us-west-2)

    # Write the SSH key to a file readable only by the current user
    echo "$GIT_SSH_KEY" > "$HOME/.ssh/id_rsa"
    chmod 600 "$HOME/.ssh/id_rsa"

    git config --global user.name "d2l-bot"
    git config --global user.email "100248899+d2l-bot@users.noreply.github.com"

    echo "Successfully Configured Bot"
}
Loading

0 comments on commit b21c4d3

Please sign in to comment.