Update
[ghstack-poisoned]
guangyey committed Jan 13, 2025
2 parents 8854448 + 954570a commit 27d3915
Showing 294 changed files with 2,658 additions and 1,647 deletions.
3 changes: 3 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,6 +3,9 @@ set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# CUDA Arm build is for Grace Hopper only
export TORCH_CUDA_ARCH_LIST="9.0"

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

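
A quick way to confirm what a finished wheel was actually compiled for is torch.cuda.get_arch_list(); with the pin above, an aarch64 CUDA wheel should report only sm_90. A minimal sketch, assuming the wheel is installed on a machine where CUDA is available (the call returns an empty list otherwise; the wheel path is illustrative):

pip install dist/torch-*.whl
python -c "import torch; print(torch.cuda.get_arch_list())"   # expected to print ['sm_90']
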
2 changes: 1 addition & 1 deletion .ci/docker/common/install_acl.sh
@@ -1,7 +1,7 @@
set -euo pipefail

readonly version=v24.04
readonly src_host=https://review.mlplatform.org/ml
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary

# Clone ACL
2 changes: 1 addition & 1 deletion .ci/docker/requirements-ci.txt
@@ -304,7 +304,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:

z3-solver==4.12.2.0
z3-solver==4.12.6.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
14 changes: 1 addition & 13 deletions .ci/manywheel/build_cuda.sh
@@ -53,22 +53,10 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.6)
if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
fi
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.4)
if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
fi
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.1)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
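
With the aarch64 special case removed here (it now lives in aarch64_ci_build.sh above), the CUDA 12.6 case simply appends 9.0+PTX to the common list, so an x86_64 12.6 build ends up with the following value (a sketch derived from the lines shown above):

TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0+PTX"
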
2 changes: 1 addition & 1 deletion .ci/pytorch/build.sh
@@ -228,7 +228,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi

# Do not change workspace permissions for ROCm CI jobs
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
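
The workaround the comment refers to follows a save-and-restore pattern: record the workspace's original owner, hand the directory to the jenkins user for the duration of the job, and restore ownership when the script exits so a cancelled job does not leave bad permissions behind. A minimal sketch of that pattern (the path and user come from the snippet above; the trap wiring is illustrative, not the workflow's exact code):

WORKSPACE=/var/lib/jenkins/workspace
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "${WORKSPACE}")       # remember the original uid
cleanup_workspace() {
  sudo chown -R "${WORKSPACE_ORIGINAL_OWNER_ID}" "${WORKSPACE}"  # give the files back on exit
}
trap cleanup_workspace EXIT
sudo chown -R jenkins "${WORKSPACE}"                             # let the jenkins user write during the job
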
25 changes: 21 additions & 4 deletions .ci/pytorch/test.sh
@@ -12,9 +12,9 @@ export TERM=vt100
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

# Do not change workspace permissions for ROCm CI jobs
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && -d /var/lib/jenkins/workspace ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
@@ -86,6 +86,13 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export VALGRIND=OFF
fi


if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
# There are additional warnings on s390x, maybe due to newer gcc.
# Skip this check for now
export VALGRIND=OFF
fi

if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]] || [[ "${CONTINUE_THROUGH_ERROR}" == "1" ]]; then
# When rerunning disabled tests, do not generate core dumps as they could consume
# the runner disk space when crashed tests are run multiple times. Running out
@@ -910,10 +917,20 @@ test_libtorch_api() {
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr

# On s390x, pytorch is built without llvm.
# Even if it were built with LLVM, LLVM currently doesn't support the features used on s390x, and the
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi

if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* ]]; then
# quantization is not fully supported on s390x yet
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* && "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
# NB: This test is not under TORCH_BIN_DIR but under BUILD_BIN_DIR
export CPP_TESTS_DIR="${BUILD_BIN_DIR}"
python test/run_test.py --cpp --verbose -i cpp/static_runtime_test
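
Before running cpp/test_tensorexpr locally, a quick check of whether the build has the LLVM-backed codegen at all is the helper the TensorExpr Python tests gate on; this is a sketch and assumes that private helper, torch._C._llvm_enabled(), is present in the build:

python -c "import torch; print(torch._C._llvm_enabled())"   # False on builds without LLVM, e.g. s390x
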
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,7 +5,7 @@ body:
- type: markdown
attributes:
value: >
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). Note: Please write your bug report in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 🐛 Describe the bug
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/documentation.yml
@@ -2,6 +2,10 @@ name: 📚 Documentation
description: Report an issue related to https://pytorch.org/docs/stable/index.html

body:
- type: markdown
attributes:
value: >
#### Note: Please report your documentation issue in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 📚 The doc issue
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -2,6 +2,10 @@ name: 🚀 Feature request
description: Submit a proposal/request for a new PyTorch feature

body:
- type: markdown
attributes:
value: >
#### Note: Please write your feature request in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 🚀 The feature, motivation and pitch
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/pt2-bug-report.yml
@@ -3,6 +3,10 @@ description: Create a report to help us reproduce and fix the bug
labels: ["oncall: pt2"]

body:
- type: markdown
attributes:
value: >
#### Note: Please write your bug report in English to ensure it can be understood and addressed by the development team.
- type: markdown
attributes:
value: >
41 changes: 36 additions & 5 deletions .github/workflows/_linux-test.yml
@@ -81,7 +81,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'gcp.a100') }}
if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@@ -95,9 +95,10 @@

- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'

- name: configure aws credentials
if : ${{ inputs.aws-role-to-assume != '' }}
if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
@@ -107,11 +108,13 @@
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image-name: ${{ inputs.docker-image }}

- name: Use following to pull public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
@@ -121,6 +124,7 @@
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@@ -166,6 +170,7 @@
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}

- name: Download TD artifacts
continue-on-error: true
@@ -262,9 +267,21 @@
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
SHM_OPTS=
JENKINS_USER=
# some setup steps are skipped on s390x; run the ones that are still needed here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# Used for GPU_FLAG, SHM_OPTS and JENKINS_USER since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
@@ -316,18 +333,23 @@
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
--shm-size="${SHM_SIZE}" \
${SHM_OPTS} \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
docker exec -t "${container_name}" sh -c "python3 -m pip install -r .ci/docker/requirements-ci.txt"
fi
docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Upload pytest cache if tests failed
@@ -467,3 +489,12 @@
echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
.github/scripts/stop_runner_service.sh
fi
- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x, stop the containers so the worker shuts down cleanly
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
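
The SHM_OPTS/JENKINS_USER handling above relies on a common shell pattern: optional docker run flags are kept in plain variables and expanded unquoted, so an empty value contributes no argument at all. A standalone sketch of the pattern (the image and --shm-size value are illustrative):

if [[ "${BUILD_ENVIRONMENT}" == *s390x* ]]; then
  SHM_OPTS=                   # no extra shm flag on s390x
  JENKINS_USER=               # run as the container's default user
else
  SHM_OPTS="--shm-size=2g"
  JENKINS_USER="--user jenkins"
fi
# shellcheck disable=SC2086   # word splitting of the option variables is intentional
docker run --rm ${SHM_OPTS} ${JENKINS_USER} ubuntu:22.04 id
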
1 change: 1 addition & 0 deletions .github/workflows/_mac-test-mps.yml
@@ -152,6 +152,7 @@
set -e
${CONDA_RUN} python3 test/run_test.py --mps --verbose
MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
- name: Print remaining test logs
shell: bash
77 changes: 77 additions & 0 deletions .github/workflows/s390x-periodic.yml
@@ -0,0 +1,77 @@
name: s390x-periodic

on:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/s390/*
branches:
- release/*
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions: read-all

jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read

target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read

linux-manylinux-2_28-py3-cpu-s390x-build:
if: github.repository_owner == 'pytorch'
name: linux-manylinux-2_28-py3-cpu-s390x
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
runner: linux.s390x
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 2, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 3, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 4, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 5, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 6, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 7, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 8, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 9, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 10, num_shards: 10, runner: "linux.s390x" },
]}
secrets: inherit

linux-manylinux-2_28-py3-cpu-s390x-test:
permissions:
id-token: write
contents: read
name: linux-manylinux-2_28-py3-cpu-s390x
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-manylinux-2_28-py3-cpu-s390x-build
- target-determination
with:
build-environment: linux-s390x-binary-manywheel
docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main
test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }}
timeout-minutes: 480
use-gha: "yes"
secrets: inherit
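
Besides the cron schedule, the on: block above means the workflow can also be started by hand or by pushing a matching ciflow tag. A sketch of both triggers (the ref and the tag suffix are illustrative):

gh workflow run s390x-periodic.yml --ref main                      # manual workflow_dispatch run
git tag ciflow/s390/12345 && git push origin ciflow/s390/12345     # tag push trigger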