Update
[ghstack-poisoned]
guangyey committed Jan 13, 2025
2 parents 8854448 + 954570a commit 27d3915
Showing 294 changed files with 2,658 additions and 1,647 deletions.
3 changes: 3 additions & 0 deletions .ci/aarch64_linux/aarch64_ci_build.sh
@@ -3,6 +3,9 @@ set -eux -o pipefail

GPU_ARCH_VERSION=${GPU_ARCH_VERSION:-}

# CUDA Arm build is for Grace Hopper only
export TORCH_CUDA_ARCH_LIST="9.0"

SCRIPTPATH="$( cd -- "$(dirname "$0")" >/dev/null 2>&1 ; pwd -P )"
source $SCRIPTPATH/aarch64_ci_setup.sh

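
A quick way to confirm what a finished wheel was actually compiled for is torch.cuda.get_arch_list(); with the pin above, an aarch64 CUDA wheel should report only sm_90. A minimal sketch, assuming the wheel is installed on a machine where CUDA is available (the call returns an empty list otherwise; the wheel path is illustrative):

pip install dist/torch-*.whl
python -c "import torch; print(torch.cuda.get_arch_list())"   # expected to print ['sm_90']
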
2 changes: 1 addition & 1 deletion .ci/docker/common/install_acl.sh
@@ -1,7 +1,7 @@
set -euo pipefail

readonly version=v24.04
readonly src_host=https://review.mlplatform.org/ml
readonly src_host=https://github.com/ARM-software
readonly src_repo=ComputeLibrary

# Clone ACL
2 changes: 1 addition & 1 deletion .ci/docker/requirements-ci.txt
@@ -304,7 +304,7 @@ pytest-cpp==2.3.0
#Pinned versions: 2.3.0
#test that import:

z3-solver==4.12.2.0
z3-solver==4.12.6.0
#Description: The Z3 Theorem Prover Project
#Pinned versions:
#test that import:
14 changes: 1 addition & 13 deletions .ci/manywheel/build_cuda.sh
@@ -53,22 +53,10 @@ cuda_version_nodot=$(echo $CUDA_VERSION | tr -d '.')
TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6"
case ${CUDA_VERSION} in
12.6)
if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
fi
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0+PTX"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.4)
if [[ "$GPU_ARCH_TYPE" = "cuda-aarch64" ]]; then
TORCH_CUDA_ARCH_LIST="9.0"
else
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
fi
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
12.1)
TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
EXTRA_CAFFE2_CMAKE_FLAGS+=("-DATEN_NO_TEST=ON")
;;
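
With the aarch64 special case removed here (it now lives in aarch64_ci_build.sh above), the CUDA 12.6 case simply appends 9.0+PTX to the common list, so an x86_64 12.6 build ends up with the following value (a sketch derived from the lines shown above):

TORCH_CUDA_ARCH_LIST="5.0;6.0;7.0;7.5;8.0;8.6;9.0+PTX"
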
2 changes: 1 addition & 1 deletion .ci/pytorch/build.sh
@@ -228,7 +228,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-debug* ]]; then
export CMAKE_BUILD_TYPE=RelWithAssert
fi

# Do not change workspace permissions for ROCm CI jobs
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
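
The workaround the comment refers to follows a save-and-restore pattern: record the workspace's original owner, hand the directory to the jenkins user for the duration of the job, and restore ownership when the script exits so a cancelled job does not leave bad permissions behind. A minimal sketch of that pattern (the path and user come from the snippet above; the trap wiring is illustrative, not the workflow's exact code):

WORKSPACE=/var/lib/jenkins/workspace
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "${WORKSPACE}")       # remember the original uid
cleanup_workspace() {
  sudo chown -R "${WORKSPACE_ORIGINAL_OWNER_ID}" "${WORKSPACE}"  # give the files back on exit
}
trap cleanup_workspace EXIT
sudo chown -R jenkins "${WORKSPACE}"                             # let the jenkins user write during the job
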
25 changes: 21 additions & 4 deletions .ci/pytorch/test.sh
@@ -12,9 +12,9 @@ export TERM=vt100
# shellcheck source=./common.sh
source "$(dirname "${BASH_SOURCE[0]}")/common.sh"

# Do not change workspace permissions for ROCm CI jobs
# Do not change workspace permissions for ROCm and s390x CI jobs
# as it can leave workspace with bad permissions for cancelled jobs
if [[ "$BUILD_ENVIRONMENT" != *rocm* && -d /var/lib/jenkins/workspace ]]; then
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *s390x* && -d /var/lib/jenkins/workspace ]]; then
# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96)
WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace")
cleanup_workspace() {
@@ -86,6 +86,13 @@ if [[ "$BUILD_ENVIRONMENT" == *clang9* || "$BUILD_ENVIRONMENT" == *xpu* ]]; then
export VALGRIND=OFF
fi


if [[ "$BUILD_ENVIRONMENT" == *s390x* ]]; then
# There are additional warnings on s390x, maybe due to newer gcc.
# Skip this check for now
export VALGRIND=OFF
fi

if [[ "${PYTORCH_TEST_RERUN_DISABLED_TESTS}" == "1" ]] || [[ "${CONTINUE_THROUGH_ERROR}" == "1" ]]; then
# When rerunning disabled tests, do not generate core dumps as they could consume
# the runner disk space when crashed tests are run multiple times. Running out
@@ -910,10 +917,20 @@ test_libtorch_api() {
else
# Exclude IMethodTest that relies on torch::deploy, which will instead be ran in test_deploy
OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="${MNIST_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_api -k "not IMethodTest"
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr

# On s390x, pytorch is built without llvm.
# Even if it were built with LLVM, LLVM currently doesn't support the features used on s390x, and the
# test fails with errors like:
# JIT session error: Unsupported target machine architecture in ELF object pytorch-jitted-objectbuffer
# unknown file: Failure
# C++ exception with description "valOrErr INTERNAL ASSERT FAILED at "/var/lib/jenkins/workspace/torch/csrc/jit/tensorexpr/llvm_jit.h":34, please report a bug to PyTorch. Unexpected failure in LLVM JIT: Failed to materialize symbols: { (main, { func }) }
if [[ "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
python test/run_test.py --cpp --verbose -i cpp/test_tensorexpr
fi
fi

if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* ]]; then
# quantization is not fully supported on s390x yet
if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* && "${BUILD_ENVIRONMENT}" != *asan* && "${BUILD_ENVIRONMENT}" != *s390x* ]]; then
# NB: This test is not under TORCH_BIN_DIR but under BUILD_BIN_DIR
export CPP_TESTS_DIR="${BUILD_BIN_DIR}"
python test/run_test.py --cpp --verbose -i cpp/static_runtime_test
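
Before running cpp/test_tensorexpr locally, a quick check of whether the build has the LLVM-backed codegen at all is the helper the TensorExpr Python tests gate on; this is a sketch and assumes that private helper, torch._C._llvm_enabled(), is present in the build:

python -c "import torch; print(torch._C._llvm_enabled())"   # False on builds without LLVM, e.g. s390x
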
2 changes: 1 addition & 1 deletion .github/ISSUE_TEMPLATE/bug-report.yml
@@ -5,7 +5,7 @@ body:
- type: markdown
attributes:
value: >
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+).
#### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues?q=is%3Aissue+sort%3Acreated-desc+). Note: Please write your bug report in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 🐛 Describe the bug
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/documentation.yml
@@ -2,6 +2,10 @@ name: 📚 Documentation
description: Report an issue related to https://pytorch.org/docs/stable/index.html

body:
- type: markdown
attributes:
value: >
#### Note: Please report your documentation issue in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 📚 The doc issue
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/feature-request.yml
@@ -2,6 +2,10 @@ name: 🚀 Feature request
description: Submit a proposal/request for a new PyTorch feature

body:
- type: markdown
attributes:
value: >
#### Note: Please write your feature request in English to ensure it can be understood and addressed by the development team.
- type: textarea
attributes:
label: 🚀 The feature, motivation and pitch
4 changes: 4 additions & 0 deletions .github/ISSUE_TEMPLATE/pt2-bug-report.yml
@@ -3,6 +3,10 @@ description: Create a report to help us reproduce and fix the bug
labels: ["oncall: pt2"]

body:
- type: markdown
attributes:
value: >
#### Note: Please write your bug report in English to ensure it can be understood and addressed by the development team.
- type: markdown
attributes:
value: >
41 changes: 36 additions & 5 deletions .github/workflows/_linux-test.yml
@@ -81,7 +81,7 @@ jobs:
steps:
- name: Setup SSH (Click me for login details)
uses: pytorch/test-infra/.github/actions/setup-ssh@main
if: ${{ !contains(matrix.runner, 'gcp.a100') }}
if: ${{ !contains(matrix.runner, 'gcp.a100') && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
with:
github-secret: ${{ secrets.GITHUB_TOKEN }}
instructions: |
@@ -95,9 +95,10 @@

- name: Setup Linux
uses: ./.github/actions/setup-linux
if: inputs.build-environment != 'linux-s390x-binary-manywheel'

- name: configure aws credentials
if : ${{ inputs.aws-role-to-assume != '' }}
if : ${{ inputs.aws-role-to-assume != '' && inputs.build-environment != 'linux-s390x-binary-manywheel' }}
uses: aws-actions/configure-aws-credentials@v3
with:
role-to-assume: ${{ inputs.aws-role-to-assume }}
@@ -107,11 +108,13 @@
- name: Calculate docker image
id: calculate-docker-image
uses: pytorch/test-infra/.github/actions/calculate-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image-name: ${{ inputs.docker-image }}

- name: Use following to pull public copy of the image
id: print-ghcr-mirror
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
env:
ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }}
shell: bash
@@ -121,6 +124,7 @@
- name: Pull docker image
uses: pytorch/test-infra/.github/actions/pull-docker-image@main
if: inputs.build-environment != 'linux-s390x-binary-manywheel'
with:
docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }}

@@ -166,6 +170,7 @@
with:
name: ${{ inputs.build-environment }}
s3-bucket: ${{ inputs.s3-bucket }}
use-gha: ${{ inputs.use-gha }}

- name: Download TD artifacts
continue-on-error: true
@@ -262,9 +267,21 @@
# comes from https://github.com/pytorch/test-infra/pull/6058
TOTAL_MEMORY_WITH_SWAP=$(("${TOTAL_AVAILABLE_MEMORY_IN_GB%.*}" + 3))
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
SHM_OPTS=
JENKINS_USER=
# some setup steps are skipped on s390x; run the ones that are still needed here
env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"
else
SHM_OPTS="--shm-size=${SHM_SIZE}"
JENKINS_USER="--user jenkins"
fi
# detached container should get cleaned up by teardown_ec2_linux
# TODO: Stop building test binaries as part of the build phase
# Used for GPU_FLAG since that doesn't play nice
# Used for GPU_FLAG, SHM_OPTS and JENKINS_USER since that doesn't play nice
# shellcheck disable=SC2086,SC2090
container_name=$(docker run \
${GPU_FLAG:-} \
@@ -316,18 +333,23 @@
--security-opt seccomp=unconfined \
--cap-add=SYS_PTRACE \
--ipc=host \
--shm-size="${SHM_SIZE}" \
${SHM_OPTS} \
--tty \
--detach \
--name="${container_name}" \
--user jenkins \
${JENKINS_USER} \
-v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \
-w /var/lib/jenkins/workspace \
"${DOCKER_IMAGE}"
)
# Propagate download.pytorch.org IP to container
grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts"
echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}"
if [[ ${BUILD_ENVIRONMENT} == *"s390x"* ]]; then
docker exec -t "${container_name}" sh -c "python3 -m pip install -r .ci/docker/requirements-ci.txt"
fi
docker exec -t "${container_name}" sh -c "python3 -m pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}"
- name: Upload pytest cache if tests failed
@@ -467,3 +489,12 @@
echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..."
.github/scripts/stop_runner_service.sh
fi
- name: Cleanup docker
if: always() && inputs.build-environment == 'linux-s390x-binary-manywheel'
shell: bash
run: |
# on s390x, stop the containers so the worker shuts down cleanly
# ignore expansion of "docker ps -q" since it could be empty
# shellcheck disable=SC2046
docker stop $(docker ps -q) || true
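
The SHM_OPTS/JENKINS_USER handling above relies on a common shell pattern: optional docker run flags are kept in plain variables and expanded unquoted, so an empty value contributes no argument at all. A standalone sketch of the pattern (the image and --shm-size value are illustrative):

if [[ "${BUILD_ENVIRONMENT}" == *s390x* ]]; then
  SHM_OPTS=                   # no extra shm flag on s390x
  JENKINS_USER=               # run as the container's default user
else
  SHM_OPTS="--shm-size=2g"
  JENKINS_USER="--user jenkins"
fi
# shellcheck disable=SC2086   # word splitting of the option variables is intentional
docker run --rm ${SHM_OPTS} ${JENKINS_USER} ubuntu:22.04 id
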
1 change: 1 addition & 0 deletions .github/workflows/_mac-test-mps.yml
@@ -152,6 +152,7 @@
set -e
${CONDA_RUN} python3 test/run_test.py --mps --verbose
MTL_CAPTURE_ENABLED=1 ${CONDA_RUN} python3 test/test_mps.py --verbose -k test_metal_capture
- name: Print remaining test logs
shell: bash
77 changes: 77 additions & 0 deletions .github/workflows/s390x-periodic.yml
@@ -0,0 +1,77 @@
name: s390x-periodic

on:
schedule:
# We have several schedules so jobs can check github.event.schedule to activate only for a fraction of the runs.
# Also run less frequently on weekends.
- cron: 29 8 * * * # about 1:29am PDT, for mem leak check and rerun disabled tests
push:
tags:
- ciflow/periodic/*
- ciflow/s390/*
branches:
- release/*
workflow_dispatch:

concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }}
cancel-in-progress: true

permissions: read-all

jobs:
llm-td:
if: github.repository_owner == 'pytorch'
name: before-test
uses: ./.github/workflows/llm_td_retrieval.yml
permissions:
id-token: write
contents: read

target-determination:
name: before-test
uses: ./.github/workflows/target_determination.yml
needs: llm-td
permissions:
id-token: write
contents: read

linux-manylinux-2_28-py3-cpu-s390x-build:
if: github.repository_owner == 'pytorch'
name: linux-manylinux-2_28-py3-cpu-s390x
uses: ./.github/workflows/_linux-build.yml
with:
build-environment: linux-s390x-binary-manywheel
docker-image-name: pytorch/manylinuxs390x-builder:cpu-s390x-main
runner: linux.s390x
test-matrix: |
{ include: [
{ config: "default", shard: 1, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 2, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 3, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 4, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 5, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 6, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 7, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 8, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 9, num_shards: 10, runner: "linux.s390x" },
{ config: "default", shard: 10, num_shards: 10, runner: "linux.s390x" },
]}
secrets: inherit

linux-manylinux-2_28-py3-cpu-s390x-test:
permissions:
id-token: write
contents: read
name: linux-manylinux-2_28-py3-cpu-s390x
uses: ./.github/workflows/_linux-test.yml
needs:
- linux-manylinux-2_28-py3-cpu-s390x-build
- target-determination
with:
build-environment: linux-s390x-binary-manywheel
docker-image: pytorch/manylinuxs390x-builder:cpu-s390x-main
test-matrix: ${{ needs.linux-manylinux-2_28-py3-cpu-s390x-build.outputs.test-matrix }}
timeout-minutes: 480
use-gha: "yes"
secrets: inherit
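
Besides the cron schedule, the on: block above means the workflow can also be started by hand or by pushing a matching ciflow tag. A sketch of both triggers (the ref and the tag suffix are illustrative):

gh workflow run s390x-periodic.yml --ref main                      # manual workflow_dispatch run
git tag ciflow/s390/12345 && git push origin ciflow/s390/12345     # tag push trigger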