diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py
index 51c2bb4312235b..1e1863cae53a53 100644
--- a/.circleci/cimodel/data/binary_build_data.py
+++ b/.circleci/cimodel/data/binary_build_data.py
@@ -126,6 +126,10 @@ def __init__(self, parent, package_format, python_versions):
self.props["python_versions"] = python_versions
self.props["package_format"] = package_format
+ # XXX Disabling conda for 11.3 as there's currently no appropriate cudatoolkit available
+ if package_format == "conda":
+        self.props["gpu_versions"] = [x for x in self.find_prop("gpu_versions") if x != "cuda113"]
+
def get_children(self):
if self.find_prop("os_name") == "linux":
return [LinuxGccConfigNode(self, v) for v in LINUX_GCC_CONFIG_VARIANTS[self.find_prop("package_format")]]
diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py
index e4a86dc4e96f02..bdcc454c204520 100644
--- a/.circleci/cimodel/data/dimensions.py
+++ b/.circleci/cimodel/data/dimensions.py
@@ -3,6 +3,7 @@
CUDA_VERSIONS = [
"102",
"111",
+ "113",
]
ROCM_VERSIONS = [
diff --git a/.circleci/cimodel/data/windows_build_definitions.py b/.circleci/cimodel/data/windows_build_definitions.py
index c93d968b6f0e7b..c6728d500efe33 100644
--- a/.circleci/cimodel/data/windows_build_definitions.py
+++ b/.circleci/cimodel/data/windows_build_definitions.py
@@ -147,16 +147,13 @@ def render(self):
WindowsJob(None, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(1, _VC2019, CudaVersion(10, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(10, 1), master_only=True),
+ # VS2019 CUDA-10.1 force on cpu
+ WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True),
# VS2019 CUDA-11.1
WindowsJob(None, _VC2019, CudaVersion(11, 1)),
WindowsJob(1, _VC2019, CudaVersion(11, 1), master_only=True),
WindowsJob(2, _VC2019, CudaVersion(11, 1), master_only=True),
WindowsJob('_azure_multi_gpu', _VC2019, CudaVersion(11, 1), multi_gpu=True, master_and_nightly=True),
- # VS2019 CPU-only
- WindowsJob(None, _VC2019, None),
- WindowsJob(1, _VC2019, None),
- WindowsJob(2, _VC2019, None),
- WindowsJob(1, _VC2019, CudaVersion(10, 1), force_on_cpu=True, master_only=True),
]
diff --git a/.circleci/config.yml b/.circleci/config.yml
index a242ccf193f089..1e012438af9ecb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -2258,6 +2258,50 @@ workflows:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
docker_image: "pytorch/manylinux-cuda111"
+ - binary_linux_build:
+ name: binary_linux_manywheel_3_6m_cu113_devtoolset7_nightly_build
+ build_environment: "manywheel 3.6m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_build
+ build_environment: "manywheel 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_manywheel_3_8m_cu113_devtoolset7_nightly_build
+ build_environment: "manywheel 3.8m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_manywheel_3_9m_cu113_devtoolset7_nightly_build
+ build_environment: "manywheel 3.9m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ docker_image: "pytorch/manylinux-cuda113"
- binary_linux_build:
name: binary_linux_manywheel_3_6m_rocm4_0_1_devtoolset7_nightly_build
build_environment: "manywheel 3.6m rocm4.0.1 devtoolset7"
@@ -2666,6 +2710,54 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
libtorch_variant: "static-without-deps"
docker_image: "pytorch/manylinux-cuda111"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps_build
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-with-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps_build
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-without-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps_build
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-with-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps_build
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-without-deps"
+ docker_image: "pytorch/manylinux-cuda113"
- binary_linux_build:
name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_nightly_shared-with-deps_build
build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
@@ -2810,6 +2902,54 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
libtorch_variant: "static-without-deps"
docker_image: "pytorch/libtorch-cxx11-builder:cuda111"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps_build
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-with-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps_build
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-without-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps_build
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-with-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ - binary_linux_build:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps_build
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-without-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
- binary_mac_build:
name: binary_macos_wheel_3_6_cpu_nightly_build
build_environment: "wheel 3.6 cpu"
@@ -3060,6 +3200,46 @@ workflows:
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_wheel_3_6_cu113_nightly_build
+ build_environment: "wheel 3.6 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_wheel_3_7_cu113_nightly_build
+ build_environment: "wheel 3.7 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_wheel_3_8_cu113_nightly_build
+ build_environment: "wheel 3.8 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_wheel_3_9_cu113_nightly_build
+ build_environment: "wheel 3.9 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- binary_windows_build:
name: binary_windows_conda_3_6_cpu_nightly_build
build_environment: "conda 3.6 cpu"
@@ -3210,6 +3390,16 @@ workflows:
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_libtorch_3_7_cu113_debug_nightly_build
+ build_environment: "libtorch 3.7 cu113 debug"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- binary_windows_build:
name: binary_windows_libtorch_3_7_cpu_release_nightly_build
build_environment: "libtorch 3.7 cpu release"
@@ -3240,6 +3430,16 @@ workflows:
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ - binary_windows_build:
+ name: binary_windows_libtorch_3_7_cu113_release_nightly_build
+ build_environment: "libtorch 3.7 cu113 release"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
- binary_linux_test:
name: binary_linux_manywheel_3_6m_cpu_devtoolset7_nightly_test
build_environment: "manywheel 3.6m cpu devtoolset7"
@@ -3412,6 +3612,66 @@ workflows:
docker_image: "pytorch/manylinux-cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_manywheel_3_6m_cu113_devtoolset7_nightly_test
+ build_environment: "manywheel 3.6m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_linux_manywheel_3_6m_cu113_devtoolset7_nightly_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_test
+ build_environment: "manywheel 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_manywheel_3_8m_cu113_devtoolset7_nightly_test
+ build_environment: "manywheel 3.8m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_linux_manywheel_3_8m_cu113_devtoolset7_nightly_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_manywheel_3_9m_cu113_devtoolset7_nightly_test
+ build_environment: "manywheel 3.9m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_linux_manywheel_3_9m_cu113_devtoolset7_nightly_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_manywheel_3_6m_rocm4_0_1_devtoolset7_nightly_test
build_environment: "manywheel 3.6m rocm4.0.1 devtoolset7"
@@ -3948,6 +4208,70 @@ workflows:
docker_image: "pytorch/manylinux-cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps_test
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-with-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps_test
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-without-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps_test
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-with-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps_test
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-without-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps_build
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
- binary_linux_test:
name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_nightly_shared-with-deps_test
build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
@@ -4132,9 +4456,9 @@ workflows:
docker_image: "pytorch/libtorch-cxx11-builder:cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
- - binary_windows_test:
- name: binary_windows_wheel_3_6_cpu_nightly_test
- build_environment: "wheel 3.6 cpu"
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps_test
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
filters:
branches:
only:
@@ -4142,11 +4466,75 @@ workflows:
tags:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-with-deps"
requires:
- - binary_windows_wheel_3_6_cpu_nightly_build
- - binary_windows_test:
- name: binary_windows_wheel_3_7_cpu_nightly_test
- build_environment: "wheel 3.7 cpu"
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps_build
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps_test
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "shared-without-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps_build
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps_test
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-with-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps_build
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_linux_test:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps_test
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ libtorch_variant: "static-without-deps"
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps_build
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - binary_windows_test:
+ name: binary_windows_wheel_3_6_cpu_nightly_test
+ build_environment: "wheel 3.6 cpu"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_wheel_3_6_cpu_nightly_build
+ - binary_windows_test:
+ name: binary_windows_wheel_3_7_cpu_nightly_test
+ build_environment: "wheel 3.7 cpu"
filters:
branches:
only:
@@ -4284,6 +4672,58 @@ workflows:
requires:
- binary_windows_wheel_3_9_cu111_nightly_build
executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_wheel_3_6_cu113_nightly_test
+ build_environment: "wheel 3.6 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_wheel_3_6_cu113_nightly_build
+ executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_wheel_3_7_cu113_nightly_test
+ build_environment: "wheel 3.7 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_wheel_3_7_cu113_nightly_build
+ executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_wheel_3_8_cu113_nightly_test
+ build_environment: "wheel 3.8 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_wheel_3_8_cu113_nightly_build
+ executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_wheel_3_9_cu113_nightly_test
+ build_environment: "wheel 3.9 cu113"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_wheel_3_9_cu113_nightly_build
+ executor: windows-with-nvidia-gpu
- binary_windows_test:
name: binary_windows_conda_3_6_cpu_nightly_test
build_environment: "conda 3.6 cpu"
@@ -4474,6 +4914,19 @@ workflows:
requires:
- binary_windows_libtorch_3_7_cu111_debug_nightly_build
executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_libtorch_3_7_cu113_debug_nightly_test
+ build_environment: "libtorch 3.7 cu113 debug"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_libtorch_3_7_cu113_debug_nightly_build
+ executor: windows-with-nvidia-gpu
- binary_windows_test:
name: binary_windows_libtorch_3_7_cpu_release_nightly_test
build_environment: "libtorch 3.7 cpu release"
@@ -4512,6 +4965,19 @@ workflows:
requires:
- binary_windows_libtorch_3_7_cu111_release_nightly_build
executor: windows-with-nvidia-gpu
+ - binary_windows_test:
+ name: binary_windows_libtorch_3_7_cu113_release_nightly_test
+ build_environment: "libtorch 3.7 cu113 release"
+ filters:
+ branches:
+ only:
+ - /.*/
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ requires:
+ - binary_windows_libtorch_3_7_cu113_release_nightly_build
+ executor: windows-with-nvidia-gpu
- binary_upload:
name: binary_linux_manywheel_3_6m_cpu_devtoolset7_nightly_upload
context: org-member
@@ -4680,6 +5146,62 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: manywheel
upload_subfolder: cu111
+ - binary_upload:
+ name: binary_linux_manywheel_3_6m_cu113_devtoolset7_nightly_upload
+ context: org-member
+ requires:
+ - binary_linux_manywheel_3_6m_cu113_devtoolset7_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: manywheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_upload
+ context: org-member
+ requires:
+ - binary_linux_manywheel_3_7m_cu113_devtoolset7_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: manywheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_manywheel_3_8m_cu113_devtoolset7_nightly_upload
+ context: org-member
+ requires:
+ - binary_linux_manywheel_3_8m_cu113_devtoolset7_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: manywheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_manywheel_3_9m_cu113_devtoolset7_nightly_upload
+ context: org-member
+ requires:
+ - binary_linux_manywheel_3_9m_cu113_devtoolset7_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: manywheel
+ upload_subfolder: cu113
- binary_upload:
name: binary_linux_manywheel_3_6m_rocm4_0_1_devtoolset7_nightly_upload
context: org-member
@@ -5184,6 +5706,62 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: libtorch
upload_subfolder: cu111
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
- binary_upload:
name: binary_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_nightly_shared-with-deps_upload
context: org-member
@@ -5352,6 +5930,62 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: libtorch
upload_subfolder: cu111
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps_upload
+ context: org-member
+ requires:
+ - binary_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
- binary_upload:
name: binary_macos_wheel_3_6_cpu_nightly_upload
context: org-member
@@ -5702,6 +6336,62 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: wheel
upload_subfolder: cu111
+ - binary_upload:
+ name: binary_windows_wheel_3_6_cu113_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_wheel_3_6_cu113_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: wheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_windows_wheel_3_7_cu113_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_wheel_3_7_cu113_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: wheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_windows_wheel_3_8_cu113_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_wheel_3_8_cu113_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: wheel
+ upload_subfolder: cu113
+ - binary_upload:
+ name: binary_windows_wheel_3_9_cu113_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_wheel_3_9_cu113_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: wheel
+ upload_subfolder: cu113
- binary_upload:
name: binary_windows_conda_3_6_cpu_nightly_upload
context: org-member
@@ -5912,6 +6602,20 @@ workflows:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: libtorch
upload_subfolder: cu111
+ - binary_upload:
+ name: binary_windows_libtorch_3_7_cu113_debug_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_libtorch_3_7_cu113_debug_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu113
- binary_upload:
name: binary_windows_libtorch_3_7_cpu_release_nightly_upload
context: org-member
@@ -5927,10 +6631,24 @@ workflows:
package_type: libtorch
upload_subfolder: cpu
- binary_upload:
- name: binary_windows_libtorch_3_7_cu102_release_nightly_upload
+ name: binary_windows_libtorch_3_7_cu102_release_nightly_upload
+ context: org-member
+ requires:
+ - binary_windows_libtorch_3_7_cu102_release_nightly_test
+ filters:
+ branches:
+ only:
+ - nightly
+ tags:
+ only:
+ - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
+ package_type: libtorch
+ upload_subfolder: cu102
+ - binary_upload:
+ name: binary_windows_libtorch_3_7_cu111_release_nightly_upload
context: org-member
requires:
- - binary_windows_libtorch_3_7_cu102_release_nightly_test
+ - binary_windows_libtorch_3_7_cu111_release_nightly_test
filters:
branches:
only:
@@ -5939,12 +6657,12 @@ workflows:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: libtorch
- upload_subfolder: cu102
+ upload_subfolder: cu111
- binary_upload:
- name: binary_windows_libtorch_3_7_cu111_release_nightly_upload
+ name: binary_windows_libtorch_3_7_cu113_release_nightly_upload
context: org-member
requires:
- - binary_windows_libtorch_3_7_cu111_release_nightly_test
+ - binary_windows_libtorch_3_7_cu113_release_nightly_test
filters:
branches:
only:
@@ -5953,7 +6671,7 @@ workflows:
only:
- /v[0-9]+(\.[0-9]+)*-rc[0-9]+/
package_type: libtorch
- upload_subfolder: cu111
+ upload_subfolder: cu113
when: << pipeline.parameters.run_binary_tests >>
build:
jobs:
@@ -6946,6 +7664,24 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
+ - pytorch_windows_test:
+ build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
+ cuda_version: "10.1"
+ filters:
+ branches:
+ only:
+ - master
+ - /ci-all\/.*/
+ - /release\/.*/
+ name: pytorch_windows_vs2019_py36_cuda10.1_on_cpu_test1
+ python_version: "3.6"
+ requires:
+ - pytorch_windows_vs2019_py36_cuda10.1_build
+ test_name: pytorch-windows-test1
+ use_cuda: "0"
+ vc_product: BuildTools
+ vc_version: ""
+ vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
@@ -7006,57 +7742,6 @@ workflows:
name: pytorch_windows_vs2019_py36_cuda11.1_test_azure_multi_gpu
requires:
- pytorch_windows_vs2019_py36_cuda11.1_build
- - pytorch_windows_build:
- build_environment: pytorch-win-vs2019-cpu-py3
- cuda_version: cpu
- name: pytorch_windows_vs2019_py36_cpu_build
- python_version: "3.6"
- use_cuda: "0"
- vc_product: BuildTools
- vc_version: ""
- vc_year: "2019"
- - pytorch_windows_test:
- build_environment: pytorch-win-vs2019-cpu-py3
- cuda_version: cpu
- name: pytorch_windows_vs2019_py36_cpu_test1
- python_version: "3.6"
- requires:
- - pytorch_windows_vs2019_py36_cpu_build
- test_name: pytorch-windows-test1
- use_cuda: "0"
- vc_product: BuildTools
- vc_version: ""
- vc_year: "2019"
- - pytorch_windows_test:
- build_environment: pytorch-win-vs2019-cpu-py3
- cuda_version: cpu
- name: pytorch_windows_vs2019_py36_cpu_test2
- python_version: "3.6"
- requires:
- - pytorch_windows_vs2019_py36_cpu_build
- test_name: pytorch-windows-test2
- use_cuda: "0"
- vc_product: BuildTools
- vc_version: ""
- vc_year: "2019"
- - pytorch_windows_test:
- build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
- cuda_version: "10.1"
- filters:
- branches:
- only:
- - master
- - /ci-all\/.*/
- - /release\/.*/
- name: pytorch_windows_vs2019_py36_cuda10.1_on_cpu_test1
- python_version: "3.6"
- requires:
- - pytorch_windows_vs2019_py36_cuda10.1_build
- test_name: pytorch-windows-test1
- use_cuda: "0"
- vc_product: BuildTools
- vc_version: ""
- vc_year: "2019"
- update_s3_htmls:
context: org-member
filters:
@@ -7200,6 +7885,54 @@ workflows:
docker_image: "pytorch/manylinux-cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_manywheel_3_6m_cu113_devtoolset7_nightly
+ build_environment: "manywheel 3.6m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_manywheel_3_7m_cu113_devtoolset7_nightly
+ build_environment: "manywheel 3.7m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_manywheel_3_8m_cu113_devtoolset7_nightly
+ build_environment: "manywheel 3.8m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_manywheel_3_9m_cu113_devtoolset7_nightly
+ build_environment: "manywheel 3.9m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_manywheel_3_6m_rocm4_0_1_devtoolset7_nightly
build_environment: "manywheel 3.6m rocm4.0.1 devtoolset7"
@@ -7628,6 +8361,58 @@ workflows:
docker_image: "pytorch/manylinux-cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-with-deps
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "shared-with-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_devtoolset7_nightly_shared-without-deps
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "shared-without-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-with-deps
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "static-with-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_devtoolset7_nightly_static-without-deps
+ build_environment: "libtorch 3.7m cu113 devtoolset7"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "static-without-deps"
+ docker_image: "pytorch/manylinux-cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
- smoke_linux_test:
name: smoke_linux_libtorch_3_7m_cpu_gcc5_4_cxx11-abi_nightly_shared-with-deps
build_environment: "libtorch 3.7m cpu gcc5.4_cxx11-abi"
@@ -7776,6 +8561,58 @@ workflows:
docker_image: "pytorch/libtorch-cxx11-builder:cuda111"
use_cuda_docker_runtime: "1"
resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-with-deps
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "shared-with-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_shared-without-deps
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "shared-without-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-with-deps
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "static-with-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
+ - smoke_linux_test:
+ name: smoke_linux_libtorch_3_7m_cu113_gcc5_4_cxx11-abi_nightly_static-without-deps
+ build_environment: "libtorch 3.7m cu113 gcc5.4_cxx11-abi"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ libtorch_variant: "static-without-deps"
+ docker_image: "pytorch/libtorch-cxx11-builder:cuda113"
+ use_cuda_docker_runtime: "1"
+ resource_class: gpu.medium
- smoke_mac_test:
name: smoke_macos_wheel_3_6_cpu_nightly
build_environment: "wheel 3.6 cpu"
@@ -7973,6 +8810,46 @@ workflows:
only:
- postnightly
executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_wheel_3_6_cu113_nightly
+ build_environment: "wheel 3.6 cu113"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_wheel_3_7_cu113_nightly
+ build_environment: "wheel 3.7 cu113"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_wheel_3_8_cu113_nightly
+ build_environment: "wheel 3.8 cu113"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_wheel_3_9_cu113_nightly
+ build_environment: "wheel 3.9 cu113"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
- smoke_windows_test:
name: smoke_windows_conda_3_6_cpu_nightly
build_environment: "conda 3.6 cpu"
@@ -8118,6 +8995,16 @@ workflows:
only:
- postnightly
executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_libtorch_3_7_cu113_debug_nightly
+ build_environment: "libtorch 3.7 cu113 debug"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
- smoke_windows_test:
name: smoke_windows_libtorch_3_7_cpu_release_nightly
build_environment: "libtorch 3.7 cpu release"
@@ -8147,6 +9034,16 @@ workflows:
only:
- postnightly
executor: windows-with-nvidia-gpu
+ - smoke_windows_test:
+ name: smoke_windows_libtorch_3_7_cu113_release_nightly
+ build_environment: "libtorch 3.7 cu113 release"
+ requires:
+ - update_s3_htmls
+ filters:
+ branches:
+ only:
+ - postnightly
+ executor: windows-with-nvidia-gpu
when: << pipeline.parameters.run_build >>
master_build:
jobs:
@@ -8409,6 +9306,18 @@ workflows:
vc_product: BuildTools
vc_version: ""
vc_year: "2019"
+ - pytorch_windows_test:
+ build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
+ cuda_version: "10.1"
+ name: pytorch_windows_vs2019_py36_cuda10.1_on_cpu_test1
+ python_version: "3.6"
+ requires:
+ - pytorch_windows_vs2019_py36_cuda10.1_build
+ test_name: pytorch-windows-test1
+ use_cuda: "0"
+ vc_product: BuildTools
+ vc_version: ""
+ vc_year: "2019"
- pytorch_windows_build:
build_environment: pytorch-win-vs2019-cuda11-cudnn8-py3
cuda_version: "11.1"
@@ -8448,18 +9357,6 @@ workflows:
name: pytorch_windows_vs2019_py36_cuda11.1_test_azure_multi_gpu
requires:
- pytorch_windows_vs2019_py36_cuda11.1_build
- - pytorch_windows_test:
- build_environment: pytorch-win-vs2019-cuda10-cudnn7-py3
- cuda_version: "10.1"
- name: pytorch_windows_vs2019_py36_cuda10.1_on_cpu_test1
- python_version: "3.6"
- requires:
- - pytorch_windows_vs2019_py36_cuda10.1_build
- test_name: pytorch-windows-test1
- use_cuda: "0"
- vc_product: BuildTools
- vc_version: ""
- vc_year: "2019"
when: << pipeline.parameters.run_master_build >>
scheduled-ci:
triggers:
diff --git a/.circleci/scripts/binary_linux_build.sh b/.circleci/scripts/binary_linux_build.sh
index e36d06906246dc..055bba4d2f5d57 100755
--- a/.circleci/scripts/binary_linux_build.sh
+++ b/.circleci/scripts/binary_linux_build.sh
@@ -4,10 +4,14 @@ echo "RUNNING ON $(uname -a) WITH $(nproc) CPUS AND $(free -m)"
set -eux -o pipefail
source /env
-# Defaults here so they can be changed in one place
-export MAX_JOBS=${MAX_JOBS:-$(( $(nproc) - 2 ))}
+# Because most Circle executors only have 20 CPUs, using more causes OOMs w/ Ninja and nvcc parallelization
+MEMORY_LIMIT_MAX_JOBS=18
+NUM_CPUS=$(( $(nproc) - 2 ))
-if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+# Defaults here for **binary** linux builds so they can be changed in one place
+export MAX_JOBS=${MAX_JOBS:-$(( ${NUM_CPUS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${NUM_CPUS} ))}
+
+if [[ "${DESIRED_CUDA}" == "cu111" || "${DESIRED_CUDA}" == "cu113" ]]; then
export BUILD_SPLIT_CUDA="ON"
fi
@@ -22,5 +26,9 @@ else
build_script='manywheel/build.sh'
fi
+if [[ "$CIRCLE_BRANCH" == "master" ]] || [[ "$CIRCLE_BRANCH" == release/* ]]; then
+ export BUILD_DEBUG_INFO=1
+fi
+
# Build the package
SKIP_ALL_TESTS=1 "/builder/$build_script"
diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh
index d377093fd7ebd7..8d171e8101b4f7 100755
--- a/.circleci/scripts/binary_linux_test.sh
+++ b/.circleci/scripts/binary_linux_test.sh
@@ -38,6 +38,10 @@ if [[ "$DESIRED_CUDA" == "cu112" ]]; then
EXTRA_CONDA_FLAGS="-c=conda-forge"
fi
+# Move debug wheels out of the package dir so they don't get installed
+mkdir -p /tmp/debug_final_pkgs
+mv /final_pkgs/debug-*.zip /tmp/debug_final_pkgs || echo "no debug packages to move"
+
# Install the package
# These network calls should not have 'retry's because they are installing
# locally and aren't actually network calls
diff --git a/.circleci/scripts/binary_windows_build.sh b/.circleci/scripts/binary_windows_build.sh
index 04d31002106cc1..2d1f228ec5a005 100644
--- a/.circleci/scripts/binary_windows_build.sh
+++ b/.circleci/scripts/binary_windows_build.sh
@@ -15,7 +15,7 @@ else
export VC_YEAR=2019
fi
-if [[ "${DESIRED_CUDA}" == "cu111" ]]; then
+if [[ "${DESIRED_CUDA}" == "cu111" || "${DESIRED_CUDA}" == "cu113" ]]; then
export BUILD_SPLIT_CUDA="ON"
fi
diff --git a/.github/scale-config.yml b/.github/scale-config.yml
index cfe100131d73fb..dee5222837ef32 100644
--- a/.github/scale-config.yml
+++ b/.github/scale-config.yml
@@ -32,3 +32,8 @@ runner_types:
os: windows
max_available: 200
disk_size: 256
+ windows.8xlarge.nvidia.gpu:
+ instance_type: g3.8xlarge
+ os: windows
+ max_available: 25
+ disk_size: 256
diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py
index 304a2dd1979ed8..31bdeba4817116 100755
--- a/.github/scripts/generate_ci_workflows.py
+++ b/.github/scripts/generate_ci_workflows.py
@@ -71,6 +71,7 @@ def generate_workflow_file(
PyTorchWindowsWorkflow(
build_environment="pytorch-win-vs2019-cpu-py3",
test_runner_type=WINDOWS_CPU_TEST_RUNNER,
+ on_pull_request=True
)
]
diff --git a/.github/scripts/report_git_status.sh b/.github/scripts/report_git_status.sh
index 357bacfecb2471..738fbcfd1451ae 100755
--- a/.github/scripts/report_git_status.sh
+++ b/.github/scripts/report_git_status.sh
@@ -1,5 +1,5 @@
#!/usr/bin/env bash
-CHANGES=$(git status --porcelain)
+CHANGES=$(git status --porcelain "$1")
echo "$CHANGES"
-git diff
+git diff "$1"
[ -z "$CHANGES" ]
diff --git a/.github/templates/windows_ci_workflow.yml.in b/.github/templates/windows_ci_workflow.yml.in
index 5a1c602b40f229..9544b83138e27d 100644
--- a/.github/templates/windows_ci_workflow.yml.in
+++ b/.github/templates/windows_ci_workflow.yml.in
@@ -31,6 +31,10 @@ jobs:
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
+ - name: Install 7zip if not already installed
+ shell: powershell
+ run: |
+ choco install 7zip.install -y
- name: Install Visual Studio 2019 toolchain
shell: powershell
run: |
@@ -73,6 +77,10 @@ jobs:
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
+ - name: Install 7zip if not already installed
+ shell: powershell
+ run: |
+ choco install 7zip.install -y
- name: Install Visual Studio 2019 toolchain
shell: powershell
run: |
diff --git a/.github/workflows/cancel_redundant_workflows.yml b/.github/workflows/cancel_redundant_workflows.yml
index a3dcf0d419a064..968ad48c15f7d5 100644
--- a/.github/workflows/cancel_redundant_workflows.yml
+++ b/.github/workflows/cancel_redundant_workflows.yml
@@ -9,6 +9,7 @@ on:
- Linux CI (pytorch-linux-xenial-py3.6-gcc5.4)
- Test tools
- TorchBench CI (pytorch-linux-py3.7-cu102)
+ - Windows CI (pytorch-win-vs2019-cpu-py3)
- clang-format
jobs:
cancel:
diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 4bdbecebe64db0..e4c1bf1d330a2b 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -35,27 +35,6 @@ jobs:
run: |
pip install ruamel.yaml==0.17.4
.github/scripts/lint_native_functions.py
- - name: Extract scripts from GitHub Actions workflows
- if: always() && steps.requirements.outcome == 'success'
- run: |
- # For local lints, remove the .extracted_scripts folder if it was already there
- rm -rf .extracted_scripts
- tools/extract_scripts.py --out=.extracted_scripts
- - name: Install ShellCheck
- id: install_shellcheck
- if: always()
- # https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
- run: |
- set -x
- scversion="v0.7.2"
- wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
- sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
- rm -r "shellcheck-${scversion}"
- shellcheck --version
- - name: Run ShellCheck
- if: always() && steps.install_shellcheck.outcome == 'success'
- run: |
- tools/run_shellcheck.sh .jenkins/pytorch .extracted_scripts
- name: Ensure correct trailing newlines
if: always() && steps.requirements.outcome == 'success'
run: |
@@ -109,7 +88,7 @@ jobs:
if: always() && steps.requirements.outcome == 'success'
run: |
set -eux
- python torch/testing/check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
+ python torch/testing/_check_kernel_launches.py |& tee "${GITHUB_WORKSPACE}"/cuda_kernel_launch_checks.txt
- name: Ensure no direct cub include
if: always()
run: |
@@ -129,7 +108,7 @@ jobs:
run: |
python2 setup.py | grep "Python 2 has reached end-of-life and is no longer supported by PyTorch."
- templates:
+ shellcheck:
runs-on: ubuntu-18.04
steps:
- name: Setup Python
@@ -137,14 +116,42 @@ jobs:
with:
python-version: 3.x
architecture: x64
+ - name: Checkout PyTorch
+ uses: actions/checkout@v2
+ - name: Install requirements
+ id: requirements
+ run: |
+ pip install -r requirements.txt
- name: Install Jinja2
- run: pip install Jinja2
+ run: |
+ pip install Jinja2==3.0.1
- name: Checkout PyTorch
uses: actions/checkout@v2
- name: Regenerate workflows
run: .github/scripts/generate_ci_workflows.py
- name: Assert that regenerating the workflows didn't change them
- run: .github/scripts/report_git_status.sh
+ run: .github/scripts/report_git_status.sh .github/workflows
+ - name: Install ShellCheck
+ id: install_shellcheck
+ if: always()
+ # https://github.com/koalaman/shellcheck/tree/v0.7.2#installing-a-pre-compiled-binary
+ run: |
+ set -x
+ scversion="v0.7.2"
+ wget -qO- "https://github.com/koalaman/shellcheck/releases/download/${scversion?}/shellcheck-${scversion?}.linux.x86_64.tar.xz" | tar -xJv
+ sudo cp "shellcheck-${scversion}/shellcheck" /usr/bin/
+ rm -r "shellcheck-${scversion}"
+ shellcheck --version
+ - name: Extract scripts from GitHub Actions workflows
+ if: always() && steps.install_shellcheck.outcome == 'success'
+ run: |
+ # For local lints, remove the .extracted_scripts folder if it was already there
+ rm -rf .extracted_scripts
+ tools/extract_scripts.py --out=.extracted_scripts
+ - name: Run ShellCheck
+ if: always() && steps.install_shellcheck.outcome == 'success'
+ run: |
+ tools/run_shellcheck.sh .extracted_scripts .jenkins/pytorch
toc:
runs-on: ubuntu-18.04
@@ -222,12 +229,10 @@ jobs:
clang-tidy:
if: github.event_name == 'pull_request'
runs-on: ubuntu-18.04
+ container:
+ # ubuntu18.04-cuda10.2-py3.6-tidy11
+ image: ghcr.io/pytorch/cilint-clang-tidy:e2cfc57ce4fa3a257a4b78fdfdc2b065c167b9c5
steps:
- - name: Setup Python
- uses: actions/setup-python@v2
- with:
- python-version: 3.x
- architecture: x64
- name: Checkout PyTorch
uses: actions/checkout@v2
with:
@@ -236,47 +241,32 @@ jobs:
env:
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
+ cd "${GITHUB_WORKSPACE}"
mkdir clang-tidy-output
cd clang-tidy-output
echo "$HEAD_SHA" > commit-sha.txt
- - name: Install dependencies
- run: |
- set -eux
- # Install CUDA
- wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-ubuntu1804.pin
- sudo mv cuda-ubuntu1804.pin /etc/apt/preferences.d/cuda-repository-pin-600
- sudo apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/7fa2af80.pub
- sudo add-apt-repository "deb http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/ /"
- sudo apt-get update
- sudo apt-get --no-install-recommends -y install cuda-toolkit-10-2
- # Install dependencies
- pip install pyyaml typing_extensions
- wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key | sudo apt-key add -
- sudo apt-add-repository "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-11 main"
- sudo apt-get update
- sudo apt-get install -y clang-tidy-11
- sudo update-alternatives --install /usr/bin/clang-tidy clang-tidy /usr/bin/clang-tidy-11 1000
- name: Generate build files
run: |
+ cd "${GITHUB_WORKSPACE}"
set -eux
git remote add upstream https://github.com/pytorch/pytorch
git fetch upstream "$GITHUB_BASE_REF"
- if [[ ! -d build ]]; then
+ if [ ! -d build ]; then
git submodule update --init --recursive
export USE_NCCL=0
export USE_DEPLOY=1
# We really only need compile_commands.json, so no need to build!
- time python setup.py --cmake-only build
+ time python3 setup.py --cmake-only build
# Generate ATen files.
- time python -m tools.codegen.gen \
+ time python3 -m tools.codegen.gen \
-s aten/src/ATen \
-d build/aten/src/ATen
# Generate PyTorch files.
- time python tools/setup_helpers/generate_code.py \
+ time python3 tools/setup_helpers/generate_code.py \
--declarations-path build/aten/src/ATen/Declarations.yaml \
--native-functions-path aten/src/ATen/native/native_functions.yaml \
--nn-path aten/src
@@ -286,6 +276,7 @@ jobs:
BASE_SHA: ${{ github.event.pull_request.base.sha }}
HEAD_SHA: ${{ github.event.pull_request.head.sha }}
run: |
+ cd "${GITHUB_WORKSPACE}"
set -eux
# Run Clang-Tidy
@@ -296,7 +287,7 @@ jobs:
# /torch/csrc/generic/*.cpp is excluded because those files aren't actually built.
# deploy/interpreter files are excluded due to using macros and other techniquies
# that are not easily converted to accepted c++
- python tools/clang_tidy.py \
+ python3 tools/clang_tidy.py \
--verbose \
--paths torch/csrc/ \
--diff "$BASE_SHA" \
diff --git a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
index f2ac1d573a89a7..d3166967ed8c4e 100644
--- a/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
+++ b/.github/workflows/pytorch-win-vs2019-cpu-py3.yml
@@ -4,6 +4,7 @@
name: Windows CI (pytorch-win-vs2019-cpu-py3)
on:
+ pull_request:
push:
branches:
- master
@@ -29,6 +30,10 @@ jobs:
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
+ - name: Install 7zip if not already installed
+ shell: powershell
+ run: |
+ choco install 7zip.install -y
- name: Install Visual Studio 2019 toolchain
shell: powershell
run: |
@@ -71,6 +76,10 @@ jobs:
steps:
- name: Checkout PyTorch
uses: actions/checkout@v2
+ - name: Install 7zip if not already installed
+ shell: powershell
+ run: |
+ choco install 7zip.install -y
- name: Install Visual Studio 2019 toolchain
shell: powershell
run: |
diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh
index bc309b8a54d832..c2be6c96b3e720 100755
--- a/.jenkins/pytorch/build.sh
+++ b/.jenkins/pytorch/build.sh
@@ -24,7 +24,7 @@ if [[ "$BUILD_ENVIRONMENT" == *-mobile-code-analysis* ]]; then
exec "$(dirname "${BASH_SOURCE[0]}")/build-mobile-code-analysis.sh" "$@"
fi
-if [[ "$BUILD_ENVIRONMENT" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7* ]]; then
+if [[ "$BUILD_ENVIRONMENT" == pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7* ]]; then
# Enabling DEPLOY build (embedded torch python interpreter, experimental)
# only on one config for now, can expand later
export USE_DEPLOY=ON
diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh
index 48840ad6c1bd58..3bce691f8cf85e 100755
--- a/.jenkins/pytorch/test.sh
+++ b/.jenkins/pytorch/test.sh
@@ -452,7 +452,7 @@ elif [[ "${BUILD_ENVIRONMENT}" == *libtorch* ]]; then
# TODO: run some C++ tests
echo "no-op at the moment"
elif [[ "${BUILD_ENVIRONMENT}" == *-test1 || "${JOB_BASE_NAME}" == *-test1 ]]; then
- if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda10.2-cudnn7-py3-gcc7-test1 ]]; then
+ if [[ "${BUILD_ENVIRONMENT}" == pytorch-linux-xenial-cuda11.1-cudnn8-py3-gcc7-test1 ]]; then
test_torch_deploy
fi
test_without_numpy
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f308a75f07273..4818b5012b576f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -351,6 +351,7 @@ option(USE_SYSTEM_CPUINFO "Use system-provided cpuinfo." OFF)
option(USE_SYSTEM_SLEEF "Use system-provided sleef." OFF)
option(USE_SYSTEM_GLOO "Use system-provided gloo." OFF)
option(USE_SYSTEM_FP16 "Use system-provided fp16." OFF)
+option(USE_SYSTEM_PYBIND11 "Use system-provided PyBind11." OFF)
option(USE_SYSTEM_PTHREADPOOL "Use system-provided pthreadpool." OFF)
option(USE_SYSTEM_PSIMD "Use system-provided psimd." OFF)
option(USE_SYSTEM_FXDIV "Use system-provided fxdiv." OFF)
@@ -371,6 +372,7 @@ if(USE_SYSTEM_LIBS)
set(USE_SYSTEM_BENCHMARK ON)
set(USE_SYSTEM_ONNX ON)
set(USE_SYSTEM_XNNPACK ON)
+ set(USE_SYSTEM_PYBIND11 ON)
endif()
# Used when building Caffe2 through setup.py
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index c5a363f8a5c0e3..02d4f63a354710 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -910,7 +910,7 @@ tensor([1., 2., 3., 4.], dtype=torch.float64)
```
GDB tries to automatically load `pytorch-gdb` thanks to the
-[.gdbinit](.gdbinit) at the root of the pytorch repo. Howevever, auto-loadings is disabled by default, because of security reasons:
+[.gdbinit](.gdbinit) at the root of the pytorch repo. However, auto-loading is disabled by default for security reasons:
```
$ gdb
diff --git a/Makefile b/Makefile
index 59cdae8f9faed2..8d61fd0c543ec4 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,8 @@
# This makefile does nothing but delegating the actual building to cmake.
+PYTHON = python3
all:
- @mkdir -p build && cd build && cmake .. $(shell python ./scripts/get_python_cmake_flags.py) && $(MAKE)
+ @mkdir -p build && cd build && cmake .. $(shell $(PYTHON) ./scripts/get_python_cmake_flags.py) && $(MAKE)
local:
@./scripts/build_local.sh
@@ -31,13 +32,32 @@ generate-gha-workflows:
.github/scripts/generate_ci_workflows.py
$(MAKE) shellcheck-gha
+shellcheck:
+ @$(PYTHON) tools/actions_local_runner.py \
+ --file .github/workflows/lint.yml \
+ --job 'shellcheck' \
+ --step "Regenerate workflows"
+ @$(PYTHON) tools/actions_local_runner.py \
+ --file .github/workflows/lint.yml \
+ --job 'shellcheck' \
+ --step "Assert that regenerating the workflows didn't change them"
+ @$(PYTHON) tools/actions_local_runner.py \
+ --file .github/workflows/lint.yml \
+ --job 'shellcheck' \
+ --step 'Extract scripts from GitHub Actions workflows'
+ @$(PYTHON) tools/actions_local_runner.py \
+ $(CHANGED_ONLY) \
+ --job 'shellcheck'
+
setup_lint:
- python tools/actions_local_runner.py --file .github/workflows/lint.yml \
- --job 'flake8-py3' --step 'Install dependencies' --no-quiet
- python tools/actions_local_runner.py --file .github/workflows/lint.yml \
- --job 'cmakelint' --step 'Install dependencies' --no-quiet
- python tools/actions_local_runner.py --file .github/workflows/lint.yml \
- --job 'mypy' --step 'Install dependencies' --no-quiet
+ $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \
+ --job 'flake8-py3' --step 'Install dependencies' --no-quiet
+ $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \
+ --job 'cmakelint' --step 'Install dependencies' --no-quiet
+ $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \
+ --job 'mypy' --step 'Install dependencies' --no-quiet
+ $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \
+ --job 'shellcheck' --step 'Install Jinja2' --no-quiet
@if [ "$$(uname)" = "Darwin" ]; then \
if [ -z "$$(which brew)" ]; then \
@@ -46,20 +66,15 @@ setup_lint:
fi; \
brew install shellcheck; \
else \
- python tools/actions_local_runner.py --file .github/workflows/lint.yml \
- --job 'quick-checks' --step 'Install ShellCheck' --no-quiet; \
+ $(PYTHON) tools/actions_local_runner.py --file .github/workflows/lint.yml \
+ --job 'shellcheck' --step 'Install ShellCheck' --no-quiet; \
fi
pip install jinja2
quick_checks:
- @python tools/actions_local_runner.py \
- --file .github/workflows/lint.yml \
- --job 'quick-checks' \
- --step 'Extract scripts from GitHub Actions workflows'
-
# TODO: This is broken when 'git config submodule.recurse' is 'true' since the
# lints will descend into third_party submodules
- @python tools/actions_local_runner.py \
+ @$(PYTHON) tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'quick-checks' \
--step 'Ensure no trailing spaces' \
@@ -70,23 +85,20 @@ quick_checks:
--step 'Ensure no unqualified noqa' \
--step 'Ensure no unqualified type ignore' \
--step 'Ensure no direct cub include' \
- --step 'Run ShellCheck' \
--step 'Ensure correct trailing newlines'
flake8:
- @python tools/actions_local_runner.py \
- --file-filter '.py' \
+ @$(PYTHON) tools/actions_local_runner.py \
$(CHANGED_ONLY) \
--job 'flake8-py3'
mypy:
- @python tools/actions_local_runner.py \
- --file-filter '.py' \
+ @$(PYTHON) tools/actions_local_runner.py \
$(CHANGED_ONLY) \
--job 'mypy'
cmakelint:
- @python tools/actions_local_runner.py \
+ @$(PYTHON) tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'cmakelint' \
--step 'Run cmakelint'
@@ -96,12 +108,12 @@ clang_tidy:
exit 1
toc:
- @python tools/actions_local_runner.py \
+ @$(PYTHON) tools/actions_local_runner.py \
--file .github/workflows/lint.yml \
--job 'toc' \
--step "Regenerate ToCs and check that they didn't change"
-lint: flake8 mypy quick_checks cmakelint generate-gha-workflows
+lint: flake8 mypy quick_checks cmakelint shellcheck
quicklint: CHANGED_ONLY=--changed-only
-quicklint: mypy flake8 mypy quick_checks cmakelint generate-gha-workflows
+quicklint: mypy flake8 mypy quick_checks cmakelint shellcheck
diff --git a/README.md b/README.md
index 0827c8c4e731a1..99a0209ee9bb5d 100644
--- a/README.md
+++ b/README.md
@@ -48,7 +48,7 @@ You can reuse your favorite Python packages such as NumPy, SciPy, and Cython to
| Linux (ppc64le) GPU |
— | [![Build Status](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/badge/icon)](https://powerci.osuosl.org/job/pytorch-master-nightly-py3-linux-ppc64le-gpu/) | — |
| Linux (aarch64) CPU | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py36)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py36) | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py37) | [![Build Status](http://openlabtesting.org:15000/badge?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38)](https://status.openlabtesting.org/builds/builds?project=pytorch%2Fpytorch&job_name=pytorch-arm64-build-daily-master-py38) |
-See also the [ci.pytorch.org HUD](https://ezyang.github.io/pytorch-ci-hud/build/pytorch-master).
+See also the [ci.pytorch.org HUD](https://hud.pytorch.org/build2/pytorch-master).
## More About PyTorch
@@ -270,13 +270,13 @@ Sometimes there are regressions in new versions of Visual Studio, so
it's best to use the same Visual Studio Version [16.8.5](https://github.com/pytorch/pytorch/blob/master/.circleci/scripts/vs_install.ps1) as Pytorch CI's.
You can use Visual Studio Enterprise, Professional or Community though PyTorch CI uses Visual Studio BuildTools.
-If you want to build legacy python code, please refert to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
+If you want to build legacy python code, please refer to [Building on legacy code and CUDA](https://github.com/pytorch/pytorch/blob/master/CONTRIBUTING.md#building-on-legacy-code-and-cuda)
Build with CPU
It's fairly easy to build with CPU.
-Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the buliding environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/master/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configuraions for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
+Note on OpenMP: The desired OpenMP implementation is Intel OpenMP (iomp). In order to link against iomp, you'll need to manually download the library and set up the building environment by tweaking `CMAKE_INCLUDE_PATH` and `LIB`. The instruction [here](https://github.com/pytorch/pytorch/blob/master/docs/source/notes/windows.rst#building-from-source) is an example for setting up both MKL and Intel OpenMP. Without these configurations for CMake, Microsoft Visual C OpenMP runtime (vcomp) will be used.
Build with CUDA
diff --git a/RELEASE.md b/RELEASE.md
index 5ecc207a8ce67d..44369894d7c8b1 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -37,7 +37,7 @@ An example of this would look like:
release/1.8
```
-Please make sure to create branch that pins divergent point of release branch from the main thunk, i.e. `orig/release/{MAJOR}.{MINOR}`
+Please make sure to create a branch that pins the divergent point of the release branch from the main branch, i.e. `orig/release/{MAJOR}.{MINOR}`
### Making release branch specific changes
These are examples of changes that should be made to release branches so that CI / tooling can function normally on
diff --git a/android/README.md b/android/README.md
index f4a0b193c07c04..cea8cc0c2ff1a0 100644
--- a/android/README.md
+++ b/android/README.md
@@ -95,13 +95,12 @@ dependencies {
implementation(name:'pytorch_android', ext:'aar')
implementation(name:'pytorch_android_torchvision', ext:'aar')
...
- implementation 'com.android.support:appcompat-v7:28.0.0'
implementation 'com.facebook.soloader:nativeloader:0.8.0'
implementation 'com.facebook.fbjni:fbjni-java-only:0.0.3'
}
```
We also have to add all transitive dependencies of our aars.
-As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L62-L63) on `'com.android.support:appcompat-v7:28.0.0'`, `'com.facebook.soloader:nativeloader:0.8.0'` and 'com.facebook.fbjni:fbjni-java-only:0.0.3', we need to add them.
+As `pytorch_android` [depends](https://github.com/pytorch/pytorch/blob/master/android/pytorch_android/build.gradle#L76-L77) on `'com.facebook.soloader:nativeloader:0.8.0'` and `'com.facebook.fbjni:fbjni-java-only:0.0.3'`, we need to add them.
(In case of using maven dependencies they are added automatically from `pom.xml`).
You can check out [test app example](https://github.com/pytorch/pytorch/blob/master/android/test_app/app/build.gradle) that uses aars directly.
diff --git a/android/build.gradle b/android/build.gradle
index 60cd969bf652d3..e168347a2d99d3 100644
--- a/android/build.gradle
+++ b/android/build.gradle
@@ -12,7 +12,6 @@ allprojects {
rulesVersion = "1.2.0"
junitVersion = "4.12"
- androidSupportAppCompatV7Version = "28.0.0"
fbjniJavaOnlyVersion = "0.0.3"
soLoaderNativeLoaderVersion = "0.8.0"
}
diff --git a/android/pytorch_android/build.gradle b/android/pytorch_android/build.gradle
index 978389cbd242b0..f9a7559e598f2d 100644
--- a/android/pytorch_android/build.gradle
+++ b/android/pytorch_android/build.gradle
@@ -74,7 +74,6 @@ android {
dependencies {
implementation 'com.facebook.fbjni:fbjni-java-only:' + rootProject.fbjniJavaOnlyVersion
- implementation 'com.android.support:appcompat-v7:' + rootProject.androidSupportAppCompatV7Version
implementation 'com.facebook.soloader:nativeloader:' + rootProject.soLoaderNativeLoaderVersion
testImplementation 'junit:junit:' + rootProject.junitVersion
diff --git a/android/pytorch_android_torchvision/build.gradle b/android/pytorch_android_torchvision/build.gradle
index 77ac8d6fbbdec0..06d8d4db264f67 100644
--- a/android/pytorch_android_torchvision/build.gradle
+++ b/android/pytorch_android_torchvision/build.gradle
@@ -42,7 +42,6 @@ android {
dependencies {
implementation project(':pytorch_android')
- implementation 'com.android.support:appcompat-v7:' + rootProject.androidSupportAppCompatV7Version
implementation 'com.facebook.soloader:nativeloader:' + rootProject.soLoaderNativeLoaderVersion
testImplementation 'junit:junit:' + rootProject.junitVersion
diff --git a/aten/src/ATen/BatchingRegistrations.cpp b/aten/src/ATen/BatchingRegistrations.cpp
index afa51a2694afbb..d6d110b55ade9b 100644
--- a/aten/src/ATen/BatchingRegistrations.cpp
+++ b/aten/src/ATen/BatchingRegistrations.cpp
@@ -1144,10 +1144,10 @@ TORCH_LIBRARY_IMPL(aten, Batched, m) {
BINARY_POINTWISE(mul);
BINARY_POINTWISE(div);
{
- using Binop = Tensor (*)(const Tensor&, const Tensor&, c10::optional);
- using Unop = Tensor (*)(const Tensor&, const Scalar&, c10::optional);
- m.impl("div.Tensor_mode", binary_pointwise_batching_rule>);
- m.impl("div.Scalar_mode", unwrap_and_call>);
+ using Binop = Tensor (*)(const Tensor&, const Tensor&, c10::optional);
+ using Unop = Tensor (*)(const Tensor&, const Scalar&, c10::optional);
+ m.impl("div.Tensor_mode", binary_pointwise_batching_rule>);
+ m.impl("div.Scalar_mode", unwrap_and_call>);
}
// at::pow has three out-of-place overloads
diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt
index 804e8ff971b53c..bd94b95087fa7e 100644
--- a/aten/src/ATen/CMakeLists.txt
+++ b/aten/src/ATen/CMakeLists.txt
@@ -309,6 +309,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE)
set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE)
set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE)
set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE})
+ if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" AND CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE)
+ endif()
if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND
CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.9 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8)
set(GCC_7 True)
diff --git a/aten/src/ATen/MemoryOverlap.cpp b/aten/src/ATen/MemoryOverlap.cpp
index 76a2c38244b068..4b90f59f5adab1 100644
--- a/aten/src/ATen/MemoryOverlap.cpp
+++ b/aten/src/ATen/MemoryOverlap.cpp
@@ -8,9 +8,9 @@ MemOverlap has_internal_overlap(const Tensor& tensor) {
}
MemOverlap has_internal_overlap(TensorImpl* t) {
- AT_ASSERT(t->layout() == kStrided);
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(t->layout() == kStrided);
- if (t->is_contiguous()) {
+ if (t->is_non_overlapping_and_dense()) {
return MemOverlap::NO;
}
@@ -45,7 +45,7 @@ MemOverlapStatus get_overlap_status(TensorImpl* a, TensorImpl* b) {
if (a->numel() == 0 || b->numel() == 0) {
return MemOverlapStatus::NO;
}
- if (!a->is_contiguous() || !b->is_contiguous()) {
+ if (!a->is_non_overlapping_and_dense() || !b->is_non_overlapping_and_dense()) {
return MemOverlapStatus::TOO_HARD;
}
if (!a->has_storage() || !b->has_storage()) {
diff --git a/aten/src/ATen/Parallel.h b/aten/src/ATen/Parallel.h
index 122b8ea7548bdb..3da7c8155c8523 100644
--- a/aten/src/ATen/Parallel.h
+++ b/aten/src/ATen/Parallel.h
@@ -128,7 +128,7 @@ void launch_no_thread_state(std::function fn);
TORCH_API void intraop_launch(std::function func);
// Launches intra-op parallel task, returns a future
-TORCH_API std::shared_ptr intraop_launch_future(
+TORCH_API c10::intrusive_ptr intraop_launch_future(
std::function func);
// Returns number of intra-op threads used by default
diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp
index 6aa2c64825912d..52da493c7d5759 100644
--- a/aten/src/ATen/ParallelNative.cpp
+++ b/aten/src/ATen/ParallelNative.cpp
@@ -271,10 +271,10 @@ void intraop_launch(std::function func) {
#endif // C10_MOBILE
}
-std::shared_ptr intraop_launch_future(
+c10::intrusive_ptr intraop_launch_future(
std::function func) {
#ifndef C10_MOBILE
- auto future = std::make_shared(c10::NoneType::get());
+ auto future = c10::make_intrusive(c10::NoneType::get());
if (!in_parallel_region() && get_num_threads() > 1) {
_get_intraop_pool().run(
[func, future]() {
@@ -290,7 +290,7 @@ std::shared_ptr intraop_launch_future(
#else
// TODO: caffe2::PThreadPool only provides a data-parallel API.
// Task parallelism is not currently supported.
- auto future = std::make_shared(NoneType::get());
+ auto future = c10::make_intrusive(NoneType::get());
func();
future->markCompleted();
return future;
diff --git a/aten/src/ATen/ParallelNativeTBB.cpp b/aten/src/ATen/ParallelNativeTBB.cpp
index 9def3aaa9fc2c2..7ea69f6a019b33 100644
--- a/aten/src/ATen/ParallelNativeTBB.cpp
+++ b/aten/src/ATen/ParallelNativeTBB.cpp
@@ -85,9 +85,9 @@ void intraop_launch(std::function func) {
}
}
-std::shared_ptr intraop_launch_future(
+c10::intrusive_ptr intraop_launch_future(
std::function func) {
- auto future = std::make_shared(NoneType::get());
+ auto future = c10::make_intrusive(NoneType::get());
if (get_num_threads() > 1) {
tg_.run(
[func, future]() {
diff --git a/aten/src/ATen/ParallelOpenMP.cpp b/aten/src/ATen/ParallelOpenMP.cpp
index 48b826a00c8952..070e8373dac32d 100644
--- a/aten/src/ATen/ParallelOpenMP.cpp
+++ b/aten/src/ATen/ParallelOpenMP.cpp
@@ -101,10 +101,10 @@ void intraop_launch(std::function func) {
func();
}
-std::shared_ptr intraop_launch_future(
+c10::intrusive_ptr intraop_launch_future(
std::function func) {
func();
- auto future = std::make_shared(NoneType::get());
+ auto future = c10::make_intrusive(NoneType::get());
future->markCompleted();
return future;
}
diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h
index a416e5e5305186..e2fc89a9db8498 100644
--- a/aten/src/ATen/SparseTensorImpl.h
+++ b/aten/src/ATen/SparseTensorImpl.h
@@ -29,6 +29,11 @@ struct TORCH_API SparseTensorImpl : public TensorImpl {
// because many algorithms proceed by merging two sorted lists (of indices).
bool coalesced_ = false;
+ // compute_numel with integer multiplication overflow check, see gh-57542
+ void refresh_numel() {
+ TensorImpl::safe_refresh_numel();
+ }
+
public:
// Public for now...
explicit SparseTensorImpl(at::DispatchKeySet, const caffe2::TypeMeta);
diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp
index bb1be1d97e13bd..e52a62ba723318 100644
--- a/aten/src/ATen/TensorIterator.cpp
+++ b/aten/src/ATen/TensorIterator.cpp
@@ -655,7 +655,7 @@ StrideVector TensorIteratorBase::get_strides() const {
StrideVector strides;
for (int dim = 0; dim < ndim(); dim++) {
for (int arg = 0; arg < ntensors(); arg++) {
- strides.push_back(operands_[arg].stride_bytes[dim]);
+ strides.emplace_back(operands_[arg].stride_bytes[dim]);
}
}
return strides;
@@ -670,10 +670,15 @@ void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const {
strides.push_back(0);
}
+
auto base_ptrs = get_base_ptrs();
if (ndim() <= 1) {
- auto ptrs = get_data_ptrs(base_ptrs, { range.begin });
- loop(ptrs.data(), strides.data(), range.size(), 1);
+ if (range.begin > 0) {
+ auto ptrs = get_data_ptrs(base_ptrs, {range.begin});
+ loop(ptrs.data(), strides.data(), range.size(), 1);
+ } else {
+ loop(base_ptrs.data(), strides.data(), range.size(), 1);
+ }
} else {
auto counter = DimCounter(shape_, range);
while (!counter.is_done()) {
@@ -894,13 +899,22 @@ TensorIterator TensorIterator::unary_float_op(Tensor& out, const Tensor& a) {
return iter;
}
+#define NULLARY_OP_CONFIG() \
+ TensorIteratorConfig() \
+ .set_check_mem_overlap(true) \
+ .check_all_same_dtype(false) \
+ /* FIXME: workaround for bug: https://github.com/pytorch/pytorch/issues/20342 */ \
+ .resize_outputs(false)
+
TensorIterator TensorIterator::nullary_op(Tensor& out) {
- return TensorIteratorConfig()
- .set_check_mem_overlap(true)
- .check_all_same_dtype(false)
+ return NULLARY_OP_CONFIG()
.add_output(out)
- // FIXME: workaround for bug: https://github.com/pytorch/pytorch/issues/20342
- .resize_outputs(false)
+ .build();
+}
+
+TensorIterator TensorIterator::borrowing_nullary_op(Tensor& out) {
+ return NULLARY_OP_CONFIG()
+ .add_borrowed_output(out)
.build();
}
@@ -1003,7 +1017,9 @@ void TensorIteratorBase::compute_mem_overlaps(const TensorIteratorConfig& config
assert_no_internal_overlap(*output);
for (int j = num_outputs_; j < ntensors(); j++) {
const auto& input = operands_[j].tensor;
- assert_no_partial_overlap(*output, *input);
+ if (input->unsafeGetTensorImpl()!=output->unsafeGetTensorImpl()) {
+ assert_no_partial_overlap(*output, *input);
+ }
}
}
}
diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h
index 20eec492b0e305..efb87d7dc96149 100644
--- a/aten/src/ATen/TensorIterator.h
+++ b/aten/src/ATen/TensorIterator.h
@@ -471,6 +471,7 @@ struct TORCH_API TensorIterator final : public TensorIteratorBase {
static TensorIterator unary_op(Tensor& out, const Tensor& a);
static TensorIterator unary_float_op(Tensor& out, const Tensor& a);
static TensorIterator nullary_op(Tensor& out);
+ static TensorIterator borrowing_nullary_op(Tensor& out);
static TensorIterator reduce_op(Tensor& out, const Tensor& a);
static TensorIterator reduce_op(Tensor& out1, Tensor& out2, const Tensor& a);
diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp
index ee5786b4355d01..0e81c969064b35 100644
--- a/aten/src/ATen/autocast_mode.cpp
+++ b/aten/src/ATen/autocast_mode.cpp
@@ -13,19 +13,24 @@ namespace at {
namespace autocast {
bool is_enabled() {
- //return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCUDA) ||
- // !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCPU);
return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCUDA);
}
void set_enabled(bool new_enabled) {
- //c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCPU, !new_enabled);
c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCUDA, !new_enabled);
}
+bool is_cpu_enabled() {
+ return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCPU);
+}
+
+void set_cpu_enabled(bool new_enabled) {
+ c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCPU, !new_enabled);
+}
+
namespace {
// Imitate Apex and cache some of the casts to streamline parameter reuse.
-// Our heuristic is to cache fp16 casts of fp32 model weights (see cached_cast below).
+// Our heuristic is to cache lower_precision_fp casts of fp32 model weights (see cached_cast below).
//
// After discussion with @ezyang, the cache uses the following structure:
// The key is the fp32 source tensor's TensorImpl*, a proxy for a Tensor uuid that's
@@ -51,6 +56,9 @@ thread_local std::unordered_map cached_casts;
// it calls clear_cache() to ensure cached Tensors don't leak outside the autocasting region.
// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables)
thread_local int nesting = 0;
+
+// autocast_cpu_dtype is the lower_precision_fp used by AutocastCPU.
+thread_local at::ScalarType autocast_cpu_dtype = at::kBFloat16;
}
void clear_cache() {
@@ -65,15 +73,28 @@ int decrement_nesting() {
return --nesting;
}
+at::ScalarType get_autocast_cpu_dtype() {
+ return autocast_cpu_dtype;
+}
+
+void set_autocast_cpu_dtype(at::ScalarType dtype) {
+ TORCH_CHECK(
+ dtype == at::kBFloat16,
+ "Currently, AutocastCPU only support Bfloat16 as the autocast_cpu_dtype");
+ autocast_cpu_dtype = dtype;
+}
+
// Overload to catch Tensor args
// TODO (possible optimization):
// Move cast_cache to an inline function in a header with cached_casts declared as
// extern thread_local in the header.
-Tensor cached_cast(at::ScalarType to_type, const Tensor& arg) {
- if (is_eligible(arg) && (arg.scalar_type() != to_type)) {
- // Heuristic: Do what Apex does, and cache fp16 casts of fp32 model weights (leaves).
+Tensor cached_cast(at::ScalarType to_type, const Tensor& arg, DeviceType device_type) {
+ if (is_eligible(arg, device_type) && (arg.scalar_type() != to_type)) {
+ // Heuristic: Do what Apex does, and cache lower_precision_fp casts of fp32 model weights (leaves).
// See cached_casts declaration above for detailed strategy.
- bool can_try_cache = (to_type == at::kHalf && arg.scalar_type() == at::kFloat && arg.requires_grad() && arg.is_leaf() && !arg.is_view());
+ bool can_try_cache = (to_type == get_lower_precision_fp_from_device_type(device_type) &&
+ arg.scalar_type() == at::kFloat && arg.requires_grad() &&
+ arg.is_leaf() && !arg.is_view());
if (can_try_cache) {
auto it = cached_casts.find(arg.unsafeGetTensorImpl());
if (it != cached_casts.end()) {
@@ -94,7 +115,8 @@ Tensor cached_cast(at::ScalarType to_type, const Tensor& arg) {
// Policies correspond to op categories that need code-divergent handling.
// Wrapper templates below are specialized based on a policy template parameter.
enum class CastPolicy : uint8_t {
- fp16 = 0, // Cast all inputs to at::kHalf before running the op.
+ lower_precision_fp = 0, // Cast all inputs to lower_precision_fp before running the op.
+ // Currently, lower_precision_fp is fp16 for AutocastCUDA, and is defined by user(default bf16) for AutocastCPU.
fp32, // Cast all inputs to at::kFloat before running the op.
fp32_set_opt_dtype, // Treats functions (like softmax) that
// 1. we'd like to run in fp32 and
@@ -122,29 +144,29 @@ Interior WrapFunction_ specializations are defined for each CastPolicy.
********************************************************************************************************/
// Base template for WrapFunction_, which is specialized to contain a "call" method each CastPolicy
-template struct WrapFunction_ {};
+template struct WrapFunction_ {};
-// CastPolicy::fp16
-template
-struct WrapFunction_> {
+// CastPolicy::lower_precision_fp General_DeviceType
+template
+struct WrapFunction_> {
static Ret call(Args... args) {
- c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast);
- return (*F)(cached_cast(at::kHalf, args)...);
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(get_autocast_dispatch_key_from_device_type(device_type));
+ return (*F)(cached_cast(get_lower_precision_fp_from_device_type(device_type), args, device_type)...);
}
};
-// CastPolicy::fp32
-template
-struct WrapFunction_> {
+// CastPolicy::fp32 General_DeviceType
+template
+struct WrapFunction_> {
static Ret call(Args... args) {
- c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast);
- return (*F)(cached_cast(at::kFloat, args)...);
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(get_autocast_dispatch_key_from_device_type(device_type));
+ return (*F)(cached_cast(at::kFloat, args, device_type)...);
}
};
-// CastPolicy::fp32_set_opt_dtype
+// CastPolicy::fp32_set_opt_dtype DeviceType::CUDA
template
-struct WrapFunction_> {
+struct WrapFunction_> {
static Ret call(Args... args) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast);
if (firstarg_is_eligible(args...)) {
@@ -157,9 +179,9 @@ struct WrapFunction_
-struct WrapFunction_> {
+struct WrapFunction_> {
static Ret call(Args... args) {
c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast);
at::ScalarType out_type = type_from_firstarg(at::kFloat, args...);
@@ -167,18 +189,19 @@ struct WrapFunction_
-struct WrapFunction_> {
+// CastPolicy::promote General_DeviceType
+template
+struct WrapFunction_> {
static Ret call(Args... args) {
- c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast);
- auto to_type = promote_type(at::kHalf, args...);
- return (*F)(cached_cast(to_type, args)...);
+ c10::impl::ExcludeDispatchKeyGuard no_autocast(get_autocast_dispatch_key_from_device_type(device_type));
+ auto to_type = promote_type(get_lower_precision_fp_from_device_type(device_type), device_type, args...);
+ return (*F)(cached_cast(to_type, args, device_type)...);
}
};
// Wrapper to infer return_type and parameter_types for WrapFunction_ (imitating core/boxing/impl/WrapFunctionIntoFunctor.h)
template // The actual function we're redispatching to.
struct WrapFunction final {
using type = WrapFunction_::return_type,
@@ -213,14 +237,15 @@ namespace {
This section performs load-time registration for autocast wrappers.
It's debatable at what level operations should be patched. We'd like casts to be autograd-exposed
-and precede autograd history recording, so that for fp16 ops, input tensors are saved for backward
-in fp16 rather than fp32. Saving inputs in fp16 can significantly reduce a model's memory footprint.
+and precede autograd history recording, so that for lower_precision_fp ops, input tensors are saved for backward
+in lower_precision_fp rather than fp32. Saving inputs in lower_precision_fp can significantly reduce
+a model's memory footprint.
Option 1 (strawman): Patch only at the level of explicit calls into cudnn/cublas (cudnn_convolution, etc),
because those are the code paths that are guaranteed to use Tensor Cores, therefore they're the ones that
-will benefit most from fp16. Potential pitfall: convolutions (and other ops) are wrapped in several
+will benefit most from lower_precision_fp. Potential pitfall: convolutions (and other ops) are wrapped in several
layers of at::* calls. If one of those happens to record autograd history, then we've lost the
-opportunity to save inputs in fp16.
+opportunity to save inputs in lower_precision_fp.
Option 2: Patch the Python-exposed surface of calls, to make 100% sure autograd history
recording can't sneak in ahead of autocast. This mirrors Apex most closely.
@@ -242,12 +267,17 @@ Therefore, for the moment, this is all copy pasted in from VariableTypeEverythin
// (that's why SIGNATURE is repeated in the WrapFunction instantiation)
#define KERNEL(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \
m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \
- &WrapFunction::type::call);
+ &WrapFunction::type::call);
// Less-common but still useful case: redispatching to a function with a new signature (e.g. appending a dtype)
#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE(REDISPATCH_FUNC, REGISTER_NAME, REGISTER_SIGNATURE, REDISPATCH_SIGNATURE, POLICY) \
m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \
- &WrapFunction::type::call);
+ &WrapFunction::type::call);
+
+// KERNEL_CPU registration for AutocastCPU
+#define KERNEL_CPU(FUNC, REGISTER_NAME, SIGNATURE, POLICY) \
+ m.impl(TORCH_SELECTIVE_NAME("aten::" REGISTER_NAME), \
+ &WrapFunction::type::call);
/*****************************************
Explicit registration for out-of-place ops
@@ -257,65 +287,65 @@ TORCH_LIBRARY_IMPL(_, Autocast, m) {
}
TORCH_LIBRARY_IMPL(aten, Autocast, m) {
- // fp16
- KERNEL(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), fp16)
- KERNEL(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool, bool), fp16)
- KERNEL(ADD_NS(_convolution_nogroup), "_convolution_nogroup", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef), fp16)
- KERNEL(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), fp16)
- KERNEL(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), fp16)
- KERNEL(ADD_NS(conv3d), "conv3d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), fp16)
- KERNEL(ADD_NS(conv_tbc), "conv_tbc", Tensor (const Tensor &, const Tensor &, const Tensor &, int64_t), fp16)
- KERNEL(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp16)
- KERNEL(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp16)
- KERNEL(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp16)
- KERNEL(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp16)
- KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
- KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
- KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
- KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), fp16)
- KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), fp16)
- KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), fp16)
- KERNEL(ADD_NS(prelu), "prelu", Tensor (const Tensor &, const Tensor &), fp16)
- KERNEL(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), fp16)
- KERNEL(ADD_NS(addmv), "addmv", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), fp16)
- KERNEL(ADD_NS(addr), "addr", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), fp16)
- KERNEL(ADD_NS(matmul), "matmul", Tensor (const Tensor &, const Tensor &), fp16)
- KERNEL(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), fp16)
- KERNEL(ADD_NS(mv), "mv", Tensor (const Tensor &, const Tensor &), fp16)
- KERNEL(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional&), fp16)
- KERNEL(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), fp16)
- KERNEL(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), fp16)
- KERNEL(ADD_NS(bmm), "bmm", Tensor (const Tensor &, const Tensor &), fp16)
- KERNEL(ADD_NS(chain_matmul), "chain_matmul", Tensor (TensorList), fp16)
- KERNEL(ADD_NS(linalg_multi_dot), "linalg_multi_dot", Tensor (TensorList), fp16)
+ // lower_precision_fp
+ KERNEL(ADD_NS(_convolution), "_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(_convolution), "_convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t, bool, bool, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(_convolution_nogroup), "_convolution_nogroup", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef), lower_precision_fp)
+ KERNEL(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL(ADD_NS(conv3d), "conv3d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL(ADD_NS(conv_tbc), "conv_tbc", Tensor (const Tensor &, const Tensor &, const Tensor &, int64_t), lower_precision_fp)
+ KERNEL(ADD_NS(conv_transpose1d), "conv_transpose1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), lower_precision_fp)
+ KERNEL(ADD_NS(conv_transpose2d), "conv_transpose2d.input", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), lower_precision_fp)
+ KERNEL(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), lower_precision_fp)
+ KERNEL(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose.deprecated2", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution), "cudnn_convolution", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(cudnn_convolution_transpose), "cudnn_convolution_transpose", Tensor (const Tensor &, const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, bool, bool, bool), lower_precision_fp)
+ KERNEL(ADD_NS(prelu), "prelu", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL(ADD_NS(addmv), "addmv", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL(ADD_NS(addr), "addr", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL(ADD_NS(matmul), "matmul", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL(ADD_NS(mv), "mv", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional&), lower_precision_fp)
+ KERNEL(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL(ADD_NS(bmm), "bmm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL(ADD_NS(chain_matmul), "chain_matmul", Tensor (TensorList), lower_precision_fp)
+ KERNEL(ADD_NS(linalg_multi_dot), "linalg_multi_dot", Tensor (TensorList), lower_precision_fp)
// The macro doesn't like these (I think it chokes on commas inside <>) so write them manually
m.impl(TORCH_SELECTIVE_NAME("aten::_thnn_fused_lstm_cell"),
- TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
std::tuple (const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(_thnn_fused_lstm_cell)>::type::call)));
m.impl("_thnn_fused_gru_cell",
- TORCH_FN((&WrapFunction (const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
std::tuple (const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(_thnn_fused_gru_cell)>::type::call)));
m.impl("lstm_cell",
- TORCH_FN((&WrapFunction (const Tensor &, TensorList, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
std::tuple (const Tensor &, TensorList, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(lstm_cell)>::type::call)));
m.impl("gru_cell",
- TORCH_FN((&WrapFunction&, const c10::optional&),
Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(gru_cell)>::type::call)));
m.impl("rnn_tanh_cell", // tanh unary op is executed as a cuda math library call.
- TORCH_FN((&WrapFunction&, const c10::optional&),
Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(rnn_tanh_cell)>::type::call)));
m.impl("rnn_relu_cell",
- TORCH_FN((&WrapFunction&, const c10::optional&),
Tensor (const Tensor &, const Tensor &, const Tensor &, const Tensor &, const c10::optional&, const c10::optional&),
&ADD_NS(rnn_relu_cell)>::type::call)));
@@ -342,7 +372,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(layer_norm), "layer_norm", Tensor (const Tensor &, IntArrayRef, const c10::optional&, const c10::optional&, double, bool), fp32)
// The macro doesn't like this one (I think it chokes on commas inside <>) so write it manually
m.impl(TORCH_SELECTIVE_NAME("aten::native_layer_norm"),
- TORCH_FN((&WrapFunction (const Tensor&, IntArrayRef, const c10::optional&, const c10::optional&, double),
std::tuple (const Tensor&, IntArrayRef, const c10::optional&, const c10::optional&, double),
&ADD_NS(native_layer_norm)>::type::call)));
@@ -372,6 +402,7 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
KERNEL(ADD_NS(pdist), "pdist", Tensor (const Tensor &, double), fp32)
KERNEL(ADD_NS(cdist), "cdist", Tensor (const Tensor &, const Tensor &, double, c10::optional), fp32)
KERNEL(ADD_NS(renorm), "renorm", Tensor (const Tensor &, const Scalar&, int64_t, const Scalar&), fp32)
+ KERNEL(ADD_NS(grid_sampler), "grid_sampler", Tensor (const Tensor &, const Tensor &, int64_t, int64_t, bool), fp32)
// fp32_set_opt_dtype
KERNEL(ADD_NS(prod), "prod", Tensor (const Tensor &, c10::optional), fp32_set_opt_dtype)
KERNEL(ADD_NS(prod), "prod.dim_int", Tensor (const Tensor &, int64_t, bool, c10::optional), fp32_set_opt_dtype)
@@ -418,7 +449,86 @@ TORCH_LIBRARY_IMPL(aten, Autocast, m) {
TORCH_FN((&at::autocast::binary_cross_entropy_banned)));
}
+TORCH_LIBRARY_IMPL(_, AutocastCPU, m) {
+ m.fallback(torch::CppFunction::makeFallthrough());
}
+TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) {
+ // lower_precision_fp cast policy
+ KERNEL_CPU(ADD_NS(conv1d), "conv1d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(conv2d), "conv2d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(conv3d), "conv3d", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, int64_t), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(_log_softmax), "_log_softmax", Tensor (const Tensor &, int64_t, bool), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(bmm), "bmm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(mm), "mm", Tensor (const Tensor &, const Tensor &), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(baddbmm), "baddbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(addmm), "addmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(addbmm), "addbmm", Tensor (const Tensor &, const Tensor &, const Tensor &, const Scalar&, const Scalar&), lower_precision_fp)
+ KERNEL_CPU(ADD_NS(linear), "linear", Tensor (const Tensor &, const Tensor &, const c10::optional &), lower_precision_fp)
+
+ // fp32 cast policy
+ KERNEL_CPU(ADD_NS(conv_transpose3d), "conv_transpose3d.input", Tensor (const Tensor &, const Tensor &, const c10::optional &, IntArrayRef, IntArrayRef, IntArrayRef, int64_t, IntArrayRef), fp32)
+ KERNEL_CPU(ADD_NS(batch_norm), "batch_norm", Tensor (const Tensor &, const c10::optional &, const c10::optional &, const c10::optional &, const c10::optional &, bool, double, double, bool), fp32)
+ KERNEL_CPU(ADD_NS(max_pool2d), "max_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, IntArrayRef, bool), fp32)
+ KERNEL_CPU(ADD_NS(adaptive_avg_pool2d), "adaptive_avg_pool2d", Tensor (const Tensor &, IntArrayRef), fp32)
+
+ KERNEL_CPU(ADD_NS(convolution), "convolution", Tensor (const Tensor &, const Tensor &, const c10::optional&, IntArrayRef, IntArrayRef, IntArrayRef, bool, IntArrayRef, int64_t), fp32)
+ KERNEL_CPU(ADD_NS(dropout), "dropout", Tensor (const Tensor &, double, bool), fp32)
+ KERNEL_CPU(ADD_NS(avg_pool2d), "avg_pool2d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(avg_pool3d), "avg_pool3d", Tensor (const Tensor &, IntArrayRef, IntArrayRef, IntArrayRef, bool, bool, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(gelu), "gelu", Tensor (const Tensor &), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d", Tensor (const Tensor &, IntArrayRef, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest1d), "upsample_nearest1d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest2d), "upsample_nearest2d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d", Tensor (const Tensor &, IntArrayRef, c10::optional, c10::optional, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_nearest3d), "upsample_nearest3d.vec", Tensor (const Tensor &, c10::optional, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_linear1d), "upsample_linear1d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_bilinear2d), "upsample_bilinear2d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d", Tensor (const Tensor &, IntArrayRef, bool, c10::optional, c10::optional, c10::optional), fp32)
+ KERNEL_CPU(ADD_NS(upsample_trilinear3d), "upsample_trilinear3d.vec", Tensor (const Tensor &, c10::optional, bool, c10::optional>), fp32)
+ KERNEL_CPU(ADD_NS(binary_cross_entropy), "binary_cross_entropy", Tensor (const Tensor &, const Tensor &, const c10::optional&, int64_t), fp32)
+ KERNEL_CPU(ADD_NS(binary_cross_entropy_with_logits), "binary_cross_entropy_with_logits", Tensor (const Tensor &, const Tensor &, const c10::optional&, const c10::optional&, int64_t), fp32)
+ KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Scalar", Tensor (const Tensor &, const Scalar &), fp32)
+ KERNEL_CPU(ADD_NS(pow), "pow.Tensor_Tensor", Tensor (const Tensor &, const Tensor &), fp32)
+ KERNEL_CPU(ADD_NS(pow), "pow.Scalar", Tensor (const Scalar&, const Tensor &), fp32)
+ KERNEL_CPU(ADD_NS(smooth_l1_loss), "smooth_l1_loss", Tensor (const Tensor &, const Tensor &, int64_t, double), fp32)
+ KERNEL_CPU(ADD_NS(reflection_pad1d), "reflection_pad1d", Tensor (const Tensor &, IntArrayRef), fp32)
+ KERNEL_CPU(ADD_NS(std), "std", Tensor (const Tensor &, bool), fp32)
+ KERNEL_CPU(ADD_NS(std), "std.dim", Tensor (const Tensor &, IntArrayRef, bool, bool), fp32)
+ KERNEL_CPU(ADD_NS(instance_norm), "instance_norm", Tensor (const Tensor &, const c10::optional&, const c10::optional&, const c10::optional&, const c10::optional&, bool, double, double, bool), fp32)
+ KERNEL_CPU(ADD_NS(fake_quantize_per_tensor_affine), "fake_quantize_per_tensor_affine", Tensor (const Tensor &, double, int64_t, int64_t, int64_t), fp32)
+
+ // promote
+ KERNEL_CPU(ADD_NS(cat), "cat", Tensor (TensorList, int64_t), promote)
+ KERNEL_CPU(ADD_NS(stack), "stack", Tensor (TensorList, int64_t), promote)
+
+ m.impl(TORCH_SELECTIVE_NAME("aten::topk"),
+ TORCH_FN((&WrapFunction (const Tensor &, int64_t, int64_t, bool, bool),
+ std::tuple (const Tensor &, int64_t, int64_t, bool, bool),
+ &ADD_NS(topk)>::type::call)));
+
+ m.impl(TORCH_SELECTIVE_NAME("aten::sort"),
+ TORCH_FN((&WrapFunction (const Tensor &, int64_t, bool),
+ std::tuple (const Tensor &, int64_t, bool),
+ &ADD_NS(sort)>::type::call)));
+
+ m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue"),
+ TORCH_FN((&WrapFunction (const Tensor &, int64_t, int64_t, bool),
+ std::tuple (const Tensor &, int64_t, int64_t, bool),
+ &ADD_NS(kthvalue)>::type::call)));
+
+ m.impl(TORCH_SELECTIVE_NAME("aten::kthvalue.dimname"),
+ TORCH_FN((&WrapFunction (const Tensor &, int64_t, at::Dimname, bool),
+ std::tuple (const Tensor &, int64_t, at::Dimname, bool),
+ &ADD_NS(kthvalue)>::type::call)));
+}
+} // namespace
} // namespace autocast
} // namespace at
diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h
index 85db1c2e1a45da..e78a9c8b54ca28 100644
--- a/aten/src/ATen/autocast_mode.h
+++ b/aten/src/ATen/autocast_mode.h
@@ -3,17 +3,49 @@
namespace at {
namespace autocast {
-namespace {
- bool is_autocast_eligible(const Tensor& tensor) {
- return (tensor.is_cuda() || tensor.is_xla()) && tensor.is_floating_point();
- }
-} // namespace
-
TORCH_API bool is_enabled();
TORCH_API void set_enabled(bool enabled);
TORCH_API void clear_cache();
TORCH_API int increment_nesting();
TORCH_API int decrement_nesting();
+TORCH_API bool is_cpu_enabled();
+TORCH_API void set_cpu_enabled(bool enabled);
+TORCH_API at::ScalarType get_autocast_cpu_dtype();
+TORCH_API void set_autocast_cpu_dtype(at::ScalarType dtype);
+
+namespace {
+ bool is_autocast_eligible(const Tensor& tensor, DeviceType device_type) {
+ return device_type == DeviceType::CUDA
+ ? (tensor.is_cuda() || tensor.is_xla()) && tensor.is_floating_point()
+ : (tensor.is_cpu() || tensor.is_mkldnn()) && tensor.is_floating_point();
+ }
+} // namespace
+
+inline DispatchKey get_autocast_dispatch_key_from_device_type(
+ DeviceType device_type) {
+ switch (device_type) {
+ case DeviceType::CUDA:
+ return DispatchKey::Autocast;
+ case DeviceType::CPU:
+ return DispatchKey::AutocastCPU;
+ default:
+ throw std::runtime_error(
+ "unknown device type for autocast in get_autocast_dispatch_key_from_device_type");
+ }
+}
+
+inline at::ScalarType get_lower_precision_fp_from_device_type(
+ DeviceType device_type) {
+ switch (device_type) {
+ case DeviceType::CUDA:
+ return at::kHalf;
+ case DeviceType::CPU:
+ return get_autocast_cpu_dtype();
+ default:
+ throw std::runtime_error(
+ "unknown device type for autocast in get_lower_precision_fp_from_device_type");
+ }
+}
/********************************************************************
Logic to extract the promote type from any Tensor or TensorList args.
@@ -22,19 +54,24 @@ Logic to extract the promote type from any Tensor or TensorList args.
// Overload to catch Tensor args.
// If nextArg is floating-point, compare its scalar_type with our
// current best guess for the promote type, and update if necessary.
-inline at::ScalarType prioritize(at::ScalarType current, const Tensor& nextArg) {
+inline at::ScalarType prioritize(
+ at::ScalarType current,
+ const Tensor& nextArg,
+ DeviceType device_type=DeviceType::CUDA) {
if (current == at::kDouble) {
AT_ERROR("promote type is double in at::autocast::prioritize");
return current;
}
- if (is_autocast_eligible(nextArg)) {
+ at::ScalarType lower_precision_fp =
+ get_lower_precision_fp_from_device_type(device_type);
+ if (is_autocast_eligible(nextArg, device_type)) {
auto next = nextArg.scalar_type();
if (next == at::kDouble) {
return current; // ignores double tensors
} else if (current == at::kFloat || next == at::kFloat) {
- return at::kFloat; // prioritizes float over half
- } else if (current == at::kHalf && next == at::kHalf) {
- return at::kHalf;
+ return at::kFloat; // prioritizes float over lower_precision_fp
+ } else if (current == lower_precision_fp && next == lower_precision_fp) {
+ return lower_precision_fp;
} else {
AT_ERROR("Unexpected floating ScalarType in at::autocast::prioritize");
return current;
@@ -46,64 +83,92 @@ inline at::ScalarType prioritize(at::ScalarType current, const Tensor& nextArg)
// Overload to catch TensorList args (for e.g. cat, stack).
// Reuses the overload above to process each Tensor in the list.
-inline at::ScalarType prioritize(at::ScalarType current, const TensorList& list) {
+inline at::ScalarType prioritize(
+ at::ScalarType current,
+ const TensorList& list,
+ DeviceType device_type=DeviceType::CUDA) {
for (const auto& tensor : list) {
- current = prioritize(current, tensor);
+ current = prioritize(current, tensor, device_type);
}
return current;
}
// Template to catch non-Tensor args (no-op that returns current best guess)
template <typename T>
-inline at::ScalarType prioritize(at::ScalarType current, T nextArg) {
+inline at::ScalarType prioritize(
+ at::ScalarType current,
+ T nextArg,
+ DeviceType device_type=DeviceType::CUDA) {
return current;
}
// Overload for the tail case.
-inline at::ScalarType promote_type(at::ScalarType current) {
+inline at::ScalarType promote_type(
+ at::ScalarType current,
+ DeviceType device_type) {
return current;
}
-// Unpack args and determine if incoming float16 tensors need to be promoted to float32.
+// Unpack args and determine if incoming lower_precision_fp tensors need to be promoted to float32.
// Non-Tensor arguments are ignored.
template <typename Arg0, typename... Args>
-inline at::ScalarType promote_type(at::ScalarType current, Arg0 arg0, Args... args) {
- auto new_current = prioritize(current, arg0);
- return promote_type(new_current, args...);
+inline at::ScalarType promote_type(
+ at::ScalarType current,
+ DeviceType device_type,
+ Arg0 arg0,
+ Args... args) {
+ auto new_current = prioritize(current, arg0, device_type);
+ return promote_type(new_current, device_type, args...);
}
/****************************************************
Logic to apply cached casting to any Tensor argument.
****************************************************/
-inline bool is_eligible(const Tensor& arg) {
- return (arg.defined() && is_autocast_eligible(arg) && (arg.scalar_type() != at::kDouble));
+inline bool is_eligible(
+ const Tensor& arg,
+ DeviceType device_type=DeviceType::CUDA) {
+ return (arg.defined() &&
+ is_autocast_eligible(arg, device_type) &&
+ (arg.scalar_type() != at::kDouble));
}
// Overload to catch Tensor args
-TORCH_API Tensor cached_cast(at::ScalarType to_type, const Tensor& arg);
+TORCH_API Tensor cached_cast(
+ at::ScalarType to_type,
+ const Tensor& arg,
+ DeviceType device_type=DeviceType::CUDA);
// Overload to process optional
-inline c10::optional<Tensor> cached_cast(at::ScalarType to_type, const c10::optional<Tensor>& arg) {
+inline c10::optional<Tensor> cached_cast(
+ at::ScalarType to_type,
+ const c10::optional<Tensor>& arg,
+ DeviceType device_type=DeviceType::CUDA) {
if (arg.has_value()) {
- return cached_cast(to_type, *arg);
+ return cached_cast(to_type, *arg, device_type);
} else {
return c10::nullopt;
}
}
// Overload to process TensorLists
-inline std::vector<Tensor> cached_cast(at::ScalarType to_type, const TensorList& arg) {
+inline std::vector<Tensor> cached_cast(
+ at::ScalarType to_type,
+ const TensorList& arg,
+ DeviceType device_type=DeviceType::CUDA) {
std::vector<Tensor> vec;
vec.reserve(arg.size());
for (const auto& t : arg) {
- vec.push_back(cached_cast(to_type, t));
+ vec.push_back(cached_cast(to_type, t, device_type));
}
return vec;
}
// Template to catch non-Tensor args.
template <typename T>
-inline T cached_cast(at::ScalarType to_type, T arg) {
+inline T cached_cast(
+ at::ScalarType to_type,
+ T arg,
+ DeviceType device_type=DeviceType::CUDA) {
return arg;
}
diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h
index cf04a9d950ed9b..9e743558e5a1b4 100644
--- a/aten/src/ATen/core/Dict_inl.h
+++ b/aten/src/ATen/core/Dict_inl.h
@@ -41,7 +41,7 @@ inline size_t DictKeyHash::operator()(const IValue& ivalue) const {
if (ivalue.isInt()) {
return std::hash<int64_t>()(ivalue.toInt());
} else if (ivalue.isString()) {
- return std::hash<std::string>()(ivalue.toStringRef());
+ return std::hash<c10::string_view>()(ivalue.toStringView());
} else if (ivalue.isDouble()) {
return std::hash<double>()(ivalue.toDouble());
} else if (ivalue.isComplexDouble()) {
diff --git a/aten/src/ATen/core/aten_interned_strings.h b/aten/src/ATen/core/aten_interned_strings.h
index af1452ecddd8ad..d2251c06a4f4ed 100644
--- a/aten/src/ATen/core/aten_interned_strings.h
+++ b/aten/src/ATen/core/aten_interned_strings.h
@@ -495,6 +495,7 @@ _(aten, miopen_depthwise_convolution_backward_input) \
_(aten, miopen_depthwise_convolution_backward_weight) \
_(aten, miopen_rnn) \
_(aten, miopen_rnn_backward) \
+_(aten, mish) \
_(aten, mkldnn_convolution) \
_(aten, mkldnn_convolution_backward) \
_(aten, mkldnn_convolution_backward_input) \
diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
index 04286d28c6edd4..bb01559265fbeb 100644
--- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
+++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h
@@ -91,7 +91,7 @@ namespace impl {
int64_t,
double,
bool,
- std::string,
+ c10::string_view,
at::Tensor,
at::Scalar,
c10::QScheme,
@@ -199,7 +199,7 @@ namespace impl {
template
struct assert_is_valid_input_type::value>> {
static_assert(guts::false_t::value,
- "You tried to register a kernel with an unsupported input type: const char*. Please use std::string instead.");
+ "You tried to register a kernel with an unsupported input type: const char*. Please use c10::string_view instead.");
};
template
struct assert_is_valid_input_type, T>::value>> {
@@ -287,7 +287,7 @@ namespace impl {
template
struct assert_is_valid_output_type::value>> {
static_assert(guts::false_t::value,
- "You tried to register a kernel with an unsupported output type: const char*. Please use std::string instead.");
+ "You tried to register a kernel with an unsupported output type: const char*. Please use c10::string_view instead.");
};
template
struct assert_is_valid_output_type, T>::value>> {
diff --git a/aten/src/ATen/core/custom_class.cpp b/aten/src/ATen/core/custom_class.cpp
index c396e810eabe15..8f1a66452576b4 100644
--- a/aten/src/ATen/core/custom_class.cpp
+++ b/aten/src/ATen/core/custom_class.cpp
@@ -50,5 +50,51 @@ std::vector customClassSchemasForBCCheck() {
});
}
+namespace detail {
+class_base::class_base(
+ const std::string& namespaceName,
+ const std::string& className,
+ std::string doc_string,
+ const std::type_info& intrusivePtrClassTypeid,
+ const std::type_info& taggedCapsuleClassTypeid)
+ : qualClassName("__torch__.torch.classes." + namespaceName + '.' + className),
+ classTypePtr(at::ClassType::create(
+ c10::QualifiedName(qualClassName),
+ std::weak_ptr<jit::CompilationUnit>(),
+ /*is_module=*/false,
+ std::move(doc_string)))
+{
+ detail::checkValidIdent(namespaceName, "Namespace name");
+ detail::checkValidIdent(className, "Class name");
+ classTypePtr->addAttribute("capsule", at::CapsuleType::get());
+ c10::getCustomClassTypeMap().insert(
+ {std::type_index(intrusivePtrClassTypeid), classTypePtr});
+ c10::getCustomClassTypeMap().insert(
+ {std::type_index(taggedCapsuleClassTypeid), classTypePtr});
+ registerCustomClass(classTypePtr);
+}
+
+c10::FunctionSchema class_base::withNewArguments(
+ const c10::FunctionSchema& schema,
+ std::initializer_list<arg> default_args) {
+ const auto& old_args = schema.arguments();
+ std::vector<c10::Argument> new_args;
+ new_args.reserve(old_args.size());
+
+ new_args.emplace_back(old_args[0]);
+ // Skip self.
+ size_t argIdx = 1;
+ for (const auto& default_arg : default_args) {
+ auto& old_arg = old_args[argIdx++];
+ new_args.emplace_back(
+ default_arg.name_,
+ old_arg.type(),
+ old_arg.N(),
+ default_arg.value_);
+ }
+ return schema.cloneWithArguments(std::move(new_args));
+}
+
+} // namespace detail
} // namespace torch
diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp
index 5d963098117b6f..c3156628007b62 100644
--- a/aten/src/ATen/core/ivalue.cpp
+++ b/aten/src/ATen/core/ivalue.cpp
@@ -39,6 +39,16 @@ TORCH_API c10::intrusive_ptr ConstantString::create(
return c10::make_intrusive(std::move(str_));
}
+TORCH_API c10::intrusive_ptr ConstantString::create(
+ c10::string_view str_) {
+ return c10::make_intrusive(std::string(str_));
+}
+
+TORCH_API c10::intrusive_ptr ConstantString::create(
+ const char* str_) {
+ return c10::make_intrusive(std::string(str_));
+}
+
bool operator==(const ivalue::Tuple& lhs, const ivalue::Tuple& rhs) {
return lhs.elements_.size() == rhs.elements_.size() &&
// see [container equality]
@@ -616,7 +626,7 @@ IValueComparator getLessThanComparator(const IValue& v) {
if (v.isString()) {
return [](const IValue& a, const IValue& b) {
- return a.toString()->string() < b.toString()->string();
+ return a.toStringRef() < b.toStringRef();
};
}
diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h
index 57ac050a96e966..ec699fa44af033 100644
--- a/aten/src/ATen/core/ivalue.h
+++ b/aten/src/ATen/core/ivalue.h
@@ -555,6 +555,7 @@ struct TORCH_API IValue final {
IValue(c10::intrusive_ptr v);
IValue(std::string v);
IValue(const char* v) : IValue(std::string(v)) {}
+ IValue(c10::string_view v) : IValue(std::string(v)) {};
bool isString() const {
return Tag::String == tag;
}
@@ -563,6 +564,7 @@ struct TORCH_API IValue final {
const std::string& toStringRef() const;
c10::optional<std::reference_wrapper<const std::string>> toOptionalStringRef()
const;
+ c10::string_view toStringView() const;
// DoubleList
bool isDoubleList() const;
diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h
index ed09e4b8d2667c..10d0a7a985577d 100644
--- a/aten/src/ATen/core/ivalue_inl.h
+++ b/aten/src/ATen/core/ivalue_inl.h
@@ -227,10 +227,18 @@ struct TORCH_API ConstantString final : c10::intrusive_ptr_target {
public:
ConstantString(std::string str) : str_(std::move(str)) {}
+ ConstantString(c10::string_view str) : str_(std::string(str)) {}
static c10::intrusive_ptr create(std::string str_);
+ static c10::intrusive_ptr create(c10::string_view str_);
+ static c10::intrusive_ptr create(const char* str_);
+
const std::string& string() const {
return str_;
}
+ c10::string_view string_view() const {
+ return str_;
+ }
+
operator const std::string&() const {
return string();
}
@@ -306,12 +314,22 @@ struct EnumHolder;
// Future
struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
- public:
+ private:
+ // Keep this private in order to force users to go through make_intrusive and
+ // thus prevent creating a Future that's not held by an intrusive_ptr.
explicit Future(TypePtr type, std::vector<c10::Device> devices={})
: type_(std::move(type)),
impl_(getTypeOfDevices(devices)),
devices_(sortAndDeduplicateDevices(impl_, std::move(devices))) {}
+ friend c10::intrusive_ptr;
+
+ public:
+ Future(const Future&) = delete;
+ Future(Future&&) = delete;
+ Future& operator=(const Future&) = delete;
+ Future& operator=(Future&&) = delete;
+
struct TORCH_API FutureError final : public std::exception {
explicit FutureError(std::string&& error_msg_)
: error_msg(std::move(error_msg_)) {}
@@ -477,7 +495,13 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
* If the future has already completed,
* this function will execute the callback immediately.
*/
- void addCallback(std::function<void(Future&)> callback) {
+ template <typename T>
+ void addCallback(T callback) {
+#if __cpp_lib_is_invocable >= 201703
+ static_assert(
+ std::is_invocable_r<void, T, Future&>::value,
+ "The callback must have signature void(Future&)");
+#endif
std::unique_lock lock(mutex_);
if (completed()) {
lock.unlock();
@@ -492,12 +516,16 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
* value of the callback. This is necessary when the callback provider needs
* to know for sure when the callback has finished.
*/
- c10::intrusive_ptr<Future> then(
- std::function<IValue(Future&)> callback,
- TypePtr type) {
+ template <typename T>
+ c10::intrusive_ptr<Future> then(T callback, TypePtr type) {
+#if __cpp_lib_is_invocable >= 201703
+ static_assert(
+ std::is_invocable_r<IValue, T, Future&>::value,
+ "The callback must have signature IValue(Future&)");
+#endif
auto childFut = createInstance(std::move(type));
addCallback(
- [childFut, cb = std::move(callback)](Future& parentFut) {
+ [childFut, cb = std::move(callback)](Future& parentFut) mutable {
try {
childFut->markCompleted(cb(parentFut));
} catch (std::exception&) {
@@ -507,6 +535,36 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
return childFut;
}
+ template <typename T>
+ c10::intrusive_ptr<Future> thenAsync(T callback, TypePtr type) {
+#if __cpp_lib_is_invocable >= 201703
+ static_assert(
+ std::is_invocable_r<c10::intrusive_ptr<Future>, T, Future&>::value,
+ "The callback must have signature c10::intrusive_ptr(Future&)");
+#endif
+ auto childFut = createInstance(std::move(type));
+ addCallback(
+ [childFut, cb = std::move(callback)](Future& parentFut) mutable {
+ c10::intrusive_ptr<Future> intermediateFut;
+ try {
+ intermediateFut = cb(parentFut);
+ } catch (std::exception&) {
+ childFut->setError(std::current_exception());
+ return;
+ }
+ intermediateFut->addCallback(
+ [childFut = std::move(childFut)](Future& intermediateFut) {
+ if (intermediateFut.hasError()) {
+ childFut->setError(intermediateFut.exception_ptr());
+ } else {
+ childFut->markCompleted(
+ intermediateFut.value(), intermediateFut.dataPtrs());
+ }
+ });
+ });
+ return childFut;
+ }
+
// Tries to retrieve the error message from std::exception_ptr.
std::string tryRetrieveErrorMessage() const {
TORCH_CHECK(hasError(), "No error present on the future.");
@@ -558,7 +616,14 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target {
// how/when that happens) as it will ensure that the proper "environment" is
// set up before running the callback, as in, it will set up the CUDA streams,
// synchronize them with the value, and so on (if needed).
- void invokeCallback(std::function<void(Future&)> callback) {
+ template <typename T>
+ void invokeCallback(T callback) {
+#if __cpp_lib_is_invocable >= 201703
+ static_assert(
+ std::is_invocable_r<void, T, Future&>::value,
+ "The callback must have signature void(Future&)");
+#endif
+
c10::OptionalDeviceGuard deviceGuard(currentDevice_);
std::vector<c10::Stream> streams;
@@ -989,6 +1054,7 @@ DEFINE_TO(c10::impl::GenericList, toList)
DEFINE_TO(c10::impl::GenericDict, toGenericDict)
DEFINE_TO(c10::intrusive_ptr, toTuple)
DEFINE_TO(std::string, toStringRef)
+DEFINE_TO(c10::string_view, toStringView)
DEFINE_TO(c10::intrusive_ptr, toFuture)
DEFINE_TO(c10::intrusive_ptr, toRRef)
DEFINE_TO(c10::intrusive_ptr, toQuantizer)
@@ -1198,6 +1264,14 @@ inline T IValue::to() && {
return generic_to(std::move(*this), _fake_type{});
}
+template <>
+inline c10::optional<c10::string_view> IValue::to() && {
+ // In the default implementation, the IValue is destroyed with std::move.
+ // But if the unboxed type is optional<c10::string_view> we cannot destroy
+ // the IValue.
+ return generic_to(*this, _fake_type<c10::optional<c10::string_view>>{});
+}
+
template
inline typename c10::detail::ivalue_to_const_ref_overload_return::type IValue::to() const& {
return generic_to(*this, _fake_type{});
@@ -1495,6 +1569,16 @@ inline c10::optional> IValue::
->string());
}
+inline c10::string_view IValue::toStringView() const {
+ AT_ASSERT(isString(), "Expected String but got ", tagKind());
+ TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+ payload.u.as_intrusive_ptr != c10::UndefinedTensorImpl::singleton(),
+ "called toStringView on null intrusive_ptr IValue");
+ return static_cast<const ConstantString*>(
+ payload.u.as_intrusive_ptr)
+ ->string_view();
+}
+
inline PyObject* IValue::toPyObject() const {
return toPyObjectHolder()->getPyObject();
}
diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h
index d0a4f6295e98f4..d20231d19530ca 100644
--- a/aten/src/ATen/core/jit_type.h
+++ b/aten/src/ATen/core/jit_type.h
@@ -244,6 +244,10 @@ struct TORCH_API ShapeSymbol {
return value_;
};
+ int64_t value() const {
+ return value_;
+ };
+
static ShapeSymbol newSymbol() {
return fromStaticSize(-static_cast