Skip to content

Commit

Permalink
switch to rocThrust for thrust/cub APIs (#25620)
Browse files Browse the repository at this point in the history
Summary:
Pull Request resolved: pytorch/pytorch#25620

Pull Request resolved: pytorch/pytorch#25602

Enable rocThrust with hipCUB and rocPRIM for ROCm. They are the ROCm implementations of the thrust and cub APIs and replace the older hip-thrust and cub-hip packages going forward. ROCm 2.5 is the first release to contain the new packages as an option, as of 2.6 they will be the only available option.

Add hipification rules to correctly hipify thrust::cuda to thrust::hip and cub:: to hipcub:: going forward. Add hipification rules to hipify specific cub headers to the general hipcub header.

Infrastructure work to correctly find, include and link against the new packages. Add the macro definition to choose the HIP backend to Thrust.

Since include chains are now a little different from CUDA's Thrust, add includes for functionality used where applicable.

Skip four tests that fail with the new rocThrust for now.
Pull Request resolved: pytorch/pytorch#21864

Reviewed By: xw285cornell

Differential Revision: D16940768

Pulled By: bddppq

fbshipit-source-id: 3dba8a8f1763dd23d89eb0dd26d1db109973dbe5
  • Loading branch information
iotamudelta authored and facebook-github-bot committed Sep 4, 2019
1 parent 68b9920 commit 4fe8571
Show file tree
Hide file tree
Showing 19 changed files with 107 additions and 41 deletions.
2 changes: 1 addition & 1 deletion .circleci/cimodel/data/caffe2_build_definitions.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

DOCKER_IMAGE_PATH_BASE = "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/"

DOCKER_IMAGE_VERSION = 287
DOCKER_IMAGE_VERSION = 301


@dataclass
Expand Down
40 changes: 20 additions & 20 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -1835,7 +1835,7 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:301"
- caffe2_linux_test:
name: caffe2_py2_gcc4_8_ubuntu14_04_test
requires:
Expand All @@ -1847,7 +1847,7 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-gcc4.8-ubuntu14.04-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc4.8-ubuntu14.04:301"
resource_class: large
- caffe2_linux_build:
name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_build
Expand All @@ -1859,7 +1859,7 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_py2_cuda9_0_cudnn7_ubuntu16_04_test
requires:
Expand All @@ -1872,65 +1872,65 @@ workflows:
- /ci-all\/.*/
build_environment: "caffe2-py2-cuda9.0-cudnn7-ubuntu16.04-test"
use_cuda_docker_runtime: "1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:301"
resource_class: gpu.medium
- caffe2_linux_build:
name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_test
requires:
- setup
- caffe2_cmake_cuda9_0_cudnn7_ubuntu16_04_build
build_environment: "caffe2-cmake-cuda9.0-cudnn7-ubuntu16.04-test"
use_cuda_docker_runtime: "1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-ubuntu16.04:301"
resource_class: gpu.medium
- caffe2_linux_build:
name: caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_test
requires:
- setup
- caffe2_py2_cuda9_1_cudnn7_ubuntu16_04_build
build_environment: "caffe2-py2-cuda9.1-cudnn7-ubuntu16.04-test"
use_cuda_docker_runtime: "1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.1-cudnn7-ubuntu16.04:301"
resource_class: gpu.medium
- caffe2_linux_build:
name: caffe2_py2_mkl_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-py2-mkl-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_py2_mkl_ubuntu16_04_test
requires:
- setup
- caffe2_py2_mkl_ubuntu16_04_build
build_environment: "caffe2-py2-mkl-ubuntu16.04-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-mkl-ubuntu16.04:301"
resource_class: large
- caffe2_linux_build:
name: caffe2_onnx_py2_gcc5_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_onnx_py2_gcc5_ubuntu16_04_test
requires:
- setup
- caffe2_onnx_py2_gcc5_ubuntu16_04_build
build_environment: "caffe2-onnx-py2-gcc5-ubuntu16.04-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-gcc5-ubuntu16.04:301"
resource_class: large
- caffe2_linux_build:
name: caffe2_py2_clang3_8_ubuntu16_04_build
Expand All @@ -1942,7 +1942,7 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-clang3.8-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.8-ubuntu16.04:301"
build_only: "1"
- caffe2_linux_build:
name: caffe2_py2_clang3_9_ubuntu16_04_build
Expand All @@ -1954,35 +1954,35 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-clang3.9-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang3.9-ubuntu16.04:301"
build_only: "1"
- caffe2_linux_build:
name: caffe2_py2_clang7_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-py2-clang7-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-clang7-ubuntu16.04:301"
build_only: "1"
- caffe2_linux_build:
name: caffe2_onnx_py3_6_clang7_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:301"
- caffe2_linux_test:
name: caffe2_onnx_py3_6_clang7_ubuntu16_04_test
requires:
- setup
- caffe2_onnx_py3_6_clang7_ubuntu16_04_build
build_environment: "caffe2-onnx-py3.6-clang7-ubuntu16.04-test"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py3.6-clang7-ubuntu16.04:301"
resource_class: large
- caffe2_linux_build:
name: caffe2_py2_android_ubuntu16_04_build
requires:
- setup
build_environment: "caffe2-py2-android-ubuntu16.04-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-android-ubuntu16.04:301"
build_only: "1"
- caffe2_linux_build:
name: caffe2_py2_cuda9_0_cudnn7_centos7_build
Expand All @@ -1994,7 +1994,7 @@ workflows:
- master
- /ci-all\/.*/
build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-build"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:301"
- caffe2_linux_test:
name: caffe2_py2_cuda9_0_cudnn7_centos7_test
requires:
Expand All @@ -2007,7 +2007,7 @@ workflows:
- /ci-all\/.*/
build_environment: "caffe2-py2-cuda9.0-cudnn7-centos7-test"
use_cuda_docker_runtime: "1"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:287"
docker_image: "308535385114.dkr.ecr.us-east-1.amazonaws.com/caffe2/py2-cuda9.0-cudnn7-centos7:301"
resource_class: gpu.medium
- caffe2_macos_build:
name: caffe2_py2_ios_macos10_13_build
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/Embedding.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
#include <THC/THCThrustAllocator.cuh>

#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/unique.h>

#include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/EmbeddingBag.cu
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@

#include <thrust/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/device_vector.h>

#include <ATen/native/cuda/EmbeddingBackwardKernel.cuh>
Expand Down
1 change: 1 addition & 0 deletions aten/src/ATen/native/cuda/Unique.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

#include <tuple>
#include <iterator>
#include <thrust/adjacent_difference.h>
#include <thrust/unique.h>
#include <thrust/sort.h>
#include <thrust/scan.h>
Expand Down
2 changes: 2 additions & 0 deletions aten/src/THC/generic/THCTensorMode.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define THC_GENERIC_FILE "THC/generic/THCTensorMode.cu"
#else

#include <thrust/iterator/constant_iterator.h>

void THCTensor_(calculateMode)(THCState *state,
THCTensor *values,
THCudaLongTensor *indices,
Expand Down
2 changes: 2 additions & 0 deletions aten/src/THCUNN/generic/LookupTable.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
#define THC_GENERIC_FILE "THCUNN/generic/LookupTable.cu"
#else

#include <thrust/iterator/constant_iterator.h>

void THNN_(LookupTable_accGradParameters)(
THCState *state,
THCIndexTensor *input,
Expand Down
8 changes: 8 additions & 0 deletions caffe2/core/common_gpu.cc
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,14 @@ const char* cublasGetErrorString(cublasStatus_t error) {
#ifdef __HIP_PLATFORM_HCC__
case rocblas_status_invalid_size:
return "rocblas_status_invalid_size";
case rocblas_status_perf_degraded:
return "rocblas_status_perf_degraded";
case rocblas_status_size_query_mismatch:
return "rocblas_status_size_query_mismatch";
case rocblas_status_size_increased:
return "rocblas_status_size_increased";
case rocblas_status_size_unchanged:
return "rocblas_status_size_unchanged";
#endif
}
// To suppress compiler warning.
Expand Down
4 changes: 4 additions & 0 deletions caffe2/operators/generate_proposals_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
#include "caffe2/operators/generate_proposals_op_util_nms.h"
#include "caffe2/operators/generate_proposals_op_util_nms_gpu.h"

#ifdef __HIP_PLATFORM_HCC__
#include <cfloat>
#endif

using caffe2::utils::RotatedBox;

namespace caffe2 {
Expand Down
4 changes: 4 additions & 0 deletions caffe2/operators/reduce_front_back_max_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/reduce_front_back_max_ops.h"

#ifdef __HIP_PLATFORM_HCC__
#include <cfloat>
#endif

namespace caffe2 {

/***
Expand Down
8 changes: 8 additions & 0 deletions caffe2/operators/rmac_regions_op.cu
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,15 @@
#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/rmac_regions_op.h"

#ifdef __HIP_PLATFORM_HCC__
#include <cfloat>
#endif

#ifdef __HIP_PLATFORM_HCC__
namespace rocprim {
#else
namespace cub {
#endif

template <typename KeyT, typename ValueT>
inline __host__ __device__ bool operator<(
Expand Down
1 change: 1 addition & 0 deletions caffe2/operators/unique_ops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/unique.h>
#include <thrust/version.h>
#include "caffe2/core/context_gpu.h"

namespace caffe2 {
Expand Down
18 changes: 6 additions & 12 deletions caffe2/utils/math_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -622,10 +622,8 @@ CAFFE2_CUDA_EXPORT void Gemm<at::Half, CUDAContext>(
N, // ldd
rocblas_datatype_f32_r, // compute type
rocblas_gemm_algo_standard, // rocblas_gemm_algo
0, // solution index, reserved for future use
0, // flags, reserved for future use
NULL, // size of workspace
NULL)); // workspace
0, // solution index, reserved for future use
0)); // flags, reserved for future use
#else
CUBLAS_ENFORCE(cublasSgemmEx(
context->cublas_handle(),
Expand Down Expand Up @@ -1033,10 +1031,8 @@ CAFFE2_CUDA_EXPORT void GemmStridedBatched<at::Half, CUDAContext>(
batch_size,
rocblas_datatype_f32_r, // compute type
rocblas_gemm_algo_standard, // rocblas_gemm_algo
0, // solution index, reserved for future use
0, // flags, reserved for future use
NULL, // size of workspace
NULL)); // workspace
0, // solution index, reserved for future use
0)); // flags, reserved for future use
#else
CUBLAS_ENFORCE(cublasGemmStridedBatchedEx(
context->cublas_handle(),
Expand Down Expand Up @@ -1178,10 +1174,8 @@ CAFFE2_CUDA_EXPORT void Gemv<at::Half, CUDAContext>(
ldc, // ldd
rocblas_datatype_f32_r, // compute type
rocblas_gemm_algo_standard, // rocblas_gemm_algo
0, // solution index, reserved for future use
0, // flags, reserved for future use
NULL, // size of workspace
NULL)); // workspace
0, // solution index, reserved for future use
0)); // flags, reserved for future use
#else
CUBLAS_ENFORCE(cublasSgemmEx(
context->cublas_handle(),
Expand Down
6 changes: 3 additions & 3 deletions cmake/Dependencies.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -872,6 +872,7 @@ if(USE_ROCM)
list(APPEND HIP_CXX_FLAGS -Wno-unused-command-line-argument)
list(APPEND HIP_CXX_FLAGS -Wno-duplicate-decl-specifier)
list(APPEND HIP_CXX_FLAGS -DCAFFE2_USE_MIOPEN)
list(APPEND HIP_CXX_FLAGS -DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_HIP)

if(CMAKE_BUILD_TYPE MATCHES Debug)
list(APPEND HIP_CXX_FLAGS -g)
Expand All @@ -887,13 +888,12 @@ if(USE_ROCM)
endforeach()

set(Caffe2_HIP_INCLUDE
${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${thrust_INCLUDE_DIRS} $<INSTALL_INTERFACE:include> ${Caffe2_HIP_INCLUDE})

${thrust_INCLUDE_DIRS} ${hipcub_INCLUDE_DIRS} ${rocprim_INCLUDE_DIRS} ${miopen_INCLUDE_DIRS} ${rocblas_INCLUDE_DIRS} ${rocrand_INCLUDE_DIRS} ${hiprand_INCLUDE_DIRS} ${hip_INCLUDE_DIRS} ${hcc_INCLUDE_DIRS} ${hsa_INCLUDE_DIRS} $<INSTALL_INTERFACE:include> ${Caffe2_HIP_INCLUDE})
# This is needed for library added by hip_add_library (same for hip_add_executable)
hip_include_directories(${Caffe2_HIP_INCLUDE})

set(Caffe2_HIP_DEPENDENCY_LIBS
${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES})
${PYTORCH_HIP_HCC_LIBRARIES} ${PYTORCH_MIOPEN_LIBRARIES} ${hipcub_LIBRARIES})

# Note [rocblas & rocfft cmake bug]
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Expand Down
Loading

0 comments on commit 4fe8571

Please sign in to comment.