Skip to content

Commit

Permalink
ROCm support (#213)
Browse files Browse the repository at this point in the history
Co-authored-by: Binyang Li <binyli@microsoft.com>
  • Loading branch information
chhwang and Binyang2014 authored Nov 24, 2023
1 parent dab19e0 commit 544ff0c
Show file tree
Hide file tree
Showing 65 changed files with 770 additions and 398 deletions.
2 changes: 1 addition & 1 deletion .azure-pipelines/integration-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
workingDirectory: '$(System.DefaultWorkingDirectory)'

Expand Down
2 changes: 1 addition & 1 deletion .azure-pipelines/multi-nodes-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
targetType: 'inline'
script: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON ..
cmake -DCMAKE_BUILD_TYPE=Release -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON ..
make -j
make pylib-copy
workingDirectory: '$(System.DefaultWorkingDirectory)'
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/codeql-analysis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
- name: Build
run: |
cmake -DBYPASS_PEERMEM_CHECK=ON .
cmake -DBYPASS_PEERMEM_CHECK=ON -DBYPASS_GPU_CHECK=ON -DUSE_CUDA=ON .
make -j
- name: Perform CodeQL Analysis
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/lint.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
- name: Run cpplint
run: |
CPPSOURCES=$(find ./ -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)' -not -path "./build/*")
CPPSOURCES=$(find ./src ./include ./python ./test -regextype posix-extended -regex '.*\.(c|cpp|h|hpp|cc|cxx|cu)')
clang-format -style=file --verbose --Werror --dry-run ${CPPSOURCES}
pylint:
Expand Down
119 changes: 82 additions & 37 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,80 +9,125 @@ set(MSCCLPP_SOVERSION ${MSCCLPP_MAJOR})
set(MSCCLPP_VERSION "${MSCCLPP_MAJOR}.${MSCCLPP_MINOR}.${MSCCLPP_PATCH}")

cmake_minimum_required(VERSION 3.25)
project(mscclpp LANGUAGES CUDA CXX)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")

list(APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake)
enable_language(CXX)

# Format targets
include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake)
list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)

# Options
option(ENABLE_TRACE "Enable tracing" OFF)
option(USE_NPKIT "Use NPKIT" ON)
option(BUILD_TESTS "Build tests" ON)
option(BUILD_PYTHON_BINDINGS "Build Python bindings" ON)
option(ALLOW_GDRCOPY "Use GDRCopy, if available" OFF)
option(USE_CUDA "Use NVIDIA/CUDA." OFF)
option(USE_ROCM "Use AMD/ROCm." OFF)
option(BYPASS_GPU_CHECK "Bypass GPU check." OFF)
option(BYPASS_PEERMEM_CHECK "Bypass checking nvidia_peermem" OFF)

# Find CUDAToolkit. Set CUDA flags based on the detected CUDA version
find_package(CUDAToolkit REQUIRED)
if(CUDAToolkit_FOUND)
if(BYPASS_GPU_CHECK)
if(USE_CUDA)
message("Bypassing GPU check: using NVIDIA/CUDA.")
find_package(CUDAToolkit REQUIRED)
elseif(USE_ROCM)
message("Bypassing GPU check: using AMD/ROCm.")
# Temporal fix for rocm5.6
set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
find_package(hip REQUIRED)
else()
message(FATAL_ERROR "Bypassing GPU check: neither NVIDIA/CUDA nor AMD/ROCm is specified.")
endif()
else()
# Detect GPUs
include(CheckNvidiaGpu)
include(CheckAmdGpu)
if(NVIDIA_FOUND AND AMD_FOUND)
message("Detected NVIDIA/CUDA and AMD/ROCm: prioritizing NVIDIA/CUDA.")
set(USE_CUDA ON)
set(USE_ROCM OFF)
elseif(NVIDIA_FOUND)
message("Detected NVIDIA/CUDA.")
set(USE_CUDA ON)
set(USE_ROCM OFF)
elseif(AMD_FOUND)
message("Detected AMD/ROCm.")
set(USE_CUDA OFF)
set(USE_ROCM ON)
else()
message(FATAL_ERROR "Neither NVIDIA/CUDA nor AMD/ROCm is found.")
endif()
endif()

# Declare project
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra")
if(USE_CUDA)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcompiler -Wall,-Wextra")
project(mscclpp LANGUAGES CXX CUDA)

# CUDA 11 or higher is required
if(CUDAToolkit_VERSION_MAJOR LESS 11)
message(FATAL_ERROR "CUDA 11 or higher is required but detected ${CUDAToolkit_VERSION}")
endif()

# Set CUDA architectures
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 11)
set(CMAKE_CUDA_ARCHITECTURES 80)
endif()

# Hopper architecture
if(CUDAToolkit_VERSION_MAJOR GREATER_EQUAL 12)
set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES} 90)
endif()
endif()
set(CUDA_LIBRARIES CUDA::cudart CUDA::cuda_driver)

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")

set(GPU_LIBRARIES CUDA::cudart CUDA::cuda_driver)
set(GPU_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIRS})

# Find if nvidia_peermem is installed and loaded
if(NOT BYPASS_PEERMEM_CHECK)
execute_process(COMMAND sh -c "lsmod | grep nvidia_peermem"
RESULT_VARIABLE lsmod_result
OUTPUT_VARIABLE lsmod_output)
if(NOT lsmod_result EQUAL 0)
message(FATAL_ERROR "nvidia_peermem is not installed or not loaded.")
endif()
endif()
else()
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Wall -Wextra")
project(mscclpp LANGUAGES CXX HIP)

set(CMAKE_HIP_ARCHITECTURES gfx90a gfx941 gfx942)

set(GPU_LIBRARIES hip::host)
set(GPU_INCLUDE_DIRS ${hip_INCLUDE_DIRS})
endif()

# Format targets
include(${PROJECT_SOURCE_DIR}/cmake/AddFormatTargets.cmake)

# Find ibverbs and libnuma
find_package(IBVerbs REQUIRED)
find_package(NUMA REQUIRED)

# Find optional packages
if(ALLOW_GDRCOPY)
find_package(GDRCopy)
endif()

add_library(mscclpp_obj OBJECT)
target_include_directories(mscclpp_obj
PRIVATE
${CUDAToolkit_INCLUDE_DIRS}
${GPU_INCLUDE_DIRS}
${IBVERBS_INCLUDE_DIRS}
${NUMA_INCLUDE_DIRS}
${GDRCOPY_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${CUDA_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES} ${GDRCOPY_LIBRARIES})
${NUMA_INCLUDE_DIRS})
target_link_libraries(mscclpp_obj PRIVATE ${GPU_LIBRARIES} ${NUMA_LIBRARIES} ${IBVERBS_LIBRARIES})
set_target_properties(mscclpp_obj PROPERTIES LINKER_LANGUAGE CXX POSITION_INDEPENDENT_CODE 1 VERSION ${MSCCLPP_VERSION} SOVERSION ${MSCCLPP_SOVERSION})
if(USE_CUDA)
target_compile_definitions(mscclpp_obj PRIVATE USE_CUDA)
elseif(USE_ROCM)
target_compile_definitions(mscclpp_obj PRIVATE USE_ROCM)
endif()
if(ENABLE_TRACE)
target_compile_definitions(mscclpp_obj PRIVATE ENABLE_TRACE)
endif()
if(USE_NPKIT)
target_compile_definitions(mscclpp_obj PRIVATE ENABLE_NPKIT)
endif()
if(ALLOW_GDRCOPY AND GDRCOPY_FOUND)
target_compile_definitions(mscclpp_obj PRIVATE MSCCLPP_USE_GDRCOPY)
target_link_libraries(mscclpp_obj PRIVATE MSCCLPP::gdrcopy)
endif()

# libmscclpp
add_library(mscclpp SHARED)
Expand All @@ -103,7 +148,7 @@ install(TARGETS mscclpp_static
ARCHIVE DESTINATION lib)

# Tests
if (BUILD_TESTS)
if(BUILD_TESTS)
enable_testing() # Called here to allow ctest from the build directory
add_subdirectory(test)
endif()
Expand Down
25 changes: 25 additions & 0 deletions cmake/CheckAmdGpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

set(AMD_FOUND "FALSE")

set(CMAKE_PREFIX_PATH "/opt/rocm;${CMAKE_PREFIX_PATH}")
# Temporal fix for rocm5.6
set(ENV{amd_comgr_DIR} "/opt/rocm/lib/cmake/amd_comgr")
set(ENV{AMDDeviceLibs_DIR} "/opt/rocm/lib/cmake/AMDDeviceLibs")

find_package(hip QUIET)

if(NOT hip_FOUND)
return()
endif()

enable_language(HIP)

set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_amd_gpu.hip")

try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})

if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
set(AMD_FOUND "TRUE")
endif()
36 changes: 36 additions & 0 deletions cmake/CheckNvidiaGpu.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.

set(NVIDIA_FOUND "FALSE")

find_package(CUDAToolkit)

if(NOT CUDAToolkit_FOUND)
return()
endif()

set(CMAKE_CUDA_ARCHITECTURES "60")
if(NOT CMAKE_CUDA_COMPILER)
# In case the CUDA Toolkit directory is not in the PATH
find_program(CUDA_COMPILER
NAMES nvcc
PATHS ${CUDAToolkit_BIN_DIR})
if(NOT CUDA_COMPILER)
message(WARNING "Could not find nvcc in ${CUDAToolkit_BIN_DIR}")
unset(CMAKE_CUDA_ARCHITECTURES)
return()
endif()
set(CMAKE_CUDA_COMPILER "${CUDA_COMPILER}")
endif()
enable_language(CUDA)

set(CHECK_SRC "${CMAKE_CURRENT_SOURCE_DIR}/cmake/check_nvidia_gpu.cu")

try_run(RUN_RESULT COMPILE_SUCCESS SOURCES ${CHECK_SRC})

if(COMPILE_SUCCESS AND RUN_RESULT EQUAL 0)
set(NVIDIA_FOUND "TRUE")
else()
unset(CMAKE_CUDA_ARCHITECTURES)
unset(CMAKE_CUDA_COMPILER)
endif()
15 changes: 15 additions & 0 deletions cmake/check_amd_gpu.hip
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <hip/hip_runtime.h>

__global__ void kernel() {}

int main() {
int cnt;
hipError_t err = hipGetDeviceCount(&cnt);
if (err != hipSuccess || cnt == 0) {
return 1;
}
return 0;
}
15 changes: 15 additions & 0 deletions cmake/check_nvidia_gpu.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include <cuda_runtime.h>

__global__ void kernel() {}

int main() {
int cnt;
cudaError_t err = cudaGetDeviceCount(&cnt);
if (err != cudaSuccess || cnt == 0) {
return 1;
}
return 0;
}
65 changes: 65 additions & 0 deletions include/mscclpp/atomic_device.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#ifndef MSCCLPP_ATOMIC_DEVICE_HPP_
#define MSCCLPP_ATOMIC_DEVICE_HPP_

#include "device.hpp"

#if defined(MSCCLPP_DEVICE_CUDA)
#include <cuda/atomic>
#endif // defined(MSCCLPP_DEVICE_CUDA)

namespace mscclpp {

#if defined(MSCCLPP_DEVICE_CUDA)

constexpr cuda::memory_order memoryOrderRelaxed = cuda::memory_order_relaxed;
constexpr cuda::memory_order memoryOrderAcquire = cuda::memory_order_acquire;
constexpr cuda::memory_order memoryOrderRelease = cuda::memory_order_release;
constexpr cuda::memory_order memoryOrderAcqRel = cuda::memory_order_acq_rel;
constexpr cuda::memory_order memoryOrderSeqCst = cuda::memory_order_seq_cst;

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(T* ptr, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.load(memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, cuda::memory_order memoryOrder) {
cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.store(val, memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, cuda::memory_order memoryOrder) {
return cuda::atomic_ref<T, cuda::thread_scope_system>{*ptr}.fetch_add(val, memoryOrder);
}

#elif defined(MSCCLPP_DEVICE_HIP)

constexpr auto memoryOrderRelaxed = __ATOMIC_RELAXED;
constexpr auto memoryOrderAcquire = __ATOMIC_ACQUIRE;
constexpr auto memoryOrderRelease = __ATOMIC_RELEASE;
constexpr auto memoryOrderAcqRel = __ATOMIC_ACQ_REL;
constexpr auto memoryOrderSeqCst = __ATOMIC_SEQ_CST;

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicLoad(const T* ptr, int memoryOrder) {
return __atomic_load_n(ptr, memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE void atomicStore(T* ptr, const T& val, int memoryOrder) {
__atomic_store_n(ptr, val, memoryOrder);
}

template <typename T>
MSCCLPP_HOST_DEVICE_INLINE T atomicFetchAdd(T* ptr, const T& val, int memoryOrder) {
return __atomic_fetch_add(ptr, val, memoryOrder);
}

#endif // defined(MSCCLPP_DEVICE_HIP)

} // namespace mscclpp

#endif // MSCCLPP_ATOMIC_DEVICE_HPP_
Loading

0 comments on commit 544ff0c

Please sign in to comment.