Reduction kernel optimization (microsoft#6088)
Optimize the reduction kernel code by moving loads from global memory ahead of the computation.
Add a CMake option to build CUDA code with nvcc's --generate-line-info flag.
Authored by edgchen1 on Dec 9, 2020
Parent 9e26e59, commit abdbb5f
Showing 2 changed files with 24 additions and 26 deletions.
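
The core optimization in reduction_functions.cu is a split of the thread-level loop into a load-only pass and a compute-only pass. In the old code each unrolled iteration loaded an element and immediately fed it through TOp, so every step waited on the load just before it; issuing all the independent loads first lets their global-memory latencies overlap (memory-level parallelism) before the arithmetic begins. A minimal, self-contained sketch of the pattern, assuming a plain float sum (illustrative only, not the onnxruntime kernel; kElementsPerThread plays the role of MAX_NUM_ELEMENTS_PER_THREAD):

// Each thread strides over the input, buffering kElementsPerThread values
// in registers before touching them.
constexpr int kElementsPerThread = 4;

__global__ void partial_sum(const float* input, float* result, int n) {
  // *result must be zeroed by the host before launch.
  const int stride = blockDim.x * gridDim.x;
  float value = 0.0f;
  for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += kElementsPerThread * stride) {
    float v[kElementsPerThread];

    // Pass 1: loads only. The unrolled iterations are independent, so the
    // hardware can keep several loads in flight at once.
#pragma unroll
    for (int i = 0; i < kElementsPerThread; ++i) {
      const int offset = id + i * stride;
      if (offset < n) v[i] = input[offset];
    }

    // Pass 2: arithmetic on registers. Re-checking the bounds skips
    // out-of-range slots, so they never need to be zero-filled.
#pragma unroll
    for (int i = 0; i < kElementsPerThread; ++i) {
      const int offset = id + i * stride;
      if (offset < n) value += v[i];
    }
  }
  // An atomic keeps the sketch short; the real kernel combines partial
  // results through warp, block, and grid reduction stages instead.
  atomicAdd(result, value);
}

The diff applies the same split inside reduce_all, and additionally buffers the raw TIn values rather than TOp-transformed TBuf values, deferring the transform to the compute pass.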
cmake/CMakeLists.txt: 5 additions, 0 deletions
@@ -57,6 +57,7 @@ option(onnxruntime_ENABLE_PYTHON "Enable python buildings" OFF)
 # Enable it may cause LNK1169 error
 option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak checker in Windows debug build" OFF)
 option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
+option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
 option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
 option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF)
@@ -1177,6 +1178,10 @@ if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=expr_has_no_effect\"")
+
+  if (onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO)
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:--generate-line-info>")
+  endif()
 endif()
 
 if (onnxruntime_USE_MIGRAPHX)
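nvcc's --generate-line-info flag (short form -lineinfo) embeds source-line information in the optimized device code, letting profilers such as Nsight Compute and tools like cuda-memcheck map SASS instructions back to .cu lines, without the heavy cost of full device debug info (-G), which disables most optimizations. With the new option, this is switched on at configure time, e.g. cmake -Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=ON, or passed through onnxruntime's build script via its --cmake_extra_defines pass-through if you build that way.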
onnxruntime/core/providers/cuda/reduction/reduction_functions.cu: 19 additions, 26 deletions
@@ -134,26 +134,37 @@ __device__ void reduce_all(
   // Total number of threads in a grid row with 2-D blocks.
   const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block;
 
+  const auto write_result = [&output, &num_elements](const TOut result) {
+    // Compilation time if-else branch controlled by template argument can be
+    // optimized out, so there will be no branch in real computation phase.
+    if (DivideResultBySize) {
+      output[0] = TFinalOp()(result / TOut(num_elements));
+    } else {
+      output[0] = TFinalOp()(result);
+    }
+  };
+
   // Thread-level reduction (storage change: global memory -> register).
   // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register
   // in one iteration.
   TBuf value = 0;
   for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) {
-    TBuf v[MAX_NUM_ELEMENTS_PER_THREAD];
+    TIn v[MAX_NUM_ELEMENTS_PER_THREAD];
 
 #pragma unroll
     for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
       const int offset = id + i * num_threads_in_grid_row;
       if (offset < num_elements) {
-        v[i] = TOp()(TBuf(input[offset]));
-      } else {
-        v[i] = TBuf(0);
+        v[i] = input[offset];
       }
     }
 
 #pragma unroll
     for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
-      value += v[i];
+      const int offset = id + i * num_threads_in_grid_row;
+      if (offset < num_elements) {
+        value += TOp()(TBuf(v[i]));
+      }
     }
   }
 
@@ -177,13 +188,7 @@ __device__ void reduce_all(
   // 2. two warps and each of them has only 2 threads.
   if (num_warps_in_block == 1) {
     if (tid_in_grid_row == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(value) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(value));
-      }
+      write_result(value);
     }
     return;
   }
@@ -212,13 +217,7 @@ __device__ void reduce_all(
   // Return early if only one block is used for reduction.
   if (num_blocks_in_grid_row == 1) {
     if (tid_in_grid_row == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(shared_memory[0]) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(shared_memory[0]));
-      }
+      write_result(shared_memory[0]);
     }
     return;
   }
@@ -256,13 +255,7 @@ __device__ void reduce_all(
 
     // The first thread in the last block assigns the final output.
     if (tid_in_block == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(block_reductions_buffer[0]) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(block_reductions_buffer[0]));
-      }
+      write_result(block_reductions_buffer[0]);
     }
   }
 }
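The remaining changes in this file are a pure deduplication: the template-controlled epilogue (optionally divide by num_elements, then apply TFinalOp) used to be repeated at four exit points and is now the single write_result lambda defined up front. Because DivideResultBySize is a template parameter, the if inside the lambda is a compile-time constant and the dead branch is eliminated, so no runtime branch is introduced. A minimal sketch of the idea, with hypothetical names (not the onnxruntime code):

// A branch on a compile-time template parameter is constant-folded away,
// so one shared lambda can replace several duplicated epilogues.
template <bool DivideResultBySize, typename TOut, typename TFinalOp>
__device__ void write_reduced(TOut* output, int num_elements, TOut result) {
  const auto write_result = [&](const TOut r) {
    if (DivideResultBySize) {  // constant: the untaken branch is removed
      output[0] = TFinalOp()(r / TOut(num_elements));
    } else {
      output[0] = TFinalOp()(r);
    }
  };
  write_result(result);  // every early-return path can share this one call
}

Under C++17 the branch could be written as if constexpr, but a plain if on a compile-time constant is folded away just the same, which is what the comment carried over into the lambda is pointing out.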
