Reduction kernel optimization (microsoft#6088)
Optimize the reduction kernel code by moving loads from global memory ahead of the computation.
Add a CMake option to build CUDA code with nvcc's --generate-line-info flag.
Authored by edgchen1 on Dec 9, 2020
Parent 9e26e59, commit abdbb5f
Showing 2 changed files with 24 additions and 26 deletions.
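
The core optimization in reduction_functions.cu is a split of the thread-level loop into a load-only pass and a compute-only pass. In the old code each unrolled iteration loaded an element and immediately fed it through TOp, so every step waited on the load just before it; issuing all the independent loads first lets their global-memory latencies overlap (memory-level parallelism) before the arithmetic begins. A minimal, self-contained sketch of the pattern, assuming a plain float sum (illustrative only, not the onnxruntime kernel; kElementsPerThread plays the role of MAX_NUM_ELEMENTS_PER_THREAD):

// Each thread strides over the input, buffering kElementsPerThread values
// in registers before touching them.
constexpr int kElementsPerThread = 4;

__global__ void partial_sum(const float* input, float* result, int n) {
  // *result must be zeroed by the host before launch.
  const int stride = blockDim.x * gridDim.x;
  float value = 0.0f;
  for (int id = blockIdx.x * blockDim.x + threadIdx.x; id < n;
       id += kElementsPerThread * stride) {
    float v[kElementsPerThread];

    // Pass 1: loads only. The unrolled iterations are independent, so the
    // hardware can keep several loads in flight at once.
#pragma unroll
    for (int i = 0; i < kElementsPerThread; ++i) {
      const int offset = id + i * stride;
      if (offset < n) v[i] = input[offset];
    }

    // Pass 2: arithmetic on registers. Re-checking the bounds skips
    // out-of-range slots, so they never need to be zero-filled.
#pragma unroll
    for (int i = 0; i < kElementsPerThread; ++i) {
      const int offset = id + i * stride;
      if (offset < n) value += v[i];
    }
  }
  // An atomic keeps the sketch short; the real kernel combines partial
  // results through warp, block, and grid reduction stages instead.
  atomicAdd(result, value);
}

The diff applies the same split inside reduce_all, and additionally buffers the raw TIn values rather than TOp-transformed TBuf values, deferring the transform to the compute pass.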
cmake/CMakeLists.txt: 5 additions, 0 deletions
@@ -57,6 +57,7 @@ option(onnxruntime_ENABLE_PYTHON "Enable python buildings" OFF)
 # Enable it may cause LNK1169 error
 option(onnxruntime_ENABLE_MEMLEAK_CHECKER "Experimental: Enable memory leak checker in Windows debug build" OFF)
 option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
+option(onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO "When building with CUDA support, generate device code line number information." OFF)
 option(onnxruntime_USE_OPENVINO "Build with OpenVINO support" OFF)
 option(onnxruntime_USE_EIGEN_FOR_BLAS "Use eign for blas" ON)
 option(onnxruntime_USE_NNAPI_BUILTIN "Build with builtin NNAPI lib for Android NNAPI support" OFF)
@@ -1177,6 +1178,10 @@ if (onnxruntime_USE_CUDA)
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=bad_friend_decl\"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=unsigned_compare_with_zero\"")
   set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xcudafe \"--diag_suppress=expr_has_no_effect\"")
+
+  if (onnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO)
+    add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:--generate-line-info>")
+  endif()
 endif()
 
 if (onnxruntime_USE_MIGRAPHX)
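nvcc's --generate-line-info flag (short form -lineinfo) embeds source-line information in the optimized device code, letting profilers such as Nsight Compute and tools like cuda-memcheck map SASS instructions back to .cu lines, without the heavy cost of full device debug info (-G), which disables most optimizations. With the new option, this is switched on at configure time, e.g. cmake -Donnxruntime_ENABLE_CUDA_LINE_NUMBER_INFO=ON, or passed through onnxruntime's build script via its --cmake_extra_defines pass-through if you build that way.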
onnxruntime/core/providers/cuda/reduction/reduction_functions.cu: 19 additions, 26 deletions
@@ -134,26 +134,37 @@ __device__ void reduce_all(
   // Total number of threads in a grid row with 2-D blocks.
   const int num_threads_in_grid_row = num_blocks_in_grid_row * num_threads_in_block;
 
+  const auto write_result = [&output, &num_elements](const TOut result) {
+    // Compilation time if-else branch controlled by template argument can be
+    // optimized out, so there will be no branch in real computation phase.
+    if (DivideResultBySize) {
+      output[0] = TFinalOp()(result / TOut(num_elements));
+    } else {
+      output[0] = TFinalOp()(result);
+    }
+  };
+
   // Thread-level reduction (storage change: global memory -> register).
   // One thread reduces MAX_NUM_ELEMENTS_PER_THREAD elements to a thread register
   // in one iteration.
   TBuf value = 0;
   for (int id = tid_in_grid_row; id < num_elements; id += MAX_NUM_ELEMENTS_PER_THREAD * num_threads_in_grid_row) {
-    TBuf v[MAX_NUM_ELEMENTS_PER_THREAD];
+    TIn v[MAX_NUM_ELEMENTS_PER_THREAD];
 
 #pragma unroll
     for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
       const int offset = id + i * num_threads_in_grid_row;
       if (offset < num_elements) {
-        v[i] = TOp()(TBuf(input[offset]));
-      } else {
-        v[i] = TBuf(0);
+        v[i] = input[offset];
       }
     }
 
 #pragma unroll
     for (int i = 0; i < MAX_NUM_ELEMENTS_PER_THREAD; i++) {
-      value += v[i];
+      const int offset = id + i * num_threads_in_grid_row;
+      if (offset < num_elements) {
+        value += TOp()(TBuf(v[i]));
+      }
     }
   }
 
@@ -177,13 +188,7 @@ __device__ void reduce_all(
   // 2. two warps and each of them has only 2 threads.
   if (num_warps_in_block == 1) {
     if (tid_in_grid_row == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(value) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(value));
-      }
+      write_result(value);
     }
     return;
   }
@@ -212,13 +217,7 @@ __device__ void reduce_all(
   // Return early if only one block is used for reduction.
   if (num_blocks_in_grid_row == 1) {
     if (tid_in_grid_row == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(shared_memory[0]) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(shared_memory[0]));
-      }
+      write_result(shared_memory[0]);
     }
     return;
   }
@@ -256,13 +255,7 @@ __device__ void reduce_all(
 
     // The first thread in the last block assigns the final output.
     if (tid_in_block == 0) {
-      // Compilation time if-else branch controlled by template argument can be
-      // optimized out, so there will be no branch in real computation phase.
-      if (DivideResultBySize) {
-        output[0] = TFinalOp()(TOut(block_reductions_buffer[0]) / TOut(num_elements));
-      } else {
-        output[0] = TFinalOp()(TOut(block_reductions_buffer[0]));
-      }
+      write_result(block_reductions_buffer[0]);
     }
   }
 }
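The remaining changes in this file are a pure deduplication: the template-controlled epilogue (optionally divide by num_elements, then apply TFinalOp) used to be repeated at four exit points and is now the single write_result lambda defined up front. Because DivideResultBySize is a template parameter, the if inside the lambda is a compile-time constant and the dead branch is eliminated, so no runtime branch is introduced. A minimal sketch of the idea, with hypothetical names (not the onnxruntime code):

// A branch on a compile-time template parameter is constant-folded away,
// so one shared lambda can replace several duplicated epilogues.
template <bool DivideResultBySize, typename TOut, typename TFinalOp>
__device__ void write_reduced(TOut* output, int num_elements, TOut result) {
  const auto write_result = [&](const TOut r) {
    if (DivideResultBySize) {  // constant: the untaken branch is removed
      output[0] = TFinalOp()(r / TOut(num_elements));
    } else {
      output[0] = TFinalOp()(r);
    }
  };
  write_result(result);  // every early-return path can share this one call
}

Under C++17 the branch could be written as if constexpr, but a plain if on a compile-time constant is folded away just the same, which is what the comment carried over into the lambda is pointing out.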
