-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathkernel.hpp
123 lines (109 loc) · 3.09 KB
/
kernel.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#pragma once
#include "accessor.hpp"
#include "cuda_utils.hpp"
namespace stereomatch {
template <Device dev>
struct KernelLauncher {};
#ifdef __CUDACC__
/**
* Used by KernelLauncher<kCUDA>
*
*/
template <typename Kernel>
static __global__ void Exec1DKernel(Kernel kern, int size) {
const int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < size) {
kern(idx);
}
}
template <typename Kernel>
static __global__ void Exec2DKernel(Kernel kern, int width, int height) {
const int row = blockIdx.y * blockDim.y + threadIdx.y;
const int col = blockIdx.x * blockDim.x + threadIdx.x;
if (row < height && col < width) {
kern(row, col);
}
}
template <>
struct KernelLauncher<kCUDA> {
/**
* Launches a kernel functor on CUDA. The launch configuration is
* determined automatically by the given size.
*
* After execution errors are verified and the device is synchronized.
*
* @param kern Kernel functor.
* @param size the number of elements to process.
*/
template <typename Kernel>
static void Launch1D(Kernel &kern, int size, bool sequential = false) {
CudaKernelDims kl = Get1DKernelDims(size);
Exec1DKernel<<<kl.grid, kl.block>>>(kern, size);
CudaCheck();
CudaSafeCall(cudaDeviceSynchronize());
}
template <typename Kernel>
static void Launch2D(Kernel &kern, int width, int height,
bool sequential = false) {
CudaKernelDims kl = Get2DKernelDims(width, height);
Exec2DKernel<<<kl.grid, kl.block>>>(kern, width, height);
CudaCheck();
CudaSafeCall(cudaDeviceSynchronize());
}
template <typename Kernel>
static void Launch2D(Kernel &kern, int width, int height, dim3 grid_size,
dim3 block_size) {
Exec2DKernel<<<grid_size, block_size>>>(kern, width, height);
CudaCheck();
CudaSafeCall(cudaDeviceSynchronize());
}
template <typename Kernel>
static void Launch2DSharedMem(Kernel &kern, int width, int height,
size_t shared_mem_size) {
CudaKernelDims kl = Get2DKernelDims(width, height);
Exec2DKernel<<<kl.grid, kl.block, shared_mem_size>>>(kern, width, height);
CudaCheck();
CudaSafeCall(cudaDeviceSynchronize());
}
};
#endif
template <>
struct KernelLauncher<kCPU> {
template <typename Kernel>
static void Launch1D(Kernel &kern, int size, bool sequential = false) {
if (!sequential) {
#ifdef NDEBUG
#pragma omp parallel for
#endif
for (int i = 0; i < size; ++i) {
kern(i);
}
} else {
for (int i = 0; i < size; ++i) {
kern(i);
}
}
}
template <typename Kernel>
static void Launch2D(Kernel &kern, int width, int height,
bool sequential = false) {
if (!sequential) {
#pragma omp parallel
{
#pragma omp for
for (int row = 0; row < height; ++row) {
for (int col = 0; col < width; ++col) {
kern(row, col);
}
}
}
} else {
for (int row = 0; row < height; ++row) {
for (int col = 0; col < width; ++col) {
kern(row, col);
}
}
}
}
};
} // namespace stereomatch