diff --git a/include/stereomatch/semiglobal.hpp b/include/stereomatch/semiglobal.hpp index 4dd02cb..06d0571 100644 --- a/include/stereomatch/semiglobal.hpp +++ b/include/stereomatch/semiglobal.hpp @@ -35,15 +35,15 @@ struct SGPixelPath { Point2 direction; uint16_t size; - SGPixelPath(Point2 start, Point2 end, - Point2 direction, int16_t size) + STM_DEVICE_HOST SGPixelPath(Point2 start, Point2 end, + Point2 direction, int16_t size) noexcept : start(start), end(end), direction(direction), size(size) {} - SGPixelPath inverse() const { + STM_DEVICE_HOST SGPixelPath inverse() const { return SGPixelPath(end, start, Point2(-direction.x, -direction.y), size); } - static std::vector GeneratePaths(size_t width, size_t height); + static std::vector GeneratePaths(size_t width, size_t height) noexcept; }; } // namespace stereomatch diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 34099c0..d2764dc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -26,12 +26,11 @@ target_include_directories(stereomatch $ $ ) - target_link_libraries(stereomatch "${TORCH_LIBRARIES}" "${TORCH_PYTHON_LIBRARY}" ) - set_property(TARGET stereomatch PROPERTY CXX_STANDARD 17) - set_target_properties(stereomatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON) +# set_property(TARGET stereomatch PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${iwyu_path}) + add_library(_cstereomatch SHARED _cstereomatch.cpp) set_property(TARGET _cstereomatch PROPERTY CXX_STANDARD 17) diff --git a/src/semiglobal.cpp b/src/semiglobal.cpp index a53fd5b..b7bef3c 100644 --- a/src/semiglobal.cpp +++ b/src/semiglobal.cpp @@ -24,7 +24,7 @@ struct BorderedBuffer { BorderedBuffer(int size, T border_value) noexcept : array(size + border_size) { for (auto i = 0; i < border_size; ++i) { - array[i] = array[size - 1 + i] = border_value; + array[i] = array[size + border_size + i] = border_value; } } @@ -42,7 +42,7 @@ struct BorderedBuffer { }; std::vector SGPixelPath::GeneratePaths(size_t width, - size_t height) { + size_t height) noexcept { std::vector path_descs; /** @@ -57,13 +57,11 @@ std::vector SGPixelPath::GeneratePaths(size_t width, /** * Vertical paths */ - for (auto i = 0; i < width; i++) { path_descs.push_back(SGPixelPath(Point2(i, 0), Point2(i, height - 1), Point2(0, 1), height)); } - /** * Diagonal left to right */ @@ -120,13 +118,19 @@ std::vector SGPixelPath::GeneratePaths(size_t width, template struct SGMCostOperator { - public: + const torch::TensorAccessor cost_volume; + const torch::TensorAccessor left_image; + torch::TensorAccessor output_cost_vol; + const scalar_t penalty1, penalty2; + + BorderedBuffer prev_cost, prev_cost_cache; + SGMCostOperator(const torch::TensorAccessor cost_volume, - const torch::TensorAccessor intensity_image, + const torch::TensorAccessor left_image, torch::TensorAccessor output_cost_vol, scalar_t penalty1, scalar_t penalty2) : cost_volume(cost_volume), - intensity_image(intensity_image), + left_image(left_image), output_cost_vol(output_cost_vol), penalty1{penalty1}, penalty2{penalty2}, @@ -135,61 +139,50 @@ struct SGMCostOperator { prev_cost_cache(cost_volume.size(2), std::numeric_limits::infinity()) {} - void operator()(const SGPixelPath &path) noexcept { - scalar_t prev_intensity = 0; + void operator()(const SGPixelPath &path_desc) noexcept { const auto max_disparity = cost_volume.size(2); - const auto X = path.start.x; - const auto Y = path.start.y; - const auto disparities = cost_volume[Y][X]; - auto output_cost = output_cost_vol[Y][X]; + auto current_pixel = path_desc.start; + const auto cost_volume_acc = cost_volume[current_pixel.y][current_pixel.x]; + auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x]; for (auto disp = 0; disp < max_disparity; disp++) { - const auto cost = disparities[disp]; - - output_cost[disp] = cost; - prev_cost[disp] = cost; + const auto initial_cost = cost_volume_acc[disp]; + prev_cost[disp] = initial_cost; + output_cost_acc[disp] += initial_cost; } - prev_intensity = intensity_image[Y][X]; - - auto current_pixel = path.start + path.direction; - for (auto i = 0; i < path.size - 1; ++i, current_pixel += path.direction) { + scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x]; + for (auto i = 1; i < path_desc.size; ++i) { const auto prev_min_cost = *std::min_element(prev_cost.begin(), prev_cost.end()); + current_pixel += path_desc.direction; + + const auto intensity = left_image[current_pixel.y][current_pixel.x]; - const auto intensity = intensity_image[current_pixel.y][current_pixel.x]; const auto p2_adjusted = std::max(penalty1, penalty2 / std::abs(intensity - prev_intensity)); - const auto disparities = cost_volume[current_pixel.y][current_pixel.x]; - auto output_cost = output_cost_vol[current_pixel.y][current_pixel.x]; - for (size_t disp = 0; disp < max_disparity; disp++) { - const auto match_cost = disparities[disp]; + prev_intensity = intensity; + const auto cost_volume_acc = + cost_volume[current_pixel.y][current_pixel.x]; + auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x]; + for (size_t disp = 0; disp < max_disparity; disp++) { + const auto match_cost = cost_volume_acc[disp]; const auto sgm_cost = match_cost + get_min(prev_cost[disp], prev_cost[disp - 1] + penalty1, prev_cost[disp + 1] + penalty1, prev_min_cost + p2_adjusted) - prev_min_cost; - output_cost[disp] += sgm_cost; + output_cost_acc[disp] += sgm_cost; prev_cost_cache[disp] = sgm_cost; } - prev_intensity = intensity; - std::swap(prev_cost, prev_cost_cache); } } - - private: - const torch::TensorAccessor cost_volume; - const torch::TensorAccessor intensity_image; - torch::TensorAccessor output_cost_vol; - const scalar_t penalty1, penalty2; - - BorderedBuffer prev_cost, prev_cost_cache; }; void RunSemiglobalAggregationGPU(const torch::Tensor &cost_volume, @@ -221,9 +214,10 @@ void AggregationOps::RunSemiglobal(const torch::Tensor &cost_volume, output_cost_volume.accessor(), scalar_t(penalty1), scalar_t(penalty2)); - for (const auto sg_path : aggregation_paths) { - sgm_cost_op(sg_path); - sgm_cost_op(sg_path.inverse()); + for (const auto sg_path_desc : aggregation_paths) { + sgm_cost_op(sg_path_desc); + // TODO: Make the GPU version also run inverse paths + // sgm_cost_op(sg_path.inverse()); } }); } diff --git a/src/semiglobal.cu b/src/semiglobal.cu index 3a00505..a9440a5 100644 --- a/src/semiglobal.cu +++ b/src/semiglobal.cu @@ -18,32 +18,7 @@ #include "check.hpp" #include "numeric.hpp" -#define SG_MAX_DISP 256 - namespace stereomatch { -template -class CUDAArray { - public: - static CUDAArray FromCPU(const data_t* memory, size_t size); - - void size() {} - - private: - data_t* array_; -}; - -template -inline __device__ scalar_t GetCost(scalar_t cost, scalar_t keep_disp_cost, - scalar_t change_1_disp_cost1, - scalar_t change_1_disp_cost2, - scalar_t min_cost_all_disps, - scalar_t penalty1, scalar_t penalty2) { - return cost + get_min( - keep_disp_cost, - change_1_disp_cost1 + penalty1, - change_1_disp_cost2 + penalty2, - min_cost_all_disps + penalty2) - min_cost_all_disps; -} template struct SemiglobalKernel { @@ -60,86 +35,71 @@ struct SemiglobalKernel { : cost_volume(Accessor::Get(cost_volume)), left_image(Accessor::Get(left_image)), path_descriptors(thrust::raw_pointer_cast(path_descriptors)), - penalty1(penalty1), - penalty2(penalty2), + penalty1{penalty1}, + penalty2{penalty2}, output_cost_volume( Accessor::Get(output_cost_volume)) {} __device__ void operator()(int disp, int path) { - __shared__ scalar_t min_cost; - __shared__ int2 current_pixel; - - __shared__ scalar_t adapted_penalty2; - __shared__ scalar_t last_intensity; - - // extern __shared__ scalar_t prev_cost_memory[]; - const auto max_disparity = cost_volume.size(2); - // scalar_t *prev_cost = prev_cost_memory; - // scalar_t *prev_cost_min_search = &prev_cost_memory[max_disparity + 2]; - - __shared__ scalar_t prev_cost[SG_MAX_DISP + 2]; - __shared__ scalar_t prev_cost_min_search[SG_MAX_DISP]; + extern __shared__ __align__(sizeof(float)) uint8_t _shared_mem[]; + scalar_t* shr_prev_cost = (scalar_t*)_shared_mem; + scalar_t* shr_prev_cost_min_search = &shr_prev_cost[max_disparity + 2]; const auto path_desc(path_descriptors[path]); + auto current_pixel = cast_point2(path_desc.start); - __shared__ int2 current_path; - if (disp == 0) { - current_pixel = cast_point2(path_desc.start); - // prev_min_cost[0] = NumericLimits::infinity(); - last_intensity = left_image[current_pixel.y][current_pixel.x]; - prev_cost[0] = prev_cost[max_disparity] = - NumericLimits::infinity(); - } - - __syncthreads(); - - const auto initial_cost = cost_volume[current_pixel.y][current_pixel.x][disp]; - - prev_cost[disp + 1] = initial_cost; - prev_cost_min_search[disp] = initial_cost; + const auto initial_cost = + cost_volume[current_pixel.y][current_pixel.x][disp]; + shr_prev_cost[disp + 1] = initial_cost; + shr_prev_cost_min_search[disp] = initial_cost; output_cost_volume[current_pixel.y][current_pixel.x][disp] += initial_cost; - __syncthreads(); + if (disp == 0) { + // Pading borders + shr_prev_cost[0] = shr_prev_cost[max_disparity + 1] = + NumericLimits::infinity(); + } - for (auto i = 1; i < path_desc.size; i++) { - int search_idx = max_disparity >> 1; - while (search_idx != 0) { - if (disp < search_idx) { - prev_cost_min_search[disp] = - fminf(prev_cost_min_search[disp], prev_cost_min_search[disp + search_idx]); + scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x]; + for (auto i = 1; i < path_desc.size; ++i) { + __syncthreads(); // Wait writes into of sgm_cost into the search array + for (auto s = max_disparity >> 1; s >= 1; s = s >> 1) { + if (disp < s) { + const auto rhs_idx = s + disp; + const auto rhs_cost = shr_prev_cost_min_search[rhs_idx]; + if (shr_prev_cost_min_search[disp] >= rhs_cost) { + shr_prev_cost_min_search[disp] = rhs_cost; + } } __syncthreads(); - search_idx = search_idx >> 1; } - if (disp == 0) { - min_cost = prev_cost_min_search[0]; - current_pixel.x += path_desc.direction.x; - current_pixel.y += path_desc.direction.y; + const auto prev_min_cost = shr_prev_cost_min_search[0]; + current_pixel.x += path_desc.direction.x; + current_pixel.y += path_desc.direction.y; - const auto intensity = left_image[current_pixel.y][current_pixel.x]; + const auto intensity = left_image[current_pixel.y][current_pixel.x]; + const auto p2_adjusted = + max(penalty1, penalty2 / abs(intensity - prev_intensity)); - adapted_penalty2 = penalty2 / abs(intensity - last_intensity); - last_intensity = intensity; - } - - __syncthreads(); - - - const auto current_cost = - GetCost(cost_volume[current_pixel.y][current_pixel.x][disp], - prev_cost[disp + 1], prev_cost[disp], - prev_cost[disp + 2], min_cost, penalty1, adapted_penalty2); + prev_intensity = intensity; - __syncthreads(); + const auto match_cost = + cost_volume[current_pixel.y][current_pixel.x][disp]; + const auto sgm_cost = + match_cost + + get_min(shr_prev_cost[disp + 1], shr_prev_cost[disp] + penalty1, + shr_prev_cost[disp + 2] + penalty1, + prev_min_cost + p2_adjusted) - + prev_min_cost; - prev_cost[disp + 1] = current_cost; - prev_cost_min_search[disp] = current_cost; + output_cost_volume[current_pixel.y][current_pixel.x][disp] += sgm_cost; - output_cost_volume[current_pixel.y][current_pixel.x][disp] += current_cost; - __syncthreads(); + __syncthreads(); // Wait for all threads to read their neighbor costs + shr_prev_cost[disp + 1] = sgm_cost; + shr_prev_cost_min_search[disp] = sgm_cost; } } }; @@ -148,7 +108,7 @@ template static __global__ void LaunchKernel(SemiglobalKernel kernel, int path_descriptor_count, int max_disparity) { - const int path_descriptor_idx = blockIdx.x; + const int path_descriptor_idx = blockIdx.x; const int disparity = threadIdx.x; if (path_descriptor_idx < path_descriptor_count && @@ -170,11 +130,9 @@ void RunSemiglobalAggregationGPU(const torch::Tensor& cost_volume, cost_volume, left_image, thrust::raw_pointer_cast(path_descriptors.data()), penalty1, penalty2, output_cost_volume); - LaunchKernel<<>>(kernel, path_descriptors.size(), max_disparity); + LaunchKernel<<>>( + kernel, path_descriptors.size(), max_disparity); }); } } // namespace stereomatch - -