Skip to content

Commit

Permalink
Fixes into the Semiglobal aggregation
Browse files Browse the repository at this point in the history
  • Loading branch information
otaviog committed Aug 22, 2022
1 parent 9282273 commit bb1cae5
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 137 deletions.
8 changes: 4 additions & 4 deletions include/stereomatch/semiglobal.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,15 +35,15 @@ struct SGPixelPath {
Point2<int16_t> direction;
uint16_t size;

SGPixelPath(Point2<int16_t> start, Point2<int16_t> end,
Point2<int16_t> direction, int16_t size)
STM_DEVICE_HOST SGPixelPath(Point2<int16_t> start, Point2<int16_t> end,
Point2<int16_t> direction, int16_t size) noexcept
: start(start), end(end), direction(direction), size(size) {}

SGPixelPath inverse() const {
STM_DEVICE_HOST SGPixelPath inverse() const {
return SGPixelPath(end, start, Point2<int16_t>(-direction.x, -direction.y),
size);
}
static std::vector<SGPixelPath> GeneratePaths(size_t width, size_t height);
static std::vector<SGPixelPath> GeneratePaths(size_t width, size_t height) noexcept;
};

} // namespace stereomatch
5 changes: 2 additions & 3 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,11 @@ target_include_directories(stereomatch
$<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include/stereomatch>
$<BUILD_INTERFACE:${EIGEN3_INCLUDE_DIR}>
)

target_link_libraries(stereomatch "${TORCH_LIBRARIES}" "${TORCH_PYTHON_LIBRARY}" )

set_property(TARGET stereomatch PROPERTY CXX_STANDARD 17)

set_target_properties(stereomatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
# set_property(TARGET stereomatch PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${iwyu_path})


add_library(_cstereomatch SHARED _cstereomatch.cpp)
set_property(TARGET _cstereomatch PROPERTY CXX_STANDARD 17)
Expand Down
74 changes: 34 additions & 40 deletions src/semiglobal.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ struct BorderedBuffer {
BorderedBuffer(int size, T border_value) noexcept
: array(size + border_size) {
for (auto i = 0; i < border_size; ++i) {
array[i] = array[size - 1 + i] = border_value;
array[i] = array[size + border_size + i] = border_value;
}
}

Expand All @@ -42,7 +42,7 @@ struct BorderedBuffer {
};

std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,
size_t height) {
size_t height) noexcept {
std::vector<SGPixelPath> path_descs;

/**
Expand All @@ -57,13 +57,11 @@ std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,
/**
* Vertical paths
*/

for (auto i = 0; i < width; i++) {
path_descs.push_back(SGPixelPath(Point2<int16_t>(i, 0),
Point2<int16_t>(i, height - 1),
Point2<int16_t>(0, 1), height));
}

/**
* Diagonal left to right
*/
Expand Down Expand Up @@ -120,13 +118,19 @@ std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,

template <typename scalar_t>
struct SGMCostOperator {
public:
const torch::TensorAccessor<scalar_t, 3> cost_volume;
const torch::TensorAccessor<scalar_t, 2> left_image;
torch::TensorAccessor<scalar_t, 3> output_cost_vol;
const scalar_t penalty1, penalty2;

BorderedBuffer<scalar_t, 1> prev_cost, prev_cost_cache;

SGMCostOperator(const torch::TensorAccessor<scalar_t, 3> cost_volume,
const torch::TensorAccessor<scalar_t, 2> intensity_image,
const torch::TensorAccessor<scalar_t, 2> left_image,
torch::TensorAccessor<scalar_t, 3> output_cost_vol,
scalar_t penalty1, scalar_t penalty2)
: cost_volume(cost_volume),
intensity_image(intensity_image),
left_image(left_image),
output_cost_vol(output_cost_vol),
penalty1{penalty1},
penalty2{penalty2},
Expand All @@ -135,61 +139,50 @@ struct SGMCostOperator {
prev_cost_cache(cost_volume.size(2),
std::numeric_limits<scalar_t>::infinity()) {}

void operator()(const SGPixelPath &path) noexcept {
scalar_t prev_intensity = 0;
void operator()(const SGPixelPath &path_desc) noexcept {
const auto max_disparity = cost_volume.size(2);

const auto X = path.start.x;
const auto Y = path.start.y;
const auto disparities = cost_volume[Y][X];
auto output_cost = output_cost_vol[Y][X];
auto current_pixel = path_desc.start;
const auto cost_volume_acc = cost_volume[current_pixel.y][current_pixel.x];
auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x];

for (auto disp = 0; disp < max_disparity; disp++) {
const auto cost = disparities[disp];

output_cost[disp] = cost;
prev_cost[disp] = cost;
const auto initial_cost = cost_volume_acc[disp];
prev_cost[disp] = initial_cost;
output_cost_acc[disp] += initial_cost;
}

prev_intensity = intensity_image[Y][X];

auto current_pixel = path.start + path.direction;
for (auto i = 0; i < path.size - 1; ++i, current_pixel += path.direction) {
scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x];
for (auto i = 1; i < path_desc.size; ++i) {
const auto prev_min_cost =
*std::min_element(prev_cost.begin(), prev_cost.end());
current_pixel += path_desc.direction;

const auto intensity = left_image[current_pixel.y][current_pixel.x];

const auto intensity = intensity_image[current_pixel.y][current_pixel.x];
const auto p2_adjusted =
std::max(penalty1, penalty2 / std::abs(intensity - prev_intensity));

const auto disparities = cost_volume[current_pixel.y][current_pixel.x];
auto output_cost = output_cost_vol[current_pixel.y][current_pixel.x];
for (size_t disp = 0; disp < max_disparity; disp++) {
const auto match_cost = disparities[disp];
prev_intensity = intensity;

const auto cost_volume_acc =
cost_volume[current_pixel.y][current_pixel.x];
auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x];
for (size_t disp = 0; disp < max_disparity; disp++) {
const auto match_cost = cost_volume_acc[disp];
const auto sgm_cost =
match_cost +
get_min(prev_cost[disp], prev_cost[disp - 1] + penalty1,
prev_cost[disp + 1] + penalty1,
prev_min_cost + p2_adjusted) -
prev_min_cost;
output_cost[disp] += sgm_cost;
output_cost_acc[disp] += sgm_cost;
prev_cost_cache[disp] = sgm_cost;
}

prev_intensity = intensity;

std::swap(prev_cost, prev_cost_cache);
}
}

private:
const torch::TensorAccessor<scalar_t, 3> cost_volume;
const torch::TensorAccessor<scalar_t, 2> intensity_image;
torch::TensorAccessor<scalar_t, 3> output_cost_vol;
const scalar_t penalty1, penalty2;

BorderedBuffer<scalar_t, 1> prev_cost, prev_cost_cache;
};

void RunSemiglobalAggregationGPU(const torch::Tensor &cost_volume,
Expand Down Expand Up @@ -221,9 +214,10 @@ void AggregationOps::RunSemiglobal(const torch::Tensor &cost_volume,
output_cost_volume.accessor<scalar_t, 3>(), scalar_t(penalty1),
scalar_t(penalty2));

for (const auto sg_path : aggregation_paths) {
sgm_cost_op(sg_path);
sgm_cost_op(sg_path.inverse());
for (const auto sg_path_desc : aggregation_paths) {
sgm_cost_op(sg_path_desc);
// TODO: Make the GPU version also run inverse paths
// sgm_cost_op(sg_path.inverse());
}
});
}
Expand Down
138 changes: 48 additions & 90 deletions src/semiglobal.cu
Original file line number Diff line number Diff line change
Expand Up @@ -18,32 +18,7 @@
#include "check.hpp"
#include "numeric.hpp"

#define SG_MAX_DISP 256

namespace stereomatch {
template <typename data_t>
class CUDAArray {
public:
static CUDAArray FromCPU(const data_t* memory, size_t size);

void size() {}

private:
data_t* array_;
};

template <typename scalar_t>
inline __device__ scalar_t GetCost(scalar_t cost, scalar_t keep_disp_cost,
scalar_t change_1_disp_cost1,
scalar_t change_1_disp_cost2,
scalar_t min_cost_all_disps,
scalar_t penalty1, scalar_t penalty2) {
return cost + get_min(
keep_disp_cost,
change_1_disp_cost1 + penalty1,
change_1_disp_cost2 + penalty2,
min_cost_all_disps + penalty2) - min_cost_all_disps;
}

template <typename scalar_t>
struct SemiglobalKernel {
Expand All @@ -60,86 +35,71 @@ struct SemiglobalKernel {
: cost_volume(Accessor<kCUDA, scalar_t, 3>::Get(cost_volume)),
left_image(Accessor<kCUDA, scalar_t, 2>::Get(left_image)),
path_descriptors(thrust::raw_pointer_cast(path_descriptors)),
penalty1(penalty1),
penalty2(penalty2),
penalty1{penalty1},
penalty2{penalty2},
output_cost_volume(
Accessor<kCUDA, scalar_t, 3>::Get(output_cost_volume)) {}

__device__ void operator()(int disp, int path) {
__shared__ scalar_t min_cost;
__shared__ int2 current_pixel;

__shared__ scalar_t adapted_penalty2;
__shared__ scalar_t last_intensity;

// extern __shared__ scalar_t prev_cost_memory[];

const auto max_disparity = cost_volume.size(2);

// scalar_t *prev_cost = prev_cost_memory;
// scalar_t *prev_cost_min_search = &prev_cost_memory[max_disparity + 2];

__shared__ scalar_t prev_cost[SG_MAX_DISP + 2];
__shared__ scalar_t prev_cost_min_search[SG_MAX_DISP];
extern __shared__ __align__(sizeof(float)) uint8_t _shared_mem[];
scalar_t* shr_prev_cost = (scalar_t*)_shared_mem;
scalar_t* shr_prev_cost_min_search = &shr_prev_cost[max_disparity + 2];

const auto path_desc(path_descriptors[path]);
auto current_pixel = cast_point2<int2>(path_desc.start);

__shared__ int2 current_path;
if (disp == 0) {
current_pixel = cast_point2<int2>(path_desc.start);
// prev_min_cost[0] = NumericLimits<scalar_t>::infinity();
last_intensity = left_image[current_pixel.y][current_pixel.x];
prev_cost[0] = prev_cost[max_disparity] =
NumericLimits<scalar_t>::infinity();
}

__syncthreads();

const auto initial_cost = cost_volume[current_pixel.y][current_pixel.x][disp];

prev_cost[disp + 1] = initial_cost;
prev_cost_min_search[disp] = initial_cost;
const auto initial_cost =
cost_volume[current_pixel.y][current_pixel.x][disp];
shr_prev_cost[disp + 1] = initial_cost;
shr_prev_cost_min_search[disp] = initial_cost;
output_cost_volume[current_pixel.y][current_pixel.x][disp] += initial_cost;

__syncthreads();
if (disp == 0) {
// Pading borders
shr_prev_cost[0] = shr_prev_cost[max_disparity + 1] =
NumericLimits<scalar_t>::infinity();
}

for (auto i = 1; i < path_desc.size; i++) {
int search_idx = max_disparity >> 1;
while (search_idx != 0) {
if (disp < search_idx) {
prev_cost_min_search[disp] =
fminf(prev_cost_min_search[disp], prev_cost_min_search[disp + search_idx]);
scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x];
for (auto i = 1; i < path_desc.size; ++i) {
__syncthreads(); // Wait writes into of sgm_cost into the search array
for (auto s = max_disparity >> 1; s >= 1; s = s >> 1) {
if (disp < s) {
const auto rhs_idx = s + disp;
const auto rhs_cost = shr_prev_cost_min_search[rhs_idx];
if (shr_prev_cost_min_search[disp] >= rhs_cost) {
shr_prev_cost_min_search[disp] = rhs_cost;
}
}
__syncthreads();
search_idx = search_idx >> 1;
}

if (disp == 0) {
min_cost = prev_cost_min_search[0];
current_pixel.x += path_desc.direction.x;
current_pixel.y += path_desc.direction.y;
const auto prev_min_cost = shr_prev_cost_min_search[0];
current_pixel.x += path_desc.direction.x;
current_pixel.y += path_desc.direction.y;

const auto intensity = left_image[current_pixel.y][current_pixel.x];
const auto intensity = left_image[current_pixel.y][current_pixel.x];
const auto p2_adjusted =
max(penalty1, penalty2 / abs(intensity - prev_intensity));

adapted_penalty2 = penalty2 / abs(intensity - last_intensity);
last_intensity = intensity;
}

__syncthreads();


const auto current_cost =
GetCost(cost_volume[current_pixel.y][current_pixel.x][disp],
prev_cost[disp + 1], prev_cost[disp],
prev_cost[disp + 2], min_cost, penalty1, adapted_penalty2);
prev_intensity = intensity;

__syncthreads();
const auto match_cost =
cost_volume[current_pixel.y][current_pixel.x][disp];
const auto sgm_cost =
match_cost +
get_min(shr_prev_cost[disp + 1], shr_prev_cost[disp] + penalty1,
shr_prev_cost[disp + 2] + penalty1,
prev_min_cost + p2_adjusted) -
prev_min_cost;

prev_cost[disp + 1] = current_cost;
prev_cost_min_search[disp] = current_cost;
output_cost_volume[current_pixel.y][current_pixel.x][disp] += sgm_cost;

output_cost_volume[current_pixel.y][current_pixel.x][disp] += current_cost;
__syncthreads();
__syncthreads(); // Wait for all threads to read their neighbor costs
shr_prev_cost[disp + 1] = sgm_cost;
shr_prev_cost_min_search[disp] = sgm_cost;
}
}
};
Expand All @@ -148,7 +108,7 @@ template <typename T>
static __global__ void LaunchKernel(SemiglobalKernel<T> kernel,
int path_descriptor_count,
int max_disparity) {
const int path_descriptor_idx = blockIdx.x;
const int path_descriptor_idx = blockIdx.x;
const int disparity = threadIdx.x;

if (path_descriptor_idx < path_descriptor_count &&
Expand All @@ -170,11 +130,9 @@ void RunSemiglobalAggregationGPU(const torch::Tensor& cost_volume,
cost_volume, left_image,
thrust::raw_pointer_cast(path_descriptors.data()), penalty1,
penalty2, output_cost_volume);
LaunchKernel<<<path_descriptors.size(), max_disparity
//,(max_disparity*2 + 2)*sizeof(scalar_t)
>>>(kernel, path_descriptors.size(), max_disparity);
LaunchKernel<<<path_descriptors.size(), max_disparity,
(2 * max_disparity + 3) * sizeof(scalar_t)>>>(
kernel, path_descriptors.size(), max_disparity);
});
}
} // namespace stereomatch


0 comments on commit bb1cae5

Please sign in to comment.