diff --git a/include/stereomatch/semiglobal.hpp b/include/stereomatch/semiglobal.hpp
index 4dd02cb..06d0571 100644
--- a/include/stereomatch/semiglobal.hpp
+++ b/include/stereomatch/semiglobal.hpp
@@ -35,15 +35,15 @@ struct SGPixelPath {
   Point2<int16_t> direction;
   uint16_t size;
 
-  SGPixelPath(Point2<int16_t> start, Point2<int16_t> end,
-              Point2<int16_t> direction, int16_t size)
+  STM_DEVICE_HOST SGPixelPath(Point2<int16_t> start, Point2<int16_t> end,
+              Point2<int16_t> direction, int16_t size) noexcept
       : start(start), end(end), direction(direction), size(size) {}
 
-  SGPixelPath inverse() const {
+  STM_DEVICE_HOST SGPixelPath inverse() const {
     return SGPixelPath(end, start, Point2<int16_t>(-direction.x, -direction.y),
                        size);
   }
-  static std::vector<SGPixelPath> GeneratePaths(size_t width, size_t height);
+  static std::vector<SGPixelPath> GeneratePaths(size_t width, size_t height) noexcept;
 };
 
 }  // namespace stereomatch
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 34099c0..d2764dc 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -26,12 +26,11 @@ target_include_directories(stereomatch
   $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/../include/stereomatch>
   $<BUILD_INTERFACE:${EIGEN3_INCLUDE_DIR}>
   )
-
 target_link_libraries(stereomatch "${TORCH_LIBRARIES}" "${TORCH_PYTHON_LIBRARY}" )
-
 set_property(TARGET stereomatch PROPERTY CXX_STANDARD 17)
-
 set_target_properties(stereomatch PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+# set_property(TARGET stereomatch PROPERTY CXX_INCLUDE_WHAT_YOU_USE ${iwyu_path})
+
 
 add_library(_cstereomatch SHARED _cstereomatch.cpp)
 set_property(TARGET _cstereomatch PROPERTY CXX_STANDARD 17)
diff --git a/src/semiglobal.cpp b/src/semiglobal.cpp
index a53fd5b..b7bef3c 100644
--- a/src/semiglobal.cpp
+++ b/src/semiglobal.cpp
@@ -24,7 +24,7 @@ struct BorderedBuffer {
   BorderedBuffer(int size, T border_value) noexcept
       : array(size + border_size) {
     for (auto i = 0; i < border_size; ++i) {
-      array[i] = array[size - 1 + i] = border_value;
+      array[i] = array[size + border_size + i] = border_value;
     }
   }
 
@@ -42,7 +42,7 @@ struct BorderedBuffer {
 };
 
 std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,
-                                                    size_t height) {
+                                                    size_t height) noexcept {
   std::vector<SGPixelPath> path_descs;
 
   /**
@@ -57,13 +57,11 @@ std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,
   /**
    * Vertical paths
    */
-
   for (auto i = 0; i < width; i++) {
     path_descs.push_back(SGPixelPath(Point2<int16_t>(i, 0),
                                      Point2<int16_t>(i, height - 1),
                                      Point2<int16_t>(0, 1), height));
   }
-
   /**
    * Diagonal left to right
    */
@@ -120,13 +118,19 @@ std::vector<SGPixelPath> SGPixelPath::GeneratePaths(size_t width,
 
 template <typename scalar_t>
 struct SGMCostOperator {
- public:
+  const torch::TensorAccessor<scalar_t, 3> cost_volume;
+  const torch::TensorAccessor<scalar_t, 2> left_image;
+  torch::TensorAccessor<scalar_t, 3> output_cost_vol;
+  const scalar_t penalty1, penalty2;
+
+  BorderedBuffer<scalar_t, 1> prev_cost, prev_cost_cache;
+
   SGMCostOperator(const torch::TensorAccessor<scalar_t, 3> cost_volume,
-                  const torch::TensorAccessor<scalar_t, 2> intensity_image,
+                  const torch::TensorAccessor<scalar_t, 2> left_image,
                   torch::TensorAccessor<scalar_t, 3> output_cost_vol,
                   scalar_t penalty1, scalar_t penalty2)
       : cost_volume(cost_volume),
-        intensity_image(intensity_image),
+        left_image(left_image),
         output_cost_vol(output_cost_vol),
         penalty1{penalty1},
         penalty2{penalty2},
@@ -135,61 +139,50 @@ struct SGMCostOperator {
         prev_cost_cache(cost_volume.size(2),
                         std::numeric_limits<scalar_t>::infinity()) {}
 
-  void operator()(const SGPixelPath &path) noexcept {
-    scalar_t prev_intensity = 0;
+  void operator()(const SGPixelPath &path_desc) noexcept {
     const auto max_disparity = cost_volume.size(2);
 
-    const auto X = path.start.x;
-    const auto Y = path.start.y;
-    const auto disparities = cost_volume[Y][X];
-    auto output_cost = output_cost_vol[Y][X];
+    auto current_pixel = path_desc.start;
+    const auto cost_volume_acc = cost_volume[current_pixel.y][current_pixel.x];
+    auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x];
 
     for (auto disp = 0; disp < max_disparity; disp++) {
-      const auto cost = disparities[disp];
-
-      output_cost[disp] = cost;
-      prev_cost[disp] = cost;
+      const auto initial_cost = cost_volume_acc[disp];
+      prev_cost[disp] = initial_cost;
+      output_cost_acc[disp] += initial_cost;
     }
 
-    prev_intensity = intensity_image[Y][X];
-
-    auto current_pixel = path.start + path.direction;
-    for (auto i = 0; i < path.size - 1; ++i, current_pixel += path.direction) {
+    scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x];
+    for (auto i = 1; i < path_desc.size; ++i) {
       const auto prev_min_cost =
           *std::min_element(prev_cost.begin(), prev_cost.end());
+      current_pixel += path_desc.direction;
+
+      const auto intensity = left_image[current_pixel.y][current_pixel.x];
 
-      const auto intensity = intensity_image[current_pixel.y][current_pixel.x];
       const auto p2_adjusted =
           std::max(penalty1, penalty2 / std::abs(intensity - prev_intensity));
 
-      const auto disparities = cost_volume[current_pixel.y][current_pixel.x];
-      auto output_cost = output_cost_vol[current_pixel.y][current_pixel.x];
-      for (size_t disp = 0; disp < max_disparity; disp++) {
-        const auto match_cost = disparities[disp];
+      prev_intensity = intensity;
 
+      const auto cost_volume_acc =
+          cost_volume[current_pixel.y][current_pixel.x];
+      auto output_cost_acc = output_cost_vol[current_pixel.y][current_pixel.x];
+      for (size_t disp = 0; disp < max_disparity; disp++) {
+        const auto match_cost = cost_volume_acc[disp];
         const auto sgm_cost =
             match_cost +
             get_min(prev_cost[disp], prev_cost[disp - 1] + penalty1,
                     prev_cost[disp + 1] + penalty1,
                     prev_min_cost + p2_adjusted) -
             prev_min_cost;
-        output_cost[disp] += sgm_cost;
+        output_cost_acc[disp] += sgm_cost;
         prev_cost_cache[disp] = sgm_cost;
       }
 
-      prev_intensity = intensity;
-
       std::swap(prev_cost, prev_cost_cache);
     }
   }
-
- private:
-  const torch::TensorAccessor<scalar_t, 3> cost_volume;
-  const torch::TensorAccessor<scalar_t, 2> intensity_image;
-  torch::TensorAccessor<scalar_t, 3> output_cost_vol;
-  const scalar_t penalty1, penalty2;
-
-  BorderedBuffer<scalar_t, 1> prev_cost, prev_cost_cache;
 };
 
 void RunSemiglobalAggregationGPU(const torch::Tensor &cost_volume,
@@ -221,9 +214,10 @@ void AggregationOps::RunSemiglobal(const torch::Tensor &cost_volume,
           output_cost_volume.accessor<scalar_t, 3>(), scalar_t(penalty1),
           scalar_t(penalty2));
 
-      for (const auto sg_path : aggregation_paths) {
-        sgm_cost_op(sg_path);
-        sgm_cost_op(sg_path.inverse());
+      for (const auto sg_path_desc : aggregation_paths) {
+        sgm_cost_op(sg_path_desc);
+        // TODO: Make the GPU version also run inverse paths
+        // sgm_cost_op(sg_path.inverse());
       }
     });
   }
diff --git a/src/semiglobal.cu b/src/semiglobal.cu
index 3a00505..a9440a5 100644
--- a/src/semiglobal.cu
+++ b/src/semiglobal.cu
@@ -18,32 +18,7 @@
 #include "check.hpp"
 #include "numeric.hpp"
 
-#define SG_MAX_DISP 256
-
 namespace stereomatch {
-template <typename data_t>
-class CUDAArray {
- public:
-  static CUDAArray FromCPU(const data_t* memory, size_t size);
-
-  void size() {}
-
- private:
-  data_t* array_;
-};
-                     
-template <typename scalar_t>
-inline __device__ scalar_t GetCost(scalar_t cost, scalar_t keep_disp_cost,
-                                   scalar_t change_1_disp_cost1,
-                                   scalar_t change_1_disp_cost2,
-                                   scalar_t min_cost_all_disps,
-                                   scalar_t penalty1, scalar_t penalty2) {
-  return cost + get_min(
-      keep_disp_cost,
-      change_1_disp_cost1 + penalty1,
-      change_1_disp_cost2 + penalty2,
-      min_cost_all_disps + penalty2) - min_cost_all_disps;
-}
 
 template <typename scalar_t>
 struct SemiglobalKernel {
@@ -60,86 +35,71 @@ struct SemiglobalKernel {
       : cost_volume(Accessor<kCUDA, scalar_t, 3>::Get(cost_volume)),
         left_image(Accessor<kCUDA, scalar_t, 2>::Get(left_image)),
         path_descriptors(thrust::raw_pointer_cast(path_descriptors)),
-        penalty1(penalty1),
-        penalty2(penalty2),
+        penalty1{penalty1},
+        penalty2{penalty2},
         output_cost_volume(
             Accessor<kCUDA, scalar_t, 3>::Get(output_cost_volume)) {}
 
   __device__ void operator()(int disp, int path) {
-    __shared__ scalar_t min_cost;
-    __shared__ int2 current_pixel;
-
-    __shared__ scalar_t adapted_penalty2;
-    __shared__ scalar_t last_intensity;
-
-    // extern __shared__ scalar_t prev_cost_memory[];
-
     const auto max_disparity = cost_volume.size(2);
 
-    // scalar_t *prev_cost = prev_cost_memory;
-    // scalar_t *prev_cost_min_search = &prev_cost_memory[max_disparity + 2];
-
-    __shared__ scalar_t prev_cost[SG_MAX_DISP + 2];
-    __shared__ scalar_t prev_cost_min_search[SG_MAX_DISP];
+    extern __shared__ __align__(sizeof(float)) uint8_t _shared_mem[];
+    scalar_t* shr_prev_cost = (scalar_t*)_shared_mem;
+    scalar_t* shr_prev_cost_min_search = &shr_prev_cost[max_disparity + 2];
 
     const auto path_desc(path_descriptors[path]);
+    auto current_pixel = cast_point2<int2>(path_desc.start);
 
-    __shared__ int2 current_path;
-    if (disp == 0) {
-      current_pixel = cast_point2<int2>(path_desc.start);
-      // prev_min_cost[0] = NumericLimits<scalar_t>::infinity();
-      last_intensity = left_image[current_pixel.y][current_pixel.x];
-      prev_cost[0] = prev_cost[max_disparity] =
-                     NumericLimits<scalar_t>::infinity();
-    }
-
-    __syncthreads();
-
-    const auto initial_cost = cost_volume[current_pixel.y][current_pixel.x][disp];
-
-    prev_cost[disp + 1] = initial_cost;
-    prev_cost_min_search[disp] = initial_cost;
+    const auto initial_cost =
+        cost_volume[current_pixel.y][current_pixel.x][disp];
+    shr_prev_cost[disp + 1] = initial_cost;
+    shr_prev_cost_min_search[disp] = initial_cost;
     output_cost_volume[current_pixel.y][current_pixel.x][disp] += initial_cost;
 
-    __syncthreads();
+    if (disp == 0) {
+      // Pading borders
+      shr_prev_cost[0] = shr_prev_cost[max_disparity + 1] =
+          NumericLimits<scalar_t>::infinity();
+    }
 
-    for (auto i = 1; i < path_desc.size; i++) {
-      int search_idx = max_disparity >> 1;
-      while (search_idx != 0) {
-        if (disp < search_idx) {
-          prev_cost_min_search[disp] =
-              fminf(prev_cost_min_search[disp], prev_cost_min_search[disp + search_idx]);
+    scalar_t prev_intensity = left_image[current_pixel.y][current_pixel.x];
+    for (auto i = 1; i < path_desc.size; ++i) {
+      __syncthreads();  // Wait writes into of sgm_cost into the search array
+      for (auto s = max_disparity >> 1; s >= 1; s = s >> 1) {
+        if (disp < s) {
+          const auto rhs_idx = s + disp;
+          const auto rhs_cost = shr_prev_cost_min_search[rhs_idx];
+          if (shr_prev_cost_min_search[disp] >= rhs_cost) {
+            shr_prev_cost_min_search[disp] = rhs_cost;
+          }
         }
         __syncthreads();
-        search_idx = search_idx >> 1;
       }
 
-      if (disp == 0) {
-        min_cost = prev_cost_min_search[0];
-        current_pixel.x += path_desc.direction.x;
-        current_pixel.y += path_desc.direction.y;
+      const auto prev_min_cost = shr_prev_cost_min_search[0];
+      current_pixel.x += path_desc.direction.x;
+      current_pixel.y += path_desc.direction.y;
 
-        const auto intensity = left_image[current_pixel.y][current_pixel.x];
+      const auto intensity = left_image[current_pixel.y][current_pixel.x];
+      const auto p2_adjusted =
+          max(penalty1, penalty2 / abs(intensity - prev_intensity));
 
-        adapted_penalty2 = penalty2 / abs(intensity - last_intensity);
-        last_intensity = intensity;
-      }
-
-      __syncthreads();
-
-      
-      const auto current_cost =
-          GetCost(cost_volume[current_pixel.y][current_pixel.x][disp],
-                  prev_cost[disp + 1], prev_cost[disp],
-                  prev_cost[disp + 2], min_cost, penalty1, adapted_penalty2);
+      prev_intensity = intensity;
 
-      __syncthreads();
+      const auto match_cost =
+          cost_volume[current_pixel.y][current_pixel.x][disp];
+      const auto sgm_cost =
+          match_cost +
+          get_min(shr_prev_cost[disp + 1], shr_prev_cost[disp] + penalty1,
+                  shr_prev_cost[disp + 2] + penalty1,
+                  prev_min_cost + p2_adjusted) -
+          prev_min_cost;
 
-      prev_cost[disp + 1] = current_cost;
-      prev_cost_min_search[disp] = current_cost;
+      output_cost_volume[current_pixel.y][current_pixel.x][disp] += sgm_cost;
 
-      output_cost_volume[current_pixel.y][current_pixel.x][disp] += current_cost;
-      __syncthreads();
+      __syncthreads();  // Wait for all threads to read their neighbor costs
+      shr_prev_cost[disp + 1] = sgm_cost;
+      shr_prev_cost_min_search[disp] = sgm_cost;
     }
   }
 };
@@ -148,7 +108,7 @@ template <typename T>
 static __global__ void LaunchKernel(SemiglobalKernel<T> kernel,
                                     int path_descriptor_count,
                                     int max_disparity) {
-  const int path_descriptor_idx = blockIdx.x; 
+  const int path_descriptor_idx = blockIdx.x;
   const int disparity = threadIdx.x;
 
   if (path_descriptor_idx < path_descriptor_count &&
@@ -170,11 +130,9 @@ void RunSemiglobalAggregationGPU(const torch::Tensor& cost_volume,
             cost_volume, left_image,
             thrust::raw_pointer_cast(path_descriptors.data()), penalty1,
             penalty2, output_cost_volume);
-        LaunchKernel<<<path_descriptors.size(), max_disparity
-                       //,(max_disparity*2 + 2)*sizeof(scalar_t)
-                       >>>(kernel, path_descriptors.size(), max_disparity);
+        LaunchKernel<<<path_descriptors.size(), max_disparity,
+                       (2 * max_disparity + 3) * sizeof(scalar_t)>>>(
+            kernel, path_descriptors.size(), max_disparity);
       });
 }
 }  // namespace stereomatch
-
-