Perf/wgpu/reduce dim (tracel-ai#943)
* new reduce half working
* surprisingly working
* good on elongated matrices, bad on balanced ones
* working and clean
* autotune not tested, tests fail at non-contiguous
* fixed
* autotune tested
* mean dim
* some fixes
* clippy
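Taken together, the commit adds a shared-memory implementation of dimension-wise reduction (sum_dim and mean_dim) to the wgpu backend, a benchmark comparing it against the existing naive kernel, and autotune support for picking between the two at runtime. The WGSL kernel itself is not captured here; as a rough mental model only (an illustrative CPU analogue, not the committed code; workgroup_sum and its shape are assumptions for this sketch), a shared-memory sum over one dimension works like this:

// CPU analogue of a shared-memory workgroup reduction (illustrative only).
// Assumes workgroup_size is a power of two, as GPU tree reductions typically do.
fn workgroup_sum(values: &[f32], workgroup_size: usize) -> f32 {
    // Phase 1: each "thread" accumulates a strided subset into shared memory.
    let mut shared = vec![0.0f32; workgroup_size];
    for (i, v) in values.iter().enumerate() {
        shared[i % workgroup_size] += *v;
    }
    // Phase 2: tree reduction over shared memory, halving the active half each step.
    let mut stride = workgroup_size / 2;
    while stride > 0 {
        for i in 0..stride {
            shared[i] += shared[i + stride];
        }
        stride /= 2;
    }
    shared[0]
}

fn main() {
    let data: Vec<f32> = (0..8000).map(|x| x as f32).collect();
    let expected: f32 = data.iter().sum();
    // Summation order differs from the sequential sum, so compare approximately.
    assert!((workgroup_sum(&data, 256) - expected).abs() < 1.0);
}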
Showing 16 changed files with 790 additions and 39 deletions.
@@ -57,3 +57,7 @@ serial_test = "2.0.0"
 [[bench]]
 name = "matmul"
 harness = false
+
+[[bench]]
+name = "reduction"
+harness = false
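Because `harness = false`, Cargo compiles this benchmark as a plain binary and runs its `main` directly instead of wrapping it in libtest, which is what lets the new file drive itself through burn's `run_benchmark` helper. It runs on its own with:

cargo bench --bench reduction

By Cargo's target-discovery convention the file lives at benches/reduction.rs; it follows next.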
@@ -0,0 +1,108 @@
use burn_common::benchmark::{run_benchmark, Benchmark};
use burn_tensor::backend::Backend;
use burn_tensor::{Distribution, Shape, Tensor};
use burn_wgpu::kernel::reduce::{init_reduce_output, sum_dim, sum_dim_shared_memory};
use burn_wgpu::WgpuDevice;
use burn_wgpu::{AutoGraphicsApi, Wgpu};
use derive_new::new;
use std::marker::PhantomData;

use burn_wgpu::GraphicsApi;

type WTensor<G, const D: usize> = Tensor<Wgpu<G, f32, i32>, D>;

#[derive(new)]
struct ReduceBenchmark<B: Backend, F, const D: usize> {
    shape: Shape<D>,
    dim: usize,
    num_repeats: usize,
    device: B::Device,
    reduce: PhantomData<F>,
}

trait ReduceFunction<G: GraphicsApi, const D: usize> {
    fn run(input: WTensor<G, D>, dim: usize) -> WTensor<G, D>;
}

impl<F, const D: usize, G> Benchmark for ReduceBenchmark<Wgpu<G, f32, i32>, F, D>
where
    F: ReduceFunction<G, D>,
    G: GraphicsApi,
{
    type Args = WTensor<G, D>;

    fn name(&self) -> String {
        format!(
            "{:?} {:?} dim={:?}",
            std::any::type_name::<F>(),
            self.shape.dims,
            self.dim
        )
    }

    fn num_samples(&self) -> usize {
        10
    }

    fn execute(&self, input: Self::Args) {
        for _ in 0..self.num_repeats {
            F::run(input.clone(), self.dim);
        }
    }

    fn prepare(&self) -> Self::Args {
        WTensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
    }

    fn sync(&self) {
        Wgpu::<G, f32, i32>::sync(&self.device)
    }
}

macro_rules! bench_reduce {
    ($benchmark:ident, $reduce_name:ident, $func:expr) => {
        struct $reduce_name {}
        impl<G: GraphicsApi, const D: usize> ReduceFunction<G, D> for $reduce_name {
            fn run(input: WTensor<G, D>, dim: usize) -> WTensor<G, D> {
                let input = input.into_primitive();
                let output = init_reduce_output(&input, dim);
                Tensor::from_primitive($func(input, output, dim))
            }
        }
        type $benchmark<const D: usize> =
            ReduceBenchmark<Wgpu<AutoGraphicsApi, f32, i32>, $reduce_name, D>;
    };
}

bench_reduce!(SumDimBenchmark, SumDim, sum_dim);
bench_reduce!(
    SumDimSharedMemoryBenchmark,
    SumDimSharedMemory,
    sum_dim_shared_memory
);

#[allow(dead_code)]
/// Runs the benchmarks for wgpu reduction implementations
pub fn bench(device: &WgpuDevice) {
    let num_repeats = 3;
    let shape = Shape::new([50, 8000, 50]);
    let dim = 1;

    macro_rules! run_reduce_benchmark {
        ($benchmark:ident) => {
            run_benchmark($benchmark::new(
                shape.clone(),
                dim,
                num_repeats,
                device.clone(),
            ));
        };
    }

    run_reduce_benchmark!(SumDimSharedMemoryBenchmark);
    run_reduce_benchmark!(SumDimBenchmark);
}

fn main() {
    bench(&WgpuDevice::BestAvailable)
}
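This pits the shared-memory kernel against the existing `sum_dim` on a [50, 8000, 50] tensor reduced along dim 1, i.e. the elongated case the commit message says the new kernel favors. To benchmark a specific adapter rather than letting wgpu choose, `main` can pass a different device (a hypothetical variation; `DiscreteGpu` is one of `WgpuDevice`'s variants):

fn main() {
    // Target the first discrete GPU instead of WgpuDevice::BestAvailable.
    bench(&WgpuDevice::DiscreteGpu(0))
}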
@@ -0,0 +1,22 @@
use crate::{element::WgpuElement, tensor::WgpuTensor};

/// Creates an empty output tensor with reduce output shape
pub fn init_reduce_output<E: WgpuElement, const D: usize>(
    input: &WgpuTensor<E, D>,
    reduce_dim: usize,
) -> WgpuTensor<E, D> {
    let mut shape_out = input.shape.clone();
    shape_out.dims[reduce_dim] = 1;

    // Create output handle
    let num_elems_output = shape_out.num_elements();
    let handle = input
        .client
        .empty(num_elems_output * core::mem::size_of::<E>());
    WgpuTensor::new(
        input.client.clone(),
        input.device.clone(),
        shape_out.clone(),
        handle,
    )
}
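The shape rule here is simply that the reduced dimension collapses to 1 while every other dimension is preserved; the buffer is allocated empty (uninitialized) since the kernel overwrites every element. A minimal standalone illustration of the same computation (plain Rust, no burn types; `reduce_output_shape` is a made-up name for this sketch):

fn reduce_output_shape<const D: usize>(dims: [usize; D], reduce_dim: usize) -> [usize; D] {
    // Collapse the reduced dimension to 1, keep the rest.
    let mut out = dims;
    out[reduce_dim] = 1;
    out
}

fn main() {
    // Reducing dim 1 of the benchmark shape [50, 8000, 50] yields [50, 1, 50].
    assert_eq!(reduce_output_shape([50, 8000, 50], 1), [50, 1, 50]);
}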
@@ -0,0 +1,9 @@
mod base;
mod reduction;
mod reduction_shared_memory;
mod tune;

pub use base::*;
pub use reduction::*;
pub use reduction_shared_memory::*;
pub use tune::*;
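These flat re-exports are what let the benchmark above import `init_reduce_output`, `sum_dim`, and `sum_dim_shared_memory` directly from `burn_wgpu::kernel::reduce`, without naming the submodules where the two kernel implementations and the autotune logic live.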