Skip to content

Commit

Permalink
Feat/wgpu/autotune compute (tracel-ai#906)
Browse files Browse the repository at this point in the history
  • Loading branch information
louisfd authored Oct 29, 2023
1 parent a9567ab commit e2a3329
Show file tree
Hide file tree
Showing 70 changed files with 1,036 additions and 889 deletions.
4 changes: 1 addition & 3 deletions backend-comparison/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ wgpu = ["burn/wgpu"]
burn = { path = "../burn" }
derive-new = { workspace = true }
rand = { workspace = true }
burn-tensor = { path = "../burn-tensor", version = "0.11.0", features = [
"benchmark",
] }
burn-common = { path = "../burn-common", version = "0.11.0" }

[dev-dependencies]

Expand Down
34 changes: 15 additions & 19 deletions backend-comparison/benches/binary.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};

pub struct BinaryBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for BinaryBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for BinaryBenchmark<B, D> {
type Args = (Tensor<B, D>, Tensor<B, D>);

fn name(&self) -> String {
Expand All @@ -23,27 +21,25 @@ impl<B: Backend, const D: usize> Benchmark<B> for BinaryBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
let lhs = Tensor::random(self.shape.clone(), Distribution::Default).to_device(device);
let rhs = Tensor::random(self.shape.clone(), Distribution::Default).to_device(device);
fn prepare(&self) -> Self::Args {
let lhs = Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device);
let rhs = Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device);

(lhs, rhs)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
fn bench<B: Backend>(device: &B::Device) {
const D: usize = 3;
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let benchmark = BinaryBenchmark::<B, D> {
shape,
num_repeats,
backend: PhantomData,
};

run_benchmark(benchmark, device)
run_benchmark(BinaryBenchmark::<B, 3> {
shape: [32, 512, 1024].into(),
num_repeats: 10,
device: device.clone(),
})
}

fn main() {
Expand Down
37 changes: 22 additions & 15 deletions backend-comparison/benches/data.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Data, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;

#[derive(new)]
struct ToDataBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for ToDataBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for ToDataBenchmark<B, D> {
type Args = Tensor<B, D>;

fn name(&self) -> String {
Expand All @@ -24,19 +22,23 @@ impl<B: Backend, const D: usize> Benchmark<B> for ToDataBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, device)
fn prepare(&self) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[derive(new)]
struct FromDataBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for FromDataBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for FromDataBenchmark<B, D> {
type Args = (Data<B::FloatElem, D>, B::Device);

fn name(&self) -> String {
Expand All @@ -49,16 +51,20 @@ impl<B: Backend, const D: usize> Benchmark<B> for FromDataBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
fn prepare(&self) -> Self::Args {
(
Data::random(
self.shape.clone(),
Distribution::Default,
&mut rand::thread_rng(),
),
device.clone(),
self.device.clone(),
)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
Expand All @@ -67,11 +73,12 @@ fn bench<B: Backend>(device: &B::Device) {
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let to_benchmark = ToDataBenchmark::<B, D>::new(shape.clone(), num_repeats);
let from_benchmark = FromDataBenchmark::<B, D>::new(shape, num_repeats);
let to_benchmark = ToDataBenchmark::<B, D>::new(shape.clone(), num_repeats, device.clone());
let from_benchmark = FromDataBenchmark::<B, D>::new(shape, num_repeats, device.clone());

run_benchmark(to_benchmark, device);
run_benchmark(from_benchmark, device)
println!("Backend {}", B::name());
run_benchmark(to_benchmark);
run_benchmark(from_benchmark)
}

fn main() {
Expand Down
26 changes: 16 additions & 10 deletions backend-comparison/benches/matmul.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;
use std::marker::PhantomData;

#[derive(new)]
struct MatmulBenchmark<B, const D: usize> {
struct MatmulBenchmark<B: Backend, const D: usize> {
shape_lhs: Shape<D>,
shape_rhs: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for MatmulBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
type Args = (Tensor<B, D>, Tensor<B, D>);

fn name(&self) -> String {
Expand All @@ -31,12 +30,18 @@ impl<B: Backend, const D: usize> Benchmark<B> for MatmulBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
let lhs = Tensor::random_device(self.shape_lhs.clone(), Distribution::Default, device);
let rhs = Tensor::random_device(self.shape_rhs.clone(), Distribution::Default, device);
fn prepare(&self) -> Self::Args {
let lhs =
Tensor::random_device(self.shape_lhs.clone(), Distribution::Default, &self.device);
let rhs =
Tensor::random_device(self.shape_rhs.clone(), Distribution::Default, &self.device);

(lhs, rhs)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
Expand All @@ -50,8 +55,9 @@ fn bench<B: Backend>(device: &B::Device) {
let shape_lhs = [batch_size, m, k].into();
let shape_rhs = [batch_size, k, n].into();

let benchmark = MatmulBenchmark::<B, D>::new(shape_lhs, shape_rhs, num_repeats);
run_benchmark(benchmark, device);
let benchmark = MatmulBenchmark::<B, D>::new(shape_lhs, shape_rhs, num_repeats, device.clone());
println!("Backend {}", B::name());
run_benchmark(benchmark);
}

fn main() {
Expand Down
21 changes: 12 additions & 9 deletions backend-comparison/benches/unary.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;

#[derive(new)]
struct UnaryBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for UnaryBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for UnaryBenchmark<B, D> {
type Args = Tensor<B, D>;

fn name(&self) -> String {
Expand All @@ -25,8 +23,12 @@ impl<B: Backend, const D: usize> Benchmark<B> for UnaryBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, device)
fn prepare(&self) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
}

fn sync(&self) {
B::sync(&self.device)
}
}

Expand All @@ -36,9 +38,10 @@ fn bench<B: Backend>(device: &B::Device) {
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let benchmark = UnaryBenchmark::<B, D>::new(shape, num_repeats);
let benchmark = UnaryBenchmark::<B, D>::new(shape, num_repeats, device.clone());

run_benchmark(benchmark, device)
println!("Backend {}", B::name());
run_benchmark(benchmark)
}

fn main() {
Expand Down
71 changes: 68 additions & 3 deletions burn-common/src/benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use alloc::format;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt::Display;
use core::time::Duration;

#[cfg(feature = "std")]
use std::time::Instant;

Expand All @@ -17,6 +20,47 @@ impl BenchmarkResult {
sorted.sort();
*sorted.get(sorted.len() / 2).unwrap()
}
pub(crate) fn mean_duration(&self) -> Duration {
self.durations.iter().sum::<Duration>() / self.durations.len() as u32
}
}

// Human-readable report of a benchmark run: sample count, mean, variance,
// median, min and max of the recorded durations.
//
// NOTE(review): assumes `self.durations` is non-empty — `first().unwrap()`,
// `last().unwrap()` and the `/ len as u32` divisions all panic/divide-by-zero
// on an empty sample set; confirm callers guarantee at least one sample.
impl Display for BenchmarkResult {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let mean = self.mean_duration();
// Population variance of the samples. Each squared deviation (seconds²)
// is carried as a `Duration` so it formats uniformly with the other
// statistics via `{:.3?}` below.
let var = self
.durations
.iter()
.map(|duration| {
let tmp = duration.as_secs_f64() - mean.as_secs_f64();
Duration::from_secs_f64(tmp * tmp)
})
.sum::<Duration>()
/ self.durations.len() as u32;

// Sort a copy to extract order statistics without mutating `self`.
let mut sorted = self.durations.clone();
sorted.sort();

let min = sorted.first().unwrap();
let max = sorted.last().unwrap();
// Upper median for even-length sets (index len / 2).
let median = sorted.get(sorted.len() / 2).unwrap();
let num_sample = self.durations.len();

// Everything from here to the closing quote is one string literal;
// the horizontal bars are part of the rendered report.
f.write_str(
format!(
"
―――――――― Result ―――――――――
Samples {num_sample}
Mean {mean:.3?}
Variance {var:.3?}
Median {median:.3?}
Min {min:.3?}
Max {max:.3?}
―――――――――――――――――――――――――"
)
.as_str(),
)
}
}

/// Benchmark trait.
Expand All @@ -33,17 +77,17 @@ pub trait Benchmark {
/// measuring the execution time.
fn prepare(&self) -> Self::Args;
/// Execute the benchmark and returns the time it took to complete.
fn execute(&mut self, args: Self::Args);
fn execute(&self, args: Self::Args);
/// Number of samples required to have a statistical significance.
fn num_samples(&self) -> usize {
10
}
/// Name of the benchmark.
fn name(&self) -> String;
/// Wait for computations to be over
fn sync(&mut self);
fn sync(&self);
/// Run the benchmark a number of times.
fn run(&mut self) -> BenchmarkResult {
fn run(&self) -> BenchmarkResult {
#[cfg(not(feature = "std"))]
panic!("Attempting to run benchmark in a no-std environment");

Expand Down Expand Up @@ -74,3 +118,24 @@ pub trait Benchmark {
}
}
}

#[cfg(feature = "std")]
/// Runs the given benchmark and prints its results along with run metadata
/// (a millisecond Unix timestamp and the current git commit hash).
///
/// # Panics
///
/// Panics if the system clock is set before the Unix epoch, if the `git`
/// binary cannot be spawned, or if its output is not valid UTF-8.
pub fn run_benchmark<BM>(benchmark: BM)
where
    BM: Benchmark,
{
    // Milliseconds since the Unix epoch, tagging when this run happened.
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // Fix: the subcommand was misspelled `rev-porse`, which made this
    // `git` invocation fail at runtime. `git rev-parse HEAD` resolves the
    // current commit hash.
    let output = std::process::Command::new("git")
        .args(["rev-parse", "HEAD"])
        .output()
        .unwrap();
    let git_hash = String::from_utf8(output.stdout).unwrap();

    println!("Timestamp: {}", timestamp);
    // Trim the trailing newline that `git` appends to its output.
    println!("Git Hash: {}", git_hash.trim());
    // `BenchmarkResult` implements `Display`, so `run()`'s result is printed
    // directly after the benchmark name.
    println!("Benchmarking - {}{}", benchmark.name(), benchmark.run());
}
1 change: 1 addition & 0 deletions burn-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ storage-bytes = []
burn-common = { path = "../burn-common", version = "0.11.0", default-features = false }
derive-new = { workspace = true }
spin = { workspace = true }
log = { workspace = true }
hashbrown = { workspace = true }

[dev-dependencies]
Expand Down
13 changes: 1 addition & 12 deletions burn-compute/src/channel/base.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
use crate::{
server::{ComputeServer, Handle},
tune::AutotuneOperation,
};
use alloc::boxed::Box;
use crate::server::{ComputeServer, Handle};
use alloc::vec::Vec;
use burn_common::reader::Reader;

Expand All @@ -23,11 +19,4 @@ pub trait ComputeChannel<Server: ComputeServer>: Clone + core::fmt::Debug {

/// Wait for the completion of every task in the server.
fn sync(&self);

/// Executes the fastest kernel in the autotune operation, using (cached) runtime benchmarks
fn execute_autotune(
&self,
autotune_kernel: Box<dyn AutotuneOperation<Server>>,
handles: &[&Handle<Server>],
);
}
Loading

0 comments on commit e2a3329

Please sign in to comment.