Skip to content

Commit

Permalink
Feat/wgpu/autotune compute (tracel-ai#906)
Browse files Browse the repository at this point in the history
  • Loading branch information
louisfd authored Oct 29, 2023
1 parent a9567ab commit e2a3329
Show file tree
Hide file tree
Showing 70 changed files with 1,036 additions and 889 deletions.
4 changes: 1 addition & 3 deletions backend-comparison/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,7 @@ wgpu = ["burn/wgpu"]
burn = { path = "../burn" }
derive-new = { workspace = true }
rand = { workspace = true }
burn-tensor = { path = "../burn-tensor", version = "0.11.0", features = [
"benchmark",
] }
burn-common = { path = "../burn-common", version = "0.11.0" }

[dev-dependencies]

Expand Down
34 changes: 15 additions & 19 deletions backend-comparison/benches/binary.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};

pub struct BinaryBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for BinaryBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for BinaryBenchmark<B, D> {
type Args = (Tensor<B, D>, Tensor<B, D>);

fn name(&self) -> String {
Expand All @@ -23,27 +21,25 @@ impl<B: Backend, const D: usize> Benchmark<B> for BinaryBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
let lhs = Tensor::random(self.shape.clone(), Distribution::Default).to_device(device);
let rhs = Tensor::random(self.shape.clone(), Distribution::Default).to_device(device);
fn prepare(&self) -> Self::Args {
let lhs = Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device);
let rhs = Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device);

(lhs, rhs)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
fn bench<B: Backend>(device: &B::Device) {
const D: usize = 3;
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let benchmark = BinaryBenchmark::<B, D> {
shape,
num_repeats,
backend: PhantomData,
};

run_benchmark(benchmark, device)
run_benchmark(BinaryBenchmark::<B, 3> {
shape: [32, 512, 1024].into(),
num_repeats: 10,
device: device.clone(),
})
}

fn main() {
Expand Down
37 changes: 22 additions & 15 deletions backend-comparison/benches/data.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Data, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;

#[derive(new)]
struct ToDataBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for ToDataBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for ToDataBenchmark<B, D> {
type Args = Tensor<B, D>;

fn name(&self) -> String {
Expand All @@ -24,19 +22,23 @@ impl<B: Backend, const D: usize> Benchmark<B> for ToDataBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, device)
fn prepare(&self) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[derive(new)]
struct FromDataBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for FromDataBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for FromDataBenchmark<B, D> {
type Args = (Data<B::FloatElem, D>, B::Device);

fn name(&self) -> String {
Expand All @@ -49,16 +51,20 @@ impl<B: Backend, const D: usize> Benchmark<B> for FromDataBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
fn prepare(&self) -> Self::Args {
(
Data::random(
self.shape.clone(),
Distribution::Default,
&mut rand::thread_rng(),
),
device.clone(),
self.device.clone(),
)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
Expand All @@ -67,11 +73,12 @@ fn bench<B: Backend>(device: &B::Device) {
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let to_benchmark = ToDataBenchmark::<B, D>::new(shape.clone(), num_repeats);
let from_benchmark = FromDataBenchmark::<B, D>::new(shape, num_repeats);
let to_benchmark = ToDataBenchmark::<B, D>::new(shape.clone(), num_repeats, device.clone());
let from_benchmark = FromDataBenchmark::<B, D>::new(shape, num_repeats, device.clone());

run_benchmark(to_benchmark, device);
run_benchmark(from_benchmark, device)
println!("Backend {}", B::name());
run_benchmark(to_benchmark);
run_benchmark(from_benchmark)
}

fn main() {
Expand Down
26 changes: 16 additions & 10 deletions backend-comparison/benches/matmul.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,16 @@
use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;
use std::marker::PhantomData;

#[derive(new)]
struct MatmulBenchmark<B, const D: usize> {
struct MatmulBenchmark<B: Backend, const D: usize> {
shape_lhs: Shape<D>,
shape_rhs: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for MatmulBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for MatmulBenchmark<B, D> {
type Args = (Tensor<B, D>, Tensor<B, D>);

fn name(&self) -> String {
Expand All @@ -31,12 +30,18 @@ impl<B: Backend, const D: usize> Benchmark<B> for MatmulBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
let lhs = Tensor::random_device(self.shape_lhs.clone(), Distribution::Default, device);
let rhs = Tensor::random_device(self.shape_rhs.clone(), Distribution::Default, device);
fn prepare(&self) -> Self::Args {
let lhs =
Tensor::random_device(self.shape_lhs.clone(), Distribution::Default, &self.device);
let rhs =
Tensor::random_device(self.shape_rhs.clone(), Distribution::Default, &self.device);

(lhs, rhs)
}

fn sync(&self) {
B::sync(&self.device)
}
}

#[allow(dead_code)]
Expand All @@ -50,8 +55,9 @@ fn bench<B: Backend>(device: &B::Device) {
let shape_lhs = [batch_size, m, k].into();
let shape_rhs = [batch_size, k, n].into();

let benchmark = MatmulBenchmark::<B, D>::new(shape_lhs, shape_rhs, num_repeats);
run_benchmark(benchmark, device);
let benchmark = MatmulBenchmark::<B, D>::new(shape_lhs, shape_rhs, num_repeats, device.clone());
println!("Backend {}", B::name());
run_benchmark(benchmark);
}

fn main() {
Expand Down
21 changes: 12 additions & 9 deletions backend-comparison/benches/unary.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
use std::marker::PhantomData;

use burn::tensor::{backend::Backend, Distribution, Shape, Tensor};
use burn_tensor::benchmark::{run_benchmark, Benchmark};
use burn_common::benchmark::{run_benchmark, Benchmark};
use derive_new::new;

#[derive(new)]
struct UnaryBenchmark<B: Backend, const D: usize> {
shape: Shape<D>,
num_repeats: usize,
backend: PhantomData<B>,
device: B::Device,
}

impl<B: Backend, const D: usize> Benchmark<B> for UnaryBenchmark<B, D> {
impl<B: Backend, const D: usize> Benchmark for UnaryBenchmark<B, D> {
type Args = Tensor<B, D>;

fn name(&self) -> String {
Expand All @@ -25,8 +23,12 @@ impl<B: Backend, const D: usize> Benchmark<B> for UnaryBenchmark<B, D> {
}
}

fn prepare(&self, device: &B::Device) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, device)
fn prepare(&self) -> Self::Args {
Tensor::random_device(self.shape.clone(), Distribution::Default, &self.device)
}

fn sync(&self) {
B::sync(&self.device)
}
}

Expand All @@ -36,9 +38,10 @@ fn bench<B: Backend>(device: &B::Device) {
let shape: Shape<D> = [32, 512, 1024].into();
let num_repeats = 10;

let benchmark = UnaryBenchmark::<B, D>::new(shape, num_repeats);
let benchmark = UnaryBenchmark::<B, D>::new(shape, num_repeats, device.clone());

run_benchmark(benchmark, device)
println!("Backend {}", B::name());
run_benchmark(benchmark)
}

fn main() {
Expand Down
71 changes: 68 additions & 3 deletions burn-common/src/benchmark.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
use alloc::format;
use alloc::string::String;
use alloc::vec::Vec;
use core::fmt::Display;
use core::time::Duration;

#[cfg(feature = "std")]
use std::time::Instant;

Expand All @@ -17,6 +20,47 @@ impl BenchmarkResult {
sorted.sort();
*sorted.get(sorted.len() / 2).unwrap()
}
pub(crate) fn mean_duration(&self) -> Duration {
self.durations.iter().sum::<Duration>() / self.durations.len() as u32
}
}

// Human-readable report of a benchmark run: sample count, mean, variance,
// median, min and max of the recorded durations.
//
// NOTE(review): assumes `self.durations` is non-empty — `first().unwrap()`,
// `last().unwrap()` and the `/ len as u32` divisions all panic/divide-by-zero
// on an empty sample set; confirm callers guarantee at least one sample.
impl Display for BenchmarkResult {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
let mean = self.mean_duration();
// Population variance of the samples. Each squared deviation (seconds²)
// is carried as a `Duration` so it formats uniformly with the other
// statistics via `{:.3?}` below.
let var = self
.durations
.iter()
.map(|duration| {
let tmp = duration.as_secs_f64() - mean.as_secs_f64();
Duration::from_secs_f64(tmp * tmp)
})
.sum::<Duration>()
/ self.durations.len() as u32;

// Sort a copy to extract order statistics without mutating `self`.
let mut sorted = self.durations.clone();
sorted.sort();

let min = sorted.first().unwrap();
let max = sorted.last().unwrap();
// Upper median for even-length sets (index len / 2).
let median = sorted.get(sorted.len() / 2).unwrap();
let num_sample = self.durations.len();

// Everything from here to the closing quote is one string literal;
// the horizontal bars are part of the rendered report.
f.write_str(
format!(
"
―――――――― Result ―――――――――
Samples {num_sample}
Mean {mean:.3?}
Variance {var:.3?}
Median {median:.3?}
Min {min:.3?}
Max {max:.3?}
―――――――――――――――――――――――――"
)
.as_str(),
)
}
}

/// Benchmark trait.
Expand All @@ -33,17 +77,17 @@ pub trait Benchmark {
/// measuring the execution time.
fn prepare(&self) -> Self::Args;
/// Execute the benchmark and returns the time it took to complete.
fn execute(&mut self, args: Self::Args);
fn execute(&self, args: Self::Args);
/// Number of samples required to have a statistical significance.
fn num_samples(&self) -> usize {
10
}
/// Name of the benchmark.
fn name(&self) -> String;
/// Wait for computations to be over
fn sync(&mut self);
fn sync(&self);
/// Run the benchmark a number of times.
fn run(&mut self) -> BenchmarkResult {
fn run(&self) -> BenchmarkResult {
#[cfg(not(feature = "std"))]
panic!("Attempting to run benchmark in a no-std environment");

Expand Down Expand Up @@ -74,3 +118,24 @@ pub trait Benchmark {
}
}
}

#[cfg(feature = "std")]
/// Runs the given benchmark and prints its results along with run metadata
/// (a millisecond Unix timestamp and the current git commit hash).
///
/// # Panics
///
/// Panics if the system clock is set before the Unix epoch, if the `git`
/// binary cannot be spawned, or if its output is not valid UTF-8.
pub fn run_benchmark<BM>(benchmark: BM)
where
    BM: Benchmark,
{
    // Milliseconds since the Unix epoch, tagging when this run happened.
    let timestamp = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_millis();
    // Fix: the subcommand was misspelled `rev-porse`, which made this
    // `git` invocation fail at runtime. `git rev-parse HEAD` resolves the
    // current commit hash.
    let output = std::process::Command::new("git")
        .args(["rev-parse", "HEAD"])
        .output()
        .unwrap();
    let git_hash = String::from_utf8(output.stdout).unwrap();

    println!("Timestamp: {}", timestamp);
    // Trim the trailing newline that `git` appends to its output.
    println!("Git Hash: {}", git_hash.trim());
    // `BenchmarkResult` implements `Display`, so `run()`'s result is printed
    // directly after the benchmark name.
    println!("Benchmarking - {}{}", benchmark.name(), benchmark.run());
}
1 change: 1 addition & 0 deletions burn-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ storage-bytes = []
burn-common = { path = "../burn-common", version = "0.11.0", default-features = false }
derive-new = { workspace = true }
spin = { workspace = true }
log = { workspace = true }
hashbrown = { workspace = true }

[dev-dependencies]
Expand Down
13 changes: 1 addition & 12 deletions burn-compute/src/channel/base.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,4 @@
use crate::{
server::{ComputeServer, Handle},
tune::AutotuneOperation,
};
use alloc::boxed::Box;
use crate::server::{ComputeServer, Handle};
use alloc::vec::Vec;
use burn_common::reader::Reader;

Expand All @@ -23,11 +19,4 @@ pub trait ComputeChannel<Server: ComputeServer>: Clone + core::fmt::Debug {

/// Wait for the completion of every task in the server.
fn sync(&self);

/// Executes the fastest kernel in the autotune operation, using (cached) runtime benchmarks
fn execute_autotune(
&self,
autotune_kernel: Box<dyn AutotuneOperation<Server>>,
handles: &[&Handle<Server>],
);
}
Loading

0 comments on commit e2a3329

Please sign in to comment.