feat: module init (tracel-ai#117)
nathanielsimard authored Nov 26, 2022
1 parent 4abc281 commit 46d06f0
Showing 11 changed files with 51 additions and 24 deletions.
5 changes: 5 additions & 0 deletions burn-tch/src/backend.rs
@@ -96,6 +96,11 @@ impl<E: TchElement> Backend for TchBackend<E> {
.uniform_(from.to_f64().unwrap(), to.to_f64().unwrap());
tensor
}
Distribution::Normal(mean, std) => {
let mut tensor = TchTensor::<Self::Elem, D>::empty(shape, device);
tensor.tensor = tensor.tensor.normal(mean, std);
tensor
}
}
}

1 change: 1 addition & 0 deletions burn-tensor/Cargo.toml
@@ -24,6 +24,7 @@ burn-tensor-testgen = { version = "0.3.0", path = "../burn-tensor-testgen", opti
num-traits = "0.2"
derive-new = "0.5"
rand = "0.8"
statrs = "0.16"
half = { version = "1.6", features = ["num-traits"] } # needs to be 1.6 to work with tch

# Autodiff
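The new `statrs` dependency supplies the normal-distribution sampler wired into `burn-tensor/src/tensor/data.rs` below. As a rough illustration of the interplay (assuming statrs 0.16 against the rand 0.8 pinned here; not taken from the diff itself), a statrs distribution can be sampled directly through a rand `Rng`:

```rust
use rand::Rng;
use statrs::distribution::Normal;

fn main() {
    // Normal::new(mean, std) returns an Err for a NaN or non-positive std,
    // which is why the sampler below unwraps it at construction time.
    let normal = Normal::new(0.0, 1.0).unwrap();
    // statrs distributions implement rand's Distribution trait, so any Rng can sample them.
    let sample: f64 = rand::thread_rng().sample(normal);
    println!("sampled {sample}");
}
```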
9 changes: 9 additions & 0 deletions burn-tensor/src/tensor/data.rs
@@ -19,6 +19,7 @@ pub enum Distribution<P> {
Standard,
Bernoulli(f64),
Uniform(P, P),
Normal(f64, f64),
}

#[derive(new)]
@@ -39,6 +40,7 @@ where
Standard(rand::distributions::Standard),
Uniform(rand::distributions::Uniform<P>),
Bernoulli(rand::distributions::Bernoulli),
Normal(statrs::distribution::Normal),
}

impl<'a, P> DistributionSampler<'a, P>
@@ -58,6 +60,9 @@ where
P::zeros(&P::default())
}
}
DistributionSamplerKind::Normal(distribution) => {
self.rng.sample(distribution).to_elem()
}
}
}
}
@@ -78,6 +83,9 @@ where
Distribution::Bernoulli(prob) => DistributionSamplerKind::Bernoulli(
rand::distributions::Bernoulli::new(prob).unwrap(),
),
Distribution::Normal(mean, std) => DistributionSamplerKind::Normal(
statrs::distribution::Normal::new(mean, std).unwrap(),
),
};

DistributionSampler::new(kind, rng)
@@ -93,6 +101,7 @@ where
Distribution::Standard => Distribution::Standard,
Distribution::Uniform(a, b) => Distribution::Uniform(E::from_elem(a), E::from_elem(b)),
Distribution::Bernoulli(prob) => Distribution::Bernoulli(prob),
Distribution::Normal(mean, std) => Distribution::Normal(mean, std),
}
}
}
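With the variant, sampler kind, and conversion arm above in place, callers can ask any backend for normally distributed tensors. A hypothetical usage sketch (not part of the commit; the device-less `Tensor::random` signature is assumed from the calls made elsewhere in this diff):

```rust
use burn_tensor::backend::Backend;
use burn_tensor::{Distribution, Tensor};

// Build a 2x3 tensor whose entries are drawn from N(0.0, 1.0).
fn sample_normal<B: Backend>() -> Tensor<B, 2> {
    Tensor::random([2, 3], Distribution::Normal(0.0, 1.0))
}
```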
1 change: 1 addition & 0 deletions burn/src/module/param/base.rs
@@ -2,6 +2,7 @@ use super::ParamId;
use crate::module::{LoadingError, State, StateNamed};
use crate::tensor::Element;

/// Define a trainable parameter.
#[derive(Debug)]
pub struct Param<T> {
pub(super) id: ParamId,
4 changes: 2 additions & 2 deletions burn/src/nn/dropout.rs
@@ -31,8 +31,8 @@ impl Dropout {
///
/// # Shapes
///
-    /// - input: [..., any]
-    /// - output: [..., any]
+    /// - input: `[..., any]`
+    /// - output: `[..., any]`
pub fn forward<B: Backend, const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
if !B::ad_enabled() || self.prob == 0.0 {
return input;
15 changes: 10 additions & 5 deletions burn/src/nn/embedding.rs
@@ -4,7 +4,7 @@ use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::tensor::backend::Backend;
-use crate::tensor::{Distribution, ElementConversion, Tensor};
+use crate::tensor::{Distribution, Tensor};

/// Configuration to create an [Embedding](Embedding) layer.
#[derive(Config)]
@@ -16,6 +16,11 @@ pub struct EmbeddingConfig {
}

/// Lookup table to store a fixed number of vectors.
///
/// # Params
///
/// - weight: Matrix of shape `[n_embedding, d_model]` initialized from a normal distribution:
/// `N(0, 1)`
#[derive(Module, Debug)]
pub struct Embedding<B: Backend> {
weight: Param<Tensor<B, 2>>,
@@ -24,10 +29,10 @@ pub struct Embedding<B: Backend> {
impl<B: Backend> Embedding<B> {
/// Create the module from the given configuration.
pub fn new(config: &EmbeddingConfig) -> Self {
-        let start = -1.0 / f64::sqrt(config.d_model as f64);
-        let end = 1.0 / f64::sqrt(config.d_model as f64);
-        let distribution = Distribution::Uniform(start.to_elem(), end.to_elem());
-        let weight = Tensor::random([config.n_embedding, config.d_model], distribution);
+        let weight = Tensor::random(
+            [config.n_embedding, config.d_model],
+            Distribution::Normal(0.0, 1.0),
+        );

Self {
weight: Param::new(weight),
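The embedding weight therefore moves from the previous `U(-1/sqrt(d_model), 1/sqrt(d_model))` range to a standard normal. A minimal sketch of the new scheme (the free-standing helper is illustrative, not crate code):

```rust
use burn::module::Param;
use burn::tensor::backend::Backend;
use burn::tensor::{Distribution, Tensor};

// Equivalent of the weight setup in Embedding::new after this commit.
fn init_embedding_weight<B: Backend>(n_embedding: usize, d_model: usize) -> Param<Tensor<B, 2>> {
    Param::new(Tensor::random(
        [n_embedding, d_model],
        Distribution::Normal(0.0, 1.0), // mean 0.0, standard deviation 1.0
    ))
}
```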
4 changes: 2 additions & 2 deletions burn/src/nn/gelu.rs
@@ -15,8 +15,8 @@ impl GELU {
///
/// # Shapes
///
-    /// - input: [..., any]
-    /// - output: [..., any]
+    /// - input: `[..., any]`
+    /// - output: `[..., any]`
pub fn forward<B: Backend, const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
crate::tensor::activation::gelu(&input)
}
4 changes: 2 additions & 2 deletions burn/src/nn/layer_norm.rs
@@ -43,8 +43,8 @@ impl<B: Backend> LayerNorm<B> {
///
/// # Shapes
///
-    /// - input: [..., any, d_model]
-    /// - output: [..., any, d_model]
+    /// - input: `[..., any, d_model]`
+    /// - output: `[..., any, d_model]`
pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
let (var, mean) = input.var_mean_bias(D - 1);

24 changes: 15 additions & 9 deletions burn/src/nn/linear.rs
@@ -4,7 +4,7 @@ use crate::config::Config;
use crate::module::Module;
use crate::module::Param;
use crate::tensor::backend::Backend;
-use crate::tensor::{Distribution, ElementConversion, Shape, Tensor};
+use crate::tensor::{Distribution, ElementConversion, Tensor};
use std::ops::Deref;

/// Configuration to create a [Linear](Linear) layer.
@@ -22,6 +22,14 @@ pub struct LinearConfig {
/// Applies a linear transformation to the input tensor:
///
/// `O = IW + b`
///
/// # Params
///
/// - weight: Matrix of shape `[d_input, d_output]` initialized from a uniform distribution:
/// `U(-k, k)`, where `k = sqrt(1 / d_input)`
///
/// - bias (optional): Vector of size `d_output` initialized from a uniform distribution:
/// `U(-k, k)`, where `k = sqrt(1 / d_input)`
#[derive(Module, Debug)]
pub struct Linear<B: Backend> {
weight: Param<Tensor<B, 2>>,
@@ -31,14 +39,12 @@ pub struct Linear<B: Backend> {
impl<B: Backend> Linear<B> {
/// Create the module from the given configuration.
pub fn new(config: &LinearConfig) -> Self {
-        // Glorot init
-        let start = -1.0 / f64::sqrt(config.d_input as f64);
-        let end = 1.0 / f64::sqrt(config.d_input as f64);
-        let distribution = Distribution::Uniform(start.to_elem(), end.to_elem());
+        let k = f64::sqrt(1.0 / config.d_input as f64);
+        let distribution = Distribution::Uniform((-1.0 * k).to_elem(), k.to_elem());

-        let weight = Tensor::random(Shape::new([config.d_input, config.d_output]), distribution);
+        let weight = Tensor::random([config.d_input, config.d_output], distribution);
         let bias = match config.bias {
-            true => Some(Tensor::zeros(Shape::new([config.d_output]))),
+            true => Some(Tensor::random([config.d_output], distribution)),
false => None,
};

@@ -52,8 +58,8 @@ impl<B: Backend> Linear<B> {
///
/// # Shapes
///
-    /// - input: [..., any, d_input]
-    /// - output: [..., any, d_output]
+    /// - input: `[..., any, d_input]`
+    /// - output: `[..., any, d_output]`
pub fn forward<const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
let output = input.matmul(&self.weight.unsqueeze());

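Despite the removed `// Glorot init` comment, the new bound `k = sqrt(1 / d_input)` is the fan-in-only rule (the same bound PyTorch uses for its Linear layer), not a Glorot bound involving `d_output`. A small sketch of the bound with a worked value (helper name is illustrative, not crate code):

```rust
// k = sqrt(1 / d_input); weight and bias entries are drawn from U(-k, k).
fn init_bound(d_input: usize) -> f64 {
    f64::sqrt(1.0 / d_input as f64)
}

// For d_input = 256: k = sqrt(1/256) = 0.0625, so every entry starts in [-0.0625, 0.0625].
```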
4 changes: 2 additions & 2 deletions burn/src/nn/relu.rs
@@ -16,8 +16,8 @@ impl ReLU {
///
/// # Shapes
///
-    /// - input: [..., any]
-    /// - output: [..., any]
+    /// - input: `[..., any]`
+    /// - output: `[..., any]`
pub fn forward<B: Backend, const D: usize>(&self, input: Tensor<B, D>) -> Tensor<B, D> {
crate::tensor::activation::relu(&input)
}
4 changes: 2 additions & 2 deletions examples/mnist/src/mlp.rs
@@ -48,8 +48,8 @@ impl<B: Backend> Mlp<B> {
///
/// # Shapes
///
-    /// - input: [batch_size, d_model]
-    /// - output: [batch_size, d_model]
+    /// - input: `[batch_size, d_model]`
+    /// - output: `[batch_size, d_model]`
pub fn forward(&self, input: Tensor<B, 2>) -> Tensor<B, 2> {
let mut x = input;
