naspert · naspert · Oct 29, 2024 · Feb 10, 2024 · Feb 10, 2024 · Feb 10, 2024
diff --git a/docs/experiment.md b/docs/experiment.md
@@ -119,6 +119,14 @@ Beyond waveforms, this format is used for any tensor in the computation pipeline
 (batch_size, time_steps, n_filters)
 ```
 
+## Modified PyTorch globals and GPU quirks
+
+For various reasons, SpeechBrain modifies some PyTorch global configuration to work around issues or improve execution speed, sometimes depending on GPU configuration.
+We do so when we consider that some modified defaults make more sense given our use cases than PyTorch's defaults. For instance, we very commonly encounter dynamic tensor shapes, which are at odds with certain auto-tuning methods.
+
+These changes are applied in a standardized location, [`quirks.py`](https://github.com/speechbrain/speechbrain/tree/develop/speechbrain/utils/quirks.py). They are logged when starting an experiment.
+
+The `SB_DISABLE_QUIRKS` environment variable lets you disable quirks easily. For instance, to disable TensorFloat32 and re-enable JIT profiling, you would use `SB_DISABLE_QUIRKS=allow_tf32,disable_jit_profiling`.
 
 ## Reproducibility
 

diff --git a/recipes/AudioMNIST/audiomnist_prepare.py b/recipes/AudioMNIST/audiomnist_prepare.py
@@ -16,10 +16,12 @@
 import json
 import math
 import os
+import random
 from functools import partial
 from glob import glob
 from subprocess import list2cmdline
 
+import torch.nn.functional as Ft
 import torchaudio
 from torchaudio import functional as F
 from tqdm.auto import tqdm
@@ -61,6 +63,7 @@ def prepare_audiomnist(
     norm=True,
     highpass=True,
     process_audio=None,
+    pad_output=None,
     skip_prep=False,
 ):
     """Auto-downloads and prepares the AudioMNIST dataset
@@ -106,6 +109,8 @@ def prepare_audiomnist(
     process_audio: callable
         a custom function used to process audio files - instead of
         the standard transform (resample + normalize + trim)
+    pad_output: int
+        the length in samples of the output signal. If None, no padding is applied.
     skip_prep: bool
         whether preparation should be skipped
 
@@ -174,6 +179,7 @@ def prepare_audiomnist(
             trim_threshold=trim_threshold,
             norm=norm,
             highpass=highpass,
+            pad_output=pad_output,
         )
 
     # Get file lists for train/valid/test splits
@@ -224,7 +230,7 @@ def skip(json_files, save_opt, conf):
     """
 
     # Checking csv files
-    skip = any(not os.path.isfile(json_file) for json_file in json_files)
+    skip = all(os.path.isfile(json_file) for json_file in json_files.values())
 
     #  Checking saved options
     if skip is True:
@@ -739,6 +745,7 @@ def process_audio_default(
     src_sample_rate=48000,
     tgt_sample_rate=22050,
     trim_threshold=-30.0,
+    pad_output=None,
 ):
     """Standard audio preprocessing / conversion
 
@@ -758,6 +765,8 @@ def process_audio_default(
         the target sample rate
     trim_threshold: float
         the decibels threshold for trimming the file
+    pad_output: int
+        the length of the output signal (if padding is needed). If None, no padding is applied.
 
     Returns
     -------
@@ -779,6 +788,13 @@ def process_audio_default(
         )
         sig = sig.squeeze(0)
 
+    if pad_output is not None:
+        delta = pad_output - len(sig)
+        offset = random.randint(
+            0, delta
+        )  # if padding, insert blank space of random length at start of signal
+        sig = Ft.pad(sig, (offset, delta - offset), "constant", 0)
+
     # Normalize
     if norm:
         sig = sig / sig.abs().max()

diff --git a/recipes/AudioMNIST/classification/README.md b/recipes/AudioMNIST/classification/README.md
@@ -0,0 +1,75 @@
+# AudioMNIST Dataset classification
+This folder contains recipes for spoken digit recognition with [AudioMNIST Dataset](https://github.com/soerenab/AudioMNIST),
+including sample recipes for the [Learnable gammatone filterbank audio frontend](https://septentrio.uit.no/index.php/nldl/article/view/6279).
+The recipes include the original [AudioNet architecture](https://arxiv.org/abs/1807.03418) and two other versions using a learnable
+Gammatone filter bank as frontend.
+
+# How to run
+To run it, please type:
+
+```
+python train.py hparams/audionet.yaml --data_folder=/path_to_/AudioMNIST  # AudioNet
+# LFB frontends
+python train.py hparams/audionet_lfb.yaml --data_folder=/path_to_/AudioMNIST --seed=1234  # AudioNet with Gammatone LFB
+python train.py hparams/audionet_custom_lfb.yaml --data_folder=/path_to_/AudioMNIST --seed=1234  # customized AudioNet with Gammatone LFB
+```
+
+# Performance summary
+
+[Test accuracy on AudioMNIST split 0]
+| System | Accuracy |
+|---------------------- | ------------ |
+| AudioNet | 94.05% |
+| AudioNet + LFB | 97.03% |
+| AudioNet custom + LFB | 96.70% |
+
+
+# Checkpoints and Training logs
+
+You can find the full experiment folder (i.e., checkpoints, logs, etc) here:
+- AudioNet: https://os.unil.cloud.switch.ch/swift/v1/lts2-speechbrain/AudioMNIST/results/audionet
+- AudioNet + LFB: https://os.unil.cloud.switch.ch/swift/v1/lts2-speechbrain/AudioMNIST/results/audionet_lfb
+- AudioNet custom + LFB: https://os.unil.cloud.switch.ch/swift/v1/lts2-speechbrain/AudioMNIST/results/audionet_custom_lfb
+
+## Notes
+
+- The recipe automatically downloads the AudioMNIST dataset. You only need to specify the path to which you would like to download it.
+
+- The dataset has 5 different splits for training/validation/test samples. Check the yaml recipe for more information.
+
+---------------------------------------------------------------------------------------------------------
+
+## Citing
+
+If you find this recipe useful, please cite:
+
+```bibtex
+@inproceedings{learnablefb,
+      title = {Learnable filter-banks for CNN-based audio applications},
+      author = {Peic Tukuljac, Helena and Ricaud, Benjamin and Aspert, Nicolas and Colbois, Laurent},
+      booktitle = {Proceedings of the Northern Lights Deep Learning Workshop 2022},
+      series = {Proceedings of the Northern Lights Deep Learning Workshop. 3},
+      pages = {9},
+      year = {2022},
+      abstract = {We investigate the design of a convolutional layer where  kernels are parameterized functions. This layer aims at  being the input layer of convolutional neural networks for  audio applications or applications involving time-series.  The kernels are defined as one-dimensional functions having  a band-pass filter shape, with a limited number of  trainable parameters. Building on the literature on this  topic, we confirm that networks having such an input layer  can achieve state-of-the-art accuracy on several audio  classification tasks. We explore the effect of different  parameters on the network accuracy and learning ability.  This approach reduces the number of weights to be trained  and enables larger kernel sizes, an advantage for audio  applications. Furthermore, the learned filters bring  additional interpretability and a better understanding of  the audio properties exploited by the network.},
+      url = {https://septentrio.uit.no/index.php/nldl/article/view/6279},
+      doi = {10.7557/18.6279},
+}
+```
+
+If you use **SpeechBrain**, please cite:
+
+```bibtex
+@misc{speechbrain,
+    title={{SpeechBrain}: A General-Purpose Speech Toolkit},
+    author={Mirco Ravanelli and Titouan Parcollet and Peter Plantinga and Aku Rouhe and Samuele Cornell and Loren Lugosch and Cem Subakan and Nauman Dawalatabad and Abdelwahab Heba and Jianyuan Zhong and Ju-Chieh Chou and Sung-Lin Yeh and Szu-Wei Fu and Chien-Feng Liao and Elena Rastorgueva and François Grondin and William Aris and Hwidong Na and Yan Gao and Renato De Mori and Yoshua Bengio},
+    year={2021},
+    eprint={2106.04624},
+    archivePrefix={arXiv},
+    primaryClass={eess.AS},
+    note={arXiv:2106.04624}
+}
+```
+
+---------------------------------------------------------------------------------------------------------
+
diff --git a/recipes/AudioMNIST/classification/audiomnist_prepare.py b/recipes/AudioMNIST/classification/audiomnist_prepare.py
@@ -0,0 +1 @@
+../audiomnist_prepare.py
diff --git a/recipes/AudioMNIST/classification/hparams/audionet.yaml b/recipes/AudioMNIST/classification/hparams/audionet.yaml
@@ -0,0 +1,110 @@
+# ################################
+# Model: Classification with AudioNet
+# Authors: Nicolas Aspert 2024
+# ################################
+
+# Basic parameters
+seed: 1879
+__set_seed: !apply:torch.manual_seed [!ref <seed>]
+
+number_of_digits: 10
+# use "split[0-4].zip" for the official audiomnist splits, null for default data url in audiomnist_prepare.py
+metadata_repo: https://os.unil.cloud.switch.ch/swift/v1/lts2-speechbrain/AudioMNIST/metadata_split0.zip
+metadata_folder: null
+output_folder: !ref ./results/audionet/<seed>
+save_folder: !ref <output_folder>/save
+train_log: !ref <output_folder>/train_log.txt
+data_save_folder: !ref <data_folder>/audiomnist_prepared
+train_json: !ref <data_save_folder>/train.json
+valid_json: !ref <data_save_folder>/valid.json
+test_json: !ref <data_save_folder>/test.json
+
+# Data files
+data_folder: !PLACEHOLDER  # e.g. /path/to/audiomnist
+
+skip_prep: False
+ckpt_interval_minutes: 15 # save checkpoint every N min
+
+# Preparation Parameters
+data_prepare_norm: True
+data_prepare_trim: False
+data_prepare_trim_threshold: -30.
+data_prepare_sample_rate_src: 48000
+data_prepare_sample_rate_tgt: 8000
+# make all samples 1s long
+data_prepare_pad_output: !ref <data_prepare_sample_rate_tgt>
+
+
+####################### Training Parameters ####################################
+number_of_epochs: 50
+batch_size: 128
+lr: 0.001
+shuffle: True
+
+
+# Feature parameters
+audionet_features: 100
+
+# Number of classes (i.e. different digits)
+out_n_neurons: !ref <number_of_digits>
+
+num_workers: 4
+dataloader_options:
+    batch_size: !ref <batch_size>
+    shuffle: !ref <shuffle>
+    num_workers: !ref <num_workers>
+
+# Functions
+compute_features: !new:speechbrain.lobes.models.AudioNet.AudioNetFrontend
+    out_channels: !ref <audionet_features>
+    in_channels: 1
+    conv_kernel_size: 3
+    conv_dilation: 1
+    activation: !name:torch.nn.ReLU
+
+embedding_model: !new:speechbrain.lobes.models.AudioNet.AudioNet
+    in_channels: !ref <audionet_features>
+    activation: !name:torch.nn.ReLU
+    conv_blocks: 5
+    conv_channels: [64, 128, 128, 128, 128]
+    conv_kernel_sizes: [3, 3, 3, 3, 3]
+    conv_dilations: [1, 1, 1, 1, 1]
+    max_pooling_kernel: [2, 2, 2, 2, 2]
+    max_pooling_stride: [2, 2, 2, 2, 2]
+
+classifier: !new:speechbrain.lobes.models.AudioNet.Classifier
+    input_shape: [null, 16000]
+    activation: null
+    lin_blocks: 2
+    lin_neurons: [1024, 512]
+    out_neurons: !ref <out_n_neurons>
+
+epoch_counter: !new:speechbrain.utils.epoch_loop.EpochCounter
+    limit: !ref <number_of_epochs>
+
+
+modules:
+    compute_features: !ref <compute_features>
+    embedding_model: !ref <embedding_model>
+    classifier: !ref <classifier>
+
+# Cost + optimization
+compute_cost: !name:speechbrain.nnet.losses.nll_loss
+
+opt_class: !name:torch.optim.Adam
+    lr: !ref <lr>
+
+# Logging + checkpoints
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+    save_file: !ref <train_log>
+
+error_stats: !name:speechbrain.utils.metric_stats.MetricStats
+    metric: !name:speechbrain.nnet.losses.classification_error
+        reduction: batch
+
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+    checkpoints_dir: !ref <save_folder>
+    recoverables:
+        embedding_model: !ref <embedding_model>
+        classifier: !ref <classifier>
+        counter: !ref <epoch_counter>