From 7661291737e082b9310b7c3fdf5200eed8fab6d3 Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Sat, 17 Jun 2023 21:59:26 +0200
Subject: [PATCH 01/21] Hparams + Loading

---
 crates/models/falcon/Cargo.toml |  13 ++
 crates/models/falcon/src/lib.rs | 370 ++++++++++++++++++++++++++++++++
 2 files changed, 383 insertions(+)
 create mode 100644 crates/models/falcon/Cargo.toml
 create mode 100644 crates/models/falcon/src/lib.rs

diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml
new file mode 100644
index 00000000..0c9cdbc8
--- /dev/null
+++ b/crates/models/falcon/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "llm-falcon"
+version = "0.2.0-dev"
+license = { workspace = true }
+repository = { workspace = true }
+description = "An implementation of tiiuae falcon model for the `llm` ecosystem."
+edition = "2021"
+readme = "../../../README.md"
+
+[dependencies]
+llm-base = { path = "../../llm-base", version = "0.2.0-dev" }
+
+bytemuck = { workspace = true }
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
new file mode 100644
index 00000000..c371a147
--- /dev/null
+++ b/crates/models/falcon/src/lib.rs
@@ -0,0 +1,370 @@
+//! An implementation of [tiiuae](https://huggingface.co/tiiuae)'s [falcon] model for the `llm` ecosystem.
+#![deny(missing_docs)]
+
+use ggml::Tensor;
+use llm_base::{
+    ggml,
+    model::{common, HyperparametersWriteError},
+    util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel,
+    LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary,
+};
+
+/// The falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae/falcon-40b)
+///
+/// # Safety
+/// This implements [Send] and [Sync] as it is immutable after construction.
+pub struct Falcon {
+    // the context size ("memory") the model should use when evaluating a prompt
+    context_size: usize,
+
+    hyperparameters: Hyperparameters,
+
+    vocabulary: Vocabulary,
+
+    // model-global weights
+    // weighted token embeddings
+    tok_embeddings: Tensor,
+    output_norm: Tensor,
+    output_norm_b: Tensor,
+    lm_head: Tensor,
+
+    // weights for the model
+    layers: Vec<Layer>,
+
+    // must be kept alive for the model
+    _context: ggml::Context,
+    _mmap: Option<Mmap>,
+}
+
+unsafe impl Send for Falcon {}
+unsafe impl Sync for Falcon {}
+
+impl KnownModel for Falcon {
+    type Hyperparameters = Hyperparameters;
+
+    fn new(
+        hyperparameters: Self::Hyperparameters,
+        params: ModelParameters,
+        vocabulary: Vocabulary,
+        tensor_loader: impl llm_base::TensorLoader<LoadError>,
+    ) -> Result<Self, LoadError> {
+        let mut tl = tensor_loader;
+
+        // model-global weights
+        let tok_embeddings = tl.load("transformer.word_embeddings.weight")?;
+        let output_norm = tl.load("transformer.ln_f.weight")?;
+        let output_norm_b = tl.load("transformer.ln_f.bias")?;
+        let lm_head = tl.load("lm_head.weight")?;
+
+        let mut layers = Vec::new();
+        for i in 0..hyperparameters.n_layer {
+            let layer = Layer {
+                attention_norm: tl.load(&format!("transformer.h.{i}.input_layernorm.weight"))?,
+                attention_norm_b: tl.load(&format!("transformer.h.{i}.input_layernorm.bias"))?,
+
+                query_key_value: tl.load(&format!(
+                    "transformer.h.{i}.self_attention.query_key_value.weight"
+                ))?,
+                wo: tl.load(&format!("transformer.h.{i}.self_attention.dense.weight"))?,
+
+                ffn_up: tl.load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))?,
+                ffn_down: tl.load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))?,
+            };
+
+            layers.push(layer);
+        }
+
+        let (_context, _, _mmap) = tl.finish();
+
+        let ModelParameters { context_size, .. } = params;
+
+        Ok(Falcon {
+            hyperparameters,
+            context_size,
+            vocabulary,
+            tok_embeddings,
+            output_norm,
+            output_norm_b,
+            lm_head,
+            layers,
+            _context,
+            _mmap,
+        })
+    }
+
+    fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
+        InferenceSession::new(
+            config,
+            self.context_size,
+            self.hyperparameters.n_layer,
+            self.hyperparameters.n_embd,
+            self.hyperparameters.n_vocab,
+        )
+    }
+
+    fn evaluate(
+        &self,
+        session: &mut InferenceSession,
+        params: &InferenceParameters,
+        input_tokens: &[TokenId],
+        output_request: &mut OutputRequest,
+    ) {
+        let input_len = input_tokens.len();
+        let session_len = session.n_past;
+        let num_threads = params.n_threads;
+        let ctx_size = self.context_size;
+
+        let Hyperparameters {
+            n_embd,
+            n_head,
+            n_vocab,
+            n_layer,
+            ..
+ } = self.hyperparameters; + + let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); + + let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + + let f32_size = std::mem::size_of::(); + + let memory_k = &session.memory_k; + let memory_k_size = memory_k.element_size(); + + let memory_v = &session.memory_v; + let memory_v_size = memory_v.element_size(); + + let mut gf = ggml::ComputationGraph::new(num_threads); + // for il in 0..n_layer { + // // attention uses first scratch buffer + // ctx0.use_scratch(Some(&mut session.scratch[0])); + + // let mut current = ctx0.op_norm(&input_layer); + // current = ctx0.op_mul( + // &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), + // ¤t, + // ); + + // current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); + + // let nb = current.get_nb()[1]; + // let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); + // let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); + // let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); + + // let k = ctx0.op_view_1d( + // memory_k, + // input_len * n_embd, + // (memory_k_size * n_embd) * (il * ctx_size + session_len), + // ); + // let v = ctx0.op_view_1d( + // memory_v, + // input_len * n_embd, + // (memory_v_size * n_embd) * (il * ctx_size + session_len), + // ); + + // gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + // gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + // let q = ctx0.op_permute( + // &ctx0.op_cpy( + // &qcur, + // &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), + // ), + // (0, 2, 1, 3), + // ); + + // let bigk = ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // memory_k, + // (session_len + input_len) * n_embd, + // il * ctx_size * memory_k_size * n_embd, + // ), + // n_embd / n_head, + // n_head, + // session_len + input_len, + // ), + // (0, 2, 1, 3), + // ); + + // let kq = ctx0.op_mul_mat(&bigk, &q); + // let kq_scaled = ctx0.op_scale( + // &kq, + // &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + // ); + // let kq_scaled_alibi = ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); + // let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); + // let kq_softmax = ctx0.op_soft_max(&kq_masked); + + // let v_trans = ctx0.op_cpy( + // &ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // &session.memory_v, + // (session_len + input_len) * n_embd, + // il * ctx_size * memory_v_size * n_embd, + // ), + // n_embd / n_head, + // n_head, + // session_len + input_len, + // ), + // (1, 2, 0, 3), + // ), + // &ctx0.new_tensor_3d( + // session.memory_v.get_type(), + // session_len + input_len, + // n_embd / n_head, + // n_head, + // ), + // ); + + // let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); + // let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + + // current = ctx0.op_cpy( + // &kqv_merged, + // &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + // ); + // // projection + // current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); + + // input_layer = ctx0.op_add(&input_layer, ¤t); + + // // feed forward uses second scratch buffer + // ctx0.use_scratch(Some(&mut session.scratch[1])); + + // current = ctx0.op_norm(&input_layer); + // current = ctx0.op_mul( + // &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), + // ¤t, + // ); + + // current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); + + // current = ctx0.op_gelu(¤t); + + // // 
projection + // current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); + + // input_layer = ctx0.op_add(&input_layer, ¤t); + // } + + // //use scratch buffer 0 for the rest + // ctx0.use_scratch(Some(&mut session.scratch[0])); + + // // norm + // input_layer = ctx0.op_norm(&input_layer); + // input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + + // let embeddings_tensor: ggml::Tensor = input_layer.share(); + + // // disable scratch buffer for last layer + // ctx0.use_scratch(None); + // // output embedding weight tied to input embedding + // input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); + + // // run the computation + // gf.build_forward_expand(&input_layer); + // ctx0.graph_compute(&mut gf); + + // // finish evaluation + // common::read_last_token(session, &input_layer, n_vocab, input_len); + // common::extract_logits(output_request, &input_layer, n_vocab, input_len); + // common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); + // common::update_session(session, &ctx0, input_tokens.len(), input_len); + } + + /// Returns the vocabulary used by this model. + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn context_size(&self) -> usize { + self.context_size + } + + fn bot_token_id(&self) -> Option { + self.vocabulary.id("<|padding|>".as_bytes()) + } + + fn eot_token_id(&self) -> TokenId { + self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + } + + fn quantize_tensors() -> Vec { + vec![Regex::new(".*weight").unwrap()] + } + + fn skip_quantize_tensors() -> Vec { + vec![] + } +} + +/// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +#[derive(Debug, Default, PartialEq, Clone, Copy)] +pub struct Hyperparameters { + /// Size of the model's vocabulary + n_vocab: usize, + /// Maximum sequence length + n_ctx: usize, + /// Size of the model's embedding layer + n_embd: usize, + /// n_heads + n_head: usize, + /// Number of layers in the model + n_layer: usize, + /// file_type + file_type: FileType, +} + +impl llm_base::Hyperparameters for Hyperparameters { + fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { + let hyperparameters = Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_ctx: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + file_type: util::read_filetype(reader)?, + }; + + Ok(hyperparameters) + } + + fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { + util::write_i32(writer, self.n_vocab.try_into()?)?; + util::write_i32(writer, self.n_embd.try_into()?)?; + util::write_i32(writer, self.n_embd.try_into()?)?; + util::write_i32(writer, self.n_head.try_into()?)?; + util::write_i32(writer, self.n_layer.try_into()?)?; + util::write_i32(writer, self.file_type.into())?; + Ok(()) + } + + fn n_vocabulary(&self) -> usize { + self.n_vocab + } + + fn file_type(&self) -> Option { + Some(self.file_type) + } + + fn file_type_mut(&mut self) -> Option<&mut FileType> { + Some(&mut self.file_type) + } +} + +struct Layer { + // normalization + attention_norm: Tensor, + attention_norm_b: Tensor, + + // attention + query_key_value: Tensor, + wo: Tensor, + + // ff + ffn_up: Tensor, + ffn_down: Tensor, +} From 5eec60f8f79e3035f0d6ec12e117cd4a67900d37 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:01:31 
+0200 Subject: [PATCH 02/21] Added eval --- crates/models/falcon/src/lib.rs | 318 ++++++++++++++++++-------------- 1 file changed, 180 insertions(+), 138 deletions(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index c371a147..0149228a 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -115,6 +115,7 @@ impl KnownModel for Falcon { let ctx_size = self.context_size; let Hyperparameters { + n_ctx, n_embd, n_head, n_vocab, @@ -122,9 +123,18 @@ impl KnownModel for Falcon { .. } = self.hyperparameters; + let head_dim = n_embd / n_head; + let N = input_len; + let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + let mut repeat_dummy = ctx0.new_tensor_3d( + input_layer.get_type(), + head_dim, + input_len + session_len, + n_head, + ); let f32_size = std::mem::size_of::(); @@ -135,144 +145,176 @@ impl KnownModel for Falcon { let memory_v_size = memory_v.element_size(); let mut gf = ggml::ComputationGraph::new(num_threads); - // for il in 0..n_layer { - // // attention uses first scratch buffer - // ctx0.use_scratch(Some(&mut session.scratch[0])); - - // let mut current = ctx0.op_norm(&input_layer); - // current = ctx0.op_mul( - // &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), - // ¤t, - // ); - - // current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); - - // let nb = current.get_nb()[1]; - // let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); - // let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); - // let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); - - // let k = ctx0.op_view_1d( - // memory_k, - // input_len * n_embd, - // (memory_k_size * n_embd) * (il * ctx_size + session_len), - // ); - // let v = ctx0.op_view_1d( - // memory_v, - // input_len * n_embd, - // (memory_v_size * n_embd) * (il * ctx_size + session_len), - // ); - - // gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - // gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - // let q = ctx0.op_permute( - // &ctx0.op_cpy( - // &qcur, - // &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - // ), - // (0, 2, 1, 3), - // ); - - // let bigk = ctx0.op_permute( - // &ctx0.op_reshape_3d( - // &ctx0.op_view_1d( - // memory_k, - // (session_len + input_len) * n_embd, - // il * ctx_size * memory_k_size * n_embd, - // ), - // n_embd / n_head, - // n_head, - // session_len + input_len, - // ), - // (0, 2, 1, 3), - // ); - - // let kq = ctx0.op_mul_mat(&bigk, &q); - // let kq_scaled = ctx0.op_scale( - // &kq, - // &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - // ); - // let kq_scaled_alibi = ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); - // let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); - // let kq_softmax = ctx0.op_soft_max(&kq_masked); - - // let v_trans = ctx0.op_cpy( - // &ctx0.op_permute( - // &ctx0.op_reshape_3d( - // &ctx0.op_view_1d( - // &session.memory_v, - // (session_len + input_len) * n_embd, - // il * ctx_size * memory_v_size * n_embd, - // ), - // n_embd / n_head, - // n_head, - // session_len + input_len, - // ), - // (1, 2, 0, 3), - // ), - // &ctx0.new_tensor_3d( - // session.memory_v.get_type(), - // session_len + input_len, - // n_embd / n_head, - // n_head, - // ), - // ); - - // let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - // let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - 
- // current = ctx0.op_cpy( - // &kqv_merged, - // &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - // ); - // // projection - // current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); - - // input_layer = ctx0.op_add(&input_layer, ¤t); - - // // feed forward uses second scratch buffer - // ctx0.use_scratch(Some(&mut session.scratch[1])); - - // current = ctx0.op_norm(&input_layer); - // current = ctx0.op_mul( - // &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), - // ¤t, - // ); - - // current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); - - // current = ctx0.op_gelu(¤t); - - // // projection - // current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); - - // input_layer = ctx0.op_add(&input_layer, ¤t); - // } - - // //use scratch buffer 0 for the rest - // ctx0.use_scratch(Some(&mut session.scratch[0])); - - // // norm - // input_layer = ctx0.op_norm(&input_layer); - // input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - - // let embeddings_tensor: ggml::Tensor = input_layer.share(); - - // // disable scratch buffer for last layer - // ctx0.use_scratch(None); - // // output embedding weight tied to input embedding - // input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); - - // // run the computation - // gf.build_forward_expand(&input_layer); - // ctx0.graph_compute(&mut gf); - - // // finish evaluation - // common::read_last_token(session, &input_layer, n_vocab, input_len); - // common::extract_logits(output_request, &input_layer, n_vocab, input_len); - // common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - // common::update_session(session, &ctx0, input_tokens.len(), input_len); + + let mut current: Tensor; + let mut layernorm_output: Tensor; + + for il in 0..n_layer { + // attention uses first scratch buffer + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // self-attention + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, + ), + &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), + ); + + layernorm_output = current.share(); + + // compute QKV + current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + + let fused_qkv_row_nb = (n_embd + 2 * (n_embd / n_head)) * f32_size; + + let mut qcur = ctx0.op_view_3d( + ¤t, + (head_dim, n_head, N), + (head_dim * f32_size, fused_qkv_row_nb), + 0, + ); + + let mut kcur = ctx0.op_view_3d( + ¤t, + (head_dim, 1, N), + (head_dim * f32_size, fused_qkv_row_nb), + n_embd * f32_size, + ); + + let vcur = ctx0.op_view_3d( + ¤t, + (head_dim, 1, N), + (head_dim * f32_size, fused_qkv_row_nb), + (n_embd + head_dim) * f32_size, + ); + + // using mode = 2 for neox mode + qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + kcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + + // store key and value to memory + + let k = ctx0.op_view_1d( + &memory_k, + N * head_dim, + (memory_k_size * head_dim) * (il * n_ctx + session_len), + ); + let v = ctx0.op_view_1d( + &memory_v, + N * head_dim, + (memory_k_size * head_dim) * (il * n_ctx + session_len), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let bigq = ctx0.op_permute(&qcur, (0, 2, 1, 3)); + + let mut bigk = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &memory_k, + (session_len + N) * head_dim, + il * 
n_ctx * memory_k_size * head_dim, + ), + head_dim, + 1, + session_len + N, + ), + (0, 2, 1, 3), + ); + // K * Q + bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy)); + let big_kq = ctx0.op_mul(&bigk, &bigq); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let big_kq_scaled = ctx0.op_scale_inplace( + &big_kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len); + + let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked); + + let mut bigv = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &memory_v, + (session_len + N) * head_dim, + il * n_ctx * memory_v_size * head_dim, + ), + head_dim, + 1, + session_len + N, + ), + (0, 2, 1, 3), + ); + bigv = ctx0.op_cont(&ctx0.op_transpose(&ctx0.op_repeat(&bigv, &repeat_dummy))); + + // KQV = transpose(V) * KQ_soft_max + let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax); + // KQV_merged = KQV.permute(0, 2, 1, 3) + let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3)); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &big_kqv_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N), + ); + + // projection + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + + // feed forward uses second scratch buffer + ctx0.use_scratch(Some(&mut session.scratch[1])); + + let inpFF = layernorm_output.share(); + let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N)); + + current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inpFF); + current = ctx0.op_gelu(¤t); + current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); + + current = ctx0.op_add(¤t, &attn_out); + current = ctx0.op_add(¤t, &input_layer); + + input_layer = current.share(); + } + + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // norm + input_layer = ctx0.op_norm(&input_layer); + + input_layer = ctx0.op_add( + &ctx0.op_mul( + &ctx0.op_repeat(&self.output_norm, &input_layer), + &input_layer, + ), + &ctx0.op_repeat(&self.output_norm_b, &input_layer), + ); + + let embeddings_tensor: ggml::Tensor = input_layer.share(); + + ctx0.use_scratch(None); + + // lm_head + input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // finish evaluation + common::read_last_token(session, &input_layer, n_vocab, input_len); + common::extract_logits(output_request, &input_layer, n_vocab, input_len); + common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); + common::update_session(session, &ctx0, input_tokens.len(), input_len); } /// Returns the vocabulary used by this model. 
From 41bde927e8074b5d2277411b657949ebc5a9128c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:04:20 +0200 Subject: [PATCH 03/21] Naming + warnings --- crates/models/falcon/src/lib.rs | 43 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 0149228a..eca6ee7b 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -115,7 +115,6 @@ impl KnownModel for Falcon { let ctx_size = self.context_size; let Hyperparameters { - n_ctx, n_embd, n_head, n_vocab, @@ -124,12 +123,12 @@ impl KnownModel for Falcon { } = self.hyperparameters; let head_dim = n_embd / n_head; - let N = input_len; + let n = input_len; let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - let mut repeat_dummy = ctx0.new_tensor_3d( + let repeat_dummy = ctx0.new_tensor_3d( input_layer.get_type(), head_dim, input_len + session_len, @@ -154,7 +153,7 @@ impl KnownModel for Falcon { ctx0.use_scratch(Some(&mut session.scratch[0])); // self-attention - let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_norm(&input_layer); current = ctx0.op_add( &ctx0.op_mul( &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), @@ -172,40 +171,40 @@ impl KnownModel for Falcon { let mut qcur = ctx0.op_view_3d( ¤t, - (head_dim, n_head, N), + (head_dim, n_head, n), (head_dim * f32_size, fused_qkv_row_nb), 0, ); let mut kcur = ctx0.op_view_3d( ¤t, - (head_dim, 1, N), + (head_dim, 1, n), (head_dim * f32_size, fused_qkv_row_nb), n_embd * f32_size, ); let vcur = ctx0.op_view_3d( ¤t, - (head_dim, 1, N), + (head_dim, 1, n), (head_dim * f32_size, fused_qkv_row_nb), (n_embd + head_dim) * f32_size, ); // using mode = 2 for neox mode qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); - kcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2); // store key and value to memory let k = ctx0.op_view_1d( &memory_k, - N * head_dim, - (memory_k_size * head_dim) * (il * n_ctx + session_len), + n * head_dim, + (memory_k_size * head_dim) * (il * ctx_size + session_len), ); let v = ctx0.op_view_1d( &memory_v, - N * head_dim, - (memory_k_size * head_dim) * (il * n_ctx + session_len), + n * head_dim, + (memory_k_size * head_dim) * (il * ctx_size + session_len), ); gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); @@ -218,12 +217,12 @@ impl KnownModel for Falcon { &ctx0.op_reshape_3d( &ctx0.op_view_1d( &memory_k, - (session_len + N) * head_dim, - il * n_ctx * memory_k_size * head_dim, + (session_len + n) * head_dim, + il * ctx_size * memory_k_size * head_dim, ), head_dim, 1, - session_len + N, + session_len + n, ), (0, 2, 1, 3), ); @@ -245,12 +244,12 @@ impl KnownModel for Falcon { &ctx0.op_reshape_3d( &ctx0.op_view_1d( &memory_v, - (session_len + N) * head_dim, - il * n_ctx * memory_v_size * head_dim, + (session_len + n) * head_dim, + il * ctx_size * memory_v_size * head_dim, ), head_dim, 1, - session_len + N, + session_len + n, ), (0, 2, 1, 3), ); @@ -264,7 +263,7 @@ impl KnownModel for Falcon { // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &big_kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N), + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), ); // projection @@ -273,10 +272,10 @@ impl KnownModel for Falcon { // feed forward uses second 
scratch buffer ctx0.use_scratch(Some(&mut session.scratch[1])); - let inpFF = layernorm_output.share(); - let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N)); + let inp_ff = layernorm_output.share(); + let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); - current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inpFF); + current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inp_ff); current = ctx0.op_gelu(¤t); current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); From c2cf35eabaf6d549d218b2dcd6dbdf58611f9c7e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:29:58 +0200 Subject: [PATCH 04/21] Add Falcon to CLI --- binaries/llm-cli/src/cli_args.rs | 6 ++++++ binaries/llm-cli/src/main.rs | 1 + crates/llm/Cargo.toml | 4 +++- crates/llm/src/lib.rs | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 2b5ad199..efedfd87 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -44,6 +44,12 @@ pub enum Args { #[command(subcommand)] args: BaseArgs, }, + /// Use a Falcon model + #[clap(id = "falcon")] + Falcon { + #[command(subcommand)] + args: BaseArgs, + }, } #[derive(Subcommand, Debug)] diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 5c4a4a7a..0329dc18 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -33,6 +33,7 @@ fn main() -> Result<()> { Args::GptJ { args } => handle_args::(args), Args::GptNeoX { args } => handle_args::(args), Args::Mpt { args } => handle_args::(args), + Args::Falcon { args } => handle_args::(args), } } diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index 05a2bdae..108252d4 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -15,6 +15,7 @@ llm-gptj = { path = "../models/gptj", optional = true, version = "0.2.0-dev" } llm-bloom = { path = "../models/bloom", optional = true, version = "0.2.0-dev" } llm-gptneox = { path = "../models/gptneox", optional = true, version = "0.2.0-dev" } llm-mpt = { path = "../models/mpt", optional = true, version = "0.2.0-dev" } +llm-falcon = { path = "../models/falcon", optional = true, version = "0.2.0-dev" } serde = { workspace = true } @@ -28,10 +29,11 @@ serde_json = { workspace = true } clap = { workspace = true } [features] -default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"] +default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"] llama = ["dep:llm-llama"] gpt2 = ["dep:llm-gpt2"] gptj = ["dep:llm-gptj"] bloom = ["dep:llm-bloom"] gptneox = ["dep:llm-gptneox"] mpt = ["dep:llm-mpt"] +falcon = ["dep:llm-falcon"] diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 13d308a0..d40f37b7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,6 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) +//! - [Falcon](llm_falcon) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. 
@@ -101,6 +102,8 @@ pub mod models { pub use llm_llama::{self as llama, Llama}; #[cfg(feature = "mpt")] pub use llm_mpt::{self as mpt, Mpt}; + #[cfg(feature = "falcon")] + pub use llm_falcon::{self as falcon, Falcon}; } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)] @@ -124,6 +127,9 @@ pub enum ModelArchitecture { #[cfg(feature = "mpt")] /// [MPT](llm_mpt) Mpt, + #[cfg(feature = "falcon")] + /// [Falcon](llm_falcon) + Falcon, } impl ModelArchitecture { @@ -141,6 +147,8 @@ impl ModelArchitecture { Self::Llama, #[cfg(feature = "mpt")] Self::Mpt, + #[cfg(feature = "falcon")] + Self::Falcon, ]; } @@ -184,6 +192,8 @@ impl FromStr for ModelArchitecture { "llama" => Ok(Llama), #[cfg(feature = "mpt")] "mpt" => Ok(Mpt), + #[cfg(feature = "falcon")] + "falcon" => Ok(Falcon), _ => Err(UnsupportedModelArchitecture(format!( "{s} is not a supported model architecture" @@ -209,6 +219,8 @@ impl Display for ModelArchitecture { Llama => write!(f, "LLaMA"), #[cfg(feature = "mpt")] Mpt => write!(f, "MPT"), + #[cfg(feature = "falcon")] + Falcon => write!(f, "Falcon"), } } } @@ -263,6 +275,9 @@ pub fn load_dynamic( } #[cfg(feature = "mpt")] Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, + #[cfg(feature = "falcon")] + Falcon => load_model::(path, vocabulary_source, params, load_progress_callback)?, + }; Ok(model) From df40cc156d6025a4f9aef178f138bbb46cd98c92 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:05:45 +0200 Subject: [PATCH 05/21] Update crates/models/falcon/Cargo.toml Co-authored-by: Dan Forbes --- crates/models/falcon/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml index 0c9cdbc8..e71c261c 100644 --- a/crates/models/falcon/Cargo.toml +++ b/crates/models/falcon/Cargo.toml @@ -3,7 +3,7 @@ name = "llm-falcon" version = "0.2.0-dev" license = { workspace = true } repository = { workspace = true } -description = "An implementation of tiiuae falcon model for the `llm` ecosystem." +description = "An implementation of Falcon for the `llm` ecosystem." edition = "2021" readme = "../../../README.md" From b63070536794bb64cc050833b8d785392bce0c55 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:05:53 +0200 Subject: [PATCH 06/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index eca6ee7b..55e351a9 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -1,4 +1,4 @@ -//! An implementation of [tiiuae](https://huggingface.co/tiiuae)'s [falcon] model for the `llm` ecosystem. +//! An implementation of [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. 
#![deny(missing_docs)] use ggml::Tensor; From 6f628bec0442ff4d64b12fd784972ace43dd997c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:06:17 +0200 Subject: [PATCH 07/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 55e351a9..801a2830 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -9,7 +9,7 @@ use llm_base::{ LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, }; -/// The falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae/falcon-40b) +/// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) /// /// # Safety /// This implements [Send] and [Sync] as it is immutable after construction. From 611d2455627338cdd6df293f6139648c6ec25634 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:06:36 +0200 Subject: [PATCH 08/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 801a2830..06505f78 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -342,7 +342,7 @@ impl KnownModel for Falcon { } } -/// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +/// Falcon [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) #[derive(Debug, Default, PartialEq, Clone, Copy)] pub struct Hyperparameters { /// Size of the model's vocabulary From d67148ebb2983e50c165c53c1955256c47aaa333 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 16:12:08 +0200 Subject: [PATCH 09/21] Bugfix: Mat-Mul and wrong memory --- crates/llm/src/lib.rs | 9 +++++---- crates/models/falcon/src/lib.rs | 18 +++++++----------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index d40f37b7..d792b64b 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -92,6 +92,8 @@ use serde::Serialize; pub mod models { #[cfg(feature = "bloom")] pub use llm_bloom::{self as bloom, Bloom}; + #[cfg(feature = "falcon")] + pub use llm_falcon::{self as falcon, Falcon}; #[cfg(feature = "gpt2")] pub use llm_gpt2::{self as gpt2, Gpt2}; #[cfg(feature = "gptj")] @@ -102,8 +104,6 @@ pub mod models { pub use llm_llama::{self as llama, Llama}; #[cfg(feature = "mpt")] pub use llm_mpt::{self as mpt, Mpt}; - #[cfg(feature = "falcon")] - pub use llm_falcon::{self as falcon, Falcon}; } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)] @@ -276,8 +276,9 @@ pub fn load_dynamic( #[cfg(feature = "mpt")] Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, #[cfg(feature = "falcon")] - Falcon => load_model::(path, vocabulary_source, params, load_progress_callback)?, - + Falcon => { + load_model::(path, vocabulary_source, params, load_progress_callback)? 
+ } }; Ok(model) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 06505f78..78a26b0f 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -197,14 +197,14 @@ impl KnownModel for Falcon { // store key and value to memory let k = ctx0.op_view_1d( - &memory_k, + memory_k, n * head_dim, (memory_k_size * head_dim) * (il * ctx_size + session_len), ); let v = ctx0.op_view_1d( - &memory_v, + memory_v, n * head_dim, - (memory_k_size * head_dim) * (il * ctx_size + session_len), + (memory_v_size * head_dim) * (il * ctx_size + session_len), ); gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); @@ -216,7 +216,7 @@ impl KnownModel for Falcon { let mut bigk = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &memory_k, + memory_k, (session_len + n) * head_dim, il * ctx_size * memory_k_size * head_dim, ), @@ -228,7 +228,7 @@ impl KnownModel for Falcon { ); // K * Q bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy)); - let big_kq = ctx0.op_mul(&bigk, &bigq); + let big_kq = ctx0.op_mul_mat(&bigk, &bigq); // KQ_scaled = KQ / sqrt(n_embd/n_head) let big_kq_scaled = ctx0.op_scale_inplace( @@ -243,7 +243,7 @@ impl KnownModel for Falcon { let mut bigv = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &memory_v, + memory_v, (session_len + n) * head_dim, il * ctx_size * memory_v_size * head_dim, ), @@ -326,7 +326,7 @@ impl KnownModel for Falcon { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("<|padding|>".as_bytes()) + None } fn eot_token_id(&self) -> TokenId { @@ -347,8 +347,6 @@ impl KnownModel for Falcon { pub struct Hyperparameters { /// Size of the model's vocabulary n_vocab: usize, - /// Maximum sequence length - n_ctx: usize, /// Size of the model's embedding layer n_embd: usize, /// n_heads @@ -363,7 +361,6 @@ impl llm_base::Hyperparameters for Hyperparameters { fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { let hyperparameters = Hyperparameters { n_vocab: util::read_i32(reader)?.try_into()?, - n_ctx: util::read_i32(reader)?.try_into()?, n_embd: util::read_i32(reader)?.try_into()?, n_head: util::read_i32(reader)?.try_into()?, n_layer: util::read_i32(reader)?.try_into()?, @@ -376,7 +373,6 @@ impl llm_base::Hyperparameters for Hyperparameters { fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { util::write_i32(writer, self.n_vocab.try_into()?)?; util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; util::write_i32(writer, self.n_head.try_into()?)?; util::write_i32(writer, self.n_layer.try_into()?)?; util::write_i32(writer, self.file_type.into())?; From f9f477518eba0b4bf7e790f610f1068a1059e172 Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 23 Jun 2023 01:21:44 +0200 Subject: [PATCH 10/21] feat: automatically run checks before commit --- .github/workflows/rust.yml | 2 +- .rusty-hook.toml | 5 +++ Cargo.lock | 66 ++++++++++++++++++++++++++++ binaries/llm-cli/Cargo.toml | 3 ++ binaries/precommit-check/Cargo.toml | 8 ++++ binaries/precommit-check/README.md | 3 ++ binaries/precommit-check/src/main.rs | 16 +++++++ doc/CONTRIBUTING.md | 12 ++--- 8 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 .rusty-hook.toml create mode 100644 binaries/precommit-check/Cargo.toml create mode 100644 binaries/precommit-check/README.md create mode 100644 binaries/precommit-check/src/main.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 602820fb..e228bd95 100644 --- 
a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -27,7 +27,7 @@ jobs: - name: Build run: cargo build --verbose - name: Run tests - run: cargo test --verbose + run: cargo test --all --verbose fmt: name: Clippy, formatting and docs runs-on: ubuntu-latest diff --git a/.rusty-hook.toml b/.rusty-hook.toml new file mode 100644 index 00000000..53820f01 --- /dev/null +++ b/.rusty-hook.toml @@ -0,0 +1,5 @@ +[hooks] +pre-commit = "cargo run -p precommit-check" + +[logging] +verbose = true diff --git a/Cargo.lock b/Cargo.lock index e941d329..ef6e1ab5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -295,6 +295,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ci_info" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24f638c70e8c5753795cc9a8c07c44da91554a09e4cf11a7326e8161b0a3c45e" +dependencies = [ + "envmnt", +] + [[package]] name = "cipher" version = "0.4.4" @@ -669,6 +678,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "envmnt" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2d328fc287c61314c4a61af7cfdcbd7e678e39778488c7cb13ec133ce0f4059" +dependencies = [ + "fsio", + "indexmap", +] + [[package]] name = "errno" version = "0.3.1" @@ -801,6 +820,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsio" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1fd087255f739f4f1aeea69f11b72f8080e9c2e7645cd06955dad4a178a49e3" + [[package]] name = "futures-channel" version = "0.3.28" @@ -866,6 +891,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.9" @@ -1295,6 +1329,7 @@ dependencies = [ "log", "num_cpus", "rand", + "rusty-hook", "rustyline", "spinoff", "zstd 0.12.3+zstd.1.5.2", @@ -1473,6 +1508,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nias" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab250442c86f1850815b5d268639dff018c0627022bc1940eb2d642ca1ce12f0" + [[package]] name = "nibble_vec" version = "0.1.0" @@ -1684,6 +1725,10 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precommit-check" +version = "0.1.0" + [[package]] name = "prettyplease" version = "0.2.4" @@ -1900,6 +1945,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rusty-hook" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96cee9be61be7e1cbadd851e58ed7449c29c620f00b23df937cb9cbc04ac21a3" +dependencies = [ + "ci_info", + "getopts", + "nias", + "toml", +] + [[package]] name = "rustyline" version = "11.0.0" @@ -2313,6 +2370,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + [[package]] name = "tower-service" version = "0.3.2" diff --git a/binaries/llm-cli/Cargo.toml 
b/binaries/llm-cli/Cargo.toml index a0a692a2..dea1f3d3 100644 --- a/binaries/llm-cli/Cargo.toml +++ b/binaries/llm-cli/Cargo.toml @@ -28,6 +28,9 @@ num_cpus = "1.15.0" color-eyre = { version = "0.6.2", default-features = false } zstd = { version = "0.12", default-features = false } +[dev-dependencies] +rusty-hook = "^0.11.2" + [features] cublas = ["llm/cublas"] clblast = ["llm/clblast"] diff --git a/binaries/precommit-check/Cargo.toml b/binaries/precommit-check/Cargo.toml new file mode 100644 index 00000000..ba24f36d --- /dev/null +++ b/binaries/precommit-check/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "precommit-check" +version = "0.1.0" +edition = "2021" +publish = false + +[package.metadata.release] +release = false \ No newline at end of file diff --git a/binaries/precommit-check/README.md b/binaries/precommit-check/README.md new file mode 100644 index 00000000..3a7c8118 --- /dev/null +++ b/binaries/precommit-check/README.md @@ -0,0 +1,3 @@ +# precommit-check + +Helper script to run pre-commit checks on a repository. Used with `rusty-hook` to execute all of the checks and early exit if any of them fail. diff --git a/binaries/precommit-check/src/main.rs b/binaries/precommit-check/src/main.rs new file mode 100644 index 00000000..945881d1 --- /dev/null +++ b/binaries/precommit-check/src/main.rs @@ -0,0 +1,16 @@ +fn main() { + // Ensure that these match `.github/workflows/rust.yml`. + cmd("cargo", &["check"]); + cmd("cargo", &["test", "--all"]); + cmd("cargo", &["fmt", "--check", "--all"]); + cmd("cargo", &["doc", "--workspace", "--exclude", "llm-cli"]); + cmd("cargo", &["clippy", "--", "-Dclippy::all"]); +} + +fn cmd(cmd: &str, args: &[&str]) { + println!("=== Running command: {cmd} {args:?}"); + let mut child = std::process::Command::new(cmd).args(args).spawn().unwrap(); + if !child.wait().unwrap().success() { + panic!("Failed to run command: {} {:?}", cmd, args); + } +} diff --git a/doc/CONTRIBUTING.md b/doc/CONTRIBUTING.md index a29d11b5..43da4e1c 100644 --- a/doc/CONTRIBUTING.md +++ b/doc/CONTRIBUTING.md @@ -9,16 +9,10 @@ or on [Discord](https://discord.gg/YB9WaXYAWU)! ## Checking Changes This project uses a [GitHub workflow](../.github/workflows/rust.yml) to enforce -code standards - it will execute the following commands, which can be performed -locally for faster turnaround and a better developer experience: +code standards. -```shell -cargo check -cargo test -cargo fmt --all -cargo doc --workspace --exclude llm-cli -cargo clippy --fix --allow-dirty -- -Dclippy::all -``` +The `rusty-hook` project is used to run a similar set of checks automatically before committing. +If you would like to run these checks locally, use `cargo run -p precommit-check`. ## Regenerating GGML Bindings From 7c2f7c1840c4d2fcfb1ec0835f83837a0b622003 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 25 Jun 2023 14:05:51 +0200 Subject: [PATCH 11/21] feat(llm): source error in VocabularyLoadError --- crates/llm-base/src/loader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 4e4b60a0..fcb6b6c9 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -326,7 +326,7 @@ pub enum LoadError { paths: Vec, }, /// The vocab file for the tokenizer could not be loaded. 
- #[error("could not load vocabulary file {path:?}")] + #[error("could not load vocabulary file {path:?}: {error}")] VocabularyLoadError { /// The invalid vocabulary path path: PathBuf, From 461cbce91c86cd229070247861d9543fbdfd3bb6 Mon Sep 17 00:00:00 2001 From: Julia Merz Date: Wed, 28 Jun 2023 01:40:01 +0200 Subject: [PATCH 12/21] Added InferenceSessionRef to exports, for easier serializing and deserializing --- crates/llm-base/src/lib.rs | 2 +- crates/llm/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 127fedf6..effae215 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -24,7 +24,7 @@ pub use ggml::Type as ElementType; pub use inference_session::{ feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, }; pub use loader::{ diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 13d308a0..0f7363ec 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -79,7 +79,7 @@ pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession, - InferenceSessionConfig, InferenceSnapshot, InferenceStats, InvalidTokenBias, KnownModel, + InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, VocabularySource, From 45258e31c1bd43ec4d2ddf861b1b39430adb5255 Mon Sep 17 00:00:00 2001 From: Julia Merz Date: Wed, 28 Jun 2023 20:22:33 +0200 Subject: [PATCH 13/21] formatting fix --- crates/llm-base/src/lib.rs | 4 ++-- crates/llm/src/lib.rs | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index effae215..45c5c17e 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -24,8 +24,8 @@ pub use ggml::Type as ElementType; pub use inference_session::{ feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, - ModelKVMemoryType, SnapshotError, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, + InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, }; pub use loader::{ load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, LoadError, diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 0f7363ec..b5e12da7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -79,10 +79,11 @@ pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession, - InferenceSessionConfig, 
InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, - LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, - Prompt, QuantizeError, QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, - TokenUtf8Buffer, TokenizationError, Vocabulary, VocabularySource, + InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, + InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, + ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, + SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, + VocabularySource, }; use serde::Serialize; From 68bbedb12577da6cf63883fdccafb5cb53091224 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 28 Jun 2023 22:41:27 +0200 Subject: [PATCH 14/21] docs(readme): ake "getting models" more obvious --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ad47e3e3..28b3892e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ Currently, the following models are supported: [Wizard](https://github.com/nlpxucan/WizardLM)) - [MPT](https://www.mosaicml.com/blog/mpt-7b) +See [getting models](#getting-models) for more information on how to download supported models. + ## Using `llm` in a Rust Project This project depends on Rust v1.65.0 or above and a modern C toolchain. @@ -86,7 +88,7 @@ opt-level = 3 ``` ## Leverage Accelerators with `llm` -The `llm` library is engineered to take advantage of hardware accelerators such as `cuda` and `metal` for optimized performance. +The `llm` library is engineered to take advantage of hardware accelerators such as `cuda` and `metal` for optimized performance. To enable `llm` to harness these accelerators, some preliminary configuration steps are necessary, which vary based on your operating system. For comprehensive guidance, please refer to the [Acceleration Support for Building section](doc/CONTRIBUTING.md#acceleration-support-for-building) in our documentation. From 4716b1d19f05f55762cd30ba72c9e9f270defb6d Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 00:50:06 +0200 Subject: [PATCH 15/21] feat(falcon): disable by default --- binaries/llm-cli/Cargo.toml | 3 +++ binaries/llm-cli/src/cli_args.rs | 1 + binaries/llm-cli/src/main.rs | 1 + crates/llm/Cargo.toml | 3 ++- crates/llm/src/lib.rs | 2 +- crates/models/falcon/src/lib.rs | 8 +++++++- 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/binaries/llm-cli/Cargo.toml b/binaries/llm-cli/Cargo.toml index dea1f3d3..bac2ff87 100644 --- a/binaries/llm-cli/Cargo.toml +++ b/binaries/llm-cli/Cargo.toml @@ -35,3 +35,6 @@ rusty-hook = "^0.11.2" cublas = ["llm/cublas"] clblast = ["llm/clblast"] metal = ["llm/metal"] + +# Falcon is off by default. See `llm_falcon`'s module documentation for more information. 
+falcon = ["llm/falcon"] diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 5e0919b9..5aec546f 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -46,6 +46,7 @@ pub enum Args { }, /// Use a Falcon model #[clap(id = "falcon")] + #[cfg(feature = "falcon")] Falcon { #[command(subcommand)] args: BaseArgs, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 51e8896e..679a753e 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -33,6 +33,7 @@ fn main() -> Result<()> { Args::GptJ { args } => handle_args::(args), Args::GptNeoX { args } => handle_args::(args), Args::Mpt { args } => handle_args::(args), + #[cfg(feature = "falcon")] Args::Falcon { args } => handle_args::(args), } } diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index f501a29e..035b8a83 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -29,7 +29,7 @@ serde_json = { workspace = true } clap = { workspace = true } [features] -default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"] +default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"] llama = ["dep:llm-llama"] gpt2 = ["dep:llm-gpt2"] @@ -37,6 +37,7 @@ gptj = ["dep:llm-gptj"] bloom = ["dep:llm-bloom"] gptneox = ["dep:llm-gptneox"] mpt = ["dep:llm-mpt"] +# Falcon is off by default. See `llm_falcon`'s module documentation for more information. falcon = ["dep:llm-falcon"] cublas = ["llm-base/cublas"] diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 211f4c78..869a8416 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,7 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) -//! - [Falcon](llm_falcon) +//! - [Falcon](llm_falcon) (disabled by default) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index a15a100c..3b989e26 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -1,4 +1,10 @@ -//! An implementation of [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. +//! An implementation of the [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. +//! +//! This implementation only works for Falcon 7B, and with 32-bit memory tensors (i.e. your inference session +//! must be configured with a 32-bit [InferenceSessionConfig]). +//! +//! This model will not be generally available in the `llm` ecosystem until Falcon 40B and 16-bit memory is +//! supported. It is currently only available as a preview. #![deny(missing_docs)] use std::sync::Arc; From bbb089c68f67bf1d0ed4f12b4859f7710663106f Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 01:06:19 +0200 Subject: [PATCH 16/21] fix: broken doclink --- binaries/precommit-check/src/main.rs | 23 +++++++++++++++-------- crates/llm/src/lib.rs | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/binaries/precommit-check/src/main.rs b/binaries/precommit-check/src/main.rs index 945881d1..04d3add0 100644 --- a/binaries/precommit-check/src/main.rs +++ b/binaries/precommit-check/src/main.rs @@ -1,16 +1,23 @@ fn main() { // Ensure that these match `.github/workflows/rust.yml`. 
- cmd("cargo", &["check"]); - cmd("cargo", &["test", "--all"]); - cmd("cargo", &["fmt", "--check", "--all"]); - cmd("cargo", &["doc", "--workspace", "--exclude", "llm-cli"]); - cmd("cargo", &["clippy", "--", "-Dclippy::all"]); + cmd("cargo", &["check"], &[]); + cmd("cargo", &["test", "--all"], &[]); + cmd("cargo", &["fmt", "--check", "--all"], &[]); + cmd( + "cargo", + &["doc", "--workspace", "--exclude", "llm-cli"], + &[("RUSTDOCFLAGS", "-Dwarnings")], + ); + cmd("cargo", &["clippy", "--", "-Dclippy::all"], &[]); } -fn cmd(cmd: &str, args: &[&str]) { +fn cmd(cmd: &str, args: &[&str], env: &[(&str, &str)]) { println!("=== Running command: {cmd} {args:?}"); - let mut child = std::process::Command::new(cmd).args(args).spawn().unwrap(); + let mut builder = std::process::Command::new(cmd); + builder.args(args); + builder.envs(env.iter().copied()); + let mut child = builder.spawn().unwrap(); if !child.wait().unwrap().success() { - panic!("Failed to run command: {} {:?}", cmd, args); + panic!("Failed to run command: {} {:?}", cmd, builder); } } diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 869a8416..8adda7e7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,7 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) -//! - [Falcon](llm_falcon) (disabled by default) +//! - Falcon (currently disabled due to incompleteness) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. From ae15cc4db5b1bafc1aaca05f3203e5d60f6a30bc Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 02:34:52 +0200 Subject: [PATCH 17/21] refactor: remove unnecessary deps --- Cargo.lock | 9 --------- crates/models/bloom/Cargo.toml | 4 +--- crates/models/falcon/Cargo.toml | 4 +--- crates/models/gptj/Cargo.toml | 4 +--- crates/models/gptneox/Cargo.toml | 3 --- crates/models/llama/Cargo.toml | 6 +----- crates/models/mpt/Cargo.toml | 3 +-- 7 files changed, 5 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4b5a65f..23fb7284 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1313,7 +1313,6 @@ dependencies = [ name = "llm-bloom" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1340,7 +1339,6 @@ dependencies = [ name = "llm-falcon" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1356,7 +1354,6 @@ dependencies = [ name = "llm-gptj" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1364,26 +1361,20 @@ dependencies = [ name = "llm-gptneox" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", - "serde", ] [[package]] name = "llm-llama" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", - "rand", - "thiserror", ] [[package]] name = "llm-mpt" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] diff --git a/crates/models/bloom/Cargo.toml b/crates/models/bloom/Cargo.toml index ed6e4df0..01bb5e16 100644 --- a/crates/models/bloom/Cargo.toml +++ b/crates/models/bloom/Cargo.toml @@ -8,6 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml index e71c261c..d06ad05c 100644 --- a/crates/models/falcon/Cargo.toml +++ b/crates/models/falcon/Cargo.toml @@ -8,6 +8,4 @@ edition = "2021" 
readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/gptj/Cargo.toml b/crates/models/gptj/Cargo.toml index 05ea615f..2d3ce60a 100644 --- a/crates/models/gptj/Cargo.toml +++ b/crates/models/gptj/Cargo.toml @@ -10,6 +10,4 @@ readme = "../../../README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/gptneox/Cargo.toml b/crates/models/gptneox/Cargo.toml index 2f84b9f5..4df13941 100644 --- a/crates/models/gptneox/Cargo.toml +++ b/crates/models/gptneox/Cargo.toml @@ -9,6 +9,3 @@ readme = "../../../README.md" [dependencies] llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } -serde = { workspace = true } diff --git a/crates/models/llama/Cargo.toml b/crates/models/llama/Cargo.toml index 3131b53a..b7c3bdbf 100644 --- a/crates/models/llama/Cargo.toml +++ b/crates/models/llama/Cargo.toml @@ -8,8 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } -rand = { workspace = true } -thiserror = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/mpt/Cargo.toml b/crates/models/mpt/Cargo.toml index 9e2544e8..2a5cbcc4 100644 --- a/crates/models/mpt/Cargo.toml +++ b/crates/models/mpt/Cargo.toml @@ -8,5 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file From e7e732eaefdfeb65893d74e5a344ef828dbbcf3e Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 03:04:46 +0200 Subject: [PATCH 18/21] refactor: vocabulary -> tokenizer --- README.md | 6 +- binaries/llm-cli/src/cli_args.rs | 54 ++--- binaries/llm-cli/src/main.rs | 25 ++- crates/ggml/src/format/loader.rs | 4 +- crates/ggml/src/tests.rs | 34 ++-- crates/llm-base/src/inference_session.rs | 26 +-- crates/llm-base/src/lib.rs | 11 +- crates/llm-base/src/loader.rs | 40 ++-- crates/llm-base/src/model/mod.rs | 24 +-- crates/llm-base/src/quantize.rs | 16 +- .../src/{vocabulary.rs => tokenizer.rs} | 184 +++++++++--------- crates/llm/examples/embeddings.rs | 22 +-- crates/llm/examples/inference.rs | 20 +- crates/llm/examples/vicuna-chat.rs | 20 +- crates/llm/src/lib.rs | 44 +++-- crates/models/bloom/src/lib.rs | 16 +- crates/models/falcon/src/lib.rs | 15 +- crates/models/gpt2/src/lib.rs | 14 +- crates/models/gptj/src/lib.rs | 15 +- crates/models/gptneox/src/lib.rs | 15 +- crates/models/llama/src/lib.rs | 14 +- crates/models/mpt/src/lib.rs | 17 +- 22 files changed, 317 insertions(+), 319 deletions(-) rename crates/llm-base/src/{vocabulary.rs => tokenizer.rs} (75%) diff --git a/README.md b/README.md index 28b3892e..7828fe63 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ llm gptneox infer -m RedPajama-INCITE-Base-3B-v1-q4_0.bin -p "Rust is a cool pro In the example above, the first two arguments specify the model architecture 
and command, respectively. The required `-m` argument specifies the local path to the model, and the required `-p` argument specifies the evaluation prompt. The -optional `-r` argument is used to load the model's vocabulary from a remote +optional `-r` argument is used to load the model's tokenizer from a remote Hugging Face 🤗 repository, which will typically improve results when compared -to loading the vocabulary from the model file itself; there is also an optional -`-v` argument that can be used to specify the path to a local vocabulary file. +to loading the tokenizer from the model file itself; there is also an optional +`-v` argument that can be used to specify the path to a local tokenizer file. For more information about the `llm` CLI, use the `--help` parameter. There is also a [simple inference example](./crates/llm/examples/inference.rs) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 5aec546f..ce7db33f 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -4,7 +4,7 @@ use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::eyre::{bail, Result, WrapErr}; use llm::{ ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias, - LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, VocabularySource, + LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, TokenizerSource, }; use rand::SeedableRng; @@ -149,15 +149,15 @@ pub struct Perplexity { #[derive(Parser, Debug)] pub struct Info { #[command(flatten)] - pub model_and_vocabulary: ModelAndVocabulary, + pub model_and_tokenizer: ModelAndTokenizer, /// Show all of the tensors in the model, including their names, formats and shapes. #[arg(long, short = 't')] pub tensors: bool, - /// Show all of the tokens in the vocabulary. + /// Show all of the tokens in the tokenizer. 
#[arg(long, short = 'v')] - pub vocabulary: bool, + pub tokenizer: bool, } #[derive(Parser, Debug)] @@ -350,47 +350,47 @@ fn parse_bias(s: &str) -> Result { } #[derive(Parser, Debug)] -pub struct ModelVocabulary { - /// Local path to vocabulary +pub struct ModelTokenizer { + /// Local path to Hugging Face tokenizer file #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, - /// Remote HuggingFace repository containing vocabulary + /// Remote Hugging Face repository containing a tokenizer #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, } -impl ModelVocabulary { - pub fn to_source(&self) -> Result { - Ok(match (&self.vocabulary_path, &self.vocabulary_repository) { +impl ModelTokenizer { + pub fn to_source(&self) -> Result { + Ok(match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - bail!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + bail!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => VocabularySource::Model, + (Some(path), None) => TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => TokenizerSource::Embedded, }) } } #[derive(Parser, Debug)] -pub struct ModelAndVocabulary { +pub struct ModelAndTokenizer { /// Where to load the model from #[arg(long, short = 'm')] pub model_path: PathBuf, #[command(flatten)] - pub vocabulary: ModelVocabulary, + pub tokenizer: ModelTokenizer, } -impl ModelAndVocabulary { - pub fn to_source(&self) -> Result { - self.vocabulary.to_source() +impl ModelAndTokenizer { + pub fn to_source(&self) -> Result { + self.tokenizer.to_source() } } #[derive(Parser, Debug)] pub struct ModelLoad { #[command(flatten)] - pub model_and_vocabulary: ModelAndVocabulary, + pub model_and_tokenizer: ModelAndTokenizer, /// Sets the size of the context (in tokens). Allows feeding longer prompts. /// Note that this affects memory. @@ -431,19 +431,19 @@ impl ModelLoad { let now = std::time::Instant::now(); let mut prev_load_time = now; - let vocabulary_source = match self.model_and_vocabulary.to_source() { + let tokenizer_source = match self.model_and_tokenizer.to_source() { Ok(vs) => vs, Err(err) => { if let Some(sp) = sp.take() { - sp.fail(&format!("Failed to load vocabulary: {}", err)); + sp.fail(&format!("Failed to load tokenizer: {}", err)); } return Err(err); } }; let model = llm::load::( - &self.model_and_vocabulary.model_path, - vocabulary_source, + &self.model_and_tokenizer.model_path, + tokenizer_source, params, |progress| match progress { LoadProgress::HyperparametersLoaded => { @@ -557,7 +557,7 @@ pub struct Quantize { pub destination: PathBuf, #[command(flatten)] - pub vocabulary: ModelVocabulary, + pub tokenizer: ModelTokenizer, /// The GGML container type to target. /// diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 679a753e..45e4c127 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -143,15 +143,12 @@ fn perplexity(args: &cli_args::Perplexity) -> Resu } fn info(args: &cli_args::Info) -> Result<()> { - let model_path = &args.model_and_vocabulary.model_path; - let vocabulary = args - .model_and_vocabulary - .to_source()? 
- .retrieve(model_path)?; + let model_path = &args.model_and_tokenizer.model_path; + let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; let file = File::open(model_path)?; let mut reader = BufReader::new(&file); - let mut loader: llm::Loader = llm::Loader::new(vocabulary, |_| { + let mut loader: llm::Loader = llm::Loader::new(tokenizer, |_| { // We purposely do not print progress here, as we are only interested in the metadata }); @@ -159,12 +156,12 @@ fn info(args: &cli_args::Info) -> Result<()> { log::info!("Container type: {:?}", loader.container_type); log::info!("Hyperparameters: {:?}", loader.hyperparameters); - log::info!("Vocabulary size: {}", loader.vocabulary.len()); + log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); - if args.vocabulary { - log::info!("Vocabulary:"); - for i in 0..loader.vocabulary.len() { - log::info!("- {}: {}", i, utf8_or_array(&loader.vocabulary.token(i))); + if args.tokenizer { + log::info!("Tokens:"); + for i in 0..loader.tokenizer.len() { + log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); } } @@ -187,7 +184,7 @@ fn info(args: &cli_args::Info) -> Result<()> { fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let model = args.model_load.load::(false)?; - let toks = match model.vocabulary().tokenize(&prompt, false) { + let toks = match model.tokenizer().tokenize(&prompt, false) { Ok(toks) => toks, Err(e) => { log::error!("Could not tokenize prompt: {e}"); @@ -326,12 +323,12 @@ fn quantize(args: &cli_args::Quantize) -> Result<( let mut source = BufReader::new(std::fs::File::open(&args.source)?); let mut destination = BufWriter::new(std::fs::File::create(&args.destination)?); - let vocabulary = args.vocabulary.to_source()?.retrieve(&args.source)?; + let tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; llm::quantize::( &mut source, &mut destination, - vocabulary, + tokenizer, args.container_type.into(), args.target.into(), |progress| match progress { diff --git a/crates/ggml/src/format/loader.rs b/crates/ggml/src/format/loader.rs index f9dd76e8..ca0f9e6b 100644 --- a/crates/ggml/src/format/loader.rs +++ b/crates/ggml/src/format/loader.rs @@ -117,7 +117,7 @@ pub fn tensor_size(element_type: ElementType, n_elements: usize) -> usize { /// Information present within GGML [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) /// that is required to continue loading the model. pub struct PartialHyperparameters { - /// The number of tokens in the model's vocabulary. + /// The number of tokens in the model's embedded vocabulary. pub n_vocab: usize, } @@ -125,7 +125,7 @@ pub struct PartialHyperparameters { pub trait LoadHandler { /// Called when the [ContainerType] is read. fn container_type(&mut self, container_type: ContainerType) -> Result<(), E>; - /// Called when a token is read so it can be added to the model's vocabulary. + /// Called when a token is read so it can be added to the model's embedded vocabulary. fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), E>; /// Called when the model's hyperparameters need to be read. 
fn read_hyperparameters( diff --git a/crates/ggml/src/tests.rs b/crates/ggml/src/tests.rs index 8b099c35..b842f45d 100644 --- a/crates/ggml/src/tests.rs +++ b/crates/ggml/src/tests.rs @@ -18,19 +18,19 @@ impl Error for DummyError {} #[test] fn can_roundtrip_loader_and_saver_ggml() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.0), ("fast".as_bytes().to_vec(), 0.0), ("memory".as_bytes().to_vec(), 0.0), ("efficient".as_bytes().to_vec(), 0.0), ]; - roundtrip_test(format::SaveContainerType::Ggml, vocabulary).unwrap(); + roundtrip_test(format::SaveContainerType::Ggml, tokenizer).unwrap(); } #[test] fn will_fail_on_scored_ggml_save() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.1), ("fast".as_bytes().to_vec(), 0.2), ("memory".as_bytes().to_vec(), 0.3), @@ -38,7 +38,7 @@ fn will_fail_on_scored_ggml_save() { ]; assert_eq!( - roundtrip_test(format::SaveContainerType::Ggml, vocabulary) + roundtrip_test(format::SaveContainerType::Ggml, tokenizer) .unwrap_err() .to_string(), format::SaveError::::VocabularyScoringNotSupported.to_string() @@ -47,19 +47,19 @@ fn will_fail_on_scored_ggml_save() { #[test] fn can_roundtrip_loader_and_saver_ggjt_v3() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.1), ("fast".as_bytes().to_vec(), 0.2), ("memory".as_bytes().to_vec(), 0.3), ("efficient".as_bytes().to_vec(), 0.4), ]; - roundtrip_test(format::SaveContainerType::GgjtV3, vocabulary).unwrap(); + roundtrip_test(format::SaveContainerType::GgjtV3, tokenizer).unwrap(); } fn roundtrip_test( save_container_type: format::SaveContainerType, - vocabulary: Vec<(Vec, f32)>, + tokenizer: Vec<(Vec, f32)>, ) -> anyhow::Result<()> { let mut rng = rand::thread_rng(); let element_type = crate::Type::F16; @@ -67,9 +67,9 @@ fn roundtrip_test( hyperparameters: Hyperparameters { some_hyperparameter: random(), some_other_hyperparameter: random(), - vocabulary_size: vocabulary.len().try_into()?, + tokenizer_size: tokenizer.len().try_into()?, }, - vocabulary, + tokenizer, tensors: (0..10) .map(|i| { let n_dims = Uniform::from(1..3).sample(&mut rng); @@ -104,7 +104,7 @@ fn roundtrip_test( &mut cursor, &mut save_handler, save_container_type, - &model.vocabulary, + &model.tokenizer, &model.tensors.keys().cloned().collect::>(), )?; @@ -125,21 +125,21 @@ fn roundtrip_test( struct Hyperparameters { some_hyperparameter: u32, some_other_hyperparameter: u32, - vocabulary_size: u32, + tokenizer_size: u32, } impl Hyperparameters { fn read(reader: &mut dyn BufRead) -> Result { Ok(Self { some_hyperparameter: util::read_u32(reader)?, some_other_hyperparameter: util::read_u32(reader)?, - vocabulary_size: util::read_u32(reader)?, + tokenizer_size: util::read_u32(reader)?, }) } fn write(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> { util::write_u32(writer, self.some_hyperparameter)?; util::write_u32(writer, self.some_other_hyperparameter)?; - util::write_u32(writer, self.vocabulary_size)?; + util::write_u32(writer, self.tokenizer_size)?; Ok(()) } } @@ -147,7 +147,7 @@ impl Hyperparameters { #[derive(Default, PartialEq, Debug)] struct Model { hyperparameters: Hyperparameters, - vocabulary: Vec<(Vec, f32)>, + tokenizer: Vec<(Vec, f32)>, tensors: BTreeMap, } @@ -181,8 +181,8 @@ impl format::LoadHandler for MockLoadHandler<'_> { } fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), DummyError> { - assert_eq!(i, self.loaded_model.vocabulary.len()); - self.loaded_model.vocabulary.push((token, 
score)); + assert_eq!(i, self.loaded_model.tokenizer.len()); + self.loaded_model.tokenizer.push((token, score)); Ok(()) } @@ -195,7 +195,7 @@ impl format::LoadHandler for MockLoadHandler<'_> { n_vocab: self .loaded_model .hyperparameters - .vocabulary_size + .tokenizer_size .try_into() .unwrap(), }) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 57ea0908..4d1489a4 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -289,7 +289,7 @@ impl InferenceSession { ) -> Result<(), InferenceError> { let beginning_of_sentence = self.n_past == 0; - let vocab = model.vocabulary(); + let vocab = model.tokenizer(); let prompt_tokens = prompt.into().to_tokens(vocab, beginning_of_sentence)?; if self.n_past + prompt_tokens.len() >= model.context_size() { @@ -301,13 +301,13 @@ impl InferenceSession { for &tk in batch { let should_call_callback = Some(tk) != model.bot_token_id(); - let mut token = match model.vocabulary() { - crate::Vocabulary::Model(_) => model.vocabulary().token(tk as usize).to_vec(), - crate::Vocabulary::External(_) => { + let mut token = match model.tokenizer() { + crate::Tokenizer::Embedded(_) => model.tokenizer().token(tk as usize).to_vec(), + crate::Tokenizer::HuggingFace(_) => { let mut previous_tokens = self.tokens.clone(); previous_tokens.push(tk); - let all_tokens = model.vocabulary().decode(previous_tokens, true); + let all_tokens = model.tokenizer().decode(previous_tokens, true); let splitted = all_tokens.split_at(self.decoded_tokens.len()); splitted.1.to_vec() @@ -359,12 +359,12 @@ impl InferenceSession { if next_token as TokenId == model.eot_token_id() { Err(InferenceError::EndOfText) } else { - let res = match model.vocabulary() { - crate::Vocabulary::Model(_) => { - model.vocabulary().token(next_token as usize).to_vec() + let res = match model.tokenizer() { + crate::Tokenizer::Embedded(_) => { + model.tokenizer().token(next_token as usize).to_vec() } - crate::Vocabulary::External(_) => { - let all_tokens = model.vocabulary().decode(self.tokens.clone(), true); + crate::Tokenizer::HuggingFace(_) => { + let all_tokens = model.tokenizer().decode(self.tokens.clone(), true); let splitted = all_tokens.split_at(self.decoded_tokens.len()); splitted.1.to_vec() @@ -399,7 +399,7 @@ impl InferenceSession { for token_id in &self.tokens { // Buffer the token until it's valid UTF-8, then call the callback. if let Some(tokens) = - token_utf8_buf.push(&model.vocabulary().token(*token_id as usize)) + token_utf8_buf.push(&model.tokenizer().token(*token_id as usize)) { if let Err(e) = callback(InferenceResponse::SnapshotToken(tokens)) { return Err(InferenceError::UserCallback(Box::new(e))); @@ -472,14 +472,14 @@ impl InferenceSession { ) -> Result<(), TokenizationError> { // Implementation based on perplexity example of llama.cpp: // https://github.com/ggerganov/llama.cpp/blob/2d5db48371052087a83974abda3767d1aedec598/examples/perplexity/perplexity.cpp#L24 - let mut tokens = prompt.into().to_tokens(model.vocabulary(), true)?; + let mut tokens = prompt.into().to_tokens(model.tokenizer(), true)?; let mut count = 0; // TODO: make this handle , }, - /// The vocab file for the tokenizer could not be loaded. - #[error("could not load vocabulary file {path:?}: {error}")] - VocabularyLoadError { - /// The invalid vocabulary path + /// The tokenizer could not be loaded. 
+ #[error("could not load tokenizer {path:?}: {error}")] + TokenizerLoadError { + /// The invalid tokenizer path path: PathBuf, /// The error that occurred. @@ -343,9 +343,9 @@ impl From for LoadError { } } } -impl From for LoadError { - fn from(value: VocabularyLoadError) -> Self { - LoadError::VocabularyLoadError { +impl From for LoadError { + fn from(value: TokenizerLoadError) -> Self { + LoadError::TokenizerLoadError { path: value.path, error: value.error, } @@ -405,7 +405,7 @@ pub trait TensorLoader { /// store any information about the architecture. pub fn load( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result { @@ -426,15 +426,15 @@ pub fn load( })?; let mut reader = BufReader::new(&file); - let vocabulary = vocabulary_source.retrieve(path)?; - let mut loader = Loader::new(vocabulary, load_progress_callback); + let tokenizer = tokenizer_source.retrieve(path)?; + let mut loader = Loader::new(tokenizer, load_progress_callback); ggml::format::load(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path.to_owned()))?; let Loader { hyperparameters, - vocabulary, + tokenizer, tensors, mut load_progress_callback, container_type, @@ -486,7 +486,7 @@ pub fn load( // TODO: Consider updating the progress callback to report the progress of the LoRA file. // Most LoRAs are small enough that this is not necessary, but it would be nice to have. let mut lora_loader: Loader = - Loader::new(ModelVocabulary::default().into(), |_| {}); + Loader::new(Tokenizer::empty_embedded(), |_| {}); ggml::format::load(&mut lora_reader, &mut lora_loader) .map_err(|err| LoadError::from_format_error(err, lora_path.to_owned()))?; @@ -533,7 +533,7 @@ pub fn load( loaded_tensors: Default::default(), }; - let model = KnownModel::new(hyperparameters, params, vocabulary, tl)?; + let model = KnownModel::new(hyperparameters, params, tokenizer, tl)?; (load_progress_callback)(LoadProgress::Loaded { file_size, @@ -549,8 +549,8 @@ pub struct Loader { load_progress_callback: F, // Input/Output - /// The vocabulary of the model. - pub vocabulary: Vocabulary, + /// The tokenizer of the model. + pub tokenizer: Tokenizer, // Output /// The container type of the model. @@ -562,13 +562,13 @@ pub struct Loader { } impl Loader { /// Creates a new loader. 
- pub fn new(vocabulary: Vocabulary, load_progress_callback: F) -> Self { + pub fn new(tokenizer: Tokenizer, load_progress_callback: F) -> Self { Self { load_progress_callback, container_type: ContainerType::Ggml, hyperparameters: Hp::default(), - vocabulary, + tokenizer, tensors: HashMap::default(), } } @@ -582,7 +582,7 @@ impl ggml::format::LoadHandler, score: f32) -> Result<(), LoadError> { - if let Vocabulary::Model(mv) = &mut self.vocabulary { + if let Tokenizer::Embedded(mv) = &mut self.tokenizer { let id = match TokenId::try_from(i) { Ok(id) => id, Err(err) => return Err(LoadError::InvalidIntegerConversion(err)), diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 15bd8ee5..bee50f37 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -11,8 +11,8 @@ use regex::Regex; use thiserror::Error; use crate::{ - loader::TensorLoader, vocabulary::TokenId, FileType, InferenceParameters, InferenceSession, - InferenceSessionConfig, LoadError, LoadProgress, Vocabulary, VocabularySource, + loader::TensorLoader, tokenizer::TokenId, FileType, InferenceParameters, InferenceSession, + InferenceSessionConfig, LoadError, LoadProgress, Tokenizer, TokenizerSource, }; /// Common functions for model evaluation @@ -29,14 +29,14 @@ pub trait KnownModel: Send + Sync { /// is a helper function on top of [llm_base::load](crate::load). fn load( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result where Self: Sized, { - crate::load(path, vocabulary_source, params, load_progress_callback) + crate::load(path, tokenizer_source, params, load_progress_callback) } /// Creates a new model from the provided [ModelParameters] hyperparameters. @@ -44,7 +44,7 @@ pub trait KnownModel: Send + Sync { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -65,8 +65,8 @@ pub trait KnownModel: Send + Sync { output_request: &mut OutputRequest, ); - /// Get the vocabulary for this model. - fn vocabulary(&self) -> &Vocabulary; + /// Get the tokenizer for this model. + fn tokenizer(&self) -> &Tokenizer; /// Get the context size (configured with [ModelParameters::context_size]) used by /// this model. @@ -103,8 +103,8 @@ pub trait Model: Send + Sync { output_request: &mut OutputRequest, ); - /// Get the vocabulary for this model. - fn vocabulary(&self) -> &Vocabulary; + /// Get the tokenizer for this model. + fn tokenizer(&self) -> &Tokenizer; /// Get the context size (configured with [ModelParameters::context_size]) used by /// this model. @@ -131,8 +131,8 @@ impl> Model for M { KnownModel::evaluate(self, session, params, input_tokens, output_request) } - fn vocabulary(&self) -> &Vocabulary { - KnownModel::vocabulary(self) + fn tokenizer(&self) -> &Tokenizer { + KnownModel::tokenizer(self) } fn context_size(&self) -> usize { @@ -157,7 +157,7 @@ pub trait Hyperparameters: Sized + Default + Debug { /// Write the parameters in GGML format to a writer. fn write_ggml(&self, writer: &mut dyn Write) -> Result<(), HyperparametersWriteError>; - /// Get the number of tokens in the vocabulary. + /// Get the number of tokens in the embedded vocabulary, if any. fn n_vocabulary(&self) -> usize; /// Get the filetype of the model. 
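For downstream code the rename is mechanical: `Model::vocabulary()` becomes `Model::tokenizer()` and `VocabularySource` becomes `TokenizerSource`. A minimal sketch of tokenizing a prompt through the renamed accessor, assuming `model` implements the re-exported `llm::Model` trait:

```rust
use llm::Model;

// Hedged sketch of the renamed accessor; error handling is elided for brevity.
fn print_prompt_tokens(model: &dyn Model, prompt: &str) {
    let tokens = model
        .tokenizer()
        .tokenize(prompt, /* bos */ true)
        .expect("tokenization failed");

    // Each entry is the raw token bytes paired with its TokenId.
    for (bytes, id) in tokens {
        println!("{id}: {}", String::from_utf8_lossy(&bytes));
    }
}
```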
diff --git a/crates/llm-base/src/quantize.rs b/crates/llm-base/src/quantize.rs index 696f7698..187a6fc0 100644 --- a/crates/llm-base/src/quantize.rs +++ b/crates/llm-base/src/quantize.rs @@ -2,7 +2,7 @@ use crate::{ loader::FileTypeFormat, model::HyperparametersWriteError, Hyperparameters, KnownModel, - LoadError, LoadProgress, Loader, Vocabulary, + LoadError, LoadProgress, Loader, Tokenizer, }; use ggml::format::{SaveError, SaveHandler, TensorLoadInfo, TensorSaveInfo}; use half::f16; @@ -139,7 +139,7 @@ impl QuantizeError { pub fn quantize( reader: &mut R, writer: &mut W, - vocabulary: Vocabulary, + tokenizer: Tokenizer, save_container_type: ggml::format::SaveContainerType, quantization_type: ggml::Type, progress_callback: impl Fn(QuantizeProgress), @@ -154,7 +154,7 @@ pub fn quantize( // Load the model let progress_callback = Arc::new(progress_callback); - let mut loader = Loader::::new(vocabulary, { + let mut loader = Loader::::new(tokenizer, { let progress_callback = progress_callback.clone(); move |p| { if let LoadProgress::HyperparametersLoaded = p { @@ -168,7 +168,7 @@ pub fn quantize( // Save the quantized model, quantizing as we go let Loader { mut hyperparameters, - vocabulary, + tokenizer, tensors, .. } = loader; @@ -180,14 +180,14 @@ pub fn quantize( .expect("format has no corresponding ftype"); } - let vocabulary = match vocabulary { - Vocabulary::Model(v) => v + let tokenizer = match tokenizer { + Tokenizer::Embedded(v) => v .id_to_token .iter() .cloned() .zip(v.id_to_token_score) .collect::>(), - Vocabulary::External(_) => vec![], + Tokenizer::HuggingFace(_) => vec![], }; let to_quantize = M::quantize_tensors(); @@ -205,7 +205,7 @@ pub fn quantize( writer, &mut saver, save_container_type, - &vocabulary, + &tokenizer, &tensors.keys().cloned().collect::>(), ) .map_err(|err| QuantizeError::from_format_error(err, PathBuf::default()))?; diff --git a/crates/llm-base/src/vocabulary.rs b/crates/llm-base/src/tokenizer.rs similarity index 75% rename from crates/llm-base/src/vocabulary.rs rename to crates/llm-base/src/tokenizer.rs index 43540c83..b914eb3f 100644 --- a/crates/llm-base/src/vocabulary.rs +++ b/crates/llm-base/src/tokenizer.rs @@ -7,9 +7,8 @@ use std::{ }; use thiserror::Error; -use tokenizers::Tokenizer; -/// The identifier of a token in a vocabulary. +/// The identifier of a token in a tokenizer. pub type TokenId = u32; pub(crate) type Token = Vec; pub(crate) type TokenScore = f32; @@ -25,21 +24,21 @@ pub enum TokenizationError { error: Box, }, #[error("the token ID {0} was invalid for this model")] - /// One of the tokens provided by the user was invalid, and did not belong to this model's vocabulary. + /// One of the tokens provided by the user was invalid, and did not belong to this model's tokenizer. InvalidTokenId(TokenId), } #[derive(Error, Debug)] -/// Errors related to loading the vocabulary. -#[error("error loading vocabulary from {path}: {error}")] -pub struct VocabularyLoadError { - /// The path to the vocabulary. +/// Errors related to loading the tokenizer. +#[error("error loading tokenizer from {path}: {error}")] +pub struct TokenizerLoadError { + /// The path to the tokenizer. pub path: PathBuf, /// The error that occurred during loading. pub error: Box, } -impl VocabularyLoadError { +impl TokenizerLoadError { fn new(path: impl Into, error: impl Into>) -> Self { Self { path: path.into(), @@ -49,111 +48,118 @@ impl VocabularyLoadError { } #[derive(Clone, Debug, PartialEq)] -/// The source of a vocabulary. 
-pub enum VocabularySource { +/// The source of a tokenizer. +pub enum TokenizerSource { /// Read the vocabulary from the model if available, and use a simplistic tokenizer. /// /// This is easy to use, but may not be the best choice for your use case, and is not /// guaranteed to be available for all models. - Model, + Embedded, - /// Read the vocabulary from a local HuggingFace-format tokenizer file, and use the + /// Read the tokenizer from a local HuggingFace-format tokenizer file, and use the /// HuggingFace tokenizer. HuggingFaceTokenizerFile(PathBuf), - /// Fetch the vocabulary from a remote HuggingFace repository. This will make a blocking - /// HTTP request to HuggingFace to retrieve the vocabulary and may store files locally, + /// Fetch the tokenizer from a remote HuggingFace repository. This will make a blocking + /// HTTP request to HuggingFace to retrieve the tokenizer and may store files locally, /// so it is not recommended for production use. This will use the HuggingFace tokenizer. HuggingFaceRemote(String), } -impl VocabularySource { - /// Retrieve the vocabulary from the source. +impl TokenizerSource { + /// Retrieve the tokenizer from the source. /// - /// Note that this may make a blocking HTTP request to HuggingFace to retrieve the vocabulary + /// Note that this may make a blocking HTTP request to HuggingFace to retrieve the tokenizer. /// if `self` is [`Self::HuggingFaceRemote`]. - pub fn retrieve(self, model_path: &Path) -> Result { + pub fn retrieve(self, model_path: &Path) -> Result { Ok(match self { - Self::HuggingFaceRemote(identifier) => ExternalVocabulary::new( - Tokenizer::from_pretrained(&identifier, None) - .map_err(|error| VocabularyLoadError::new(model_path, error))?, + Self::HuggingFaceRemote(identifier) => HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_pretrained(&identifier, None) + .map_err(|error| TokenizerLoadError::new(model_path, error))?, ) .into(), Self::HuggingFaceTokenizerFile(path) => { if !path.is_file() { - return Err(VocabularyLoadError::new( + return Err(TokenizerLoadError::new( path, std::io::Error::new( std::io::ErrorKind::NotFound, - "Vocabulary file not found", + "Tokenizer was not a file, or did not exist", ), )); } - ExternalVocabulary::new( - Tokenizer::from_file(&path) - .map_err(|error| VocabularyLoadError::new(path, error))?, + HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_file(&path) + .map_err(|error| TokenizerLoadError::new(path, error))?, ) .into() } - Self::Model => ModelVocabulary::default().into(), + Self::Embedded => EmbeddedTokenizer::default().into(), }) } } -/// Vocabulary enum -pub enum Vocabulary { +/// Encapsulates the tokenizer for a model, and provides methods to tokenize text. +pub enum Tokenizer { /// The vocabulary built-in to the model. - Model(ModelVocabulary), + Embedded(EmbeddedTokenizer), - /// A custom vocabulary provided by the user. - External(ExternalVocabulary), + /// A Hugging Face tokenizer. + HuggingFace(HuggingFaceTokenizer), } -impl From for Vocabulary { - fn from(v: ModelVocabulary) -> Self { - Self::Model(v) +impl From for Tokenizer { + fn from(v: EmbeddedTokenizer) -> Self { + Self::Embedded(v) } } -impl From for Vocabulary { - fn from(v: ExternalVocabulary) -> Self { - Self::External(v) +impl From for Tokenizer { + fn from(v: HuggingFaceTokenizer) -> Self { + Self::HuggingFace(v) } } -impl Vocabulary { - /// Converts a token to the token ID it represents in this vocabulary. 
+impl Tokenizer { + /// Creates an empty embedded tokenizer, for contexts where you need a tokenizer but don't + /// need to tokenize anything. + pub(crate) fn empty_embedded() -> Self { + Self::Embedded(EmbeddedTokenizer::default()) + } +} +impl Tokenizer { + /// Converts a token to the token ID it represents in this tokenizer. pub fn id(&self, token: &[u8]) -> Option { match self { - Vocabulary::Model(v) => v.id(token), - Vocabulary::External(v) => v.id(token), + Tokenizer::Embedded(v) => v.id(token), + Tokenizer::HuggingFace(v) => v.id(token), } } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. pub fn token(&self, idx: usize) -> Vec { match self { - Vocabulary::Model(v) => v.token(idx), - Vocabulary::External(v) => v.token(idx), + Tokenizer::Embedded(v) => v.token(idx), + Tokenizer::HuggingFace(v) => v.token(idx), } } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. pub fn len(&self) -> usize { match self { - Vocabulary::Model(v) => v.len(), - Vocabulary::External(v) => v.len(), + Tokenizer::Embedded(v) => v.len(), + Tokenizer::HuggingFace(v) => v.len(), } } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. pub fn is_empty(&self) -> bool { match self { - Vocabulary::Model(v) => v.is_empty(), - Vocabulary::External(v) => v.is_empty(), + Tokenizer::Embedded(v) => v.is_empty(), + Tokenizer::HuggingFace(v) => v.is_empty(), } } - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. pub fn tokenize( @@ -162,31 +168,31 @@ impl Vocabulary { bos: bool, ) -> Result, TokenId)>, TokenizationError> { match self { - Vocabulary::Model(v) => v.tokenize(text, bos), - Vocabulary::External(v) => v.tokenize(text, bos), + Tokenizer::Embedded(v) => v.tokenize(text, bos), + Tokenizer::HuggingFace(v) => v.tokenize(text, bos), } } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. pub fn decode(&self, tokens: Vec, bos: bool) -> Vec { match self { - Vocabulary::Model(v) => v.decode(tokens, bos), - Vocabulary::External(v) => v.decode(tokens, bos), + Tokenizer::Embedded(v) => v.decode(tokens, bos), + Tokenizer::HuggingFace(v) => v.decode(tokens, bos), } } } #[derive(Debug, Error)] -/// Errors that can occur when using a model vocabulary. -pub enum ModelVocabularyError { - /// Arbitrary error that occurred during use of the model vocabulary. +/// Errors that can occur when using a model tokenizer. +pub enum ModelTokenizerError { + /// Arbitrary error that occurred during use of the model tokenizer. #[error("Arbitrary error: {0:?}")] Arbitrary(String), } -/// The built-in GGML vocabulary. +/// The built-in GGML tokenizer. #[derive(Debug, Clone, Default)] -pub struct ModelVocabulary { +pub struct EmbeddedTokenizer { // TODO: make these private /// Maps every integer (index) token ID to its corresponding token. pub id_to_token: Vec, @@ -198,12 +204,12 @@ pub struct ModelVocabulary { /// Maps a token to a token ID. pub token_to_id: HashMap, - /// The longest token in this vocabulary. + /// The longest token in this tokenizer. pub max_token_length: usize, } -impl ModelVocabulary { - /// Add a token to the vocabulary. +impl EmbeddedTokenizer { + /// Add a token to the internal vocabulary. 
/// /// The token added must have `id` directly after the last token in the vocabulary. /// @@ -229,23 +235,23 @@ impl ModelVocabulary { self.token_to_id.get(token).copied() } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. fn token(&self, idx: usize) -> Vec { self.id_to_token[idx].clone() } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. fn len(&self) -> usize { self.id_to_token.len() } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. fn is_empty(&self) -> bool { self.id_to_token.is_empty() } // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. fn tokenize( @@ -284,7 +290,7 @@ impl ModelVocabulary { let token_id = prev[i]; if token_id == 0 { return Err(TokenizationError::TokenizationFailed { - error: Box::new(ModelVocabularyError::Arbitrary( + error: Box::new(ModelTokenizerError::Arbitrary( "the backward pass for the tokenizer encountered a non-set token" .to_string(), )), @@ -306,7 +312,7 @@ impl ModelVocabulary { Ok(res) } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { let mut vec = vec![]; @@ -322,45 +328,45 @@ impl ModelVocabulary { } } -/// A vocabulary that does not originate from the model file. +/// A Hugging Face tokenizer. #[derive(Debug, Clone)] -pub struct ExternalVocabulary { - tokenizer: Tokenizer, +pub struct HuggingFaceTokenizer { + tokenizer: tokenizers::Tokenizer, } -impl ExternalVocabulary { - /// Create a new `ExternalVocabulary`. - pub fn new(tokenizer: Tokenizer) -> Self { +impl HuggingFaceTokenizer { + /// Create a new `HuggingFaceTokenizer`. + pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { Self { tokenizer } } } -impl ExternalVocabulary { +impl HuggingFaceTokenizer { fn id(&self, token: &[u8]) -> Option { self.tokenizer .token_to_id(std::str::from_utf8(token).unwrap()) } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. fn token(&self, idx: usize) -> Vec { self.tokenizer .decode(vec![idx as u32], true) - .expect("Cannot decode token from tokenizer vocabulary.") + .expect("Cannot decode token from tokenizer tokenizer.") .as_bytes() .to_vec() } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. fn len(&self) -> usize { self.tokenizer.get_vocab_size(false) } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. fn is_empty(&self) -> bool { self.tokenizer.get_vocab_size(false) == 0 } - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. fn tokenize( @@ -386,11 +392,11 @@ impl ExternalVocabulary { .collect()) } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. 
fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { self.tokenizer .decode(tokens, skip_special_tokens) - .expect("Cannot decode token from tokenizer vocabulary.") + .expect("Cannot decode token from tokenizer.") .as_bytes() .to_vec() } @@ -409,17 +415,17 @@ impl ExternalVocabulary { pub enum Prompt<'a> { /// A prompt specified as text. Text(&'a str), - /// A prompt specified as tokens for this model's vocabulary. + /// A prompt specified as tokens for this model's tokenizer. Tokens(&'a [TokenId]), } impl Prompt<'_> { - /// Converts this prompt to a list of tokens for this model's vocabulary. + /// Converts this prompt to a list of tokens for this model's tokenizer. /// /// Can return an error if [Self::Tokens] is used and includes a token ID that is not - /// in this model's vocabulary. + /// in this model's tokenizer. pub fn to_tokens( &self, - vocab: &Vocabulary, + vocab: &Tokenizer, beginning_of_sentence: bool, ) -> Result, TokenizationError> { Ok(match self { diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index a4a7fdeb..74207a1d 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -7,23 +7,23 @@ struct Args { model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, #[arg(long, short = 'q')] pub query: Option, #[arg(long, short = 'c')] pub comparands: Vec, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -31,7 +31,7 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let query = args @@ -53,7 +53,7 @@ fn main() { let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, model_params, llm::load_progress_callback_stdout, ) @@ -117,7 +117,7 @@ fn get_embeddings( all_logits: None, embeddings: Some(Vec::new()), }; - let vocab = model.vocabulary(); + let vocab = model.tokenizer(); let beginning_of_sentence = true; let query_token_ids = vocab .tokenize(query, beginning_of_sentence) diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs index d2385b8c..aa740b02 100644 --- a/crates/llm/examples/inference.rs +++ b/crates/llm/examples/inference.rs @@ -8,19 +8,19 @@ struct Args { #[arg(long, short = 'p')] prompt: Option, #[arg(long, short = 'v')] - vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] 
- vocabulary_repository: Option, + pub tokenizer_repository: Option, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -28,7 +28,7 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let prompt = args @@ -41,7 +41,7 @@ fn main() { let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index 98d94606..e08f0be3 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -7,19 +7,19 @@ struct Args { model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -27,13 +27,13 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 8adda7e7..30ea6c56 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -22,8 +22,8 @@ //! let llama = llm::load::( //! // path to GGML file //! std::path::Path::new("/path/to/model"), -//! // llm::VocabularySource -//! 
llm::VocabularySource::Model, +//! // llm::TokenizerSource +//! llm::TokenizerSource::Embedded, //! // llm::ModelParameters //! Default::default(), //! // load progress callback @@ -83,8 +83,8 @@ pub use llm_base::{ InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, - SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, - VocabularySource, + SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, + TokenizerSource, }; use serde::Serialize; @@ -233,21 +233,21 @@ impl Display for ModelArchitecture { pub fn load_dynamic( architecture: ModelArchitecture, path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { - use ModelArchitecture::*; + use ModelArchitecture as MA; fn load_model( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { Ok(Box::new(load::( path, - vocabulary_source, + tokenizer_source, params, load_progress_callback, )?)) @@ -255,30 +255,32 @@ pub fn load_dynamic( let model: Box = match architecture { #[cfg(feature = "bloom")] - Bloom => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Bloom => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gpt2")] - Gpt2 => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Gpt2 => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gptj")] - GptJ => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::GptJ => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gptneox")] - GptNeoX => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::GptNeoX => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "llama")] - Llama => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Llama => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "mpt")] - Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, + MA::Mpt => { + load_model::(path, tokenizer_source, params, load_progress_callback)? + } #[cfg(feature = "falcon")] - Falcon => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Falcon => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } }; diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index d44f143e..18cd5e5b 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The BLOOM model. 
Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) @@ -20,7 +20,7 @@ pub struct Bloom { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -50,7 +50,7 @@ impl KnownModel for Bloom { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -96,7 +96,7 @@ impl KnownModel for Bloom { Ok(Bloom { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, norm_bias, @@ -369,8 +369,8 @@ impl KnownModel for Bloom { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -378,11 +378,11 @@ impl KnownModel for Bloom { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("".as_bytes()) + self.tokenizer.id("".as_bytes()) } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("".as_bytes()).unwrap() + self.tokenizer.id("".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 3b989e26..8ee37453 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -14,7 +14,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) @@ -27,7 +27,7 @@ pub struct Falcon { hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -52,7 +52,7 @@ impl KnownModel for Falcon { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -88,7 +88,7 @@ impl KnownModel for Falcon { Ok(Falcon { hyperparameters, context_size, - vocabulary, + tokenizer, tok_embeddings, output_norm, output_norm_b, @@ -328,9 +328,8 @@ impl KnownModel for Falcon { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - /// Returns the vocabulary used by this model. 
- fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -342,7 +341,7 @@ impl KnownModel for Falcon { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index abc0726d..1b2427a5 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The GPT-2 model. Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) @@ -20,7 +20,7 @@ pub struct Gpt2 { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +49,7 @@ impl KnownModel for Gpt2 { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -88,7 +88,7 @@ impl KnownModel for Gpt2 { Ok(Gpt2 { hyperparameters, context_size, - vocabulary, + tokenizer, layers, ln_f_g, ln_f_b, @@ -323,8 +323,8 @@ impl KnownModel for Gpt2 { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -336,7 +336,7 @@ impl KnownModel for Gpt2 { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5ec7d5bc..92ee8f4a 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -8,8 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-J model. 
Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) @@ -21,7 +20,7 @@ pub struct GptJ { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +48,7 @@ impl KnownModel for GptJ { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -89,7 +88,7 @@ impl KnownModel for GptJ { Ok(GptJ { hyperparameters, context_size, - vocabulary, + tokenizer, ln_f_g, ln_f_b, wte, @@ -292,8 +291,8 @@ impl KnownModel for GptJ { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -305,7 +304,7 @@ impl KnownModel for GptJ { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 5b4ea0c0..84a5c417 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -9,8 +9,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-NeoX model. Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -22,7 +21,7 @@ pub struct GptNeoX { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +48,7 @@ impl KnownModel for GptNeoX { fn new( hyperparameters: Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -103,7 +102,7 @@ impl KnownModel for GptNeoX { Ok(GptNeoX { hyperparameters, context_size, - vocabulary, + tokenizer, ln_f_g, ln_f_b, wte, @@ -338,8 +337,8 @@ impl KnownModel for GptNeoX { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -351,7 +350,7 @@ impl KnownModel for GptNeoX { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index d4abb2e1..025352fd 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -7,8 +7,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The LLaMA model. 
Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -20,7 +19,7 @@ pub struct Llama { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -46,7 +45,7 @@ impl KnownModel for Llama { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -80,7 +79,7 @@ impl KnownModel for Llama { Ok(Self { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, output, @@ -322,9 +321,8 @@ impl KnownModel for Llama { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - /// Returns the vocabulary used by this model. - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 10ce78e9..56e129e4 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml::{self}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The MosaicML Pretrained Transformer (MPT) model. Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) @@ -20,7 +20,7 @@ pub struct Mpt { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -44,7 +44,7 @@ impl KnownModel for Mpt { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -77,7 +77,7 @@ impl KnownModel for Mpt { Ok(Mpt { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, layers, @@ -271,9 +271,8 @@ impl KnownModel for Mpt { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } - /// Returns the vocabulary used by this model. 
- fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -281,11 +280,11 @@ impl KnownModel for Mpt { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("<|padding|>".as_bytes()) + self.tokenizer.id("<|padding|>".as_bytes()) } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { From 60d61688d23b4ae6fe4a514c98bd1fe0154d03c5 Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 03:19:42 +0200 Subject: [PATCH 19/21] refactor(tokenizer): split into multiple files --- crates/llm-base/src/quantize.rs | 7 +- crates/llm-base/src/tokenizer/embedded.rs | 157 ++++++++++++ crates/llm-base/src/tokenizer/huggingface.rs | 75 ++++++ .../src/{tokenizer.rs => tokenizer/mod.rs} | 226 +----------------- 4 files changed, 238 insertions(+), 227 deletions(-) create mode 100644 crates/llm-base/src/tokenizer/embedded.rs create mode 100644 crates/llm-base/src/tokenizer/huggingface.rs rename crates/llm-base/src/{tokenizer.rs => tokenizer/mod.rs} (60%) diff --git a/crates/llm-base/src/quantize.rs b/crates/llm-base/src/quantize.rs index 187a6fc0..d3d2a0cf 100644 --- a/crates/llm-base/src/quantize.rs +++ b/crates/llm-base/src/quantize.rs @@ -181,12 +181,7 @@ pub fn quantize( } let tokenizer = match tokenizer { - Tokenizer::Embedded(v) => v - .id_to_token - .iter() - .cloned() - .zip(v.id_to_token_score) - .collect::>(), + Tokenizer::Embedded(v) => v.iter().collect::>(), Tokenizer::HuggingFace(_) => vec![], }; diff --git a/crates/llm-base/src/tokenizer/embedded.rs b/crates/llm-base/src/tokenizer/embedded.rs new file mode 100644 index 00000000..cf96b183 --- /dev/null +++ b/crates/llm-base/src/tokenizer/embedded.rs @@ -0,0 +1,157 @@ +use std::collections::HashMap; + +use thiserror::Error; + +use super::{Token, TokenId, TokenScore, TokenizationError}; + +#[derive(Debug, Error)] +/// Errors that can occur when using a model tokenizer. +pub enum EmbeddedTokenizerError { + /// Arbitrary error that occurred during use of the model tokenizer. + #[error("Arbitrary error: {0:?}")] + Arbitrary(String), +} + +/// The built-in GGML tokenizer. +#[derive(Debug, Clone, Default)] +pub struct EmbeddedTokenizer { + /// Maps every integer (index) token ID to its corresponding token. + id_to_token: Vec, + + /// Maps every integer (index) token ID to corresponding score. + id_to_token_score: Vec, + + // todo: use a radix tree + /// Maps a token to a token ID. + token_to_id: HashMap, + + /// The longest token in this tokenizer. + max_token_length: usize, +} + +impl EmbeddedTokenizer { + /// Add a token to the internal vocabulary. + /// + /// The token added must have `id` directly after the last token in the vocabulary. + /// + /// # Panics + /// - This function can panic if `id` does not correspond to the next token in the vocabulary. + /// That is, if there are already `n` tokens in the vocabulary, then `id` must be `n`. + pub(crate) fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) { + // These are loader invariants. If this is broken, then the loader is broken and this is a bug, + // not an issue with the model itself. 
+ assert_eq!(self.id_to_token.len(), self.id_to_token_score.len()); + if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize { + let expected_id = self.id_to_token.len() as TokenId; + panic!("the id of token added should be {expected_id}; is {id}"); + } + + self.max_token_length = self.max_token_length.max(content.len()); + self.id_to_token.push(content.clone()); + self.id_to_token_score.push(score); + self.token_to_id.insert(content, id); + } + + pub(crate) fn id(&self, token: &[u8]) -> Option { + self.token_to_id.get(token).copied() + } + + /// Converts a token index to the token it represents in this tokenizer. + pub(crate) fn token(&self, idx: usize) -> Vec { + self.id_to_token[idx].clone() + } + + /// Returns the number of tokens in the tokenizer. + pub(crate) fn len(&self) -> usize { + self.id_to_token.len() + } + + /// Returns whether the tokenizer is empty. + pub(crate) fn is_empty(&self) -> bool { + self.id_to_token.is_empty() + } + + // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece + /// Tokenize a `text` with this tokenizer. + /// + /// `bos` controls whether a beginning-of-string token should be inserted. + pub(crate) fn tokenize( + &self, + text: &str, + bos: bool, + ) -> Result, TokenId)>, TokenizationError> { + let len = text.len(); + + let mut score = vec![0usize; len + 1]; + let mut prev = vec![TokenId::default(); len + 1]; + + for i in 0..len { + let max_len = (len - i).min(self.max_token_length); + for sub_len in 1..=max_len { + let sub = &text.as_bytes()[i..i + sub_len]; + let token = self.token_to_id.get(sub); + + if let Some(token) = token { + let token_score = sub.len() * sub.len(); + let local_score = score[i] + token_score; + let next = i + sub_len; + + if score[next] < local_score { + score[next] = local_score; + prev[next] = *token; + } + } + } + } + + // Backward pass + let mut res = vec![]; + let mut i = len; + while i > 0 { + let token_id = prev[i]; + if token_id == 0 { + return Err(TokenizationError::TokenizationFailed { + error: Box::new(EmbeddedTokenizerError::Arbitrary( + "the backward pass for the tokenizer encountered a non-set token" + .to_string(), + )), + }); + } + let token = self.id_to_token[token_id as usize].as_slice(); + res.push((token.to_vec(), token_id)); + i -= token.len(); + } + + if bos { + // TODO: replace with vocab.bos + res.push((vec![], 1)); + } + + // Pieces are in reverse order so correct that + res.reverse(); + + Ok(res) + } + + /// Decode a list `tokens` with this tokenizer. + pub(crate) fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { + let mut vec = vec![]; + + for token in tokens { + if skip_special_tokens && token == 1 { + continue; + } + + vec.append(&mut self.id_to_token[token as usize].to_vec()); + } + + vec + } + + pub(crate) fn iter(&self) -> impl Iterator + '_ { + self.id_to_token + .iter() + .zip(self.id_to_token_score.iter()) + .map(|(token, score)| (token.clone(), *score)) + } +} diff --git a/crates/llm-base/src/tokenizer/huggingface.rs b/crates/llm-base/src/tokenizer/huggingface.rs new file mode 100644 index 00000000..8f3a5565 --- /dev/null +++ b/crates/llm-base/src/tokenizer/huggingface.rs @@ -0,0 +1,75 @@ +use super::{TokenId, TokenizationError}; + +/// A Hugging Face tokenizer. +#[derive(Debug, Clone)] +pub struct HuggingFaceTokenizer { + pub(crate) tokenizer: tokenizers::Tokenizer, +} + +impl HuggingFaceTokenizer { + /// Create a new `HuggingFaceTokenizer`. 
+ pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { + Self { tokenizer } + } +} + +impl HuggingFaceTokenizer { + pub(crate) fn id(&self, token: &[u8]) -> Option { + self.tokenizer + .token_to_id(std::str::from_utf8(token).unwrap()) + } + + /// Converts a token index to the token it represents in this tokenizer. + pub(crate) fn token(&self, idx: usize) -> Vec { + self.tokenizer + .decode(vec![idx as u32], true) + .expect("Cannot decode token from tokenizer tokenizer.") + .as_bytes() + .to_vec() + } + + /// Returns the number of tokens in the tokenizer. + pub(crate) fn len(&self) -> usize { + self.tokenizer.get_vocab_size(false) + } + + /// Returns whether the tokenizer is empty. + pub(crate) fn is_empty(&self) -> bool { + self.tokenizer.get_vocab_size(false) == 0 + } + + /// Tokenize a `text` with this tokenizer. + /// + /// `bos` controls whether a beginning-of-string token should be inserted. + pub(crate) fn tokenize( + &self, + text: &str, + bos: bool, + ) -> Result, TokenId)>, TokenizationError> { + let encoding = self + .tokenizer + .encode(text, false) + .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; + + let encoding = self + .tokenizer + .post_process(encoding, None, bos) + .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; + + Ok(encoding + .get_tokens() + .iter() + .map(|t| t.as_bytes().to_vec()) + .zip(encoding.get_ids().iter().copied()) + .collect()) + } + + /// Decode a list `tokens` with this tokenizer. + pub(crate) fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { + self.tokenizer + .decode(tokens, skip_special_tokens) + .expect("Cannot decode token from tokenizer.") + .as_bytes() + .to_vec() + } +} diff --git a/crates/llm-base/src/tokenizer.rs b/crates/llm-base/src/tokenizer/mod.rs similarity index 60% rename from crates/llm-base/src/tokenizer.rs rename to crates/llm-base/src/tokenizer/mod.rs index b914eb3f..8f8fc69d 100644 --- a/crates/llm-base/src/tokenizer.rs +++ b/crates/llm-base/src/tokenizer/mod.rs @@ -1,5 +1,4 @@ use std::{ - collections::HashMap, error::Error, fmt::Display, path::{Path, PathBuf}, @@ -8,6 +7,11 @@ use std::{ use thiserror::Error; +mod embedded; +pub use embedded::*; +mod huggingface; +pub use huggingface::*; + /// The identifier of a token in a tokenizer. pub type TokenId = u32; pub(crate) type Token = Vec; @@ -182,226 +186,6 @@ impl Tokenizer { } } -#[derive(Debug, Error)] -/// Errors that can occur when using a model tokenizer. -pub enum ModelTokenizerError { - /// Arbitrary error that occurred during use of the model tokenizer. - #[error("Arbitrary error: {0:?}")] - Arbitrary(String), -} - -/// The built-in GGML tokenizer. -#[derive(Debug, Clone, Default)] -pub struct EmbeddedTokenizer { - // TODO: make these private - /// Maps every integer (index) token ID to its corresponding token. - pub id_to_token: Vec, - - /// Maps every integer (index) token ID to corresponding score. - pub id_to_token_score: Vec, - - // todo: use a radix tree - /// Maps a token to a token ID. - pub token_to_id: HashMap, - - /// The longest token in this tokenizer. - pub max_token_length: usize, -} - -impl EmbeddedTokenizer { - /// Add a token to the internal vocabulary. - /// - /// The token added must have `id` directly after the last token in the vocabulary. - /// - /// # Panics - /// - This function can panic if `id` does not correspond to the next token in the vocabulary. - /// That is, if there are already `n` tokens in the vocabulary, then `id` must be `n`. 
- pub(crate) fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) { - // These are loader invariants. If this is broken, then the loader is broken and this is a bug, - // not an issue with the model itself. - assert_eq!(self.id_to_token.len(), self.id_to_token_score.len()); - if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize { - let expected_id = self.id_to_token.len() as TokenId; - panic!("the id of token added should be {expected_id}; is {id}"); - } - - self.max_token_length = self.max_token_length.max(content.len()); - self.id_to_token.push(content.clone()); - self.id_to_token_score.push(score); - self.token_to_id.insert(content, id); - } - - fn id(&self, token: &[u8]) -> Option { - self.token_to_id.get(token).copied() - } - - /// Converts a token index to the token it represents in this tokenizer. - fn token(&self, idx: usize) -> Vec { - self.id_to_token[idx].clone() - } - - /// Returns the number of tokens in the tokenizer. - fn len(&self) -> usize { - self.id_to_token.len() - } - - /// Returns whether the tokenizer is empty. - fn is_empty(&self) -> bool { - self.id_to_token.is_empty() - } - - // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece - /// Tokenize a `text` with this tokenizer. - /// - /// `bos` controls whether a beginning-of-string token should be inserted. - fn tokenize( - &self, - text: &str, - bos: bool, - ) -> Result, TokenId)>, TokenizationError> { - let len = text.len(); - - let mut score = vec![0usize; len + 1]; - let mut prev = vec![TokenId::default(); len + 1]; - - for i in 0..len { - let max_len = (len - i).min(self.max_token_length); - for sub_len in 1..=max_len { - let sub = &text.as_bytes()[i..i + sub_len]; - let token = self.token_to_id.get(sub); - - if let Some(token) = token { - let token_score = sub.len() * sub.len(); - let local_score = score[i] + token_score; - let next = i + sub_len; - - if score[next] < local_score { - score[next] = local_score; - prev[next] = *token; - } - } - } - } - - // Backward pass - let mut res = vec![]; - let mut i = len; - while i > 0 { - let token_id = prev[i]; - if token_id == 0 { - return Err(TokenizationError::TokenizationFailed { - error: Box::new(ModelTokenizerError::Arbitrary( - "the backward pass for the tokenizer encountered a non-set token" - .to_string(), - )), - }); - } - let token = self.id_to_token[token_id as usize].as_slice(); - res.push((token.to_vec(), token_id)); - i -= token.len(); - } - - if bos { - // TODO: replace with vocab.bos - res.push((vec![], 1)); - } - - // Pieces are in reverse order so correct that - res.reverse(); - - Ok(res) - } - - /// Decode a list `tokens` with this tokenizer. - fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { - let mut vec = vec![]; - - for token in tokens { - if skip_special_tokens && token == 1 { - continue; - } - - vec.append(&mut self.id_to_token[token as usize].to_vec()); - } - - vec - } -} - -/// A Hugging Face tokenizer. -#[derive(Debug, Clone)] -pub struct HuggingFaceTokenizer { - tokenizer: tokenizers::Tokenizer, -} - -impl HuggingFaceTokenizer { - /// Create a new `HuggingFaceTokenizer`. - pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { - Self { tokenizer } - } -} - -impl HuggingFaceTokenizer { - fn id(&self, token: &[u8]) -> Option { - self.tokenizer - .token_to_id(std::str::from_utf8(token).unwrap()) - } - - /// Converts a token index to the token it represents in this tokenizer. 
- fn token(&self, idx: usize) -> Vec { - self.tokenizer - .decode(vec![idx as u32], true) - .expect("Cannot decode token from tokenizer tokenizer.") - .as_bytes() - .to_vec() - } - - /// Returns the number of tokens in the tokenizer. - fn len(&self) -> usize { - self.tokenizer.get_vocab_size(false) - } - - /// Returns whether the tokenizer is empty. - fn is_empty(&self) -> bool { - self.tokenizer.get_vocab_size(false) == 0 - } - - /// Tokenize a `text` with this tokenizer. - /// - /// `bos` controls whether a beginning-of-string token should be inserted. - fn tokenize( - &self, - text: &str, - bos: bool, - ) -> Result, TokenId)>, TokenizationError> { - let encoding = self - .tokenizer - .encode(text, false) - .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; - - let encoding = self - .tokenizer - .post_process(encoding, None, bos) - .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; - - Ok(encoding - .get_tokens() - .iter() - .map(|t| t.as_bytes().to_vec()) - .zip(encoding.get_ids().iter().copied()) - .collect()) - } - - /// Decode a list `tokens` with this tokenizer. - fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { - self.tokenizer - .decode(tokens, skip_special_tokens) - .expect("Cannot decode token from tokenizer.") - .as_bytes() - .to_vec() - } -} - #[derive(Debug, PartialEq, Clone, Copy)] /// Represents the prompt, which can be specified as either text or tokens. /// From 7e2f2bf059f3239681c85346323196155bf522c4 Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 30 Jun 2023 01:38:13 +0200 Subject: [PATCH 20/21] fix #298: don't send new bytes if invalid decoding --- crates/llm-base/src/inference_session.rs | 36 ++++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 4d1489a4..8a1a85e6 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -304,13 +304,10 @@ impl InferenceSession { let mut token = match model.tokenizer() { crate::Tokenizer::Embedded(_) => model.tokenizer().token(tk as usize).to_vec(), crate::Tokenizer::HuggingFace(_) => { - let mut previous_tokens = self.tokens.clone(); - previous_tokens.push(tk); + let mut tokens = self.tokens.clone(); + tokens.push(tk); - let all_tokens = model.tokenizer().decode(previous_tokens, true); - let splitted = all_tokens.split_at(self.decoded_tokens.len()); - - splitted.1.to_vec() + get_newly_decoded_portion_huggingface(model, tokens, &self.decoded_tokens) } }; @@ -363,12 +360,11 @@ impl InferenceSession { crate::Tokenizer::Embedded(_) => { model.tokenizer().token(next_token as usize).to_vec() } - crate::Tokenizer::HuggingFace(_) => { - let all_tokens = model.tokenizer().decode(self.tokens.clone(), true); - let splitted = all_tokens.split_at(self.decoded_tokens.len()); - - splitted.1.to_vec() - } + crate::Tokenizer::HuggingFace(_) => get_newly_decoded_portion_huggingface( + model, + self.tokens.clone(), + &self.decoded_tokens, + ), }; self.decoded_tokens.append(&mut res.clone()); @@ -595,6 +591,22 @@ impl InferenceSession { } } +fn get_newly_decoded_portion_huggingface( + model: &dyn Model, + tokens: Vec, + decoded_tokens: &[u8], +) -> Vec { + let all_tokens = model.tokenizer().decode(tokens, true); + // The bytes here come from a lossily-decoded String, so we need to convert it back to a String + // to check if it ends with a replacement character. 
+ let all_tokens = unsafe { String::from_utf8_unchecked(all_tokens) }; + if all_tokens.ends_with('�') { + // Return an empty vector: no valid text was generated from this token. + return vec![]; + } + all_tokens.as_bytes()[decoded_tokens.len()..].to_vec() +} + #[derive(Error, Debug)] /// Errors encountered during the inference process. pub enum InferenceError { From 9a222690cd0a9e3322bb6a926d54e90d08fb2c0f Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 30 Jun 2023 03:17:14 +0200 Subject: [PATCH 21/21] fix #317 - cli move architecture into subcommands --- binaries/llm-cli/src/cli_args.rs | 69 +++------ binaries/llm-cli/src/main.rs | 220 +++++++++++++++-------------- crates/llm-base/src/loader.rs | 12 +- crates/llm-base/src/model/mod.rs | 3 +- crates/llm/examples/embeddings.rs | 2 +- crates/llm/examples/inference.rs | 2 +- crates/llm/examples/vicuna-chat.rs | 2 +- crates/llm/src/lib.rs | 91 +++++++----- 8 files changed, 208 insertions(+), 193 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index ce7db33f..0da6e3c5 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -1,6 +1,6 @@ use std::{fmt, ops::Deref, path::PathBuf, sync::Arc}; -use clap::{Parser, Subcommand, ValueEnum}; +use clap::{Parser, ValueEnum}; use color_eyre::eyre::{bail, Result, WrapErr}; use llm::{ ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias, @@ -11,50 +11,6 @@ use rand::SeedableRng; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] pub enum Args { - /// Use a BLOOM model - Bloom { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-2 model - Gpt2 { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-J model - #[clap(id = "gptj")] - GptJ { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-NeoX model - #[clap(id = "gptneox")] - GptNeoX { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a LLaMA model - Llama { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a MPT model - #[clap(id = "mpt")] - Mpt { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a Falcon model - #[clap(id = "falcon")] - #[cfg(feature = "falcon")] - Falcon { - #[command(subcommand)] - args: BaseArgs, - }, -} - -#[derive(Subcommand, Debug)] -pub enum BaseArgs { #[command()] /// Use a model to infer the next tokens in a sequence, and exit. Infer(Box), @@ -156,7 +112,7 @@ pub struct Info { pub tensors: bool, /// Show all of the tokens in the tokenizer. - #[arg(long, short = 'v')] + #[arg(long, short = 'k')] pub tokenizer: bool, } @@ -372,12 +328,22 @@ impl ModelTokenizer { } } +#[derive(Parser, Debug)] +pub struct ModelArchitecture { + /// The model architecture to use. Will attempt to guess if not specified. 
+ #[arg(long, short = 'a')] + pub model_architecture: Option, +} + #[derive(Parser, Debug)] pub struct ModelAndTokenizer { /// Where to load the model from #[arg(long, short = 'm')] pub model_path: PathBuf, + #[command(flatten)] + pub architecture: ModelArchitecture, + #[command(flatten)] pub tokenizer: ModelTokenizer, } @@ -415,7 +381,7 @@ pub struct ModelLoad { pub lora_paths: Option>, } impl ModelLoad { - pub fn load(&self, use_gpu: bool) -> Result> { + pub fn load(&self, use_gpu: bool) -> Result> { let params = ModelParameters { prefer_mmap: !self.no_mmap, context_size: self.num_ctx_tokens, @@ -441,7 +407,8 @@ impl ModelLoad { } }; - let model = llm::load::( + let model = llm::load_dynamic( + self.model_and_tokenizer.architecture.model_architecture, &self.model_and_tokenizer.model_path, tokenizer_source, params, @@ -496,7 +463,6 @@ impl ModelLoad { } }, ) - .map(Box::new) .wrap_err("Could not load model"); if model.is_err() { @@ -507,7 +473,7 @@ impl ModelLoad { } } - Ok(model?) + model } } @@ -548,6 +514,9 @@ impl PromptFile { #[derive(Parser, Debug)] pub struct Quantize { + #[command(flatten)] + pub architecture: ModelArchitecture, + /// The path to the model to quantize #[arg()] pub source: PathBuf, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 45e4c127..443f6733 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -5,8 +5,8 @@ use std::{ }; use clap::Parser; -use cli_args::{Args, BaseArgs}; -use color_eyre::eyre::{Context, Result}; +use cli_args::Args; +use color_eyre::eyre::{Context, ContextCompat, Result}; use llm::{InferenceError, InferenceFeedback, InferenceResponse}; use rustyline::{ error::ReadlineError, @@ -25,35 +25,22 @@ fn main() -> Result<()> { .init(); color_eyre::install()?; - let cli_args = Args::parse(); - match &cli_args { - Args::Llama { args } => handle_args::(args), - Args::Bloom { args } => handle_args::(args), - Args::Gpt2 { args } => handle_args::(args), - Args::GptJ { args } => handle_args::(args), - Args::GptNeoX { args } => handle_args::(args), - Args::Mpt { args } => handle_args::(args), - #[cfg(feature = "falcon")] - Args::Falcon { args } => handle_args::(args), - } -} - -fn handle_args(args: &cli_args::BaseArgs) -> Result<()> { + let args = Args::parse(); match args { - BaseArgs::Infer(args) => infer::(args), - BaseArgs::Perplexity(args) => perplexity::(args), - BaseArgs::Info(args) => info::(args), - BaseArgs::PromptTokens(args) => prompt_tokens::(args), - BaseArgs::Repl(args) => interactive::(args, false), - BaseArgs::Chat(args) => interactive::(args, true), - BaseArgs::Quantize(args) => quantize::(args), + Args::Infer(args) => infer(&args), + Args::Perplexity(args) => perplexity(&args), + Args::Info(args) => info(&args), + Args::PromptTokens(args) => prompt_tokens(&args), + Args::Repl(args) => interactive(&args, false), + Args::Chat(args) => interactive(&args, true), + Args::Quantize(args) => quantize(&args), } } -fn infer(args: &cli_args::Infer) -> Result<()> { +fn infer(args: &cli_args::Infer) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, session_loaded) = snapshot::read_or_create_session( model.as_ref(), @@ -118,10 +105,10 @@ fn infer(args: &cli_args::Infer) -> Result<()> { Ok(()) } -fn perplexity(args: &cli_args::Perplexity) 
-> Result<()> { +fn perplexity(args: &cli_args::Perplexity) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, _) = snapshot::read_or_create_session( model.as_ref(), None, @@ -142,48 +129,62 @@ fn perplexity(args: &cli_args::Perplexity) -> Resu Ok(()) } -fn info(args: &cli_args::Info) -> Result<()> { - let model_path = &args.model_and_tokenizer.model_path; - let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; +fn info(args: &cli_args::Info) -> Result<()> { + struct InfoVisitor<'a>(&'a cli_args::Info); + impl llm::ModelArchitectureVisitor> for InfoVisitor<'_> { + fn visit(&mut self) -> Result<()> { + let args = self.0; - let file = File::open(model_path)?; - let mut reader = BufReader::new(&file); - let mut loader: llm::Loader = llm::Loader::new(tokenizer, |_| { - // We purposely do not print progress here, as we are only interested in the metadata - }); + let model_path = &args.model_and_tokenizer.model_path; + let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; - llm::ggml_format::load(&mut reader, &mut loader)?; + let file = File::open(model_path)?; + let mut reader = BufReader::new(&file); + let mut loader: llm::Loader = + llm::Loader::new(tokenizer, |_| { + // We purposely do not print progress here, as we are only interested in the metadata + }); - log::info!("Container type: {:?}", loader.container_type); - log::info!("Hyperparameters: {:?}", loader.hyperparameters); - log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); + llm::ggml_format::load(&mut reader, &mut loader)?; - if args.tokenizer { - log::info!("Tokens:"); - for i in 0..loader.tokenizer.len() { - log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); - } - } + log::info!("Container type: {:?}", loader.container_type); + log::info!("Hyperparameters: {:?}", loader.hyperparameters); + log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); - if args.tensors { - log::info!("Tensors:"); - for (name, tensor) in &loader.tensors { - log::info!("- {} ({:?} {:?})", name, tensor.element_type, tensor.dims()); - } - } + if args.tokenizer { + log::info!("Tokens:"); + for i in 0..loader.tokenizer.len() { + log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); + } + } - fn utf8_or_array(token: &[u8]) -> String { - std::str::from_utf8(token) - .map(|s| s.to_owned()) - .unwrap_or(format!("{:?}", token)) + if args.tensors { + log::info!("Tensors:"); + for (name, tensor) in &loader.tensors { + log::info!("- {} ({:?} {:?})", name, tensor.element_type, tensor.dims()); + } + } + + fn utf8_or_array(token: &[u8]) -> String { + std::str::from_utf8(token) + .map(|s| s.to_owned()) + .unwrap_or(format!("{:?}", token)) + } + + Ok(()) + } } - Ok(()) + args.model_and_tokenizer + .architecture + .model_architecture + .wrap_err("a model architecture is required at present")? 
+ .visit(&mut InfoVisitor(args)) } -fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { +fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); - let model = args.model_load.load::(false)?; + let model = args.model_load.load(false)?; let toks = match model.tokenizer().tokenize(&prompt, false) { Ok(toks) => toks, Err(e) => { @@ -222,7 +223,7 @@ fn force_newline_event_seq() -> KeyEvent { KeyEvent(KeyCode::Enter, Modifiers::SHIFT) } -fn interactive( +fn interactive( args: &cli_args::Repl, // If set to false, the session will be cloned after each inference // to ensure that previous state is not carried over. @@ -230,7 +231,7 @@ fn interactive( ) -> Result<()> { let prompt_file = args.prompt_file.contents(); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, mut session_loaded) = snapshot::read_or_create_session( model.as_ref(), None, @@ -318,51 +319,64 @@ fn interactive( Ok(()) } -fn quantize(args: &cli_args::Quantize) -> Result<()> { +fn quantize(args: &cli_args::Quantize) -> Result<()> { use llm::QuantizeProgress; - let mut source = BufReader::new(std::fs::File::open(&args.source)?); - let mut destination = BufWriter::new(std::fs::File::create(&args.destination)?); - let tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; - - llm::quantize::( - &mut source, - &mut destination, - tokenizer, - args.container_type.into(), - args.target.into(), - |progress| match progress { - QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), - QuantizeProgress::TensorLoading { - name, - dims, - element_type, - n_elements, - } => log::info!( - "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" - ), - QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), - QuantizeProgress::TensorQuantized { - name, - original_size, - reduced_size, - history, - } => log::info!( - "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" - ), - QuantizeProgress::TensorSkipped { name, size } => { - log::info!("Skipped tensor `{name}` ({size} bytes)") - } - QuantizeProgress::Finished { - original_size, - reduced_size, - history, - } => log::info!( - "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" - ), - }, - ) - .wrap_err("failed to quantize model") + struct QuantizeVisitor<'a>(&'a cli_args::Quantize); + impl llm::ModelArchitectureVisitor> for QuantizeVisitor<'_> { + fn visit(&mut self) -> Result<()> { + let args = self.0; + + let mut source: BufReader = BufReader::new(std::fs::File::open(&args.source)?); + let mut destination: BufWriter = + BufWriter::new(std::fs::File::create(&args.destination)?); + let tokenizer: llm::Tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; + + llm::quantize::( + &mut source, + &mut destination, + tokenizer, + args.container_type.into(), + args.target.into(), + |progress| match progress { + QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), + QuantizeProgress::TensorLoading { + name, + dims, + element_type, + n_elements, + } => log::info!( + "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" + ), + QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), + 
QuantizeProgress::TensorQuantized { + name, + original_size, + reduced_size, + history, + } => log::info!( + "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" + ), + QuantizeProgress::TensorSkipped { name, size } => { + log::info!("Skipped tensor `{name}` ({size} bytes)") + } + QuantizeProgress::Finished { + original_size, + reduced_size, + history, + } => log::info!( + "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" + ), + }, + ) + .wrap_err("failed to quantize model") + } + } + + args.architecture + .model_architecture + .wrap_err("the architecture must be known for quantization")? + .visit(&mut QuantizeVisitor(args)) } fn load_prompt_file_with_prompt( diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 13b04516..1725535b 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -327,13 +327,21 @@ pub enum LoadError { }, /// The tokenizer could not be loaded. #[error("could not load tokenizer {path:?}: {error}")] - TokenizerLoadError { + TokenizerLoadFail { /// The invalid tokenizer path path: PathBuf, /// The error that occurred. error: Box, }, + /// There is insufficient information to guess the model architecture from the provided file. + /// + /// A model architecture must be provided to load the model. + #[error("could not guess model architecture from {path:?}")] + MissingModelArchitecture { + /// The path that failed. + path: PathBuf, + }, } impl From for LoadError { fn from(value: util::FindAllModelFilesError) -> Self { @@ -345,7 +353,7 @@ impl From for LoadError { } impl From for LoadError { fn from(value: TokenizerLoadError) -> Self { - LoadError::TokenizerLoadError { + LoadError::TokenizerLoadFail { path: value.path, error: value.error, } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index bee50f37..45eb8650 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -177,7 +177,8 @@ pub enum HyperparametersWriteError { InvalidIntegerConversion(#[from] std::num::TryFromIntError), } -/// Parameters for tuning model instances +/// Parameters for model-wide behaviour. +#[derive(Debug, Clone)] pub struct ModelParameters { /// For [GGML formats](ggml::ContainerType) that support it, [mmap](https://en.wikipedia.org/wiki/Mmap) /// is the default. 
Although mmap typically improves performance, setting this value to `false` may diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 74207a1d..0a6a999a 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -51,7 +51,7 @@ fn main() { // Load model let model_params = llm::ModelParameters::default(); let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, model_params, diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs index aa740b02..51e7369a 100644 --- a/crates/llm/examples/inference.rs +++ b/crates/llm/examples/inference.rs @@ -39,7 +39,7 @@ fn main() { let now = std::time::Instant::now(); let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, Default::default(), diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index e08f0be3..7cdeb1d1 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -31,7 +31,7 @@ fn main() { let model_architecture = args.model_architecture; let model_path = args.model_path; let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, Default::default(), diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 30ea6c56..c165deb5 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -153,6 +153,33 @@ impl ModelArchitecture { ]; } +/// Used to dispatch some code based on the model architecture. +pub trait ModelArchitectureVisitor { + /// Visit a model architecture. + fn visit(&mut self) -> R; +} +impl ModelArchitecture { + /// Use a visitor to dispatch some code based on the model architecture. + pub fn visit(&self, visitor: &mut impl ModelArchitectureVisitor) -> R { + match self { + #[cfg(feature = "bloom")] + Self::Bloom => visitor.visit::(), + #[cfg(feature = "gpt2")] + Self::Gpt2 => visitor.visit::(), + #[cfg(feature = "gptj")] + Self::GptJ => visitor.visit::(), + #[cfg(feature = "gptneox")] + Self::GptNeoX => visitor.visit::(), + #[cfg(feature = "llama")] + Self::Llama => visitor.visit::(), + #[cfg(feature = "mpt")] + Self::Mpt => visitor.visit::(), + #[cfg(feature = "falcon")] + Self::Falcon => visitor.visit::(), + } + } +} + /// An unsupported model architecture was specified. pub struct UnsupportedModelArchitecture(String); impl Display for UnsupportedModelArchitecture { @@ -227,18 +254,17 @@ impl Display for ModelArchitecture { } /// A helper function that loads the specified model from disk using an architecture -/// specified at runtime. +/// specified at runtime. If no architecture is specified, it will try to infer it +/// from the model's metadata. /// /// A wrapper around [load] that dispatches to the correct model. pub fn load_dynamic( - architecture: ModelArchitecture, + architecture: Option, path: &Path, tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { - use ModelArchitecture as MA; - fn load_model( path: &Path, tokenizer_source: TokenizerSource, @@ -253,38 +279,35 @@ pub fn load_dynamic( )?)) } - let model: Box = match architecture { - #[cfg(feature = "bloom")] - MA::Bloom => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "gpt2")] - MA::Gpt2 => { - load_model::(path, tokenizer_source, params, load_progress_callback)? 
- } - #[cfg(feature = "gptj")] - MA::GptJ => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "gptneox")] - MA::GptNeoX => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "llama")] - MA::Llama => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "mpt")] - MA::Mpt => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "falcon")] - MA::Falcon => { - load_model::(path, tokenizer_source, params, load_progress_callback)? + let architecture = architecture.ok_or_else(|| LoadError::MissingModelArchitecture { + path: path.to_owned(), + })?; + + struct LoadVisitor<'a, F: FnMut(LoadProgress)> { + path: &'a Path, + tokenizer_source: TokenizerSource, + params: ModelParameters, + load_progress_callback: F, + } + impl<'a, F: FnMut(LoadProgress)> ModelArchitectureVisitor, LoadError>> + for LoadVisitor<'a, F> + { + fn visit(&mut self) -> Result, LoadError> { + load_model::( + self.path, + self.tokenizer_source.clone(), + self.params.clone(), + &mut self.load_progress_callback, + ) } - }; + } - Ok(model) + architecture.visit(&mut LoadVisitor { + path, + tokenizer_source, + params, + load_progress_callback, + }) } #[cfg(test)]
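
For reference, a minimal caller-side sketch of the reworked `load_dynamic` API from the last patch above. The helper name, the choice to pin the LLaMA architecture, and the decision to ignore load progress are illustrative assumptions, not part of the patches; the types used (`llm::Model`, `llm::LoadError`, `llm::TokenizerSource`, `llm::ModelParameters`) are assumed to be re-exported by the `llm` crate as the hunks suggest.

    fn load_example(
        path: &std::path::Path,
        tokenizer_source: llm::TokenizerSource,
    ) -> Result<Box<dyn llm::Model>, llm::LoadError> {
        llm::load_dynamic(
            // `None` would defer architecture selection to the loader, which (as of
            // this patch) reports `LoadError::MissingModelArchitecture` when the
            // architecture cannot be determined from the file.
            Some(llm::ModelArchitecture::Llama),
            path,
            tokenizer_source,
            llm::ModelParameters::default(),
            |_progress| {}, // ignore load progress for brevity
        )
    }

Internally, the call above is dispatched through the new `ModelArchitectureVisitor` trait: the chosen architecture visits a `LoadVisitor` that is generic over the concrete model type, which is the same mechanism the CLI's `info` and `quantize` subcommands now use.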