From 7661291737e082b9310b7c3fdf5200eed8fab6d3 Mon Sep 17 00:00:00 2001
From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com>
Date: Sat, 17 Jun 2023 21:59:26 +0200
Subject: [PATCH 01/21] Hparams + Loading

---
 crates/models/falcon/Cargo.toml |  13 ++
 crates/models/falcon/src/lib.rs | 370 ++++++++++++++++++++++++++++++++
 2 files changed, 383 insertions(+)
 create mode 100644 crates/models/falcon/Cargo.toml
 create mode 100644 crates/models/falcon/src/lib.rs

diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml
new file mode 100644
index 00000000..0c9cdbc8
--- /dev/null
+++ b/crates/models/falcon/Cargo.toml
@@ -0,0 +1,13 @@
+[package]
+name = "llm-falcon"
+version = "0.2.0-dev"
+license = { workspace = true }
+repository = { workspace = true }
+description = "An implementation of tiiuae falcon model for the `llm` ecosystem."
+edition = "2021"
+readme = "../../../README.md"
+
+[dependencies]
+llm-base = { path = "../../llm-base", version = "0.2.0-dev" }
+
+bytemuck = { workspace = true }
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
new file mode 100644
index 00000000..c371a147
--- /dev/null
+++ b/crates/models/falcon/src/lib.rs
@@ -0,0 +1,370 @@
+//! An implementation of [tiiuae](https://huggingface.co/tiiuae)'s [falcon] model for the `llm` ecosystem.
+#![deny(missing_docs)]
+
+use ggml::Tensor;
+use llm_base::{
+    ggml,
+    model::{common, HyperparametersWriteError},
+    util, FileType, InferenceParameters, InferenceSession, InferenceSessionConfig, KnownModel,
+    LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary,
+};
+
+/// The falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae/falcon-40b)
+///
+/// # Safety
+/// This implements [Send] and [Sync] as it is immutable after construction.
+pub struct Falcon {
+    // the context size ("memory") the model should use when evaluating a prompt
+    context_size: usize,
+
+    hyperparameters: Hyperparameters,
+
+    vocabulary: Vocabulary,
+
+    // model-global weights
+    // weighted token embeddings
+    tok_embeddings: Tensor,
+    output_norm: Tensor,
+    output_norm_b: Tensor,
+    lm_head: Tensor,
+
+    // weights for the model
+    layers: Vec<Layer>,
+
+    // must be kept alive for the model
+    _context: ggml::Context,
+    _mmap: Option<Mmap>,
+}
+
+unsafe impl Send for Falcon {}
+unsafe impl Sync for Falcon {}
+
+impl KnownModel for Falcon {
+    type Hyperparameters = Hyperparameters;
+
+    fn new(
+        hyperparameters: Self::Hyperparameters,
+        params: ModelParameters,
+        vocabulary: Vocabulary,
+        tensor_loader: impl llm_base::TensorLoader<LoadError>,
+    ) -> Result<Self, LoadError> {
+        let mut tl = tensor_loader;
+
+        // model-global weights
+        let tok_embeddings = tl.load("transformer.word_embeddings.weight")?;
+        let output_norm = tl.load("transformer.ln_f.weight")?;
+        let output_norm_b = tl.load("transformer.ln_f.bias")?;
+        let lm_head = tl.load("lm_head.weight")?;
+
+        let mut layers = Vec::new();
+        for i in 0..hyperparameters.n_layer {
+            let layer = Layer {
+                attention_norm: tl.load(&format!("transformer.h.{i}.input_layernorm.weight"))?,
+                attention_norm_b: tl.load(&format!("transformer.h.{i}.input_layernorm.bias"))?,
+
+                query_key_value: tl.load(&format!(
+                    "transformer.h.{i}.self_attention.query_key_value.weight"
+                ))?,
+                wo: tl.load(&format!("transformer.h.{i}.self_attention.dense.weight"))?,
+
+                ffn_up: tl.load(&format!("transformer.h.{i}.mlp.dense_h_to_4h.weight"))?,
+                ffn_down: tl.load(&format!("transformer.h.{i}.mlp.dense_4h_to_h.weight"))?,
+            };
+
+            layers.push(layer);
+        }
+
+        let (_context, _, _mmap) = tl.finish();
+
+        let ModelParameters { context_size, .. } = params;
+
+        Ok(Falcon {
+            hyperparameters,
+            context_size,
+            vocabulary,
+            tok_embeddings,
+            output_norm,
+            output_norm_b,
+            lm_head,
+            layers,
+            _context,
+            _mmap,
+        })
+    }
+
+    fn start_session(&self, config: InferenceSessionConfig) -> InferenceSession {
+        InferenceSession::new(
+            config,
+            self.context_size,
+            self.hyperparameters.n_layer,
+            self.hyperparameters.n_embd,
+            self.hyperparameters.n_vocab,
+        )
+    }
+
+    fn evaluate(
+        &self,
+        session: &mut InferenceSession,
+        params: &InferenceParameters,
+        input_tokens: &[TokenId],
+        output_request: &mut OutputRequest,
+    ) {
+        let input_len = input_tokens.len();
+        let session_len = session.n_past;
+        let num_threads = params.n_threads;
+        let ctx_size = self.context_size;
+
+        let Hyperparameters {
+            n_embd,
+            n_head,
+            n_vocab,
+            n_layer,
+            ..
+ } = self.hyperparameters; + + let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); + + let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + + let f32_size = std::mem::size_of::(); + + let memory_k = &session.memory_k; + let memory_k_size = memory_k.element_size(); + + let memory_v = &session.memory_v; + let memory_v_size = memory_v.element_size(); + + let mut gf = ggml::ComputationGraph::new(num_threads); + // for il in 0..n_layer { + // // attention uses first scratch buffer + // ctx0.use_scratch(Some(&mut session.scratch[0])); + + // let mut current = ctx0.op_norm(&input_layer); + // current = ctx0.op_mul( + // &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), + // ¤t, + // ); + + // current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); + + // let nb = current.get_nb()[1]; + // let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); + // let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); + // let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); + + // let k = ctx0.op_view_1d( + // memory_k, + // input_len * n_embd, + // (memory_k_size * n_embd) * (il * ctx_size + session_len), + // ); + // let v = ctx0.op_view_1d( + // memory_v, + // input_len * n_embd, + // (memory_v_size * n_embd) * (il * ctx_size + session_len), + // ); + + // gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + // gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + // let q = ctx0.op_permute( + // &ctx0.op_cpy( + // &qcur, + // &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), + // ), + // (0, 2, 1, 3), + // ); + + // let bigk = ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // memory_k, + // (session_len + input_len) * n_embd, + // il * ctx_size * memory_k_size * n_embd, + // ), + // n_embd / n_head, + // n_head, + // session_len + input_len, + // ), + // (0, 2, 1, 3), + // ); + + // let kq = ctx0.op_mul_mat(&bigk, &q); + // let kq_scaled = ctx0.op_scale( + // &kq, + // &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + // ); + // let kq_scaled_alibi = ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); + // let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); + // let kq_softmax = ctx0.op_soft_max(&kq_masked); + + // let v_trans = ctx0.op_cpy( + // &ctx0.op_permute( + // &ctx0.op_reshape_3d( + // &ctx0.op_view_1d( + // &session.memory_v, + // (session_len + input_len) * n_embd, + // il * ctx_size * memory_v_size * n_embd, + // ), + // n_embd / n_head, + // n_head, + // session_len + input_len, + // ), + // (1, 2, 0, 3), + // ), + // &ctx0.new_tensor_3d( + // session.memory_v.get_type(), + // session_len + input_len, + // n_embd / n_head, + // n_head, + // ), + // ); + + // let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); + // let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); + + // current = ctx0.op_cpy( + // &kqv_merged, + // &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + // ); + // // projection + // current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); + + // input_layer = ctx0.op_add(&input_layer, ¤t); + + // // feed forward uses second scratch buffer + // ctx0.use_scratch(Some(&mut session.scratch[1])); + + // current = ctx0.op_norm(&input_layer); + // current = ctx0.op_mul( + // &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), + // ¤t, + // ); + + // current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); + + // current = ctx0.op_gelu(¤t); + + // // 
projection + // current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); + + // input_layer = ctx0.op_add(&input_layer, ¤t); + // } + + // //use scratch buffer 0 for the rest + // ctx0.use_scratch(Some(&mut session.scratch[0])); + + // // norm + // input_layer = ctx0.op_norm(&input_layer); + // input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); + + // let embeddings_tensor: ggml::Tensor = input_layer.share(); + + // // disable scratch buffer for last layer + // ctx0.use_scratch(None); + // // output embedding weight tied to input embedding + // input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); + + // // run the computation + // gf.build_forward_expand(&input_layer); + // ctx0.graph_compute(&mut gf); + + // // finish evaluation + // common::read_last_token(session, &input_layer, n_vocab, input_len); + // common::extract_logits(output_request, &input_layer, n_vocab, input_len); + // common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); + // common::update_session(session, &ctx0, input_tokens.len(), input_len); + } + + /// Returns the vocabulary used by this model. + fn vocabulary(&self) -> &Vocabulary { + &self.vocabulary + } + + fn context_size(&self) -> usize { + self.context_size + } + + fn bot_token_id(&self) -> Option { + self.vocabulary.id("<|padding|>".as_bytes()) + } + + fn eot_token_id(&self) -> TokenId { + self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + } + + fn quantize_tensors() -> Vec { + vec![Regex::new(".*weight").unwrap()] + } + + fn skip_quantize_tensors() -> Vec { + vec![] + } +} + +/// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +#[derive(Debug, Default, PartialEq, Clone, Copy)] +pub struct Hyperparameters { + /// Size of the model's vocabulary + n_vocab: usize, + /// Maximum sequence length + n_ctx: usize, + /// Size of the model's embedding layer + n_embd: usize, + /// n_heads + n_head: usize, + /// Number of layers in the model + n_layer: usize, + /// file_type + file_type: FileType, +} + +impl llm_base::Hyperparameters for Hyperparameters { + fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { + let hyperparameters = Hyperparameters { + n_vocab: util::read_i32(reader)?.try_into()?, + n_ctx: util::read_i32(reader)?.try_into()?, + n_embd: util::read_i32(reader)?.try_into()?, + n_head: util::read_i32(reader)?.try_into()?, + n_layer: util::read_i32(reader)?.try_into()?, + file_type: util::read_filetype(reader)?, + }; + + Ok(hyperparameters) + } + + fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { + util::write_i32(writer, self.n_vocab.try_into()?)?; + util::write_i32(writer, self.n_embd.try_into()?)?; + util::write_i32(writer, self.n_embd.try_into()?)?; + util::write_i32(writer, self.n_head.try_into()?)?; + util::write_i32(writer, self.n_layer.try_into()?)?; + util::write_i32(writer, self.file_type.into())?; + Ok(()) + } + + fn n_vocabulary(&self) -> usize { + self.n_vocab + } + + fn file_type(&self) -> Option { + Some(self.file_type) + } + + fn file_type_mut(&mut self) -> Option<&mut FileType> { + Some(&mut self.file_type) + } +} + +struct Layer { + // normalization + attention_norm: Tensor, + attention_norm_b: Tensor, + + // attention + query_key_value: Tensor, + wo: Tensor, + + // ff + ffn_up: Tensor, + ffn_down: Tensor, +} From 5eec60f8f79e3035f0d6ec12e117cd4a67900d37 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:01:31 
+0200 Subject: [PATCH 02/21] Added eval --- crates/models/falcon/src/lib.rs | 318 ++++++++++++++++++-------------- 1 file changed, 180 insertions(+), 138 deletions(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index c371a147..0149228a 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -115,6 +115,7 @@ impl KnownModel for Falcon { let ctx_size = self.context_size; let Hyperparameters { + n_ctx, n_embd, n_head, n_vocab, @@ -122,9 +123,18 @@ impl KnownModel for Falcon { .. } = self.hyperparameters; + let head_dim = n_embd / n_head; + let N = input_len; + let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); + let mut repeat_dummy = ctx0.new_tensor_3d( + input_layer.get_type(), + head_dim, + input_len + session_len, + n_head, + ); let f32_size = std::mem::size_of::(); @@ -135,144 +145,176 @@ impl KnownModel for Falcon { let memory_v_size = memory_v.element_size(); let mut gf = ggml::ComputationGraph::new(num_threads); - // for il in 0..n_layer { - // // attention uses first scratch buffer - // ctx0.use_scratch(Some(&mut session.scratch[0])); - - // let mut current = ctx0.op_norm(&input_layer); - // current = ctx0.op_mul( - // &ctx0.op_repeat(&self.layers[il].norm_1_weight, ¤t), - // ¤t, - // ); - - // current = ctx0.op_mul_mat(&self.layers[il].c_attn_wqkv_weight, ¤t); - - // let nb = current.get_nb()[1]; - // let qcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, 0); - // let kcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd); - // let vcur = ctx0.op_view_2d(¤t, (n_embd, input_len), nb, f32_size * n_embd * 2); - - // let k = ctx0.op_view_1d( - // memory_k, - // input_len * n_embd, - // (memory_k_size * n_embd) * (il * ctx_size + session_len), - // ); - // let v = ctx0.op_view_1d( - // memory_v, - // input_len * n_embd, - // (memory_v_size * n_embd) * (il * ctx_size + session_len), - // ); - - // gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); - // gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); - - // let q = ctx0.op_permute( - // &ctx0.op_cpy( - // &qcur, - // &ctx0.new_tensor_3d(ggml::Type::F32, n_embd / n_head, n_head, input_len), - // ), - // (0, 2, 1, 3), - // ); - - // let bigk = ctx0.op_permute( - // &ctx0.op_reshape_3d( - // &ctx0.op_view_1d( - // memory_k, - // (session_len + input_len) * n_embd, - // il * ctx_size * memory_k_size * n_embd, - // ), - // n_embd / n_head, - // n_head, - // session_len + input_len, - // ), - // (0, 2, 1, 3), - // ); - - // let kq = ctx0.op_mul_mat(&bigk, &q); - // let kq_scaled = ctx0.op_scale( - // &kq, - // &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), - // ); - // let kq_scaled_alibi = ctx0.op_alibi(&kq_scaled, session_len, n_head, alibi_bias_max); - // let kq_masked = ctx0.op_diag_mask_inf(&kq_scaled_alibi, session_len); - // let kq_softmax = ctx0.op_soft_max(&kq_masked); - - // let v_trans = ctx0.op_cpy( - // &ctx0.op_permute( - // &ctx0.op_reshape_3d( - // &ctx0.op_view_1d( - // &session.memory_v, - // (session_len + input_len) * n_embd, - // il * ctx_size * memory_v_size * n_embd, - // ), - // n_embd / n_head, - // n_head, - // session_len + input_len, - // ), - // (1, 2, 0, 3), - // ), - // &ctx0.new_tensor_3d( - // session.memory_v.get_type(), - // session_len + input_len, - // n_embd / n_head, - // n_head, - // ), - // ); - - // let kqv = ctx0.op_mul_mat(&v_trans, &kq_softmax); - // let kqv_merged = ctx0.op_permute(&kqv, (0, 2, 1, 3)); - 
- // current = ctx0.op_cpy( - // &kqv_merged, - // &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - // ); - // // projection - // current = ctx0.op_mul_mat(&self.layers[il].c_attn_out_proj_weight, ¤t); - - // input_layer = ctx0.op_add(&input_layer, ¤t); - - // // feed forward uses second scratch buffer - // ctx0.use_scratch(Some(&mut session.scratch[1])); - - // current = ctx0.op_norm(&input_layer); - // current = ctx0.op_mul( - // &ctx0.op_repeat(&self.layers[il].norm_2_weight, ¤t), - // ¤t, - // ); - - // current = ctx0.op_mul_mat(&self.layers[il].ffn_up_proj, ¤t); - - // current = ctx0.op_gelu(¤t); - - // // projection - // current = ctx0.op_mul_mat(&self.layers[il].ffn_down_proj, ¤t); - - // input_layer = ctx0.op_add(&input_layer, ¤t); - // } - - // //use scratch buffer 0 for the rest - // ctx0.use_scratch(Some(&mut session.scratch[0])); - - // // norm - // input_layer = ctx0.op_norm(&input_layer); - // input_layer = ctx0.op_mul(&ctx0.op_repeat(&self.norm, &input_layer), &input_layer); - - // let embeddings_tensor: ggml::Tensor = input_layer.share(); - - // // disable scratch buffer for last layer - // ctx0.use_scratch(None); - // // output embedding weight tied to input embedding - // input_layer = ctx0.op_mul_mat(&self.wte, &input_layer); - - // // run the computation - // gf.build_forward_expand(&input_layer); - // ctx0.graph_compute(&mut gf); - - // // finish evaluation - // common::read_last_token(session, &input_layer, n_vocab, input_len); - // common::extract_logits(output_request, &input_layer, n_vocab, input_len); - // common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); - // common::update_session(session, &ctx0, input_tokens.len(), input_len); + + let mut current: Tensor; + let mut layernorm_output: Tensor; + + for il in 0..n_layer { + // attention uses first scratch buffer + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // self-attention + let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_add( + &ctx0.op_mul( + &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), + ¤t, + ), + &ctx0.op_repeat(&self.layers[il].attention_norm_b, ¤t), + ); + + layernorm_output = current.share(); + + // compute QKV + current = ctx0.op_mul_mat(&self.layers[il].query_key_value, ¤t); + + let fused_qkv_row_nb = (n_embd + 2 * (n_embd / n_head)) * f32_size; + + let mut qcur = ctx0.op_view_3d( + ¤t, + (head_dim, n_head, N), + (head_dim * f32_size, fused_qkv_row_nb), + 0, + ); + + let mut kcur = ctx0.op_view_3d( + ¤t, + (head_dim, 1, N), + (head_dim * f32_size, fused_qkv_row_nb), + n_embd * f32_size, + ); + + let vcur = ctx0.op_view_3d( + ¤t, + (head_dim, 1, N), + (head_dim * f32_size, fused_qkv_row_nb), + (n_embd + head_dim) * f32_size, + ); + + // using mode = 2 for neox mode + qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + kcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + + // store key and value to memory + + let k = ctx0.op_view_1d( + &memory_k, + N * head_dim, + (memory_k_size * head_dim) * (il * n_ctx + session_len), + ); + let v = ctx0.op_view_1d( + &memory_v, + N * head_dim, + (memory_k_size * head_dim) * (il * n_ctx + session_len), + ); + + gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); + gf.build_forward_expand(&ctx0.op_cpy(&vcur, &v)); + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + let bigq = ctx0.op_permute(&qcur, (0, 2, 1, 3)); + + let mut bigk = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &memory_k, + (session_len + N) * head_dim, + il * 
n_ctx * memory_k_size * head_dim, + ), + head_dim, + 1, + session_len + N, + ), + (0, 2, 1, 3), + ); + // K * Q + bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy)); + let big_kq = ctx0.op_mul(&bigk, &bigq); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + let big_kq_scaled = ctx0.op_scale_inplace( + &big_kq, + &ctx0.new_f32(1f32 / f32::sqrt(n_embd as f32 / n_head as f32)), + ); + + let big_kq_masked = ctx0.op_diag_mask_inf_inplace(&big_kq_scaled, session_len); + + let big_kq_softmax = ctx0.op_soft_max_inplace(&big_kq_masked); + + let mut bigv = ctx0.op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + &memory_v, + (session_len + N) * head_dim, + il * n_ctx * memory_v_size * head_dim, + ), + head_dim, + 1, + session_len + N, + ), + (0, 2, 1, 3), + ); + bigv = ctx0.op_cont(&ctx0.op_transpose(&ctx0.op_repeat(&bigv, &repeat_dummy))); + + // KQV = transpose(V) * KQ_soft_max + let big_kqv = ctx0.op_mul_mat(&bigv, &big_kq_softmax); + // KQV_merged = KQV.permute(0, 2, 1, 3) + let big_kqv_merged = ctx0.op_permute(&big_kqv, (0, 2, 1, 3)); + + // cur = KQV_merged.contiguous().view(n_embd, N) + current = ctx0.op_cpy( + &big_kqv_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N), + ); + + // projection + current = ctx0.op_mul_mat(&self.layers[il].wo, ¤t); + + // feed forward uses second scratch buffer + ctx0.use_scratch(Some(&mut session.scratch[1])); + + let inpFF = layernorm_output.share(); + let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N)); + + current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inpFF); + current = ctx0.op_gelu(¤t); + current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); + + current = ctx0.op_add(¤t, &attn_out); + current = ctx0.op_add(¤t, &input_layer); + + input_layer = current.share(); + } + + ctx0.use_scratch(Some(&mut session.scratch[0])); + + // norm + input_layer = ctx0.op_norm(&input_layer); + + input_layer = ctx0.op_add( + &ctx0.op_mul( + &ctx0.op_repeat(&self.output_norm, &input_layer), + &input_layer, + ), + &ctx0.op_repeat(&self.output_norm_b, &input_layer), + ); + + let embeddings_tensor: ggml::Tensor = input_layer.share(); + + ctx0.use_scratch(None); + + // lm_head + input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer); + + // run the computation + gf.build_forward_expand(&input_layer); + ctx0.graph_compute(&mut gf); + + // finish evaluation + common::read_last_token(session, &input_layer, n_vocab, input_len); + common::extract_logits(output_request, &input_layer, n_vocab, input_len); + common::extract_embeddings(output_request, &embeddings_tensor, n_embd, input_len); + common::update_session(session, &ctx0, input_tokens.len(), input_len); } /// Returns the vocabulary used by this model. 
From 41bde927e8074b5d2277411b657949ebc5a9128c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:04:20 +0200 Subject: [PATCH 03/21] Naming + warnings --- crates/models/falcon/src/lib.rs | 43 ++++++++++++++++----------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 0149228a..eca6ee7b 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -115,7 +115,6 @@ impl KnownModel for Falcon { let ctx_size = self.context_size; let Hyperparameters { - n_ctx, n_embd, n_head, n_vocab, @@ -124,12 +123,12 @@ impl KnownModel for Falcon { } = self.hyperparameters; let head_dim = n_embd / n_head; - let N = input_len; + let n = input_len; let (ctx0, embd) = common::prepare_for_evaluate(n_layer, session, input_tokens); let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, &embd); - let mut repeat_dummy = ctx0.new_tensor_3d( + let repeat_dummy = ctx0.new_tensor_3d( input_layer.get_type(), head_dim, input_len + session_len, @@ -154,7 +153,7 @@ impl KnownModel for Falcon { ctx0.use_scratch(Some(&mut session.scratch[0])); // self-attention - let mut current = ctx0.op_norm(&input_layer); + current = ctx0.op_norm(&input_layer); current = ctx0.op_add( &ctx0.op_mul( &ctx0.op_repeat(&self.layers[il].attention_norm, ¤t), @@ -172,40 +171,40 @@ impl KnownModel for Falcon { let mut qcur = ctx0.op_view_3d( ¤t, - (head_dim, n_head, N), + (head_dim, n_head, n), (head_dim * f32_size, fused_qkv_row_nb), 0, ); let mut kcur = ctx0.op_view_3d( ¤t, - (head_dim, 1, N), + (head_dim, 1, n), (head_dim * f32_size, fused_qkv_row_nb), n_embd * f32_size, ); let vcur = ctx0.op_view_3d( ¤t, - (head_dim, 1, N), + (head_dim, 1, n), (head_dim * f32_size, fused_qkv_row_nb), (n_embd + head_dim) * f32_size, ); // using mode = 2 for neox mode qcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); - kcur = ctx0.op_rope_inplace(&qcur, session_len, head_dim, 2); + kcur = ctx0.op_rope_inplace(&kcur, session_len, head_dim, 2); // store key and value to memory let k = ctx0.op_view_1d( &memory_k, - N * head_dim, - (memory_k_size * head_dim) * (il * n_ctx + session_len), + n * head_dim, + (memory_k_size * head_dim) * (il * ctx_size + session_len), ); let v = ctx0.op_view_1d( &memory_v, - N * head_dim, - (memory_k_size * head_dim) * (il * n_ctx + session_len), + n * head_dim, + (memory_k_size * head_dim) * (il * ctx_size + session_len), ); gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); @@ -218,12 +217,12 @@ impl KnownModel for Falcon { &ctx0.op_reshape_3d( &ctx0.op_view_1d( &memory_k, - (session_len + N) * head_dim, - il * n_ctx * memory_k_size * head_dim, + (session_len + n) * head_dim, + il * ctx_size * memory_k_size * head_dim, ), head_dim, 1, - session_len + N, + session_len + n, ), (0, 2, 1, 3), ); @@ -245,12 +244,12 @@ impl KnownModel for Falcon { &ctx0.op_reshape_3d( &ctx0.op_view_1d( &memory_v, - (session_len + N) * head_dim, - il * n_ctx * memory_v_size * head_dim, + (session_len + n) * head_dim, + il * ctx_size * memory_v_size * head_dim, ), head_dim, 1, - session_len + N, + session_len + n, ), (0, 2, 1, 3), ); @@ -264,7 +263,7 @@ impl KnownModel for Falcon { // cur = KQV_merged.contiguous().view(n_embd, N) current = ctx0.op_cpy( &big_kqv_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N), + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n), ); // projection @@ -273,10 +272,10 @@ impl KnownModel for Falcon { // feed forward uses second 
scratch buffer ctx0.use_scratch(Some(&mut session.scratch[1])); - let inpFF = layernorm_output.share(); - let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, N)); + let inp_ff = layernorm_output.share(); + let attn_out = ctx0.op_cpy(¤t, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n)); - current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inpFF); + current = ctx0.op_mul_mat(&self.layers[il].ffn_up, &inp_ff); current = ctx0.op_gelu(¤t); current = ctx0.op_mul_mat(&self.layers[il].ffn_down, ¤t); From c2cf35eabaf6d549d218b2dcd6dbdf58611f9c7e Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sat, 17 Jun 2023 23:29:58 +0200 Subject: [PATCH 04/21] Add Falcon to CLI --- binaries/llm-cli/src/cli_args.rs | 6 ++++++ binaries/llm-cli/src/main.rs | 1 + crates/llm/Cargo.toml | 4 +++- crates/llm/src/lib.rs | 15 +++++++++++++++ 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 2b5ad199..efedfd87 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -44,6 +44,12 @@ pub enum Args { #[command(subcommand)] args: BaseArgs, }, + /// Use a Falcon model + #[clap(id = "falcon")] + Falcon { + #[command(subcommand)] + args: BaseArgs, + }, } #[derive(Subcommand, Debug)] diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 5c4a4a7a..0329dc18 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -33,6 +33,7 @@ fn main() -> Result<()> { Args::GptJ { args } => handle_args::(args), Args::GptNeoX { args } => handle_args::(args), Args::Mpt { args } => handle_args::(args), + Args::Falcon { args } => handle_args::(args), } } diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index 05a2bdae..108252d4 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -15,6 +15,7 @@ llm-gptj = { path = "../models/gptj", optional = true, version = "0.2.0-dev" } llm-bloom = { path = "../models/bloom", optional = true, version = "0.2.0-dev" } llm-gptneox = { path = "../models/gptneox", optional = true, version = "0.2.0-dev" } llm-mpt = { path = "../models/mpt", optional = true, version = "0.2.0-dev" } +llm-falcon = { path = "../models/falcon", optional = true, version = "0.2.0-dev" } serde = { workspace = true } @@ -28,10 +29,11 @@ serde_json = { workspace = true } clap = { workspace = true } [features] -default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"] +default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"] llama = ["dep:llm-llama"] gpt2 = ["dep:llm-gpt2"] gptj = ["dep:llm-gptj"] bloom = ["dep:llm-bloom"] gptneox = ["dep:llm-gptneox"] mpt = ["dep:llm-mpt"] +falcon = ["dep:llm-falcon"] diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 13d308a0..d40f37b7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,6 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) +//! - [Falcon](llm_falcon) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. 
@@ -101,6 +102,8 @@ pub mod models { pub use llm_llama::{self as llama, Llama}; #[cfg(feature = "mpt")] pub use llm_mpt::{self as mpt, Mpt}; + #[cfg(feature = "falcon")] + pub use llm_falcon::{self as falcon, Falcon}; } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)] @@ -124,6 +127,9 @@ pub enum ModelArchitecture { #[cfg(feature = "mpt")] /// [MPT](llm_mpt) Mpt, + #[cfg(feature = "falcon")] + /// [Falcon](llm_falcon) + Falcon, } impl ModelArchitecture { @@ -141,6 +147,8 @@ impl ModelArchitecture { Self::Llama, #[cfg(feature = "mpt")] Self::Mpt, + #[cfg(feature = "falcon")] + Self::Falcon, ]; } @@ -184,6 +192,8 @@ impl FromStr for ModelArchitecture { "llama" => Ok(Llama), #[cfg(feature = "mpt")] "mpt" => Ok(Mpt), + #[cfg(feature = "falcon")] + "falcon" => Ok(Falcon), _ => Err(UnsupportedModelArchitecture(format!( "{s} is not a supported model architecture" @@ -209,6 +219,8 @@ impl Display for ModelArchitecture { Llama => write!(f, "LLaMA"), #[cfg(feature = "mpt")] Mpt => write!(f, "MPT"), + #[cfg(feature = "falcon")] + Falcon => write!(f, "Falcon"), } } } @@ -263,6 +275,9 @@ pub fn load_dynamic( } #[cfg(feature = "mpt")] Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, + #[cfg(feature = "falcon")] + Falcon => load_model::(path, vocabulary_source, params, load_progress_callback)?, + }; Ok(model) From df40cc156d6025a4f9aef178f138bbb46cd98c92 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:05:45 +0200 Subject: [PATCH 05/21] Update crates/models/falcon/Cargo.toml Co-authored-by: Dan Forbes --- crates/models/falcon/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml index 0c9cdbc8..e71c261c 100644 --- a/crates/models/falcon/Cargo.toml +++ b/crates/models/falcon/Cargo.toml @@ -3,7 +3,7 @@ name = "llm-falcon" version = "0.2.0-dev" license = { workspace = true } repository = { workspace = true } -description = "An implementation of tiiuae falcon model for the `llm` ecosystem." +description = "An implementation of Falcon for the `llm` ecosystem." edition = "2021" readme = "../../../README.md" From b63070536794bb64cc050833b8d785392bce0c55 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:05:53 +0200 Subject: [PATCH 06/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index eca6ee7b..55e351a9 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -1,4 +1,4 @@ -//! An implementation of [tiiuae](https://huggingface.co/tiiuae)'s [falcon] model for the `llm` ecosystem. +//! An implementation of [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. 
#![deny(missing_docs)] use ggml::Tensor; From 6f628bec0442ff4d64b12fd784972ace43dd997c Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:06:17 +0200 Subject: [PATCH 07/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 55e351a9..801a2830 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -9,7 +9,7 @@ use llm_base::{ LoadError, Mmap, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, }; -/// The falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae/falcon-40b) +/// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) /// /// # Safety /// This implements [Send] and [Sync] as it is immutable after construction. From 611d2455627338cdd6df293f6139648c6ec25634 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 10:06:36 +0200 Subject: [PATCH 08/21] Update crates/models/falcon/src/lib.rs Co-authored-by: Dan Forbes --- crates/models/falcon/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 801a2830..06505f78 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -342,7 +342,7 @@ impl KnownModel for Falcon { } } -/// MPT [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) +/// Falcon [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) #[derive(Debug, Default, PartialEq, Clone, Copy)] pub struct Hyperparameters { /// Size of the model's vocabulary From d67148ebb2983e50c165c53c1955256c47aaa333 Mon Sep 17 00:00:00 2001 From: Lukas Kreussel <65088241+LLukas22@users.noreply.github.com> Date: Sun, 18 Jun 2023 16:12:08 +0200 Subject: [PATCH 09/21] Bugfix: Mat-Mul and wrong memory --- crates/llm/src/lib.rs | 9 +++++---- crates/models/falcon/src/lib.rs | 18 +++++++----------- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index d40f37b7..d792b64b 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -92,6 +92,8 @@ use serde::Serialize; pub mod models { #[cfg(feature = "bloom")] pub use llm_bloom::{self as bloom, Bloom}; + #[cfg(feature = "falcon")] + pub use llm_falcon::{self as falcon, Falcon}; #[cfg(feature = "gpt2")] pub use llm_gpt2::{self as gpt2, Gpt2}; #[cfg(feature = "gptj")] @@ -102,8 +104,6 @@ pub mod models { pub use llm_llama::{self as llama, Llama}; #[cfg(feature = "mpt")] pub use llm_mpt::{self as mpt, Mpt}; - #[cfg(feature = "falcon")] - pub use llm_falcon::{self as falcon, Falcon}; } #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize)] @@ -276,8 +276,9 @@ pub fn load_dynamic( #[cfg(feature = "mpt")] Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, #[cfg(feature = "falcon")] - Falcon => load_model::(path, vocabulary_source, params, load_progress_callback)?, - + Falcon => { + load_model::(path, vocabulary_source, params, load_progress_callback)? 
+ } }; Ok(model) diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 06505f78..78a26b0f 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -197,14 +197,14 @@ impl KnownModel for Falcon { // store key and value to memory let k = ctx0.op_view_1d( - &memory_k, + memory_k, n * head_dim, (memory_k_size * head_dim) * (il * ctx_size + session_len), ); let v = ctx0.op_view_1d( - &memory_v, + memory_v, n * head_dim, - (memory_k_size * head_dim) * (il * ctx_size + session_len), + (memory_v_size * head_dim) * (il * ctx_size + session_len), ); gf.build_forward_expand(&ctx0.op_cpy(&kcur, &k)); @@ -216,7 +216,7 @@ impl KnownModel for Falcon { let mut bigk = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &memory_k, + memory_k, (session_len + n) * head_dim, il * ctx_size * memory_k_size * head_dim, ), @@ -228,7 +228,7 @@ impl KnownModel for Falcon { ); // K * Q bigk = ctx0.op_cont(&ctx0.op_repeat(&bigk, &repeat_dummy)); - let big_kq = ctx0.op_mul(&bigk, &bigq); + let big_kq = ctx0.op_mul_mat(&bigk, &bigq); // KQ_scaled = KQ / sqrt(n_embd/n_head) let big_kq_scaled = ctx0.op_scale_inplace( @@ -243,7 +243,7 @@ impl KnownModel for Falcon { let mut bigv = ctx0.op_permute( &ctx0.op_reshape_3d( &ctx0.op_view_1d( - &memory_v, + memory_v, (session_len + n) * head_dim, il * ctx_size * memory_v_size * head_dim, ), @@ -326,7 +326,7 @@ impl KnownModel for Falcon { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("<|padding|>".as_bytes()) + None } fn eot_token_id(&self) -> TokenId { @@ -347,8 +347,6 @@ impl KnownModel for Falcon { pub struct Hyperparameters { /// Size of the model's vocabulary n_vocab: usize, - /// Maximum sequence length - n_ctx: usize, /// Size of the model's embedding layer n_embd: usize, /// n_heads @@ -363,7 +361,6 @@ impl llm_base::Hyperparameters for Hyperparameters { fn read_ggml(reader: &mut dyn std::io::BufRead) -> Result { let hyperparameters = Hyperparameters { n_vocab: util::read_i32(reader)?.try_into()?, - n_ctx: util::read_i32(reader)?.try_into()?, n_embd: util::read_i32(reader)?.try_into()?, n_head: util::read_i32(reader)?.try_into()?, n_layer: util::read_i32(reader)?.try_into()?, @@ -376,7 +373,6 @@ impl llm_base::Hyperparameters for Hyperparameters { fn write_ggml(&self, writer: &mut dyn std::io::Write) -> Result<(), HyperparametersWriteError> { util::write_i32(writer, self.n_vocab.try_into()?)?; util::write_i32(writer, self.n_embd.try_into()?)?; - util::write_i32(writer, self.n_embd.try_into()?)?; util::write_i32(writer, self.n_head.try_into()?)?; util::write_i32(writer, self.n_layer.try_into()?)?; util::write_i32(writer, self.file_type.into())?; From f9f477518eba0b4bf7e790f610f1068a1059e172 Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 23 Jun 2023 01:21:44 +0200 Subject: [PATCH 10/21] feat: automatically run checks before commit --- .github/workflows/rust.yml | 2 +- .rusty-hook.toml | 5 +++ Cargo.lock | 66 ++++++++++++++++++++++++++++ binaries/llm-cli/Cargo.toml | 3 ++ binaries/precommit-check/Cargo.toml | 8 ++++ binaries/precommit-check/README.md | 3 ++ binaries/precommit-check/src/main.rs | 16 +++++++ doc/CONTRIBUTING.md | 12 ++--- 8 files changed, 105 insertions(+), 10 deletions(-) create mode 100644 .rusty-hook.toml create mode 100644 binaries/precommit-check/Cargo.toml create mode 100644 binaries/precommit-check/README.md create mode 100644 binaries/precommit-check/src/main.rs diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 602820fb..e228bd95 100644 --- 
a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -27,7 +27,7 @@ jobs: - name: Build run: cargo build --verbose - name: Run tests - run: cargo test --verbose + run: cargo test --all --verbose fmt: name: Clippy, formatting and docs runs-on: ubuntu-latest diff --git a/.rusty-hook.toml b/.rusty-hook.toml new file mode 100644 index 00000000..53820f01 --- /dev/null +++ b/.rusty-hook.toml @@ -0,0 +1,5 @@ +[hooks] +pre-commit = "cargo run -p precommit-check" + +[logging] +verbose = true diff --git a/Cargo.lock b/Cargo.lock index e941d329..ef6e1ab5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -295,6 +295,15 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "ci_info" +version = "0.10.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24f638c70e8c5753795cc9a8c07c44da91554a09e4cf11a7326e8161b0a3c45e" +dependencies = [ + "envmnt", +] + [[package]] name = "cipher" version = "0.4.4" @@ -669,6 +678,16 @@ dependencies = [ "termcolor", ] +[[package]] +name = "envmnt" +version = "0.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a2d328fc287c61314c4a61af7cfdcbd7e678e39778488c7cb13ec133ce0f4059" +dependencies = [ + "fsio", + "indexmap", +] + [[package]] name = "errno" version = "0.3.1" @@ -801,6 +820,12 @@ dependencies = [ "winapi", ] +[[package]] +name = "fsio" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1fd087255f739f4f1aeea69f11b72f8080e9c2e7645cd06955dad4a178a49e3" + [[package]] name = "futures-channel" version = "0.3.28" @@ -866,6 +891,15 @@ dependencies = [ "version_check", ] +[[package]] +name = "getopts" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "14dbbfd5c71d70241ecf9e6f13737f7b5ce823821063188d7e46c41d371eebd5" +dependencies = [ + "unicode-width", +] + [[package]] name = "getrandom" version = "0.2.9" @@ -1295,6 +1329,7 @@ dependencies = [ "log", "num_cpus", "rand", + "rusty-hook", "rustyline", "spinoff", "zstd 0.12.3+zstd.1.5.2", @@ -1473,6 +1508,12 @@ dependencies = [ "tempfile", ] +[[package]] +name = "nias" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab250442c86f1850815b5d268639dff018c0627022bc1940eb2d642ca1ce12f0" + [[package]] name = "nibble_vec" version = "0.1.0" @@ -1684,6 +1725,10 @@ version = "0.2.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +[[package]] +name = "precommit-check" +version = "0.1.0" + [[package]] name = "prettyplease" version = "0.2.4" @@ -1900,6 +1945,18 @@ dependencies = [ "windows-sys 0.48.0", ] +[[package]] +name = "rusty-hook" +version = "0.11.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96cee9be61be7e1cbadd851e58ed7449c29c620f00b23df937cb9cbc04ac21a3" +dependencies = [ + "ci_info", + "getopts", + "nias", + "toml", +] + [[package]] name = "rustyline" version = "11.0.0" @@ -2313,6 +2370,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "toml" +version = "0.5.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f4f7f0dd8d50a853a531c426359045b1998f04219d88799810762cd4ad314234" +dependencies = [ + "serde", +] + [[package]] name = "tower-service" version = "0.3.2" diff --git a/binaries/llm-cli/Cargo.toml 
b/binaries/llm-cli/Cargo.toml index a0a692a2..dea1f3d3 100644 --- a/binaries/llm-cli/Cargo.toml +++ b/binaries/llm-cli/Cargo.toml @@ -28,6 +28,9 @@ num_cpus = "1.15.0" color-eyre = { version = "0.6.2", default-features = false } zstd = { version = "0.12", default-features = false } +[dev-dependencies] +rusty-hook = "^0.11.2" + [features] cublas = ["llm/cublas"] clblast = ["llm/clblast"] diff --git a/binaries/precommit-check/Cargo.toml b/binaries/precommit-check/Cargo.toml new file mode 100644 index 00000000..ba24f36d --- /dev/null +++ b/binaries/precommit-check/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "precommit-check" +version = "0.1.0" +edition = "2021" +publish = false + +[package.metadata.release] +release = false \ No newline at end of file diff --git a/binaries/precommit-check/README.md b/binaries/precommit-check/README.md new file mode 100644 index 00000000..3a7c8118 --- /dev/null +++ b/binaries/precommit-check/README.md @@ -0,0 +1,3 @@ +# precommit-check + +Helper script to run pre-commit checks on a repository. Used with `rusty-hook` to execute all of the checks and early exit if any of them fail. diff --git a/binaries/precommit-check/src/main.rs b/binaries/precommit-check/src/main.rs new file mode 100644 index 00000000..945881d1 --- /dev/null +++ b/binaries/precommit-check/src/main.rs @@ -0,0 +1,16 @@ +fn main() { + // Ensure that these match `.github/workflows/rust.yml`. + cmd("cargo", &["check"]); + cmd("cargo", &["test", "--all"]); + cmd("cargo", &["fmt", "--check", "--all"]); + cmd("cargo", &["doc", "--workspace", "--exclude", "llm-cli"]); + cmd("cargo", &["clippy", "--", "-Dclippy::all"]); +} + +fn cmd(cmd: &str, args: &[&str]) { + println!("=== Running command: {cmd} {args:?}"); + let mut child = std::process::Command::new(cmd).args(args).spawn().unwrap(); + if !child.wait().unwrap().success() { + panic!("Failed to run command: {} {:?}", cmd, args); + } +} diff --git a/doc/CONTRIBUTING.md b/doc/CONTRIBUTING.md index a29d11b5..43da4e1c 100644 --- a/doc/CONTRIBUTING.md +++ b/doc/CONTRIBUTING.md @@ -9,16 +9,10 @@ or on [Discord](https://discord.gg/YB9WaXYAWU)! ## Checking Changes This project uses a [GitHub workflow](../.github/workflows/rust.yml) to enforce -code standards - it will execute the following commands, which can be performed -locally for faster turnaround and a better developer experience: +code standards. -```shell -cargo check -cargo test -cargo fmt --all -cargo doc --workspace --exclude llm-cli -cargo clippy --fix --allow-dirty -- -Dclippy::all -``` +The `rusty-hook` project is used to run a similar set of checks automatically before committing. +If you would like to run these checks locally, use `cargo run -p precommit-check`. ## Regenerating GGML Bindings From 7c2f7c1840c4d2fcfb1ec0835f83837a0b622003 Mon Sep 17 00:00:00 2001 From: Philpax Date: Sun, 25 Jun 2023 14:05:51 +0200 Subject: [PATCH 11/21] feat(llm): source error in VocabularyLoadError --- crates/llm-base/src/loader.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 4e4b60a0..fcb6b6c9 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -326,7 +326,7 @@ pub enum LoadError { paths: Vec, }, /// The vocab file for the tokenizer could not be loaded. 
- #[error("could not load vocabulary file {path:?}")] + #[error("could not load vocabulary file {path:?}: {error}")] VocabularyLoadError { /// The invalid vocabulary path path: PathBuf, From 461cbce91c86cd229070247861d9543fbdfd3bb6 Mon Sep 17 00:00:00 2001 From: Julia Merz Date: Wed, 28 Jun 2023 01:40:01 +0200 Subject: [PATCH 12/21] Added InferenceSessionRef to exports, for easier serializing and deserializing --- crates/llm-base/src/lib.rs | 2 +- crates/llm/src/lib.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index 127fedf6..effae215 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -24,7 +24,7 @@ pub use ggml::Type as ElementType; pub use inference_session::{ feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceStats, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, }; pub use loader::{ diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 13d308a0..0f7363ec 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -79,7 +79,7 @@ pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession, - InferenceSessionConfig, InferenceSnapshot, InferenceStats, InvalidTokenBias, KnownModel, + InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, VocabularySource, From 45258e31c1bd43ec4d2ddf861b1b39430adb5255 Mon Sep 17 00:00:00 2001 From: Julia Merz Date: Wed, 28 Jun 2023 20:22:33 +0200 Subject: [PATCH 13/21] formatting fix --- crates/llm-base/src/lib.rs | 4 ++-- crates/llm/src/lib.rs | 9 +++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/crates/llm-base/src/lib.rs b/crates/llm-base/src/lib.rs index effae215..45c5c17e 100644 --- a/crates/llm-base/src/lib.rs +++ b/crates/llm-base/src/lib.rs @@ -24,8 +24,8 @@ pub use ggml::Type as ElementType; pub use inference_session::{ feed_prompt_callback, GraphOutputs, InferenceError, InferenceFeedback, InferenceRequest, - InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, - ModelKVMemoryType, SnapshotError, + InferenceResponse, InferenceSession, InferenceSessionConfig, InferenceSnapshot, + InferenceSnapshotRef, InferenceStats, ModelKVMemoryType, SnapshotError, }; pub use loader::{ load, load_progress_callback_stdout, ContainerType, FileType, FileTypeFormat, LoadError, diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 0f7363ec..b5e12da7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -79,10 +79,11 @@ pub use llm_base::{ feed_prompt_callback, ggml::format as ggml_format, load, load_progress_callback_stdout, quantize, samplers, ElementType, FileType, FileTypeFormat, InferenceError, InferenceFeedback, InferenceParameters, InferenceRequest, InferenceResponse, InferenceSession, - InferenceSessionConfig, 
InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, - LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, - Prompt, QuantizeError, QuantizeProgress, Sampler, SnapshotError, TokenBias, TokenId, - TokenUtf8Buffer, TokenizationError, Vocabulary, VocabularySource, + InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, + InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, + ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, + SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, + VocabularySource, }; use serde::Serialize; From 68bbedb12577da6cf63883fdccafb5cb53091224 Mon Sep 17 00:00:00 2001 From: Philpax Date: Wed, 28 Jun 2023 22:41:27 +0200 Subject: [PATCH 14/21] docs(readme): ake "getting models" more obvious --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index ad47e3e3..28b3892e 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ Currently, the following models are supported: [Wizard](https://github.com/nlpxucan/WizardLM)) - [MPT](https://www.mosaicml.com/blog/mpt-7b) +See [getting models](#getting-models) for more information on how to download supported models. + ## Using `llm` in a Rust Project This project depends on Rust v1.65.0 or above and a modern C toolchain. @@ -86,7 +88,7 @@ opt-level = 3 ``` ## Leverage Accelerators with `llm` -The `llm` library is engineered to take advantage of hardware accelerators such as `cuda` and `metal` for optimized performance. +The `llm` library is engineered to take advantage of hardware accelerators such as `cuda` and `metal` for optimized performance. To enable `llm` to harness these accelerators, some preliminary configuration steps are necessary, which vary based on your operating system. For comprehensive guidance, please refer to the [Acceleration Support for Building section](doc/CONTRIBUTING.md#acceleration-support-for-building) in our documentation. From 4716b1d19f05f55762cd30ba72c9e9f270defb6d Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 00:50:06 +0200 Subject: [PATCH 15/21] feat(falcon): disable by default --- binaries/llm-cli/Cargo.toml | 3 +++ binaries/llm-cli/src/cli_args.rs | 1 + binaries/llm-cli/src/main.rs | 1 + crates/llm/Cargo.toml | 3 ++- crates/llm/src/lib.rs | 2 +- crates/models/falcon/src/lib.rs | 8 +++++++- 6 files changed, 15 insertions(+), 3 deletions(-) diff --git a/binaries/llm-cli/Cargo.toml b/binaries/llm-cli/Cargo.toml index dea1f3d3..bac2ff87 100644 --- a/binaries/llm-cli/Cargo.toml +++ b/binaries/llm-cli/Cargo.toml @@ -35,3 +35,6 @@ rusty-hook = "^0.11.2" cublas = ["llm/cublas"] clblast = ["llm/clblast"] metal = ["llm/metal"] + +# Falcon is off by default. See `llm_falcon`'s module documentation for more information. 
+falcon = ["llm/falcon"] diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 5e0919b9..5aec546f 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -46,6 +46,7 @@ pub enum Args { }, /// Use a Falcon model #[clap(id = "falcon")] + #[cfg(feature = "falcon")] Falcon { #[command(subcommand)] args: BaseArgs, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 51e8896e..679a753e 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -33,6 +33,7 @@ fn main() -> Result<()> { Args::GptJ { args } => handle_args::(args), Args::GptNeoX { args } => handle_args::(args), Args::Mpt { args } => handle_args::(args), + #[cfg(feature = "falcon")] Args::Falcon { args } => handle_args::(args), } } diff --git a/crates/llm/Cargo.toml b/crates/llm/Cargo.toml index f501a29e..035b8a83 100644 --- a/crates/llm/Cargo.toml +++ b/crates/llm/Cargo.toml @@ -29,7 +29,7 @@ serde_json = { workspace = true } clap = { workspace = true } [features] -default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt", "falcon"] +default = ["llama", "gpt2", "gptj", "bloom", "gptneox", "mpt"] llama = ["dep:llm-llama"] gpt2 = ["dep:llm-gpt2"] @@ -37,6 +37,7 @@ gptj = ["dep:llm-gptj"] bloom = ["dep:llm-bloom"] gptneox = ["dep:llm-gptneox"] mpt = ["dep:llm-mpt"] +# Falcon is off by default. See `llm_falcon`'s module documentation for more information. falcon = ["dep:llm-falcon"] cublas = ["llm-base/cublas"] diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 211f4c78..869a8416 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,7 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) -//! - [Falcon](llm_falcon) +//! - [Falcon](llm_falcon) (disabled by default) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index a15a100c..3b989e26 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -1,4 +1,10 @@ -//! An implementation of [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. +//! An implementation of the [Falcon](https://falconllm.tii.ae/) model for the `llm` ecosystem. +//! +//! This implementation only works for Falcon 7B, and with 32-bit memory tensors (i.e. your inference session +//! must be configured with a 32-bit [InferenceSessionConfig]). +//! +//! This model will not be generally available in the `llm` ecosystem until Falcon 40B and 16-bit memory is +//! supported. It is currently only available as a preview. #![deny(missing_docs)] use std::sync::Arc; From bbb089c68f67bf1d0ed4f12b4859f7710663106f Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 01:06:19 +0200 Subject: [PATCH 16/21] fix: broken doclink --- binaries/precommit-check/src/main.rs | 23 +++++++++++++++-------- crates/llm/src/lib.rs | 2 +- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/binaries/precommit-check/src/main.rs b/binaries/precommit-check/src/main.rs index 945881d1..04d3add0 100644 --- a/binaries/precommit-check/src/main.rs +++ b/binaries/precommit-check/src/main.rs @@ -1,16 +1,23 @@ fn main() { // Ensure that these match `.github/workflows/rust.yml`. 
- cmd("cargo", &["check"]); - cmd("cargo", &["test", "--all"]); - cmd("cargo", &["fmt", "--check", "--all"]); - cmd("cargo", &["doc", "--workspace", "--exclude", "llm-cli"]); - cmd("cargo", &["clippy", "--", "-Dclippy::all"]); + cmd("cargo", &["check"], &[]); + cmd("cargo", &["test", "--all"], &[]); + cmd("cargo", &["fmt", "--check", "--all"], &[]); + cmd( + "cargo", + &["doc", "--workspace", "--exclude", "llm-cli"], + &[("RUSTDOCFLAGS", "-Dwarnings")], + ); + cmd("cargo", &["clippy", "--", "-Dclippy::all"], &[]); } -fn cmd(cmd: &str, args: &[&str]) { +fn cmd(cmd: &str, args: &[&str], env: &[(&str, &str)]) { println!("=== Running command: {cmd} {args:?}"); - let mut child = std::process::Command::new(cmd).args(args).spawn().unwrap(); + let mut builder = std::process::Command::new(cmd); + builder.args(args); + builder.envs(env.iter().copied()); + let mut child = builder.spawn().unwrap(); if !child.wait().unwrap().success() { - panic!("Failed to run command: {} {:?}", cmd, args); + panic!("Failed to run command: {} {:?}", cmd, builder); } } diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 869a8416..8adda7e7 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -7,7 +7,7 @@ //! - [GPT-NeoX](llm_gptneox) //! - [LLaMA](llm_llama) //! - [MPT](llm_mpt) -//! - [Falcon](llm_falcon) (disabled by default) +//! - Falcon (currently disabled due to incompleteness) //! //! At present, the only supported backend is [GGML](https://github.com/ggerganov/ggml), but this is expected to //! change in the future. From ae15cc4db5b1bafc1aaca05f3203e5d60f6a30bc Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 02:34:52 +0200 Subject: [PATCH 17/21] refactor: remove unnecessary deps --- Cargo.lock | 9 --------- crates/models/bloom/Cargo.toml | 4 +--- crates/models/falcon/Cargo.toml | 4 +--- crates/models/gptj/Cargo.toml | 4 +--- crates/models/gptneox/Cargo.toml | 3 --- crates/models/llama/Cargo.toml | 6 +----- crates/models/mpt/Cargo.toml | 3 +-- 7 files changed, 5 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d4b5a65f..23fb7284 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1313,7 +1313,6 @@ dependencies = [ name = "llm-bloom" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1340,7 +1339,6 @@ dependencies = [ name = "llm-falcon" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1356,7 +1354,6 @@ dependencies = [ name = "llm-gptj" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] @@ -1364,26 +1361,20 @@ dependencies = [ name = "llm-gptneox" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", - "serde", ] [[package]] name = "llm-llama" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", - "rand", - "thiserror", ] [[package]] name = "llm-mpt" version = "0.2.0-dev" dependencies = [ - "bytemuck", "llm-base", ] diff --git a/crates/models/bloom/Cargo.toml b/crates/models/bloom/Cargo.toml index ed6e4df0..01bb5e16 100644 --- a/crates/models/bloom/Cargo.toml +++ b/crates/models/bloom/Cargo.toml @@ -8,6 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/falcon/Cargo.toml b/crates/models/falcon/Cargo.toml index e71c261c..d06ad05c 100644 --- a/crates/models/falcon/Cargo.toml +++ b/crates/models/falcon/Cargo.toml @@ -8,6 +8,4 @@ edition = "2021" 
readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/gptj/Cargo.toml b/crates/models/gptj/Cargo.toml index 05ea615f..2d3ce60a 100644 --- a/crates/models/gptj/Cargo.toml +++ b/crates/models/gptj/Cargo.toml @@ -10,6 +10,4 @@ readme = "../../../README.md" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/gptneox/Cargo.toml b/crates/models/gptneox/Cargo.toml index 2f84b9f5..4df13941 100644 --- a/crates/models/gptneox/Cargo.toml +++ b/crates/models/gptneox/Cargo.toml @@ -9,6 +9,3 @@ readme = "../../../README.md" [dependencies] llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } -serde = { workspace = true } diff --git a/crates/models/llama/Cargo.toml b/crates/models/llama/Cargo.toml index 3131b53a..b7c3bdbf 100644 --- a/crates/models/llama/Cargo.toml +++ b/crates/models/llama/Cargo.toml @@ -8,8 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } - -bytemuck = { workspace = true } -rand = { workspace = true } -thiserror = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file diff --git a/crates/models/mpt/Cargo.toml b/crates/models/mpt/Cargo.toml index 9e2544e8..2a5cbcc4 100644 --- a/crates/models/mpt/Cargo.toml +++ b/crates/models/mpt/Cargo.toml @@ -8,5 +8,4 @@ edition = "2021" readme = "../../../README.md" [dependencies] -llm-base = { path = "../../llm-base", version = "0.2.0-dev" } -bytemuck = { workspace = true } +llm-base = { path = "../../llm-base", version = "0.2.0-dev" } \ No newline at end of file From e7e732eaefdfeb65893d74e5a344ef828dbbcf3e Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 03:04:46 +0200 Subject: [PATCH 18/21] refactor: vocabulary -> tokenizer --- README.md | 6 +- binaries/llm-cli/src/cli_args.rs | 54 ++--- binaries/llm-cli/src/main.rs | 25 ++- crates/ggml/src/format/loader.rs | 4 +- crates/ggml/src/tests.rs | 34 ++-- crates/llm-base/src/inference_session.rs | 26 +-- crates/llm-base/src/lib.rs | 11 +- crates/llm-base/src/loader.rs | 40 ++-- crates/llm-base/src/model/mod.rs | 24 +-- crates/llm-base/src/quantize.rs | 16 +- .../src/{vocabulary.rs => tokenizer.rs} | 184 +++++++++--------- crates/llm/examples/embeddings.rs | 22 +-- crates/llm/examples/inference.rs | 20 +- crates/llm/examples/vicuna-chat.rs | 20 +- crates/llm/src/lib.rs | 44 +++-- crates/models/bloom/src/lib.rs | 16 +- crates/models/falcon/src/lib.rs | 15 +- crates/models/gpt2/src/lib.rs | 14 +- crates/models/gptj/src/lib.rs | 15 +- crates/models/gptneox/src/lib.rs | 15 +- crates/models/llama/src/lib.rs | 14 +- crates/models/mpt/src/lib.rs | 17 +- 22 files changed, 317 insertions(+), 319 deletions(-) rename crates/llm-base/src/{vocabulary.rs => tokenizer.rs} (75%) diff --git a/README.md b/README.md index 28b3892e..7828fe63 100644 --- a/README.md +++ b/README.md @@ -175,10 +175,10 @@ llm gptneox infer -m RedPajama-INCITE-Base-3B-v1-q4_0.bin -p "Rust is a cool pro In the example above, the first two arguments specify the model architecture 
and command, respectively. The required `-m` argument specifies the local path to the model, and the required `-p` argument specifies the evaluation prompt. The -optional `-r` argument is used to load the model's vocabulary from a remote +optional `-r` argument is used to load the model's tokenizer from a remote Hugging Face 🤗 repository, which will typically improve results when compared -to loading the vocabulary from the model file itself; there is also an optional -`-v` argument that can be used to specify the path to a local vocabulary file. +to loading the tokenizer from the model file itself; there is also an optional +`-v` argument that can be used to specify the path to a local tokenizer file. For more information about the `llm` CLI, use the `--help` parameter. There is also a [simple inference example](./crates/llm/examples/inference.rs) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index 5aec546f..ce7db33f 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -4,7 +4,7 @@ use clap::{Parser, Subcommand, ValueEnum}; use color_eyre::eyre::{bail, Result, WrapErr}; use llm::{ ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias, - LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, VocabularySource, + LoadProgress, Model, ModelKVMemoryType, ModelParameters, TokenBias, TokenizerSource, }; use rand::SeedableRng; @@ -149,15 +149,15 @@ pub struct Perplexity { #[derive(Parser, Debug)] pub struct Info { #[command(flatten)] - pub model_and_vocabulary: ModelAndVocabulary, + pub model_and_tokenizer: ModelAndTokenizer, /// Show all of the tensors in the model, including their names, formats and shapes. #[arg(long, short = 't')] pub tensors: bool, - /// Show all of the tokens in the vocabulary. + /// Show all of the tokens in the tokenizer. 
#[arg(long, short = 'v')] - pub vocabulary: bool, + pub tokenizer: bool, } #[derive(Parser, Debug)] @@ -350,47 +350,47 @@ fn parse_bias(s: &str) -> Result { } #[derive(Parser, Debug)] -pub struct ModelVocabulary { - /// Local path to vocabulary +pub struct ModelTokenizer { + /// Local path to Hugging Face tokenizer file #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, - /// Remote HuggingFace repository containing vocabulary + /// Remote Hugging Face repository containing a tokenizer #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, } -impl ModelVocabulary { - pub fn to_source(&self) -> Result { - Ok(match (&self.vocabulary_path, &self.vocabulary_repository) { +impl ModelTokenizer { + pub fn to_source(&self) -> Result { + Ok(match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - bail!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + bail!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => VocabularySource::Model, + (Some(path), None) => TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => TokenizerSource::Embedded, }) } } #[derive(Parser, Debug)] -pub struct ModelAndVocabulary { +pub struct ModelAndTokenizer { /// Where to load the model from #[arg(long, short = 'm')] pub model_path: PathBuf, #[command(flatten)] - pub vocabulary: ModelVocabulary, + pub tokenizer: ModelTokenizer, } -impl ModelAndVocabulary { - pub fn to_source(&self) -> Result { - self.vocabulary.to_source() +impl ModelAndTokenizer { + pub fn to_source(&self) -> Result { + self.tokenizer.to_source() } } #[derive(Parser, Debug)] pub struct ModelLoad { #[command(flatten)] - pub model_and_vocabulary: ModelAndVocabulary, + pub model_and_tokenizer: ModelAndTokenizer, /// Sets the size of the context (in tokens). Allows feeding longer prompts. /// Note that this affects memory. @@ -431,19 +431,19 @@ impl ModelLoad { let now = std::time::Instant::now(); let mut prev_load_time = now; - let vocabulary_source = match self.model_and_vocabulary.to_source() { + let tokenizer_source = match self.model_and_tokenizer.to_source() { Ok(vs) => vs, Err(err) => { if let Some(sp) = sp.take() { - sp.fail(&format!("Failed to load vocabulary: {}", err)); + sp.fail(&format!("Failed to load tokenizer: {}", err)); } return Err(err); } }; let model = llm::load::( - &self.model_and_vocabulary.model_path, - vocabulary_source, + &self.model_and_tokenizer.model_path, + tokenizer_source, params, |progress| match progress { LoadProgress::HyperparametersLoaded => { @@ -557,7 +557,7 @@ pub struct Quantize { pub destination: PathBuf, #[command(flatten)] - pub vocabulary: ModelVocabulary, + pub tokenizer: ModelTokenizer, /// The GGML container type to target. /// diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 679a753e..45e4c127 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -143,15 +143,12 @@ fn perplexity(args: &cli_args::Perplexity) -> Resu } fn info(args: &cli_args::Info) -> Result<()> { - let model_path = &args.model_and_vocabulary.model_path; - let vocabulary = args - .model_and_vocabulary - .to_source()? 
- .retrieve(model_path)?; + let model_path = &args.model_and_tokenizer.model_path; + let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; let file = File::open(model_path)?; let mut reader = BufReader::new(&file); - let mut loader: llm::Loader = llm::Loader::new(vocabulary, |_| { + let mut loader: llm::Loader = llm::Loader::new(tokenizer, |_| { // We purposely do not print progress here, as we are only interested in the metadata }); @@ -159,12 +156,12 @@ fn info(args: &cli_args::Info) -> Result<()> { log::info!("Container type: {:?}", loader.container_type); log::info!("Hyperparameters: {:?}", loader.hyperparameters); - log::info!("Vocabulary size: {}", loader.vocabulary.len()); + log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); - if args.vocabulary { - log::info!("Vocabulary:"); - for i in 0..loader.vocabulary.len() { - log::info!("- {}: {}", i, utf8_or_array(&loader.vocabulary.token(i))); + if args.tokenizer { + log::info!("Tokens:"); + for i in 0..loader.tokenizer.len() { + log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); } } @@ -187,7 +184,7 @@ fn info(args: &cli_args::Info) -> Result<()> { fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let model = args.model_load.load::(false)?; - let toks = match model.vocabulary().tokenize(&prompt, false) { + let toks = match model.tokenizer().tokenize(&prompt, false) { Ok(toks) => toks, Err(e) => { log::error!("Could not tokenize prompt: {e}"); @@ -326,12 +323,12 @@ fn quantize(args: &cli_args::Quantize) -> Result<( let mut source = BufReader::new(std::fs::File::open(&args.source)?); let mut destination = BufWriter::new(std::fs::File::create(&args.destination)?); - let vocabulary = args.vocabulary.to_source()?.retrieve(&args.source)?; + let tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; llm::quantize::( &mut source, &mut destination, - vocabulary, + tokenizer, args.container_type.into(), args.target.into(), |progress| match progress { diff --git a/crates/ggml/src/format/loader.rs b/crates/ggml/src/format/loader.rs index f9dd76e8..ca0f9e6b 100644 --- a/crates/ggml/src/format/loader.rs +++ b/crates/ggml/src/format/loader.rs @@ -117,7 +117,7 @@ pub fn tensor_size(element_type: ElementType, n_elements: usize) -> usize { /// Information present within GGML [hyperparameters](https://en.wikipedia.org/wiki/Hyperparameter_(machine_learning)) /// that is required to continue loading the model. pub struct PartialHyperparameters { - /// The number of tokens in the model's vocabulary. + /// The number of tokens in the model's embedded vocabulary. pub n_vocab: usize, } @@ -125,7 +125,7 @@ pub struct PartialHyperparameters { pub trait LoadHandler { /// Called when the [ContainerType] is read. fn container_type(&mut self, container_type: ContainerType) -> Result<(), E>; - /// Called when a token is read so it can be added to the model's vocabulary. + /// Called when a token is read so it can be added to the model's embedded vocabulary. fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), E>; /// Called when the model's hyperparameters need to be read. 
fn read_hyperparameters( diff --git a/crates/ggml/src/tests.rs b/crates/ggml/src/tests.rs index 8b099c35..b842f45d 100644 --- a/crates/ggml/src/tests.rs +++ b/crates/ggml/src/tests.rs @@ -18,19 +18,19 @@ impl Error for DummyError {} #[test] fn can_roundtrip_loader_and_saver_ggml() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.0), ("fast".as_bytes().to_vec(), 0.0), ("memory".as_bytes().to_vec(), 0.0), ("efficient".as_bytes().to_vec(), 0.0), ]; - roundtrip_test(format::SaveContainerType::Ggml, vocabulary).unwrap(); + roundtrip_test(format::SaveContainerType::Ggml, tokenizer).unwrap(); } #[test] fn will_fail_on_scored_ggml_save() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.1), ("fast".as_bytes().to_vec(), 0.2), ("memory".as_bytes().to_vec(), 0.3), @@ -38,7 +38,7 @@ fn will_fail_on_scored_ggml_save() { ]; assert_eq!( - roundtrip_test(format::SaveContainerType::Ggml, vocabulary) + roundtrip_test(format::SaveContainerType::Ggml, tokenizer) .unwrap_err() .to_string(), format::SaveError::::VocabularyScoringNotSupported.to_string() @@ -47,19 +47,19 @@ fn will_fail_on_scored_ggml_save() { #[test] fn can_roundtrip_loader_and_saver_ggjt_v3() { - let vocabulary = vec![ + let tokenizer = vec![ ("blazingly".as_bytes().to_vec(), 0.1), ("fast".as_bytes().to_vec(), 0.2), ("memory".as_bytes().to_vec(), 0.3), ("efficient".as_bytes().to_vec(), 0.4), ]; - roundtrip_test(format::SaveContainerType::GgjtV3, vocabulary).unwrap(); + roundtrip_test(format::SaveContainerType::GgjtV3, tokenizer).unwrap(); } fn roundtrip_test( save_container_type: format::SaveContainerType, - vocabulary: Vec<(Vec, f32)>, + tokenizer: Vec<(Vec, f32)>, ) -> anyhow::Result<()> { let mut rng = rand::thread_rng(); let element_type = crate::Type::F16; @@ -67,9 +67,9 @@ fn roundtrip_test( hyperparameters: Hyperparameters { some_hyperparameter: random(), some_other_hyperparameter: random(), - vocabulary_size: vocabulary.len().try_into()?, + tokenizer_size: tokenizer.len().try_into()?, }, - vocabulary, + tokenizer, tensors: (0..10) .map(|i| { let n_dims = Uniform::from(1..3).sample(&mut rng); @@ -104,7 +104,7 @@ fn roundtrip_test( &mut cursor, &mut save_handler, save_container_type, - &model.vocabulary, + &model.tokenizer, &model.tensors.keys().cloned().collect::>(), )?; @@ -125,21 +125,21 @@ fn roundtrip_test( struct Hyperparameters { some_hyperparameter: u32, some_other_hyperparameter: u32, - vocabulary_size: u32, + tokenizer_size: u32, } impl Hyperparameters { fn read(reader: &mut dyn BufRead) -> Result { Ok(Self { some_hyperparameter: util::read_u32(reader)?, some_other_hyperparameter: util::read_u32(reader)?, - vocabulary_size: util::read_u32(reader)?, + tokenizer_size: util::read_u32(reader)?, }) } fn write(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> { util::write_u32(writer, self.some_hyperparameter)?; util::write_u32(writer, self.some_other_hyperparameter)?; - util::write_u32(writer, self.vocabulary_size)?; + util::write_u32(writer, self.tokenizer_size)?; Ok(()) } } @@ -147,7 +147,7 @@ impl Hyperparameters { #[derive(Default, PartialEq, Debug)] struct Model { hyperparameters: Hyperparameters, - vocabulary: Vec<(Vec, f32)>, + tokenizer: Vec<(Vec, f32)>, tensors: BTreeMap, } @@ -181,8 +181,8 @@ impl format::LoadHandler for MockLoadHandler<'_> { } fn vocabulary_token(&mut self, i: usize, token: Vec, score: f32) -> Result<(), DummyError> { - assert_eq!(i, self.loaded_model.vocabulary.len()); - self.loaded_model.vocabulary.push((token, 
score)); + assert_eq!(i, self.loaded_model.tokenizer.len()); + self.loaded_model.tokenizer.push((token, score)); Ok(()) } @@ -195,7 +195,7 @@ impl format::LoadHandler for MockLoadHandler<'_> { n_vocab: self .loaded_model .hyperparameters - .vocabulary_size + .tokenizer_size .try_into() .unwrap(), }) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 57ea0908..4d1489a4 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -289,7 +289,7 @@ impl InferenceSession { ) -> Result<(), InferenceError> { let beginning_of_sentence = self.n_past == 0; - let vocab = model.vocabulary(); + let vocab = model.tokenizer(); let prompt_tokens = prompt.into().to_tokens(vocab, beginning_of_sentence)?; if self.n_past + prompt_tokens.len() >= model.context_size() { @@ -301,13 +301,13 @@ impl InferenceSession { for &tk in batch { let should_call_callback = Some(tk) != model.bot_token_id(); - let mut token = match model.vocabulary() { - crate::Vocabulary::Model(_) => model.vocabulary().token(tk as usize).to_vec(), - crate::Vocabulary::External(_) => { + let mut token = match model.tokenizer() { + crate::Tokenizer::Embedded(_) => model.tokenizer().token(tk as usize).to_vec(), + crate::Tokenizer::HuggingFace(_) => { let mut previous_tokens = self.tokens.clone(); previous_tokens.push(tk); - let all_tokens = model.vocabulary().decode(previous_tokens, true); + let all_tokens = model.tokenizer().decode(previous_tokens, true); let splitted = all_tokens.split_at(self.decoded_tokens.len()); splitted.1.to_vec() @@ -359,12 +359,12 @@ impl InferenceSession { if next_token as TokenId == model.eot_token_id() { Err(InferenceError::EndOfText) } else { - let res = match model.vocabulary() { - crate::Vocabulary::Model(_) => { - model.vocabulary().token(next_token as usize).to_vec() + let res = match model.tokenizer() { + crate::Tokenizer::Embedded(_) => { + model.tokenizer().token(next_token as usize).to_vec() } - crate::Vocabulary::External(_) => { - let all_tokens = model.vocabulary().decode(self.tokens.clone(), true); + crate::Tokenizer::HuggingFace(_) => { + let all_tokens = model.tokenizer().decode(self.tokens.clone(), true); let splitted = all_tokens.split_at(self.decoded_tokens.len()); splitted.1.to_vec() @@ -399,7 +399,7 @@ impl InferenceSession { for token_id in &self.tokens { // Buffer the token until it's valid UTF-8, then call the callback. if let Some(tokens) = - token_utf8_buf.push(&model.vocabulary().token(*token_id as usize)) + token_utf8_buf.push(&model.tokenizer().token(*token_id as usize)) { if let Err(e) = callback(InferenceResponse::SnapshotToken(tokens)) { return Err(InferenceError::UserCallback(Box::new(e))); @@ -472,14 +472,14 @@ impl InferenceSession { ) -> Result<(), TokenizationError> { // Implementation based on perplexity example of llama.cpp: // https://github.com/ggerganov/llama.cpp/blob/2d5db48371052087a83974abda3767d1aedec598/examples/perplexity/perplexity.cpp#L24 - let mut tokens = prompt.into().to_tokens(model.vocabulary(), true)?; + let mut tokens = prompt.into().to_tokens(model.tokenizer(), true)?; let mut count = 0; // TODO: make this handle , }, - /// The vocab file for the tokenizer could not be loaded. - #[error("could not load vocabulary file {path:?}: {error}")] - VocabularyLoadError { - /// The invalid vocabulary path + /// The tokenizer could not be loaded. 
+ #[error("could not load tokenizer {path:?}: {error}")] + TokenizerLoadError { + /// The invalid tokenizer path path: PathBuf, /// The error that occurred. @@ -343,9 +343,9 @@ impl From for LoadError { } } } -impl From for LoadError { - fn from(value: VocabularyLoadError) -> Self { - LoadError::VocabularyLoadError { +impl From for LoadError { + fn from(value: TokenizerLoadError) -> Self { + LoadError::TokenizerLoadError { path: value.path, error: value.error, } @@ -405,7 +405,7 @@ pub trait TensorLoader { /// store any information about the architecture. pub fn load( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result { @@ -426,15 +426,15 @@ pub fn load( })?; let mut reader = BufReader::new(&file); - let vocabulary = vocabulary_source.retrieve(path)?; - let mut loader = Loader::new(vocabulary, load_progress_callback); + let tokenizer = tokenizer_source.retrieve(path)?; + let mut loader = Loader::new(tokenizer, load_progress_callback); ggml::format::load(&mut reader, &mut loader) .map_err(|err| LoadError::from_format_error(err, path.to_owned()))?; let Loader { hyperparameters, - vocabulary, + tokenizer, tensors, mut load_progress_callback, container_type, @@ -486,7 +486,7 @@ pub fn load( // TODO: Consider updating the progress callback to report the progress of the LoRA file. // Most LoRAs are small enough that this is not necessary, but it would be nice to have. let mut lora_loader: Loader = - Loader::new(ModelVocabulary::default().into(), |_| {}); + Loader::new(Tokenizer::empty_embedded(), |_| {}); ggml::format::load(&mut lora_reader, &mut lora_loader) .map_err(|err| LoadError::from_format_error(err, lora_path.to_owned()))?; @@ -533,7 +533,7 @@ pub fn load( loaded_tensors: Default::default(), }; - let model = KnownModel::new(hyperparameters, params, vocabulary, tl)?; + let model = KnownModel::new(hyperparameters, params, tokenizer, tl)?; (load_progress_callback)(LoadProgress::Loaded { file_size, @@ -549,8 +549,8 @@ pub struct Loader { load_progress_callback: F, // Input/Output - /// The vocabulary of the model. - pub vocabulary: Vocabulary, + /// The tokenizer of the model. + pub tokenizer: Tokenizer, // Output /// The container type of the model. @@ -562,13 +562,13 @@ pub struct Loader { } impl Loader { /// Creates a new loader. 
- pub fn new(vocabulary: Vocabulary, load_progress_callback: F) -> Self { + pub fn new(tokenizer: Tokenizer, load_progress_callback: F) -> Self { Self { load_progress_callback, container_type: ContainerType::Ggml, hyperparameters: Hp::default(), - vocabulary, + tokenizer, tensors: HashMap::default(), } } @@ -582,7 +582,7 @@ impl ggml::format::LoadHandler, score: f32) -> Result<(), LoadError> { - if let Vocabulary::Model(mv) = &mut self.vocabulary { + if let Tokenizer::Embedded(mv) = &mut self.tokenizer { let id = match TokenId::try_from(i) { Ok(id) => id, Err(err) => return Err(LoadError::InvalidIntegerConversion(err)), diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index 15bd8ee5..bee50f37 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -11,8 +11,8 @@ use regex::Regex; use thiserror::Error; use crate::{ - loader::TensorLoader, vocabulary::TokenId, FileType, InferenceParameters, InferenceSession, - InferenceSessionConfig, LoadError, LoadProgress, Vocabulary, VocabularySource, + loader::TensorLoader, tokenizer::TokenId, FileType, InferenceParameters, InferenceSession, + InferenceSessionConfig, LoadError, LoadProgress, Tokenizer, TokenizerSource, }; /// Common functions for model evaluation @@ -29,14 +29,14 @@ pub trait KnownModel: Send + Sync { /// is a helper function on top of [llm_base::load](crate::load). fn load( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result where Self: Sized, { - crate::load(path, vocabulary_source, params, load_progress_callback) + crate::load(path, tokenizer_source, params, load_progress_callback) } /// Creates a new model from the provided [ModelParameters] hyperparameters. @@ -44,7 +44,7 @@ pub trait KnownModel: Send + Sync { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -65,8 +65,8 @@ pub trait KnownModel: Send + Sync { output_request: &mut OutputRequest, ); - /// Get the vocabulary for this model. - fn vocabulary(&self) -> &Vocabulary; + /// Get the tokenizer for this model. + fn tokenizer(&self) -> &Tokenizer; /// Get the context size (configured with [ModelParameters::context_size]) used by /// this model. @@ -103,8 +103,8 @@ pub trait Model: Send + Sync { output_request: &mut OutputRequest, ); - /// Get the vocabulary for this model. - fn vocabulary(&self) -> &Vocabulary; + /// Get the tokenizer for this model. + fn tokenizer(&self) -> &Tokenizer; /// Get the context size (configured with [ModelParameters::context_size]) used by /// this model. @@ -131,8 +131,8 @@ impl> Model for M { KnownModel::evaluate(self, session, params, input_tokens, output_request) } - fn vocabulary(&self) -> &Vocabulary { - KnownModel::vocabulary(self) + fn tokenizer(&self) -> &Tokenizer { + KnownModel::tokenizer(self) } fn context_size(&self) -> usize { @@ -157,7 +157,7 @@ pub trait Hyperparameters: Sized + Default + Debug { /// Write the parameters in GGML format to a writer. fn write_ggml(&self, writer: &mut dyn Write) -> Result<(), HyperparametersWriteError>; - /// Get the number of tokens in the vocabulary. + /// Get the number of tokens in the embedded vocabulary, if any. fn n_vocabulary(&self) -> usize; /// Get the filetype of the model. 
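For downstream code the rename is mechanical: `Model::vocabulary()` becomes `Model::tokenizer()` and `VocabularySource` becomes `TokenizerSource`. A minimal sketch of tokenizing a prompt through the renamed accessor, assuming `model` implements the re-exported `llm::Model` trait:

```rust
use llm::Model;

// Hedged sketch of the renamed accessor; error handling is elided for brevity.
fn print_prompt_tokens(model: &dyn Model, prompt: &str) {
    let tokens = model
        .tokenizer()
        .tokenize(prompt, /* bos */ true)
        .expect("tokenization failed");

    // Each entry is the raw token bytes paired with its TokenId.
    for (bytes, id) in tokens {
        println!("{id}: {}", String::from_utf8_lossy(&bytes));
    }
}
```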
diff --git a/crates/llm-base/src/quantize.rs b/crates/llm-base/src/quantize.rs index 696f7698..187a6fc0 100644 --- a/crates/llm-base/src/quantize.rs +++ b/crates/llm-base/src/quantize.rs @@ -2,7 +2,7 @@ use crate::{ loader::FileTypeFormat, model::HyperparametersWriteError, Hyperparameters, KnownModel, - LoadError, LoadProgress, Loader, Vocabulary, + LoadError, LoadProgress, Loader, Tokenizer, }; use ggml::format::{SaveError, SaveHandler, TensorLoadInfo, TensorSaveInfo}; use half::f16; @@ -139,7 +139,7 @@ impl QuantizeError { pub fn quantize( reader: &mut R, writer: &mut W, - vocabulary: Vocabulary, + tokenizer: Tokenizer, save_container_type: ggml::format::SaveContainerType, quantization_type: ggml::Type, progress_callback: impl Fn(QuantizeProgress), @@ -154,7 +154,7 @@ pub fn quantize( // Load the model let progress_callback = Arc::new(progress_callback); - let mut loader = Loader::::new(vocabulary, { + let mut loader = Loader::::new(tokenizer, { let progress_callback = progress_callback.clone(); move |p| { if let LoadProgress::HyperparametersLoaded = p { @@ -168,7 +168,7 @@ pub fn quantize( // Save the quantized model, quantizing as we go let Loader { mut hyperparameters, - vocabulary, + tokenizer, tensors, .. } = loader; @@ -180,14 +180,14 @@ pub fn quantize( .expect("format has no corresponding ftype"); } - let vocabulary = match vocabulary { - Vocabulary::Model(v) => v + let tokenizer = match tokenizer { + Tokenizer::Embedded(v) => v .id_to_token .iter() .cloned() .zip(v.id_to_token_score) .collect::>(), - Vocabulary::External(_) => vec![], + Tokenizer::HuggingFace(_) => vec![], }; let to_quantize = M::quantize_tensors(); @@ -205,7 +205,7 @@ pub fn quantize( writer, &mut saver, save_container_type, - &vocabulary, + &tokenizer, &tensors.keys().cloned().collect::>(), ) .map_err(|err| QuantizeError::from_format_error(err, PathBuf::default()))?; diff --git a/crates/llm-base/src/vocabulary.rs b/crates/llm-base/src/tokenizer.rs similarity index 75% rename from crates/llm-base/src/vocabulary.rs rename to crates/llm-base/src/tokenizer.rs index 43540c83..b914eb3f 100644 --- a/crates/llm-base/src/vocabulary.rs +++ b/crates/llm-base/src/tokenizer.rs @@ -7,9 +7,8 @@ use std::{ }; use thiserror::Error; -use tokenizers::Tokenizer; -/// The identifier of a token in a vocabulary. +/// The identifier of a token in a tokenizer. pub type TokenId = u32; pub(crate) type Token = Vec; pub(crate) type TokenScore = f32; @@ -25,21 +24,21 @@ pub enum TokenizationError { error: Box, }, #[error("the token ID {0} was invalid for this model")] - /// One of the tokens provided by the user was invalid, and did not belong to this model's vocabulary. + /// One of the tokens provided by the user was invalid, and did not belong to this model's tokenizer. InvalidTokenId(TokenId), } #[derive(Error, Debug)] -/// Errors related to loading the vocabulary. -#[error("error loading vocabulary from {path}: {error}")] -pub struct VocabularyLoadError { - /// The path to the vocabulary. +/// Errors related to loading the tokenizer. +#[error("error loading tokenizer from {path}: {error}")] +pub struct TokenizerLoadError { + /// The path to the tokenizer. pub path: PathBuf, /// The error that occurred during loading. pub error: Box, } -impl VocabularyLoadError { +impl TokenizerLoadError { fn new(path: impl Into, error: impl Into>) -> Self { Self { path: path.into(), @@ -49,111 +48,118 @@ impl VocabularyLoadError { } #[derive(Clone, Debug, PartialEq)] -/// The source of a vocabulary. 
-pub enum VocabularySource { +/// The source of a tokenizer. +pub enum TokenizerSource { /// Read the vocabulary from the model if available, and use a simplistic tokenizer. /// /// This is easy to use, but may not be the best choice for your use case, and is not /// guaranteed to be available for all models. - Model, + Embedded, - /// Read the vocabulary from a local HuggingFace-format tokenizer file, and use the + /// Read the tokenizer from a local HuggingFace-format tokenizer file, and use the /// HuggingFace tokenizer. HuggingFaceTokenizerFile(PathBuf), - /// Fetch the vocabulary from a remote HuggingFace repository. This will make a blocking - /// HTTP request to HuggingFace to retrieve the vocabulary and may store files locally, + /// Fetch the tokenizer from a remote HuggingFace repository. This will make a blocking + /// HTTP request to HuggingFace to retrieve the tokenizer and may store files locally, /// so it is not recommended for production use. This will use the HuggingFace tokenizer. HuggingFaceRemote(String), } -impl VocabularySource { - /// Retrieve the vocabulary from the source. +impl TokenizerSource { + /// Retrieve the tokenizer from the source. /// - /// Note that this may make a blocking HTTP request to HuggingFace to retrieve the vocabulary + /// Note that this may make a blocking HTTP request to HuggingFace to retrieve the tokenizer. /// if `self` is [`Self::HuggingFaceRemote`]. - pub fn retrieve(self, model_path: &Path) -> Result { + pub fn retrieve(self, model_path: &Path) -> Result { Ok(match self { - Self::HuggingFaceRemote(identifier) => ExternalVocabulary::new( - Tokenizer::from_pretrained(&identifier, None) - .map_err(|error| VocabularyLoadError::new(model_path, error))?, + Self::HuggingFaceRemote(identifier) => HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_pretrained(&identifier, None) + .map_err(|error| TokenizerLoadError::new(model_path, error))?, ) .into(), Self::HuggingFaceTokenizerFile(path) => { if !path.is_file() { - return Err(VocabularyLoadError::new( + return Err(TokenizerLoadError::new( path, std::io::Error::new( std::io::ErrorKind::NotFound, - "Vocabulary file not found", + "Tokenizer was not a file, or did not exist", ), )); } - ExternalVocabulary::new( - Tokenizer::from_file(&path) - .map_err(|error| VocabularyLoadError::new(path, error))?, + HuggingFaceTokenizer::new( + tokenizers::Tokenizer::from_file(&path) + .map_err(|error| TokenizerLoadError::new(path, error))?, ) .into() } - Self::Model => ModelVocabulary::default().into(), + Self::Embedded => EmbeddedTokenizer::default().into(), }) } } -/// Vocabulary enum -pub enum Vocabulary { +/// Encapsulates the tokenizer for a model, and provides methods to tokenize text. +pub enum Tokenizer { /// The vocabulary built-in to the model. - Model(ModelVocabulary), + Embedded(EmbeddedTokenizer), - /// A custom vocabulary provided by the user. - External(ExternalVocabulary), + /// A Hugging Face tokenizer. + HuggingFace(HuggingFaceTokenizer), } -impl From for Vocabulary { - fn from(v: ModelVocabulary) -> Self { - Self::Model(v) +impl From for Tokenizer { + fn from(v: EmbeddedTokenizer) -> Self { + Self::Embedded(v) } } -impl From for Vocabulary { - fn from(v: ExternalVocabulary) -> Self { - Self::External(v) +impl From for Tokenizer { + fn from(v: HuggingFaceTokenizer) -> Self { + Self::HuggingFace(v) } } -impl Vocabulary { - /// Converts a token to the token ID it represents in this vocabulary. 
+impl Tokenizer { + /// Creates an empty embedded tokenizer, for contexts where you need a tokenizer but don't + /// need to tokenize anything. + pub(crate) fn empty_embedded() -> Self { + Self::Embedded(EmbeddedTokenizer::default()) + } +} +impl Tokenizer { + /// Converts a token to the token ID it represents in this tokenizer. pub fn id(&self, token: &[u8]) -> Option { match self { - Vocabulary::Model(v) => v.id(token), - Vocabulary::External(v) => v.id(token), + Tokenizer::Embedded(v) => v.id(token), + Tokenizer::HuggingFace(v) => v.id(token), } } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. pub fn token(&self, idx: usize) -> Vec { match self { - Vocabulary::Model(v) => v.token(idx), - Vocabulary::External(v) => v.token(idx), + Tokenizer::Embedded(v) => v.token(idx), + Tokenizer::HuggingFace(v) => v.token(idx), } } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. pub fn len(&self) -> usize { match self { - Vocabulary::Model(v) => v.len(), - Vocabulary::External(v) => v.len(), + Tokenizer::Embedded(v) => v.len(), + Tokenizer::HuggingFace(v) => v.len(), } } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. pub fn is_empty(&self) -> bool { match self { - Vocabulary::Model(v) => v.is_empty(), - Vocabulary::External(v) => v.is_empty(), + Tokenizer::Embedded(v) => v.is_empty(), + Tokenizer::HuggingFace(v) => v.is_empty(), } } - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. pub fn tokenize( @@ -162,31 +168,31 @@ impl Vocabulary { bos: bool, ) -> Result, TokenId)>, TokenizationError> { match self { - Vocabulary::Model(v) => v.tokenize(text, bos), - Vocabulary::External(v) => v.tokenize(text, bos), + Tokenizer::Embedded(v) => v.tokenize(text, bos), + Tokenizer::HuggingFace(v) => v.tokenize(text, bos), } } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. pub fn decode(&self, tokens: Vec, bos: bool) -> Vec { match self { - Vocabulary::Model(v) => v.decode(tokens, bos), - Vocabulary::External(v) => v.decode(tokens, bos), + Tokenizer::Embedded(v) => v.decode(tokens, bos), + Tokenizer::HuggingFace(v) => v.decode(tokens, bos), } } } #[derive(Debug, Error)] -/// Errors that can occur when using a model vocabulary. -pub enum ModelVocabularyError { - /// Arbitrary error that occurred during use of the model vocabulary. +/// Errors that can occur when using a model tokenizer. +pub enum ModelTokenizerError { + /// Arbitrary error that occurred during use of the model tokenizer. #[error("Arbitrary error: {0:?}")] Arbitrary(String), } -/// The built-in GGML vocabulary. +/// The built-in GGML tokenizer. #[derive(Debug, Clone, Default)] -pub struct ModelVocabulary { +pub struct EmbeddedTokenizer { // TODO: make these private /// Maps every integer (index) token ID to its corresponding token. pub id_to_token: Vec, @@ -198,12 +204,12 @@ pub struct ModelVocabulary { /// Maps a token to a token ID. pub token_to_id: HashMap, - /// The longest token in this vocabulary. + /// The longest token in this tokenizer. pub max_token_length: usize, } -impl ModelVocabulary { - /// Add a token to the vocabulary. +impl EmbeddedTokenizer { + /// Add a token to the internal vocabulary. 
/// /// The token added must have `id` directly after the last token in the vocabulary. /// @@ -229,23 +235,23 @@ impl ModelVocabulary { self.token_to_id.get(token).copied() } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. fn token(&self, idx: usize) -> Vec { self.id_to_token[idx].clone() } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. fn len(&self) -> usize { self.id_to_token.len() } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. fn is_empty(&self) -> bool { self.id_to_token.is_empty() } // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. fn tokenize( @@ -284,7 +290,7 @@ impl ModelVocabulary { let token_id = prev[i]; if token_id == 0 { return Err(TokenizationError::TokenizationFailed { - error: Box::new(ModelVocabularyError::Arbitrary( + error: Box::new(ModelTokenizerError::Arbitrary( "the backward pass for the tokenizer encountered a non-set token" .to_string(), )), @@ -306,7 +312,7 @@ impl ModelVocabulary { Ok(res) } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { let mut vec = vec![]; @@ -322,45 +328,45 @@ impl ModelVocabulary { } } -/// A vocabulary that does not originate from the model file. +/// A Hugging Face tokenizer. #[derive(Debug, Clone)] -pub struct ExternalVocabulary { - tokenizer: Tokenizer, +pub struct HuggingFaceTokenizer { + tokenizer: tokenizers::Tokenizer, } -impl ExternalVocabulary { - /// Create a new `ExternalVocabulary`. - pub fn new(tokenizer: Tokenizer) -> Self { +impl HuggingFaceTokenizer { + /// Create a new `HuggingFaceTokenizer`. + pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { Self { tokenizer } } } -impl ExternalVocabulary { +impl HuggingFaceTokenizer { fn id(&self, token: &[u8]) -> Option { self.tokenizer .token_to_id(std::str::from_utf8(token).unwrap()) } - /// Converts a token index to the token it represents in this vocabulary. + /// Converts a token index to the token it represents in this tokenizer. fn token(&self, idx: usize) -> Vec { self.tokenizer .decode(vec![idx as u32], true) - .expect("Cannot decode token from tokenizer vocabulary.") + .expect("Cannot decode token from tokenizer tokenizer.") .as_bytes() .to_vec() } - /// Returns the number of tokens in the vocabulary. + /// Returns the number of tokens in the tokenizer. fn len(&self) -> usize { self.tokenizer.get_vocab_size(false) } - /// Returns whether the vocabulary is empty. + /// Returns whether the tokenizer is empty. fn is_empty(&self) -> bool { self.tokenizer.get_vocab_size(false) == 0 } - /// Tokenize a `text` with this vocabulary. + /// Tokenize a `text` with this tokenizer. /// /// `bos` controls whether a beginning-of-string token should be inserted. fn tokenize( @@ -386,11 +392,11 @@ impl ExternalVocabulary { .collect()) } - /// decode a list `tokens` with this vocabulary. + /// Decode a list `tokens` with this tokenizer. 
fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { self.tokenizer .decode(tokens, skip_special_tokens) - .expect("Cannot decode token from tokenizer vocabulary.") + .expect("Cannot decode token from tokenizer.") .as_bytes() .to_vec() } @@ -409,17 +415,17 @@ impl ExternalVocabulary { pub enum Prompt<'a> { /// A prompt specified as text. Text(&'a str), - /// A prompt specified as tokens for this model's vocabulary. + /// A prompt specified as tokens for this model's tokenizer. Tokens(&'a [TokenId]), } impl Prompt<'_> { - /// Converts this prompt to a list of tokens for this model's vocabulary. + /// Converts this prompt to a list of tokens for this model's tokenizer. /// /// Can return an error if [Self::Tokens] is used and includes a token ID that is not - /// in this model's vocabulary. + /// in this model's tokenizer. pub fn to_tokens( &self, - vocab: &Vocabulary, + vocab: &Tokenizer, beginning_of_sentence: bool, ) -> Result, TokenizationError> { Ok(match self { diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index a4a7fdeb..74207a1d 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -7,23 +7,23 @@ struct Args { model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, #[arg(long, short = 'q')] pub query: Option, #[arg(long, short = 'c')] pub comparands: Vec, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -31,7 +31,7 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let query = args @@ -53,7 +53,7 @@ fn main() { let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, model_params, llm::load_progress_callback_stdout, ) @@ -117,7 +117,7 @@ fn get_embeddings( all_logits: None, embeddings: Some(Vec::new()), }; - let vocab = model.vocabulary(); + let vocab = model.tokenizer(); let beginning_of_sentence = true; let query_token_ids = vocab .tokenize(query, beginning_of_sentence) diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs index d2385b8c..aa740b02 100644 --- a/crates/llm/examples/inference.rs +++ b/crates/llm/examples/inference.rs @@ -8,19 +8,19 @@ struct Args { #[arg(long, short = 'p')] prompt: Option, #[arg(long, short = 'v')] - vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] 
- vocabulary_repository: Option, + pub tokenizer_repository: Option, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -28,7 +28,7 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let prompt = args @@ -41,7 +41,7 @@ fn main() { let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index 98d94606..e08f0be3 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -7,19 +7,19 @@ struct Args { model_architecture: llm::ModelArchitecture, model_path: PathBuf, #[arg(long, short = 'v')] - pub vocabulary_path: Option, + pub tokenizer_path: Option, #[arg(long, short = 'r')] - pub vocabulary_repository: Option, + pub tokenizer_repository: Option, } impl Args { - pub fn to_vocabulary_source(&self) -> llm::VocabularySource { - match (&self.vocabulary_path, &self.vocabulary_repository) { + pub fn to_tokenizer_source(&self) -> llm::TokenizerSource { + match (&self.tokenizer_path, &self.tokenizer_repository) { (Some(_), Some(_)) => { - panic!("Cannot specify both --vocabulary-path and --vocabulary-repository"); + panic!("Cannot specify both --tokenizer-path and --tokenizer-repository"); } - (Some(path), None) => llm::VocabularySource::HuggingFaceTokenizerFile(path.to_owned()), - (None, Some(repo)) => llm::VocabularySource::HuggingFaceRemote(repo.to_owned()), - (None, None) => llm::VocabularySource::Model, + (Some(path), None) => llm::TokenizerSource::HuggingFaceTokenizerFile(path.to_owned()), + (None, Some(repo)) => llm::TokenizerSource::HuggingFaceRemote(repo.to_owned()), + (None, None) => llm::TokenizerSource::Embedded, } } } @@ -27,13 +27,13 @@ impl Args { fn main() { let args = Args::parse(); - let vocabulary_source = args.to_vocabulary_source(); + let tokenizer_source = args.to_tokenizer_source(); let model_architecture = args.model_architecture; let model_path = args.model_path; let model = llm::load_dynamic( model_architecture, &model_path, - vocabulary_source, + tokenizer_source, Default::default(), llm::load_progress_callback_stdout, ) diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 8adda7e7..30ea6c56 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -22,8 +22,8 @@ //! let llama = llm::load::( //! // path to GGML file //! std::path::Path::new("/path/to/model"), -//! // llm::VocabularySource -//! 
llm::VocabularySource::Model, +//! // llm::TokenizerSource +//! llm::TokenizerSource::Embedded, //! // llm::ModelParameters //! Default::default(), //! // load progress callback @@ -83,8 +83,8 @@ pub use llm_base::{ InferenceSessionConfig, InferenceSnapshot, InferenceSnapshotRef, InferenceStats, InvalidTokenBias, KnownModel, LoadError, LoadProgress, Loader, Model, ModelKVMemoryType, ModelParameters, OutputRequest, Prompt, QuantizeError, QuantizeProgress, Sampler, - SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Vocabulary, - VocabularySource, + SnapshotError, TokenBias, TokenId, TokenUtf8Buffer, TokenizationError, Tokenizer, + TokenizerSource, }; use serde::Serialize; @@ -233,21 +233,21 @@ impl Display for ModelArchitecture { pub fn load_dynamic( architecture: ModelArchitecture, path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { - use ModelArchitecture::*; + use ModelArchitecture as MA; fn load_model( path: &Path, - vocabulary_source: VocabularySource, + tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { Ok(Box::new(load::( path, - vocabulary_source, + tokenizer_source, params, load_progress_callback, )?)) @@ -255,30 +255,32 @@ pub fn load_dynamic( let model: Box = match architecture { #[cfg(feature = "bloom")] - Bloom => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Bloom => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gpt2")] - Gpt2 => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Gpt2 => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gptj")] - GptJ => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::GptJ => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "gptneox")] - GptNeoX => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::GptNeoX => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "llama")] - Llama => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Llama => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } #[cfg(feature = "mpt")] - Mpt => load_model::(path, vocabulary_source, params, load_progress_callback)?, + MA::Mpt => { + load_model::(path, tokenizer_source, params, load_progress_callback)? + } #[cfg(feature = "falcon")] - Falcon => { - load_model::(path, vocabulary_source, params, load_progress_callback)? + MA::Falcon => { + load_model::(path, tokenizer_source, params, load_progress_callback)? } }; diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs index d44f143e..18cd5e5b 100644 --- a/crates/models/bloom/src/lib.rs +++ b/crates/models/bloom/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The BLOOM model. 
Ref: [Introducing BLOOM](https://bigscience.huggingface.co/blog/bloom) @@ -20,7 +20,7 @@ pub struct Bloom { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -50,7 +50,7 @@ impl KnownModel for Bloom { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -96,7 +96,7 @@ impl KnownModel for Bloom { Ok(Bloom { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, norm_bias, @@ -369,8 +369,8 @@ impl KnownModel for Bloom { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -378,11 +378,11 @@ impl KnownModel for Bloom { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("".as_bytes()) + self.tokenizer.id("".as_bytes()) } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("".as_bytes()).unwrap() + self.tokenizer.id("".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs index 3b989e26..8ee37453 100644 --- a/crates/models/falcon/src/lib.rs +++ b/crates/models/falcon/src/lib.rs @@ -14,7 +14,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The Falcon model. Ref: [Technology Innovation Institute](https://huggingface.co/tiiuae) @@ -27,7 +27,7 @@ pub struct Falcon { hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -52,7 +52,7 @@ impl KnownModel for Falcon { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -88,7 +88,7 @@ impl KnownModel for Falcon { Ok(Falcon { hyperparameters, context_size, - vocabulary, + tokenizer, tok_embeddings, output_norm, output_norm_b, @@ -328,9 +328,8 @@ impl KnownModel for Falcon { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - /// Returns the vocabulary used by this model. 
- fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -342,7 +341,7 @@ impl KnownModel for Falcon { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs index abc0726d..1b2427a5 100644 --- a/crates/models/gpt2/src/lib.rs +++ b/crates/models/gpt2/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The GPT-2 model. Ref: [The Illustrated GPT-2](https://jalammar.github.io/illustrated-gpt2/) @@ -20,7 +20,7 @@ pub struct Gpt2 { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +49,7 @@ impl KnownModel for Gpt2 { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -88,7 +88,7 @@ impl KnownModel for Gpt2 { Ok(Gpt2 { hyperparameters, context_size, - vocabulary, + tokenizer, layers, ln_f_g, ln_f_b, @@ -323,8 +323,8 @@ impl KnownModel for Gpt2 { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -336,7 +336,7 @@ impl KnownModel for Gpt2 { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs index 5ec7d5bc..92ee8f4a 100644 --- a/crates/models/gptj/src/lib.rs +++ b/crates/models/gptj/src/lib.rs @@ -8,8 +8,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-J model. 
Ref: [GitHub](https://github.com/kingoflolz/mesh-transformer-jax/#gpt-j-6b) @@ -21,7 +20,7 @@ pub struct GptJ { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +48,7 @@ impl KnownModel for GptJ { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -89,7 +88,7 @@ impl KnownModel for GptJ { Ok(GptJ { hyperparameters, context_size, - vocabulary, + tokenizer, ln_f_g, ln_f_b, wte, @@ -292,8 +291,8 @@ impl KnownModel for GptJ { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -305,7 +304,7 @@ impl KnownModel for GptJ { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs index 5b4ea0c0..84a5c417 100644 --- a/crates/models/gptneox/src/lib.rs +++ b/crates/models/gptneox/src/lib.rs @@ -9,8 +9,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The GPT-NeoX model. Ref: [GitHub](https://github.com/EleutherAI/gpt-neox) @@ -22,7 +21,7 @@ pub struct GptNeoX { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // normalization gain & bias @@ -49,7 +48,7 @@ impl KnownModel for GptNeoX { fn new( hyperparameters: Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result where @@ -103,7 +102,7 @@ impl KnownModel for GptNeoX { Ok(GptNeoX { hyperparameters, context_size, - vocabulary, + tokenizer, ln_f_g, ln_f_b, wte, @@ -338,8 +337,8 @@ impl KnownModel for GptNeoX { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -351,7 +350,7 @@ impl KnownModel for GptNeoX { } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index d4abb2e1..025352fd 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -7,8 +7,7 @@ use llm_base::{ ggml, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, - Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer, }; /// The LLaMA model. 
Ref: [Introducing LLaMA](https://ai.facebook.com/blog/large-language-model-llama-meta-ai/) @@ -20,7 +19,7 @@ pub struct Llama { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -46,7 +45,7 @@ impl KnownModel for Llama { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -80,7 +79,7 @@ impl KnownModel for Llama { Ok(Self { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, output, @@ -322,9 +321,8 @@ impl KnownModel for Llama { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len); } - /// Returns the vocabulary used by this model. - fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs index 10ce78e9..56e129e4 100644 --- a/crates/models/mpt/src/lib.rs +++ b/crates/models/mpt/src/lib.rs @@ -8,7 +8,7 @@ use llm_base::{ ggml::{self}, model::{common, HyperparametersWriteError}, util, FileType, GraphOutputs, InferenceParameters, InferenceSession, InferenceSessionConfig, - KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Vocabulary, + KnownModel, LoadError, ModelParameters, OutputRequest, Regex, TokenId, Tokenizer, }; /// The MosaicML Pretrained Transformer (MPT) model. Ref: [Mosaic ML](https://www.mosaicml.com/blog/mpt-7b) @@ -20,7 +20,7 @@ pub struct Mpt { context_size: usize, hyperparameters: Hyperparameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, // model-global weights // weighted token embeddings @@ -44,7 +44,7 @@ impl KnownModel for Mpt { fn new( hyperparameters: Self::Hyperparameters, params: ModelParameters, - vocabulary: Vocabulary, + tokenizer: Tokenizer, tensor_loader: impl llm_base::TensorLoader, ) -> Result { let mut tl = tensor_loader; @@ -77,7 +77,7 @@ impl KnownModel for Mpt { Ok(Mpt { hyperparameters, context_size, - vocabulary, + tokenizer, wte, norm, layers, @@ -271,9 +271,8 @@ impl KnownModel for Mpt { common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n); } - /// Returns the vocabulary used by this model. 
- fn vocabulary(&self) -> &Vocabulary { - &self.vocabulary + fn tokenizer(&self) -> &Tokenizer { + &self.tokenizer } fn context_size(&self) -> usize { @@ -281,11 +280,11 @@ impl KnownModel for Mpt { } fn bot_token_id(&self) -> Option { - self.vocabulary.id("<|padding|>".as_bytes()) + self.tokenizer.id("<|padding|>".as_bytes()) } fn eot_token_id(&self) -> TokenId { - self.vocabulary.id("<|endoftext|>".as_bytes()).unwrap() + self.tokenizer.id("<|endoftext|>".as_bytes()).unwrap() } fn quantize_tensors() -> Vec { From 60d61688d23b4ae6fe4a514c98bd1fe0154d03c5 Mon Sep 17 00:00:00 2001 From: Philpax Date: Thu, 29 Jun 2023 03:19:42 +0200 Subject: [PATCH 19/21] refactor(tokenizer): split into multiple files --- crates/llm-base/src/quantize.rs | 7 +- crates/llm-base/src/tokenizer/embedded.rs | 157 ++++++++++++ crates/llm-base/src/tokenizer/huggingface.rs | 75 ++++++ .../src/{tokenizer.rs => tokenizer/mod.rs} | 226 +----------------- 4 files changed, 238 insertions(+), 227 deletions(-) create mode 100644 crates/llm-base/src/tokenizer/embedded.rs create mode 100644 crates/llm-base/src/tokenizer/huggingface.rs rename crates/llm-base/src/{tokenizer.rs => tokenizer/mod.rs} (60%) diff --git a/crates/llm-base/src/quantize.rs b/crates/llm-base/src/quantize.rs index 187a6fc0..d3d2a0cf 100644 --- a/crates/llm-base/src/quantize.rs +++ b/crates/llm-base/src/quantize.rs @@ -181,12 +181,7 @@ pub fn quantize( } let tokenizer = match tokenizer { - Tokenizer::Embedded(v) => v - .id_to_token - .iter() - .cloned() - .zip(v.id_to_token_score) - .collect::>(), + Tokenizer::Embedded(v) => v.iter().collect::>(), Tokenizer::HuggingFace(_) => vec![], }; diff --git a/crates/llm-base/src/tokenizer/embedded.rs b/crates/llm-base/src/tokenizer/embedded.rs new file mode 100644 index 00000000..cf96b183 --- /dev/null +++ b/crates/llm-base/src/tokenizer/embedded.rs @@ -0,0 +1,157 @@ +use std::collections::HashMap; + +use thiserror::Error; + +use super::{Token, TokenId, TokenScore, TokenizationError}; + +#[derive(Debug, Error)] +/// Errors that can occur when using a model tokenizer. +pub enum EmbeddedTokenizerError { + /// Arbitrary error that occurred during use of the model tokenizer. + #[error("Arbitrary error: {0:?}")] + Arbitrary(String), +} + +/// The built-in GGML tokenizer. +#[derive(Debug, Clone, Default)] +pub struct EmbeddedTokenizer { + /// Maps every integer (index) token ID to its corresponding token. + id_to_token: Vec, + + /// Maps every integer (index) token ID to corresponding score. + id_to_token_score: Vec, + + // todo: use a radix tree + /// Maps a token to a token ID. + token_to_id: HashMap, + + /// The longest token in this tokenizer. + max_token_length: usize, +} + +impl EmbeddedTokenizer { + /// Add a token to the internal vocabulary. + /// + /// The token added must have `id` directly after the last token in the vocabulary. + /// + /// # Panics + /// - This function can panic if `id` does not correspond to the next token in the vocabulary. + /// That is, if there are already `n` tokens in the vocabulary, then `id` must be `n`. + pub(crate) fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) { + // These are loader invariants. If this is broken, then the loader is broken and this is a bug, + // not an issue with the model itself. 
+ assert_eq!(self.id_to_token.len(), self.id_to_token_score.len()); + if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize { + let expected_id = self.id_to_token.len() as TokenId; + panic!("the id of token added should be {expected_id}; is {id}"); + } + + self.max_token_length = self.max_token_length.max(content.len()); + self.id_to_token.push(content.clone()); + self.id_to_token_score.push(score); + self.token_to_id.insert(content, id); + } + + pub(crate) fn id(&self, token: &[u8]) -> Option { + self.token_to_id.get(token).copied() + } + + /// Converts a token index to the token it represents in this tokenizer. + pub(crate) fn token(&self, idx: usize) -> Vec { + self.id_to_token[idx].clone() + } + + /// Returns the number of tokens in the tokenizer. + pub(crate) fn len(&self) -> usize { + self.id_to_token.len() + } + + /// Returns whether the tokenizer is empty. + pub(crate) fn is_empty(&self) -> bool { + self.id_to_token.is_empty() + } + + // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece + /// Tokenize a `text` with this tokenizer. + /// + /// `bos` controls whether a beginning-of-string token should be inserted. + pub(crate) fn tokenize( + &self, + text: &str, + bos: bool, + ) -> Result, TokenId)>, TokenizationError> { + let len = text.len(); + + let mut score = vec![0usize; len + 1]; + let mut prev = vec![TokenId::default(); len + 1]; + + for i in 0..len { + let max_len = (len - i).min(self.max_token_length); + for sub_len in 1..=max_len { + let sub = &text.as_bytes()[i..i + sub_len]; + let token = self.token_to_id.get(sub); + + if let Some(token) = token { + let token_score = sub.len() * sub.len(); + let local_score = score[i] + token_score; + let next = i + sub_len; + + if score[next] < local_score { + score[next] = local_score; + prev[next] = *token; + } + } + } + } + + // Backward pass + let mut res = vec![]; + let mut i = len; + while i > 0 { + let token_id = prev[i]; + if token_id == 0 { + return Err(TokenizationError::TokenizationFailed { + error: Box::new(EmbeddedTokenizerError::Arbitrary( + "the backward pass for the tokenizer encountered a non-set token" + .to_string(), + )), + }); + } + let token = self.id_to_token[token_id as usize].as_slice(); + res.push((token.to_vec(), token_id)); + i -= token.len(); + } + + if bos { + // TODO: replace with vocab.bos + res.push((vec![], 1)); + } + + // Pieces are in reverse order so correct that + res.reverse(); + + Ok(res) + } + + /// Decode a list `tokens` with this tokenizer. + pub(crate) fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { + let mut vec = vec![]; + + for token in tokens { + if skip_special_tokens && token == 1 { + continue; + } + + vec.append(&mut self.id_to_token[token as usize].to_vec()); + } + + vec + } + + pub(crate) fn iter(&self) -> impl Iterator + '_ { + self.id_to_token + .iter() + .zip(self.id_to_token_score.iter()) + .map(|(token, score)| (token.clone(), *score)) + } +} diff --git a/crates/llm-base/src/tokenizer/huggingface.rs b/crates/llm-base/src/tokenizer/huggingface.rs new file mode 100644 index 00000000..8f3a5565 --- /dev/null +++ b/crates/llm-base/src/tokenizer/huggingface.rs @@ -0,0 +1,75 @@ +use super::{TokenId, TokenizationError}; + +/// A Hugging Face tokenizer. +#[derive(Debug, Clone)] +pub struct HuggingFaceTokenizer { + pub(crate) tokenizer: tokenizers::Tokenizer, +} + +impl HuggingFaceTokenizer { + /// Create a new `HuggingFaceTokenizer`. 
+ pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { + Self { tokenizer } + } +} + +impl HuggingFaceTokenizer { + pub(crate) fn id(&self, token: &[u8]) -> Option { + self.tokenizer + .token_to_id(std::str::from_utf8(token).unwrap()) + } + + /// Converts a token index to the token it represents in this tokenizer. + pub(crate) fn token(&self, idx: usize) -> Vec { + self.tokenizer + .decode(vec![idx as u32], true) + .expect("Cannot decode token from tokenizer tokenizer.") + .as_bytes() + .to_vec() + } + + /// Returns the number of tokens in the tokenizer. + pub(crate) fn len(&self) -> usize { + self.tokenizer.get_vocab_size(false) + } + + /// Returns whether the tokenizer is empty. + pub(crate) fn is_empty(&self) -> bool { + self.tokenizer.get_vocab_size(false) == 0 + } + + /// Tokenize a `text` with this tokenizer. + /// + /// `bos` controls whether a beginning-of-string token should be inserted. + pub(crate) fn tokenize( + &self, + text: &str, + bos: bool, + ) -> Result, TokenId)>, TokenizationError> { + let encoding = self + .tokenizer + .encode(text, false) + .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; + + let encoding = self + .tokenizer + .post_process(encoding, None, bos) + .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; + + Ok(encoding + .get_tokens() + .iter() + .map(|t| t.as_bytes().to_vec()) + .zip(encoding.get_ids().iter().copied()) + .collect()) + } + + /// Decode a list `tokens` with this tokenizer. + pub(crate) fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { + self.tokenizer + .decode(tokens, skip_special_tokens) + .expect("Cannot decode token from tokenizer.") + .as_bytes() + .to_vec() + } +} diff --git a/crates/llm-base/src/tokenizer.rs b/crates/llm-base/src/tokenizer/mod.rs similarity index 60% rename from crates/llm-base/src/tokenizer.rs rename to crates/llm-base/src/tokenizer/mod.rs index b914eb3f..8f8fc69d 100644 --- a/crates/llm-base/src/tokenizer.rs +++ b/crates/llm-base/src/tokenizer/mod.rs @@ -1,5 +1,4 @@ use std::{ - collections::HashMap, error::Error, fmt::Display, path::{Path, PathBuf}, @@ -8,6 +7,11 @@ use std::{ use thiserror::Error; +mod embedded; +pub use embedded::*; +mod huggingface; +pub use huggingface::*; + /// The identifier of a token in a tokenizer. pub type TokenId = u32; pub(crate) type Token = Vec; @@ -182,226 +186,6 @@ impl Tokenizer { } } -#[derive(Debug, Error)] -/// Errors that can occur when using a model tokenizer. -pub enum ModelTokenizerError { - /// Arbitrary error that occurred during use of the model tokenizer. - #[error("Arbitrary error: {0:?}")] - Arbitrary(String), -} - -/// The built-in GGML tokenizer. -#[derive(Debug, Clone, Default)] -pub struct EmbeddedTokenizer { - // TODO: make these private - /// Maps every integer (index) token ID to its corresponding token. - pub id_to_token: Vec, - - /// Maps every integer (index) token ID to corresponding score. - pub id_to_token_score: Vec, - - // todo: use a radix tree - /// Maps a token to a token ID. - pub token_to_id: HashMap, - - /// The longest token in this tokenizer. - pub max_token_length: usize, -} - -impl EmbeddedTokenizer { - /// Add a token to the internal vocabulary. - /// - /// The token added must have `id` directly after the last token in the vocabulary. - /// - /// # Panics - /// - This function can panic if `id` does not correspond to the next token in the vocabulary. - /// That is, if there are already `n` tokens in the vocabulary, then `id` must be `n`. 
- pub(crate) fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) { - // These are loader invariants. If this is broken, then the loader is broken and this is a bug, - // not an issue with the model itself. - assert_eq!(self.id_to_token.len(), self.id_to_token_score.len()); - if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize { - let expected_id = self.id_to_token.len() as TokenId; - panic!("the id of token added should be {expected_id}; is {id}"); - } - - self.max_token_length = self.max_token_length.max(content.len()); - self.id_to_token.push(content.clone()); - self.id_to_token_score.push(score); - self.token_to_id.insert(content, id); - } - - fn id(&self, token: &[u8]) -> Option { - self.token_to_id.get(token).copied() - } - - /// Converts a token index to the token it represents in this tokenizer. - fn token(&self, idx: usize) -> Vec { - self.id_to_token[idx].clone() - } - - /// Returns the number of tokens in the tokenizer. - fn len(&self) -> usize { - self.id_to_token.len() - } - - /// Returns whether the tokenizer is empty. - fn is_empty(&self) -> bool { - self.id_to_token.is_empty() - } - - // SentencePiece implementation after https://guillaume-be.github.io/2020-05-30/sentence_piece - /// Tokenize a `text` with this tokenizer. - /// - /// `bos` controls whether a beginning-of-string token should be inserted. - fn tokenize( - &self, - text: &str, - bos: bool, - ) -> Result, TokenId)>, TokenizationError> { - let len = text.len(); - - let mut score = vec![0usize; len + 1]; - let mut prev = vec![TokenId::default(); len + 1]; - - for i in 0..len { - let max_len = (len - i).min(self.max_token_length); - for sub_len in 1..=max_len { - let sub = &text.as_bytes()[i..i + sub_len]; - let token = self.token_to_id.get(sub); - - if let Some(token) = token { - let token_score = sub.len() * sub.len(); - let local_score = score[i] + token_score; - let next = i + sub_len; - - if score[next] < local_score { - score[next] = local_score; - prev[next] = *token; - } - } - } - } - - // Backward pass - let mut res = vec![]; - let mut i = len; - while i > 0 { - let token_id = prev[i]; - if token_id == 0 { - return Err(TokenizationError::TokenizationFailed { - error: Box::new(ModelTokenizerError::Arbitrary( - "the backward pass for the tokenizer encountered a non-set token" - .to_string(), - )), - }); - } - let token = self.id_to_token[token_id as usize].as_slice(); - res.push((token.to_vec(), token_id)); - i -= token.len(); - } - - if bos { - // TODO: replace with vocab.bos - res.push((vec![], 1)); - } - - // Pieces are in reverse order so correct that - res.reverse(); - - Ok(res) - } - - /// Decode a list `tokens` with this tokenizer. - fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { - let mut vec = vec![]; - - for token in tokens { - if skip_special_tokens && token == 1 { - continue; - } - - vec.append(&mut self.id_to_token[token as usize].to_vec()); - } - - vec - } -} - -/// A Hugging Face tokenizer. -#[derive(Debug, Clone)] -pub struct HuggingFaceTokenizer { - tokenizer: tokenizers::Tokenizer, -} - -impl HuggingFaceTokenizer { - /// Create a new `HuggingFaceTokenizer`. - pub fn new(tokenizer: tokenizers::Tokenizer) -> Self { - Self { tokenizer } - } -} - -impl HuggingFaceTokenizer { - fn id(&self, token: &[u8]) -> Option { - self.tokenizer - .token_to_id(std::str::from_utf8(token).unwrap()) - } - - /// Converts a token index to the token it represents in this tokenizer. 
- fn token(&self, idx: usize) -> Vec { - self.tokenizer - .decode(vec![idx as u32], true) - .expect("Cannot decode token from tokenizer tokenizer.") - .as_bytes() - .to_vec() - } - - /// Returns the number of tokens in the tokenizer. - fn len(&self) -> usize { - self.tokenizer.get_vocab_size(false) - } - - /// Returns whether the tokenizer is empty. - fn is_empty(&self) -> bool { - self.tokenizer.get_vocab_size(false) == 0 - } - - /// Tokenize a `text` with this tokenizer. - /// - /// `bos` controls whether a beginning-of-string token should be inserted. - fn tokenize( - &self, - text: &str, - bos: bool, - ) -> Result, TokenId)>, TokenizationError> { - let encoding = self - .tokenizer - .encode(text, false) - .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; - - let encoding = self - .tokenizer - .post_process(encoding, None, bos) - .map_err(|e| TokenizationError::TokenizationFailed { error: e })?; - - Ok(encoding - .get_tokens() - .iter() - .map(|t| t.as_bytes().to_vec()) - .zip(encoding.get_ids().iter().copied()) - .collect()) - } - - /// Decode a list `tokens` with this tokenizer. - fn decode(&self, tokens: Vec, skip_special_tokens: bool) -> Vec { - self.tokenizer - .decode(tokens, skip_special_tokens) - .expect("Cannot decode token from tokenizer.") - .as_bytes() - .to_vec() - } -} - #[derive(Debug, PartialEq, Clone, Copy)] /// Represents the prompt, which can be specified as either text or tokens. /// From 7e2f2bf059f3239681c85346323196155bf522c4 Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 30 Jun 2023 01:38:13 +0200 Subject: [PATCH 20/21] fix #298: don't send new bytes if invalid decoding --- crates/llm-base/src/inference_session.rs | 36 ++++++++++++++++-------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 4d1489a4..8a1a85e6 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -304,13 +304,10 @@ impl InferenceSession { let mut token = match model.tokenizer() { crate::Tokenizer::Embedded(_) => model.tokenizer().token(tk as usize).to_vec(), crate::Tokenizer::HuggingFace(_) => { - let mut previous_tokens = self.tokens.clone(); - previous_tokens.push(tk); + let mut tokens = self.tokens.clone(); + tokens.push(tk); - let all_tokens = model.tokenizer().decode(previous_tokens, true); - let splitted = all_tokens.split_at(self.decoded_tokens.len()); - - splitted.1.to_vec() + get_newly_decoded_portion_huggingface(model, tokens, &self.decoded_tokens) } }; @@ -363,12 +360,11 @@ impl InferenceSession { crate::Tokenizer::Embedded(_) => { model.tokenizer().token(next_token as usize).to_vec() } - crate::Tokenizer::HuggingFace(_) => { - let all_tokens = model.tokenizer().decode(self.tokens.clone(), true); - let splitted = all_tokens.split_at(self.decoded_tokens.len()); - - splitted.1.to_vec() - } + crate::Tokenizer::HuggingFace(_) => get_newly_decoded_portion_huggingface( + model, + self.tokens.clone(), + &self.decoded_tokens, + ), }; self.decoded_tokens.append(&mut res.clone()); @@ -595,6 +591,22 @@ impl InferenceSession { } } +fn get_newly_decoded_portion_huggingface( + model: &dyn Model, + tokens: Vec, + decoded_tokens: &[u8], +) -> Vec { + let all_tokens = model.tokenizer().decode(tokens, true); + // The bytes here come from a lossily-decoded String, so we need to convert it back to a String + // to check if it ends with a replacement character. 
+ let all_tokens = unsafe { String::from_utf8_unchecked(all_tokens) }; + if all_tokens.ends_with('�') { + // Return an empty vector: no valid text was generated from this token. + return vec![]; + } + all_tokens.as_bytes()[decoded_tokens.len()..].to_vec() +} + #[derive(Error, Debug)] /// Errors encountered during the inference process. pub enum InferenceError { From 9a222690cd0a9e3322bb6a926d54e90d08fb2c0f Mon Sep 17 00:00:00 2001 From: Philpax Date: Fri, 30 Jun 2023 03:17:14 +0200 Subject: [PATCH 21/21] fix #317 - cli move architecture into subcommands --- binaries/llm-cli/src/cli_args.rs | 69 +++------ binaries/llm-cli/src/main.rs | 220 +++++++++++++++-------------- crates/llm-base/src/loader.rs | 12 +- crates/llm-base/src/model/mod.rs | 3 +- crates/llm/examples/embeddings.rs | 2 +- crates/llm/examples/inference.rs | 2 +- crates/llm/examples/vicuna-chat.rs | 2 +- crates/llm/src/lib.rs | 91 +++++++----- 8 files changed, 208 insertions(+), 193 deletions(-) diff --git a/binaries/llm-cli/src/cli_args.rs b/binaries/llm-cli/src/cli_args.rs index ce7db33f..0da6e3c5 100644 --- a/binaries/llm-cli/src/cli_args.rs +++ b/binaries/llm-cli/src/cli_args.rs @@ -1,6 +1,6 @@ use std::{fmt, ops::Deref, path::PathBuf, sync::Arc}; -use clap::{Parser, Subcommand, ValueEnum}; +use clap::{Parser, ValueEnum}; use color_eyre::eyre::{bail, Result, WrapErr}; use llm::{ ggml_format, ElementType, InferenceParameters, InferenceSessionConfig, InvalidTokenBias, @@ -11,50 +11,6 @@ use rand::SeedableRng; #[derive(Parser, Debug)] #[command(author, version, about, long_about = None)] pub enum Args { - /// Use a BLOOM model - Bloom { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-2 model - Gpt2 { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-J model - #[clap(id = "gptj")] - GptJ { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a GPT-NeoX model - #[clap(id = "gptneox")] - GptNeoX { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a LLaMA model - Llama { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a MPT model - #[clap(id = "mpt")] - Mpt { - #[command(subcommand)] - args: BaseArgs, - }, - /// Use a Falcon model - #[clap(id = "falcon")] - #[cfg(feature = "falcon")] - Falcon { - #[command(subcommand)] - args: BaseArgs, - }, -} - -#[derive(Subcommand, Debug)] -pub enum BaseArgs { #[command()] /// Use a model to infer the next tokens in a sequence, and exit. Infer(Box), @@ -156,7 +112,7 @@ pub struct Info { pub tensors: bool, /// Show all of the tokens in the tokenizer. - #[arg(long, short = 'v')] + #[arg(long, short = 'k')] pub tokenizer: bool, } @@ -372,12 +328,22 @@ impl ModelTokenizer { } } +#[derive(Parser, Debug)] +pub struct ModelArchitecture { + /// The model architecture to use. Will attempt to guess if not specified. 
+ #[arg(long, short = 'a')] + pub model_architecture: Option, +} + #[derive(Parser, Debug)] pub struct ModelAndTokenizer { /// Where to load the model from #[arg(long, short = 'm')] pub model_path: PathBuf, + #[command(flatten)] + pub architecture: ModelArchitecture, + #[command(flatten)] pub tokenizer: ModelTokenizer, } @@ -415,7 +381,7 @@ pub struct ModelLoad { pub lora_paths: Option>, } impl ModelLoad { - pub fn load(&self, use_gpu: bool) -> Result> { + pub fn load(&self, use_gpu: bool) -> Result> { let params = ModelParameters { prefer_mmap: !self.no_mmap, context_size: self.num_ctx_tokens, @@ -441,7 +407,8 @@ impl ModelLoad { } }; - let model = llm::load::( + let model = llm::load_dynamic( + self.model_and_tokenizer.architecture.model_architecture, &self.model_and_tokenizer.model_path, tokenizer_source, params, @@ -496,7 +463,6 @@ impl ModelLoad { } }, ) - .map(Box::new) .wrap_err("Could not load model"); if model.is_err() { @@ -507,7 +473,7 @@ impl ModelLoad { } } - Ok(model?) + model } } @@ -548,6 +514,9 @@ impl PromptFile { #[derive(Parser, Debug)] pub struct Quantize { + #[command(flatten)] + pub architecture: ModelArchitecture, + /// The path to the model to quantize #[arg()] pub source: PathBuf, diff --git a/binaries/llm-cli/src/main.rs b/binaries/llm-cli/src/main.rs index 45e4c127..443f6733 100644 --- a/binaries/llm-cli/src/main.rs +++ b/binaries/llm-cli/src/main.rs @@ -5,8 +5,8 @@ use std::{ }; use clap::Parser; -use cli_args::{Args, BaseArgs}; -use color_eyre::eyre::{Context, Result}; +use cli_args::Args; +use color_eyre::eyre::{Context, ContextCompat, Result}; use llm::{InferenceError, InferenceFeedback, InferenceResponse}; use rustyline::{ error::ReadlineError, @@ -25,35 +25,22 @@ fn main() -> Result<()> { .init(); color_eyre::install()?; - let cli_args = Args::parse(); - match &cli_args { - Args::Llama { args } => handle_args::(args), - Args::Bloom { args } => handle_args::(args), - Args::Gpt2 { args } => handle_args::(args), - Args::GptJ { args } => handle_args::(args), - Args::GptNeoX { args } => handle_args::(args), - Args::Mpt { args } => handle_args::(args), - #[cfg(feature = "falcon")] - Args::Falcon { args } => handle_args::(args), - } -} - -fn handle_args(args: &cli_args::BaseArgs) -> Result<()> { + let args = Args::parse(); match args { - BaseArgs::Infer(args) => infer::(args), - BaseArgs::Perplexity(args) => perplexity::(args), - BaseArgs::Info(args) => info::(args), - BaseArgs::PromptTokens(args) => prompt_tokens::(args), - BaseArgs::Repl(args) => interactive::(args, false), - BaseArgs::Chat(args) => interactive::(args, true), - BaseArgs::Quantize(args) => quantize::(args), + Args::Infer(args) => infer(&args), + Args::Perplexity(args) => perplexity(&args), + Args::Info(args) => info(&args), + Args::PromptTokens(args) => prompt_tokens(&args), + Args::Repl(args) => interactive(&args, false), + Args::Chat(args) => interactive(&args, true), + Args::Quantize(args) => quantize(&args), } } -fn infer(args: &cli_args::Infer) -> Result<()> { +fn infer(args: &cli_args::Infer) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, session_loaded) = snapshot::read_or_create_session( model.as_ref(), @@ -118,10 +105,10 @@ fn infer(args: &cli_args::Infer) -> Result<()> { Ok(()) } -fn perplexity(args: &cli_args::Perplexity) 
-> Result<()> { +fn perplexity(args: &cli_args::Perplexity) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, _) = snapshot::read_or_create_session( model.as_ref(), None, @@ -142,48 +129,62 @@ fn perplexity(args: &cli_args::Perplexity) -> Resu Ok(()) } -fn info(args: &cli_args::Info) -> Result<()> { - let model_path = &args.model_and_tokenizer.model_path; - let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; +fn info(args: &cli_args::Info) -> Result<()> { + struct InfoVisitor<'a>(&'a cli_args::Info); + impl llm::ModelArchitectureVisitor> for InfoVisitor<'_> { + fn visit(&mut self) -> Result<()> { + let args = self.0; - let file = File::open(model_path)?; - let mut reader = BufReader::new(&file); - let mut loader: llm::Loader = llm::Loader::new(tokenizer, |_| { - // We purposely do not print progress here, as we are only interested in the metadata - }); + let model_path = &args.model_and_tokenizer.model_path; + let tokenizer = args.model_and_tokenizer.to_source()?.retrieve(model_path)?; - llm::ggml_format::load(&mut reader, &mut loader)?; + let file = File::open(model_path)?; + let mut reader = BufReader::new(&file); + let mut loader: llm::Loader = + llm::Loader::new(tokenizer, |_| { + // We purposely do not print progress here, as we are only interested in the metadata + }); - log::info!("Container type: {:?}", loader.container_type); - log::info!("Hyperparameters: {:?}", loader.hyperparameters); - log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); + llm::ggml_format::load(&mut reader, &mut loader)?; - if args.tokenizer { - log::info!("Tokens:"); - for i in 0..loader.tokenizer.len() { - log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); - } - } + log::info!("Container type: {:?}", loader.container_type); + log::info!("Hyperparameters: {:?}", loader.hyperparameters); + log::info!("Tokenizer vocabulary size: {}", loader.tokenizer.len()); - if args.tensors { - log::info!("Tensors:"); - for (name, tensor) in &loader.tensors { - log::info!("- {} ({:?} {:?})", name, tensor.element_type, tensor.dims()); - } - } + if args.tokenizer { + log::info!("Tokens:"); + for i in 0..loader.tokenizer.len() { + log::info!("- {}: {}", i, utf8_or_array(&loader.tokenizer.token(i))); + } + } - fn utf8_or_array(token: &[u8]) -> String { - std::str::from_utf8(token) - .map(|s| s.to_owned()) - .unwrap_or(format!("{:?}", token)) + if args.tensors { + log::info!("Tensors:"); + for (name, tensor) in &loader.tensors { + log::info!("- {} ({:?} {:?})", name, tensor.element_type, tensor.dims()); + } + } + + fn utf8_or_array(token: &[u8]) -> String { + std::str::from_utf8(token) + .map(|s| s.to_owned()) + .unwrap_or(format!("{:?}", token)) + } + + Ok(()) + } } - Ok(()) + args.model_and_tokenizer + .architecture + .model_architecture + .wrap_err("a model architecture is required at present")? 
+ .visit(&mut InfoVisitor(args)) } -fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { +fn prompt_tokens(args: &cli_args::PromptTokens) -> Result<()> { let prompt = load_prompt_file_with_prompt(&args.prompt_file, args.prompt.as_deref()); - let model = args.model_load.load::(false)?; + let model = args.model_load.load(false)?; let toks = match model.tokenizer().tokenize(&prompt, false) { Ok(toks) => toks, Err(e) => { @@ -222,7 +223,7 @@ fn force_newline_event_seq() -> KeyEvent { KeyEvent(KeyCode::Enter, Modifiers::SHIFT) } -fn interactive( +fn interactive( args: &cli_args::Repl, // If set to false, the session will be cloned after each inference // to ensure that previous state is not carried over. @@ -230,7 +231,7 @@ fn interactive( ) -> Result<()> { let prompt_file = args.prompt_file.contents(); let inference_session_config = args.generate.inference_session_config(); - let model = args.model_load.load::(args.generate.use_gpu)?; + let model = args.model_load.load(args.generate.use_gpu)?; let (mut session, mut session_loaded) = snapshot::read_or_create_session( model.as_ref(), None, @@ -318,51 +319,64 @@ fn interactive( Ok(()) } -fn quantize(args: &cli_args::Quantize) -> Result<()> { +fn quantize(args: &cli_args::Quantize) -> Result<()> { use llm::QuantizeProgress; - let mut source = BufReader::new(std::fs::File::open(&args.source)?); - let mut destination = BufWriter::new(std::fs::File::create(&args.destination)?); - let tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; - - llm::quantize::( - &mut source, - &mut destination, - tokenizer, - args.container_type.into(), - args.target.into(), - |progress| match progress { - QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), - QuantizeProgress::TensorLoading { - name, - dims, - element_type, - n_elements, - } => log::info!( - "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" - ), - QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), - QuantizeProgress::TensorQuantized { - name, - original_size, - reduced_size, - history, - } => log::info!( - "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" - ), - QuantizeProgress::TensorSkipped { name, size } => { - log::info!("Skipped tensor `{name}` ({size} bytes)") - } - QuantizeProgress::Finished { - original_size, - reduced_size, - history, - } => log::info!( - "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" - ), - }, - ) - .wrap_err("failed to quantize model") + struct QuantizeVisitor<'a>(&'a cli_args::Quantize); + impl llm::ModelArchitectureVisitor> for QuantizeVisitor<'_> { + fn visit(&mut self) -> Result<()> { + let args = self.0; + + let mut source: BufReader = BufReader::new(std::fs::File::open(&args.source)?); + let mut destination: BufWriter = + BufWriter::new(std::fs::File::create(&args.destination)?); + let tokenizer: llm::Tokenizer = args.tokenizer.to_source()?.retrieve(&args.source)?; + + llm::quantize::( + &mut source, + &mut destination, + tokenizer, + args.container_type.into(), + args.target.into(), + |progress| match progress { + QuantizeProgress::HyperparametersLoaded => log::info!("Loaded hyperparameters"), + QuantizeProgress::TensorLoading { + name, + dims, + element_type, + n_elements, + } => log::info!( + "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)" + ), + QuantizeProgress::TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"), + 
QuantizeProgress::TensorQuantized { + name, + original_size, + reduced_size, + history, + } => log::info!( + "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})" + ), + QuantizeProgress::TensorSkipped { name, size } => { + log::info!("Skipped tensor `{name}` ({size} bytes)") + } + QuantizeProgress::Finished { + original_size, + reduced_size, + history, + } => log::info!( + "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})" + ), + }, + ) + .wrap_err("failed to quantize model") + } + } + + args.architecture + .model_architecture + .wrap_err("the architecture must be known for quantization")? + .visit(&mut QuantizeVisitor(args)) } fn load_prompt_file_with_prompt( diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 13b04516..1725535b 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -327,13 +327,21 @@ pub enum LoadError { }, /// The tokenizer could not be loaded. #[error("could not load tokenizer {path:?}: {error}")] - TokenizerLoadError { + TokenizerLoadFail { /// The invalid tokenizer path path: PathBuf, /// The error that occurred. error: Box, }, + /// There is insufficient information to guess the model architecture from the provided file. + /// + /// A model architecture must be provided to load the model. + #[error("could not guess model architecture from {path:?}")] + MissingModelArchitecture { + /// The path that failed. + path: PathBuf, + }, } impl From for LoadError { fn from(value: util::FindAllModelFilesError) -> Self { @@ -345,7 +353,7 @@ impl From for LoadError { } impl From for LoadError { fn from(value: TokenizerLoadError) -> Self { - LoadError::TokenizerLoadError { + LoadError::TokenizerLoadFail { path: value.path, error: value.error, } diff --git a/crates/llm-base/src/model/mod.rs b/crates/llm-base/src/model/mod.rs index bee50f37..45eb8650 100644 --- a/crates/llm-base/src/model/mod.rs +++ b/crates/llm-base/src/model/mod.rs @@ -177,7 +177,8 @@ pub enum HyperparametersWriteError { InvalidIntegerConversion(#[from] std::num::TryFromIntError), } -/// Parameters for tuning model instances +/// Parameters for model-wide behaviour. +#[derive(Debug, Clone)] pub struct ModelParameters { /// For [GGML formats](ggml::ContainerType) that support it, [mmap](https://en.wikipedia.org/wiki/Mmap) /// is the default. 
Although mmap typically improves performance, setting this value to `false` may diff --git a/crates/llm/examples/embeddings.rs b/crates/llm/examples/embeddings.rs index 74207a1d..0a6a999a 100644 --- a/crates/llm/examples/embeddings.rs +++ b/crates/llm/examples/embeddings.rs @@ -51,7 +51,7 @@ fn main() { // Load model let model_params = llm::ModelParameters::default(); let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, model_params, diff --git a/crates/llm/examples/inference.rs b/crates/llm/examples/inference.rs index aa740b02..51e7369a 100644 --- a/crates/llm/examples/inference.rs +++ b/crates/llm/examples/inference.rs @@ -39,7 +39,7 @@ fn main() { let now = std::time::Instant::now(); let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, Default::default(), diff --git a/crates/llm/examples/vicuna-chat.rs b/crates/llm/examples/vicuna-chat.rs index e08f0be3..7cdeb1d1 100644 --- a/crates/llm/examples/vicuna-chat.rs +++ b/crates/llm/examples/vicuna-chat.rs @@ -31,7 +31,7 @@ fn main() { let model_architecture = args.model_architecture; let model_path = args.model_path; let model = llm::load_dynamic( - model_architecture, + Some(model_architecture), &model_path, tokenizer_source, Default::default(), diff --git a/crates/llm/src/lib.rs b/crates/llm/src/lib.rs index 30ea6c56..c165deb5 100644 --- a/crates/llm/src/lib.rs +++ b/crates/llm/src/lib.rs @@ -153,6 +153,33 @@ impl ModelArchitecture { ]; } +/// Used to dispatch some code based on the model architecture. +pub trait ModelArchitectureVisitor { + /// Visit a model architecture. + fn visit(&mut self) -> R; +} +impl ModelArchitecture { + /// Use a visitor to dispatch some code based on the model architecture. + pub fn visit(&self, visitor: &mut impl ModelArchitectureVisitor) -> R { + match self { + #[cfg(feature = "bloom")] + Self::Bloom => visitor.visit::(), + #[cfg(feature = "gpt2")] + Self::Gpt2 => visitor.visit::(), + #[cfg(feature = "gptj")] + Self::GptJ => visitor.visit::(), + #[cfg(feature = "gptneox")] + Self::GptNeoX => visitor.visit::(), + #[cfg(feature = "llama")] + Self::Llama => visitor.visit::(), + #[cfg(feature = "mpt")] + Self::Mpt => visitor.visit::(), + #[cfg(feature = "falcon")] + Self::Falcon => visitor.visit::(), + } + } +} + /// An unsupported model architecture was specified. pub struct UnsupportedModelArchitecture(String); impl Display for UnsupportedModelArchitecture { @@ -227,18 +254,17 @@ impl Display for ModelArchitecture { } /// A helper function that loads the specified model from disk using an architecture -/// specified at runtime. +/// specified at runtime. If no architecture is specified, it will try to infer it +/// from the model's metadata. /// /// A wrapper around [load] that dispatches to the correct model. pub fn load_dynamic( - architecture: ModelArchitecture, + architecture: Option, path: &Path, tokenizer_source: TokenizerSource, params: ModelParameters, load_progress_callback: impl FnMut(LoadProgress), ) -> Result, LoadError> { - use ModelArchitecture as MA; - fn load_model( path: &Path, tokenizer_source: TokenizerSource, @@ -253,38 +279,35 @@ pub fn load_dynamic( )?)) } - let model: Box = match architecture { - #[cfg(feature = "bloom")] - MA::Bloom => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "gpt2")] - MA::Gpt2 => { - load_model::(path, tokenizer_source, params, load_progress_callback)? 
- } - #[cfg(feature = "gptj")] - MA::GptJ => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "gptneox")] - MA::GptNeoX => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "llama")] - MA::Llama => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "mpt")] - MA::Mpt => { - load_model::(path, tokenizer_source, params, load_progress_callback)? - } - #[cfg(feature = "falcon")] - MA::Falcon => { - load_model::(path, tokenizer_source, params, load_progress_callback)? + let architecture = architecture.ok_or_else(|| LoadError::MissingModelArchitecture { + path: path.to_owned(), + })?; + + struct LoadVisitor<'a, F: FnMut(LoadProgress)> { + path: &'a Path, + tokenizer_source: TokenizerSource, + params: ModelParameters, + load_progress_callback: F, + } + impl<'a, F: FnMut(LoadProgress)> ModelArchitectureVisitor, LoadError>> + for LoadVisitor<'a, F> + { + fn visit(&mut self) -> Result, LoadError> { + load_model::( + self.path, + self.tokenizer_source.clone(), + self.params.clone(), + &mut self.load_progress_callback, + ) } - }; + } - Ok(model) + architecture.visit(&mut LoadVisitor { + path, + tokenizer_source, + params, + load_progress_callback, + }) } #[cfg(test)]
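
For reference, a minimal caller-side sketch of the reworked `load_dynamic` API from the last patch above. The helper name, the choice to pin the LLaMA architecture, and the decision to ignore load progress are illustrative assumptions, not part of the patches; the types used (`llm::Model`, `llm::LoadError`, `llm::TokenizerSource`, `llm::ModelParameters`) are assumed to be re-exported by the `llm` crate as the hunks suggest.

    fn load_example(
        path: &std::path::Path,
        tokenizer_source: llm::TokenizerSource,
    ) -> Result<Box<dyn llm::Model>, llm::LoadError> {
        llm::load_dynamic(
            // `None` would defer architecture selection to the loader, which (as of
            // this patch) reports `LoadError::MissingModelArchitecture` when the
            // architecture cannot be determined from the file.
            Some(llm::ModelArchitecture::Llama),
            path,
            tokenizer_source,
            llm::ModelParameters::default(),
            |_progress| {}, // ignore load progress for brevity
        )
    }

Internally, the call above is dispatched through the new `ModelArchitectureVisitor` trait: the chosen architecture visits a `LoadVisitor` that is generic over the concrete model type, which is the same mechanism the CLI's `info` and `quantize` subcommands now use.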