From eaf6e031741ca2d3aafeff3e0f4dd7557a974d2b Mon Sep 17 00:00:00 2001
From: Georgi Gerganov
Date: Mon, 27 May 2024 09:24:13 +0300
Subject: [PATCH] llama : add comments about experimental flags (#7544)

---
 llama.h | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/llama.h b/llama.h
index 7671b8a57f4e7..3e4474bb94e9a 100644
--- a/llama.h
+++ b/llama.h
@@ -265,6 +265,8 @@ extern "C" {
         bool check_tensors; // validate model tensor data
     };
 
+    // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations
+    // https://github.com/ggerganov/llama.cpp/pull/7544
     struct llama_context_params {
         uint32_t seed; // RNG seed, -1 for random
         uint32_t n_ctx; // text context, 0 = from model
@@ -291,14 +293,14 @@ extern "C" {
         ggml_backend_sched_eval_callback cb_eval;
         void * cb_eval_user_data;
 
-        enum ggml_type type_k; // data type for K cache
-        enum ggml_type type_v; // data type for V cache
+        enum ggml_type type_k; // data type for K cache [EXPERIMENTAL]
+        enum ggml_type type_v; // data type for V cache [EXPERIMENTAL]
 
         // Keep the booleans together to avoid misalignment during copy-by-value.
         bool logits_all; // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         bool embeddings; // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
-        bool flash_attn; // whether to use flash attention
+        bool flash_attn; // whether to use flash attention [EXPERIMENTAL]
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted