
Review fixes
LLukas22 committed Jul 15, 2023
1 parent a8986b8 commit cea02f2
Showing 8 changed files with 174 additions and 176 deletions.
29 changes: 21 additions & 8 deletions crates/ggml/src/context.rs
@@ -1,4 +1,9 @@
use std::{os::raw::c_int, ptr::NonNull, sync::Arc};
use std::{
collections::HashMap,
os::raw::c_int,
ptr::NonNull,
sync::{Arc, Mutex},
};

use memmap2::Mmap;

@@ -21,6 +26,9 @@ pub struct Context {

/// Whether the context can offload tensors to the GPU
pub can_offload: bool,

/// Offloaded tensors
offloaded_tensors: Arc<Mutex<HashMap<String, Tensor>>>,
}

impl Context {
@@ -39,6 +47,7 @@ impl Context {
mmap: None,
buffer: Some(buffer),
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

@@ -57,6 +66,7 @@ impl Context {
mmap: Some(mmap),
buffer: None,
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

@@ -76,24 +86,21 @@ impl Context {
mmap: None,
buffer: None,
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn enable_offloading(&mut self) {
self.can_offload = true;
}

/// Disables the offloading of tensors to the GPU
pub fn disable_offloading(&mut self) {
self.can_offload = false;
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
}

/// Wraps a raw tensor with a weak pointer to the context.
fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor {
let tensor = Tensor {
ptr: NonNull::new(raw).expect("Should not be null"),
ctx: Arc::downgrade(&self.ptr),
offloaded_tensors: Arc::downgrade(&self.offloaded_tensors),
};

if self.can_offload {
@@ -495,6 +502,12 @@ impl Drop for Context {
fn drop(&mut self) {
// SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
unsafe {
// if we moved tensors to an accelerator we need to free them
for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
if tensor.backend() != crate::Backend::Cpu {
crate::accelerator_free_tensor(&tensor);
}
}
sys::ggml_free(self.ptr.as_ptr());
}
}
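A minimal usage sketch of the new offloading API introduced above; the buffer size, tensor type, and shape are illustrative assumptions rather than values from this commit, and `Buffer::new` is an assumed constructor.

// Sketch only: buffer size and tensor shape are placeholders.
let buffer = ggml::Buffer::new(16 * 1024 * 1024); // assumed constructor
let mut ctx = ggml::Context::init_buffer(buffer);

// Tensors created while this flag is set may be offloaded to the GPU.
ctx.set_offloading(true);
let _hidden = ctx.new_tensor_1d(ggml::Type::F32, 4096);

// Switch back for tensors that should stay on the CPU.
ctx.set_offloading(false);

// Dropping the context also frees every tensor recorded in
// `offloaded_tensors` whose backend is not the CPU.
drop(ctx);
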
21 changes: 12 additions & 9 deletions crates/ggml/src/lib.rs
@@ -30,19 +30,19 @@ mod tests;
pub mod metal;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
///Accelerators supported by `ggml`.
/// Accelerators supported by `ggml`.
pub enum Accelerator {
///CuBLAS accelerated
/// CuBLAS accelerated
CuBLAS,
///CLBlast accelerated
/// CLBlast accelerated
CLBlast,
///Metal accelerated
/// Metal accelerated
Metal,
///Cpu accelerated
/// Cpu accelerated
None,
}

///Returns the accelerator `ggml` was compiled with.
/// Returns the accelerator `ggml` was compiled with.
pub fn get_accelerator() -> Accelerator {
#[cfg(feature = "cublas")]
return Accelerator::CuBLAS;
@@ -55,14 +55,14 @@ pub fn get_accelerator() -> Accelerator {
}

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
///Backend to use for a tensor.
/// Backend to use for a tensor.
pub enum Backend {
/// CPU backend
#[default]
Cpu,
/// GPU backend
Gpu,
///Multi-GPU backend
/// Multi-GPU backend
GpuSplit,
}

@@ -184,6 +184,9 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR;
/// The size of a `ggml` object.
pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;

/// The maximum length of a `ggml` tensor name.
pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME;

#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
/// The type of a value in `ggml`.
pub enum Type {
@@ -537,7 +540,7 @@ pub fn cpu_has_gpublas() -> bool {
}

/// Sets the name of a tensor.
pub fn set_name(tensor: &Tensor, name: &str) {
pub fn set_tensor_name(tensor: &Tensor, name: &str) {
let c_name = std::ffi::CString::new(name).unwrap();
unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) };
}
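A short sketch of the accelerator query and the renamed free function; `ctx` is assumed to be an existing `ggml::Context` like the ones constructed elsewhere in this diff.

// Report which accelerator `ggml` was compiled with.
match ggml::get_accelerator() {
    ggml::Accelerator::CuBLAS => println!("cuBLAS (CUDA) build"),
    ggml::Accelerator::CLBlast => println!("CLBlast (OpenCL) build"),
    ggml::Accelerator::Metal => println!("Metal build"),
    ggml::Accelerator::None => println!("CPU-only build"),
}

// The free function is now `set_tensor_name`, avoiding a clash with the
// `Tensor::set_name` method.
let embd = ctx.new_tensor_1d(ggml::Type::I32, 8); // `ctx` assumed to exist
ggml::set_tensor_name(&embd, "embd");
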
64 changes: 55 additions & 9 deletions crates/ggml/src/tensor.rs
@@ -1,4 +1,9 @@
use std::{os::raw::c_void, ptr::NonNull, sync::Weak};
use std::{
collections::HashMap,
os::raw::c_void,
ptr::NonNull,
sync::{Mutex, Weak},
};

use crate::{i64_to_usize, sys, Type};

@@ -7,6 +12,7 @@ use crate::{i64_to_usize, sys, Type};
pub struct Tensor {
pub(crate) ptr: NonNull<sys::ggml_tensor>,
pub(crate) ctx: Weak<NonNull<sys::ggml_context>>,
pub(crate) offloaded_tensors: Weak<Mutex<HashMap<String, Tensor>>>,
}

impl Tensor {
@@ -15,9 +21,18 @@ impl Tensor {
/// Exposed for purposes of determining context size.
pub const C_TYPE_SIZE: usize = std::mem::size_of::<sys::ggml_tensor>();

///Sets the name of the tensor
pub fn set_name(&mut self, name: &str) -> &Tensor {
assert!(name.len() <= 48, "Name is too long!");
/// Sets the name of the tensor.
///
/// # Safety
///
/// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters.
pub fn set_name(mut self, name: &str) -> Tensor {
assert!(
name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(),
"Name '{}' is too long, max length is {} characters",
name,
crate::MAX_NAME_LENGTH
);

let bytes = name.as_bytes();
let mut array = [0i8; 48];
@@ -27,29 +42,60 @@ impl Tensor {
self
}

///Gets the name of the tensor
pub fn get_name(&self) -> String {
/// Gets the name of the tensor
pub fn name(&self) -> String {
let name = unsafe { self.ptr.as_ref().name };
let mut name = name.iter().map(|&x| x as u8).collect::<Vec<_>>();
name.retain(|&x| x != 0);
String::from_utf8(name).unwrap()
}

///Sets the acceleration backend of the tensor
/// Sets the acceleration backend of the tensor.
///
/// # Caution
///
/// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend.
pub fn set_backend(&mut self, backend: crate::Backend) {
unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
}

///Gets the acceleration backend of the tensor
pub fn get_backend(&self) -> crate::Backend {
/// Gets the acceleration backend of the tensor
pub fn backend(&self) -> crate::Backend {
unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
}

/// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
pub fn transfer_to<E>(mut self, backend: crate::Backend) -> Result<Tensor, E> {
let current_backend = self.backend();
self.set_backend(backend);

if backend != crate::Backend::Cpu {
crate::accelerator_transform_tensor(&mut self);
if current_backend == crate::Backend::Cpu {
// The tensor was moved from the CPU to an accelerator, so we need to keep track of it to free the accelerator memory later
self.with_alive_ctx_mut(|| {
if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() {
//TODO: Do we need to check if the tensor is already in the map?
offloaded_tensors
.lock()
.unwrap()
.insert(self.name(), self.share());
} else {
panic!("Using a context after it was dropped!")
}
})
}
}

Ok(self)
}

/// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self {
Tensor {
ptr: self.ptr,
ctx: Weak::clone(&self.ctx),
offloaded_tensors: Weak::clone(&self.offloaded_tensors),
}
}

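A sketch of the consuming `transfer_to` API; `ctx` is assumed to be an existing `ggml::Context`, and the unused error parameter is filled with `Infallible` purely for illustration.

// Build a tensor, name it, and move its data to the GPU in one chain.
let tensor = ctx
    .new_tensor_1d(ggml::Type::F32, 4096) // `ctx` assumed to exist
    .set_name("ffn_up")
    .transfer_to::<std::convert::Infallible>(ggml::Backend::Gpu)
    .unwrap();

// The accelerator copy is tracked by the owning context and freed when
// that context is dropped.
assert_eq!(tensor.backend(), ggml::Backend::Gpu);
assert_eq!(tensor.name(), "ffn_up");
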
41 changes: 22 additions & 19 deletions crates/llm-base/src/inference_session.rs
@@ -26,24 +26,6 @@ fn scratch_buffers() -> ScratchBuffers {
]
}

fn kv_memory(
context: &Context,
config: &InferenceSessionConfig,
n_elements: usize,
) -> (Tensor, Tensor) {
let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements);
let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements);
ggml::set_name(&memory_k, "memory_k");
ggml::set_name(&memory_v, "memory_v");

if config.use_gpu {
ggml::accelerator_offload_tensor_no_scratch(&memory_k);
ggml::accelerator_offload_tensor_no_scratch(&memory_v);
}

(memory_k, memory_v)
}

/// Result of graph building
pub struct GraphOutputs {
/// The output containing the model's result
@@ -238,7 +220,7 @@ impl InferenceSession {
self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap());
let ctx0 = &mut self.ctx0;
let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len());
ggml::set_name(&embd, "embd");
ggml::set_tensor_name(&embd, "embd");

let bc = BuildContext {
ctx0: RefCell::new(ctx0),
@@ -946,3 +928,24 @@ pub fn feed_prompt_callback<'a, E: std::error::Error + 'static>(
None => Ok(InferenceFeedback::Continue),
}
}

/// Create the memory K/V tensors for the inference session.
fn kv_memory(
context: &Context,
config: &InferenceSessionConfig,
n_elements: usize,
) -> (Tensor, Tensor) {
let memory_k = context
.new_tensor_1d(config.memory_k_type.into(), n_elements)
.set_name("memory_k");
let memory_v = context
.new_tensor_1d(config.memory_v_type.into(), n_elements)
.set_name("memory_v");

if config.use_gpu {
ggml::accelerator_offload_tensor_no_scratch(&memory_k);
ggml::accelerator_offload_tensor_no_scratch(&memory_v);
}

(memory_k, memory_v)
}
22 changes: 5 additions & 17 deletions crates/llm-base/src/loader.rs
@@ -396,8 +396,6 @@ impl LoadError {
pub trait TensorLoader<E: std::error::Error> {
/// Gets a tensor from the loader.
fn load(&mut self, name: &str) -> Result<ggml::Tensor, E>;
/// Gets a tensor from the loader and tries to offload it to the specified backend.
fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result<ggml::Tensor, E>;
/// Finish loading the model, and extract all of the state from the loader.
fn finish(self) -> (Context, HashMap<String, ggml::Tensor>);
}
@@ -672,15 +670,6 @@ impl TensorLoader<LoadError> for MmapCompatibleLoader<'_> {
Ok(tensor)
}

fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result<ggml::Tensor, LoadError> {
let mut tensor = self.load(name)?;
if backend != ggml::Backend::Cpu {
tensor.set_backend(backend);
crate::ggml::accelerator_transform_tensor(&mut tensor);
}
Ok(tensor)
}

fn finish(self) -> (Context, HashMap<String, ggml::Tensor>) {
(self.context, self.loaded_tensors)
}
@@ -752,16 +741,15 @@ impl<'a> FileContext<'a> {
}
}

// The tensor name is truncated to 32 characters.

let tensor_name = if name.len() > 32 {
&name[name.len() - 32..]
// The tensor name is truncated to its maximum length.
let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap();
let tensor_name = if name.len() >= max_name_length {
&name[name.len() - max_name_length..]
} else {
name
};
tensor.set_name(tensor_name);

Ok(tensor)
Ok(tensor.set_name(tensor_name))
}
}

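The truncation keeps the tail of an over-long name, since the layer-specific suffix is the distinguishing part. A self-contained sketch of the same slicing rule, assuming the 48-byte limit used by the name buffer in tensor.rs above:

// Keep only the last `max_len` bytes of an over-long tensor name,
// mirroring the logic in `FileContext` above.
fn truncate_name(name: &str, max_len: usize) -> &str {
    if name.len() >= max_len {
        &name[name.len() - max_len..]
    } else {
        name
    }
}

fn main() {
    let long = "model/transformer/blocks/31/attention/query/kernel/weight";
    // The leading path components are dropped; the suffix that identifies
    // the layer and weight survives.
    println!("{}", truncate_name(long, 48));
}
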
11 changes: 10 additions & 1 deletion crates/llm-base/src/model/mod.rs
@@ -204,7 +204,7 @@ pub struct ModelParameters {
pub lora_adapters: Option<Vec<PathBuf>>,
/// Whether to use GPU acceleration when available
pub use_gpu: bool,
/// The number of layers to offload to the gpu. If `None`, all layers will be offloaded.
/// If `use_gpu` is active, this defines the number of layers to offload to the GPU. If `None`, all layers will be offloaded.
pub gpu_layers: Option<usize>,
}

@@ -231,6 +231,15 @@ impl ModelParameters {
true
}
}

/// Returns the backend to use for the given layer.
pub fn backend(&self, layer: usize) -> ggml::Backend {
if self.should_offload(layer) {
ggml::Backend::Gpu
} else {
ggml::Backend::Cpu
}
}
}

/// Used in a call to [Model::evaluate] or [InferenceSession::infer] to request
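A hedged sketch of how the new `backend` helper can combine with `Tensor::transfer_to` during model loading; the loop bound, tensor name, and surrounding bindings (`params`, `tl`, `n_layer`) are assumptions for illustration, not code from this commit.

// `params` is a ModelParameters and `tl` implements TensorLoader.
for layer in 0..n_layer {
    // Gpu while `layer` falls within `gpu_layers` (and `use_gpu` is set), Cpu otherwise.
    let backend = params.backend(layer);

    let wq = tl
        .load(&format!("layers.{layer}.attention.wq.weight"))
        .expect("tensor not found in model file")
        .transfer_to::<std::convert::Infallible>(backend)
        .unwrap();
    // ... build the rest of the layer from `wq` ...
}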