
Review fixes
LLukas22 committed Jul 15, 2023
1 parent a8986b8 commit cea02f2
Showing 8 changed files with 174 additions and 176 deletions.
29 changes: 21 additions & 8 deletions crates/ggml/src/context.rs
@@ -1,4 +1,9 @@
use std::{os::raw::c_int, ptr::NonNull, sync::Arc};
use std::{
collections::HashMap,
os::raw::c_int,
ptr::NonNull,
sync::{Arc, Mutex},
};

use memmap2::Mmap;

@@ -21,6 +26,9 @@ pub struct Context {

/// Whether the context can offload tensors to the GPU
pub can_offload: bool,

/// Offloaded tensors
offloaded_tensors: Arc<Mutex<HashMap<String, Tensor>>>,
}

impl Context {
@@ -39,6 +47,7 @@ impl Context {
mmap: None,
buffer: Some(buffer),
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

@@ -57,6 +66,7 @@ impl Context {
mmap: Some(mmap),
buffer: None,
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

@@ -76,24 +86,21 @@ impl Context {
mmap: None,
buffer: None,
can_offload: false,
offloaded_tensors: Arc::new(Mutex::new(HashMap::new())),
}
}

/// If offloading is enabled, all tensors created by this context will be offloaded to the GPU
pub fn enable_offloading(&mut self) {
self.can_offload = true;
}

/// Disables the offloading of tensors to the GPU
pub fn disable_offloading(&mut self) {
self.can_offload = false;
pub fn set_offloading(&mut self, can_offload: bool) {
self.can_offload = can_offload;
}

/// Wraps a raw tensor with a weak pointer to the context.
fn new_tensor_raw(&self, raw: *mut sys::ggml_tensor) -> Tensor {
let tensor = Tensor {
ptr: NonNull::new(raw).expect("Should not be null"),
ctx: Arc::downgrade(&self.ptr),
offloaded_tensors: Arc::downgrade(&self.offloaded_tensors),
};

if self.can_offload {
@@ -495,6 +502,12 @@ impl Drop for Context {
fn drop(&mut self) {
// SAFETY: The only non-weak copy of ptr is no longer accessible after this drop call.
unsafe {
// if we moved tensors to an accelerator we need to free them
for (_, tensor) in self.offloaded_tensors.lock().unwrap().drain() {
if tensor.backend() != crate::Backend::Cpu {
crate::accelerator_free_tensor(&tensor);
}
}
sys::ggml_free(self.ptr.as_ptr());
}
}
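A minimal usage sketch of the new offloading API introduced above; the buffer size, tensor type, and shape are illustrative assumptions rather than values from this commit, and `Buffer::new` is an assumed constructor.

// Sketch only: buffer size and tensor shape are placeholders.
let buffer = ggml::Buffer::new(16 * 1024 * 1024); // assumed constructor
let mut ctx = ggml::Context::init_buffer(buffer);

// Tensors created while this flag is set may be offloaded to the GPU.
ctx.set_offloading(true);
let _hidden = ctx.new_tensor_1d(ggml::Type::F32, 4096);

// Switch back for tensors that should stay on the CPU.
ctx.set_offloading(false);

// Dropping the context also frees every tensor recorded in
// `offloaded_tensors` whose backend is not the CPU.
drop(ctx);
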
21 changes: 12 additions & 9 deletions crates/ggml/src/lib.rs
@@ -30,19 +30,19 @@ mod tests;
pub mod metal;

#[derive(Debug, Copy, Clone, PartialEq, Eq)]
///Accelerators supported by `ggml`.
/// Accelerators supported by `ggml`.
pub enum Accelerator {
///CuBLAS accelerated
/// CuBLAS accelerated
CuBLAS,
///CLBlast accelerated
/// CLBlast accelerated
CLBlast,
///Metal accelerated
/// Metal accelerated
Metal,
///Cpu accelerated
/// Cpu accelerated
None,
}

///Returns the accelerator `ggml` was compiled with.
/// Returns the accelerator `ggml` was compiled with.
pub fn get_accelerator() -> Accelerator {
#[cfg(feature = "cublas")]
return Accelerator::CuBLAS;
@@ -55,14 +55,14 @@ pub fn get_accelerator() -> Accelerator {
}

#[derive(Default, Debug, Copy, Clone, PartialEq, Eq)]
///Backend to use for a tensor.
/// Backend to use for a tensor.
pub enum Backend {
/// CPU backend
#[default]
Cpu,
/// GPU backend
Gpu,
///Multi-GPU backend
/// Multi-GPU backend
GpuSplit,
}

@@ -184,6 +184,9 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR;
/// The size of a `ggml` object.
pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;

/// The maximum length of a `ggml` tensor name.
pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME;

#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
/// The type of a value in `ggml`.
pub enum Type {
@@ -537,7 +540,7 @@ pub fn cpu_has_gpublas() -> bool {
}

/// Sets the name of a tensor.
pub fn set_name(tensor: &Tensor, name: &str) {
pub fn set_tensor_name(tensor: &Tensor, name: &str) {
let c_name = std::ffi::CString::new(name).unwrap();
unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) };
}
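A short sketch of the accelerator query and the renamed free function; `ctx` is assumed to be an existing `ggml::Context` like the ones constructed elsewhere in this diff.

// Report which accelerator `ggml` was compiled with.
match ggml::get_accelerator() {
    ggml::Accelerator::CuBLAS => println!("cuBLAS (CUDA) build"),
    ggml::Accelerator::CLBlast => println!("CLBlast (OpenCL) build"),
    ggml::Accelerator::Metal => println!("Metal build"),
    ggml::Accelerator::None => println!("CPU-only build"),
}

// The free function is now `set_tensor_name`, avoiding a clash with the
// `Tensor::set_name` method.
let embd = ctx.new_tensor_1d(ggml::Type::I32, 8); // `ctx` assumed to exist
ggml::set_tensor_name(&embd, "embd");
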
64 changes: 55 additions & 9 deletions crates/ggml/src/tensor.rs
@@ -1,4 +1,9 @@
use std::{os::raw::c_void, ptr::NonNull, sync::Weak};
use std::{
collections::HashMap,
os::raw::c_void,
ptr::NonNull,
sync::{Mutex, Weak},
};

use crate::{i64_to_usize, sys, Type};

@@ -7,6 +12,7 @@ use crate::{i64_to_usize, sys, Type};
pub struct Tensor {
pub(crate) ptr: NonNull<sys::ggml_tensor>,
pub(crate) ctx: Weak<NonNull<sys::ggml_context>>,
pub(crate) offloaded_tensors: Weak<Mutex<HashMap<String, Tensor>>>,
}

impl Tensor {
@@ -15,9 +21,18 @@ impl Tensor {
/// Exposed for purposes of determining context size.
pub const C_TYPE_SIZE: usize = std::mem::size_of::<sys::ggml_tensor>();

///Sets the name of the tensor
pub fn set_name(&mut self, name: &str) -> &Tensor {
assert!(name.len() <= 48, "Name is too long!");
/// Sets the name of the tensor.
///
/// # Safety
///
/// The name must be a valid UTF-8 string and must not be longer than [crate::MAX_NAME_LENGTH] characters.
pub fn set_name(mut self, name: &str) -> Tensor {
assert!(
name.len() <= crate::MAX_NAME_LENGTH.try_into().unwrap(),
"Name '{}' is too long, max length is {} characters",
name,
crate::MAX_NAME_LENGTH
);

let bytes = name.as_bytes();
let mut array = [0i8; 48];
@@ -27,29 +42,60 @@ impl Tensor {
self
}

///Gets the name of the tensor
pub fn get_name(&self) -> String {
/// Gets the name of the tensor
pub fn name(&self) -> String {
let name = unsafe { self.ptr.as_ref().name };
let mut name = name.iter().map(|&x| x as u8).collect::<Vec<_>>();
name.retain(|&x| x != 0);
String::from_utf8(name).unwrap()
}

///Sets the acceleration backend of the tensor
/// Sets the acceleration backend of the tensor.
///
/// # Caution
///
/// This will not move the data to the new backend! See [Tensor::transfer_to] if you want to move the data to the new backend.
pub fn set_backend(&mut self, backend: crate::Backend) {
unsafe { crate::set_tensor_backend(self.ptr.as_mut(), backend) }
}

///Gets the acceleration backend of the tensor
pub fn get_backend(&self) -> crate::Backend {
/// Gets the acceleration backend of the tensor
pub fn backend(&self) -> crate::Backend {
unsafe { crate::get_tensor_backend(self.ptr.as_ref()) }
}

/// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
pub fn transfer_to<E>(mut self, backend: crate::Backend) -> Result<Tensor, E> {
let current_backend = self.backend();
self.set_backend(backend);

if backend != crate::Backend::Cpu {
crate::accelerator_transform_tensor(&mut self);
if current_backend == crate::Backend::Cpu {
// The tensor was moved from the CPU to an accelerator, so we need to keep track of it to free the accelerator memory later
self.with_alive_ctx_mut(|| {
if let Some(offloaded_tensors) = self.offloaded_tensors.upgrade() {
//TODO: Do we need to check if the tensor is already in the map?
offloaded_tensors
.lock()
.unwrap()
.insert(self.name(), self.share());
} else {
panic!("Using a context after it was dropped!")
}
})
}
}

Ok(self)
}

/// Creates a shared copy of this tensor pointer.
pub fn share(&self) -> Self {
Tensor {
ptr: self.ptr,
ctx: Weak::clone(&self.ctx),
offloaded_tensors: Weak::clone(&self.offloaded_tensors),
}
}

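A sketch of the consuming `transfer_to` API; `ctx` is assumed to be an existing `ggml::Context`, and the unused error parameter is filled with `Infallible` purely for illustration.

// Build a tensor, name it, and move its data to the GPU in one chain.
let tensor = ctx
    .new_tensor_1d(ggml::Type::F32, 4096) // `ctx` assumed to exist
    .set_name("ffn_up")
    .transfer_to::<std::convert::Infallible>(ggml::Backend::Gpu)
    .unwrap();

// The accelerator copy is tracked by the owning context and freed when
// that context is dropped.
assert_eq!(tensor.backend(), ggml::Backend::Gpu);
assert_eq!(tensor.name(), "ffn_up");
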
41 changes: 22 additions & 19 deletions crates/llm-base/src/inference_session.rs
@@ -26,24 +26,6 @@ fn scratch_buffers() -> ScratchBuffers {
]
}

fn kv_memory(
context: &Context,
config: &InferenceSessionConfig,
n_elements: usize,
) -> (Tensor, Tensor) {
let memory_k = context.new_tensor_1d(config.memory_k_type.into(), n_elements);
let memory_v = context.new_tensor_1d(config.memory_v_type.into(), n_elements);
ggml::set_name(&memory_k, "memory_k");
ggml::set_name(&memory_v, "memory_v");

if config.use_gpu {
ggml::accelerator_offload_tensor_no_scratch(&memory_k);
ggml::accelerator_offload_tensor_no_scratch(&memory_v);
}

(memory_k, memory_v)
}

/// Result of graph building
pub struct GraphOutputs {
/// The output containing the model's result
@@ -238,7 +220,7 @@ impl InferenceSession {
self.ctx0 = ggml::Context::init_buffer(self.ctx0.buffer.take().unwrap());
let ctx0 = &mut self.ctx0;
let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len());
ggml::set_name(&embd, "embd");
ggml::set_tensor_name(&embd, "embd");

let bc = BuildContext {
ctx0: RefCell::new(ctx0),
@@ -946,3 +928,24 @@ pub fn feed_prompt_callback<'a, E: std::error::Error + 'static>(
None => Ok(InferenceFeedback::Continue),
}
}

/// Create the memory K/V tensors for the inference session.
fn kv_memory(
context: &Context,
config: &InferenceSessionConfig,
n_elements: usize,
) -> (Tensor, Tensor) {
let memory_k = context
.new_tensor_1d(config.memory_k_type.into(), n_elements)
.set_name("memory_k");
let memory_v = context
.new_tensor_1d(config.memory_v_type.into(), n_elements)
.set_name("memory_v");

if config.use_gpu {
ggml::accelerator_offload_tensor_no_scratch(&memory_k);
ggml::accelerator_offload_tensor_no_scratch(&memory_v);
}

(memory_k, memory_v)
}
22 changes: 5 additions & 17 deletions crates/llm-base/src/loader.rs
@@ -396,8 +396,6 @@ impl LoadError {
pub trait TensorLoader<E: std::error::Error> {
/// Gets a tensor from the loader.
fn load(&mut self, name: &str) -> Result<ggml::Tensor, E>;
/// Gets a tensor from the loader and tries to offload it to the specified backend.
fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result<ggml::Tensor, E>;
/// Finish loading the model, and extract all of the state from the loader.
fn finish(self) -> (Context, HashMap<String, ggml::Tensor>);
}
@@ -672,15 +670,6 @@ impl TensorLoader<LoadError> for MmapCompatibleLoader<'_> {
Ok(tensor)
}

fn offload(&mut self, name: &str, backend: ggml::Backend) -> Result<ggml::Tensor, LoadError> {
let mut tensor = self.load(name)?;
if backend != ggml::Backend::Cpu {
tensor.set_backend(backend);
crate::ggml::accelerator_transform_tensor(&mut tensor);
}
Ok(tensor)
}

fn finish(self) -> (Context, HashMap<String, ggml::Tensor>) {
(self.context, self.loaded_tensors)
}
@@ -752,16 +741,15 @@ impl<'a> FileContext<'a> {
}
}

// The tensor name is truncated to 32 characters.

let tensor_name = if name.len() > 32 {
&name[name.len() - 32..]
// The tensor name is truncated to its maximum length.
let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap();
let tensor_name = if name.len() >= max_name_length {
&name[name.len() - max_name_length..]
} else {
name
};
tensor.set_name(tensor_name);

Ok(tensor)
Ok(tensor.set_name(tensor_name))
}
}

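The truncation keeps the tail of an over-long name, since the layer-specific suffix is the distinguishing part. A self-contained sketch of the same slicing rule, assuming the 48-byte limit used by the name buffer in tensor.rs above:

// Keep only the last `max_len` bytes of an over-long tensor name,
// mirroring the logic in `FileContext` above.
fn truncate_name(name: &str, max_len: usize) -> &str {
    if name.len() >= max_len {
        &name[name.len() - max_len..]
    } else {
        name
    }
}

fn main() {
    let long = "model/transformer/blocks/31/attention/query/kernel/weight";
    // The leading path components are dropped; the suffix that identifies
    // the layer and weight survives.
    println!("{}", truncate_name(long, 48));
}
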
11 changes: 10 additions & 1 deletion crates/llm-base/src/model/mod.rs
@@ -204,7 +204,7 @@ pub struct ModelParameters {
pub lora_adapters: Option<Vec<PathBuf>>,
/// Whether to use GPU acceleration when available
pub use_gpu: bool,
/// The number of layers to offload to the gpu. If `None`, all layers will be offloaded.
/// If `use_gpu` is active, this defines the number of layers to offload to the GPU. If `None`, all layers will be offloaded.
pub gpu_layers: Option<usize>,
}

@@ -231,6 +231,15 @@ impl ModelParameters {
true
}
}

/// Returns the backend to use for the given layer.
pub fn backend(&self, layer: usize) -> ggml::Backend {
if self.should_offload(layer) {
ggml::Backend::Gpu
} else {
ggml::Backend::Cpu
}
}
}

/// Used in a call to [Model::evaluate] or [InferenceSession::infer] to request
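A hedged sketch of how the new `backend` helper can combine with `Tensor::transfer_to` during model loading; the loop bound, tensor name, and surrounding bindings (`params`, `tl`, `n_layer`) are assumptions for illustration, not code from this commit.

// `params` is a ModelParameters and `tl` implements TensorLoader.
for layer in 0..n_layer {
    // Gpu while `layer` falls within `gpu_layers` (and `use_gpu` is set), Cpu otherwise.
    let backend = params.backend(layer);

    let wq = tl
        .load(&format!("layers.{layer}.attention.wq.weight"))
        .expect("tensor not found in model file")
        .transfer_to::<std::convert::Infallible>(backend)
        .unwrap();
    // ... build the rest of the layer from `wq` ...
}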