Skip to content

Commit

Permalink
ggml-backend update: buffer types, backend registry, graph compare, t…
Browse files Browse the repository at this point in the history
…ests (ggerganov#620)

* ggml-backend update

* update metal backend

* show metal logs with ggml-backend

* move buffer types to functions

* cuda: add per-device backends

* cuda: add host buffer type

* fix metal build

* ggml_backend_alloc_ctx_tensors : ignore allocated tensors

* ggml_backend_compare_graph_backend fixes

* ci : try to fix metal build

* metal : first print device info, then build kernels

* ci : disable GGML_METAL on Github Actions

* test-backend-ops initial impl (unary and get_rows)

* more op tests

* cleanup

* print test params, add more tests cases for add and mul

* add tests for im2col

* better f16 init

* metal : add basic impl of supports_op

* add test for ggml_concat

* update im2col test params, show callstack with GGML_ASSERT on CUDA failures

* add more rope tests

* add more rope and mul_mat test cases

* add more get_rows test cases
ggml-ci

* add more norm and rms_norm test cases with different eps

* ci : fix metal resource path

ggml-ci

* tests : silence warning

* add ggml_backend_tensor_alloc and ggml_backend_view_init for initializing tensors without ggml-alloc

* add mul_mat test cases without dims 3 and 4
ggml-ci

* check for nans and infs
ggml-ci

* add diag_mask_inf test cases without dims 3 and 4
ggml-ci

* fix cuda leak while backend reg

* fix msvc issues

* remove backend_sched debug causes by default

* gpt-2 : increase graph size

ggml-ci

---------

Co-authored-by: Georgi Gerganov <ggerganov@gmail.com>
  • Loading branch information
slaren and ggerganov authored Nov 30, 2023
1 parent a5e4560 commit 38f46af
Show file tree
Hide file tree
Showing 24 changed files with 2,388 additions and 448 deletions.
4 changes: 2 additions & 2 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
- name: Dependencies
run: |
wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list
sudo apt-get update
sudo apt-get install -y --no-install-recommends llvm intel-oneapi-runtime-opencl intel-oneapi-runtime-compilers libclblast-dev
- name: Create Build Environment
Expand Down Expand Up @@ -59,7 +59,7 @@ jobs:

- name: Configure CMake
working-directory: ./build
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON -DGGML_METAL=ON ..
run: cmake -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DGGML_TEST_COVERAGE=ON ..

- name: Build
working-directory: ./build
Expand Down
8 changes: 8 additions & 0 deletions ci/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,10 @@ function gg_run_ctest_debug {
(time cmake -DCMAKE_BUILD_TYPE=Debug ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ ! -z ${GG_BUILD_METAL} ]; then
export GGML_METAL_PATH_RESOURCES="$(pwd)/bin"
fi

(time ctest --output-on-failure -E test-opt ) 2>&1 | tee -a $OUT/${ci}-ctest.log

set +e
Expand Down Expand Up @@ -122,6 +126,10 @@ function gg_run_ctest_release {
(time cmake -DCMAKE_BUILD_TYPE=Release ${CMAKE_EXTRA} .. ) 2>&1 | tee -a $OUT/${ci}-cmake.log
(time make -j ) 2>&1 | tee -a $OUT/${ci}-make.log

if [ ! -z ${GG_BUILD_METAL} ]; then
export GGML_METAL_PATH_RESOURCES="$(pwd)/bin"
fi

if [ -z $GG_BUILD_LOW_PERF ]; then
(time ctest --output-on-failure ) 2>&1 | tee -a $OUT/${ci}-ctest.log
else
Expand Down
113 changes: 56 additions & 57 deletions examples/gpt-2/main-backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GPT2_MAX_NODES 4096

static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
Expand Down Expand Up @@ -177,47 +179,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

auto & ctx = model.ctx;

size_t buffer_size = 0;

{
const auto & hparams = model.hparams;

const int n_embd = hparams.n_embd;
const int n_layer = hparams.n_layer;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;

buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b

buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head

buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b

buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b

buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b

buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b

buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b

buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b

buffer_size += (6 + 12*n_layer)*128; // alignment overhead

printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0));
}

// create the ggml context
{
size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer;
Expand All @@ -227,8 +188,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
/*.no_alloc =*/ true,
};

model.ctx = ggml_init(params);
if (!model.ctx) {
ctx = ggml_init(params);
if (!ctx) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
Expand All @@ -238,7 +199,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init();
model.backend = ggml_backend_cuda_init(0);
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
Expand Down Expand Up @@ -267,10 +228,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false;
}

// allocate weights buffer
model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size);

// prepare memory for the weights
// create the tensors for the model
{
const auto & hparams = model.hparams;

Expand Down Expand Up @@ -338,6 +296,12 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
}
}

// allocate the model tensors in a backend buffer
model.buffer_w = ggml_backend_alloc_ctx_tensors(ctx, model.backend);

printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
printf("%s: backend buffer size = %6.2f MB\n", __func__, ggml_backend_buffer_get_size(model.buffer_w)/(1024.0*1024.0));

// override the default training context with the user-provided
model.hparams.n_ctx = n_ctx;

Expand Down Expand Up @@ -378,8 +342,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &

// load weights
{
ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w);

size_t total_size = 0;

bool has_lm_head = false;
Expand Down Expand Up @@ -440,8 +402,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
return false;
}

ggml_allocr_alloc(alloc, tensor);

if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
|| ggml_backend_is_metal(model.backend)
Expand Down Expand Up @@ -470,7 +430,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
total_size += ggml_nbytes(tensor);
}

ggml_allocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}

Expand All @@ -495,7 +454,7 @@ struct ggml_cgraph * gpt2_graph(
const int n_head = hparams.n_head;

// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params params = {
Expand All @@ -506,7 +465,7 @@ struct ggml_cgraph * gpt2_graph(

struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);

struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_allocr_alloc(allocr, embd);
Expand Down Expand Up @@ -801,16 +760,56 @@ bool gpt2_eval(
// allocate tensors
ggml_allocr_alloc_graph(allocr, gf);

// run the computation
// set backend options
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}

#ifdef GGML_USE_METAL
if (ggml_backend_is_metal(model.backend)) {
ggml_backend_metal_set_n_cb(model.backend, n_threads);
}
#endif
ggml_backend_graph_compute(model.backend, gf);

// test
#if 0 && defined(GGML_USE_CUBLAS)
if (ggml_backend_is_cuda(model.backend)) {
auto eval_callback = [](int index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
auto tv1 = tensor_to_float(t1);
auto tv2 = tensor_to_float(t2);

#if 1
float sim = cosine_similarity(tv1, tv2);
float len1 = vec_len(tv1);
float len2 = vec_len(tv2);
float lenr = len1/len2;
float lenrd = std::abs(1.0f-lenr);

float angle = acosf(sim)*180.0f/M_PI;

if (angle > 0.5f || lenrd > 0.05f) {
printf("%3d [%15s] %s: sim = %f, a = %f, lenrd = %f\n", index, ggml_op_desc(t1), t1->name, sim, angle, lenrd);
}
assert(sim > 0.90f);
#else
float dist = distance(tv1, tv2) / vec_len(tv1);
if (dist > 0.01f) {
printf("%3d [%15s] %s: distance = %f\n", index, ggml_op_desc(t1), t1->name, dist);
}
#endif

return true;
};
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
ggml_backend_compare_graph_backend(model.backend, backend_cpu, gf, eval_callback, nullptr);
ggml_backend_free(backend_cpu);
//printf("done\n");
} else
#endif
{
// run the computation
ggml_backend_graph_compute(model.backend, gf);
}

//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
Expand Down
8 changes: 5 additions & 3 deletions examples/gpt-2/main-batched.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GPT2_MAX_NODES 4096

static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
Expand Down Expand Up @@ -286,7 +288,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init();
model.backend = ggml_backend_cuda_init(0);
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
Expand Down Expand Up @@ -551,7 +553,7 @@ struct ggml_cgraph * gpt2_graph(
const int32_t kv_head = ggml_allocr_is_measure(allocr) ? n_ctx - n_tokens : kv_cache.head;

// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params params = {
Expand All @@ -562,7 +564,7 @@ struct ggml_cgraph * gpt2_graph(

struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);

struct ggml_tensor * inpL;
if (batch.token) {
Expand Down
8 changes: 5 additions & 3 deletions examples/gpt-2/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@
#pragma warning(disable: 4244 4267) // possible loss of data
#endif

#define GPT2_MAX_NODES 4096

static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
Expand Down Expand Up @@ -107,7 +109,7 @@ void init_backends(gpt2_model & model, const gpt_params & params) {
#ifdef GGML_USE_CUBLAS
if (params.n_gpu_layers > 0) {
fprintf(stderr, "%s: using CUDA backend\n", __func__);
gpu_backend = ggml_backend_cuda_init();
gpu_backend = ggml_backend_cuda_init(0);
if (!gpu_backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
Expand Down Expand Up @@ -553,7 +555,7 @@ struct ggml_cgraph * gpt2_graph(
const int n_head = hparams.n_head;

// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
static std::vector<uint8_t> buf(buf_size);

struct ggml_init_params params = {
Expand All @@ -564,7 +566,7 @@ struct ggml_cgraph * gpt2_graph(

struct ggml_context * ctx0 = ggml_init(params);

struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);

struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);

Expand Down
2 changes: 1 addition & 1 deletion examples/whisper/whisper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1063,7 +1063,7 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
#ifdef GGML_USE_CUBLAS
if (params.use_gpu && ggml_cublas_loaded()) {
WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
backend_gpu = ggml_backend_cuda_init();
backend_gpu = ggml_backend_cuda_init(0);
if (!backend_gpu) {
WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__);
}
Expand Down
7 changes: 7 additions & 0 deletions include/ggml/ggml-alloc.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ extern "C" {

struct ggml_backend;
struct ggml_backend_buffer;
struct ggml_backend_buffer_type;

//
// Legacy API
Expand Down Expand Up @@ -80,6 +81,12 @@ GGML_API void ggml_gallocr_alloc_graph_n(
struct ggml_hash_set hash_set,
ggml_tallocr_t * hash_node_talloc);


// Utils
// Create a buffer and allocate all the tensors in a ggml_context
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);

#ifdef __cplusplus
}
#endif
Loading

0 comments on commit 38f46af

Please sign in to comment.