
Commit 83595ec

committed
minor fixes
1 parent 09ab5c1 commit 83595ec


2 files changed: +30 −22 lines changed


ggml-backend.c

Lines changed: 4 additions & 4 deletions
@@ -14,7 +14,7 @@ struct ggml_buffer ggml_backend_alloc_buffer(struct ggml_backend * backend, size
     buffer.mem_size = ggml_tensor_overhead() * max_tensors;
     buffer.mem_buffer = malloc(buffer.mem_size);
     buffer.backend = backend;
-    // size += 128 * max_tensors; // alignment overhead
+    size += 128 * max_tensors; // alignment overhead
     buffer.backend_buffer = backend->interface->alloc_buffer(backend->context, size);
     return buffer;
 }
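Worked example (max_tensors is a hypothetical value): with max_tensors = 1024, the re-enabled line reserves an extra 128 * 1024 bytes = 128 KiB of headroom in the backend buffer, so each tensor can be padded to an aligned offset without overrunning the allocation.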
@@ -172,7 +172,7 @@ static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_context_t ctx, struct
 }

 static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_context_t ctx, struct ggml_tensor * src, struct ggml_tensor * dst) {
-    ggml_backend_set_tensor(dst, src->data, 0, ggml_nbytes(src));
+    ggml_backend_set_tensor_async(dst, src->data, 0, ggml_nbytes(src));

     UNUSED(ctx);
 }
@@ -409,7 +409,7 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) {
                 ggml_backend_cpy_tensor(split->dst_inputs[j], split->src_inputs[j]);
             }
         }
-        ggml_backend_synchronize(split->dst_inputs[0]->backend);
+        // ggml_backend_synchronize(split->dst_inputs[0]->backend);
        copy_us += ggml_time_us() - copy_start_us;

 #if 0
@@ -419,7 +419,7 @@ void ggml_graph_splits_compute(struct ggml_graph_splits * splits) {
 #endif
         uint64_t start = ggml_time_us();
         ggml_backend_graph_compute(split->dst_inputs[0]->backend, split->graph);
-        ggml_backend_synchronize(split->dst_inputs[0]->backend);
+        //ggml_backend_synchronize(split->dst_inputs[0]->backend);
         uint64_t end = ggml_time_us();
         if (strcmp(ggml_backend_name(split->dst_inputs[0]->backend), "CPU") == 0) {
             compute_cpu_us += end - start;
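Taken together, the two ggml_graph_splits_compute hunks stop blocking after every split; a rough sketch of the resulting pattern, using only function names that appear in this diff (the loop structure around them is assumed):

    // per split: queue the input copies and the graph compute without waiting
    ggml_backend_cpy_tensor(split->dst_inputs[j], split->src_inputs[j]);
    ggml_backend_graph_compute(split->dst_inputs[0]->backend, split->graph);
    // whoever reads the results is now responsible for the final wait, e.g.
    ggml_backend_synchronize(split->dst_inputs[0]->backend);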

llama.cpp

Lines changed: 26 additions & 18 deletions
@@ -621,8 +621,9 @@ struct llama_model_loader {
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already

-            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu; // TODO
+            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;

+            // select buffer to load data into
             if (!use_mmap) {
                 if (is_cpu) {
                     lt.data = (uint8_t *) lt.ggml_tensor->data;
@@ -638,7 +639,7 @@ struct llama_model_loader {
             if (is_cpu) {
                 if (use_mmap) {
                     lt.ggml_tensor->data = lt.data;
-                    // TODO: this assumes that the data is contiguous, which may not always be the case
+                    // TODO: this assumes that the data to lock is contiguous, which may not always be the case
                     if (lmlock) {
                         lock_size += lt.size;
                         lmlock->grow_to(lock_size);
@@ -1199,6 +1200,10 @@ static ggml_graph_splits llama_build_graph(
         inpL = ggml_get_rows(ctx_i, model.tok_embeddings, token_in);
     }

+    // reuse the scale tensor for all layers since it requires a memory transfer
+    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
     struct ggml_tensor * cur = nullptr;
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_context * ctx_l = ctx_ls[il];
@@ -1239,9 +1244,6 @@ static ggml_graph_splits llama_build_graph(
             struct ggml_tensor * Vcur = ggml_transpose(ctx_l, ggml_reshape_2d(ctx_l, tmpv, n_embd, N));
             ggml_set_name(Vcur, "Vcur");

-            //ggml_graph_splits_add(&splits, &Kcur, ctx_kv, "Kcur");
-            //ggml_graph_splits_add(&splits, &Vcur, ctx_kv, "Vcur");
-            //ggml_graph_splits_add(&splits, &Qcur, ctx_kv, "Qcur");
             ggml_tensor ** attn_inputs[] = {&Kcur, &Vcur, &Qcur, NULL};
             ggml_graph_splits_add_n(&splits, attn_inputs, ctx_kv, "l%d_attn", il);

@@ -1288,9 +1290,6 @@ static ggml_graph_splits llama_build_graph(
             ggml_set_name(KQ, "KQ");

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx_kv, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
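The two llama_build_graph hunks above move the attention scale constant out of the per-layer loop; condensed, the resulting structure looks like this (lines taken from this diff, the loop body elided):

    // created once, before the layer loop, to avoid one memory transfer per layer
    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");

    for (int il = 0; il < n_layer; ++il) {
        // ... attention ...
        struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx_kv, KQ, KQ_scale); // reused in every layer
        // ...
    }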
@@ -1367,7 +1366,7 @@ static ggml_graph_splits llama_build_graph(
             cur = ggml_mul_mat(ctx_l,
                     model.layers[il].w1,
                     cur);
-            ggml_set_name(cur, "result_w2");
+            ggml_set_name(cur, "result_w1");

             // SILU activation
             cur = ggml_silu(ctx_l, cur);
@@ -1503,6 +1502,12 @@ static bool llama_eval_internal(

     LLAMA_ASSERT(lctx.graph_logits != nullptr);

+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend*>(&model.backend_cpu), n_threads);
+
     struct ggml_graph_splits splits = llama_build_graph(lctx, N, n_past, embd_input);

     // TODO: use backend functions
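For example, evaluating a 512-token prompt with a BLAS build now runs the CPU backend single-threaded (512 >= 32 and ggml_cpu_has_blas() is true), while single-token generation (N == 1) keeps the user-configured n_threads; the setting is also applied before llama_build_graph rather than after, presumably so the CPU backend is already configured when the graph splits are created.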
@@ -1514,11 +1519,7 @@ static bool llama_eval_internal(
         ggml_backend_set_tensor(lctx.graph_embeddings_in, embd, 0, N*n_embd*ggml_element_size(lctx.graph_embeddings_in));
     }

-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;

-    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend*>(&model.backend_cpu), n_threads);

     // run the computation
     ggml_graph_splits_compute(&splits);
@@ -1545,21 +1546,28 @@ static bool llama_eval_internal(

         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            ggml_backend_get_tensor(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
+            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            ggml_backend_get_tensor(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
+            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
         }
     }

     // extract embeddings
     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
         embedding_out.resize(n_embd);
-        ggml_backend_get_tensor(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
+        ggml_backend_get_tensor_async(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
     }

+#ifdef GGML_USE_CUDA
+    // wait for the async copy to finish
+    if (lctx.model.n_gpu_layers > 0) {
+        ggml_backend_synchronize(const_cast<ggml_backend*>(&lctx.model.backend_cuda));
+    }
+#endif
+
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
@@ -2543,7 +2551,7 @@ struct llama_context * llama_new_context_with_model(
     // initialize the graph input/output buffers
     // input buffer
     {
-        size_t buf_input_size = 1024;
+        size_t buf_input_size = 0;
         buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens
         // TODO: input embeddings should be optional to save memory
         buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings
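Worked example with hypothetical sizes n_ctx = 512 and n_embd = 4096: the input buffer is now budgeted as 512 * 4 bytes for the tokens plus 4096 * 512 * 4 bytes = 8 MiB for the embeddings, instead of starting from an arbitrary 1024-byte pad; any alignment overhead is added later by ggml_backend_alloc_buffer, per the first ggml-backend.c hunk above.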
@@ -2562,7 +2570,7 @@ struct llama_context * llama_new_context_with_model(
     }
     // output buffer
     {
-        size_t buf_output_size = 1024;
+        size_t buf_output_size = 0;
         if (params.logits_all) {
             buf_output_size += hparams.n_ctx * hparams.n_vocab * ggml_type_size(GGML_TYPE_F32);
         } else {
