@@ -621,8 +621,9 @@ struct llama_model_loader {
             }
             LLAMA_ASSERT(lt.ggml_tensor); // unused tensors should have been caught by load_data already
 
-            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu; // TODO
+            bool is_cpu = lt.ggml_tensor->backend == &model->backend_cpu;
 
+            // select buffer to load data into
             if (!use_mmap) {
                 if (is_cpu) {
                     lt.data = (uint8_t *) lt.ggml_tensor->data;
@@ -638,7 +639,7 @@ struct llama_model_loader {
             if (is_cpu) {
                 if (use_mmap) {
                     lt.ggml_tensor->data = lt.data;
-                    // TODO: this assumes that the data is contiguous, which may not always be the case
+                    // TODO: this assumes that the data to lock is contiguous, which may not always be the case
                     if (lmlock) {
                         lock_size += lt.size;
                         lmlock->grow_to(lock_size);
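For reference, the mmap branch above does not copy tensor data at all; it points each CPU tensor at the mapped file and, when mlock is requested, grows a single locked region over the mapping as tensors are assigned (hence the contiguity TODO). Below is a minimal standalone sketch of that growth pattern using plain POSIX mmap/mlock; the mlock_region type and the "model.bin" file name are hypothetical stand-ins, not the loader's actual llama_mlock or file layout.

```cpp
// Sketch: grow a single mlock'd region over an mmap'd model file as tensors
// are assigned, instead of locking each tensor separately.
#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <cstdio>

struct mlock_region {          // hypothetical stand-in for llama_mlock
    void * addr;
    size_t locked;

    void grow_to(size_t target) {
        if (target <= locked) {
            return;
        }
        // lock only the new tail; all pages overlapping the range get pinned
        if (mlock((char *) addr + locked, target - locked) == 0) {
            locked = target;
        } else {
            perror("mlock");   // e.g. RLIMIT_MEMLOCK too small
        }
    }
};

int main() {
    int fd = open("model.bin", O_RDONLY);            // hypothetical file
    if (fd < 0) { perror("open"); return 1; }
    size_t file_size = (size_t) lseek(fd, 0, SEEK_END);

    void * mapping = mmap(nullptr, file_size, PROT_READ, MAP_PRIVATE, fd, 0);
    if (mapping == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    mlock_region lmlock { mapping, 0 };
    size_t lock_size = 0;

    // pretend two tensors live back-to-back in the mapping (the contiguity
    // assumption the TODO above is about)
    const size_t tensor_sizes[] = { 4096, 8192 };
    for (size_t sz : tensor_sizes) {
        lock_size += sz;
        lmlock.grow_to(lock_size);
    }

    munmap(mapping, file_size);
    close(fd);
    return 0;
}
```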
@@ -1199,6 +1200,10 @@ static ggml_graph_splits llama_build_graph(
         inpL = ggml_get_rows(ctx_i, model.tok_embeddings, token_in);
     }
 
+    // reuse the scale tensor for all layers since it requires a memory transfer
+    struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
+    ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
+
     struct ggml_tensor * cur = nullptr;
     for (int il = 0; il < n_layer; ++il) {
         struct ggml_context * ctx_l = ctx_ls[il];
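The comment that moves in with this hunk is the motivation: the 1-element KQ_scale tensor has to be transferred to the backend that runs the attention, so creating it once in ctx_kv and reusing it in every layer avoids one transfer per layer. The value is the standard attention scaling factor 1/sqrt(head_dim); here is a tiny sketch of the arithmetic with illustrative shapes (the n_embd/n_head values below are examples, not taken from this diff).

```cpp
// Sketch: the attention scale 1/sqrt(n_embd/n_head) for a couple of shapes.
#include <cmath>
#include <cstdio>

int main() {
    const struct { int n_embd, n_head; } shapes[] = {
        { 4096, 32 },   // 7B-style shape, head dim 128
        { 5120, 40 },   // 13B-style shape, head dim 128
    };
    for (const auto & s : shapes) {
        const float head_dim = float(s.n_embd) / s.n_head;
        const float kq_scale = 1.0f / sqrtf(head_dim);   // same formula as KQ_scale
        printf("n_embd=%d n_head=%d head_dim=%g scale=%g\n",
               s.n_embd, s.n_head, head_dim, kq_scale);
    }
    return 0;
}
```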
@@ -1239,9 +1244,6 @@ static ggml_graph_splits llama_build_graph(
             struct ggml_tensor * Vcur = ggml_transpose(ctx_l, ggml_reshape_2d(ctx_l, tmpv, n_embd, N));
             ggml_set_name(Vcur, "Vcur");
 
-            // ggml_graph_splits_add(&splits, &Kcur, ctx_kv, "Kcur");
-            // ggml_graph_splits_add(&splits, &Vcur, ctx_kv, "Vcur");
-            // ggml_graph_splits_add(&splits, &Qcur, ctx_kv, "Qcur");
             ggml_tensor ** attn_inputs[] = {&Kcur, &Vcur, &Qcur, NULL};
             ggml_graph_splits_add_n(&splits, attn_inputs, ctx_kv, "l%d_attn", il);
 
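attn_inputs above is a NULL-terminated array of addresses of tensor pointers, so one ggml_graph_splits_add_n call can register Kcur, Vcur and Qcur for the split (presumably rewriting each pointer to the copy that lives in ctx_kv, which would be why addresses are passed). A small sketch of consuming such a list follows; splits_add_n and fake_tensor are hypothetical stand-ins, not the real ggml types.

```cpp
// Sketch: walking a NULL-terminated array of addresses of tensor pointers,
// the calling convention used by attn_inputs above.
#include <cstdio>

struct fake_tensor { const char * name; };   // stand-in for ggml_tensor

// hypothetical stand-in for ggml_graph_splits_add_n
static void splits_add_n(fake_tensor ** inputs[], const char * split_name) {
    for (int i = 0; inputs[i] != NULL; ++i) {
        // a real implementation could replace *inputs[i] with a copy in the
        // target context here; this sketch only reads it
        printf("%s input %d: %s\n", split_name, i, (*inputs[i])->name);
    }
}

int main() {
    fake_tensor k { "Kcur" }, v { "Vcur" }, q { "Qcur" };
    fake_tensor * Kcur = &k, * Vcur = &v, * Qcur = &q;

    // same shape as the diff: addresses of the pointers, terminated by NULL
    fake_tensor ** attn_inputs[] = { &Kcur, &Vcur, &Qcur, NULL };

    char split_name[32];
    snprintf(split_name, sizeof(split_name), "l%d_attn", 0);
    splits_add_n(attn_inputs, split_name);
    return 0;
}
```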
@@ -1288,9 +1290,6 @@ static ggml_graph_splits llama_build_graph(
             ggml_set_name(KQ, "KQ");
 
             // KQ_scaled = KQ / sqrt(n_embd/n_head)
-            struct ggml_tensor * KQ_scale = ggml_new_f32(ctx_kv, 1.0f/sqrtf(float(n_embd)/n_head));
-            ggml_set_name(KQ_scale, "1/sqrt(n_embd/n_head)");
-
             // KQ_scaled shape [n_past + N, N, n_head, 1]
             struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx_kv, KQ, KQ_scale);
             ggml_set_name(KQ_scaled, "KQ_scaled");
@@ -1367,7 +1366,7 @@ static ggml_graph_splits llama_build_graph(
             cur = ggml_mul_mat(ctx_l,
                     model.layers[il].w1,
                     cur);
-            ggml_set_name(cur, "result_w2");
+            ggml_set_name(cur, "result_w1");
 
             // SILU activation
             cur = ggml_silu(ctx_l, cur);
@@ -1503,6 +1502,12 @@ static bool llama_eval_internal(
 
     LLAMA_ASSERT(lctx.graph_logits != nullptr);
 
+
+    // for big prompts, if BLAS is enabled, it is better to use only one thread
+    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
+    n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
+    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend*>(&model.backend_cpu), n_threads);
+
     struct ggml_graph_splits splits = llama_build_graph(lctx, N, n_past, embd_input);
 
     // TODO: use backend functions
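The rationale in the comment is that a BLAS backend parallelizes the big prompt-time matrix multiplications internally, so extra ggml threads would only spin-wait; moving the clamp above llama_build_graph lets the CPU backend be configured before the graph is built. Below is a standalone sketch of the same heuristic; pick_n_threads and cpu_has_blas are hypothetical stand-ins that mirror the N >= 32 expression from the diff.

```cpp
// Sketch: use a single thread for prompt-sized batches when a BLAS backend is
// available, since BLAS parallelizes those matrix multiplications itself.
#include <cstdio>

// stand-in for ggml_cpu_has_blas(); in ggml this reflects compile-time options
static bool cpu_has_blas() {
#if defined(GGML_USE_OPENBLAS) || defined(GGML_USE_ACCELERATE) || defined(GGML_USE_CUBLAS)
    return true;
#else
    return false;
#endif
}

// hypothetical helper mirroring the expression in the diff
static int pick_n_threads(int n_tokens, int n_threads_requested) {
    return (n_tokens >= 32 && cpu_has_blas()) ? 1 : n_threads_requested;
}

int main() {
    printf("N=1   -> %d thread(s)\n", pick_n_threads(1,   8));   // single-token eval
    printf("N=512 -> %d thread(s)\n", pick_n_threads(512, 8));   // long prompt
    return 0;
}
```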
@@ -1514,11 +1519,7 @@ static bool llama_eval_internal(
         ggml_backend_set_tensor(lctx.graph_embeddings_in, embd, 0, N*n_embd*ggml_element_size(lctx.graph_embeddings_in));
     }
 
-    // for big prompts, if BLAS is enabled, it is better to use only one thread
-    // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() ? 1 : n_threads;
 
-    ggml_backend_cpu_set_n_threads(const_cast<ggml_backend*>(&model.backend_cpu), n_threads);
 
     // run the computation
     ggml_graph_splits_compute(&splits);
@@ -1545,21 +1546,28 @@ static bool llama_eval_internal(
 
         if (lctx.logits_all) {
             logits_out.resize(n_vocab * N);
-            ggml_backend_get_tensor(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
+            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, N*n_vocab*sizeof(float));
         } else {
             // return result for just the last token
             logits_out.resize(n_vocab);
-            ggml_backend_get_tensor(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
+            ggml_backend_get_tensor_async(lctx.graph_logits, logits_out.data(), 0, n_vocab*sizeof(float));
         }
     }
 
     // extract embeddings
     if (!lctx.embedding.empty()) {
         auto & embedding_out = lctx.embedding;
         embedding_out.resize(n_embd);
-        ggml_backend_get_tensor(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
+        ggml_backend_get_tensor_async(lctx.graph_embeddings_out, embedding_out.data(), 0, n_embd*sizeof(float));
     }
 
+#ifdef GGML_USE_CUDA
+    // wait for the async copy to finish
+    if (lctx.model.n_gpu_layers > 0) {
+        ggml_backend_synchronize(const_cast<ggml_backend*>(&lctx.model.backend_cuda));
+    }
+#endif
+
     // measure the performance only for the single-token evals
     if (N == 1) {
         lctx.t_eval_us += ggml_time_us() - t_start_us;
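Switching to ggml_backend_get_tensor_async lets the logits and embedding copies be queued back-to-back and completed by a single ggml_backend_synchronize before the host buffers are read; a CPU backend can simply perform such a call synchronously, which is presumably why the explicit wait is only needed for the CUDA case. Below is a self-contained sketch of that read-after-sync discipline, with std::async standing in for the backend's copy queue; fake_backend is not the ggml_backend implementation.

```cpp
// Sketch: queue asynchronous "device to host" copies, then synchronize once
// before the host reads the destination buffers.
#include <cstdio>
#include <cstring>
#include <future>
#include <vector>

struct fake_backend {
    std::vector<std::future<void>> pending;   // in-flight copies

    void get_tensor_async(const float * src, float * dst, size_t n) {
        pending.push_back(std::async(std::launch::async, [=] {
            std::memcpy(dst, src, n * sizeof(float));
        }));
    }

    void synchronize() {                      // like ggml_backend_synchronize
        for (auto & f : pending) {
            f.wait();
        }
        pending.clear();
    }
};

int main() {
    std::vector<float> device_logits(32000, 0.5f);   // pretend device memory
    std::vector<float> device_embd(4096, 1.0f);

    std::vector<float> logits_out(device_logits.size());
    std::vector<float> embedding_out(device_embd.size());

    fake_backend backend;
    backend.get_tensor_async(device_logits.data(), logits_out.data(), logits_out.size());
    backend.get_tensor_async(device_embd.data(), embedding_out.data(), embedding_out.size());

    backend.synchronize();   // must complete before the results are read

    printf("logits[0]=%g embd[0]=%g\n", logits_out[0], embedding_out[0]);
    return 0;
}
```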
@@ -2543,7 +2551,7 @@ struct llama_context * llama_new_context_with_model(
     // initialize the graph input/output buffers
     // input buffer
     {
-        size_t buf_input_size = 1024;
+        size_t buf_input_size = 0;
         buf_input_size += hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input tokens
         // TODO: input embeddings should be optional to save memory
         buf_input_size += hparams.n_embd * hparams.n_ctx * ggml_type_size(GGML_TYPE_F32); // input embeddings
@@ -2562,7 +2570,7 @@ struct llama_context * llama_new_context_with_model(
     }
     // output buffer
     {
-        size_t buf_output_size = 1024;
+        size_t buf_output_size = 0;
         if (params.logits_all) {
             buf_output_size += hparams.n_ctx * hparams.n_vocab * ggml_type_size(GGML_TYPE_F32);
         } else {
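Replacing the arbitrary 1024-byte starting size with 0 means both buffers are now sized exactly from the hyperparameters and the logits_all setting. A small worked example of the resulting arithmetic follows; the n_ctx, n_embd and n_vocab values are illustrative, and the embedding term in the output buffer is assumed by analogy with the input buffer rather than shown in this hunk.

```cpp
// Sketch: exact graph input/output buffer sizes, mirroring the sums in the diff.
#include <cstddef>
#include <cstdio>

int main() {
    // illustrative hyperparameters, f32 elements are 4 bytes each
    const size_t n_ctx      = 512;
    const size_t n_embd     = 4096;
    const size_t n_vocab    = 32000;
    const size_t f32_size   = sizeof(float);
    const bool   logits_all = false;

    size_t buf_input_size = 0;
    buf_input_size += n_ctx * f32_size;            // input tokens
    buf_input_size += n_embd * n_ctx * f32_size;   // input embeddings

    size_t buf_output_size = 0;
    buf_output_size += (logits_all ? n_ctx * n_vocab : n_vocab) * f32_size;   // logits
    buf_output_size += n_embd * f32_size;   // output embeddings (assumed, not shown in the hunk)

    printf("input  buffer: %zu bytes (%.2f MiB)\n", buf_input_size,  buf_input_size  / (1024.0 * 1024.0));
    printf("output buffer: %zu bytes (%.2f MiB)\n", buf_output_size, buf_output_size / (1024.0 * 1024.0));
    return 0;
}
```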