Commit f7cee89

llama : llama_perf + option to disable timings during decode

ggml-ci

1 parent df270ef

6 files changed: 94 additions & 29 deletions
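The commit threads one new flag, no_perf, from the common-layer gpt_params through llama_context_params and llama_sampler_chain_params. Since the new context default is no_perf = true, timings must now be opted into through the raw C API. A minimal sketch (the model path and the loading/free calls are illustrative context, not part of this diff):

    #include "llama.h"

    int main(void) {
        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // hypothetical model path

        llama_context_params cparams = llama_context_default_params();
        cparams.no_perf = false; // this commit defaults no_perf to true, so opt back in to timings

        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... decode as usual; t_eval_us / t_p_eval_us now accumulate in llama_synchronize ...

        llama_free(ctx);
        llama_free_model(model);
        return 0;
    }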

common/common.cpp

Lines changed: 1 addition & 0 deletions

@@ -2629,6 +2629,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
     cparams.flash_attn        = params.flash_attn;
+    cparams.no_perf           = params.no_perf;
 
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);

common/common.h

Lines changed: 1 addition & 0 deletions

@@ -180,6 +180,7 @@ struct gpt_params {
     bool simple_io     = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching = true;  // insert new sequences for decoding on-the-fly
     bool flash_attn    = false; // flash attention
+    bool no_perf       = false; // no perf (TODO: add llama_arg)
 
     bool input_prefix_bos = false; // prefix BOS to user inputs, preceding input_prefix
     bool logits_all       = false; // return logits for all tokens in the batch

common/sampling.cpp

Lines changed: 1 addition & 1 deletion

@@ -139,7 +139,7 @@ std::string gpt_sampler_params::print() const {
 struct gpt_sampler * gpt_sampler_init(const struct llama_model * model, const struct gpt_sampler_params & params) {
     llama_sampler_chain_params lparams = llama_sampler_chain_default_params();
 
-    lparams.no_perf = false; // TODO: control via params
+    lparams.no_perf = params.no_perf;
 
     auto * result = new gpt_sampler {
         /* .params = */ params,

common/sampling.h

Lines changed: 1 addition & 0 deletions

@@ -39,6 +39,7 @@ struct gpt_sampler_params {
     float mirostat_eta = 0.10f; // learning rate
     bool  penalize_nl  = false; // consider newlines as a repeatable token
     bool  ignore_eos   = false;
+    bool  no_perf      = false; // disable performance metrics
 
     std::vector<enum gpt_sampler_type> samplers = {
         GPT_SAMPLER_TYPE_TOP_K,
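With the new field in place, sampler timing collection can also be switched off from the common layer. A minimal sketch, assuming a loaded llama_model * model is in scope (the model setup is not part of this commit):

    gpt_sampler_params sparams;
    sparams.no_perf = true; // disable sampling-time measurements in the underlying chain

    // gpt_sampler_init now forwards no_perf into llama_sampler_chain_params (see diff above)
    struct gpt_sampler * smpl = gpt_sampler_init(model, sparams);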

include/llama.h

Lines changed: 20 additions & 1 deletion

@@ -343,7 +343,7 @@ extern "C" {
         bool embeddings;  // if true, extract embeddings (together with logits)
         bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
         bool flash_attn;  // whether to use flash attention [EXPERIMENTAL]
-        //bool no_perf;   // whether to measure performance timings, TODO: implement
+        bool no_perf;     // whether to measure performance timings
 
         // Abort callback
         // if it returns true, execution of llama_decode() will be aborted

@@ -1168,11 +1168,30 @@ extern "C" {
     // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements.
     //
 
+    // performance timing information
+    struct llama_perf_data {
+        // llama_context
+        double t_start_ms;
+        double t_load_ms;
+        double t_p_eval_ms;
+        double t_eval_ms;
+
+        int32_t n_p_eval;
+        int32_t n_eval;
+
+        // llama_sampler_chain
+        double t_sample_ms;
+
+        int32_t n_sample;
+    };
+
     enum llama_perf_type {
         LLAMA_PERF_TYPE_CONTEXT       = 0,
         LLAMA_PERF_TYPE_SAMPLER_CHAIN = 1,
     };
 
+    LLAMA_API struct llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type);
+
     LLAMA_API void llama_perf_print(const void * ctx, enum llama_perf_type type);
     LLAMA_API void llama_perf_reset(      void * ctx, enum llama_perf_type type);
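The new getter exposes the same numbers that llama_perf_print logs, so callers can compute throughput without parsing log output. A minimal sketch, assuming a llama_context * ctx that has already decoded some tokens (the surrounding program is not part of this commit):

    const struct llama_perf_data pd = llama_perf_get(ctx, LLAMA_PERF_TYPE_CONTEXT);

    // n_eval is clamped to >= 1 by llama_perf_get, so the division is safe;
    // note that with cparams.no_perf == true, t_eval_ms stays 0 and the rate degenerates
    const double tok_per_s = 1e3 * pd.n_eval / pd.t_eval_ms;
    fprintf(stderr, "decode: %d tokens in %.2f ms (%.2f tok/s)\n", pd.n_eval, pd.t_eval_ms, tok_per_s);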
src/llama.cpp

Lines changed: 70 additions & 27 deletions

@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6647,8 +6648,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8579,14 +8578,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8648,6 +8646,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
         return -1;
     }
 
+    // loading time will be recalculated after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -17915,6 +17917,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings          =*/ false,
         /*.offload_kqv         =*/ true,
         /*.flash_attn          =*/ false,
+        /*.no_perf             =*/ true,
         /*.abort_callback      =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18125,6 +18128,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings   = params.embeddings;
     cparams.offload_kqv  = params.offload_kqv;
     cparams.flash_attn   = params.flash_attn;
+    cparams.no_perf      = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx        = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
@@ -20043,10 +20047,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20653,39 +20661,74 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+// struct llama_perf_data {
+//     double t_start_ms;
+//     double t_end_ms;
+//     double t_load_ms;
+//     double t_sample_ms;
+//     double t_p_eval_ms;
+//     double t_eval_ms;
+//
+//     int32_t n_sample;
+//     int32_t n_p_eval;
+//     int32_t n_eval;
+// };
+
+llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+    llama_perf_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
     switch (type) {
         case LLAMA_PERF_TYPE_CONTEXT:
             {
                 const auto * p = (const struct llama_context *) ctx;
 
-                const double t_start_ms  = 1e-3 * p->t_start_us;
-                const double t_end_ms    = 1.00 * ggml_time_ms();
-                const double t_load_ms   = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms   = 1e-3 * p->t_eval_us;
+                data.t_start_ms  = 1e-3 * p->t_start_us;
+                data.t_load_ms   = 1e-3 * p->t_load_us;
+                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                data.t_eval_ms   = 1e-3 * p->t_eval_us;
+                data.n_p_eval    = std::max(1, p->n_p_eval);
+                data.n_eval      = std::max(1, p->n_eval);
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p    = (const struct llama_sampler_chain *) smpl->ctx;
+
+                data.t_sample_ms = 1e-3 * p->t_sample_us;
+                data.n_sample    = std::max(0, p->n_sample);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+
+    return data;
+}
 
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval   = std::max(1, p->n_eval);
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto data = llama_perf_get(ctx, type);
+
+                const double t_end_ms = 1e-3 * ggml_time_us();
 
-                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s:        load time = %10.2f ms\n", __func__, data.t_load_ms);
                 LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
                 LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+                LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
             } break;
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
            {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
+                const auto data = llama_perf_get(ctx, type);
 
                 LLAMA_LOG_INFO("%s:    sampling time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
             } break;
         default:
             GGML_ABORT("invalid perf type");
@@ -20705,7 +20748,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
                 auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
 
                 p->t_sample_us = p->n_sample = 0;
             } break;
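For completeness, the same getter and reset work against a sampler chain. A sketch, where chain is assumed to be a llama_sampler * whose ctx is a sampler chain (e.g. created with llama_sampler_chain_init, which is not shown in this commit):

    // query and reset sampler-chain timings between benchmark runs
    struct llama_perf_data sd = llama_perf_get(chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);
    fprintf(stderr, "sampled %d tokens in %.2f ms\n", sd.n_sample, sd.t_sample_ms);

    llama_perf_reset(chain, LLAMA_PERF_TYPE_SAMPLER_CHAIN);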
