@@ -2482,6 +2482,7 @@ struct llama_cparams {
     bool causal_attn;
     bool offload_kqv;
     bool flash_attn;
+    bool no_perf;
 
     enum llama_pooling_type pooling_type;
 
@@ -6647,8 +6648,6 @@ static bool llm_load_tensors(
         bool use_mlock,
         llama_progress_callback progress_callback,
         void * progress_callback_user_data) {
-    model.t_start_us = ggml_time_us();
-
     auto & hparams = model.hparams;
 
     model.split_mode = split_mode;
@@ -8579,14 +8578,13 @@ static bool llm_load_tensors(
         }
     }
 
-    // loading time will be recalculate after the first eval, so
-    // we take page faults deferred by mmap() into consideration
-    model.t_load_us = ggml_time_us() - model.t_start_us;
     return true;
 }
 
 // Returns 0 on success, -1 on error, and -2 on cancellation via llama_progress_callback
 static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
+    model.t_start_us = ggml_time_us();
+
     try {
         llama_model_loader ml(fname, params.use_mmap, params.check_tensors, params.kv_overrides);
 
@@ -8648,6 +8646,10 @@ static int llama_model_load(const std::string & fname, llama_model & model, llama_model_params & params) {
         return -1;
     }
 
+    // loading time will be recalculate after the first eval, so
+    // we take page faults deferred by mmap() into consideration
+    model.t_load_us = ggml_time_us() - model.t_start_us;
+
     return 0;
 }
 
@@ -17915,6 +17917,7 @@ struct llama_context_params llama_context_default_params() {
         /*.embeddings =*/ false,
         /*.offload_kqv =*/ true,
         /*.flash_attn =*/ false,
+        /*.no_perf =*/ true,
         /*.abort_callback =*/ nullptr,
         /*.abort_callback_data =*/ nullptr,
     };
@@ -18125,6 +18128,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.embeddings = params.embeddings;
     cparams.offload_kqv = params.offload_kqv;
     cparams.flash_attn = params.flash_attn;
+    cparams.no_perf = params.no_perf;
     cparams.pooling_type = params.pooling_type;
 
     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
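
For reference, a minimal caller-side sketch of how the new flag is meant to be used (not part of this change). The default is now no_perf = true, i.e. the per-context timings are disabled unless the application opts back in before creating the context. The sketch assumes the existing llama.h entry points (llama_backend_init, llama_load_model_from_file, llama_new_context_with_model, llama_perf_print) and a placeholder model path:

// Example only (not in the diff): enable perf timings explicitly, then print them.
#include "llama.h"

int main(void) {
    llama_backend_init();

    llama_model_params mparams = llama_model_default_params();
    llama_model * model = llama_load_model_from_file("model.gguf", mparams); // placeholder path

    llama_context_params cparams = llama_context_default_params();
    cparams.no_perf = false; // timings are now off by default; opt back in here

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... tokenize and call llama_decode() as usual; llama_synchronize() only
    //     accumulates t_p_eval_us / t_eval_us because no_perf == false ...

    llama_perf_print(ctx, LLAMA_PERF_TYPE_CONTEXT); // load / prompt eval / eval / total times

    llama_free(ctx);
    llama_free_model(model);
    llama_backend_free();
    return 0;
}
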
@@ -20043,10 +20047,14 @@ void llama_synchronize(struct llama_context * ctx) {
 
     // add the evaluation to the stats
     if (ctx->n_queued_tokens == 1) {
-        ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_eval++;
     } else if (ctx->n_queued_tokens > 1) {
-        ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        if (!ctx->cparams.no_perf) {
+            ctx->t_p_eval_us += ggml_time_us() - ctx->t_compute_start_us;
+        }
         ctx->n_p_eval += ctx->n_queued_tokens;
     }
 
@@ -20653,39 +20661,74 @@ const char * llama_print_system_info(void) {
     return s.c_str();
 }
 
-void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+// struct llama_perf_data {
+//     double t_start_ms;
+//     double t_end_ms;
+//     double t_load_ms;
+//     double t_sample_ms;
+//     double t_p_eval_ms;
+//     double t_eval_ms;
+//
+//     int32_t n_sample;
+//     int32_t n_p_eval;
+//     int32_t n_eval;
+// };
+
+llama_perf_data llama_perf_get(const void * ctx, enum llama_perf_type type) {
+    llama_perf_data data = {};
+
+    if (ctx == nullptr) {
+        return data;
+    }
+
     switch (type) {
         case LLAMA_PERF_TYPE_CONTEXT:
             {
                 const auto * p = (const struct llama_context *) ctx;
 
-                const double t_start_ms = 1e-3 * p->t_start_us;
-                const double t_end_ms = 1.00 * ggml_time_ms();
-                const double t_load_ms = 1e-3 * p->t_load_us;
-                const double t_p_eval_ms = 1e-3 * p->t_p_eval_us;
-                const double t_eval_ms = 1e-3 * p->t_eval_us;
+                data.t_start_ms = 1e-3 * p->t_start_us;
+                data.t_load_ms = 1e-3 * p->t_load_us;
+                data.t_p_eval_ms = 1e-3 * p->t_p_eval_us;
+                data.t_eval_ms = 1e-3 * p->t_eval_us;
+                data.n_p_eval = std::max(1, p->n_p_eval);
+                data.n_eval = std::max(1, p->n_eval);
+            } break;
+        case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
+            {
+                const auto * smpl = (const struct llama_sampler *) ctx;
+                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
+
+                data.t_sample_ms = 1e-3 * p->t_sample_us;
+                data.n_sample = std::max(0, p->n_sample);
+            } break;
+        default:
+            GGML_ABORT("invalid perf type");
+    }
+
+    return data;
+}
 
-                const int32_t n_p_eval = std::max(0, p->n_p_eval);
-                const int32_t n_eval = std::max(1, p->n_eval);
+void llama_perf_print(const void * ctx, enum llama_perf_type type) {
+    switch (type) {
+        case LLAMA_PERF_TYPE_CONTEXT:
+            {
+                const auto data = llama_perf_get(ctx, type);
+
+                const double t_end_ms = 1e-3 * ggml_time_us();
 
-                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, t_load_ms);
+                LLAMA_LOG_INFO("%s: load time = %10.2f ms\n", __func__, data.t_load_ms);
                 LLAMA_LOG_INFO("%s: prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_p_eval_ms, n_p_eval, t_p_eval_ms / n_p_eval, 1e3 / t_p_eval_ms * n_p_eval);
+                        __func__, data.t_p_eval_ms, data.n_p_eval, data.t_p_eval_ms / data.n_p_eval, 1e3 / data.t_p_eval_ms * data.n_p_eval);
                 LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_eval_ms, n_eval, t_eval_ms / n_eval, 1e3 / t_eval_ms * n_eval);
-                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - t_start_ms), (n_p_eval + n_eval));
+                        __func__, data.t_eval_ms, data.n_eval, data.t_eval_ms / data.n_eval, 1e3 / data.t_eval_ms * data.n_eval);
+                LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (t_end_ms - data.t_start_ms), (data.n_p_eval + data.n_eval));
             } break;
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
-                const auto * smpl = (const struct llama_sampler *) ctx;
-                const auto * p = (const struct llama_sampler_chain *) smpl->ctx;
-
-                const double t_sampler_ms = 1e-3 * p->t_sample_us;
-
-                const int32_t n_sampler = std::max(0, p->n_sample);
+                const auto data = llama_perf_get(ctx, type);
 
                 LLAMA_LOG_INFO("%s: sampling time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
-                        __func__, t_sampler_ms, n_sampler, t_sampler_ms / n_sampler, 1e3 / t_sampler_ms * n_sampler);
+                        __func__, data.t_sample_ms, data.n_sample, data.t_sample_ms / data.n_sample, 1e3 / data.t_sample_ms * data.n_sample);
             } break;
         default:
             GGML_ABORT("invalid perf type");
@@ -20705,7 +20748,7 @@ void llama_perf_reset(void * ctx, enum llama_perf_type type) {
         case LLAMA_PERF_TYPE_SAMPLER_CHAIN:
             {
                 auto * smpl = (struct llama_sampler *) ctx;
-                auto * p = (struct llama_sampler_chain *) smpl->ctx;
+                auto * p    = (struct llama_sampler_chain *) smpl->ctx;
 
                 p->t_sample_us = p->n_sample = 0;
             } break;
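
Since the getter is new, here is a short sketch (not part of this change) of consuming it directly instead of going through llama_perf_print. The llama_perf_data field names are taken from the commented-out struct above; the actual public declaration lives in llama.h and is not shown in this diff, so treat the exact layout as an assumption. ctx and smpl stand for an existing llama_context and a sampler chain created elsewhere:

// Example only: read the same counters that llama_perf_print() logs.
#include <cstdio>
#include "llama.h"

static void report_perf(const struct llama_context * ctx, const struct llama_sampler * smpl) {
    const llama_perf_data cdata = llama_perf_get(ctx,  LLAMA_PERF_TYPE_CONTEXT);
    const llama_perf_data sdata = llama_perf_get(smpl, LLAMA_PERF_TYPE_SAMPLER_CHAIN);

    // llama_perf_get() clamps n_p_eval and n_eval to at least 1, so these divisions are safe;
    // with no_perf == true the timing fields simply stay at 0.
    const double pp_tps = 1e3 / cdata.t_p_eval_ms * cdata.n_p_eval; // prompt processing tokens/s
    const double tg_tps = 1e3 / cdata.t_eval_ms   * cdata.n_eval;   // generation tokens/s

    printf("load %.2f ms | prompt %.2f t/s | gen %.2f t/s | sampled %d tokens in %.2f ms\n",
            cdata.t_load_ms, pp_tps, tg_tps, sdata.n_sample, sdata.t_sample_ms);
}
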