diff --git a/common/speculative.cpp b/common/speculative.cpp
index e786cd63ab24..1aae7aa4dc47 100644
--- a/common/speculative.cpp
+++ b/common/speculative.cpp
@@ -2099,3 +2099,25 @@ void common_speculative_print_stats(const common_speculative * spec) {
                 str_perf.c_str());
     }
 }
+
+// Snapshot the draft counters of every implementation in spec->impls, in order.
+// Returns an empty vector when spec is nullptr.
+std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec) {
+    std::vector<common_speculative_stats> result;
+    if (spec == nullptr) {
+        return result;
+    }
+
+    result.reserve(spec->impls.size());
+    for (const auto & impl : spec->impls) {
+        result.push_back({
+            common_speculative_type_to_str(impl->type),
+            static_cast<uint64_t>(impl->n_gen_drafts),
+            static_cast<uint64_t>(impl->n_acc_drafts),
+            static_cast<uint64_t>(impl->n_gen_tokens),
+            static_cast<uint64_t>(impl->n_acc_tokens),
+        });
+    }
+
+    return result;
+}
diff --git a/common/speculative.h b/common/speculative.h
index 02fba8877f39..00fca4c430e1 100644
--- a/common/speculative.h
+++ b/common/speculative.h
@@ -3,8 +3,22 @@
 #include "llama.h"
 #include "common.h"

+#include <cstdint>
+#include <string>
+#include <vector>
+
 struct common_speculative;

+// counters of one speculative implementation, as returned by common_speculative_get_stats()
+struct common_speculative_stats {
+    std::string spec_type; // implementation name, from common_speculative_type_to_str()
+
+    uint64_t n_gen_drafts = 0; // draft batches generated
+    uint64_t n_acc_drafts = 0; // draft batches accepted at least partially
+    uint64_t n_gen_tokens = 0; // draft tokens generated
+    uint64_t n_acc_tokens = 0; // draft tokens accepted by the target model
+};
+
 // comma separated list of all types
 std::string common_speculative_type_name_str();

@@ -67,3 +81,7 @@ void common_speculative_cancel(common_speculative * spec);

 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
+
+// snapshot statistics about the speculative decoding
+// returns one entry per implementation; empty when spec is nullptr
+std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec);
diff --git a/tools/server/README.md b/tools/server/README.md
index b924225a0fd0..082ce28523e2 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1029,6 +1029,40 @@ Available metrics:
 - `llamacpp:requests_processing`: Number of requests processing.
 - `llamacpp:requests_deferred`: Number of requests deferred.
 - `llamacpp:n_tokens_max`: High watermark of the context size observed.
+- `llamacpp:speculative_drafts_generated_total{spec_type="..."}`: Number of speculative draft batches generated.
+- `llamacpp:speculative_drafts_accepted_total{spec_type="..."}`: Number of speculative draft batches accepted at least partially.
+- `llamacpp:speculative_draft_tokens_generated_total{spec_type="..."}`: Number of speculative draft tokens generated.
+- `llamacpp:speculative_draft_tokens_accepted_total{spec_type="..."}`: Number of speculative draft tokens accepted by the target model.
+
+The speculative counters are read from the same counters that back the server's `statistics` log line and are summed across all slots. The `spec_type` label is the speculative implementation name, such as `mtp`, `nextn`, `draft`, `eagle3`, or an n-gram type. A server with no configured speculative implementation exports the metric metadata (`# HELP` / `# TYPE`) but no speculative series.
+
+Example Grafana/Prometheus expressions (each yields one acceptance ratio per `spec_type` series):
+
+```promql
+rate(llamacpp:speculative_drafts_accepted_total[5m])
+/
+rate(llamacpp:speculative_drafts_generated_total[5m])
+```
+
+```promql
+rate(llamacpp:speculative_draft_tokens_accepted_total[5m])
+/
+rate(llamacpp:speculative_draft_tokens_generated_total[5m])
+```
+
+To graph all speculative modes together, aggregate before dividing:
+
+```promql
+sum(rate(llamacpp:speculative_drafts_accepted_total[5m]))
+/
+sum(rate(llamacpp:speculative_drafts_generated_total[5m]))
+```
+
+Verify locally with:
+
+```bash
+curl -s http://localhost:8080/metrics | rg 'speculative|draft'
+```

 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 65703c056106..b330275331d5 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -1919,6 +1919,8 @@ struct server_context_impl {

                     int n_idle_slots = 0;
                     int n_processing_slots = 0;
+                    // speculative counters summed over all slots, keyed by spec_type
+                    std::map<std::string, common_speculative_stats> speculative_stats_by_type;

                     for (server_slot & slot : slots) {
                         json slot_data = slot.to_json(slots_debug == 0);
@@ -1929,6 +1931,16 @@ struct server_context_impl {
                             n_idle_slots++;
                         }

+                        // fold this slot's per-implementation counters into the totals
+                        for (const auto & stats : common_speculative_get_stats(slot.spec)) {
+                            auto & agg = speculative_stats_by_type[stats.spec_type];
+                            agg.spec_type = stats.spec_type;
+                            agg.n_gen_drafts += stats.n_gen_drafts;
+                            agg.n_acc_drafts += stats.n_acc_drafts;
+                            agg.n_gen_tokens += stats.n_gen_tokens;
+                            agg.n_acc_tokens += stats.n_acc_tokens;
+                        }
+
                         slots_data.push_back(slot_data);
                     }
                     SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
@@ -1955,6 +1967,10 @@ struct server_context_impl {

                     res->n_decode_total = metrics.n_decode_total;
                     res->n_busy_slots_total = metrics.n_busy_slots_total;
+                    // std::map iterates in key order, so entries are sorted by spec_type
+                    for (const auto & el : speculative_stats_by_type) {
+                        res->speculative_stats.push_back(el.second);
+                    }

                     if (task.metrics_reset_bucket) {
                         metrics.reset_bucket();
@@ -3645,6 +3661,49 @@ void server_routes::init_routes() {
             }
         }

+        // one exported counter: metric name, HELP text and the stats field it reads
+        struct speculative_metric_def {
+            const char * name;
+            const char * help;
+            uint64_t common_speculative_stats::* value;
+        };
+
+        static const speculative_metric_def speculative_metrics_def[] = {
+            {
+                "speculative_drafts_generated_total",
+                "Number of speculative draft batches generated.",
+                &common_speculative_stats::n_gen_drafts,
+            },
+            {
+                "speculative_drafts_accepted_total",
+                "Number of speculative draft batches accepted at least partially.",
+                &common_speculative_stats::n_acc_drafts,
+            },
+            {
+                "speculative_draft_tokens_generated_total",
+                "Number of speculative draft tokens generated.",
+                &common_speculative_stats::n_gen_tokens,
+            },
+            {
+                "speculative_draft_tokens_accepted_total",
+                "Number of speculative draft tokens accepted by the target model.",
+                &common_speculative_stats::n_acc_tokens,
+            },
+        };
+
+        // HELP/TYPE are written for every metric; a sample line is written only
+        // for the spec types present in the task result
+        for (const auto & metric_def : speculative_metrics_def) {
+            prometheus << "# HELP llamacpp:" << metric_def.name << " " << metric_def.help << "\n"
+                       << "# TYPE llamacpp:" << metric_def.name << " counter\n";
+
+            for (const auto & stats : res_task->speculative_stats) {
+                prometheus << "llamacpp:" << metric_def.name
+                           << "{spec_type=\"" << stats.spec_type << "\"} "
+                           << stats.*(metric_def.value) << "\n";
+            }
+        }
+
         res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start);
         res->content_type = "text/plain; version=0.0.4";
         res->status = 200;
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 95f39207b18c..1aecb0a7aaaa 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -2,6 +2,7 @@

 #include "common.h"
 #include "llama.h"
+#include "speculative.h"

 #include
 #include
@@ -526,6 +527,9 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_decode_total = 0;
     uint64_t n_busy_slots_total = 0;

+    // aggregated speculative decoding counters, one entry per spec_type
+    std::vector<common_speculative_stats> speculative_stats;
+
     // while we can also use std::vector this requires copying the slot object which can be quite messy
     // therefore, we use json to temporarily store the slot.to_json() result
     json slots_data = json::array();