From 122fb87d8c6359d537381532d87a2186027295d6 Mon Sep 17 00:00:00 2001 From: Moritz Date: Tue, 19 May 2026 10:21:22 +0200 Subject: [PATCH] server: expose speculative decoding counters in Prometheus metrics Adds two new counters to the /metrics endpoint: - llamacpp:spec_tokens_drafted_total - llamacpp:spec_tokens_accepted_total Accumulated in server_metrics::on_prediction() from the per-slot n_draft_total and n_draft_accepted fields. Divide accepted by drafted to get acceptance rate. --- tools/server/README.md | 2 ++ tools/server/server-context.cpp | 17 +++++++++++++++++ tools/server/server-task.h | 3 +++ 3 files changed, 22 insertions(+) diff --git a/tools/server/README.md b/tools/server/README.md index 11098af28830..74b00c5faf5d 100644 --- a/tools/server/README.md +++ b/tools/server/README.md @@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin | `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. | | `llamacpp:n_decode_total` | Counter | Total Number of llama_decode() calls. | | `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. | +| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. | +| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. | ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file. 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp index 1ce7f0958279..86a8d66f35cf 100644 --- a/tools/server/server-context.cpp +++ b/tools/server/server-context.cpp @@ -564,6 +564,9 @@ struct server_metrics { uint64_t n_decode_total = 0; uint64_t n_busy_slots_total = 0; + uint64_t n_spec_tokens_drafted_total = 0; + uint64_t n_spec_tokens_accepted_total = 0; + void init() { t_start = ggml_time_us(); } @@ -582,6 +585,9 @@ struct server_metrics { n_tokens_predicted += slot.n_decoded; t_tokens_generation += slot.t_token_generation; t_tokens_generation_total += slot.t_token_generation; + + n_spec_tokens_drafted_total += slot.n_draft_total; + n_spec_tokens_accepted_total += slot.n_draft_accepted; } void on_decoded(const std::vector & slots) { @@ -2001,6 +2007,9 @@ struct server_context_impl { res->n_decode_total = metrics.n_decode_total; res->n_busy_slots_total = metrics.n_busy_slots_total; + res->n_spec_tokens_drafted_total = metrics.n_spec_tokens_drafted_total; + res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total; + if (task.metrics_reset_bucket) { metrics.reset_bucket(); } @@ -3713,6 +3722,14 @@ void server_routes::init_routes() { {"name", "n_tokens_max"}, {"help", "Largest observed n_tokens."}, {"value", res_task->n_tokens_max} + }, { + {"name", "spec_tokens_drafted_total"}, + {"help", "Number of speculative draft tokens generated."}, + {"value", (uint64_t) res_task->n_spec_tokens_drafted_total} + }, { + {"name", "spec_tokens_accepted_total"}, + {"help", "Number of speculative draft tokens accepted."}, + {"value", (uint64_t) res_task->n_spec_tokens_accepted_total} }}}, {"gauge", {{ {"name", "prompt_tokens_seconds"}, diff --git a/tools/server/server-task.h b/tools/server/server-task.h index 64bdecd794f1..053419ec4a0a 100644 --- a/tools/server/server-task.h +++ b/tools/server/server-task.h @@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result { uint64_t n_decode_total = 0; uint64_t 
n_busy_slots_total = 0; + uint64_t n_spec_tokens_drafted_total = 0; + uint64_t n_spec_tokens_accepted_total = 0; + // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy // therefore, we use json to temporarily store the slot.to_json() result json slots_data = json::array();