From 122fb87d8c6359d537381532d87a2186027295d6 Mon Sep 17 00:00:00 2001
From: Moritz <moritz@schmitzvonhuelst.de>
Date: Tue, 19 May 2026 10:21:22 +0200
Subject: [PATCH] server: expose speculative decoding counters in Prometheus
 metrics

Adds two new counters to the /metrics endpoint:
- llamacpp:spec_tokens_drafted_total
- llamacpp:spec_tokens_accepted_total

Both counters are accumulated in server_metrics::on_prediction() from
the per-slot n_draft_total and n_draft_accepted fields. The acceptance
rate is spec_tokens_accepted_total / spec_tokens_drafted_total.
---
 tools/server/README.md          |  2 ++
 tools/server/server-context.cpp | 17 +++++++++++++++++
 tools/server/server-task.h      |  3 +++
 3 files changed, 22 insertions(+)

diff --git a/tools/server/README.md b/tools/server/README.md
index 11098af28830..74b00c5faf5d 100644
--- a/tools/server/README.md
+++ b/tools/server/README.md
@@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin
 | `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. |
 | `llamacpp:n_decode_total` | Counter | Total Number of llama_decode() calls. |
 | `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. |
+| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. |
+| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. |
 
 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 
diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
index 1ce7f0958279..86a8d66f35cf 100644
--- a/tools/server/server-context.cpp
+++ b/tools/server/server-context.cpp
@@ -564,6 +564,9 @@ struct server_metrics {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     void init() {
         t_start = ggml_time_us();
     }
@@ -582,6 +585,9 @@ struct server_metrics {
         n_tokens_predicted         += slot.n_decoded;
         t_tokens_generation        += slot.t_token_generation;
         t_tokens_generation_total  += slot.t_token_generation;
+
+        n_spec_tokens_drafted_total  += slot.n_draft_total;
+        n_spec_tokens_accepted_total += slot.n_draft_accepted;
     }
 
     void on_decoded(const std::vector<server_slot> & slots) {
@@ -2001,6 +2007,9 @@ struct server_context_impl {
                     res->n_decode_total          = metrics.n_decode_total;
                     res->n_busy_slots_total      = metrics.n_busy_slots_total;
 
+                    res->n_spec_tokens_drafted_total  = metrics.n_spec_tokens_drafted_total;
+                    res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total;
+
                     if (task.metrics_reset_bucket) {
                         metrics.reset_bucket();
                     }
@@ -3713,6 +3722,14 @@ void server_routes::init_routes() {
                     {"name",  "n_tokens_max"},
                     {"help",  "Largest observed n_tokens."},
                     {"value",  res_task->n_tokens_max}
+            }, {
+                    {"name",  "spec_tokens_drafted_total"},
+                    {"help",  "Number of speculative draft tokens generated."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_drafted_total}
+            }, {
+                    {"name",  "spec_tokens_accepted_total"},
+                    {"help",  "Number of speculative draft tokens accepted."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_accepted_total}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},
diff --git a/tools/server/server-task.h b/tools/server/server-task.h
index 64bdecd794f1..053419ec4a0a 100644
--- a/tools/server/server-task.h
+++ b/tools/server/server-task.h
@@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
     // therefore, we use json to temporarily store the slot.to_json() result
     json slots_data = json::array();