ggml-org · boxcee · May 19, 2026
@@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin
 | `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. |
 | `llamacpp:n_decode_total` | Counter | Total Number of llama_decode() calls. |
 | `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. |
+| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. |
+| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. |
 
 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 

@@ -564,6 +564,9 @@ struct server_metrics {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     void init() {
         t_start = ggml_time_us();
     }
@@ -582,6 +585,9 @@ struct server_metrics {
         n_tokens_predicted         += slot.n_decoded;
         t_tokens_generation        += slot.t_token_generation;
         t_tokens_generation_total  += slot.t_token_generation;
+
+        n_spec_tokens_drafted_total  += slot.n_draft_total;
+        n_spec_tokens_accepted_total += slot.n_draft_accepted;
     }
 
     void on_decoded(const std::vector<server_slot> & slots) {
@@ -2001,6 +2007,9 @@ struct server_context_impl {
                     res->n_decode_total          = metrics.n_decode_total;
                     res->n_busy_slots_total      = metrics.n_busy_slots_total;
 
+                    res->n_spec_tokens_drafted_total  = metrics.n_spec_tokens_drafted_total;
+                    res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total;
+
                     if (task.metrics_reset_bucket) {
                         metrics.reset_bucket();
                     }
@@ -3713,6 +3722,14 @@ void server_routes::init_routes() {
                     {"name",  "n_tokens_max"},
                     {"help",  "Largest observed n_tokens."},
                     {"value",  res_task->n_tokens_max}
+            }, {
+                    {"name",  "spec_tokens_drafted_total"},
+                    {"help",  "Number of speculative draft tokens generated."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_drafted_total}
+            }, {
+                    {"name",  "spec_tokens_accepted_total"},
+                    {"help",  "Number of speculative draft tokens accepted."},
+                    {"value",  (uint64_t) res_task->n_spec_tokens_accepted_total}
             }}},
             {"gauge", {{
                     {"name",  "prompt_tokens_seconds"},

@@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    uint64_t n_spec_tokens_drafted_total  = 0;
+    uint64_t n_spec_tokens_accepted_total = 0;
+
     // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
     // therefore, we use json to temporarily store the slot.to_json() result
     json slots_data = json::array();