AtomicBot-ai · nycdubliner · May 31, 2026
diff --git a/common/speculative.cpp b/common/speculative.cpp
@@ -2099,3 +2099,23 @@ void common_speculative_print_stats(const common_speculative * spec) {
                 str_perf.c_str());
     }
 }
+
+std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec) {
+    std::vector<common_speculative_stats> result;
+    if (spec == nullptr) { // no speculative context: return an empty snapshot
+        return result;
+    }
+    // copy the counters of every implementation in spec->impls: one entry each, in iteration order
+    result.reserve(spec->impls.size());
+    for (const auto & impl : spec->impls) {
+        result.push_back({ // positional init: values must follow the member order of common_speculative_stats
+            common_speculative_type_to_str(impl->type),
+            (uint64_t) impl->n_gen_drafts,
+            (uint64_t) impl->n_acc_drafts,
+            (uint64_t) impl->n_gen_tokens,
+            (uint64_t) impl->n_acc_tokens,
+        });
+    }
+
+    return result;
+}
diff --git a/common/speculative.h b/common/speculative.h
@@ -3,8 +3,21 @@
 #include "llama.h"
 #include "common.h"
 
+#include <cstdint>
+#include <string>
+#include <vector>
+
 struct common_speculative;
 
+struct common_speculative_stats { // speculative decoding counters for one implementation type
+    std::string spec_type; // implementation name, as produced by common_speculative_type_to_str()
+
+    uint64_t n_gen_drafts = 0; // draft batches generated
+    uint64_t n_acc_drafts = 0; // draft batches accepted at least partially
+    uint64_t n_gen_tokens = 0; // draft tokens generated
+    uint64_t n_acc_tokens = 0; // draft tokens accepted by the target model
+};
+
 // comma separated list of all types
 std::string common_speculative_type_name_str();
 
@@ -67,3 +80,6 @@ void common_speculative_cancel(common_speculative * spec);
 
 // print statistics about the speculative decoding
 void common_speculative_print_stats(const common_speculative * spec);
+
+// snapshot of the speculative decoding counters, one entry per implementation (empty if spec is nullptr)
+std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec);
diff --git a/tools/server/README.md b/tools/server/README.md
@@ -1029,6 +1029,40 @@ Available metrics:
 - `llamacpp:requests_processing`: Number of requests processing.
 - `llamacpp:requests_deferred`: Number of requests deferred.
 - `llamacpp:n_tokens_max`: High watermark of the context size observed.
+- `llamacpp:speculative_drafts_generated_total{spec_type="..."}`: Number of speculative draft batches generated.
+- `llamacpp:speculative_drafts_accepted_total{spec_type="..."}`: Number of speculative draft batches accepted at least partially.
+- `llamacpp:speculative_draft_tokens_generated_total{spec_type="..."}`: Number of speculative draft tokens generated.
+- `llamacpp:speculative_draft_tokens_accepted_total{spec_type="..."}`: Number of speculative draft tokens accepted by the target model.
+
+The speculative metrics are read from the same counters that back the server's `statistics <type>` log line and are summed across all slots. The `spec_type` label is the name of the speculative implementation, such as `mtp`, `nextn`, `draft`, `eagle3`, or one of the n-gram types. A server with no speculative implementation configured still exports the metric metadata (the `# HELP` and `# TYPE` lines) but no speculative series.
+
+Example Grafana/Prometheus expressions:
+
+```promql
+rate(llamacpp:speculative_drafts_accepted_total[5m])
+/
+rate(llamacpp:speculative_drafts_generated_total[5m])
+```
+
+```promql
+rate(llamacpp:speculative_draft_tokens_accepted_total[5m])
+/
+rate(llamacpp:speculative_draft_tokens_generated_total[5m])
+```
+
+To graph all speculative modes together, aggregate before dividing:
+
+```promql
+sum(rate(llamacpp:speculative_drafts_accepted_total[5m]))
+/
+sum(rate(llamacpp:speculative_drafts_generated_total[5m]))
+```
+
+Verify locally with:
+
+```bash
+curl -s http://localhost:8080/metrics | rg 'speculative|draft'
+```
 
 ### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.
 

diff --git a/tools/server/server-context.cpp b/tools/server/server-context.cpp
@@ -1919,6 +1919,7 @@ struct server_context_impl {
 
                     int n_idle_slots       = 0;
                     int n_processing_slots = 0;
+                    std::map<std::string, common_speculative_stats> speculative_stats_by_type;
 
                     for (server_slot & slot : slots) {
                         json slot_data = slot.to_json(slots_debug == 0);
@@ -1929,6 +1930,15 @@ struct server_context_impl {
                             n_idle_slots++;
                         }
 
+                        for (const auto & stats : common_speculative_get_stats(slot.spec)) { // add this slot's counters to the per-type totals
+                            auto & agg = speculative_stats_by_type[stats.spec_type]; // operator[] creates a zeroed entry the first time a type is seen
+                            agg.spec_type      = stats.spec_type;
+                            agg.n_gen_drafts  += stats.n_gen_drafts;
+                            agg.n_acc_drafts  += stats.n_acc_drafts;
+                            agg.n_gen_tokens  += stats.n_gen_tokens;
+                            agg.n_acc_tokens  += stats.n_acc_tokens;
+                        }
+
                         slots_data.push_back(slot_data);
                     }
                     SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
@@ -1955,6 +1965,9 @@ struct server_context_impl {
 
                     res->n_decode_total          = metrics.n_decode_total;
                     res->n_busy_slots_total      = metrics.n_busy_slots_total;
+                    for (const auto & el : speculative_stats_by_type) { // std::map iteration: totals are appended sorted by spec_type
+                        res->speculative_stats.push_back(el.second);
+                    }
 
                     if (task.metrics_reset_bucket) {
                         metrics.reset_bucket();
@@ -3645,6 +3658,46 @@ void server_routes::init_routes() {
             }
         }
 
+        struct speculative_metric_def { // describes one exported speculative counter
+            const char * name; // metric name, without the "llamacpp:" prefix
+            const char * help; // text written on the "# HELP" line
+            uint64_t common_speculative_stats::* value; // pointer-to-member selecting which counter to print
+        };
+
+        static const speculative_metric_def speculative_metrics_def[] = { // one row per exported counter, emitted in this order
+            {
+                "speculative_drafts_generated_total",
+                "Number of speculative draft batches generated.",
+                &common_speculative_stats::n_gen_drafts,
+            },
+            {
+                "speculative_drafts_accepted_total",
+                "Number of speculative draft batches accepted at least partially.",
+                &common_speculative_stats::n_acc_drafts,
+            },
+            {
+                "speculative_draft_tokens_generated_total",
+                "Number of speculative draft tokens generated.",
+                &common_speculative_stats::n_gen_tokens,
+            },
+            {
+                "speculative_draft_tokens_accepted_total",
+                "Number of speculative draft tokens accepted by the target model.",
+                &common_speculative_stats::n_acc_tokens,
+            },
+        };
+
+        for (const auto & metric_def : speculative_metrics_def) { // HELP/TYPE are written even when there are no series to report
+            prometheus << "# HELP llamacpp:" << metric_def.name << " " << metric_def.help << "\n"
+                       << "# TYPE llamacpp:" << metric_def.name << " counter\n";
+            // one series per entry of speculative_stats, labelled with its spec_type
+            for (const auto & stats : res_task->speculative_stats) {
+                prometheus << "llamacpp:" << metric_def.name
+                           << "{spec_type=\"" << stats.spec_type << "\"} " // NOTE(review): label value is not escaped; assumes type names never contain '"', '\\' or a newline - confirm
+                           << stats.*(metric_def.value) << "\n";
+            }
+        }
+
         res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start);
         res->content_type = "text/plain; version=0.0.4";
         res->status = 200;

diff --git a/tools/server/server-task.h b/tools/server/server-task.h
@@ -2,6 +2,7 @@
 
 #include "common.h"
 #include "llama.h"
+#include "speculative.h"
 
 #include <string>
 #include <unordered_set>
@@ -526,6 +527,8 @@ struct server_task_result_metrics : server_task_result {
     uint64_t n_decode_total     = 0;
     uint64_t n_busy_slots_total = 0;
 
+    std::vector<common_speculative_stats> speculative_stats;
+
     // while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
     // therefore, we use json to temporarily store the slot.to_json() result
     json slots_data = json::array();