Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 20 additions & 0 deletions common/speculative.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2099,3 +2099,23 @@ void common_speculative_print_stats(const common_speculative * spec) {
str_perf.c_str());
}
}

std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec) {
std::vector<common_speculative_stats> result;
if (spec == nullptr) {
return result;
}

result.reserve(spec->impls.size());
for (const auto & impl : spec->impls) {
result.push_back({
common_speculative_type_to_str(impl->type),
(uint64_t) impl->n_gen_drafts,
(uint64_t) impl->n_acc_drafts,
(uint64_t) impl->n_gen_tokens,
(uint64_t) impl->n_acc_tokens,
});
}

return result;
}
16 changes: 16 additions & 0 deletions common/speculative.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,21 @@
#include "llama.h"
#include "common.h"

#include <cstdint>
#include <string>
#include <vector>

struct common_speculative;

// Counter snapshot for a single speculative decoding implementation.
// Kept as a plain aggregate: the field order is part of the interface.
struct common_speculative_stats {
    // name of the speculative implementation type
    std::string spec_type;

    // draft batches: generated / accepted at least partially
    uint64_t n_gen_drafts{0};
    uint64_t n_acc_drafts{0};

    // draft tokens: generated / accepted by the target model
    uint64_t n_gen_tokens{0};
    uint64_t n_acc_tokens{0};
};

// comma separated list of all types
std::string common_speculative_type_name_str();

Expand Down Expand Up @@ -67,3 +80,6 @@ void common_speculative_cancel(common_speculative * spec);

// print statistics about the speculative decoding
void common_speculative_print_stats(const common_speculative * spec);

// return a snapshot of the speculative decoding counters, one entry per implementation (empty if spec is nullptr)
std::vector<common_speculative_stats> common_speculative_get_stats(const common_speculative * spec);
34 changes: 34 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1029,6 +1029,40 @@ Available metrics:
- `llamacpp:requests_processing`: Number of requests processing.
- `llamacpp:requests_deferred`: Number of requests deferred.
- `llamacpp:n_tokens_max`: High watermark of the context size observed.
- `llamacpp:speculative_drafts_generated_total{spec_type="..."}`: Number of speculative draft batches generated.
- `llamacpp:speculative_drafts_accepted_total{spec_type="..."}`: Number of speculative draft batches accepted at least partially.
- `llamacpp:speculative_draft_tokens_generated_total{spec_type="..."}`: Number of speculative draft tokens generated.
- `llamacpp:speculative_draft_tokens_accepted_total{spec_type="..."}`: Number of speculative draft tokens accepted by the target model.

The speculative counters use the same source counters as the server's `statistics <type>` log line and are aggregated across slots. The `spec_type` label is the speculative implementation name, such as `mtp`, `nextn`, `draft`, `eagle3`, or an n-gram type. A server with no configured speculative implementation exports the metric metadata but no speculative series.

Example Grafana/Prometheus expressions:

```promql
rate(llamacpp:speculative_drafts_accepted_total[5m])
/
rate(llamacpp:speculative_drafts_generated_total[5m])
```

```promql
rate(llamacpp:speculative_draft_tokens_accepted_total[5m])
/
rate(llamacpp:speculative_draft_tokens_generated_total[5m])
```

To graph all speculative modes together, aggregate before dividing:

```promql
sum(rate(llamacpp:speculative_drafts_accepted_total[5m]))
/
sum(rate(llamacpp:speculative_drafts_generated_total[5m]))
```

Verify locally with (uses ripgrep; substitute `grep -E` if `rg` is not installed):

```bash
curl -s http://localhost:8080/metrics | rg 'speculative|draft'
```

### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

Expand Down
53 changes: 53 additions & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1919,6 +1919,7 @@ struct server_context_impl {

int n_idle_slots = 0;
int n_processing_slots = 0;
std::map<std::string, common_speculative_stats> speculative_stats_by_type;

for (server_slot & slot : slots) {
json slot_data = slot.to_json(slots_debug == 0);
Expand All @@ -1929,6 +1930,15 @@ struct server_context_impl {
n_idle_slots++;
}

// fold this slot's speculative counters into the running per-type totals
for (const auto & slot_stats : common_speculative_get_stats(slot.spec)) {
    // operator[] creates a zeroed entry the first time a type is seen
    common_speculative_stats & total = speculative_stats_by_type[slot_stats.spec_type];

    total.spec_type = slot_stats.spec_type;

    total.n_gen_drafts += slot_stats.n_gen_drafts;
    total.n_acc_drafts += slot_stats.n_acc_drafts;
    total.n_gen_tokens += slot_stats.n_gen_tokens;
    total.n_acc_tokens += slot_stats.n_acc_tokens;
}

slots_data.push_back(slot_data);
}
SRV_DBG("n_idle_slots = %d, n_processing_slots = %d\n", n_idle_slots, n_processing_slots);
Expand All @@ -1955,6 +1965,9 @@ struct server_context_impl {

res->n_decode_total = metrics.n_decode_total;
res->n_busy_slots_total = metrics.n_busy_slots_total;
// hand the per-type totals over to the result (std::map iteration is ordered by type name)
for (const auto & entry : speculative_stats_by_type) {
    const common_speculative_stats & total = entry.second;
    res->speculative_stats.push_back(total);
}

if (task.metrics_reset_bucket) {
metrics.reset_bucket();
Expand Down Expand Up @@ -3645,6 +3658,46 @@ void server_routes::init_routes() {
}
}

// Describes one exported speculative counter: the metric name (without the
// "llamacpp:" prefix), its HELP text, and the common_speculative_stats field
// that holds the value.
struct speculative_metric_def {
    const char * name;
    const char * help;
    uint64_t common_speculative_stats::* counter;
};

static const speculative_metric_def speculative_metrics_def[] = {
    { "speculative_drafts_generated_total",
      "Number of speculative draft batches generated.",
      &common_speculative_stats::n_gen_drafts },
    { "speculative_drafts_accepted_total",
      "Number of speculative draft batches accepted at least partially.",
      &common_speculative_stats::n_acc_drafts },
    { "speculative_draft_tokens_generated_total",
      "Number of speculative draft tokens generated.",
      &common_speculative_stats::n_gen_tokens },
    { "speculative_draft_tokens_accepted_total",
      "Number of speculative draft tokens accepted by the target model.",
      &common_speculative_stats::n_acc_tokens },
};

for (const speculative_metric_def & def : speculative_metrics_def) {
    // the HELP/TYPE metadata is written even when there are no series to report
    prometheus << "# HELP llamacpp:" << def.name << " " << def.help << "\n";
    prometheus << "# TYPE llamacpp:" << def.name << " counter\n";

    // one series per speculative implementation type
    for (const common_speculative_stats & stats : res_task->speculative_stats) {
        const uint64_t count = stats.*(def.counter);

        prometheus << "llamacpp:" << def.name;
        prometheus << "{spec_type=\"" << stats.spec_type << "\"} ";
        prometheus << count << "\n";
    }
}

res->headers["Process-Start-Time-Unix"] = std::to_string(res_task->t_start);
res->content_type = "text/plain; version=0.0.4";
res->status = 200;
Expand Down
3 changes: 3 additions & 0 deletions tools/server/server-task.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "common.h"
#include "llama.h"
#include "speculative.h"

#include <string>
#include <unordered_set>
Expand Down Expand Up @@ -526,6 +527,8 @@ struct server_task_result_metrics : server_task_result {
uint64_t n_decode_total = 0;
uint64_t n_busy_slots_total = 0;

std::vector<common_speculative_stats> speculative_stats;

// while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
// therefore, we use json to temporarily store the slot.to_json() result
json slots_data = json::array();
Expand Down