Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tools/server/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -1064,6 +1064,8 @@ In *router mode* the query param `?model={model_id}` has to be set. This endpoin
| `llamacpp:n_tokens_max` | Counter | High watermark of the context size observed. |
| `llamacpp:n_decode_total` | Counter | Total number of llama_decode() calls. |
| `llamacpp:n_busy_slots_per_decode` | Gauge | Average number of busy slots per llama_decode() call. |
| `llamacpp:spec_tokens_drafted_total` | Counter | Number of speculative draft tokens generated. |
| `llamacpp:spec_tokens_accepted_total` | Counter | Number of speculative draft tokens accepted. |

### POST `/slots/{id_slot}?action=save`: Save the prompt cache of the specified slot to a file.

Expand Down
17 changes: 17 additions & 0 deletions tools/server/server-context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -564,6 +564,9 @@ struct server_metrics {
uint64_t n_decode_total = 0;
uint64_t n_busy_slots_total = 0;

uint64_t n_spec_tokens_drafted_total = 0;
uint64_t n_spec_tokens_accepted_total = 0;

// Stores the current value of ggml_time_us() in t_start.
void init() {
    const auto start_stamp = ggml_time_us();
    t_start = start_stamp;
}
Expand All @@ -582,6 +585,9 @@ struct server_metrics {
n_tokens_predicted += slot.n_decoded;
t_tokens_generation += slot.t_token_generation;
t_tokens_generation_total += slot.t_token_generation;

n_spec_tokens_drafted_total += slot.n_draft_total;
n_spec_tokens_accepted_total += slot.n_draft_accepted;
}

void on_decoded(const std::vector<server_slot> & slots) {
Expand Down Expand Up @@ -2001,6 +2007,9 @@ struct server_context_impl {
res->n_decode_total = metrics.n_decode_total;
res->n_busy_slots_total = metrics.n_busy_slots_total;

res->n_spec_tokens_drafted_total = metrics.n_spec_tokens_drafted_total;
res->n_spec_tokens_accepted_total = metrics.n_spec_tokens_accepted_total;

if (task.metrics_reset_bucket) {
metrics.reset_bucket();
}
Expand Down Expand Up @@ -3713,6 +3722,14 @@ void server_routes::init_routes() {
{"name", "n_tokens_max"},
{"help", "Largest observed n_tokens."},
{"value", res_task->n_tokens_max}
}, {
{"name", "spec_tokens_drafted_total"},
{"help", "Number of speculative draft tokens generated."},
{"value", (uint64_t) res_task->n_spec_tokens_drafted_total}
}, {
{"name", "spec_tokens_accepted_total"},
{"help", "Number of speculative draft tokens accepted."},
{"value", (uint64_t) res_task->n_spec_tokens_accepted_total}
}}},
{"gauge", {{
{"name", "prompt_tokens_seconds"},
Expand Down
3 changes: 3 additions & 0 deletions tools/server/server-task.h
Original file line number Diff line number Diff line change
Expand Up @@ -526,6 +526,9 @@ struct server_task_result_metrics : server_task_result {
uint64_t n_decode_total = 0;
uint64_t n_busy_slots_total = 0;

uint64_t n_spec_tokens_drafted_total = 0;
uint64_t n_spec_tokens_accepted_total = 0;

// while we can also use std::vector<server_slot> this requires copying the slot object which can be quite messy
// therefore, we use json to temporarily store the slot.to_json() result
json slots_data = json::array();
Expand Down