Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions lmcache/integration/vllm/vllm_multi_process_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import enum
import os
import threading
import time

# Third Party
import torch
Expand Down Expand Up @@ -936,6 +937,10 @@ def __init__(
# Prevents re-reporting the same ID after drain clears tracking sets.
self._returned_finished: set[str] = set()

# Timestamps recorded when submit_store_request is called, used to
# measure E2E wall-clock time until the future is resolved.
self._store_submit_times: dict[str, float] = {}

self.model_name = model_name
self.parallel_strategy = parallel_strategy

Expand Down Expand Up @@ -1185,6 +1190,7 @@ def submit_store_request(
)
self.store_futures[request_id] = future
self.store_events[request_id] = event
self._store_submit_times[request_id] = time.perf_counter()

@_lmcache_nvtx_annotate
def submit_retrieve_request(
Expand Down Expand Up @@ -1345,6 +1351,7 @@ def get_finished(
self.retrieve_futures.clear()
self.store_events.clear()
self.retrieve_events.clear()
self._store_submit_times.clear()

ret_stores = self._process_finished_stores(
finished_stores, finished_req_ids_from_engine
Expand All @@ -1363,6 +1370,15 @@ def get_finished(
if not s_future.query():
continue

_t_done = time.perf_counter()
_t_submit = self._store_submit_times.pop(request_id, None)
if _t_submit is not None:
logger.info(
"[E2E-STORE] req=%s e2e=%.3f ms",
request_id,
(_t_done - _t_submit) * 1000,
)

s_result = s_future.result()
finished_stores.add(request_id)

Expand Down