DataZooDE · jrosskopf · Jun 29, 2026 · Jun 29, 2026
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/crates/escurel-eval/Cargo.toml b/crates/escurel-eval/Cargo.toml
@@ -38,6 +38,7 @@ tracing = "0.1"
 [dev-dependencies]
 tempfile = "3"
 tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
+async-trait = "0.1"
 
 [lints]
 workspace = true
diff --git a/crates/escurel-eval/src/latency.rs b/crates/escurel-eval/src/latency.rs
@@ -48,9 +48,30 @@ pub fn percentiles(samples: &mut [f64]) -> LatencyStats {
     }
 }
 
-/// Run each query once (sequentially), timing the `search` call. Returns the
-/// per-query ranked doc ids (deduped to doc granularity, for the metrics) and
-/// the latency distribution — computed in the same sweep.
+/// One query → its top-`k` doc ranking, replicating the server's native-lane
+/// path (`escurel-server`'s `tool_search`): fetch the (possibly wider) rerank
+/// candidate pool, apply the cross-encoder rerank stage, then truncate to `k`.
+///
+/// `Indexer::search` itself does NOT rerank — the rerank stage lives in the
+/// server dispatcher (`rerank_hits` after `search`), so the harness has to
+/// invoke it the same way or the rerank configs are a silent no-op. With rerank
+/// disabled, `rerank_candidate_pool(k) == k` and `rerank_hits` is a no-op, so
+/// this is exactly `search(q, k)`.
+pub async fn search_ranked(
+    indexer: &Indexer,
+    query: &str,
+    k: usize,
+) -> Result<Vec<escurel_index::SearchHit>, EvalError> {
+    let pool = indexer.rerank_candidate_pool(k);
+    let hits = indexer.search(query, pool, None, None, None, None).await?;
+    let mut hits = indexer.rerank_hits(query, hits).await?;
+    hits.truncate(k);
+    Ok(hits)
+}
+
+/// Run each query once (sequentially), timing the full retrieve + rerank call.
+/// Returns the per-query ranked doc ids (deduped to doc granularity, for the
+/// metrics) and the latency distribution — computed in the same sweep.
 pub async fn run_queries(
     indexer: &Indexer,
     queries: &[Query],
@@ -60,7 +81,7 @@ pub async fn run_queries(
     let mut samples = Vec::with_capacity(queries.len());
     for q in queries {
         let start = Instant::now();
-        let hits = indexer.search(&q.text, k, None, None, None, None).await?;
+        let hits = search_ranked(indexer, &q.text, k).await?;
         samples.push(start.elapsed().as_secs_f64() * 1000.0);
         ranked_per_query.push(dedup_doc_ids(&hits));
     }
@@ -101,9 +122,7 @@ pub async fn measure_qps(
             while Instant::now() < deadline {
                 let idx = cursor.fetch_add(1, Ordering::Relaxed) % queries.len();
                 let start = Instant::now();
-                let r = indexer
-                    .search(&queries[idx].text, k, None, None, None, None)
-                    .await;
+                let r = search_ranked(&indexer, &queries[idx].text, k).await;
                 if r.is_ok() {
                     samples.push(start.elapsed().as_secs_f64() * 1000.0);
                     completed.fetch_add(1, Ordering::Relaxed);

diff --git a/crates/escurel-eval/tests/smoke.rs b/crates/escurel-eval/tests/smoke.rs
@@ -11,7 +11,7 @@ use std::path::PathBuf;
 use std::sync::Arc;
 use std::time::Duration;
 
-use escurel_embed::{Embedder, HashEmbedder};
+use escurel_embed::{Candidate, EmbedError, Embedder, HashEmbedder, Ranked, Reranker};
 use escurel_eval::config::RunConfig;
 use escurel_eval::dataset::Dataset;
 use escurel_eval::{QpsParams, run_matrix};
@@ -21,6 +21,32 @@ fn fixture_dir() -> PathBuf {
     PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/tiny")
 }
 
+/// Deterministic reranker that REVERSES the first-stage order. Used only to
+/// prove the harness actually invokes the rerank stage (`rerank_hits`) — if the
+/// rerank config were a silent no-op, its ranking would equal single_pass.
+#[derive(Debug)]
+struct ReverseReranker;
+
+#[async_trait::async_trait]
+impl Reranker for ReverseReranker {
+    async fn rerank(
+        &self,
+        _query: &str,
+        candidates: &[Candidate],
+    ) -> Result<Vec<Ranked>, EmbedError> {
+        let n = candidates.len();
+        Ok(candidates
+            .iter()
+            .rev()
+            .enumerate()
+            .map(|(i, c)| Ranked {
+                id: c.id.clone(),
+                score: (n - i) as f32,
+            })
+            .collect())
+    }
+}
+
 #[tokio::test]
 async fn end_to_end_over_tiny_fixture() {
     let dataset = Dataset::load(&fixture_dir()).expect("load fixture");
@@ -38,14 +64,15 @@ async fn end_to_end_over_tiny_fixture() {
             coarse_dim: 128,
             coarse_candidates: 500,
         },
+        RunConfig::Rerank { candidates: 10 },
     ];
 
     let report = run_matrix(
         &dataset,
         &db_path,
         &store_dir,
         Arc::clone(&embedder),
-        None, // no reranker (rerank configs would be skipped)
+        Some(Arc::new(ReverseReranker)),
         &configs,
         "doc",
         10,
@@ -62,7 +89,7 @@ async fn end_to_end_over_tiny_fixture() {
     assert_eq!(report.dim, 768);
     assert_eq!(report.corpus_docs, 8);
     assert_eq!(report.queries, 4);
-    assert_eq!(report.results.len(), 2, "single_pass + two_pass");
+    assert_eq!(report.results.len(), 3, "single_pass + two_pass + rerank");
 
     for r in &report.results {
         // Every metric is a probability-like value.
@@ -84,20 +111,41 @@ async fn end_to_end_over_tiny_fixture() {
         assert_eq!(r.latency.n, 4, "{} latency samples", r.config);
         // All 8 docs fit in k=10, so every query's relevant doc is found.
         assert_eq!(r.recall_at_10, 1.0, "{} recall@10", r.config);
-        // The BM25 lane ranks the matching doc near the top.
-        assert!(r.ndcg_at_10 >= 0.5, "{} ndcg@10={}", r.config, r.ndcg_at_10);
         // The concurrent QPS pass completed some searches.
         let qps = r.qps.as_ref().expect("qps measured");
         assert!(qps.completed > 0, "{} qps completed 0", r.config);
     }
 
+    let by = |name: &str| {
+        report
+            .results
+            .iter()
+            .find(|r| r.config == name)
+            .unwrap_or_else(|| panic!("config {name} present"))
+    };
+    let single = by("single_pass");
+    let two = by("two_pass");
+    let rerank = by("rerank");
+
+    // BM25 ranks the matching doc near the top; only single_pass is asserted here.
+    assert!(
+        single.ndcg_at_10 >= 0.5,
+        "single ndcg@10={}",
+        single.ndcg_at_10
+    );
     // Two-pass with a corpus-covering shortlist preserves single-pass recall.
-    assert_eq!(
-        report.results[0].recall_at_10,
-        report.results[1].recall_at_10
+    assert_eq!(single.recall_at_10, two.recall_at_10);
+    // The rerank stage is actually APPLIED: the ReverseReranker flips the order,
+    // so the rerank ranking must differ from single_pass (it does not silently
+    // no-op). Reversal pushes the relevant doc down, so nDCG drops.
+    assert!(
+        rerank.ndcg_at_10 < single.ndcg_at_10,
+        "rerank must change the ranking (applied={}, single={})",
+        rerank.ndcg_at_10,
+        single.ndcg_at_10
     );
 
-    // JSON renders.
+    // JSON renders all three configs.
     let json = report.to_json();
-    assert_eq!(json["results"].as_array().unwrap().len(), 2);
+    assert_eq!(json["results"].as_array().unwrap().len(), 3);
 }
diff --git a/docs/eval/baseline-scifact.json b/docs/eval/baseline-scifact.json
@@ -0,0 +1,122 @@
+{
+  "corpus_docs": 1000,
+  "dataset": "scifact-1k",
+  "dim": 768,
+  "k": 100,
+  "model_id": "BAAI/bge-base-en-v1.5",
+  "queries": 300,
+  "results": [
+    {
+      "config": "single_pass",
+      "latency": {
+        "mean_ms": 147.60474049666655,
+        "n": 300,
+        "p50_ms": 146.055297,
+        "p95_ms": 182.5911,
+        "p99_ms": 190.951605
+      },
+      "map": 0.8224983668316723,
+      "mrr": 0.8306838866258879,
+      "ndcg_at_10": 0.845865680550998,
+      "ndcg_at_100": 0.8632518249574073,
+      "qps": {
+        "completed": 105,
+        "latency": {
+          "mean_ms": 627.4837098666667,
+          "n": 105,
+          "p50_ms": 633.607573,
+          "p95_ms": 722.991633,
+          "p99_ms": 838.0994539999999
+        },
+        "qps": 12.517082351369911,
+        "secs": 8.388536326
+      },
+      "recall_at_10": 0.9208888888888889,
+      "recall_at_100": 0.9933333333333333
+    },
+    {
+      "config": "two_pass",
+      "latency": {
+        "mean_ms": 176.21182580666675,
+        "n": 300,
+        "p50_ms": 174.259779,
+        "p95_ms": 213.43055999999999,
+        "p99_ms": 223.933607
+      },
+      "map": 0.822881412747501,
+      "mrr": 0.8310725998283941,
+      "ndcg_at_10": 0.8471483082800391,
+      "ndcg_at_100": 0.8635981384652979,
+      "qps": {
+        "completed": 85,
+        "latency": {
+          "mean_ms": 779.1540350470589,
+          "n": 85,
+          "p50_ms": 787.6516310000001,
+          "p95_ms": 885.695209,
+          "p99_ms": 1116.4758450000002
+        },
+        "qps": 10.017490532577238,
+        "secs": 8.485159005
+      },
+      "recall_at_10": 0.9242222222222222,
+      "recall_at_100": 0.9933333333333333
+    },
+    {
+      "config": "rerank",
+      "latency": {
+        "mean_ms": 15869.816934796661,
+        "n": 300,
+        "p50_ms": 15103.900015000001,
+        "p95_ms": 21417.791509000002,
+        "p99_ms": 22625.405078
+      },
+      "map": 0.6319104056519331,
+      "mrr": 0.6443926411809954,
+      "ndcg_at_10": 0.6713375743014715,
+      "ndcg_at_100": 0.710396174387697,
+      "qps": {
+        "completed": 8,
+        "latency": {
+          "mean_ms": 26286.12628475,
+          "n": 8,
+          "p50_ms": 24889.192969,
+          "p95_ms": 32166.650187000003,
+          "p99_ms": 32166.650187000003
+        },
+        "qps": 0.24870395626692937,
+        "secs": 32.166758101
+      },
+      "recall_at_10": 0.8019999999999999,
+      "recall_at_100": 0.9933333333333333
+    },
+    {
+      "config": "two_pass_rerank",
+      "latency": {
+        "mean_ms": 15749.06994372334,
+        "n": 300,
+        "p50_ms": 15030.227543,
+        "p95_ms": 21038.890063,
+        "p99_ms": 22432.801917999997
+      },
+      "map": 0.6320441085138567,
+      "mrr": 0.6444835149328776,
+      "ndcg_at_10": 0.671893236170446,
+      "ndcg_at_100": 0.7104921297998338,
+      "qps": {
+        "completed": 8,
+        "latency": {
+          "mean_ms": 28451.25987075,
+          "n": 8,
+          "p50_ms": 27499.714644,
+          "p95_ms": 36061.501938,
+          "p99_ms": 36061.501938
+        },
+        "qps": 0.2218425255278265,
+        "secs": 36.061616144
+      },
+      "recall_at_10": 0.8031111111111111,
+      "recall_at_100": 0.9933333333333333
+    }
+  ]
+}
diff --git a/docs/eval/baseline-scifact.md b/docs/eval/baseline-scifact.md
@@ -0,0 +1,82 @@
+# Baseline — BEIR SciFact (1k subsample)
+
+A committed `escurel-eval` run, produced by:
+
+```bash
+escurel-eval --dataset datasets/scifact-1k --skill paper \
+  --embed-model BAAI/bge-base-en-v1.5 --reranker BAAI/bge-reranker-base \
+  --k 100 --coarse-dim 128 --coarse-candidates 500 \
+  --qps-workers 8 --qps-secs 8 --format json
+```
+
+Raw JSON: [`baseline-scifact.json`](baseline-scifact.json).
+
+## Setup
+
+| | |
+|---|---|
+| Dataset | BEIR SciFact, **1000-doc qrels-preserving subsample** (all 283 test-judged docs + 717 distractors) |
+| Queries | 300 (the SciFact test split) |
+| Embedder | `BAAI/bge-base-en-v1.5` (768-d BERT) |
+| Reranker | `BAAI/bge-reranker-base` (XLM-RoBERTa cross-encoder) |
+| `k` / coarse | k=100, coarse_dim=128, coarse_candidates=500 |
+| Hardware | CPU only (candle, no BLAS) |
+
+**Why a 1k subsample, not the full 5183-doc corpus:** candle CPU BERT embedding
+(no BLAS in the default build) runs at ~0.5 docs/s, so the full corpus is hours
+of ingest. The subsample keeps every judged doc, so recall/nDCG are well-defined;
+it is an easier corpus than the full set, so treat the **absolute** numbers as
+indicative and the **per-config deltas** as the signal. The harness runs the full
+corpus unchanged on a BLAS/GPU build.
+
+## Results
+
+| config | nDCG@10 | nDCG@100 | recall@10 | recall@100 | MRR | MAP | p50 ms | p95 ms | QPS |
+|---|---|---|---|---|---|---|---|---|---|
+| single_pass      | **0.846** | 0.863 | 0.921 | 0.993 | 0.831 | 0.822 | 146 | 183 | 12.5 |
+| two_pass         | 0.847 | 0.864 | 0.924 | 0.993 | 0.831 | 0.823 | 174 | 213 | 10.0 |
+| rerank           | 0.671 | 0.710 | 0.802 | 0.993 | 0.644 | 0.632 | 15104 | 21418 | 0.2 |
+| two_pass_rerank  | 0.672 | 0.710 | 0.803 | 0.993 | 0.644 | 0.632 | 15030 | 21039 | 0.2 |
+
+(p50/p95 are the sequential per-query latency; QPS is the 8-worker concurrent
+pass — the `Indexer` connection mutex serializes DuckDB access, so this is
+single-connection throughput.)
+
+## Findings
+
+**#218 two-pass — quality-neutral, small latency cost (as designed).**
+`single_pass` → `two_pass`: nDCG@10 +0.001, recall@10 +0.003 (noise), p50 +28 ms.
+The coarse 128-d prefix shortlist (500 of 1000 docs) preserves the full-dim
+ranking here. The latency *increase* is expected: the truncate-on-read coarse
+pass is a cheaper-per-row scan, **not** a low-dim ANN index, so on this corpus
+size it adds work rather than saving it — exactly the trade-off the #218 PR
+documented (a second 128-d HNSW index is the throughput win, deferred). Two-pass
+pays off at corpus sizes where the full-dim HNSW scan dominates, not at 1k docs.
+
+**#215 rerank — regresses quality AND latency here. Two real causes:**
+
+1. **Quality drop (nDCG@10 0.846 → 0.671).** bge-base single-pass is already a
+   strong retriever on SciFact, and the rerank stage scores the **200-char block
+   snippet**, not the full abstract (`rerank_passage` uses `SearchHit.snippet`,
+   the hydrated lead — a deliberate latency choice in the #215 stage PR). On
+   abstract-length docs the cross-encoder sees ~13% of the passage and reorders
+   *worse* than the bi-encoder that embedded the whole doc. **Actionable:** feed
+   the reranker fuller passage text (refetch the block `body`), at least for
+   document/RAG skills.
+2. **Latency (~15 s/query sequential, QPS 0.2).** A CPU cross-encoder scoring 100
+   `(query, passage)` pairs per query is ~15 s; concurrent throughput collapses
+   to 0.2 QPS. **Actionable:** rerank only makes sense on GPU, and/or with a much
+   smaller `rerank_candidates` (e.g. 20–50), and/or a lighter CE head.
+
+The harness did its job: it turned "the reranker is wired in" into a measured,
+falsifiable result — on this benchmark, the rerank stage as currently configured
+(snippet passages, CPU, 100 candidates) is a net negative, and the report points
+at the two concrete levers to change that.
+
+## Caveats
+
+- Absolute nDCG is on a 1k subsample (easier than full SciFact) and is **not** the
+  ADR-0001 460-block target — those numbers await escurel's own corpus in this
+  same BEIR format (`docs/eval/README.md`).
+- CPU-only; GPU / BLAS would change the latency picture (and make the full corpus
+  and rerank tractable).