diff --git a/Cargo.lock b/Cargo.lock index 62eb7ca..ef47e88 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2662,6 +2662,7 @@ name = "escurel-eval" version = "1.0.0" dependencies = [ "anyhow", + "async-trait", "clap", "duckdb", "escurel-embed", diff --git a/crates/escurel-eval/Cargo.toml b/crates/escurel-eval/Cargo.toml index bbb1c75..b595ff6 100644 --- a/crates/escurel-eval/Cargo.toml +++ b/crates/escurel-eval/Cargo.toml @@ -38,6 +38,7 @@ tracing = "0.1" [dev-dependencies] tempfile = "3" tokio = { version = "1", features = ["macros", "rt-multi-thread"] } +async-trait = "0.1" [lints] workspace = true diff --git a/crates/escurel-eval/src/latency.rs b/crates/escurel-eval/src/latency.rs index 243d00b..51e4b62 100644 --- a/crates/escurel-eval/src/latency.rs +++ b/crates/escurel-eval/src/latency.rs @@ -48,9 +48,30 @@ pub fn percentiles(samples: &mut [f64]) -> LatencyStats { } } -/// Run each query once (sequentially), timing the `search` call. Returns the -/// per-query ranked doc ids (deduped to doc granularity, for the metrics) and -/// the latency distribution — computed in the same sweep. +/// One query → its top-`k` doc ranking, replicating the server's native-lane +/// path (`escurel-server`'s `tool_search`): fetch the (possibly wider) rerank +/// candidate pool, apply the cross-encoder rerank stage, then truncate to `k`. +/// +/// `Indexer::search` itself does NOT rerank — the rerank stage lives in the +/// server dispatcher (`rerank_hits` after `search`), so the harness has to +/// invoke it the same way or the rerank configs are a silent no-op. With rerank +/// disabled, `rerank_candidate_pool(k) == k` and `rerank_hits` is a no-op, so +/// this is exactly `search(q, k)`. 
+pub async fn search_ranked( + indexer: &Indexer, + query: &str, + k: usize, +) -> Result, EvalError> { + let pool = indexer.rerank_candidate_pool(k); + let hits = indexer.search(query, pool, None, None, None, None).await?; + let mut hits = indexer.rerank_hits(query, hits).await?; + hits.truncate(k); + Ok(hits) +} + +/// Run each query once (sequentially), timing the full retrieve + rerank call. +/// Returns the per-query ranked doc ids (deduped to doc granularity, for the +/// metrics) and the latency distribution — computed in the same sweep. pub async fn run_queries( indexer: &Indexer, queries: &[Query], @@ -60,7 +81,7 @@ pub async fn run_queries( let mut samples = Vec::with_capacity(queries.len()); for q in queries { let start = Instant::now(); - let hits = indexer.search(&q.text, k, None, None, None, None).await?; + let hits = search_ranked(indexer, &q.text, k).await?; samples.push(start.elapsed().as_secs_f64() * 1000.0); ranked_per_query.push(dedup_doc_ids(&hits)); } @@ -101,9 +122,7 @@ pub async fn measure_qps( while Instant::now() < deadline { let idx = cursor.fetch_add(1, Ordering::Relaxed) % queries.len(); let start = Instant::now(); - let r = indexer - .search(&queries[idx].text, k, None, None, None, None) - .await; + let r = search_ranked(&indexer, &queries[idx].text, k).await; if r.is_ok() { samples.push(start.elapsed().as_secs_f64() * 1000.0); completed.fetch_add(1, Ordering::Relaxed); diff --git a/crates/escurel-eval/tests/smoke.rs b/crates/escurel-eval/tests/smoke.rs index bff7162..cc6478a 100644 --- a/crates/escurel-eval/tests/smoke.rs +++ b/crates/escurel-eval/tests/smoke.rs @@ -11,7 +11,7 @@ use std::path::PathBuf; use std::sync::Arc; use std::time::Duration; -use escurel_embed::{Embedder, HashEmbedder}; +use escurel_embed::{Candidate, EmbedError, Embedder, HashEmbedder, Ranked, Reranker}; use escurel_eval::config::RunConfig; use escurel_eval::dataset::Dataset; use escurel_eval::{QpsParams, run_matrix}; @@ -21,6 +21,32 @@ fn fixture_dir() -> 
PathBuf { PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/tiny") } +/// Deterministic reranker that REVERSES the first-stage order. Used only to +/// prove the harness actually invokes the rerank stage (`rerank_hits`) — if the +/// rerank config were a silent no-op, its ranking would equal single_pass. +#[derive(Debug)] +struct ReverseReranker; + +#[async_trait::async_trait] +impl Reranker for ReverseReranker { + async fn rerank( + &self, + _query: &str, + candidates: &[Candidate], + ) -> Result, EmbedError> { + let n = candidates.len(); + Ok(candidates + .iter() + .rev() + .enumerate() + .map(|(i, c)| Ranked { + id: c.id.clone(), + score: (n - i) as f32, + }) + .collect()) + } +} + #[tokio::test] async fn end_to_end_over_tiny_fixture() { let dataset = Dataset::load(&fixture_dir()).expect("load fixture"); @@ -38,6 +64,7 @@ async fn end_to_end_over_tiny_fixture() { coarse_dim: 128, coarse_candidates: 500, }, + RunConfig::Rerank { candidates: 10 }, ]; let report = run_matrix( @@ -45,7 +72,7 @@ async fn end_to_end_over_tiny_fixture() { &db_path, &store_dir, Arc::clone(&embedder), - None, // no reranker (rerank configs would be skipped) + Some(Arc::new(ReverseReranker)), &configs, "doc", 10, @@ -62,7 +89,7 @@ async fn end_to_end_over_tiny_fixture() { assert_eq!(report.dim, 768); assert_eq!(report.corpus_docs, 8); assert_eq!(report.queries, 4); - assert_eq!(report.results.len(), 2, "single_pass + two_pass"); + assert_eq!(report.results.len(), 3, "single_pass + two_pass + rerank"); for r in &report.results { // Every metric is a probability-like value. @@ -84,20 +111,41 @@ async fn end_to_end_over_tiny_fixture() { assert_eq!(r.latency.n, 4, "{} latency samples", r.config); // All 8 docs fit in k=10, so every query's relevant doc is found. assert_eq!(r.recall_at_10, 1.0, "{} recall@10", r.config); - // The BM25 lane ranks the matching doc near the top. 
- assert!(r.ndcg_at_10 >= 0.5, "{} ndcg@10={}", r.config, r.ndcg_at_10); // The concurrent QPS pass completed some searches. let qps = r.qps.as_ref().expect("qps measured"); assert!(qps.completed > 0, "{} qps completed 0", r.config); } + let by = |name: &str| { + report + .results + .iter() + .find(|r| r.config == name) + .unwrap_or_else(|| panic!("config {name} present")) + }; + let single = by("single_pass"); + let two = by("two_pass"); + let rerank = by("rerank"); + + // The BM25 lane ranks the matching doc near the top for first-stage configs. + assert!( + single.ndcg_at_10 >= 0.5, + "single ndcg@10={}", + single.ndcg_at_10 + ); // Two-pass with a corpus-covering shortlist preserves single-pass recall. - assert_eq!( - report.results[0].recall_at_10, - report.results[1].recall_at_10 + assert_eq!(single.recall_at_10, two.recall_at_10); + // The rerank stage is actually APPLIED: the ReverseReranker flips the order, + // so the rerank ranking must differ from single_pass (it does not silently + // no-op). Reversal pushes the relevant doc down, so nDCG drops. + assert!( + rerank.ndcg_at_10 < single.ndcg_at_10, + "rerank must change the ranking (applied={}, single={})", + rerank.ndcg_at_10, + single.ndcg_at_10 ); - // JSON renders. + // JSON renders all three configs. 
let json = report.to_json(); - assert_eq!(json["results"].as_array().unwrap().len(), 2); + assert_eq!(json["results"].as_array().unwrap().len(), 3); } diff --git a/docs/eval/baseline-scifact.json b/docs/eval/baseline-scifact.json new file mode 100644 index 0000000..7289cdb --- /dev/null +++ b/docs/eval/baseline-scifact.json @@ -0,0 +1,122 @@ +{ + "corpus_docs": 1000, + "dataset": "scifact-1k", + "dim": 768, + "k": 100, + "model_id": "BAAI/bge-base-en-v1.5", + "queries": 300, + "results": [ + { + "config": "single_pass", + "latency": { + "mean_ms": 147.60474049666655, + "n": 300, + "p50_ms": 146.055297, + "p95_ms": 182.5911, + "p99_ms": 190.951605 + }, + "map": 0.8224983668316723, + "mrr": 0.8306838866258879, + "ndcg_at_10": 0.845865680550998, + "ndcg_at_100": 0.8632518249574073, + "qps": { + "completed": 105, + "latency": { + "mean_ms": 627.4837098666667, + "n": 105, + "p50_ms": 633.607573, + "p95_ms": 722.991633, + "p99_ms": 838.0994539999999 + }, + "qps": 12.517082351369911, + "secs": 8.388536326 + }, + "recall_at_10": 0.9208888888888889, + "recall_at_100": 0.9933333333333333 + }, + { + "config": "two_pass", + "latency": { + "mean_ms": 176.21182580666675, + "n": 300, + "p50_ms": 174.259779, + "p95_ms": 213.43055999999999, + "p99_ms": 223.933607 + }, + "map": 0.822881412747501, + "mrr": 0.8310725998283941, + "ndcg_at_10": 0.8471483082800391, + "ndcg_at_100": 0.8635981384652979, + "qps": { + "completed": 85, + "latency": { + "mean_ms": 779.1540350470589, + "n": 85, + "p50_ms": 787.6516310000001, + "p95_ms": 885.695209, + "p99_ms": 1116.4758450000002 + }, + "qps": 10.017490532577238, + "secs": 8.485159005 + }, + "recall_at_10": 0.9242222222222222, + "recall_at_100": 0.9933333333333333 + }, + { + "config": "rerank", + "latency": { + "mean_ms": 15869.816934796661, + "n": 300, + "p50_ms": 15103.900015000001, + "p95_ms": 21417.791509000002, + "p99_ms": 22625.405078 + }, + "map": 0.6319104056519331, + "mrr": 0.6443926411809954, + "ndcg_at_10": 0.6713375743014715, + 
"ndcg_at_100": 0.710396174387697, + "qps": { + "completed": 8, + "latency": { + "mean_ms": 26286.12628475, + "n": 8, + "p50_ms": 24889.192969, + "p95_ms": 32166.650187000003, + "p99_ms": 32166.650187000003 + }, + "qps": 0.24870395626692937, + "secs": 32.166758101 + }, + "recall_at_10": 0.8019999999999999, + "recall_at_100": 0.9933333333333333 + }, + { + "config": "two_pass_rerank", + "latency": { + "mean_ms": 15749.06994372334, + "n": 300, + "p50_ms": 15030.227543, + "p95_ms": 21038.890063, + "p99_ms": 22432.801917999997 + }, + "map": 0.6320441085138567, + "mrr": 0.6444835149328776, + "ndcg_at_10": 0.671893236170446, + "ndcg_at_100": 0.7104921297998338, + "qps": { + "completed": 8, + "latency": { + "mean_ms": 28451.25987075, + "n": 8, + "p50_ms": 27499.714644, + "p95_ms": 36061.501938, + "p99_ms": 36061.501938 + }, + "qps": 0.2218425255278265, + "secs": 36.061616144 + }, + "recall_at_10": 0.8031111111111111, + "recall_at_100": 0.9933333333333333 + } + ] +} diff --git a/docs/eval/baseline-scifact.md b/docs/eval/baseline-scifact.md new file mode 100644 index 0000000..e728465 --- /dev/null +++ b/docs/eval/baseline-scifact.md @@ -0,0 +1,82 @@ +# Baseline — BEIR SciFact (1k subsample) + +A committed `escurel-eval` run, produced by: + +```bash +escurel-eval --dataset datasets/scifact-1k --skill paper \ + --embed-model BAAI/bge-base-en-v1.5 --reranker BAAI/bge-reranker-base \ + --k 100 --coarse-dim 128 --coarse-candidates 500 \ + --qps-workers 8 --qps-secs 8 --format json +``` + +Raw JSON: [`baseline-scifact.json`](baseline-scifact.json). 
+ +## Setup + +| | | +|---|---| +| Dataset | BEIR SciFact, **1000-doc qrels-preserving subsample** (all 283 test-judged docs + 717 distractors) | +| Queries | 300 (the SciFact test split) | +| Embedder | `BAAI/bge-base-en-v1.5` (768-d BERT) | +| Reranker | `BAAI/bge-reranker-base` (XLM-RoBERTa cross-encoder) | +| `k` / coarse | k=100, coarse_dim=128, coarse_candidates=500 | +| Hardware | CPU only (candle, no BLAS) | + +**Why a 1k subsample, not the full 5183-doc corpus:** candle CPU BERT embedding +(no BLAS in the default build) runs at ~0.5 docs/s, so the full corpus is hours +of ingest. The subsample keeps every judged doc, so recall/nDCG are well-defined; +it is an easier corpus than the full set, so treat the **absolute** numbers as +indicative and the **per-config deltas** as the signal. The harness runs the full +corpus unchanged on a BLAS/GPU build. + +## Results + +| config | nDCG@10 | nDCG@100 | recall@10 | recall@100 | MRR | MAP | p50 ms | p95 ms | QPS | +|---|---|---|---|---|---|---|---|---|---| +| single_pass | **0.846** | 0.863 | 0.921 | 0.993 | 0.831 | 0.822 | 146 | 183 | 12.5 | +| two_pass | 0.847 | 0.864 | 0.924 | 0.993 | 0.831 | 0.823 | 174 | 213 | 10.0 | +| rerank | 0.671 | 0.710 | 0.802 | 0.993 | 0.644 | 0.632 | 15104 | 21418 | 0.2 | +| two_pass_rerank | 0.672 | 0.710 | 0.803 | 0.993 | 0.644 | 0.632 | 15030 | 21039 | 0.2 | + +(p50/p95 are the sequential per-query latency; QPS is the 8-worker concurrent +pass — the `Indexer` connection mutex serializes DuckDB, so this is single-writer +throughput.) + +## Findings + +**#218 two-pass — quality-neutral, small latency cost (as designed).** +`single_pass` → `two_pass`: nDCG@10 +0.001, recall@10 +0.003 (noise), p50 +28 ms. +The coarse 128-d prefix shortlist (500 of 1000 docs) preserves the full-dim +ranking here. 
The latency *increase* is expected: the truncate-on-read coarse +pass is a cheaper-per-row scan, **not** a low-dim ANN index, so on this corpus +size it adds work rather than saving it — exactly the trade-off the #218 PR +documented (a second 128-d HNSW index is the throughput win, deferred). Two-pass +pays off at corpus sizes where the full-dim HNSW scan dominates, not at 1k docs. + +**#215 rerank — regresses quality AND latency here. Two real causes:** + +1. **Quality drop (nDCG@10 0.846 → 0.671).** bge-base single-pass is already a + strong retriever on SciFact, and the rerank stage scores the **200-char block + snippet**, not the full abstract (`rerank_passage` uses `SearchHit.snippet`, + the hydrated lead — a deliberate latency choice in the #215 stage PR). On + abstract-length docs the cross-encoder sees ~13% of the passage and reorders + *worse* than the bi-encoder that embedded the whole doc. **Actionable:** feed + the reranker fuller passage text (refetch the block `body`), at least for + document/RAG skills. +2. **Latency (~15 s/query sequential, QPS 0.2).** A CPU cross-encoder scoring 100 + `(query, passage)` pairs per query is ~15 s; concurrent throughput collapses + to 0.2 QPS. **Actionable:** rerank only makes sense on GPU, and/or with a much + smaller `rerank_candidates` (e.g. 20–50), and/or a lighter CE head. + +The harness did its job: it turned "the reranker is wired in" into a measured, +falsifiable result — on this benchmark, the rerank stage as currently configured +(snippet passages, CPU, 100 candidates) is a net negative, and the report points +at the two concrete levers to change that. + +## Caveats + +- Absolute nDCG is on a 1k subsample (easier than full SciFact) and is **not** the + ADR-0001 460-block target — those numbers await escurel's own corpus in this + same BEIR format (`docs/eval/README.md`). +- CPU-only; GPU / BLAS would change the latency picture (and make the full corpus + + rerank tractable).