Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/escurel-eval/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ tracing = "0.1"
[dev-dependencies]
tempfile = "3"
tokio = { version = "1", features = ["macros", "rt-multi-thread"] }
async-trait = "0.1"

[lints]
workspace = true
33 changes: 26 additions & 7 deletions crates/escurel-eval/src/latency.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,9 +48,30 @@ pub fn percentiles(samples: &mut [f64]) -> LatencyStats {
}
}

/// Run each query once (sequentially), timing the `search` call. Returns the
/// per-query ranked doc ids (deduped to doc granularity, for the metrics) and
/// the latency distribution — computed in the same sweep.
/// Produce the top-`k` ranking for a single query the same way the server's
/// native lane does (`escurel-server`'s `tool_search`): retrieve the rerank
/// candidate pool, run the cross-encoder rerank stage over it, and keep only
/// the first `k` hits.
///
/// The rerank stage is not part of `Indexer::search`; the server dispatcher
/// calls `rerank_hits` after `search`. The harness must do the same, otherwise
/// every rerank configuration would silently measure plain retrieval. When
/// rerank is disabled, `rerank_candidate_pool(k) == k` and `rerank_hits` leaves
/// the hits untouched, so the result is identical to `search(q, k)`.
///
/// # Errors
///
/// Propagates any failure from the retrieval or rerank call as an `EvalError`.
pub async fn search_ranked(
    indexer: &Indexer,
    query: &str,
    k: usize,
) -> Result<Vec<escurel_index::SearchHit>, EvalError> {
    // First stage: fetch as many candidates as the rerank stage wants to see
    // (this may be wider than `k`).
    let candidates = indexer
        .search(
            query,
            indexer.rerank_candidate_pool(k),
            None,
            None,
            None,
            None,
        )
        .await?;

    // Second stage: reorder the pool, then cut back to the requested depth.
    let mut reranked = indexer.rerank_hits(query, candidates).await?;
    reranked.truncate(k);
    Ok(reranked)
}

/// Run each query once (sequentially), timing the full retrieve + rerank call.
/// Returns the per-query ranked doc ids (deduped to doc granularity, for the
/// metrics) and the latency distribution — computed in the same sweep.
pub async fn run_queries(
indexer: &Indexer,
queries: &[Query],
Expand All @@ -60,7 +81,7 @@ pub async fn run_queries(
let mut samples = Vec::with_capacity(queries.len());
for q in queries {
let start = Instant::now();
let hits = indexer.search(&q.text, k, None, None, None, None).await?;
let hits = search_ranked(indexer, &q.text, k).await?;
samples.push(start.elapsed().as_secs_f64() * 1000.0);
ranked_per_query.push(dedup_doc_ids(&hits));
}
Expand Down Expand Up @@ -101,9 +122,7 @@ pub async fn measure_qps(
while Instant::now() < deadline {
let idx = cursor.fetch_add(1, Ordering::Relaxed) % queries.len();
let start = Instant::now();
let r = indexer
.search(&queries[idx].text, k, None, None, None, None)
.await;
let r = search_ranked(&indexer, &queries[idx].text, k).await;
if r.is_ok() {
samples.push(start.elapsed().as_secs_f64() * 1000.0);
completed.fetch_add(1, Ordering::Relaxed);
Expand Down
68 changes: 58 additions & 10 deletions crates/escurel-eval/tests/smoke.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ use std::path::PathBuf;
use std::sync::Arc;
use std::time::Duration;

use escurel_embed::{Embedder, HashEmbedder};
use escurel_embed::{Candidate, EmbedError, Embedder, HashEmbedder, Ranked, Reranker};
use escurel_eval::config::RunConfig;
use escurel_eval::dataset::Dataset;
use escurel_eval::{QpsParams, run_matrix};
Expand All @@ -21,6 +21,32 @@ fn fixture_dir() -> PathBuf {
PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("fixtures/tiny")
}

/// Test-only, deterministic reranker that returns its candidates in the
/// REVERSE of the first-stage order.
///
/// Its sole purpose is to prove that the harness really invokes the rerank
/// stage (`rerank_hits`): if a rerank config were a silent no-op, the ranking
/// it produces would be identical to `single_pass`, whereas a reversed ranking
/// is observably different.
#[derive(Debug)]
struct ReverseReranker;

#[async_trait::async_trait]
impl Reranker for ReverseReranker {
    /// Returns one `Ranked` entry per candidate, in the opposite order to the
    /// input slice. Scores count down from `candidates.len()` to `1`, so the
    /// reversed order is also the descending-score order. The query is ignored.
    async fn rerank(
        &self,
        _query: &str,
        candidates: &[Candidate],
    ) -> Result<Vec<Ranked>, EmbedError> {
        let total = candidates.len();
        let mut flipped = Vec::with_capacity(total);
        // Walk the first-stage ranking back to front; the last candidate gets
        // the highest score (`total`), the first candidate the lowest (`1`).
        for (position, candidate) in candidates.iter().rev().enumerate() {
            flipped.push(Ranked {
                id: candidate.id.clone(),
                score: (total - position) as f32,
            });
        }
        Ok(flipped)
    }
}

#[tokio::test]
async fn end_to_end_over_tiny_fixture() {
let dataset = Dataset::load(&fixture_dir()).expect("load fixture");
Expand All @@ -38,14 +64,15 @@ async fn end_to_end_over_tiny_fixture() {
coarse_dim: 128,
coarse_candidates: 500,
},
RunConfig::Rerank { candidates: 10 },
];

let report = run_matrix(
&dataset,
&db_path,
&store_dir,
Arc::clone(&embedder),
None, // no reranker (rerank configs would be skipped)
Some(Arc::new(ReverseReranker)),
&configs,
"doc",
10,
Expand All @@ -62,7 +89,7 @@ async fn end_to_end_over_tiny_fixture() {
assert_eq!(report.dim, 768);
assert_eq!(report.corpus_docs, 8);
assert_eq!(report.queries, 4);
assert_eq!(report.results.len(), 2, "single_pass + two_pass");
assert_eq!(report.results.len(), 3, "single_pass + two_pass + rerank");

for r in &report.results {
// Every metric is a probability-like value.
Expand All @@ -84,20 +111,41 @@ async fn end_to_end_over_tiny_fixture() {
assert_eq!(r.latency.n, 4, "{} latency samples", r.config);
// All 8 docs fit in k=10, so every query's relevant doc is found.
assert_eq!(r.recall_at_10, 1.0, "{} recall@10", r.config);
// The BM25 lane ranks the matching doc near the top.
assert!(r.ndcg_at_10 >= 0.5, "{} ndcg@10={}", r.config, r.ndcg_at_10);
// The concurrent QPS pass completed some searches.
let qps = r.qps.as_ref().expect("qps measured");
assert!(qps.completed > 0, "{} qps completed 0", r.config);
}

let by = |name: &str| {
report
.results
.iter()
.find(|r| r.config == name)
.unwrap_or_else(|| panic!("config {name} present"))
};
let single = by("single_pass");
let two = by("two_pass");
let rerank = by("rerank");

// The BM25 lane ranks the matching doc near the top for first-stage configs.
assert!(
single.ndcg_at_10 >= 0.5,
"single ndcg@10={}",
single.ndcg_at_10
);
// Two-pass with a corpus-covering shortlist preserves single-pass recall.
assert_eq!(
report.results[0].recall_at_10,
report.results[1].recall_at_10
assert_eq!(single.recall_at_10, two.recall_at_10);
// The rerank stage is actually APPLIED: the ReverseReranker flips the order,
// so the rerank ranking must differ from single_pass (it does not silently
// no-op). Reversal pushes the relevant doc down, so nDCG drops.
assert!(
rerank.ndcg_at_10 < single.ndcg_at_10,
"rerank must change the ranking (applied={}, single={})",
rerank.ndcg_at_10,
single.ndcg_at_10
);

// JSON renders.
// JSON renders all three configs.
let json = report.to_json();
assert_eq!(json["results"].as_array().unwrap().len(), 2);
assert_eq!(json["results"].as_array().unwrap().len(), 3);
}
122 changes: 122 additions & 0 deletions docs/eval/baseline-scifact.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
{
"corpus_docs": 1000,
"dataset": "scifact-1k",
"dim": 768,
"k": 100,
"model_id": "BAAI/bge-base-en-v1.5",
"queries": 300,
"results": [
{
"config": "single_pass",
"latency": {
"mean_ms": 147.60474049666655,
"n": 300,
"p50_ms": 146.055297,
"p95_ms": 182.5911,
"p99_ms": 190.951605
},
"map": 0.8224983668316723,
"mrr": 0.8306838866258879,
"ndcg_at_10": 0.845865680550998,
"ndcg_at_100": 0.8632518249574073,
"qps": {
"completed": 105,
"latency": {
"mean_ms": 627.4837098666667,
"n": 105,
"p50_ms": 633.607573,
"p95_ms": 722.991633,
"p99_ms": 838.0994539999999
},
"qps": 12.517082351369911,
"secs": 8.388536326
},
"recall_at_10": 0.9208888888888889,
"recall_at_100": 0.9933333333333333
},
{
"config": "two_pass",
"latency": {
"mean_ms": 176.21182580666675,
"n": 300,
"p50_ms": 174.259779,
"p95_ms": 213.43055999999999,
"p99_ms": 223.933607
},
"map": 0.822881412747501,
"mrr": 0.8310725998283941,
"ndcg_at_10": 0.8471483082800391,
"ndcg_at_100": 0.8635981384652979,
"qps": {
"completed": 85,
"latency": {
"mean_ms": 779.1540350470589,
"n": 85,
"p50_ms": 787.6516310000001,
"p95_ms": 885.695209,
"p99_ms": 1116.4758450000002
},
"qps": 10.017490532577238,
"secs": 8.485159005
},
"recall_at_10": 0.9242222222222222,
"recall_at_100": 0.9933333333333333
},
{
"config": "rerank",
"latency": {
"mean_ms": 15869.816934796661,
"n": 300,
"p50_ms": 15103.900015000001,
"p95_ms": 21417.791509000002,
"p99_ms": 22625.405078
},
"map": 0.6319104056519331,
"mrr": 0.6443926411809954,
"ndcg_at_10": 0.6713375743014715,
"ndcg_at_100": 0.710396174387697,
"qps": {
"completed": 8,
"latency": {
"mean_ms": 26286.12628475,
"n": 8,
"p50_ms": 24889.192969,
"p95_ms": 32166.650187000003,
"p99_ms": 32166.650187000003
},
"qps": 0.24870395626692937,
"secs": 32.166758101
},
"recall_at_10": 0.8019999999999999,
"recall_at_100": 0.9933333333333333
},
{
"config": "two_pass_rerank",
"latency": {
"mean_ms": 15749.06994372334,
"n": 300,
"p50_ms": 15030.227543,
"p95_ms": 21038.890063,
"p99_ms": 22432.801917999997
},
"map": 0.6320441085138567,
"mrr": 0.6444835149328776,
"ndcg_at_10": 0.671893236170446,
"ndcg_at_100": 0.7104921297998338,
"qps": {
"completed": 8,
"latency": {
"mean_ms": 28451.25987075,
"n": 8,
"p50_ms": 27499.714644,
"p95_ms": 36061.501938,
"p99_ms": 36061.501938
},
"qps": 0.2218425255278265,
"secs": 36.061616144
},
"recall_at_10": 0.8031111111111111,
"recall_at_100": 0.9933333333333333
}
]
}
82 changes: 82 additions & 0 deletions docs/eval/baseline-scifact.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Baseline — BEIR SciFact (1k subsample)

A committed `escurel-eval` run, produced by:

```bash
escurel-eval --dataset datasets/scifact-1k --skill paper \
--embed-model BAAI/bge-base-en-v1.5 --reranker BAAI/bge-reranker-base \
--k 100 --coarse-dim 128 --coarse-candidates 500 \
--qps-workers 8 --qps-secs 8 --format json
```

Raw JSON: [`baseline-scifact.json`](baseline-scifact.json).

## Setup

| | |
|---|---|
| Dataset | BEIR SciFact, **1000-doc qrels-preserving subsample** (all 283 test-judged docs + 717 distractors) |
| Queries | 300 (the SciFact test split) |
| Embedder | `BAAI/bge-base-en-v1.5` (768-d BERT) |
| Reranker | `BAAI/bge-reranker-base` (XLM-RoBERTa cross-encoder) |
| `k` / coarse | k=100, coarse_dim=128, coarse_candidates=500 |
| Hardware | CPU only (candle, no BLAS) |

**Why a 1k subsample, not the full 5183-doc corpus:** candle CPU BERT embedding
(no BLAS in the default build) runs at ~0.5 docs/s, so the full corpus is hours
of ingest. The subsample keeps every judged doc, so recall/nDCG are well-defined;
it is an easier corpus than the full set, so treat the **absolute** numbers as
indicative and the **per-config deltas** as the signal. The harness runs the full
corpus unchanged on a BLAS/GPU build.

## Results

| config | nDCG@10 | nDCG@100 | recall@10 | recall@100 | MRR | MAP | p50 ms | p95 ms | QPS |
|---|---|---|---|---|---|---|---|---|---|
| single_pass | **0.846** | 0.863 | 0.921 | 0.993 | 0.831 | 0.822 | 146 | 183 | 12.5 |
| two_pass | 0.847 | 0.864 | 0.924 | 0.993 | 0.831 | 0.823 | 174 | 213 | 10.0 |
| rerank | 0.671 | 0.710 | 0.802 | 0.993 | 0.644 | 0.632 | 15104 | 21418 | 0.2 |
| two_pass_rerank | 0.672 | 0.710 | 0.803 | 0.993 | 0.644 | 0.632 | 15030 | 21039 | 0.2 |

(p50/p95 are the sequential per-query latency; QPS is the 8-worker concurrent
pass — the `Indexer` connection mutex serializes DuckDB access, so this is
effectively serialized, single-connection throughput rather than parallel scaling.)

## Findings

**#218 two-pass — quality-neutral, small latency cost (as designed).**
`single_pass` → `two_pass`: nDCG@10 +0.001, recall@10 +0.003 (noise), p50 +28 ms.
The coarse 128-d prefix shortlist (500 of 1000 docs) preserves the full-dim
ranking here. The latency *increase* is expected: the truncate-on-read coarse
pass is a cheaper-per-row scan, **not** a low-dim ANN index, so on this corpus
size it adds work rather than saving it — exactly the trade-off the #218 PR
documented (a second 128-d HNSW index is the throughput win, deferred). Two-pass
pays off at corpus sizes where the full-dim HNSW scan dominates, not at 1k docs.

**#215 rerank — regresses quality AND latency here. Two real causes:**

1. **Quality drop (nDCG@10 0.846 → 0.671).** bge-base single-pass is already a
strong retriever on SciFact, and the rerank stage scores the **200-char block
snippet**, not the full abstract (`rerank_passage` uses `SearchHit.snippet`,
the hydrated lead — a deliberate latency choice in the #215 stage PR). On
abstract-length docs the cross-encoder sees ~13% of the passage and reorders
*worse* than the bi-encoder that embedded the whole doc. **Actionable:** feed
the reranker fuller passage text (refetch the block `body`), at least for
document/RAG skills.
2. **Latency (~15 s/query sequential, QPS 0.2).** A CPU cross-encoder scoring 100
`(query, passage)` pairs per query is ~15 s; concurrent throughput collapses
to 0.2 QPS. **Actionable:** rerank only makes sense on GPU, and/or with a much
smaller `rerank_candidates` (e.g. 20–50), and/or a lighter CE head.

The harness did its job: it turned "the reranker is wired in" into a measured,
falsifiable result — on this benchmark, the rerank stage as currently configured
(snippet passages, CPU, 100 candidates) is a net negative, and the report points
at the two concrete levers to change that.

## Caveats

- Absolute nDCG is on a 1k subsample (easier than full SciFact) and is **not** the
ADR-0001 460-block target — those numbers await escurel's own corpus in this
same BEIR format (`docs/eval/README.md`).
- CPU-only; GPU / BLAS would change the latency picture (and make the full corpus
+ rerank tractable).
Loading