From 2c8a05bc075aae4e65a8a39de21ca9415914d204 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Tue, 9 Jun 2026 23:28:25 +0500
Subject: [PATCH] feat(bench): add A4 iterative-retrieval arm + multi-hop
 scoring core
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Stage-1 building blocks for the A1/A3/A4 multi-hop eval (D2). No wiring
yet — the dataset loaders, the adapter, and the BenchmarkOpts.iterative
flag land next.

- retrieval/iterative.ts: dependency-injected iterative/agentic retrieval
  (retrieve -> LLM names the bridge -> re-retrieve). recall and
  proposeNextQuery are injected so the control flow is unit-testable with no
  OpenAI/Neo4j. A round-robin rank interleave reserves top-K slots for each
  hop's evidence so the round-2 bridge survives into top-K; cycle, maxRounds,
  and STOP guards bound the loop.
- multihop/types.ts: normalized MultiHopItem for MuSiQue / 2WikiMultiHopQA /
  HotpotQA-distractor — one shape so the arms stay dataset-agnostic.
- multihop/scoring.ts: judge-free retrieval metrics — recall@K, all-support@K,
  and bridge-recall@K (hop > 1 supporting paragraphs), the metric that isolates
  the dissimilar hop-2 evidence single-shot dense misses. Bridge recall is
  not-applicable for datasets that do not label hops (-1 per item, null in the
  aggregate).

Tests: 13 new (7 iterative control-flow incl. bridge recovery, 6 scoring),
all Mac-runnable (type-only core imports, no native bindings). tsc clean.
---
 packages/bench/src/multihop/scoring.ts       | 102 +++++++++++++
 packages/bench/src/multihop/types.ts         |  75 ++++++++++
 packages/bench/src/retrieval/iterative.ts    | 145 +++++++++++++++++++
 packages/bench/test/iterative.test.ts        | 125 ++++++++++++++++
 packages/bench/test/multihop-scoring.test.ts | 130 +++++++++++++++++
 5 files changed, 577 insertions(+)
 create mode 100644 packages/bench/src/multihop/scoring.ts
 create mode 100644 packages/bench/src/multihop/types.ts
 create mode 100644 packages/bench/src/retrieval/iterative.ts
 create mode 100644 packages/bench/test/iterative.test.ts
 create mode 100644 packages/bench/test/multihop-scoring.test.ts

diff --git a/packages/bench/src/multihop/scoring.ts b/packages/bench/src/multihop/scoring.ts
new file mode 100644
index 0000000..404c913
--- /dev/null
+++ b/packages/bench/src/multihop/scoring.ts
@@ -0,0 +1,102 @@
+import type {
+  MultiHopItem,
+  MultiHopPrediction,
+  MultiHopArmMetrics,
+} from './types.js'
+
+/** Idxs of the paragraphs flagged `isSupporting`, in paragraph-bag order. */
+export function supportingIdxs(item: MultiHopItem): number[] {
+  return item.paragraphs.flatMap((para) => (para.isSupporting ? [para.idx] : []))
+}
+
+/**
+ * Idxs of "bridge" evidence: supporting paragraphs labeled with hop > 1, i.e.
+ * not cued directly by the top-level question. A supporting paragraph without
+ * a label counts as hop 1. When no supporting paragraph carries a hop label
+ * the result is null, so callers can report not-applicable instead of a zero.
+ */
+export function bridgeIdxs(item: MultiHopItem): number[] | null {
+  const gold = item.paragraphs.filter((para) => para.isSupporting)
+  // Nothing labeled (or no gold at all): hop structure is unknown for this item.
+  if (gold.every((para) => para.hop === undefined)) return null
+  return gold.flatMap((para) => ((para.hop ?? 1) > 1 ? [para.idx] : []))
+}
+
+/** Share of `gold` idxs found in the first `k` of `retrieved`; 0 when `gold` is empty. */
+function fractionInTopK(retrieved: number[], gold: number[], k: number): number {
+  if (gold.length === 0) return 0
+  const head = new Set(retrieved.slice(0, k))
+  const found = gold.filter((idx) => head.has(idx)).length
+  return found / gold.length
+}
+
+/** True only when `gold` is non-empty and each gold idx is within the first `k` retrieved. */
+function allInTopK(retrieved: number[], gold: number[], k: number): boolean {
+  const head = new Set(retrieved.slice(0, k))
+  return gold.length > 0 && !gold.some((idx) => !head.has(idx))
+}
+
+/** Bridge-recall sentinel: no bridge to find (hops unlabeled, or none at hop > 1). */
+const BRIDGE_NA = -1
+
+/**
+ * Scores a ranked idx list against `item`'s gold paragraphs at each cutoff in `ks`.
+ * An empty bridge set is treated like unlabeled hops: BRIDGE_NA, not a misleading 0.
+ */
+export function scoreRetrieval(
+  item: MultiHopItem,
+  retrievedParagraphIdxs: number[],
+  ks: number[],
+): Pick<MultiHopPrediction, 'allSupportAtK' | 'supportRecallAtK' | 'bridgeRecallAtK'> {
+  const support = supportingIdxs(item)
+  const bridges = bridgeIdxs(item)
+  const bridgeGold = bridges !== null && bridges.length > 0 ? bridges : null
+  const allSupportAtK: Record<number, boolean> = {}
+  const supportRecallAtK: Record<number, number> = {}
+  const bridgeRecallAtK: Record<number, number> = {}
+  for (const k of ks) {
+    allSupportAtK[k] = allInTopK(retrievedParagraphIdxs, support, k)
+    supportRecallAtK[k] = fractionInTopK(retrievedParagraphIdxs, support, k)
+    bridgeRecallAtK[k] =
+      bridgeGold === null ? BRIDGE_NA : fractionInTopK(retrievedParagraphIdxs, bridgeGold, k)
+  }
+  return { allSupportAtK, supportRecallAtK, bridgeRecallAtK }
+}
+
+/**
+ * Averages per-item scores into one arm's metrics. Support metrics are means
+ * over all predictions (0 when there are none). Bridge recall at each K is the
+ * mean over predictions whose value is >= 0, or null when no prediction has
+ * one. meanRounds counts each prediction as at least one round.
+ */
+export function aggregateArmMetrics(
+  arm: string,
+  predictions: MultiHopPrediction[],
+  ks: number[],
+): MultiHopArmMetrics {
+  const n = predictions.length
+  const mean = (total: number): number => (n > 0 ? total / n : 0)
+  const allSupportAtK: Record<number, number> = {}
+  const supportRecallAtK: Record<number, number> = {}
+  const bridgeRecallAtK: Record<number, number | null> = {}
+
+  for (const k of ks) {
+    let fullHits = 0
+    let recallTotal = 0
+    const bridgeScores: number[] = []
+    predictions.forEach((pred) => {
+      fullHits += pred.allSupportAtK[k] ? 1 : 0
+      recallTotal += pred.supportRecallAtK[k] ?? 0
+      const bridge = pred.bridgeRecallAtK[k]
+      if (bridge !== undefined && bridge >= 0) bridgeScores.push(bridge)
+    })
+    allSupportAtK[k] = mean(fullHits)
+    supportRecallAtK[k] = mean(recallTotal)
+    bridgeRecallAtK[k] = bridgeScores.length
+      ? bridgeScores.reduce((acc, v) => acc + v, 0) / bridgeScores.length
+      : null
+  }
+
+  const roundTotal = predictions.reduce((acc, pred) => acc + Math.max(1, pred.queries.length), 0)
+  return { arm, n, allSupportAtK, supportRecallAtK, bridgeRecallAtK, meanRounds: mean(roundTotal) }
+}
diff --git a/packages/bench/src/multihop/types.ts b/packages/bench/src/multihop/types.ts
new file mode 100644
index 0000000..953f19d
--- /dev/null
+++ b/packages/bench/src/multihop/types.ts
@@ -0,0 +1,75 @@
+/**
+ * Normalized multi-hop QA types for the D2 (research multi-hop) distribution.
+ *
+ * MuSiQue-Ans, 2WikiMultiHopQA and HotpotQA-distractor all share the same
+ * "distractor setting" shape: a question, a small bag of paragraphs (a few gold
+ * supporting + many distractors), and a gold answer. We normalize all three to
+ * one item so the adapter and the arms (A1/A3/A4) are dataset-agnostic.
+ *
+ * Why the distractor setting (per-question paragraph bag) rather than a pooled
+ * corpus: it is the canonical benchmark setting, it is far cheaper to ingest,
+ * and — crucially — it still contains the hard part. The hop-2+ "bridge"
+ * paragraph is not lexically/semantically similar to the original question, so
+ * single-shot dense ranks it low; recovering it is exactly what a graph (A3) or
+ * iterative retrieval (A4) must do.
+ */
+export type MultiHopDataset = 'musique' | '2wiki' | 'hotpotqa'
+
+/** One paragraph from an item's bag: gold supporting evidence or a distractor. */
+export interface MultiHopParagraph {
+  /** Stable index within this item's paragraph bag. */
+  idx: number
+  title: string
+  text: string
+  /** Gold supporting paragraph for the answer. */
+  isSupporting: boolean
+  /**
+   * 1-based hop position among supporting paragraphs when the dataset labels
+   * decomposition order (MuSiQue). Hops > 1 are "bridge" evidence, not cued by
+   * the top-level question. undefined when unlabeled (2wiki/hotpot) or non-supporting.
+   */
+  hop?: number
+}
+
+export interface MultiHopItem {
+  id: string
+  question: string
+  answer: string
+  /** Acceptable answer variants for EM/F1 (MuSiQue answer_aliases, etc.). */
+  answerAliases: string[]
+  paragraphs: MultiHopParagraph[]
+  dataset: MultiHopDataset
+}
+
+/** One arm's retrieval outcome for one item, with its per-K scores. */
+export interface MultiHopPrediction {
+  itemId: string
+  question: string
+  goldAnswer: string
+  dataset: MultiHopDataset
+  arm: string
+  /** Retrieved paragraph idxs in rank order (deduped). */
+  retrievedParagraphIdxs: number[]
+  /** Every gold supporting paragraph present in top-K, per K. */
+  allSupportAtK: Record<number, boolean>
+  /** Fraction of gold supporting paragraphs in top-K, per K. */
+  supportRecallAtK: Record<number, number>
+  /** Bridge (hop > 1) supporting-paragraph recall in top-K, per K; -1 = not applicable (hops unlabeled). */
+  bridgeRecallAtK: Record<number, number>
+  /** Queries issued, for the iterative arm (A4); single-element otherwise. */
+  queries: string[]
+}
+
+/** Per-arm aggregate of the per-item retrieval scores. */
+export interface MultiHopArmMetrics {
+  arm: string
+  n: number
+  /** Mean "all gold supporting in top-K", per K. */
+  allSupportAtK: Record<number, number>
+  /** Mean fraction of supporting paragraphs in top-K, per K. */
+  supportRecallAtK: Record<number, number>
+  /** Mean bridge recall in top-K, per K, over hop-labeled items; null when the run has none. */
+  bridgeRecallAtK: Record<number, number | null>
+  /** Mean number of retrieval rounds (1 for single-shot arms). */
+  meanRounds: number
+}
diff --git a/packages/bench/src/retrieval/iterative.ts b/packages/bench/src/retrieval/iterative.ts
new file mode 100644
index 0000000..00a9864
--- /dev/null
+++ b/packages/bench/src/retrieval/iterative.ts
@@ -0,0 +1,145 @@
+import type { BenchRecallResult, BenchScoredMemory } from '../merge-associations.js'
+
+/**
+ * Iterative / agentic retrieval (the A4 arm).
+ *
+ * The cheap, no-graph multi-hop competitor every modern agent already uses:
+ * retrieve → let an LLM name the bridge entity / next sub-question → re-retrieve,
+ * for a few rounds. It recovers hop-2+ evidence that single-shot dense misses
+ * (the bridge paragraph is not similar to the original question, so vector
+ * search alone cannot surface it) WITHOUT a graph, a second datastore, or
+ * per-ingest extraction.
+ *
+ * It is the linchpin of the eval: if a correctly-wired graph (A3, PPR bound)
+ * cannot beat A4 on bridge-entity multi-hop — the graph's home turf — then the
+ * universal case for a graph engine collapses, because A4 is cheaper and
+ * generalizes across distributions.
+ *
+ * Dependencies are injected (`recall`, `proposeNextQuery`) so the control flow
+ * is unit-testable with no OpenAI/Neo4j: the LLM "name the bridge" step and the
+ * underlying memory recall are both stubs in tests and real clients in the
+ * adapter.
+ */
+export interface IterativeRecallDeps {
+  /** Bound recall over the memory under test. For A4 the graph is OFF. */
+  recall: (query: string) => Promise<BenchRecallResult>
+  /**
+   * The agentic step: given the original multi-hop question and the memories so
+   * far (deduped by id), return the NEXT single-hop retrieval query (sub-question
+   * or bridge entity), or null/blank to stop (enough evidence / no further hop).
+   */
+  proposeNextQuery: (
+    question: string,
+    retrievedSoFar: readonly BenchScoredMemory[],
+  ) => Promise<string | null>
+}
+
+export interface IterativeRecallOpts {
+  /** Max retrieval rounds, including the first. Default 3; values below 1 run one round. */
+  maxRounds?: number
+  /** Cap on memories returned after the cross-round merge. Default 20. */
+  limit?: number
+  /**
+   * Mirror the adapter's scoring pool: when true, append the graph
+   * spreading-activation channel to each round's pool. A4 runs graph-off so
+   * this is normally false; kept for parity if iterative is ever combined with
+   * the graph arm.
+   */
+  mergeAssociationsIntoTopK?: boolean
+}
+
+export interface IterativeRecallTrace {
+  /** Queries issued in order, starting with the original question. */
+  queries: string[]
+  /** Size of each round's pool: recalled memories, plus associations when merged. */
+  perRoundCounts: number[]
+  /** Number of recall rounds actually run. */
+  rounds: number
+}
+
+export interface IterativeRecallResult {
+  memories: BenchScoredMemory[]
+  trace: IterativeRecallTrace
+}
+
+/** Collapses duplicates by id, keeping the highest-relevance copy (first one on
+ *  ties); ids stay in first-seen order. */
+function dedupeByBestRelevance(memories: readonly BenchScoredMemory[]): BenchScoredMemory[] {
+  const byId = new Map<string, BenchScoredMemory>()
+  memories.forEach((candidate) => {
+    const kept = byId.get(candidate.id)
+    if (!kept || candidate.relevance > kept.relevance) byId.set(candidate.id, candidate)
+  })
+  return Array.from(byId.values())
+}
+
+/**
+ * Merges the per-round ranked lists rank by rank: every round's rank-0 entry
+ * (in round order), then every rank-1 entry, and so on, skipping ids already
+ * emitted and stopping once `limit` memories are collected.
+ *
+ * Compared with concatenating the rounds, this keeps a later round's top hit
+ * (e.g. the round-2 bridge paragraph) near the top of the merged list instead
+ * of behind round 1's full ranking.
+ */
+function interleaveByRank(
+  perRound: readonly (readonly BenchScoredMemory[])[],
+  limit: number,
+): BenchScoredMemory[] {
+  const merged: BenchScoredMemory[] = []
+  const emitted = new Set<string>()
+  const deepest = Math.max(0, ...perRound.map((round) => round.length))
+  for (let rank = 0; rank < deepest; rank++) {
+    for (const round of perRound) {
+      if (!(merged.length < limit)) return merged
+      if (rank >= round.length) continue
+      const candidate = round[rank]
+      if (emitted.has(candidate.id)) continue
+      emitted.add(candidate.id)
+      merged.push(candidate)
+    }
+  }
+  return merged
+}
+
+/**
+ * Runs up to `maxRounds` recall rounds starting from `question` and merges the
+ * per-round rankings with a rank interleave capped at `limit`. Between rounds,
+ * `proposeNextQuery` sees the question plus everything recalled so far (deduped
+ * by id); a null/blank proposal or a repeat of an issued query ends the loop.
+ */
+export async function iterativeRecall(
+  question: string,
+  deps: IterativeRecallDeps,
+  opts: IterativeRecallOpts = {},
+): Promise<IterativeRecallResult> {
+  // Whole rounds only: a fractional maxRounds would never equal the last-round
+  // check below (one wasted propose call) and NaN would run zero rounds.
+  const maxRounds = Math.max(1, Math.floor(opts.maxRounds ?? 3) || 1)
+  const limit = opts.limit ?? 20
+  const perRound: BenchScoredMemory[][] = []
+  const queries: string[] = []
+  let query = question
+
+  for (let round = 0; round < maxRounds; round++) {
+    queries.push(query)
+    const result = await deps.recall(query)
+    const pool = opts.mergeAssociationsIntoTopK
+      ? [...result.memories, ...result.associations]
+      : result.memories
+    perRound.push(pool)
+    if (round === maxRounds - 1) break
+
+    const next = await deps.proposeNextQuery(question, dedupeByBestRelevance(perRound.flat()))
+    const trimmed = next?.trim()
+    // Stop on no proposal, or on a cycle. Issued queries are compared trimmed so
+    // a whitespace-padded question echoed back by the agent still counts.
+    if (!trimmed || queries.some((issued) => issued.trim() === trimmed)) break
+    query = trimmed
+  }
+
+  return {
+    memories: interleaveByRank(perRound, limit),
+    trace: { queries, perRoundCounts: perRound.map((r) => r.length), rounds: perRound.length },
+  }
+}
diff --git a/packages/bench/test/iterative.test.ts b/packages/bench/test/iterative.test.ts
new file mode 100644
index 0000000..10f739e
--- /dev/null
+++ b/packages/bench/test/iterative.test.ts
@@ -0,0 +1,125 @@
+import { describe, it, expect, vi } from 'vitest'
+import { iterativeRecall } from '../src/retrieval/iterative.js'
+import type { BenchRecallResult, BenchScoredMemory } from '../src/merge-associations.js'
+
+function mem(id: string, relevance: number): BenchScoredMemory {
+  return {
+    id,
+    type: 'episode',
+    content: `content-${id}`,
+    relevance,
+    source: 'vector',
+    metadata: {},
+  } as unknown as BenchScoredMemory
+}
+
+function recallResult(
+  memories: BenchScoredMemory[],
+  associations: BenchScoredMemory[] = [],
+): BenchRecallResult {
+  return {
+    memories,
+    associations,
+    intent: undefined,
+    primed: [],
+    estimatedTokens: 0,
+    formatted: '',
+  } as unknown as BenchRecallResult
+}
+
+describe('iterativeRecall (A4 control flow, no OpenAI/Neo4j)', () => {
+  it('runs a single round when the agent proposes STOP (null)', async () => {
+    const recall = vi.fn(async () => recallResult([mem('a', 0.9), mem('b', 0.8)]))
+    const proposeNextQuery = vi.fn(async () => null)
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery })
+
+    expect(recall).toHaveBeenCalledTimes(1)
+    expect(recall).toHaveBeenCalledWith('q')
+    expect(out.memories.map((m) => m.id)).toEqual(['a', 'b'])
+    expect(out.trace.rounds).toBe(1)
+    expect(out.trace.queries).toEqual(['q'])
+  })
+
+  it('recovers a hop-2 bridge memory that single-shot dense missed', async () => {
+    // Round 1 surfaces hop-1 evidence but NOT the bridge paragraph (it is not
+    // similar to the original question). The agent names the bridge; round 2
+    // retrieves it. Recovering it is the entire reason A4 exists.
+    const recall = vi.fn(async (query: string) =>
+      query === 'q'
+        ? recallResult([mem('hop1', 0.9)])
+        : recallResult([mem('bridge', 0.95)]),
+    )
+    const proposeNextQuery = vi.fn(async () => 'who directed the film')
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery }, { maxRounds: 2 })
+
+    const ids = out.memories.map((m) => m.id)
+    expect(ids).toContain('hop1')
+    expect(ids).toContain('bridge')
+    expect(recall).toHaveBeenCalledTimes(2)
+    expect(out.trace.queries).toEqual(['q', 'who directed the film'])
+  })
+
+  it('stops at maxRounds even when the agent keeps proposing new queries', async () => {
+    let n = 0
+    const recall = vi.fn(async () => recallResult([mem(`m${n++}`, 0.5)]))
+    let c = 0
+    const proposeNextQuery = vi.fn(async () => `q-${c++}`)
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery }, { maxRounds: 3 })
+
+    expect(recall).toHaveBeenCalledTimes(3)
+    expect(out.trace.rounds).toBe(3)
+    // Not proposed after the final round.
+    expect(proposeNextQuery).toHaveBeenCalledTimes(2)
+  })
+
+  it('breaks the loop when the agent re-proposes an already-issued query', async () => {
+    const recall = vi.fn(async () => recallResult([mem('a', 0.5)]))
+    const proposeNextQuery = vi.fn(async () => 'q') // identical to the original question
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery }, { maxRounds: 5 })
+
+    expect(recall).toHaveBeenCalledTimes(1)
+    expect(out.trace.rounds).toBe(1)
+  })
+
+  it('interleaves rounds by rank and dedupes by id', async () => {
+    const recall = vi.fn(async (query: string) =>
+      query === 'q'
+        ? recallResult([mem('a', 0.9), mem('b', 0.7)])
+        : recallResult([mem('c', 0.8), mem('a', 0.6)]),
+    )
+    const proposeNextQuery = vi.fn(async () => 'bridge')
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery }, { maxRounds: 2, limit: 10 })
+
+    // rank0: r1=a, r2=c; rank1: r1=b, r2=a(dup→skip) → [a, c, b]
+    expect(out.memories.map((m) => m.id)).toEqual(['a', 'c', 'b'])
+  })
+
+  it('caps output at the limit', async () => {
+    const recall = vi.fn(async () =>
+      recallResult([mem('a', 0.9), mem('b', 0.8), mem('c', 0.7)]),
+    )
+    const proposeNextQuery = vi.fn(async () => null)
+
+    const out = await iterativeRecall('q', { recall, proposeNextQuery }, { limit: 2 })
+
+    expect(out.memories.map((m) => m.id)).toEqual(['a', 'b'])
+  })
+
+  it('appends the association channel when mergeAssociationsIntoTopK is set', async () => {
+    const recall = vi.fn(async () => recallResult([mem('a', 0.9)], [mem('assoc', 0.5)]))
+    const proposeNextQuery = vi.fn(async () => null)
+
+    const out = await iterativeRecall(
+      'q',
+      { recall, proposeNextQuery },
+      { mergeAssociationsIntoTopK: true },
+    )
+
+    expect(out.memories.map((m) => m.id)).toEqual(['a', 'assoc'])
+  })
+})
diff --git a/packages/bench/test/multihop-scoring.test.ts b/packages/bench/test/multihop-scoring.test.ts
new file mode 100644
index 0000000..c7bb9b9
--- /dev/null
+++ b/packages/bench/test/multihop-scoring.test.ts
@@ -0,0 +1,130 @@
+import { describe, it, expect } from 'vitest'
+import {
+  scoreRetrieval,
+  supportingIdxs,
+  bridgeIdxs,
+  aggregateArmMetrics,
+} from '../src/multihop/scoring.js'
+import type { MultiHopItem, MultiHopPrediction } from '../src/multihop/types.js'
+
+// MuSiQue-style item: hop-1 evidence at idx1, hop-2 BRIDGE at idx3, distractors
+// at idx0/idx2. A dense ranking that keys off question similarity surfaces the
+// hop-1 paragraph but ranks the bridge low.
+const labeled: MultiHopItem = {
+  id: 'm1',
+  question: 'spouse of the director of film X',
+  answer: 'a',
+  answerAliases: [],
+  dataset: 'musique',
+  paragraphs: [
+    { idx: 0, title: 'd0', text: '', isSupporting: false },
+    { idx: 1, title: 's1', text: '', isSupporting: true, hop: 1 },
+    { idx: 2, title: 'd2', text: '', isSupporting: false },
+    { idx: 3, title: 's2', text: '', isSupporting: true, hop: 2 },
+  ],
+}
+
+// 2wiki/hotpot-style item: gold supporting paragraphs but no hop labels.
+const unlabeled: MultiHopItem = {
+  ...labeled,
+  id: 'w1',
+  dataset: '2wiki',
+  paragraphs: [
+    { idx: 0, title: 'd0', text: '', isSupporting: false },
+    { idx: 1, title: 's1', text: '', isSupporting: true },
+    { idx: 2, title: 's2', text: '', isSupporting: true },
+  ],
+}
+
+describe('multihop scoring', () => {
+  it('identifies supporting and bridge paragraphs', () => {
+    expect(supportingIdxs(labeled)).toEqual([1, 3])
+    expect(bridgeIdxs(labeled)).toEqual([3]) // only hop > 1
+    expect(supportingIdxs(unlabeled)).toEqual([1, 2])
+    expect(bridgeIdxs(unlabeled)).toBeNull() // no hop labels → not applicable
+  })
+
+  it('isolates the missed bridge at tight K, full recall at wide K', () => {
+    // Dense-like ranking: hop-1 first, bridge buried last.
+    const ranked = [1, 0, 2, 3]
+    const s = scoreRetrieval(labeled, ranked, [2, 4])
+
+    // k=2: hop-1 present, bridge (idx3) NOT in top-2.
+    expect(s.allSupportAtK[2]).toBe(false)
+    expect(s.supportRecallAtK[2]).toBeCloseTo(0.5)
+    expect(s.bridgeRecallAtK[2]).toBe(0)
+
+    // k=4: everything retrieved.
+    expect(s.allSupportAtK[4]).toBe(true)
+    expect(s.supportRecallAtK[4]).toBeCloseTo(1)
+    expect(s.bridgeRecallAtK[4]).toBeCloseTo(1)
+  })
+
+  it('rewards an arm that lifts the bridge into top-K', () => {
+    // Iterative/graph-like ranking: bridge rescued to rank 2.
+    const rescued = [1, 3, 0, 2]
+    const s = scoreRetrieval(labeled, rescued, [2])
+    expect(s.allSupportAtK[2]).toBe(true)
+    expect(s.bridgeRecallAtK[2]).toBeCloseTo(1)
+  })
+
+  it('marks bridge recall not-applicable (-1) for unlabeled datasets', () => {
+    const s = scoreRetrieval(unlabeled, [1, 2], [2])
+    expect(s.bridgeRecallAtK[2]).toBe(-1)
+    expect(s.allSupportAtK[2]).toBe(true)
+  })
+
+  it('aggregates arm metrics and treats unlabeled bridge recall as null', () => {
+    const preds: MultiHopPrediction[] = [
+      {
+        itemId: 'a',
+        question: 'q',
+        goldAnswer: 'a',
+        dataset: '2wiki',
+        arm: 'a1',
+        retrievedParagraphIdxs: [1, 2],
+        allSupportAtK: { 2: true },
+        supportRecallAtK: { 2: 1 },
+        bridgeRecallAtK: { 2: -1 },
+        queries: ['q'],
+      },
+      {
+        itemId: 'b',
+        question: 'q',
+        goldAnswer: 'a',
+        dataset: '2wiki',
+        arm: 'a1',
+        retrievedParagraphIdxs: [0, 1],
+        allSupportAtK: { 2: false },
+        supportRecallAtK: { 2: 0.5 },
+        bridgeRecallAtK: { 2: -1 },
+        queries: ['q'],
+      },
+    ]
+
+    const m = aggregateArmMetrics('a1', preds, [2])
+    expect(m.n).toBe(2)
+    expect(m.allSupportAtK[2]).toBeCloseTo(0.5)
+    expect(m.supportRecallAtK[2]).toBeCloseTo(0.75)
+    expect(m.bridgeRecallAtK[2]).toBeNull() // all -1 → not applicable
+    expect(m.meanRounds).toBe(1)
+  })
+
+  it('averages rounds for the iterative arm', () => {
+    const preds: MultiHopPrediction[] = [
+      {
+        itemId: 'a', question: 'q', goldAnswer: 'a', dataset: 'musique', arm: 'a4',
+        retrievedParagraphIdxs: [1, 3], allSupportAtK: { 2: true },
+        supportRecallAtK: { 2: 1 }, bridgeRecallAtK: { 2: 1 }, queries: ['q', 'bridge'],
+      },
+      {
+        itemId: 'b', question: 'q', goldAnswer: 'a', dataset: 'musique', arm: 'a4',
+        retrievedParagraphIdxs: [1], allSupportAtK: { 2: false },
+        supportRecallAtK: { 2: 0.5 }, bridgeRecallAtK: { 2: 0 }, queries: ['q', 'b2', 'b3'],
+      },
+    ]
+    const m = aggregateArmMetrics('a4', preds, [2])
+    expect(m.bridgeRecallAtK[2]).toBeCloseTo(0.5)
+    expect(m.meanRounds).toBeCloseTo(2.5) // (2 + 3) / 2
+  })
+})