muhammadkh4n · muhammadkh4n · Jun 10, 2026 · Jun 9, 2026
diff --git a/packages/bench/src/multihop/scoring.ts b/packages/bench/src/multihop/scoring.ts
@@ -0,0 +1,102 @@
+import type {
+  MultiHopItem,
+  MultiHopPrediction,
+  MultiHopArmMetrics,
+} from './types.js'
+
+/** Returns the `idx` of every paragraph flagged `isSupporting`, in `item.paragraphs` order. */
+export function supportingIdxs(item: MultiHopItem): number[] {
+  return item.paragraphs.filter((p) => p.isSupporting).map((p) => p.idx)
+}
+
+/**
+ * Idxs of bridge paragraphs: supporting paragraphs with `hop` > 1 — evidence NOT
+ * directly cued by the top-level question, which single-shot dense ranks low.
+ * Returns null when no supporting paragraph carries a `hop` label, so callers can
+ * mark the bridge metric not-applicable rather than reporting a misleading zero.
+ */
+export function bridgeIdxs(item: MultiHopItem): number[] | null {
+  const supporting = item.paragraphs.filter((p) => p.isSupporting)
+  const labeled = supporting.some((p) => p.hop !== undefined) // one labeled paragraph is enough
+  if (!labeled) return null
+  return supporting.filter((p) => (p.hop ?? 1) > 1).map((p) => p.idx) // missing hop counts as 1; may be []
+}
+
+function fractionInTopK(retrieved: number[], gold: number[], k: number): number { // share of `gold` found in retrieved.slice(0, k)
+  if (gold.length === 0) return 0 // empty gold scores 0 and avoids dividing by zero
+  const topK = new Set(retrieved.slice(0, k))
+  let hit = 0
+  for (const g of gold) if (topK.has(g)) hit++
+  return hit / gold.length
+}
+
+function allInTopK(retrieved: number[], gold: number[], k: number): boolean { // true iff every gold idx is in retrieved.slice(0, k)
+  if (gold.length === 0) return false // empty gold is reported as a miss, not a vacuous pass
+  const topK = new Set(retrieved.slice(0, k))
+  return gold.every((g) => topK.has(g))
+}
+
+/** Per-item bridge-recall sentinel written when bridgeIdxs() returns null (hops unlabeled). */
+const BRIDGE_NA = -1
+
+export function scoreRetrieval( // per-K support/bridge metrics for one item's ranked idx list
+  item: MultiHopItem,
+  retrievedParagraphIdxs: number[],
+  ks: number[],
+): Pick<
+  MultiHopPrediction,
+  'allSupportAtK' | 'supportRecallAtK' | 'bridgeRecallAtK'
+> {
+  const support = supportingIdxs(item)
+  const bridges = bridgeIdxs(item) // null → hops unlabeled for this item
+  const allSupportAtK: Record<number, boolean> = {}
+  const supportRecallAtK: Record<number, number> = {}
+  const bridgeRecallAtK: Record<number, number> = {}
+
+  for (const k of ks) { // one entry per requested cutoff
+    allSupportAtK[k] = allInTopK(retrievedParagraphIdxs, support, k)
+    supportRecallAtK[k] = fractionInTopK(retrievedParagraphIdxs, support, k)
+    bridgeRecallAtK[k] = // NOTE(review): an empty (non-null) bridge list scores 0, not BRIDGE_NA — confirm intended
+      bridges === null ? BRIDGE_NA : fractionInTopK(retrievedParagraphIdxs, bridges, k)
+  }
+
+  return { allSupportAtK, supportRecallAtK, bridgeRecallAtK }
+}
+
+export function aggregateArmMetrics( // averages per-item predictions into per-K means for one arm
+  arm: string,
+  predictions: MultiHopPrediction[],
+  ks: number[],
+): MultiHopArmMetrics {
+  const n = predictions.length
+  const allSupportAtK: Record<number, number> = {}
+  const supportRecallAtK: Record<number, number> = {}
+  const bridgeRecallAtK: Record<number, number | null> = {}
+
+  for (const k of ks) {
+    let allSum = 0
+    let supSum = 0
+    let bridgeSum = 0
+    let bridgeCount = 0
+    for (const p of predictions) {
+      if (p.allSupportAtK[k]) allSum++ // a missing entry for k counts as false
+      supSum += p.supportRecallAtK[k] ?? 0 // a missing entry adds 0 but the item still counts in n
+      const b = p.bridgeRecallAtK[k]
+      if (b !== undefined && b >= 0) { // skips missing entries and negative values such as the -1 sentinel
+        bridgeSum += b
+        bridgeCount++
+      }
+    }
+    allSupportAtK[k] = n > 0 ? allSum / n : 0 // empty run reports 0 instead of NaN
+    supportRecallAtK[k] = n > 0 ? supSum / n : 0
+    // null when no prediction contributed a non-negative bridge value for this K → not applicable.
+    bridgeRecallAtK[k] = bridgeCount > 0 ? bridgeSum / bridgeCount : null
+  }
+
+  const meanRounds =
+    n > 0
+      ? predictions.reduce((acc, p) => acc + Math.max(1, p.queries.length), 0) / n // empty `queries` still counts as 1 round
+      : 0
+
+  return { arm, n, allSupportAtK, supportRecallAtK, bridgeRecallAtK, meanRounds }
+}
diff --git a/packages/bench/src/multihop/types.ts b/packages/bench/src/multihop/types.ts
@@ -0,0 +1,75 @@
+/**
+ * Normalized multi-hop QA types for the D2 (research multi-hop) distribution.
+ *
+ * MuSiQue-Ans, 2WikiMultiHopQA and HotpotQA-distractor all share the same
+ * "distractor setting" shape: a question, a small bag of paragraphs (a few gold
+ * supporting + many distractors), and a gold answer. We normalize all three to
+ * one item so the adapter and the arms (A1/A3/A4) are dataset-agnostic.
+ *
+ * Why the distractor setting (per-question paragraph bag) rather than a pooled
+ * corpus: it is the canonical benchmark setting, it is far cheaper to ingest,
+ * and — crucially — it still contains the hard part. The hop-2+ "bridge"
+ * paragraph is not lexically/semantically similar to the original question, so
+ * single-shot dense ranks it low; recovering it is exactly what a graph (A3) or
+ * iterative retrieval (A4) must do.
+ */
+export type MultiHopDataset = 'musique' | '2wiki' | 'hotpotqa' // which of the three normalized benchmarks an item or prediction belongs to
+
+export interface MultiHopParagraph {
+  /** Stable index within this item's paragraph bag; scoring matches retrieved idxs against it. */
+  idx: number
+  title: string
+  text: string
+  /** Gold supporting paragraph for the answer. */
+  isSupporting: boolean
+  /**
+   * 1-based hop position among supporting paragraphs when the dataset labels
+   * decomposition order (MuSiQue). Hops > 1 are "bridge" evidence — not directly
+   * cued by the top-level question. undefined when unlabeled (2wiki/hotpot) or
+   * non-supporting.
+   */
+  hop?: number
+}
+
+export interface MultiHopItem {
+  id: string
+  question: string
+  answer: string // gold answer
+  /** Acceptable answer variants for EM/F1 (MuSiQue answer_aliases, etc.). */
+  answerAliases: string[]
+  paragraphs: MultiHopParagraph[] // per-question bag: gold supporting paragraphs plus distractors
+  dataset: MultiHopDataset
+}
+
+export interface MultiHopPrediction {
+  itemId: string // presumably the id of the MultiHopItem that was scored — verify against the adapter
+  question: string
+  goldAnswer: string
+  dataset: MultiHopDataset
+  arm: string
+  /** Retrieved paragraph idxs in rank order (deduped). */
+  retrievedParagraphIdxs: number[]
+  /** Whether every gold supporting paragraph is present in top-K, keyed by K. */
+  allSupportAtK: Record<number, boolean>
+  /** Fraction of gold supporting paragraphs in top-K, keyed by K. */
+  supportRecallAtK: Record<number, number>
+  /** Fraction of bridge (hop > 1) supporting paragraphs in top-K, per K.
+   *  -1 when the dataset does not label hops (metric not applicable). */
+  bridgeRecallAtK: Record<number, number>
+  /** Queries issued, for the iterative arm (A4); single-element otherwise. */
+  queries: string[]
+}
+
+export interface MultiHopArmMetrics {
+  arm: string
+  n: number // number of predictions aggregated
+  /** Fraction of predictions with all gold supporting paragraphs in top-K, per K. */
+  allSupportAtK: Record<number, number>
+  /** Mean fraction of supporting paragraphs in top-K, per K. */
+  supportRecallAtK: Record<number, number>
+  /** Mean bridge recall in top-K, per K, over items where hops are labeled.
+   *  null when no item in the run labels hops. */
+  bridgeRecallAtK: Record<number, number | null>
+  /** Mean of max(1, queries.length) over predictions (1 for single-shot arms); 0 when n is 0. */
+  meanRounds: number
+}
diff --git a/packages/bench/src/retrieval/iterative.ts b/packages/bench/src/retrieval/iterative.ts
@@ -0,0 +1,145 @@
+import type { BenchRecallResult, BenchScoredMemory } from '../merge-associations.js'
+
+/**
+ * Iterative / agentic retrieval (the A4 arm).
+ *
+ * The cheap, no-graph multi-hop competitor every modern agent already uses:
+ * retrieve → let an LLM name the bridge entity / next sub-question → re-retrieve,
+ * for a few rounds. It recovers hop-2+ evidence that single-shot dense misses
+ * (the bridge paragraph is not similar to the original question, so vector
+ * search alone cannot surface it) WITHOUT a graph, a second datastore, or
+ * per-ingest extraction.
+ *
+ * It is the linchpin of the eval: if a correctly-wired graph (A3, PPR bound)
+ * cannot beat A4 on bridge-entity multi-hop — the graph's home turf — then the
+ * universal case for a graph engine collapses, because A4 is cheaper and
+ * generalizes across distributions.
+ *
+ * Dependencies are injected (`recall`, `proposeNextQuery`) so the control flow
+ * is unit-testable with no OpenAI/Neo4j: the LLM "name the bridge" step and the
+ * underlying memory recall are both stubs in tests and real clients in the
+ * adapter.
+ */
+export interface IterativeRecallDeps {
+  /** Bound recall over the memory under test. For A4 the graph is OFF. */
+  recall: (query: string) => Promise<BenchRecallResult>
+  /**
+   * The agentic step: given the original multi-hop question and the deduped
+   * memories gathered so far, return the NEXT single-hop retrieval query. null,
+   * a blank string, or a query that was already issued stops the loop.
+   */
+  proposeNextQuery: (
+    question: string,
+    retrievedSoFar: readonly BenchScoredMemory[],
+  ) => Promise<string | null>
+}
+
+export interface IterativeRecallOpts {
+  /** Max retrieval rounds, including the first. Default 3; values below 1 are raised to 1. */
+  maxRounds?: number
+  /** Cap on returned memories, applied after the rounds are interleaved. Default 20. */
+  limit?: number
+  /**
+   * Mirror the adapter's scoring pool: when true, append each round's
+   * `associations` (the graph spreading-activation channel) after its
+   * `memories`. A4 runs graph-off so this is normally false; kept for parity if
+   * iterative is ever combined with the graph arm.
+   */
+  mergeAssociationsIntoTopK?: boolean
+}
+
+export interface IterativeRecallTrace {
+  /** Queries issued in order, starting with the original question. */
+  queries: string[]
+  /** Size of each round's pool: recalled memories, plus associations when merged in. */
+  perRoundCounts: number[]
+  /** Number of recall rounds actually run. */
+  rounds: number
+}
+
+export interface IterativeRecallResult {
+  memories: BenchScoredMemory[] // rounds interleaved by rank, deduped by id, capped at `limit`
+  trace: IterativeRecallTrace // queries issued and per-round pool sizes
+}
+
+function dedupeByBestRelevance( // collapses to one entry per id, keeping the highest-relevance copy
+  memories: readonly BenchScoredMemory[],
+): BenchScoredMemory[] {
+  const best = new Map<string, BenchScoredMemory>()
+  for (const m of memories) {
+    const prev = best.get(m.id)
+    if (!prev || m.relevance > prev.relevance) best.set(m.id, m) // strict >: on a tie the earlier copy stays
+  }
+  return [...best.values()] // Map order: each id keeps the position of its first appearance
+}
+
+/**
+ * Round-robin interleave the per-round ranked lists, deduping by id.
+ *
+ * This is the faithful multi-hop merge: it reserves top slots for EACH round's
+ * best evidence, so the round-2 bridge paragraph (high-relevance to its
+ * sub-query) lands near the top instead of being buried under round-1's full
+ * ranked list. Pure dense (A1) is exactly round 1 alone; the interleave is what
+ * lets later hops survive into top-K.
+ */
+function interleaveByRank(
+  perRound: readonly (readonly BenchScoredMemory[])[],
+  limit: number,
+): BenchScoredMemory[] {
+  const seen = new Set<string>()
+  const out: BenchScoredMemory[] = []
+  const maxLen = perRound.reduce((acc, r) => Math.max(acc, r.length), 0)
+  for (let rank = 0; rank < maxLen && out.length < limit; rank++) { // rank 0 of every round, then rank 1, ...
+    for (const round of perRound) {
+      if (rank >= round.length) continue // this round has no entry at this rank
+      const m = round[rank]
+      if (seen.has(m.id)) continue // first occurrence wins; later duplicates are dropped
+      seen.add(m.id)
+      out.push(m)
+      if (out.length >= limit) break
+    }
+  }
+  return out
+}
+
+export async function iterativeRecall( // runs up to maxRounds recall rounds, then interleaves the per-round pools
+  question: string,
+  deps: IterativeRecallDeps,
+  opts: IterativeRecallOpts = {},
+): Promise<IterativeRecallResult> {
+  const maxRounds = Math.max(1, opts.maxRounds ?? 3) // NOTE(review): assumes an integer; a fractional value bypasses the last-round check below
+  const limit = opts.limit ?? 20
+
+  const perRound: BenchScoredMemory[][] = []
+  const queries: string[] = []
+  let query = question // the first round always uses the original question
+
+  for (let round = 0; round < maxRounds; round++) {
+    queries.push(query)
+    const result = await deps.recall(query)
+    const pool = opts.mergeAssociationsIntoTopK
+      ? [...result.memories, ...result.associations]
+      : result.memories
+    perRound.push(pool)
+
+    if (round === maxRounds - 1) break // last allowed round: skip proposing a query that would never run
+
+    const accumulated = dedupeByBestRelevance(perRound.flat())
+    const next = await deps.proposeNextQuery(question, accumulated)
+    const trimmed = next?.trim()
+    if (!trimmed) break // null or blank proposal ends the loop
+    // Cycle guard: an agent that re-proposes a query it already ran would loop
+    // without adding evidence.
+    if (queries.includes(trimmed)) break
+    query = trimmed
+  }
+
+  return {
+    memories: interleaveByRank(perRound, limit),
+    trace: {
+      queries,
+      perRoundCounts: perRound.map((r) => r.length),
+      rounds: perRound.length,
+    },
+  }
+}