Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
102 changes: 102 additions & 0 deletions packages/bench/src/multihop/scoring.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
import type {
MultiHopItem,
MultiHopPrediction,
MultiHopArmMetrics,
} from './types.js'

/**
 * Collects the `idx` of every paragraph in the item's bag that is flagged as
 * gold supporting evidence, in bag order.
 */
export function supportingIdxs(item: MultiHopItem): number[] {
  return item.paragraphs.flatMap((paragraph) =>
    paragraph.isSupporting ? [paragraph.idx] : [],
  )
}

/**
 * Idxs of the "bridge" paragraphs: supporting paragraphs whose hop is greater
 * than 1, i.e. evidence the top-level question does not cue directly.
 *
 * Hop labels are considered present as soon as ANY supporting paragraph has a
 * defined `hop`; a supporting paragraph without one is then treated as hop 1.
 *
 * @returns The bridge idxs in bag order (possibly empty), or `null` when no
 * supporting paragraph carries a hop label, so callers can report the bridge
 * metric as not-applicable instead of a misleading zero.
 */
export function bridgeIdxs(item: MultiHopItem): number[] | null {
  let sawHopLabel = false
  const laterHopIdxs: number[] = []

  for (const paragraph of item.paragraphs) {
    if (!paragraph.isSupporting) continue
    if (paragraph.hop !== undefined) sawHopLabel = true
    if ((paragraph.hop ?? 1) > 1) laterHopIdxs.push(paragraph.idx)
  }

  return sawHopLabel ? laterHopIdxs : null
}

/**
 * Recall of `gold` within the first `k` entries of `retrieved`.
 *
 * @param retrieved Retrieved idxs in rank order.
 * @param gold Gold idxs to look for.
 * @param k Cut-off rank. Values <= 0 select nothing, so the recall is 0.
 * @returns `hits / gold.length`, or 0 when `gold` is empty.
 */
function fractionInTopK(retrieved: number[], gold: number[], k: number): number {
  if (gold.length === 0) return 0
  // Clamp the cut-off: a negative end index makes `slice` count from the END of
  // the array, which would turn "top -1" into "everything but the last result".
  const topK = new Set(retrieved.slice(0, Math.max(0, k)))
  const hits = gold.filter((g) => topK.has(g)).length
  return hits / gold.length
}

/**
 * Whether every idx in `gold` appears within the first `k` entries of
 * `retrieved`.
 *
 * @param retrieved Retrieved idxs in rank order.
 * @param gold Gold idxs that must all be present.
 * @param k Cut-off rank. Values <= 0 select nothing, so the result is false.
 * @returns `false` when `gold` is empty: an item with no gold evidence is not
 * counted as fully covered.
 */
function allInTopK(retrieved: number[], gold: number[], k: number): boolean {
  if (gold.length === 0) return false
  // Clamp the cut-off: a negative end index makes `slice` count from the END of
  // the array, which would turn "top -1" into "everything but the last result".
  const topK = new Set(retrieved.slice(0, Math.max(0, k)))
  return gold.every((g) => topK.has(g))
}

/**
* Bridge-recall sentinel for items whose dataset does not label hops.
* It is negative, so it cannot collide with a computed recall (a fraction of
* gold idxs found, hence never below 0).
*/
const BRIDGE_NA = -1

/**
 * Scores one item's ranked retrieval at every cut-off in `ks`.
 *
 * @param item The gold item (supplies supporting and bridge paragraph idxs).
 * @param retrievedParagraphIdxs Retrieved paragraph idxs in rank order.
 * @param ks Cut-off ranks to score at; each becomes a key of the result maps.
 * @returns Per-K maps:
 *  - `allSupportAtK`: every gold supporting paragraph is inside the top K;
 *  - `supportRecallAtK`: fraction of gold supporting paragraphs in the top K;
 *  - `bridgeRecallAtK`: fraction of bridge (hop > 1) paragraphs in the top K,
 *    or the `BRIDGE_NA` sentinel when the item has no bridge paragraphs to
 *    find, either because hops are unlabeled or because none has hop > 1.
 */
export function scoreRetrieval(
  item: MultiHopItem,
  retrievedParagraphIdxs: number[],
  ks: number[],
): Pick<
  MultiHopPrediction,
  'allSupportAtK' | 'supportRecallAtK' | 'bridgeRecallAtK'
> {
  const support = supportingIdxs(item)
  const bridges = bridgeIdxs(item)
  // Recall over an empty gold set is 0/0, not 0. A labeled item with no hop > 1
  // paragraph would otherwise contribute a zero to the arm's mean bridge recall
  // that says nothing about retrieval quality, so treat it as not-applicable
  // exactly like an unlabeled item.
  const bridgeGold = bridges !== null && bridges.length > 0 ? bridges : null

  const allSupportAtK: Record<number, boolean> = {}
  const supportRecallAtK: Record<number, number> = {}
  const bridgeRecallAtK: Record<number, number> = {}

  for (const k of ks) {
    allSupportAtK[k] = allInTopK(retrievedParagraphIdxs, support, k)
    supportRecallAtK[k] = fractionInTopK(retrievedParagraphIdxs, support, k)
    bridgeRecallAtK[k] =
      bridgeGold === null
        ? BRIDGE_NA
        : fractionInTopK(retrievedParagraphIdxs, bridgeGold, k)
  }

  return { allSupportAtK, supportRecallAtK, bridgeRecallAtK }
}

/**
 * Rolls the per-item predictions of one arm up into per-K means.
 *
 * - `allSupportAtK[k]`: share of predictions whose flag at `k` is truthy.
 * - `supportRecallAtK[k]`: mean support recall; a prediction with no entry at
 *   `k` contributes 0.
 * - `bridgeRecallAtK[k]`: mean over the predictions that carry a non-negative
 *   value at `k` (negative values are the not-applicable sentinel); `null`
 *   when no prediction qualifies.
 * - `meanRounds`: mean of `queries.length`, counted as at least 1 per
 *   prediction.
 *
 * With an empty `predictions` list every mean is 0 and bridge recall is `null`.
 */
export function aggregateArmMetrics(
  arm: string,
  predictions: MultiHopPrediction[],
  ks: number[],
): MultiHopArmMetrics {
  const n = predictions.length
  // Shared "divide by n, or 0 for an empty run" rule.
  const meanOverAll = (total: number): number => (n > 0 ? total / n : 0)

  const allSupportAtK: Record<number, number> = {}
  const supportRecallAtK: Record<number, number> = {}
  const bridgeRecallAtK: Record<number, number | null> = {}

  for (const k of ks) {
    const fullyCovered = predictions.filter((p) => p.allSupportAtK[k]).length
    const recallTotal = predictions.reduce(
      (acc, p) => acc + (p.supportRecallAtK[k] ?? 0),
      0,
    )
    // Only predictions with an applicable (present, non-negative) bridge score
    // take part in the bridge mean.
    const bridgeScores = predictions
      .map((p) => p.bridgeRecallAtK[k])
      .filter((score): score is number => score !== undefined && score >= 0)
    const bridgeTotal = bridgeScores.reduce((acc, score) => acc + score, 0)

    allSupportAtK[k] = meanOverAll(fullyCovered)
    supportRecallAtK[k] = meanOverAll(recallTotal)
    bridgeRecallAtK[k] =
      bridgeScores.length > 0 ? bridgeTotal / bridgeScores.length : null
  }

  let roundsTotal = 0
  for (const p of predictions) roundsTotal += Math.max(1, p.queries.length)
  const meanRounds = meanOverAll(roundsTotal)

  return { arm, n, allSupportAtK, supportRecallAtK, bridgeRecallAtK, meanRounds }
}
75 changes: 75 additions & 0 deletions packages/bench/src/multihop/types.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
/**
* Normalized multi-hop QA types for the D2 (research multi-hop) distribution.
*
* MuSiQue-Ans, 2WikiMultiHopQA and HotpotQA-distractor all share the same
* "distractor setting" shape: a question, a small bag of paragraphs (a few gold
* supporting + many distractors), and a gold answer. We normalize all three to
* one item so the adapter and the arms (A1/A3/A4) are dataset-agnostic.
*
* Why the distractor setting (per-question paragraph bag) rather than a pooled
* corpus: it is the canonical benchmark setting, it is far cheaper to ingest,
* and — crucially — it still contains the hard part. The hop-2+ "bridge"
* paragraph is not lexically/semantically similar to the original question, so
* single-shot dense ranks it low; recovering it is exactly what a graph (A3) or
* iterative retrieval (A4) must do.
*/
/** Tag naming which of the three source datasets a normalized record came from. */
export type MultiHopDataset = 'musique' | '2wiki' | 'hotpotqa'

/** One paragraph in an item's per-question paragraph bag (gold or distractor). */
export interface MultiHopParagraph {
/** Stable index within this item's paragraph bag. */
idx: number
title: string
text: string
/** Gold supporting paragraph for the answer. */
isSupporting: boolean
/**
* 1-based hop position among supporting paragraphs when the dataset labels
* decomposition order (MuSiQue). Hops > 1 are "bridge" evidence — not directly
* cued by the top-level question. undefined when unlabeled (2wiki/hotpot) or
* non-supporting.
*/
hop?: number
}

/**
* A single normalized multi-hop QA item: a question, its gold answer, and the
* bag of paragraphs (gold supporting + distractors) to retrieve from.
*/
export interface MultiHopItem {
id: string
question: string
answer: string
/** Acceptable answer variants for EM/F1 (MuSiQue answer_aliases, etc.). */
answerAliases: string[]
/** The item's paragraph bag; each entry flags whether it is gold supporting. */
paragraphs: MultiHopParagraph[]
/** Source dataset this item was normalized from. */
dataset: MultiHopDataset
}

/** One arm's retrieval outcome for a single item, with its per-K scores. */
export interface MultiHopPrediction {
itemId: string
question: string
goldAnswer: string
dataset: MultiHopDataset
arm: string
/** Retrieved paragraph idxs in rank order (deduped). */
retrievedParagraphIdxs: number[]
/** Every gold supporting paragraph present in top-K, per K. */
allSupportAtK: Record<number, boolean>
/** Fraction of gold supporting paragraphs in top-K, per K. */
supportRecallAtK: Record<number, number>
/** Fraction of bridge (hop > 1) supporting paragraphs in top-K, per K.
* -1 when the dataset does not label hops (metric not applicable).
* Arm-level aggregation leaves negative values out of the mean. */
bridgeRecallAtK: Record<number, number>
/** Queries issued, for the iterative arm (A4); single-element otherwise. */
queries: string[]
}

/** Metrics for one arm, aggregated over all of its predictions in a run. */
export interface MultiHopArmMetrics {
arm: string
/** Number of predictions that were aggregated. */
n: number
/** Mean "all gold supporting in top-K", per K. */
allSupportAtK: Record<number, number>
/** Mean fraction of supporting paragraphs in top-K, per K. */
supportRecallAtK: Record<number, number>
/** Mean bridge recall in top-K, per K, over items where hops are labeled.
* null when no item in the run labels hops. */
bridgeRecallAtK: Record<number, number | null>
/** Mean number of retrieval rounds (1 for single-shot arms). Each prediction
* counts as at least one round, even when its query list is empty. */
meanRounds: number
}
145 changes: 145 additions & 0 deletions packages/bench/src/retrieval/iterative.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
import type { BenchRecallResult, BenchScoredMemory } from '../merge-associations.js'

/**
* Iterative / agentic retrieval (the A4 arm).
*
* The cheap, no-graph multi-hop competitor every modern agent already uses:
* retrieve → let an LLM name the bridge entity / next sub-question → re-retrieve,
* for a few rounds. It recovers hop-2+ evidence that single-shot dense misses
* (the bridge paragraph is not similar to the original question, so vector
* search alone cannot surface it) WITHOUT a graph, a second datastore, or
* per-ingest extraction.
*
* It is the linchpin of the eval: if a correctly-wired graph (A3, PPR bound)
* cannot beat A4 on bridge-entity multi-hop — the graph's home turf — then the
* universal case for a graph engine collapses, because A4 is cheaper and
* generalizes across distributions.
*
* Dependencies are injected (`recall`, `proposeNextQuery`) so the control flow
* is unit-testable with no OpenAI/Neo4j: the LLM "name the bridge" step and the
* underlying memory recall are both stubs in tests and real clients in the
* adapter.
*/
/** The two injected collaborators that `iterativeRecall` drives. */
export interface IterativeRecallDeps {
/**
* Bound recall over the memory under test. For A4 the graph is OFF.
* Invoked once per round with that round's query.
*/
recall: (query: string) => Promise<BenchRecallResult>
/**
* The agentic step: given the original multi-hop question and the memories
* gathered so far, return the NEXT single-hop retrieval query (a sub-question
* or bridge entity), or null to stop (enough evidence / no further hop).
*
* The returned string is trimmed by the caller; an empty or whitespace-only
* result stops the loop just like null. It is not invoked after the last
* permitted round. `retrievedSoFar` holds one entry per memory id (the
* highest-relevance one seen across rounds).
*/
proposeNextQuery: (
question: string,
retrievedSoFar: readonly BenchScoredMemory[],
) => Promise<string | null>
}

/** Optional tuning knobs for `iterativeRecall`; every field has a default. */
export interface IterativeRecallOpts {
/** Max retrieval rounds, including the first. Default 3. Values below 1 are
* raised to 1, so the original question is always retrieved once. */
maxRounds?: number
/** Cap on returned memories. Default 20. Applied to the merged, deduplicated
* list, not to each round. */
limit?: number
/**
* Mirror the adapter's scoring pool: when true, append the graph
* spreading-activation channel to each round's pool. A4 runs graph-off so
* this is normally false; kept for parity if iterative is ever combined with
* the graph arm.
*
* Concretely, the recall result's `associations` are appended after its
* `memories` for that round.
*/
mergeAssociationsIntoTopK?: boolean
}

/** Diagnostic record of what an `iterativeRecall` run actually did. */
export interface IterativeRecallTrace {
/** Queries issued in order, starting with the original question. */
queries: string[]
/** Memory count returned by each round's recall. Counts the round's whole
* pool, so associations are included when they were merged in. */
perRoundCounts: number[]
/** Number of recall rounds actually run. */
rounds: number
}

/** What `iterativeRecall` resolves to. */
export interface IterativeRecallResult {
/** Per-round results interleaved by rank, deduplicated by id, and capped at
* the configured limit. */
memories: BenchScoredMemory[]
/** Queries issued and per-round counts for the run. */
trace: IterativeRecallTrace
}

/**
 * Collapses memories that share an `id` down to the single entry with the
 * highest `relevance`.
 *
 * Ties keep the entry seen first, and the output is ordered by each id's first
 * appearance in the input.
 */
function dedupeByBestRelevance(
  memories: readonly BenchScoredMemory[],
): BenchScoredMemory[] {
  const winners = new Map<string, BenchScoredMemory>()

  for (const candidate of memories) {
    const incumbent = winners.get(candidate.id)
    // Keep the incumbent unless the candidate is strictly more relevant.
    if (incumbent !== undefined && !(candidate.relevance > incumbent.relevance)) {
      continue
    }
    winners.set(candidate.id, candidate)
  }

  return Array.from(winners.values())
}

/**
 * Merges the per-round ranked lists round-robin: rank 0 of every round (in
 * round order), then rank 1 of every round, and so on. Ids already emitted are
 * skipped, and the merge stops once `limit` memories are collected.
 *
 * Interleaving, rather than concatenating, gives each round's best hits an
 * early slot, so a later round's bridge paragraph is not buried beneath the
 * whole of round 1's list. With a single round the output is that round's own
 * ranking, deduplicated and capped.
 */
function interleaveByRank(
  perRound: readonly (readonly BenchScoredMemory[])[],
  limit: number,
): BenchScoredMemory[] {
  const merged: BenchScoredMemory[] = []
  const emittedIds = new Set<string>()
  const deepest = Math.max(0, ...perRound.map((round) => round.length))

  for (let rank = 0; rank < deepest; rank++) {
    for (const round of perRound) {
      // Stop as soon as the cap is met (also covers a non-positive limit).
      if (!(merged.length < limit)) return merged
      if (round.length <= rank) continue

      const candidate = round[rank]
      if (emittedIds.has(candidate.id)) continue

      emittedIds.add(candidate.id)
      merged.push(candidate)
    }
  }

  return merged
}

/**
 * Runs the retrieve → propose-next-query → re-retrieve loop and merges the
 * rounds into one ranked list.
 *
 * Round 1 always queries with `question` verbatim. After each round except the
 * last permitted one, `deps.proposeNextQuery` sees the original question plus
 * the evidence so far (one entry per id, best relevance) and names the next
 * query. The loop ends when the round budget is spent, when the proposal is
 * null/blank, or when the proposal repeats a query that was already issued.
 *
 * @param question The original multi-hop question.
 * @param deps Injected recall and next-query functions.
 * @param opts `maxRounds` (default 3, minimum 1), `limit` (default 20) and
 * `mergeAssociationsIntoTopK` (default off).
 * @returns The rank-interleaved, id-deduplicated memories capped at `limit`,
 * plus a trace of the queries issued and per-round pool sizes.
 */
export async function iterativeRecall(
  question: string,
  deps: IterativeRecallDeps,
  opts: IterativeRecallOpts = {},
): Promise<IterativeRecallResult> {
  const maxRounds = Math.max(1, opts.maxRounds ?? 3)
  const limit = opts.limit ?? 20

  const perRound: BenchScoredMemory[][] = []
  const queries: string[] = []
  let query = question

  for (let round = 0; round < maxRounds; round++) {
    queries.push(query)
    const result = await deps.recall(query)
    const pool = opts.mergeAssociationsIntoTopK
      ? [...result.memories, ...result.associations]
      : result.memories
    perRound.push(pool)

    // Budget spent: skip the (costly) proposal step, nothing would use it.
    if (round === maxRounds - 1) break

    const accumulated = dedupeByBestRelevance(perRound.flat())
    const next = await deps.proposeNextQuery(question, accumulated)
    const trimmed = next?.trim()
    if (!trimmed) break
    // Cycle guard: an agent that re-proposes a query it already ran would loop
    // without adding evidence. Issued queries are compared in trimmed form
    // because the first one is the caller's question verbatim, which may carry
    // surrounding whitespace that the (trimmed) proposal never will.
    if (queries.some((issued) => issued.trim() === trimmed)) break
    query = trimmed
  }

  return {
    memories: interleaveByRank(perRound, limit),
    trace: {
      queries,
      perRoundCounts: perRound.map((r) => r.length),
      rounds: perRound.length,
    },
  }
}
Loading