muhammadkh4n · muhammadkh4n · Jun 8, 2026 · Jun 6, 2026 · Jun 6, 2026 · Jun 6, 2026
diff --git a/.gitignore b/.gitignore
@@ -19,7 +19,10 @@ supabase/.temp/
 .understand-anything/
 
 # Bench / forensics output (locally generated, not committed)
-results/
+results/*
+# …except committed Phase 0 gate baselines (results/gates/graph-eval-baseline.json).
+# Uses results/* (not results/) so this negation can re-include the subdir.
+!results/gates/
 data/
 
 # Build artifacts (prevent leaking compiled files into src/)

diff --git a/packages/bench/bin/engram-bench.ts b/packages/bench/bin/engram-bench.ts
@@ -10,9 +10,12 @@
 
 import * as fs from 'node:fs/promises'
 import * as path from 'node:path'
+import { createHash } from 'node:crypto'
+import { execFileSync } from 'node:child_process'
 import { LoCoMoAdapter } from '../src/locomo/adapter.js'
 import { LongMemEvalAdapter } from '../src/longmemeval/adapter.js'
 import { compareLoCoMo, compareLongMemEval } from '../src/runner/compare.js'
+import { compareMatrix } from '../src/runner/compare-matrix.js'
 import { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from '../src/metrics/table.js'
 import type { BenchmarkOpts } from '../src/types.js'
 
@@ -27,6 +30,8 @@ function parseArgs(argv: string[]) {
     if (arg === '--consolidate') { args['consolidate'] = true; continue }
     if (arg === '--graph') { args['graph'] = true; continue }
     if (arg === '--compare') { args['compare'] = true; continue }
+    if (arg === '--matrix') { args['matrix'] = true; continue }
+    if (arg === '--require-graph') { args['requireGraph'] = true; continue }
     if (arg === '--verbose') { args['verbose'] = true; continue }
     if (arg.startsWith('--')) {
       const key = arg.slice(2)
@@ -51,13 +56,37 @@ function parseArgs(argv: string[]) {
     consolidate: args['consolidate'] !== false,
     graph: args['graph'] !== false,
     compare: args['compare'] === true,
+    matrix: args['matrix'] === true,
+    requireGraph: args['requireGraph'] === true,
+    categories: typeof args['categories'] === 'string'
+      ? (args['categories'] as string).split(',').map((s) => parseInt(s.trim(), 10)).filter((n) => !Number.isNaN(n))
+      : undefined,
     topK: parseInt(args['top-k'] as string ?? '10', 10) || 10,
     limit: parseInt(args['limit'] as string ?? '0', 10) || 0,
     noRerank: args['noRerank'] === true,
     verbose: args['verbose'] === true,
   }
 }
 
+/** Hex sha256 fingerprint of the corpus at `p`: a file hashes its bytes, a directory hashes its sorted `name:size` listing; any fs error yields 'unknown'. */
+async function hashCorpus(p: string): Promise<string> {
+  try {
+    const st = await fs.stat(p)
+    const hash = createHash('sha256')
+    if (st.isDirectory()) {
+      for (const e of (await fs.readdir(p)).sort()) { // sorted so the digest does not depend on readdir order
+        const s = await fs.stat(path.join(p, e)) // top-level entries only; subdirectories are not walked
+        hash.update(`${e}:${s.size}\n`) // name + size only — a same-size content edit leaves the digest unchanged
+      }
+    } else {
+      hash.update(await fs.readFile(p))
+    }
+    return hash.digest('hex')
+  } catch { // best-effort: any failure above degrades to a sentinel instead of aborting the run
+    return 'unknown'
+  }
+}
+
 async function main() {
   const args = parseArgs(process.argv.slice(2))
 
@@ -77,6 +106,41 @@ async function main() {
   console.log(`Consolidation: ${args.consolidate ? 'ON' : 'OFF'}`)
   console.log('')
 
+  if (args.matrix) {
+    console.log('Running 4-cell {graph}x{rerank} ablation matrix...')
+    if (args.requireGraph) console.log('requireGraph: ON (a graph cell without a bench Neo4j will hard-fail)')
+    let commit = 'unknown'
+    try { commit = execFileSync('git', ['rev-parse', 'HEAD']).toString().trim() } catch { /* not a git checkout */ }
+    const corpusSha256 = await hashCorpus(args.dataPath)
+
+    const result = await compareMatrix(
+      args.benchmark as 'locomo' | 'longmemeval',
+      args.dataPath,
+      {
+        consolidate: args.consolidate,
+        topK: args.topK,
+        limit: args.limit > 0 ? args.limit : undefined,
+        ...(args.categories ? { categories: args.categories } : {}),
+      },
+      { requireGraph: args.requireGraph, commit, corpusSha256 },
+    )
+
+    for (const cell of result.cells) {
+      console.log(
+        `  graph=${cell.graph ? 'ON ' : 'OFF'} rerank=${cell.rerank ? 'ON ' : 'OFF'}` +
+        `  graphEffect=${cell.graphEffect.toFixed(4)} (n=${cell.graphVisibleN})`,
+      )
+    }
+
+    const gatesDir = path.resolve('./results/gates')
+    await fs.mkdir(gatesDir, { recursive: true })
+    const outFile = path.join(gatesDir, 'graph-eval-baseline.json')
+    await fs.writeFile(outFile, JSON.stringify(result, null, 2), 'utf8')
+    console.log(`\nMatrix baseline written to: ${outFile}`)
+    console.log(`Provenance: commit=${commit.slice(0, 8)} corpus=${corpusSha256.slice(0, 12)} gate=${result.provenance.neo4jGateState}`)
+    return
+  }
+
   if (args.compare) {
     console.log('Running comparison mode...')
     let comparisonResult

diff --git a/packages/bench/src/bench-memory-handle.ts b/packages/bench/src/bench-memory-handle.ts
@@ -0,0 +1,39 @@
+// Dependency-light home for the bench memory handle + the requireGraph guard.
+// Kept separate from memory-factory.ts (which pulls in heavy runtime deps like
+// the ONNX reranker) so the guard and its types stay unit-testable without
+// loading native binaries. All imports here are type-only → erased at runtime.
+import type { Memory } from '@engram-mem/core'
+import type { NeuralGraph } from '@engram-mem/graph'
+import type { RerankerBackend } from './types.js'
+
+/** What createBenchMemory wired up — exposed so graph cells can reach the graph. */
+export interface BenchMemoryConfig {
+  graph: NeuralGraph | null // null means there is no graph to reach; requireGraph() throws in that case
+  rerankerBackend: RerankerBackend // NOTE(review): presumably the backend the factory selected — confirm in memory-factory.ts
+}
+
+export interface BenchMemoryHandle { // the bench memory plus its wiring; this is what requireGraph() inspects
+  memory: Memory // the Memory instance under test
+  config: BenchMemoryConfig // what was wired up (graph, reranker backend)
+  /** True iff a real bench Neo4j was wired (env present AND reachable). */
+  graphActuallyWired: boolean
+}
+
+/**
+ * Hard-fail guard for graph cells. Without a real Neo4j a graph cell silently
+ * falls back to SQL-only and would report a SQL delta as a graph result — the
+ * "the graph was never measured" trap; this makes that fallback a loud throw.
+ * @returns `handle.config.graph`, narrowed to non-null.
+ * @throws Error if `graphActuallyWired` is false or `config.graph` is null.
+ */
+export function requireGraph(handle: BenchMemoryHandle): NeuralGraph {
+  if (!handle.graphActuallyWired || !handle.config.graph) { // both must hold: flag set AND a graph object present
+    throw new Error(
+      '[engram-bench] requireGraph: a graph cell was requested but the bench ' +
+      'Neo4j is not wired. Set ENGRAM_BENCH_NEO4J_URI + ENGRAM_BENCH_NEO4J_PASSWORD ' +
+      '(a bench-specific Neo4j, NOT the production NEO4J_URI). Refusing to report ' +
+      'a SQL-only result as a graph result.',
+    )
+  }
+  return handle.config.graph
+}
diff --git a/packages/bench/src/classification/classify-recall-structure.ts b/packages/bench/src/classification/classify-recall-structure.ts
@@ -0,0 +1,73 @@
+// Phase 0 — label a question's recall STRUCTURE so graphEffect is measured on
+// the graph-relevant split (multi_hop/temporal, where spreading activation
+// should help) instead of the saturated aggregate. Deterministic by design:
+// no LLM in the gate path, so the committed labels are reproducible.
+
+export type RecallStructure = 'lookup' | 'multi_hop' | 'temporal' | 'aggregation' // every label classifyRecallStructure can return
+
+export interface QuestionContext {
+  question: string // question text; scanned for temporal tokens in the heuristic fallback
+  goldAnswer: string // gold answer text; scanned together with `question` in the fallback
+  /** Gold evidence ids: LoCoMo dia ids, or LongMemEval answer_session_ids. The classifier only uses their count. */
+  goldIds: string[]
+  /** LoCoMo category if known: 1=single_hop 2=multi_hop 3=temporal 4=open_domain 5=adversarial. */
+  category?: number
+  /** LongMemEval ability if known: temporal_reasoning, multi_session_reasoning, ... (matched case-insensitively by substring). */
+  ability?: string
+}
+
+export interface RecallStructureLabel {
+  type: RecallStructure
+  confidence: number // fixed constant per rule (0.5–0.9 across classifyRecallStructure), not a calibrated probability
+  reasoning: string // human-readable note on which rule fired
+}
+
+/** The structures where graph spreading activation is expected to add lift; 'lookup' and 'aggregation' are left out. */
+export const GRAPH_RELEVANT: ReadonlySet<RecallStructure> = new Set(['multi_hop', 'temporal'])
+
+// Low-confidence fallback signal only (used when no category/ability rule fires): 19xx/20xx years, month-name
+// prefixes, ordering/relative-time words. NOTE(review): `(jan|…|dec)[a-z]*` also hits "market", "maybe", "decide"; `week|month|year` miss plurals.
+const TEMPORAL_RE =
+  /\b(19|20)\d{2}\b|\b(jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)[a-z]*\b|\b(yesterday|today|tomorrow|week|month|year|date|when|before|after|since|until|earlier|later|ago|first|last|recent)\b/i
+
+/**
+ * Classify a question's recall structure. Authoritative dataset signals win:
+ * LoCoMo `category` (1–5) first, then LongMemEval `ability` by substring. When
+ * neither yields a label (absent, or an unrecognized value) we fall back to
+ * structural heuristics: gold cardinality, then a temporal-token scan.
+ */
+export function classifyRecallStructure(ctx: QuestionContext): RecallStructureLabel {
+  // 1. LoCoMo category — authoritative.
+  if (ctx.category != null) { // loose `!= null` rejects both null and undefined
+    switch (ctx.category) {
+      case 2: return { type: 'multi_hop', confidence: 0.9, reasoning: 'LoCoMo category 2 (multi_hop)' }
+      case 3: return { type: 'temporal', confidence: 0.9, reasoning: 'LoCoMo category 3 (temporal)' }
+      case 1: return { type: 'lookup', confidence: 0.9, reasoning: 'LoCoMo category 1 (single_hop)' }
+      case 4: return { type: 'lookup', confidence: 0.7, reasoning: 'LoCoMo category 4 (open_domain) -> lookup' }
+      case 5: return { type: 'lookup', confidence: 0.6, reasoning: 'LoCoMo category 5 (adversarial) -> lookup' }
+    } // no default: any other category value falls through to the ability / heuristic rules
+  }
+
+  // 2. LongMemEval ability — authoritative. Checks run top-down; the first matching substring wins.
+  if (ctx.ability) {
+    const a = ctx.ability.toLowerCase()
+    if (a.includes('temporal')) return { type: 'temporal', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('multi_session') || a.includes('multi-session')) return { type: 'multi_hop', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('knowledge_update')) return { type: 'multi_hop', confidence: 0.7, reasoning: `ability=${ctx.ability} (updates link sessions)` }
+    if (a.includes('information_extraction')) return { type: 'lookup', confidence: 0.8, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('abstention')) return { type: 'lookup', confidence: 0.7, reasoning: `ability=${ctx.ability}` }
+  }
+
+  // 3. Heuristic fallback. Order matters: >=3 gold ids beats a temporal token, which beats exactly 2 gold ids.
+  const text = `${ctx.question} ${ctx.goldAnswer}`
+  if (ctx.goldIds.length >= 3) {
+    return { type: 'aggregation', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> synthesis` }
+  }
+  if (TEMPORAL_RE.test(text)) {
+    return { type: 'temporal', confidence: 0.55, reasoning: 'temporal token in question/answer' }
+  }
+  if (ctx.goldIds.length >= 2) {
+    return { type: 'multi_hop', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> cross-session` }
+  }
+  return { type: 'lookup', confidence: 0.5, reasoning: 'single gold id, no temporal signal' } // also reached with zero gold ids
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
@@ -1,9 +1,22 @@
 export { LoCoMoAdapter } from './locomo/adapter.js'
 export { LongMemEvalAdapter } from './longmemeval/adapter.js'
 export { compareLoCoMo, compareLongMemEval } from './runner/compare.js'
+export { compareMatrix } from './runner/compare-matrix.js'
+export { extractLoCoMoOutcomes, extractLongMemEvalOutcomes } from './runner/matrix-outcomes.js'
+export type { ComparisonMatrixResult, MatrixCell, BaselineProvenance } from './types.js'
 export { computeRetrievalF1, recallAtK } from './metrics/f1.js'
 export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js'
-export { createBenchMemory } from './memory-factory.js'
+export { createBenchMemory, requireGraph } from './memory-factory.js'
+export type { BenchMemoryHandle, BenchMemoryConfig } from './memory-factory.js'
+export { wipeBenchGraph, tryCreateBenchGraph } from './bench-graph.js'
+export { mergeAssociationsIntoScored } from './merge-associations.js'
+export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
+export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js'
+export type { GraphVerdict, GraphVerdictInput } from './metrics/graph-verdict.js'
+export { classifyRecallStructure, GRAPH_RELEVANT } from './classification/classify-recall-structure.js'
+export type { RecallStructure, QuestionContext, RecallStructureLabel } from './classification/classify-recall-structure.js'
+export { computeGraphEffect } from './metrics/graph-effect.js'
+export type { QuestionOutcome, GraphEffectResult } from './metrics/graph-effect.js'
 export type {
   BenchmarkOpts, BenchmarkMetrics,
   LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics,

diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts
@@ -8,6 +8,8 @@ import type {
 import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js'
 import { computeRetrievalF1 } from '../metrics/f1.js'
 import { createBenchMemory } from '../memory-factory.js'
+import { mergeAssociationsIntoScored } from '../merge-associations.js'
+import { wipeBenchGraph } from '../bench-graph.js'
 
 export class LoCoMoAdapter {
   async loadDataset(dataPath: string): Promise<LoCoMoConversationFile[]> {
@@ -184,7 +186,7 @@ export class LoCoMoAdapter {
   async evaluateDataset(
     conversations: LoCoMoConversationFile[],
     memory: Memory,
-    opts?: Pick<BenchmarkOpts, 'topK'>,
+    opts?: Pick<BenchmarkOpts, 'topK' | 'mergeAssociationsIntoTopK' | 'categories'>,
   ): Promise<LoCoMoConversationResult[]> {
     const topK = opts?.topK ?? 10
     const convResults: LoCoMoConversationResult[] = []
@@ -194,8 +196,14 @@ export class LoCoMoAdapter {
       const qaPredictions: LoCoMoQAPrediction[] = []
 
       for (const qa of conv.qa) {
+        // Gate-corpus filter: score only the requested categories (e.g. [2,3]
+        // multi-hop/temporal). The conversation was already ingested whole, so
+        // the graph the recall traverses is unaffected — only scoring narrows.
+        if (opts?.categories && !opts.categories.includes(qa.category)) continue
         const recallResult = await memory.recall(qa.question)
-        const topMemories = recallResult.memories.slice(0, topK)
+        const topMemories = mergeAssociationsIntoScored(
+          recallResult, opts?.mergeAssociationsIntoTopK,
+        ).slice(0, topK)
 
         const prediction = topMemories
           .map(m => m.content)
@@ -249,7 +257,12 @@ export class LoCoMoAdapter {
     conv: LoCoMoConversationFile,
     opts?: BenchmarkOpts,
   ): Promise<{ result: LoCoMoConversationResult; ingestMs: number; evalMs: number }> {
-    const memory = await createBenchMemory(opts)
+    const { memory, config } = await createBenchMemory(opts)
+
+    // Per-conversation graph isolation: Neo4j is shared, so wipe before ingest
+    // or the previous conversation's nodes pollute this one's spreading
+    // activation (matching the per-conv fresh :memory: SQLite invariant).
+    if (config.graph) await wipeBenchGraph(config.graph)
 
     const ingestStart = Date.now()
     const { episodesIngested, sessionsCreated } = await this.ingestConversation(conv, memory, {
@@ -260,6 +273,10 @@ export class LoCoMoAdapter {
       await memory.consolidate('light')
       await memory.consolidate('deep')
     }
+    // Drain fire-and-forget graph decomposition (+ consolidation) writes before
+    // eval. Without this, recall runs against a half-built graph and the graph
+    // cells produce empty associations — spuriously zeroing graphEffect.
+    await memory.flushPendingWrites()
     const ingestMs = Date.now() - ingestStart
 
     const evalStart = Date.now()

diff --git a/packages/bench/src/locomo/forensics/local-recall-sweep.ts b/packages/bench/src/locomo/forensics/local-recall-sweep.ts
@@ -88,7 +88,7 @@ async function main(): Promise<void> {
     const convStart = Date.now()
     console.log(`[${i + 1}/${conversations.length}] ${convId} — fresh memory + ingest`)
 
-    const memory = await createBenchMemory(benchOpts)
+    const { memory } = await createBenchMemory(benchOpts)
 
     try {
       const ingestStart = Date.now()

diff --git a/packages/bench/src/locomo/judge-adapter.ts b/packages/bench/src/locomo/judge-adapter.ts
@@ -363,7 +363,7 @@ async function benchConversation(
   const nQs = opts.smoke ? (opts.smokeQuestions ?? 5) : qas.length
 
   console.log(`  [engram-mem] Conv ${convIdx} (${sid}): ingesting...`)
-  const memory = await createBenchMemory({
+  const { memory } = await createBenchMemory({
     graph: opts.graph ?? false,
     ...(opts.rerankerBackend ? { rerankerBackend: opts.rerankerBackend } : {}),
     ...(opts.onnxRerankerModel ? { onnxRerankerModel: opts.onnxRerankerModel } : {}),

diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts
@@ -22,6 +22,8 @@ import type {
 } from '../types.js'
 import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js'
 import { createBenchMemory } from '../memory-factory.js'
+import { mergeAssociationsIntoScored } from '../merge-associations.js'
+import { wipeBenchGraph } from '../bench-graph.js'
 
 export class LongMemEvalAdapter {
   /**
@@ -135,17 +137,26 @@ export class LongMemEvalAdapter {
     ingestMs: number
     evalMs: number
   }> {
-    const memory = await createBenchMemory(opts)
+    const { memory, config } = await createBenchMemory(opts)
     const topK = opts?.topK ?? 10
 
     try {
+      // Per-question graph isolation: Neo4j is a shared external process (unlike
+      // the per-call fresh :memory: SQLite), so wipe it before ingest or prior
+      // questions' nodes pollute this question's spreading activation.
+      if (config.graph) await wipeBenchGraph(config.graph)
       const ingestStart = Date.now()
       const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory)
+      // Drain fire-and-forget graph decomposition writes before recall, or the
+      // graph cell recalls against a half-built graph (spurious graphEffect=0).
+      await memory.flushPendingWrites()
       const ingestMs = Date.now() - ingestStart
 
       const evalStart = Date.now()
       const recallResult = await memory.recall(question.question)
-      const topMemories = recallResult.memories.slice(0, topK)
+      const topMemories = mergeAssociationsIntoScored(
+        recallResult, opts?.mergeAssociationsIntoTopK,
+      ).slice(0, topK)
 
       // Deduplicate retrieved sessions in rank order
       const seen = new Set<string>()

diff --git a/packages/bench/src/longmemeval/forensics/recall-sweep.ts b/packages/bench/src/longmemeval/forensics/recall-sweep.ts
@@ -86,7 +86,7 @@ async function main(): Promise<void> {
     // BUT — runQuestion currently slices to topK before computing recall@K.
     // For the sweep we want a fuller view: retrieve max(K_VALUES) once, then
     // compute recall@K from the same list. We need a slightly different path.
-    const memory = await createBenchMemory(benchOpts)
+    const { memory } = await createBenchMemory(benchOpts)
     let episodes = 0
     let ingestMs = 0
     let evalMs = 0