Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ supabase/.temp/
.understand-anything/

# Bench / forensics output (locally generated, not committed)
results/
results/*
# …except committed Phase 0 gate baselines (results/gates/graph-eval-baseline.json).
# Uses results/* (not results/) so this negation can re-include the subdir.
!results/gates/
data/

# Build artifacts (prevent leaking compiled files into src/)
Expand Down
64 changes: 64 additions & 0 deletions packages/bench/bin/engram-bench.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,12 @@

import * as fs from 'node:fs/promises'
import * as path from 'node:path'
import { createHash } from 'node:crypto'
import { execFileSync } from 'node:child_process'
import { LoCoMoAdapter } from '../src/locomo/adapter.js'
import { LongMemEvalAdapter } from '../src/longmemeval/adapter.js'
import { compareLoCoMo, compareLongMemEval } from '../src/runner/compare.js'
import { compareMatrix } from '../src/runner/compare-matrix.js'
import { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from '../src/metrics/table.js'
import type { BenchmarkOpts } from '../src/types.js'

Expand All @@ -27,6 +30,8 @@ function parseArgs(argv: string[]) {
if (arg === '--consolidate') { args['consolidate'] = true; continue }
if (arg === '--graph') { args['graph'] = true; continue }
if (arg === '--compare') { args['compare'] = true; continue }
if (arg === '--matrix') { args['matrix'] = true; continue }
if (arg === '--require-graph') { args['requireGraph'] = true; continue }
if (arg === '--verbose') { args['verbose'] = true; continue }
if (arg.startsWith('--')) {
const key = arg.slice(2)
Expand All @@ -51,13 +56,37 @@ function parseArgs(argv: string[]) {
consolidate: args['consolidate'] !== false,
graph: args['graph'] !== false,
compare: args['compare'] === true,
matrix: args['matrix'] === true,
requireGraph: args['requireGraph'] === true,
categories: typeof args['categories'] === 'string'
? (args['categories'] as string).split(',').map((s) => parseInt(s.trim(), 10)).filter((n) => !Number.isNaN(n))
: undefined,
topK: parseInt(args['top-k'] as string ?? '10', 10) || 10,
limit: parseInt(args['limit'] as string ?? '0', 10) || 0,
noRerank: args['noRerank'] === true,
verbose: args['verbose'] === true,
}
}

/**
 * Compute a SHA-256 fingerprint (hex) for the corpus at `p`.
 *
 * - Regular file: the digest covers the file's bytes.
 * - Directory: the digest covers one `name:size` line per direct entry, in
 *   sorted name order. Entry contents are not read.
 *
 * Best-effort: any filesystem error (missing path, permissions, ...) yields
 * the literal string 'unknown' instead of throwing.
 */
async function hashCorpus(p: string): Promise<string> {
  try {
    const digest = createHash('sha256')
    const info = await fs.stat(p)

    if (!info.isDirectory()) {
      // Single-file corpus: fingerprint the raw contents.
      digest.update(await fs.readFile(p))
      return digest.digest('hex')
    }

    // Directory corpus: fingerprint the sorted name:size listing.
    const names = await fs.readdir(p)
    names.sort()
    for (const name of names) {
      const { size } = await fs.stat(path.join(p, name))
      digest.update(`${name}:${size}\n`)
    }
    return digest.digest('hex')
  } catch {
    return 'unknown'
  }
}

async function main() {
const args = parseArgs(process.argv.slice(2))

Expand All @@ -77,6 +106,41 @@ async function main() {
console.log(`Consolidation: ${args.consolidate ? 'ON' : 'OFF'}`)
console.log('')

if (args.matrix) {
console.log('Running 4-cell {graph}x{rerank} ablation matrix...')
if (args.requireGraph) console.log('requireGraph: ON (a graph cell without a bench Neo4j will hard-fail)')
let commit = 'unknown'
try { commit = execFileSync('git', ['rev-parse', 'HEAD']).toString().trim() } catch { /* not a git checkout */ }
const corpusSha256 = await hashCorpus(args.dataPath)

const result = await compareMatrix(
args.benchmark as 'locomo' | 'longmemeval',
args.dataPath,
{
consolidate: args.consolidate,
topK: args.topK,
limit: args.limit > 0 ? args.limit : undefined,
...(args.categories ? { categories: args.categories } : {}),
},
{ requireGraph: args.requireGraph, commit, corpusSha256 },
)

for (const cell of result.cells) {
console.log(
` graph=${cell.graph ? 'ON ' : 'OFF'} rerank=${cell.rerank ? 'ON ' : 'OFF'}` +
` graphEffect=${cell.graphEffect.toFixed(4)} (n=${cell.graphVisibleN})`,
)
}

const gatesDir = path.resolve('./results/gates')
await fs.mkdir(gatesDir, { recursive: true })
const outFile = path.join(gatesDir, 'graph-eval-baseline.json')
await fs.writeFile(outFile, JSON.stringify(result, null, 2), 'utf8')
console.log(`\nMatrix baseline written to: ${outFile}`)
console.log(`Provenance: commit=${commit.slice(0, 8)} corpus=${corpusSha256.slice(0, 12)} gate=${result.provenance.neo4jGateState}`)
return
}

if (args.compare) {
console.log('Running comparison mode...')
let comparisonResult
Expand Down
39 changes: 39 additions & 0 deletions packages/bench/src/bench-memory-handle.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Dependency-light home for the bench memory handle + the requireGraph guard.
// Kept separate from memory-factory.ts (which pulls in heavy runtime deps like
// the ONNX reranker) so the guard and its types stay unit-testable without
// loading native binaries. All imports here are type-only → erased at runtime.
import type { Memory } from '@engram-mem/core'
import type { NeuralGraph } from '@engram-mem/graph'
import type { RerankerBackend } from './types.js'

/** What createBenchMemory wired up — exposed so graph cells can reach the graph. */
export interface BenchMemoryConfig {
/** The graph instance, or null when none is available; requireGraph treats null as "Neo4j not wired". */
graph: NeuralGraph | null
/** Reranker backend recorded for this memory — presumably the one actually selected; confirm against createBenchMemory. */
rerankerBackend: RerankerBackend
}

/** Bundle consumed by requireGraph: the memory, its wiring config, and whether the graph is really wired. */
export interface BenchMemoryHandle {
/** The memory instance the benchmark runs against. */
memory: Memory
/** Wiring details for `memory`; `config.graph` is what requireGraph returns on success. */
config: BenchMemoryConfig
/** True iff a real bench Neo4j was wired (env present AND reachable). */
graphActuallyWired: boolean
}

/**
 * Return the bench graph for a graph cell, or throw if there is none.
 *
 * Without a real Neo4j a graph cell quietly degrades to SQL-only recall, and
 * its numbers would then be reported as if the graph had been measured. This
 * guard turns that silent degradation into an immediate, loud failure so a
 * mis-provisioned matrix cell cannot fabricate a graph result.
 *
 * @param handle - the bench memory handle to inspect
 * @returns `handle.config.graph` when the handle is flagged as wired and the graph is non-null
 * @throws Error when `graphActuallyWired` is false or `config.graph` is missing
 */
export function requireGraph(handle: BenchMemoryHandle): NeuralGraph {
  // Only look at the graph once the handle claims it is actually wired.
  const wiredGraph = handle.graphActuallyWired ? handle.config.graph : null
  if (wiredGraph) {
    return wiredGraph
  }

  const message = [
    '[engram-bench] requireGraph: a graph cell was requested but the bench ',
    'Neo4j is not wired. Set ENGRAM_BENCH_NEO4J_URI + ENGRAM_BENCH_NEO4J_PASSWORD ',
    '(a bench-specific Neo4j, NOT the production NEO4J_URI). Refusing to report ',
    'a SQL-only result as a graph result.',
  ].join('')
  throw new Error(message)
}
73 changes: 73 additions & 0 deletions packages/bench/src/classification/classify-recall-structure.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
// Phase 0 — label a question's recall STRUCTURE so graphEffect is measured on
// the graph-relevant split (multi_hop/temporal, where spreading activation
// should help) instead of the saturated aggregate. Deterministic by design:
// no LLM in the gate path, so the committed labels are reproducible.

/** The recall-structure labels that classifyRecallStructure can assign to a question. */
export type RecallStructure = 'lookup' | 'multi_hop' | 'temporal' | 'aggregation'

/** Everything the classifier may look at for a single benchmark question. */
export interface QuestionContext {
/** Question text; scanned together with goldAnswer for temporal tokens in the heuristic fallback. */
question: string
/** Gold answer text; scanned together with question for temporal tokens in the heuristic fallback. */
goldAnswer: string
/** Gold evidence ids: LoCoMo dia ids, or LongMemEval answer_session_ids. */
// Only the count is used by the classifier: >= 3 ids -> aggregation, >= 2 ids -> multi_hop (fallback path).
goldIds: string[]
/** LoCoMo category if known: 1=single_hop 2=multi_hop 3=temporal 4=open_domain 5=adversarial. */
category?: number
/** LongMemEval ability if known: temporal_reasoning, multi_session_reasoning, ... */
ability?: string
}

/** The classifier's verdict for one question. */
export interface RecallStructureLabel {
/** The assigned recall structure. */
type: RecallStructure
/** Heuristic confidence; the classifier in this file emits values between 0.5 and 0.9. */
confidence: number
/** Short human-readable note on which signal produced the label. */
reasoning: string
}

/** The structures where graph spreading activation is expected to add lift. */
export const GRAPH_RELEVANT: ReadonlySet<RecallStructure> = new Set(['multi_hop', 'temporal'])

// Low-confidence fallback signal only (used when neither category nor ability
// is available). Matches four-digit years (19xx/20xx), month names, and
// ordering/relative-time words, case-insensitively.
//
// Month names are matched as whole words in their full or conventional short
// form (jan, sept, december, ...). An open-ended prefix match such as
// `mar[a-z]*` would also fire on unrelated words like "Mary", "market",
// "decide", "novel" or "Jane" and mislabel plain lookups as temporal.
// "may" remains ambiguous with the modal verb; that is accepted for a
// low-confidence heuristic.
const TEMPORAL_RE =
  /\b(?:19|20)\d{2}\b|\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b|\b(?:yesterday|today|tomorrow|week|month|year|date|when|before|after|since|until|earlier|later|ago|first|last|recent)\b/i

/**
 * Classify a question's recall structure. Authoritative dataset signals win:
 * LoCoMo `category` first, then LongMemEval `ability`. Only when neither
 * yields a label do we fall back to structural heuristics (gold cardinality +
 * a temporal-token scan).
 *
 * `ability` is matched case-insensitively and separator-insensitively:
 * hyphens and whitespace are normalised to underscores, so `knowledge-update`,
 * `knowledge_update` and `Knowledge Update` all classify the same way.
 *
 * @param ctx - question text, gold answer/ids, and optional dataset signals
 * @returns the label, a heuristic confidence, and a short reasoning string
 */
export function classifyRecallStructure(ctx: QuestionContext): RecallStructureLabel {
  // 1. LoCoMo category — authoritative. Values outside 1..5 match no case and
  //    fall through to the next signal.
  if (ctx.category != null) {
    switch (ctx.category) {
      case 2: return { type: 'multi_hop', confidence: 0.9, reasoning: 'LoCoMo category 2 (multi_hop)' }
      case 3: return { type: 'temporal', confidence: 0.9, reasoning: 'LoCoMo category 3 (temporal)' }
      case 1: return { type: 'lookup', confidence: 0.9, reasoning: 'LoCoMo category 1 (single_hop)' }
      case 4: return { type: 'lookup', confidence: 0.7, reasoning: 'LoCoMo category 4 (open_domain) -> lookup' }
      case 5: return { type: 'lookup', confidence: 0.6, reasoning: 'LoCoMo category 5 (adversarial) -> lookup' }
    }
  }

  // 2. LongMemEval ability — authoritative. Normalise separators once so every
  //    check below accepts hyphenated, spaced and underscored spellings alike
  //    (previously only multi-session tolerated a hyphen).
  if (ctx.ability) {
    const a = ctx.ability.toLowerCase().replace(/[\s-]+/g, '_')
    if (a.includes('temporal')) return { type: 'temporal', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
    if (a.includes('multi_session')) return { type: 'multi_hop', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
    if (a.includes('knowledge_update')) return { type: 'multi_hop', confidence: 0.7, reasoning: `ability=${ctx.ability} (updates link sessions)` }
    if (a.includes('information_extraction')) return { type: 'lookup', confidence: 0.8, reasoning: `ability=${ctx.ability}` }
    if (a.includes('abstention')) return { type: 'lookup', confidence: 0.7, reasoning: `ability=${ctx.ability}` }
  }

  // 3. Heuristic fallback. Order matters: 3+ gold ids win over a temporal
  //    token, which in turn wins over exactly 2 gold ids.
  const text = `${ctx.question} ${ctx.goldAnswer}`
  if (ctx.goldIds.length >= 3) {
    return { type: 'aggregation', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> synthesis` }
  }
  if (TEMPORAL_RE.test(text)) {
    return { type: 'temporal', confidence: 0.55, reasoning: 'temporal token in question/answer' }
  }
  if (ctx.goldIds.length >= 2) {
    return { type: 'multi_hop', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> cross-session` }
  }
  return { type: 'lookup', confidence: 0.5, reasoning: 'single gold id, no temporal signal' }
}
15 changes: 14 additions & 1 deletion packages/bench/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,22 @@
export { LoCoMoAdapter } from './locomo/adapter.js'
export { LongMemEvalAdapter } from './longmemeval/adapter.js'
export { compareLoCoMo, compareLongMemEval } from './runner/compare.js'
export { compareMatrix } from './runner/compare-matrix.js'
export { extractLoCoMoOutcomes, extractLongMemEvalOutcomes } from './runner/matrix-outcomes.js'
export type { ComparisonMatrixResult, MatrixCell, BaselineProvenance } from './types.js'
export { computeRetrievalF1, recallAtK } from './metrics/f1.js'
export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js'
export { createBenchMemory } from './memory-factory.js'
export { createBenchMemory, requireGraph } from './memory-factory.js'
export type { BenchMemoryHandle, BenchMemoryConfig } from './memory-factory.js'
export { wipeBenchGraph, tryCreateBenchGraph } from './bench-graph.js'
export { mergeAssociationsIntoScored } from './merge-associations.js'
export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js'
export type { GraphVerdict, GraphVerdictInput } from './metrics/graph-verdict.js'
export { classifyRecallStructure, GRAPH_RELEVANT } from './classification/classify-recall-structure.js'
export type { RecallStructure, QuestionContext, RecallStructureLabel } from './classification/classify-recall-structure.js'
export { computeGraphEffect } from './metrics/graph-effect.js'
export type { QuestionOutcome, GraphEffectResult } from './metrics/graph-effect.js'
export type {
BenchmarkOpts, BenchmarkMetrics,
LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics,
Expand Down
23 changes: 20 additions & 3 deletions packages/bench/src/locomo/adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import type {
import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js'
import { computeRetrievalF1 } from '../metrics/f1.js'
import { createBenchMemory } from '../memory-factory.js'
import { mergeAssociationsIntoScored } from '../merge-associations.js'
import { wipeBenchGraph } from '../bench-graph.js'

export class LoCoMoAdapter {
async loadDataset(dataPath: string): Promise<LoCoMoConversationFile[]> {
Expand Down Expand Up @@ -184,7 +186,7 @@ export class LoCoMoAdapter {
async evaluateDataset(
conversations: LoCoMoConversationFile[],
memory: Memory,
opts?: Pick<BenchmarkOpts, 'topK'>,
opts?: Pick<BenchmarkOpts, 'topK' | 'mergeAssociationsIntoTopK' | 'categories'>,
): Promise<LoCoMoConversationResult[]> {
const topK = opts?.topK ?? 10
const convResults: LoCoMoConversationResult[] = []
Expand All @@ -194,8 +196,14 @@ export class LoCoMoAdapter {
const qaPredictions: LoCoMoQAPrediction[] = []

for (const qa of conv.qa) {
// Gate-corpus filter: score only the requested categories (e.g. [2,3]
// multi-hop/temporal). The conversation was already ingested whole, so
// the graph the recall traverses is unaffected — only scoring narrows.
if (opts?.categories && !opts.categories.includes(qa.category)) continue
const recallResult = await memory.recall(qa.question)
const topMemories = recallResult.memories.slice(0, topK)
const topMemories = mergeAssociationsIntoScored(
recallResult, opts?.mergeAssociationsIntoTopK,
).slice(0, topK)

const prediction = topMemories
.map(m => m.content)
Expand Down Expand Up @@ -249,7 +257,12 @@ export class LoCoMoAdapter {
conv: LoCoMoConversationFile,
opts?: BenchmarkOpts,
): Promise<{ result: LoCoMoConversationResult; ingestMs: number; evalMs: number }> {
const memory = await createBenchMemory(opts)
const { memory, config } = await createBenchMemory(opts)

// Per-conversation graph isolation: Neo4j is shared, so wipe before ingest
// or the previous conversation's nodes pollute this one's spreading
// activation (matching the per-conv fresh :memory: SQLite invariant).
if (config.graph) await wipeBenchGraph(config.graph)

const ingestStart = Date.now()
const { episodesIngested, sessionsCreated } = await this.ingestConversation(conv, memory, {
Expand All @@ -260,6 +273,10 @@ export class LoCoMoAdapter {
await memory.consolidate('light')
await memory.consolidate('deep')
}
// Drain fire-and-forget graph decomposition (+ consolidation) writes before
// eval. Without this, recall runs against a half-built graph and the graph
// cells produce empty associations — spuriously zeroing graphEffect.
await memory.flushPendingWrites()
const ingestMs = Date.now() - ingestStart

const evalStart = Date.now()
Expand Down
2 changes: 1 addition & 1 deletion packages/bench/src/locomo/forensics/local-recall-sweep.ts
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ async function main(): Promise<void> {
const convStart = Date.now()
console.log(`[${i + 1}/${conversations.length}] ${convId} — fresh memory + ingest`)

const memory = await createBenchMemory(benchOpts)
const { memory } = await createBenchMemory(benchOpts)

try {
const ingestStart = Date.now()
Expand Down
2 changes: 1 addition & 1 deletion packages/bench/src/locomo/judge-adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ async function benchConversation(
const nQs = opts.smoke ? (opts.smokeQuestions ?? 5) : qas.length

console.log(` [engram-mem] Conv ${convIdx} (${sid}): ingesting...`)
const memory = await createBenchMemory({
const { memory } = await createBenchMemory({
graph: opts.graph ?? false,
...(opts.rerankerBackend ? { rerankerBackend: opts.rerankerBackend } : {}),
...(opts.onnxRerankerModel ? { onnxRerankerModel: opts.onnxRerankerModel } : {}),
Expand Down
15 changes: 13 additions & 2 deletions packages/bench/src/longmemeval/adapter.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ import type {
} from '../types.js'
import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js'
import { createBenchMemory } from '../memory-factory.js'
import { mergeAssociationsIntoScored } from '../merge-associations.js'
import { wipeBenchGraph } from '../bench-graph.js'

export class LongMemEvalAdapter {
/**
Expand Down Expand Up @@ -135,17 +137,26 @@ export class LongMemEvalAdapter {
ingestMs: number
evalMs: number
}> {
const memory = await createBenchMemory(opts)
const { memory, config } = await createBenchMemory(opts)
const topK = opts?.topK ?? 10

try {
// Per-question graph isolation: Neo4j is a shared external process (unlike
// the per-call fresh :memory: SQLite), so wipe it before ingest or prior
// questions' nodes pollute this question's spreading activation.
if (config.graph) await wipeBenchGraph(config.graph)
const ingestStart = Date.now()
const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory)
// Drain fire-and-forget graph decomposition writes before recall, or the
// graph cell recalls against a half-built graph (spurious graphEffect=0).
await memory.flushPendingWrites()
const ingestMs = Date.now() - ingestStart

const evalStart = Date.now()
const recallResult = await memory.recall(question.question)
const topMemories = recallResult.memories.slice(0, topK)
const topMemories = mergeAssociationsIntoScored(
recallResult, opts?.mergeAssociationsIntoTopK,
).slice(0, topK)

// Deduplicate retrieved sessions in rank order
const seen = new Set<string>()
Expand Down
2 changes: 1 addition & 1 deletion packages/bench/src/longmemeval/forensics/recall-sweep.ts
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ async function main(): Promise<void> {
// BUT — runQuestion currently slices to topK before computing recall@K.
// For the sweep we want a fuller view: retrieve max(K_VALUES) once, then
// compute recall@K from the same list. We need a slightly different path.
const memory = await createBenchMemory(benchOpts)
const { memory } = await createBenchMemory(benchOpts)
let episodes = 0
let ingestMs = 0
let evalMs = 0
Expand Down
Loading