From f4d125d45e960823527298a2d751fe55aced4465 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sat, 6 Jun 2026 05:13:44 +0500 Subject: [PATCH 1/9] fix(core,sqlite): forget() tombstones instead of inverting (Phase 1, offline path) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The shipped forget() was inverted: on episodes it called recordAccess() (access_count++), a term recall ranking REWARDS via accessBoost — so forgetting a memory RAISED its recall rank; on semantic it floored a confidence value no recall path reads; procedural was a no-op. Net: forget did nothing useful or worse, which is why ~7,200 "forgotten" memories resurfaced across 19 manual gardening sessions. This lands the offline-testable core of the fix (core contract + SQLite + PostgREST adapter; the PostgREST schema/RPC + Neo4j gate follow): - storage.ts: add markForgotten(ids): Promise<number> to Episode/Semantic/Procedural storage — a tombstone that sets forgotten_at and touches NEITHER access_count NOR confidence. - memory.ts forget(): rewrite the confirm path to call markForgotten per tier; drop the recordAccess/recordAccessAndBoost calls and the confidence floor (the forgotten_at tombstone is the single source of truth). Remove the now-unused CONFIDENCE_FLOOR. - sqlite: migration v5 adds forgotten_at (+ partial index) to episodes/semantic/procedural; markForgotten impls; AND forgotten_at IS NULL gate cloned onto every recall path (vectorSearch + textBoost + the per-store hybrid/BM25/vector fallbacks), mirroring the proven superseded_by gate. - postgrest adapters: markForgotten via table PATCH (schema lands next). - tests: forget-e2e (recall gate excludes a tombstoned memory while the sibling survives; forget removes matched content; access_count NOT bumped; confirm=false no-op; idempotent at both levels) — would fail on old code. migration test updated to v5 + a forgotten_at column assertion. sqlite 106/106, core 495/495, typecheck clean. 
--- packages/core/src/adapters/storage.ts | 19 +++ packages/core/src/memory.ts | 39 +++---- packages/core/test/retrieval/mock-storage.ts | 3 + packages/postgrest/src/episodes.ts | 13 +++ packages/postgrest/src/procedural.ts | 13 +++ packages/postgrest/src/semantic.ts | 13 +++ packages/sqlite/src/adapter.ts | 20 ++-- packages/sqlite/src/episodes.ts | 15 ++- packages/sqlite/src/migrations.ts | 16 +++ packages/sqlite/src/procedural.ts | 17 ++- packages/sqlite/src/semantic.ts | 14 ++- packages/sqlite/test/forget-e2e.test.ts | 116 +++++++++++++++++++ packages/sqlite/test/migrations.test.ts | 14 ++- 13 files changed, 268 insertions(+), 44 deletions(-) create mode 100644 packages/sqlite/test/forget-e2e.test.ts diff --git a/packages/core/src/adapters/storage.ts b/packages/core/src/adapters/storage.ts index 1675e7c..0a97fff 100644 --- a/packages/core/src/adapters/storage.ts +++ b/packages/core/src/adapters/storage.ts @@ -25,6 +25,13 @@ export interface EpisodeStorage { getUnconsolidatedSessions(): Promise markConsolidated(ids: string[]): Promise recordAccess(id: string): Promise + /** + * Tombstone the given memories (sets forgotten_at). Forgotten memories are + * excluded from every recall path but retained for audit/undo. Distinct + * from recordAccess — does NOT touch access_count. Returns the number of + * rows newly tombstoned. Idempotent. + */ + markForgotten(ids: string[]): Promise /** Find earliest created_at across episodes referenced by the given digest IDs */ findEarliestInDigests?(digestIds: string[]): Promise<{ createdAt: Date } | null> /** Fast COUNT(*) for stats(). Falls back to N-scan when not implemented. */ @@ -49,6 +56,12 @@ export interface SemanticStorage { getUnaccessed(days: number): Promise recordAccessAndBoost(id: string, confidenceBoost: number): Promise markSuperseded(id: string, supersededBy: string): Promise + /** + * Tombstone the given memories (sets forgotten_at). 
Forgotten memories are + * excluded from every recall path but retained for audit/undo. Does NOT + * touch confidence or access_count. Returns rows newly tombstoned. Idempotent. + */ + markForgotten(ids: string[]): Promise batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise /** Per-ID gradient decay (PageRank-modulated). Falls back to batchDecay when not implemented. */ batchDecayGradient?(updates: Array<{ id: string; effectiveDecayRate: number; daysThreshold: number }>): Promise @@ -74,6 +87,12 @@ export interface ProceduralStorage { search(query: string, opts?: SearchOptions): Promise[]> searchByTrigger(activity: string, opts?: SearchOptions): Promise[]> recordAccess(id: string): Promise + /** + * Tombstone the given memories (sets forgotten_at). Excluded from recall, + * retained for audit/undo. Does NOT touch access_count. Returns rows newly + * tombstoned. Idempotent. + */ + markForgotten(ids: string[]): Promise incrementObservation(id: string): Promise batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise /** Per-ID gradient decay (PageRank-modulated). Falls back to batchDecay when not implemented. */ diff --git a/packages/core/src/memory.ts b/packages/core/src/memory.ts index 549c6b4..15ab317 100644 --- a/packages/core/src/memory.ts +++ b/packages/core/src/memory.ts @@ -93,7 +93,6 @@ export interface SessionHandle { // --------------------------------------------------------------------------- const DEFAULT_SESSION_ID = 'default' -const CONFIDENCE_FLOOR = 0.05 // Minimum relevance required for a memory to count as "affected" by forget(). // computeScore() sums cosine similarity + bm25 boost + recency + access + role // bumps; a typical strong match lands around 0.6–1.1, weak semantic adjacency @@ -836,9 +835,9 @@ export class Memory { // --------------------------------------------------------------------------- /** - * Deprioritize memories (lossless — sets confidence to 0.05, marks - * metadata.forgotten). 
Returns a preview by default; pass confirm=true - * to actually apply. + * Forget memories — tombstones them (forgotten_at) so they are excluded + * from every recall path while retained for audit/undo. Lossless and + * idempotent. Returns a preview by default; pass confirm=true to apply. */ async forget( query: string, @@ -890,27 +889,23 @@ export class Memory { return { count: filtered.length, previewed: filtered } } - // Apply forgetting: lossless deprioritization + // Apply forgetting: tombstone the matched memories (forgotten_at). They are + // excluded from every recall path but retained for audit/undo. Idempotent. + // Deliberately does NOT touch access_count or confidence: the old behavior + // called recordAccess/recordAccessAndBoost, which incremented access_count — + // a term recall ranking REWARDS — so forgetting an episode raised its recall + // rank, and the floored confidence was a value no recall path ever read. + const idsByType: Record<'episode' | 'semantic' | 'procedural', string[]> = { + episode: [], semantic: [], procedural: [], + } for (const memory of filtered) { - if (memory.type === 'semantic') { - await this.storage.semantic.recordAccessAndBoost( - memory.id, - CONFIDENCE_FLOOR - 1 // set to floor by applying a large negative boost - ) - // Mark metadata.forgotten by re-inserting with updated metadata is - // not directly supported; we apply the confidence floor via available API. - // The storage interface supports recordAccessAndBoost but not direct update. - // We use a large negative boost to drive confidence toward floor. - } else if (memory.type === 'procedural') { - // No direct confidence update API for procedural; mark via the - // observationCount mechanism — no decay is available without batchDecay. - // We skip procedural direct update as the interface doesn't support it. - } else if (memory.type === 'episode') { - // Episodes are lossless; we can mark via metadata but there's no update - // API on EpisodeStorage. 
We record access to at least touch the episode. - await this.storage.episodes.recordAccess(memory.id) + if (memory.type === 'episode' || memory.type === 'semantic' || memory.type === 'procedural') { + idsByType[memory.type].push(memory.id) } } + if (idsByType.semantic.length > 0) await this.storage.semantic.markForgotten(idsByType.semantic) + if (idsByType.procedural.length > 0) await this.storage.procedural.markForgotten(idsByType.procedural) + if (idsByType.episode.length > 0) await this.storage.episodes.markForgotten(idsByType.episode) return { count: filtered.length, previewed: filtered } } diff --git a/packages/core/test/retrieval/mock-storage.ts b/packages/core/test/retrieval/mock-storage.ts index 33b03f4..94d6bb1 100644 --- a/packages/core/test/retrieval/mock-storage.ts +++ b/packages/core/test/retrieval/mock-storage.ts @@ -198,6 +198,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter getUnconsolidatedSessions: vi.fn().mockResolvedValue([]), markConsolidated: vi.fn().mockResolvedValue(undefined), recordAccess: vi.fn().mockResolvedValue(undefined), + markForgotten: vi.fn().mockResolvedValue(0), } const digests: DigestStorage = { @@ -214,6 +215,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter getUnaccessed: vi.fn().mockResolvedValue([]), recordAccessAndBoost: vi.fn().mockResolvedValue(undefined), markSuperseded: vi.fn().mockResolvedValue(undefined), + markForgotten: vi.fn().mockResolvedValue(0), batchDecay: vi.fn().mockResolvedValue(0), } @@ -222,6 +224,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter search: vi.fn().mockResolvedValue(proceduralResults), searchByTrigger: vi.fn().mockResolvedValue([]), recordAccess: vi.fn().mockResolvedValue(undefined), + markForgotten: vi.fn().mockResolvedValue(0), incrementObservation: vi.fn().mockResolvedValue(undefined), batchDecay: vi.fn().mockResolvedValue(0), } diff --git a/packages/postgrest/src/episodes.ts 
b/packages/postgrest/src/episodes.ts index aa6b6ca..db0dbac 100644 --- a/packages/postgrest/src/episodes.ts +++ b/packages/postgrest/src/episodes.ts @@ -222,6 +222,19 @@ export class PostgRestEpisodeStorage implements EpisodeStorage { if (error) throw new Error(`Episode recordAccess failed: ${error.message}`) } + + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const { data, error } = await this.client + .from('memory_episodes') + .update({ forgotten_at: new Date().toISOString() }) + .in('id', ids) + .is('forgotten_at', null) + .select('id') + if (error) throw new Error(`Episode markForgotten failed: ${error.message}`) + return (data ?? []).length + } + async findEarliestInDigests(digestIds: string[]): Promise<{ createdAt: Date } | null> { if (digestIds.length === 0) return null const { data: digests, error: dErr } = await this.client diff --git a/packages/postgrest/src/procedural.ts b/packages/postgrest/src/procedural.ts index 30ccd1b..292e5cc 100644 --- a/packages/postgrest/src/procedural.ts +++ b/packages/postgrest/src/procedural.ts @@ -133,6 +133,19 @@ export class PostgRestProceduralStorage implements ProceduralStorage { if (error) throw new Error(`Procedural recordAccess failed: ${error.message}`) } + + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const { data, error } = await this.client + .from('memory_procedural') + .update({ forgotten_at: new Date().toISOString() }) + .in('id', ids) + .is('forgotten_at', null) + .select('id') + if (error) throw new Error(`Procedural markForgotten failed: ${error.message}`) + return (data ?? 
[]).length + } + async incrementObservation(id: string): Promise { const { data: current, error: fetchErr } = await this.client .from('memory_procedural') diff --git a/packages/postgrest/src/semantic.ts b/packages/postgrest/src/semantic.ts index 0bf6876..e4c8762 100644 --- a/packages/postgrest/src/semantic.ts +++ b/packages/postgrest/src/semantic.ts @@ -141,6 +141,19 @@ export class PostgRestSemanticStorage implements SemanticStorage { if (err2) throw new Error(`Semantic markSuperseded (new) failed: ${err2.message}`) } + + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const { data, error } = await this.client + .from('memory_semantic') + .update({ forgotten_at: new Date().toISOString() }) + .in('id', ids) + .is('forgotten_at', null) + .select('id') + if (error) throw new Error(`Semantic markForgotten failed: ${error.message}`) + return (data ?? []).length + } + async batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise { // Call engram_decay_pass and extract semantic_decayed count const { data, error } = await this.client.rpc('engram_decay_pass', { diff --git a/packages/sqlite/src/adapter.ts b/packages/sqlite/src/adapter.ts index 56f46ab..cc30f73 100644 --- a/packages/sqlite/src/adapter.ts +++ b/packages/sqlite/src/adapter.ts @@ -206,10 +206,10 @@ export class SqliteStorageAdapter implements StorageAdapter { let sql: string let params: unknown[] if (opts?.sessionId) { - sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND session_id = ?${projectFilter} LIMIT ?` + sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND forgotten_at IS NULL AND session_id = ?${projectFilter} LIMIT ?` params = [opts.sessionId, ...projectParams, scanLimit] } else { - sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL${projectFilter} LIMIT ?` + sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?` params = [...projectParams, scanLimit] } const rows = 
db.prepare(sql).all(...params) as EpisodeRow[] @@ -242,7 +242,7 @@ export class SqliteStorageAdapter implements StorageAdapter { if (tiers.includes('semantic')) { const rows = db.prepare( - `SELECT * FROM semantic WHERE embedding IS NOT NULL AND superseded_by IS NULL${projectFilter} LIMIT ?` + `SELECT * FROM semantic WHERE embedding IS NOT NULL AND superseded_by IS NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?` ).all(...projectParams, scanLimit) as SemanticRow[] for (const row of rows) { if (!row.embedding) continue @@ -256,7 +256,7 @@ export class SqliteStorageAdapter implements StorageAdapter { if (tiers.includes('procedural')) { const rows = db.prepare( - `SELECT * FROM procedural WHERE embedding IS NOT NULL${projectFilter} LIMIT ?` + `SELECT * FROM procedural WHERE embedding IS NOT NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?` ).all(...projectParams, scanLimit) as ProceduralRow[] for (const row of rows) { if (!row.embedding) continue @@ -288,10 +288,10 @@ export class SqliteStorageAdapter implements StorageAdapter { const projectId = opts?.projectId try { - let sql = 'SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? ORDER BY rank LIMIT ?' + let sql = 'SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL ORDER BY rank LIMIT ?' const params: unknown[] = [ftsQuery, limit] if (projectId) { - sql = `SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND (e.project_id = ? OR e.project_id IS NULL) ORDER BY rank LIMIT ?` + sql = `SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL AND (e.project_id = ? 
OR e.project_id IS NULL) ORDER BY rank LIMIT ?` params.splice(1, 0, projectId) } const epRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }> @@ -310,10 +310,10 @@ export class SqliteStorageAdapter implements StorageAdapter { } catch { /* FTS5 table may not exist */ } try { - let sql = 'SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? ORDER BY rank LIMIT ?' + let sql = 'SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND s.superseded_by IS NULL AND s.forgotten_at IS NULL ORDER BY rank LIMIT ?' const params: unknown[] = [ftsQuery, limit] if (projectId) { - sql = `SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND (s.project_id = ? OR s.project_id IS NULL) ORDER BY rank LIMIT ?` + sql = `SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND s.superseded_by IS NULL AND s.forgotten_at IS NULL AND (s.project_id = ? OR s.project_id IS NULL) ORDER BY rank LIMIT ?` params.splice(1, 0, projectId) } const smRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }> @@ -321,10 +321,10 @@ export class SqliteStorageAdapter implements StorageAdapter { } catch { /* FTS5 table may not exist */ } try { - let sql = 'SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? ORDER BY rank LIMIT ?' + let sql = 'SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL ORDER BY rank LIMIT ?' const params: unknown[] = [ftsQuery, limit] if (projectId) { - sql = `SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND (p.project_id = ? 
OR p.project_id IS NULL) ORDER BY rank LIMIT ?` + sql = `SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL AND (p.project_id = ? OR p.project_id IS NULL) ORDER BY rank LIMIT ?` params.splice(1, 0, projectId) } const prRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }> diff --git a/packages/sqlite/src/episodes.ts b/packages/sqlite/src/episodes.ts index 7f967dd..94f90df 100644 --- a/packages/sqlite/src/episodes.ts +++ b/packages/sqlite/src/episodes.ts @@ -74,7 +74,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage { SELECT e.*, -episodes_fts.rank AS bm25_score FROM episodes_fts JOIN episodes e ON episodes_fts.rowid = e.rowid - WHERE episodes_fts MATCH ? + WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL ` const params: unknown[] = [ftsQuery] if (sessionId) { @@ -86,7 +86,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage { }, recentVectorSql: ` SELECT id, embedding FROM episodes - WHERE embedding IS NOT NULL + WHERE embedding IS NOT NULL AND forgotten_at IS NULL ORDER BY created_at DESC LIMIT ? `, @@ -105,7 +105,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage { SELECT e.*, -episodes_fts.rank AS bm25_score FROM episodes_fts JOIN episodes e ON episodes_fts.rowid = e.rowid - WHERE episodes_fts MATCH ? + WHERE episodes_fts MATCH ? 
AND e.forgotten_at IS NULL ` const params: unknown[] = [ftsQuery] @@ -189,6 +189,15 @@ export class SqliteEpisodeStorage implements EpisodeStorage { .run(id) } + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const placeholders = ids.map(() => '?').join(',') + const res = this.db + .prepare(`UPDATE episodes SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`) + .run(...ids) + return res.changes + } + async findEarliestInDigests(digestIds: string[]): Promise<{ createdAt: Date } | null> { if (digestIds.length === 0) return null const placeholders = digestIds.map(() => '?').join(',') diff --git a/packages/sqlite/src/migrations.ts b/packages/sqlite/src/migrations.ts index 1d654dd..01ee9bd 100644 --- a/packages/sqlite/src/migrations.ts +++ b/packages/sqlite/src/migrations.ts @@ -348,4 +348,20 @@ export function runMigrations(db: Database.Database): void { db.pragma('user_version = 4') } + + if (currentVersion < 5) { + // V5: forgotten_at tombstone on the three recallable memory tables. + // forget() stamps forgotten_at; every recall path filters forgotten_at IS NULL. + // Distinct from superseded_by (supersession lineage) — this is explicit user/GC + // forgetting. Rows are retained for audit/undo, never destroyed here. 
+ const tables = ['episodes', 'semantic', 'procedural'] as const + for (const table of tables) { + const cols = db.prepare(`PRAGMA table_info(${table})`).all() as Array<{ name: string }> + if (!cols.some(c => c.name === 'forgotten_at')) { + db.exec(`ALTER TABLE ${table} ADD COLUMN forgotten_at REAL`) + db.exec(`CREATE INDEX IF NOT EXISTS idx_${table}_forgotten ON ${table}(forgotten_at) WHERE forgotten_at IS NOT NULL`) + } + } + db.pragma('user_version = 5') + } } diff --git a/packages/sqlite/src/procedural.ts b/packages/sqlite/src/procedural.ts index 596a1cb..15268e3 100644 --- a/packages/sqlite/src/procedural.ts +++ b/packages/sqlite/src/procedural.ts @@ -69,13 +69,13 @@ export class SqliteProceduralStorage implements ProceduralStorage { `SELECT p.*, -procedural_fts.rank AS bm25_score FROM procedural_fts JOIN procedural p ON procedural_fts.rowid = p.rowid - WHERE procedural_fts MATCH ? + WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL ORDER BY rank LIMIT 50` ) .all(ftsQuery) as Array, recentVectorSql: ` SELECT id, embedding FROM procedural - WHERE embedding IS NOT NULL + WHERE embedding IS NOT NULL AND forgotten_at IS NULL ORDER BY created_at DESC LIMIT ? `, @@ -102,7 +102,7 @@ export class SqliteProceduralStorage implements ProceduralStorage { `SELECT p.*, -procedural_fts.rank AS bm25_score FROM procedural_fts JOIN procedural p ON procedural_fts.rowid = p.rowid - WHERE procedural_fts MATCH ? + WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL ORDER BY rank LIMIT ?` ) .all(ftsQuery, limit) as (ProceduralRow & { bm25_score: number })[] @@ -131,7 +131,7 @@ export class SqliteProceduralStorage implements ProceduralStorage { `SELECT p.*, -procedural_fts.rank AS bm25_score FROM procedural_fts JOIN procedural p ON procedural_fts.rowid = p.rowid - WHERE procedural_fts MATCH ? + WHERE procedural_fts MATCH ? 
AND p.forgotten_at IS NULL ORDER BY rank LIMIT ?` ) .all(columnQuery, limit) as (ProceduralRow & { bm25_score: number })[] @@ -156,6 +156,15 @@ export class SqliteProceduralStorage implements ProceduralStorage { .run(id) } + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const placeholders = ids.map(() => '?').join(',') + const res = this.db + .prepare(`UPDATE procedural SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`) + .run(...ids) + return res.changes + } + async incrementObservation(id: string): Promise { this.db .prepare( diff --git a/packages/sqlite/src/semantic.ts b/packages/sqlite/src/semantic.ts index 52f110b..47e7b08 100644 --- a/packages/sqlite/src/semantic.ts +++ b/packages/sqlite/src/semantic.ts @@ -67,7 +67,7 @@ export class SqliteSemanticStorage implements SemanticStorage { FROM semantic_fts JOIN semantic s ON semantic_fts.rowid = s.rowid WHERE semantic_fts MATCH ? - AND s.superseded_by IS NULL + AND s.superseded_by IS NULL AND s.forgotten_at IS NULL ORDER BY rank LIMIT 50` ) .all(ftsQuery) as Array, @@ -75,6 +75,7 @@ export class SqliteSemanticStorage implements SemanticStorage { SELECT id, embedding FROM semantic WHERE embedding IS NOT NULL AND superseded_by IS NULL + AND forgotten_at IS NULL ORDER BY created_at DESC LIMIT ? `, @@ -102,7 +103,7 @@ export class SqliteSemanticStorage implements SemanticStorage { FROM semantic_fts JOIN semantic s ON semantic_fts.rowid = s.rowid WHERE semantic_fts MATCH ? 
- AND s.superseded_by IS NULL + AND s.superseded_by IS NULL AND s.forgotten_at IS NULL ORDER BY rank LIMIT ?` ) .all(ftsQuery, limit) as (SemanticRow & { bm25_score: number })[] @@ -147,6 +148,15 @@ export class SqliteSemanticStorage implements SemanticStorage { txn() } + async markForgotten(ids: string[]): Promise { + if (ids.length === 0) return 0 + const placeholders = ids.map(() => '?').join(',') + const res = this.db + .prepare(`UPDATE semantic SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`) + .run(...ids) + return res.changes + } + async batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise { const result = this.db .prepare( diff --git a/packages/sqlite/test/forget-e2e.test.ts b/packages/sqlite/test/forget-e2e.test.ts new file mode 100644 index 0000000..a341cf7 --- /dev/null +++ b/packages/sqlite/test/forget-e2e.test.ts @@ -0,0 +1,116 @@ +/** + * forget() is a real tombstone, not an inverted boost. + * + * The shipped bug (pre-overhaul): forget() called recordAccess/ + * recordAccessAndBoost, which incremented access_count — a term recall + * ranking REWARDS — so "forgetting" an episode RAISED its recall rank, while + * the floored confidence was a value no recall path ever read. Net: forget + * did nothing useful (or worse). The fix tombstones forgotten_at and gates + * every recall path on it, touching neither access_count nor confidence. + * + * The fake intelligence routes content/queries onto disjoint embedding axes by + * keyword so a forget query matches the intended memory and not its sibling. 
+ */ +import { describe, it, expect, beforeEach, afterEach } from 'vitest' +import { createMemory, type IntelligenceAdapter, type Memory } from '@engram-mem/core' +import { SqliteStorageAdapter } from '../src/adapter.js' + +const DIM = 1536 +function embedText(text: string): number[] { + const v = new Array(DIM).fill(0) + if (/deploy|key|rotat|staging|monday/i.test(text)) v[0] = 1 + else if (/billing|cron|midnight|nightly/i.test(text)) v[1] = 1 + else v[2] = 1 + return v +} +const intel: IntelligenceAdapter = { + async embed(text: string): Promise { + return embedText(text) + }, + dimensions(): number { + return DIM + }, +} + +const DEPLOY = 'the staging deploy key must be rotated every monday' +const BILLING = 'the billing cron job runs at midnight nightly' + +describe('forget() tombstone', () => { + let storage: SqliteStorageAdapter + let mem: Memory + + beforeEach(async () => { + storage = new SqliteStorageAdapter(':memory:') + await storage.initialize() + mem = createMemory({ storage, intelligence: intel }) + await mem.initialize() + await mem.ingest({ role: 'user', content: DEPLOY }) + await mem.ingest({ role: 'user', content: BILLING }) + await mem.flushPendingWrites?.() + }) + afterEach(async () => { + await storage.dispose() + }) + + async function recallHas(query: string, needle: string): Promise { + const r = await mem.recall(query) + return r.memories.some((m) => m.content.includes(needle)) + } + async function deployEpisodeId(): Promise { + const rows = await storage.vectorSearch(embedText('deploy key'), { limit: 10 }) + const ep = rows.find( + (r) => r.item.type === 'episode' && (r.item.data as { content: string }).content.includes('deploy key'), + ) + if (!ep) throw new Error('deploy episode not found') + return (ep.item.data as { id: string }).id + } + + it('recall gate: a tombstoned memory is excluded; its sibling survives', async () => { + // Tombstone exactly one memory and prove the recall gate honors it while + // leaving the non-forgotten 
sibling fully recallable (the core of the fix). + expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true) + expect(await recallHas('when does the billing cron run?', 'billing cron')).toBe(true) + + const id = await deployEpisodeId() + expect(await storage.episodes.markForgotten([id])).toBe(1) + + expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(false) + expect(await recallHas('when does the billing cron run?', 'billing cron')).toBe(true) + }) + + it('forget(confirm=true) removes matched content from recall', async () => { + expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true) + const res = await mem.forget('deploy key rotation', { confirm: true }) + expect(res.count).toBeGreaterThanOrEqual(1) + expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(false) + }) + + it('markForgotten does NOT touch access_count (the inversion regression)', async () => { + const id = await deployEpisodeId() + const before = (await storage.episodes.getByIds([id]))[0]!.accessCount + const n = await storage.episodes.markForgotten([id]) + expect(n).toBe(1) + const after = (await storage.episodes.getByIds([id]))[0]!.accessCount + expect(after).toBe(before) // old recordAccess would have bumped this by 1 + }) + + it('confirm=false is a preview no-op', async () => { + const preview = await mem.forget('deploy key rotation') // confirm defaults false + expect(preview.count).toBeGreaterThanOrEqual(1) + expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true) + }) + + it('is idempotent — re-forgetting a tombstoned memory is a no-op', async () => { + await mem.forget('deploy key rotation', { confirm: true }) + const second = await mem.forget('deploy key rotation', { confirm: true }) + expect(second.count).toBe(0) // already gated out of recall, nothing left to match + expect(await recallHas('what is the deploy key 
rotation policy?', 'deploy key')).toBe(false) + }) + + it('markForgotten is idempotent at the storage level', async () => { + const id = await deployEpisodeId() + expect(await storage.episodes.markForgotten([id])).toBe(1) + expect(await storage.episodes.markForgotten([id])).toBe(0) // already tombstoned + expect(await storage.episodes.markForgotten([])).toBe(0) + }) +}) diff --git a/packages/sqlite/test/migrations.test.ts b/packages/sqlite/test/migrations.test.ts index 5549217..386c236 100644 --- a/packages/sqlite/test/migrations.test.ts +++ b/packages/sqlite/test/migrations.test.ts @@ -42,9 +42,17 @@ describe('SQLite migrations', () => { expect(tables).toContain('procedural_fts') }) - it('sets schema version to 4 after all migrations', () => { + it('sets schema version to 5 after all migrations', () => { runMigrations(db) - expect(getSchemaVersion(db)).toBe(4) + expect(getSchemaVersion(db)).toBe(5) + }) + + it('v5 adds forgotten_at to the recallable memory tables', () => { + runMigrations(db) + for (const table of ['episodes', 'semantic', 'procedural']) { + const cols = db.prepare(`PRAGMA table_info(${table})`).all() as Array<{ name: string }> + expect(cols.some((c) => c.name === 'forgotten_at')).toBe(true) + } }) it('creates episode_parts table (dual-storage architecture)', () => { @@ -61,7 +69,7 @@ describe('SQLite migrations', () => { it('is idempotent (running twice does not error)', () => { runMigrations(db) runMigrations(db) - expect(getSchemaVersion(db)).toBe(4) + expect(getSchemaVersion(db)).toBe(5) }) it('enforces foreign keys on memories table', () => { From 7ef97f983a82f60faee155a82c7cf1abbb61e3c7 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:01:20 +0500 Subject: [PATCH 2/9] feat(postgrest): gate recall RPCs on forgotten_at; engram_mark_forgotten (Phase 1 production path) Carry the forget() tombstone into the PostgREST schema so forget removes content from every recall path on the production (Postgres+pgvector) backend, matching 
the SQLite v5 offline path. - forgotten_at timestamptz on memory_episodes/semantic/procedural, added both in the CREATE TABLE bodies and via idempotent ADD COLUMN IF NOT EXISTS so re-applying onto an already-provisioned DB actually adds the column. - engram_mark_forgotten(p_memory_type, p_ids): stamps forgotten_at and touches neither access_count nor confidence (writing access_count was the inverted- forget bug; flooring confidence was dead). Idempotent, returns rows stamped. - AND forgotten_at IS NULL gate in all 4 recall RPCs (engram_hybrid_recall, engram_recall, engram_text_boost, engram_vector_search) across the episode, semantic and procedural branches. Digests are not forgettable. - partial indexes on tombstoned rows (lockstep with the SQLite v5 indexes). - EOF post-apply smoke executes every recall RPC + engram_mark_forgotten so a missing column or broken gate surfaces at apply time. The dump emits functions before tables and relies on check_function_bodies=false, so the forgotten_at columns live in the table section and the smoke is the call-time guard; there is no migration runner. Verified on Postgres 17 + pgvector: forget round-trip across all three types, sibling survives, forgotten row's access_count unchanged. New schema-gate test pins the predicates; postgrest suite 71/71, typecheck clean. 
--- packages/postgrest/schema.sql | 127 +++++++++++++++++- .../postgrest/test/forgotten-at-gate.test.ts | 104 ++++++++++++++ 2 files changed, 227 insertions(+), 4 deletions(-) create mode 100644 packages/postgrest/test/forgotten-at-gate.test.ts diff --git a/packages/postgrest/schema.sql b/packages/postgrest/schema.sql index c600ae6..ef64a37 100644 --- a/packages/postgrest/schema.sql +++ b/packages/postgrest/schema.sql @@ -49,6 +49,27 @@ SET row_security = off; -- pgvector extension required for vector(1536) columns CREATE EXTENSION IF NOT EXISTS vector; +-- ============================================================================= +-- forget() tombstone — within-file ordering note +-- ----------------------------------------------------------------------------- +-- Phase 1 adds a `forgotten_at timestamptz` tombstone to memory_episodes / +-- memory_semantic / memory_procedural. forget() stamps it; every recall RPC +-- below gates on `forgotten_at IS NULL` (a 1:1 clone of the proven +-- `superseded_by IS NULL` gate). It is intentionally NOT added to +-- memory_digests (consolidation artifacts are not directly forgettable). +-- +-- This file is a pg_dump: functions are emitted ABOVE the tables they read, +-- which is only valid because `SET check_function_bodies = false` (above) +-- defers body validation to call time. The forgotten_at columns are therefore +-- added in the TABLE section (CREATE TABLE bodies + an idempotent +-- `ADD COLUMN IF NOT EXISTS` block for already-provisioned DBs, since +-- CREATE TABLE IF NOT EXISTS is a no-op there) and the partial indexes in the +-- INDEX section — both physically before the only call sites in this file: the +-- post-apply smoke at EOF, which EXECUTES every recall RPC so a missing column +-- or broken gate fails LOUDLY at apply time. There is no migration runner; the +-- single sequential `psql -f schema.sql` apply is the ordering guarantee. 
+-- ============================================================================= + -- -- Name: public; Type: SCHEMA; Schema: -; Owner: - -- @@ -130,6 +151,7 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer SELECT me.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(me.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix FROM memory_episodes me WHERE p_include_episodes AND me.fts @@ websearch_to_tsquery('english', p_query_text) + AND me.forgotten_at IS NULL AND (p_session_id IS NULL OR me.session_id = p_session_id) AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL) LIMIT p_match_count * 2 @@ -138,6 +160,7 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer SELECT me.id, ROW_NUMBER() OVER (ORDER BY me.embedding <=> p_query_embedding) AS rank_ix FROM memory_episodes me WHERE p_include_episodes AND me.embedding IS NOT NULL + AND me.forgotten_at IS NULL AND (p_session_id IS NULL OR me.session_id = p_session_id) AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL) ORDER BY me.embedding <=> p_query_embedding LIMIT p_match_count * 2 @@ -178,12 +201,12 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer SELECT * FROM ( WITH ft AS ( SELECT ms.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(ms.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix - FROM memory_semantic ms WHERE p_include_semantic AND ms.fts @@ websearch_to_tsquery('english', p_query_text) AND ms.superseded_by IS NULL + FROM memory_semantic ms WHERE p_include_semantic AND ms.fts @@ websearch_to_tsquery('english', p_query_text) AND ms.superseded_by IS NULL AND ms.forgotten_at IS NULL AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL) LIMIT p_match_count * 2 ), vs AS ( SELECT ms.id, ROW_NUMBER() OVER (ORDER BY ms.embedding <=> p_query_embedding) AS rank_ix - FROM memory_semantic ms WHERE 
p_include_semantic AND ms.embedding IS NOT NULL AND ms.superseded_by IS NULL + FROM memory_semantic ms WHERE p_include_semantic AND ms.embedding IS NOT NULL AND ms.superseded_by IS NULL AND ms.forgotten_at IS NULL AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL) ORDER BY ms.embedding <=> p_query_embedding LIMIT p_match_count * 2 ) @@ -200,12 +223,12 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer SELECT * FROM ( WITH ft AS ( SELECT mp.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(mp.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix - FROM memory_procedural mp WHERE p_include_procedural AND mp.fts @@ websearch_to_tsquery('english', p_query_text) + FROM memory_procedural mp WHERE p_include_procedural AND mp.fts @@ websearch_to_tsquery('english', p_query_text) AND mp.forgotten_at IS NULL AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL) LIMIT p_match_count * 2 ), vs AS ( SELECT mp.id, ROW_NUMBER() OVER (ORDER BY mp.embedding <=> p_query_embedding) AS rank_ix - FROM memory_procedural mp WHERE p_include_procedural AND mp.embedding IS NOT NULL + FROM memory_procedural mp WHERE p_include_procedural AND mp.embedding IS NOT NULL AND mp.forgotten_at IS NULL AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL) ORDER BY mp.embedding <=> p_query_embedding LIMIT p_match_count * 2 ) @@ -236,6 +259,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding public.vector, (1-(embedding<=>p_query_embedding))::float AS similarity, entities FROM memory_episodes WHERE p_include_episodes AND embedding IS NOT NULL + AND forgotten_at IS NULL AND (p_session_id IS NULL OR session_id = p_session_id) AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL) AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity @@ -257,6 +281,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding 
public.vector, (1-(embedding<=>p_query_embedding))::float, ARRAY[]::text[] FROM memory_semantic WHERE p_include_semantic AND embedding IS NOT NULL AND superseded_by IS NULL + AND forgotten_at IS NULL AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL) AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity ORDER BY embedding<=>p_query_embedding LIMIT p_match_count @@ -267,6 +292,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding public.vector, (1-(embedding<=>p_query_embedding))::float, ARRAY[]::text[] FROM memory_procedural WHERE p_include_procedural AND embedding IS NOT NULL + AND forgotten_at IS NULL AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL) AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity ORDER BY embedding<=>p_query_embedding LIMIT p_match_count @@ -295,6 +321,38 @@ BEGIN END; $$; +-- +-- Name: engram_mark_forgotten(text, uuid[]); Type: FUNCTION; Schema: public; Owner: - +-- + +-- Tombstone primitive for forget(). Sets forgotten_at and touches NOTHING else +-- (deliberately no access_count / confidence write — that was the inverted- +-- forget() bug). Idempotent: the `forgotten_at IS NULL` guard makes a repeat +-- forget a no-op (returns 0). Mirrors the per-store markForgotten storage +-- contract (returns the number of rows newly tombstoned). 
+CREATE OR REPLACE FUNCTION public.engram_mark_forgotten(p_memory_type text, p_ids uuid[]) RETURNS integer + LANGUAGE plpgsql SECURITY DEFINER + SET search_path TO 'public' + AS $$ +DECLARE v_count integer; +BEGIN + IF p_memory_type = 'episode' THEN + UPDATE memory_episodes SET forgotten_at = now() + WHERE id = ANY(p_ids) AND forgotten_at IS NULL; + ELSIF p_memory_type = 'semantic' THEN + UPDATE memory_semantic SET forgotten_at = now() + WHERE id = ANY(p_ids) AND forgotten_at IS NULL; + ELSIF p_memory_type = 'procedural' THEN + UPDATE memory_procedural SET forgotten_at = now() + WHERE id = ANY(p_ids) AND forgotten_at IS NULL; + ELSE + RAISE EXCEPTION 'engram_mark_forgotten: unknown memory_type %', p_memory_type; + END IF; + GET DIAGNOSTICS v_count = ROW_COUNT; + RETURN v_count; +END; $$; + + -- -- Name: engram_text_boost(text, integer, text); Type: FUNCTION; Schema: public; Owner: - -- @@ -312,6 +370,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_ ts_rank_cd(me.fts, to_tsquery('english', p_query_terms))::float AS rank_score FROM memory_episodes me WHERE me.fts @@ to_tsquery('english', p_query_terms) + AND me.forgotten_at IS NULL AND (p_session_id IS NULL OR me.session_id = p_session_id) AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL) @@ -330,6 +389,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_ FROM memory_semantic ms WHERE ms.fts @@ to_tsquery('english', p_query_terms) AND ms.superseded_by IS NULL + AND ms.forgotten_at IS NULL AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL) UNION ALL @@ -338,6 +398,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_ ts_rank_cd(mp.fts, to_tsquery('english', p_query_terms))::float FROM memory_procedural mp WHERE mp.fts @@ to_tsquery('english', p_query_terms) + AND mp.forgotten_at IS NULL AND (p_project_id IS NULL OR mp.project_id = p_project_id OR 
mp.project_id IS NULL) ) combined ORDER BY rank_score DESC @@ -380,6 +441,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public. me.entities, me.metadata FROM memory_episodes me WHERE me.embedding IS NOT NULL + AND me.forgotten_at IS NULL AND (p_session_id IS NULL OR me.session_id = p_session_id) AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL) @@ -405,6 +467,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public. ARRAY[]::text[], ms.metadata FROM memory_semantic ms WHERE ms.embedding IS NOT NULL AND ms.superseded_by IS NULL + AND ms.forgotten_at IS NULL AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL) UNION ALL @@ -417,6 +480,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public. ARRAY[]::text[], mp.metadata FROM memory_procedural mp WHERE mp.embedding IS NOT NULL + AND mp.forgotten_at IS NULL AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL) ORDER BY similarity DESC @@ -633,6 +697,7 @@ CREATE TABLE IF NOT EXISTS public.memory_episodes ( searchable_content text, fts tsvector GENERATED ALWAYS AS (to_tsvector('english'::regconfig, content)) STORED, project_id text, + forgotten_at timestamp with time zone, CONSTRAINT memory_episodes_role_check CHECK ((role = ANY (ARRAY['user'::text, 'assistant'::text, 'system'::text]))) ); @@ -678,6 +743,7 @@ CREATE TABLE IF NOT EXISTS public.memory_procedural ( updated_at timestamp with time zone DEFAULT now() NOT NULL, fts tsvector GENERATED ALWAYS AS (to_tsvector('english'::regconfig, ((trigger_text || ' '::text) || procedure))) STORED, project_id text, + forgotten_at timestamp with time zone, CONSTRAINT memory_procedural_category_check CHECK ((category = ANY (ARRAY['workflow'::text, 'preference'::text, 'habit'::text, 'pattern'::text, 'convention'::text]))), CONSTRAINT memory_procedural_confidence_check CHECK (((confidence >= 
(0.0)::double precision) AND (confidence <= (1.0)::double precision))), CONSTRAINT memory_procedural_decay_rate_check CHECK (((decay_rate > (0.0)::double precision) AND (decay_rate <= (1.0)::double precision))) @@ -708,6 +774,7 @@ CREATE TABLE IF NOT EXISTS public.memory_semantic ( valid_from timestamp with time zone, valid_until timestamp with time zone, project_id text, + forgotten_at timestamp with time zone, CONSTRAINT memory_knowledge_confidence_check CHECK (((confidence >= (0)::double precision) AND (confidence <= (1)::double precision))) ); @@ -750,6 +817,18 @@ CREATE TABLE IF NOT EXISTS public.sensory_snapshots ( ); +-- +-- forget() tombstone columns — idempotent ADD COLUMN for already-provisioned +-- DBs (CREATE TABLE IF NOT EXISTS above is a no-op there, so the column in the +-- table body never lands on an existing DB). Placed after the CREATE TABLEs and +-- before the partial indexes / post-apply smoke that read it. See the ordering +-- note at the top of this file. NOT added to memory_digests by design. +-- +ALTER TABLE public.memory_episodes ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone; +ALTER TABLE public.memory_semantic ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone; +ALTER TABLE public.memory_procedural ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone; + + -- -- Name: community_summaries community_summaries_pkey; Type: CONSTRAINT; Schema: public; Owner: - -- @@ -1115,6 +1194,19 @@ CREATE INDEX IF NOT EXISTS idx_write_buffer_created ON public.memory_write_buffe CREATE INDEX IF NOT EXISTS idx_write_buffer_status ON public.memory_write_buffer USING btree (status); +-- +-- forget() tombstone partial indexes: index only the (rare) tombstoned rows so +-- forgotten-row enumeration (Phase 2 reclamation / audit) is cheap. The hot +-- `forgotten_at IS NULL` recall predicate matches the majority of rows and is +-- driven by the vector/fts indexes; it needs no index of its own. 
Mirrors the +-- SQLite v5 `WHERE forgotten_at IS NOT NULL` partial indexes (lockstep). +-- + +CREATE INDEX IF NOT EXISTS idx_episodes_forgotten ON public.memory_episodes USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL); +CREATE INDEX IF NOT EXISTS idx_semantic_forgotten ON public.memory_semantic USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL); +CREATE INDEX IF NOT EXISTS idx_procedural_forgotten ON public.memory_procedural USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL); + + -- -- Name: episode_parts episode_parts_episode_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: - -- @@ -1283,6 +1375,33 @@ DROP POLICY IF EXISTS service_role_all ON public.sensory_snapshots; CREATE POLICY service_role_all ON public.sensory_snapshots TO service_role USING (true) WITH CHECK (true); +-- +-- Post-apply smoke (no migration runner exists to enforce column-before-function +-- ordering). Executes every recall RPC + the forget primitive against the just- +-- applied schema so a missing forgotten_at column or a broken gate fails HERE: it +-- aborts the apply under `psql -v ON_ERROR_STOP=1`, and otherwise surfaces as a +-- loud ERROR line in the apply log. Read-only except engram_mark_forgotten on the +-- nil UUID (matches nothing -> returns 0). Idempotent and safe to re-run. All +-- names schema-qualified because the dump sets search_path = ''. 
+-- + +DO $smoke$ +DECLARE + v_unit public.vector := ('[1' || repeat(',0', 1535) || ']')::public.vector; + v_n integer; +BEGIN + PERFORM public.engram_recall(v_unit, NULL, 1); + PERFORM public.engram_hybrid_recall('smoke', v_unit, 1); + PERFORM public.engram_text_boost('smoke', 1); + PERFORM public.engram_vector_search(v_unit, 1); + v_n := public.engram_mark_forgotten('episode', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]); + v_n := public.engram_mark_forgotten('semantic', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]); + v_n := public.engram_mark_forgotten('procedural', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]); + RAISE NOTICE 'engram schema smoke OK: 4 recall RPCs + engram_mark_forgotten callable; forgotten_at gate live'; +END; +$smoke$; + + -- -- PostgreSQL database dump complete -- diff --git a/packages/postgrest/test/forgotten-at-gate.test.ts b/packages/postgrest/test/forgotten-at-gate.test.ts new file mode 100644 index 0000000..099c82b --- /dev/null +++ b/packages/postgrest/test/forgotten-at-gate.test.ts @@ -0,0 +1,104 @@ +/** + * Phase 1 (forget tombstone) — schema-level regression gate. + * + * schema.sql is GENERATED from a production pg_dump. A future re-dump that + * forgets to carry the `forgotten_at IS NULL` predicate would silently make + * forget() leak again (the exact class of the inverted-forget bug). These + * assertions pin the invariant in the committed file. The runtime behaviour + * (forget removes from every recall path, sibling survives, access_count + * unchanged) is proven against live Postgres+pgvector; here we pin the source. + * + * Counts are exact on purpose: a dropped gate lowers a count; gating a wrong + * table (e.g. memory_digests, which must NOT be forgettable) raises it. 
+ */ +import { describe, it, expect } from 'vitest' +import { readFileSync } from 'node:fs' + +const schema = readFileSync(new URL('../schema.sql', import.meta.url), 'utf8') + +/** Extract a `CREATE OR REPLACE FUNCTION public.<name>(...) AS $$ <body> $$;` body. */ +function functionBody(name: string): string { + const re = new RegExp( + `CREATE OR REPLACE FUNCTION public\\.${name}\\([\\s\\S]*?AS \\$\\$([\\s\\S]*?)\\$\\$;`, + ) + const m = schema.match(re) + if (!m) throw new Error(`function ${name} not found in schema.sql`) + return m[1] +} + +function count(haystack: string, needle: string): number { + return haystack.split(needle).length - 1 +} + +// Branch counts per recall function: hybrid has ft+vs per type (2 each), +// the others have one branch per type. Digests are intentionally NOT gated. +const RECALL_FUNCTIONS = [ + { name: 'engram_hybrid_recall', forgottenGates: 6, supersededGates: 2 }, + { name: 'engram_recall', forgottenGates: 3, supersededGates: 1 }, + { name: 'engram_text_boost', forgottenGates: 3, supersededGates: 1 }, + { name: 'engram_vector_search', forgottenGates: 3, supersededGates: 1 }, +] as const + +describe('schema.sql forgotten_at recall gates', () => { + for (const fn of RECALL_FUNCTIONS) { + it(`${fn.name} gates episode+semantic+procedural on forgotten_at IS NULL (x${fn.forgottenGates})`, () => { + const body = functionBody(fn.name) + expect(count(body, 'forgotten_at IS NULL')).toBe(fn.forgottenGates) + }) + + it(`${fn.name} still carries the semantic superseded_by gate (x${fn.supersededGates})`, () => { + const body = functionBody(fn.name) + expect(count(body, 'superseded_by IS NULL')).toBe(fn.supersededGates) + }) + + it(`${fn.name} does NOT gate the digest branch (digests are not forgettable)`, () => { + const body = functionBody(fn.name) + // The digest CTE/branch references memory_digests via the `md` alias; + // it must never carry a forgotten_at predicate. 
+ expect(body).not.toMatch(/md\.forgotten_at/) + }) + } +}) + +describe('schema.sql engram_mark_forgotten primitive', () => { + const body = functionBody('engram_mark_forgotten') + + it('stamps forgotten_at for all three forgettable types', () => { + expect(body).toMatch(/UPDATE memory_episodes SET forgotten_at = now\(\)/) + expect(body).toMatch(/UPDATE memory_semantic SET forgotten_at = now\(\)/) + expect(body).toMatch(/UPDATE memory_procedural SET forgotten_at = now\(\)/) + }) + + it('is idempotent: only stamps rows not already forgotten', () => { + expect(count(body, 'forgotten_at IS NULL')).toBe(3) + }) + + it('touches NEITHER access_count NOR confidence (the inversion fix)', () => { + // Forget must be a pure tombstone — writing access_count rewarded the + // forgotten memory via accessBoost; writing confidence collides with decay. + expect(body).not.toMatch(/access_count/) + expect(body).not.toMatch(/confidence/) + }) +}) + +describe('schema.sql forgotten_at columns + indexes', () => { + it('adds an idempotent forgotten_at column to the 3 forgettable tables', () => { + for (const table of ['memory_episodes', 'memory_semantic', 'memory_procedural']) { + expect(schema).toMatch( + new RegExp(`ALTER TABLE public\\.${table} ADD COLUMN IF NOT EXISTS forgotten_at`), + ) + } + }) + + it('does NOT add forgotten_at to memory_digests', () => { + expect(schema).not.toMatch(/ALTER TABLE public\.memory_digests ADD COLUMN IF NOT EXISTS forgotten_at/) + }) + + it('creates a partial index on tombstoned rows for each forgettable table', () => { + for (const idx of ['idx_episodes_forgotten', 'idx_semantic_forgotten', 'idx_procedural_forgotten']) { + expect(schema).toMatch( + new RegExp(`CREATE INDEX IF NOT EXISTS ${idx} [\\s\\S]*?WHERE \\(forgotten_at IS NOT NULL\\)`), + ) + } + }) +}) From 45700f4d476e9b4ea8c834774362bd0db3526958 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:01:37 +0500 Subject: [PATCH 3/9] feat(graph,core): tombstone Neo4j Memory nodes 
so forget cannot leak through the graph MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the SQL forget gate, a forgotten memory could still surface through graph spreading activation — and would leak into authoritative recall once associations are merged into the scored pool. Close the graph channel. - GraphPort.forgetMemories?(ids): optional, capability-guarded port method. - NeuralGraph.forgetMemories stamps forgottenAt on :Memory {id} nodes (idempotent, returns count). Memory nodes are uniformly :Memory regardless of memoryType, so one match covers episode/semantic/procedural ids. - spreading-activation Cypher gates traversal on coalesce(n.forgottenAt, n.deletedAt) IS NULL (NULL-permissive), which also prunes paths that pass through a forgotten node. - Memory.forget() calls graph.forgetMemories after the SQL tombstone, capability-guarded and non-fatal: SQL stays the source of truth, and a graph hiccup or absent Neo4j never fails the forget. Verified on Neo4j 5: spreading-activation 12/12 incl. endpoint exclusion, path-through pruning, sibling survival, idempotency (1 then 0) and cross-type semantic gating. core 495/495, sqlite 106/106, typecheck clean. 
--- packages/core/src/adapters/graph.ts | 12 +++++ packages/core/src/memory.ts | 17 +++++++ packages/graph/src/neural-graph.ts | 29 +++++++++++ packages/graph/src/spreading-activation.ts | 3 ++ .../graph/test/spreading-activation.test.ts | 50 +++++++++++++++++++ 5 files changed, 111 insertions(+) diff --git a/packages/core/src/adapters/graph.ts b/packages/core/src/adapters/graph.ts index ddc73cd..4f58940 100644 --- a/packages/core/src/adapters/graph.ts +++ b/packages/core/src/adapters/graph.ts @@ -112,6 +112,18 @@ export interface GraphPort { lookupEntityNodes(names: string[]): Promise spreadActivation(opts: GraphSpreadActivationOpts): Promise strengthenTraversedEdges(pairs: Array<[string, string]>): Promise + + /** + * Phase 1 (forget tombstone): stamp `forgottenAt` on the Neo4j Memory nodes + * with these ids, so spreading activation excludes them — its path filter + * gates on `coalesce(n.forgottenAt, n.deletedAt) IS NULL`. Without this, + * forget() hides a memory in SQL recall but it can still surface through the + * graph association channel. Optional (only NeuralGraph implements it); core + * calls it capability-guarded and treats failure as non-fatal. Returns the + * number of nodes newly tombstoned. 
+ */ + forgetMemories?(ids: string[]): Promise<number> + // Wave 3: raw Cypher execution for consolidation operations // Returns the driver-native result type — consolidation code accesses // .records and .summary.counters via the GraphQueryResult shape, but diff --git a/packages/core/src/memory.ts b/packages/core/src/memory.ts index 15ab317..23e9752 100644 --- a/packages/core/src/memory.ts +++ b/packages/core/src/memory.ts @@ -907,6 +907,23 @@ export class Memory { if (idsByType.procedural.length > 0) await this.storage.procedural.markForgotten(idsByType.procedural) if (idsByType.episode.length > 0) await this.storage.episodes.markForgotten(idsByType.episode) + // Also tombstone the Neo4j Memory nodes so forget cannot leak back through + // the graph association channel: spreading activation gates traversal on + // coalesce(forgottenAt, deletedAt) IS NULL. SQL remains the source of truth — + // a graph hiccup (or absent Neo4j) must never fail the forget. Capability- + // guarded because forgetMemories is optional on GraphPort. + const graph = this._graph + if (graph && typeof graph.forgetMemories === 'function') { + const forgottenIds = [...idsByType.episode, ...idsByType.semantic, ...idsByType.procedural] + if (forgottenIds.length > 0) { + try { + await graph.forgetMemories(forgottenIds) + } catch (err) { + console.warn('[engram] forget: graph tombstone failed (non-fatal):', err) + } + } + } + return { count: filtered.length, previewed: filtered } } diff --git a/packages/graph/src/neural-graph.ts b/packages/graph/src/neural-graph.ts index 95d30d7..8c0a786 100644 --- a/packages/graph/src/neural-graph.ts +++ b/packages/graph/src/neural-graph.ts @@ -221,6 +221,35 @@ export class NeuralGraph { } } + /** + * Phase 1 (forget tombstone): stamp `forgottenAt` on the :Memory nodes with + * these ids so spreading activation excludes them — the path filter gates on + * `coalesce(n.forgottenAt, n.deletedAt) IS NULL`. Idempotent: only stamps + * nodes not already forgotten. 
Returns the number of nodes newly tombstoned. + * Memory nodes are uniformly `:Memory {id}` regardless of memoryType, so a + * single match covers episode/semantic/procedural ids. + */ + async forgetMemories(ids: string[]): Promise<number> { + if (ids.length === 0) return 0 + const now = new Date().toISOString() + const session = this.driver.session() + try { + const result = await session.executeWrite(async (tx: ManagedTransaction) => { + return tx.run( + `MATCH (m:Memory) + WHERE m.id IN $ids AND m.forgottenAt IS NULL + SET m.forgottenAt = $now + RETURN count(m) AS forgotten`, + { ids, now } + ) + }) + const raw = result.records[0]?.get('forgotten') + return neo4j.isInt(raw) ? (raw as { toNumber(): number }).toNumber() : Number(raw ?? 0) + } finally { + await session.close() + } + } + async addPersonNode(input: PersonNodeInput): Promise { const id = `person:${normalizeForId(input.name)}` const now = new Date().toISOString() diff --git a/packages/graph/src/spreading-activation.ts b/packages/graph/src/spreading-activation.ts index bab0485..fd9775d 100644 --- a/packages/graph/src/spreading-activation.ts +++ b/packages/graph/src/spreading-activation.ts @@ -43,6 +43,9 @@ export class SpreadingActivation { OR NOT n:Memory OR n.projectId = $projectId OR n.projectId IS NULL) + AND ALL(n IN nodes(path) WHERE + NOT n:Memory + OR coalesce(n.forgottenAt, n.deletedAt) IS NULL) WITH neighbor, reduce( activation = 1.0, diff --git a/packages/graph/test/spreading-activation.test.ts b/packages/graph/test/spreading-activation.test.ts index d382b36..6557d77 100644 --- a/packages/graph/test/spreading-activation.test.ts +++ b/packages/graph/test/spreading-activation.test.ts @@ -189,4 +189,54 @@ describe.skipIf(!neo4jReady)('SpreadingActivation (integration)', () => { }) expect(unscoped.map((r) => r.nodeId)).toContain('ep-beta') }) + + // Phase 1 — forget tombstone must close the graph channel. 
After forget, + // SQL recall hides the memory; without this gate the same memory still + // surfaces through graph spreading activation (and would leak once + // associations are merged into authoritative recall). + it('forgetMemories excludes the node and prunes paths through it; siblings survive', async () => { + // Chain a -> b -> c (all :Memory). Sibling a -> d, not through b. + await graph.addMemoryNode({ id: 'm-a', memoryType: 'episode', label: 'A' }) + await graph.addMemoryNode({ id: 'm-b', memoryType: 'episode', label: 'B' }) + await graph.addMemoryNode({ id: 'm-c', memoryType: 'episode', label: 'C' }) + await graph.addMemoryNode({ id: 'm-d', memoryType: 'episode', label: 'D' }) + await graph.addEdge('m-a', 'm-b', 'TEMPORAL', 0.9) + await graph.addEdge('m-b', 'm-c', 'TEMPORAL', 0.9) + await graph.addEdge('m-a', 'm-d', 'TEMPORAL', 0.9) + + const before = await activation.activate(['m-a'], { + maxHops: 2, decayPerHop: 0.8, minActivation: 0.001, maxNodes: 50, + }) + const beforeIds = before.map((r) => r.nodeId) + expect(beforeIds).toContain('m-b') // hop 1 + expect(beforeIds).toContain('m-c') // hop 2, only reachable through b + expect(beforeIds).toContain('m-d') // sibling + + expect(await graph.forgetMemories(['m-b'])).toBe(1) + + const after = await activation.activate(['m-a'], { + maxHops: 2, decayPerHop: 0.8, minActivation: 0.001, maxNodes: 50, + }) + const afterIds = after.map((r) => r.nodeId) + expect(afterIds).not.toContain('m-b') // forgotten endpoint excluded + expect(afterIds).not.toContain('m-c') // path through forgotten b is pruned + expect(afterIds).toContain('m-d') // independent path intact + }) + + it('forgetMemories is idempotent and gates a forgotten semantic node', async () => { + await graph.addMemoryNode({ id: 'm-x', memoryType: 'episode', label: 'X' }) + await graph.addMemoryNode({ id: 'm-y', memoryType: 'semantic', label: 'Y' }) + await graph.addEdge('m-x', 'm-y', 'TEMPORAL', 0.9) + + // NULL-permissive: an unforgotten node activates 
normally. + const before = await activation.activate(['m-x'], { maxHops: 1, decayPerHop: 0.8, minActivation: 0.001 }) + expect(before.map((r) => r.nodeId)).toContain('m-y') + + expect(await graph.forgetMemories(['m-y'])).toBe(1) // first stamp + expect(await graph.forgetMemories(['m-y'])).toBe(0) // already forgotten — idempotent + expect(await graph.forgetMemories([])).toBe(0) // empty no-op + + const after = await activation.activate(['m-x'], { maxHops: 1, decayPerHop: 0.8, minActivation: 0.001 }) + expect(after.map((r) => r.nodeId)).not.toContain('m-y') + }) }) From e539bfcf3d966a7a9ed48cb3de70c2c032592e46 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:18:22 +0500 Subject: [PATCH 4/9] =?UTF-8?q?feat(bench):=20make=20the=20metric=20see=20?= =?UTF-8?q?the=20graph=20=E2=80=94=20mergeAssociationsIntoScored=20(Phase?= =?UTF-8?q?=200,=20unit=201)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovery #1: both bench adapters scored only recallResult.memories, but graph spreading-activation output lands in the separate recallResult.associations channel. So graph:true vs graph:false mathematically could not move recall@K — "the graph doesn't help" was a measurement-instrumentation bug, not a verdict. - mergeAssociationsIntoScored(recallResult, flag): when the flag is set, unions associations after the memory channel; otherwise returns memories unchanged. - BenchmarkOpts.mergeAssociationsIntoTopK (default false → byte-identical runs). - Wired into both LongMemEval (runQuestion) and LoCoMo (evaluateDataset) scoring. - Score-scale-safe by construction: both adapters score by gold-id set-membership over the deduped top-K, not score magnitude. So unioning the graph-relevance- ranked associations after the MMR/cross-encoder-ranked memories cannot be confounded by the scale mismatch — a gold id is either in the first K deduped ids or it is not. 
Memory-first ordering means associations can only RESCUE a gold id the memory channel missed, never displace one. - associations-visible-to-scored.test.ts: deterministic invariant (no Neo4j, no LLM, no dataset) — gold present in the scored pool with merge ON, absent OFF. This is the "associations-visible" invariant the symmetric kill criterion (later unit) depends on. - Adds packages/bench/vitest.config.ts so the gate test runs under turbo/CI. Default off → zero behaviour change to existing runs. bench typecheck clean, 4/4. --- packages/bench/src/index.ts | 2 + packages/bench/src/locomo/adapter.ts | 7 +- packages/bench/src/longmemeval/adapter.ts | 5 +- packages/bench/src/merge-associations.ts | 42 +++++++++ packages/bench/src/types.ts | 9 ++ .../associations-visible-to-scored.test.ts | 92 +++++++++++++++++++ packages/bench/vitest.config.ts | 9 ++ 7 files changed, 163 insertions(+), 3 deletions(-) create mode 100644 packages/bench/src/merge-associations.ts create mode 100644 packages/bench/test/associations-visible-to-scored.test.ts create mode 100644 packages/bench/vitest.config.ts diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index e3393e8..8cd2dbe 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -4,6 +4,8 @@ export { compareLoCoMo, compareLongMemEval } from './runner/compare.js' export { computeRetrievalF1, recallAtK } from './metrics/f1.js' export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js' export { createBenchMemory } from './memory-factory.js' +export { mergeAssociationsIntoScored } from './merge-associations.js' +export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js' export type { BenchmarkOpts, BenchmarkMetrics, LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics, diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts index 14e460a..da384d9 100644 --- 
a/packages/bench/src/locomo/adapter.ts +++ b/packages/bench/src/locomo/adapter.ts @@ -8,6 +8,7 @@ import type { import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js' import { computeRetrievalF1 } from '../metrics/f1.js' import { createBenchMemory } from '../memory-factory.js' +import { mergeAssociationsIntoScored } from '../merge-associations.js' export class LoCoMoAdapter { async loadDataset(dataPath: string): Promise { @@ -184,7 +185,7 @@ export class LoCoMoAdapter { async evaluateDataset( conversations: LoCoMoConversationFile[], memory: Memory, - opts?: Pick, + opts?: Pick, ): Promise { const topK = opts?.topK ?? 10 const convResults: LoCoMoConversationResult[] = [] @@ -195,7 +196,9 @@ export class LoCoMoAdapter { for (const qa of conv.qa) { const recallResult = await memory.recall(qa.question) - const topMemories = recallResult.memories.slice(0, topK) + const topMemories = mergeAssociationsIntoScored( + recallResult, opts?.mergeAssociationsIntoTopK, + ).slice(0, topK) const prediction = topMemories .map(m => m.content) diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts index 87b028b..9afff54 100644 --- a/packages/bench/src/longmemeval/adapter.ts +++ b/packages/bench/src/longmemeval/adapter.ts @@ -22,6 +22,7 @@ import type { } from '../types.js' import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js' import { createBenchMemory } from '../memory-factory.js' +import { mergeAssociationsIntoScored } from '../merge-associations.js' export class LongMemEvalAdapter { /** @@ -145,7 +146,9 @@ export class LongMemEvalAdapter { const evalStart = Date.now() const recallResult = await memory.recall(question.question) - const topMemories = recallResult.memories.slice(0, topK) + const topMemories = mergeAssociationsIntoScored( + recallResult, opts?.mergeAssociationsIntoTopK, + ).slice(0, topK) // Deduplicate retrieved sessions in rank order const seen = new Set() diff --git 
a/packages/bench/src/merge-associations.ts b/packages/bench/src/merge-associations.ts new file mode 100644 index 0000000..d20b05f --- /dev/null +++ b/packages/bench/src/merge-associations.ts @@ -0,0 +1,42 @@ +import type { Memory } from '@engram-mem/core' + +// Derived from Memory.recall()'s return type — core does not re-export +// RecallResult / RetrievedMemory by name, so we pin the shape structurally. +// This keeps the bench decoupled from core's internal module paths. +export type BenchRecallResult = Awaited> +export type BenchScoredMemory = BenchRecallResult['memories'][number] + +/** + * The pool a bench adapter scores. By default this is just the SQL/vector + * recall channel (`recallResult.memories`). When `mergeAssociationsIntoTopK` + * is true, the graph spreading-activation channel (`recallResult.associations`) + * is appended so it becomes visible to recall@K. + * + * Why this is the fix for "the benchmark literally cannot see the graph": + * both bench adapters score by gold-id SET-MEMBERSHIP in the deduped top-K + * (LongMemEval matches answer_session_ids against metadata.lmeSessionId; + * LoCoMo matches qa.evidence against metadata.locomoDiaId). Membership, not + * score magnitude, decides a hit — so unioning the graph-relevance-ranked + * associations after the MMR/cross-encoder-ranked memories is scale-safe by + * construction: a gold id is either in the first K deduped ids or it is not. + * No cross-encoder re-run over the union is needed. + * + * Ordering is memories-first, associations-appended: the graph channel can + * only RESCUE a gold id the memory channel missed; it never moves a + * memory-channel entry. After the adapters' slice to top-K, associations fill + * only the slots the memories leave free (fewer than K memories returned). + * That asymmetry is precisely the question — does the graph recover misses? — so we measure it directly.
+ * + * Associations carry the same `metadata` as their source memory (spreading + * activation spreads `...episode.metadata`), so the gold-id keys ride through. + * + * With the flag false (default) this returns `recallResult.memories` by + * reference — byte-identical behaviour to pre-Phase-0 runs. + */ +export function mergeAssociationsIntoScored( + recallResult: BenchRecallResult, + mergeAssociationsIntoTopK: boolean | undefined, +): BenchScoredMemory[] { + if (!mergeAssociationsIntoTopK) return recallResult.memories + return [...recallResult.memories, ...recallResult.associations] +} diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts index db39517..02bea19 100644 --- a/packages/bench/src/types.ts +++ b/packages/bench/src/types.ts @@ -7,6 +7,15 @@ export interface BenchmarkOpts { topK?: number // default 10 limit?: number // max conversations to evaluate (default: all) noRerank?: boolean // disable cross-encoder reranking for A/B comparison + /** + * Phase 0: merge the graph spreading-activation channel + * (`recallResult.associations`) into the scored top-K pool before recall@K + * is computed. Default false → byte-identical to pre-Phase-0 runs. The + * adapters score by gold-id set-membership (scale-independent), so unioning + * the graph-relevance-ranked associations after the MMR/rerank'd memories is + * safe. This is what makes graph:true vs graph:false able to move the metric. + */ + mergeAssociationsIntoTopK?: boolean /** * Cross-encoder backend. 'openai' (default) uses LLM pointwise scoring via * gpt-4o-mini; 'onnx' uses a local mxbai-rerank ONNX model (no API cost, diff --git a/packages/bench/test/associations-visible-to-scored.test.ts b/packages/bench/test/associations-visible-to-scored.test.ts new file mode 100644 index 0000000..fdef1f5 --- /dev/null +++ b/packages/bench/test/associations-visible-to-scored.test.ts @@ -0,0 +1,92 @@ +/** + * Phase 0 invariant — the metric MUST be able to see the graph channel. 
+ * + * Discovery #1: both bench adapters score only `recallResult.memories`, but + * graph spreading-activation output lands in the separate + * `recallResult.associations` channel. So `graph:true` vs `graph:false` + * mathematically could not move recall@K — the graph was never measured. + * + * This pins the fix: with the merge flag ON, a gold id that lives ONLY in + * associations becomes visible to the scored top-K; with it OFF, the scored + * pool is byte-identical to today. This is the "associations-visible + * invariant" the symmetric kill-criterion (Unit 8) later depends on. + * + * Deterministic: a hand-built recall result, no dataset, no Neo4j, no LLM. + */ +import { describe, it, expect } from 'vitest' +import { + mergeAssociationsIntoScored, + type BenchRecallResult, + type BenchScoredMemory, +} from '../src/merge-associations.js' + +const GOLD_SESSION = 'gold-sess' + +function mem(id: string, lmeSessionId: string): BenchScoredMemory { + return { + id, + type: 'episode', + content: `content-${id}`, + relevance: 0.9, // strong channel — post MMR/rerank + source: 'recall', + metadata: { lmeSessionId }, + } +} + +function assoc(id: string, lmeSessionId: string): BenchScoredMemory { + return { + id, + type: 'episode', + content: `content-${id}`, + relevance: 0.12, // graph-relevance scale — deliberately lower than memories + source: 'association', + metadata: { lmeSessionId, activationSource: 'spreading_activation' }, + } +} + +// Gold session lives ONLY in the association channel; the memory channel holds +// only noise. This is the case the graph is supposed to rescue. +function makeFixture(): BenchRecallResult { + return { + memories: [mem('m1', 'noise-a'), mem('m2', 'noise-b')], + associations: [assoc('a1', GOLD_SESSION)], + intent: {} as BenchRecallResult['intent'], + primed: [], + estimatedTokens: 0, + formatted: '', + } +} + +// Mirror the adapters' gold-id set-membership over the deduped top-K. 
+function goldInTopK(pool: BenchScoredMemory[], k = 10): boolean { + const seen = new Set() + for (const m of pool.slice(0, k)) { + const sid = m.metadata?.['lmeSessionId'] as string | undefined + if (sid) seen.add(sid) + } + return seen.has(GOLD_SESSION) +} + +describe('associations are visible to the scored pool iff merge is ON', () => { + it('gold session is ABSENT from the scored pool when merge is OFF', () => { + const scored = mergeAssociationsIntoScored(makeFixture(), false) + expect(goldInTopK(scored)).toBe(false) + }) + + it('gold session is PRESENT in the scored pool when merge is ON', () => { + const scored = mergeAssociationsIntoScored(makeFixture(), true) + expect(goldInTopK(scored)).toBe(true) + }) + + it('merge OFF returns the memories array unchanged (no behaviour drift)', () => { + const fixture = makeFixture() + expect(mergeAssociationsIntoScored(fixture, false)).toBe(fixture.memories) + expect(mergeAssociationsIntoScored(fixture, undefined)).toBe(fixture.memories) + }) + + it('merge ON appends associations after memories (memory-first ordering)', () => { + const fixture = makeFixture() + const scored = mergeAssociationsIntoScored(fixture, true) + expect(scored.map((m) => m.id)).toEqual(['m1', 'm2', 'a1']) + }) +}) diff --git a/packages/bench/vitest.config.ts b/packages/bench/vitest.config.ts new file mode 100644 index 0000000..c03431f --- /dev/null +++ b/packages/bench/vitest.config.ts @@ -0,0 +1,9 @@ +import { defineConfig } from 'vitest/config' + +export default defineConfig({ + test: { + include: ['test/**/*.test.ts'], + environment: 'node', + testTimeout: 10000, + }, +}) From 1d7012a427b337fc2d04dc4979cb625aa0afdec4 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:20:17 +0500 Subject: [PATCH 5/9] feat(bench): symmetric kill criterion for the graph bet (Phase 0) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Encodes the red-team rule that prevents the historical mistake — 
concluding "kill the graph" from the saturated LongMemEval-S aggregate (~98.8% recall@5, where nothing has headroom to move). graphVerdict() demands POSITIVE evidence of no-effect before a kill: - kill ONLY when the primary aggregate delta is null/negative AND graphEffect is flat (≤ epsilon) on a graph-visible split with n ≥ 100, and the associations-visible invariant is green. Never the aggregate alone. - keep as soon as graphEffect clears epsilon on a powered, visible split — even when the saturated aggregate is flat or negative. - insufficient_power when underpowered (n<100), when the invariant is red, or in the ambiguous aggregate-positive-but-flat-effect case. Pure and deterministic; 6 table-driven cases pin the asymmetry, the power gate, and the invariant dependency. bench typecheck clean, 10/10. --- packages/bench/src/index.ts | 2 + packages/bench/src/metrics/graph-verdict.ts | 70 ++++++++++++++++++ packages/bench/test/graph-verdict.test.ts | 78 +++++++++++++++++++++ 3 files changed, 150 insertions(+) create mode 100644 packages/bench/src/metrics/graph-verdict.ts create mode 100644 packages/bench/test/graph-verdict.test.ts diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index 8cd2dbe..5b25b78 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -6,6 +6,8 @@ export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from export { createBenchMemory } from './memory-factory.js' export { mergeAssociationsIntoScored } from './merge-associations.js' export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js' +export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js' +export type { GraphVerdict, GraphVerdictInput } from './metrics/graph-verdict.js' export type { BenchmarkOpts, BenchmarkMetrics, LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics, diff --git a/packages/bench/src/metrics/graph-verdict.ts 
b/packages/bench/src/metrics/graph-verdict.ts new file mode 100644 index 0000000..7ce9505 --- /dev/null +++ b/packages/bench/src/metrics/graph-verdict.ts @@ -0,0 +1,70 @@ +// Phase 0 — symmetric kill criterion for the graph bet. +// +// The historical failure: concluding "the graph doesn't help, kill it" from a +// SATURATED aggregate (LongMemEval-S sits at ~98.8% recall@5 — there is almost +// no headroom for ANY change to move it). A null delta there means "no signal", +// not "no value". This module encodes the red-team rule that makes killing the +// graph require POSITIVE evidence of no-effect on a split where an effect could +// actually show up, with enough samples to trust it. + +export type GraphVerdict = 'kill' | 'keep' | 'insufficient_power' + +export interface GraphVerdictInput { + /** + * Primary aggregate delta: recall@K(graph) − recall@K(no-graph) on the full + * (saturated) corpus, e.g. LongMemEval-S. Near-zero is expected even if the + * graph helps, because the corpus is saturated — so this alone never decides. + */ + primaryAggregateDelta: number + /** + * graphEffect: recall@K(merge ON) − recall@K(merge OFF) on the GRAPH-VISIBLE + * split (questions where the graph channel could plausibly contribute). This + * is the scale-independent set-membership lift — the signal that matters. + */ + graphEffect: number + /** Size of the graph-visible split. Below MIN_POWER_N, no verdict is allowed. */ + graphVisibleN: number + /** + * The associations-visible-to-scored invariant must be green: if the metric + * structurally cannot see the graph channel, every delta is measurement noise + * and no kill/keep verdict is trustworthy. + */ + associationsVisibleInvariantGreen: boolean + /** Equivalence margin below which graphEffect counts as "flat". */ + epsilon?: number +} + +/** Minimum graph-visible sample size to render any verdict. Below this → no decision. */ +export const MIN_POWER_N = 100 +/** Default flatness margin for graphEffect. 
*/ +export const DEFAULT_EPSILON = 0.005 + +/** + * Render a verdict on the graph bet. + * + * - `keep` — graphEffect > ε on a powered (n≥100), graph-visible split. + * - `kill` — BOTH a null/negative aggregate delta AND a flat + * (≤ ε) graphEffect, on a powered split, with the + * invariant green. Never the aggregate alone. + * - `insufficient_power` — invariant red, OR n < 100, OR the ambiguous case + * (aggregate positive but graphEffect flat). + */ +export function graphVerdict(input: GraphVerdictInput): GraphVerdict { + const epsilon = input.epsilon ?? DEFAULT_EPSILON + + // The metric must be able to see the graph at all — else any delta is noise. + if (!input.associationsVisibleInvariantGreen) return 'insufficient_power' + + // Underpowered → never decide. Deciding on n << 100 was the historical error. + if (input.graphVisibleN < MIN_POWER_N) return 'insufficient_power' + + // The graph demonstrably helps the visible split → keep. + if (input.graphEffect > epsilon) return 'keep' + + // graphEffect is flat (≤ ε). Killing additionally requires the aggregate to + // be null/negative — BOTH conditions, never the saturated aggregate alone. + if (input.primaryAggregateDelta <= 0) return 'kill' + + // Aggregate positive but graphEffect flat: ambiguous → no decision. + return 'insufficient_power' +} diff --git a/packages/bench/test/graph-verdict.test.ts b/packages/bench/test/graph-verdict.test.ts new file mode 100644 index 0000000..01fc770 --- /dev/null +++ b/packages/bench/test/graph-verdict.test.ts @@ -0,0 +1,78 @@ +/** + * Phase 0 — symmetric kill criterion regression gate. + * + * Locks the rule that prevents the historical mistake: never conclude "kill the + * graph" from the saturated aggregate alone. Killing requires BOTH a null/ + * negative aggregate delta AND a flat graphEffect on a powered (n≥100), + * graph-visible split, with the associations-visible invariant green. 
+ */ +import { describe, it, expect } from 'vitest' +import { + graphVerdict, + MIN_POWER_N, + DEFAULT_EPSILON, + type GraphVerdictInput, +} from '../src/metrics/graph-verdict.js' + +const powered: Pick = { + graphVisibleN: 150, + associationsVisibleInvariantGreen: true, +} + +describe('graphVerdict — symmetric kill criterion', () => { + it('KEEP when graphEffect clears epsilon on a powered, visible split', () => { + expect(graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: 0.04 })).toBe('keep') + // Keep holds even when the saturated aggregate is flat/negative. + expect(graphVerdict({ ...powered, primaryAggregateDelta: -0.002, graphEffect: 0.03 })).toBe('keep') + }) + + it('KILL only when BOTH the aggregate is null/negative AND graphEffect is flat', () => { + expect(graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: 0 })).toBe('kill') + expect(graphVerdict({ ...powered, primaryAggregateDelta: -0.01, graphEffect: 0.002 })).toBe('kill') + }) + + it('does NOT kill on a flat graphEffect when the aggregate is POSITIVE (the key asymmetry)', () => { + expect(graphVerdict({ ...powered, primaryAggregateDelta: 0.01, graphEffect: 0 })).toBe('insufficient_power') + }) + + it('never decides below the power threshold, even with a flat effect + negative aggregate', () => { + expect( + graphVerdict({ + graphVisibleN: MIN_POWER_N - 1, + associationsVisibleInvariantGreen: true, + primaryAggregateDelta: -0.05, + graphEffect: 0, + }), + ).toBe('insufficient_power') + // Exactly at the threshold is enough to decide. + expect( + graphVerdict({ + graphVisibleN: MIN_POWER_N, + associationsVisibleInvariantGreen: true, + primaryAggregateDelta: 0, + graphEffect: 0, + }), + ).toBe('kill') + }) + + it('never decides when the associations-visible invariant is red', () => { + // Would otherwise be a clear KEEP, but the metric cannot see the graph. 
+ expect( + graphVerdict({ + graphVisibleN: 500, + associationsVisibleInvariantGreen: false, + primaryAggregateDelta: 0.1, + graphEffect: 0.2, + }), + ).toBe('insufficient_power') + }) + + it('treats graphEffect exactly at epsilon as flat (not a keep)', () => { + expect( + graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: DEFAULT_EPSILON }), + ).toBe('kill') + expect( + graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: DEFAULT_EPSILON + 1e-6 }), + ).toBe('keep') + }) +}) From ed1e49b852552e57a6373cce32efd36517c1338a Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:49:28 +0500 Subject: [PATCH 6/9] feat(bench): requireGraph guard + per-unit graph isolation (Phase 0, units 3-4) createBenchMemory now returns a {memory, config, graphActuallyWired} handle instead of a bare Memory, so graph cells can reach the graph handle and hard- fail when it is absent. - requireGraph(handle): throws if a graph cell runs without a real bench Neo4j, killing the silent SQL-only fallback that would otherwise report a SQL delta as a graph result (the "graph was never measured" trap). Lives in a dependency-light bench-memory-handle module so the guard + types are unit-testable without loading the ONNX native binding. - wipeBenchGraph wired into LongMemEval runQuestion and LoCoMo runConversation before ingest: Neo4j is a shared external process (unlike the per-call fresh :memory: SQLite), so each question/conversation must start with a clean graph or the previous unit's nodes pollute spreading activation. (wipeBenchGraph existed but was called nowhere.) - Migrated all 5 createBenchMemory callers to destructure the handle. bench typecheck clean, 12/12. 
--- packages/bench/src/bench-memory-handle.ts | 39 +++++++++++++++++++ packages/bench/src/index.ts | 4 +- packages/bench/src/locomo/adapter.ts | 8 +++- .../locomo/forensics/local-recall-sweep.ts | 2 +- packages/bench/src/locomo/judge-adapter.ts | 2 +- packages/bench/src/longmemeval/adapter.ts | 7 +++- .../src/longmemeval/forensics/recall-sweep.ts | 2 +- packages/bench/src/memory-factory.ts | 16 ++++++-- packages/bench/test/require-graph.test.ts | 33 ++++++++++++++++ 9 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 packages/bench/src/bench-memory-handle.ts create mode 100644 packages/bench/test/require-graph.test.ts diff --git a/packages/bench/src/bench-memory-handle.ts b/packages/bench/src/bench-memory-handle.ts new file mode 100644 index 0000000..8b95f4f --- /dev/null +++ b/packages/bench/src/bench-memory-handle.ts @@ -0,0 +1,39 @@ +// Dependency-light home for the bench memory handle + the requireGraph guard. +// Kept separate from memory-factory.ts (which pulls in heavy runtime deps like +// the ONNX reranker) so the guard and its types stay unit-testable without +// loading native binaries. All imports here are type-only → erased at runtime. +import type { Memory } from '@engram-mem/core' +import type { NeuralGraph } from '@engram-mem/graph' +import type { RerankerBackend } from './types.js' + +/** What createBenchMemory wired up — exposed so graph cells can reach the graph. */ +export interface BenchMemoryConfig { + graph: NeuralGraph | null + rerankerBackend: RerankerBackend +} + +export interface BenchMemoryHandle { + memory: Memory + config: BenchMemoryConfig + /** True iff a real bench Neo4j was wired (env present AND reachable). */ + graphActuallyWired: boolean +} + +/** + * Hard-fail guard for graph cells. A graph cell that runs without a real Neo4j + * silently falls back to SQL-only and would report a SQL delta as a graph + * result — the exact "the graph was never measured" trap. 
Convert that silent + * fallback into a loud throw so a mis-provisioned matrix cell fails fast instead + * of fabricating a graph number. + */ +export function requireGraph(handle: BenchMemoryHandle): NeuralGraph { + if (!handle.graphActuallyWired || !handle.config.graph) { + throw new Error( + '[engram-bench] requireGraph: a graph cell was requested but the bench ' + + 'Neo4j is not wired. Set ENGRAM_BENCH_NEO4J_URI + ENGRAM_BENCH_NEO4J_PASSWORD ' + + '(a bench-specific Neo4j, NOT the production NEO4J_URI). Refusing to report ' + + 'a SQL-only result as a graph result.', + ) + } + return handle.config.graph +} diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index 5b25b78..7f7e8c9 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -3,7 +3,9 @@ export { LongMemEvalAdapter } from './longmemeval/adapter.js' export { compareLoCoMo, compareLongMemEval } from './runner/compare.js' export { computeRetrievalF1, recallAtK } from './metrics/f1.js' export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js' -export { createBenchMemory } from './memory-factory.js' +export { createBenchMemory, requireGraph } from './memory-factory.js' +export type { BenchMemoryHandle, BenchMemoryConfig } from './memory-factory.js' +export { wipeBenchGraph, tryCreateBenchGraph } from './bench-graph.js' export { mergeAssociationsIntoScored } from './merge-associations.js' export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js' export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js' diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts index da384d9..34798fd 100644 --- a/packages/bench/src/locomo/adapter.ts +++ b/packages/bench/src/locomo/adapter.ts @@ -9,6 +9,7 @@ import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js' import { computeRetrievalF1 } from '../metrics/f1.js' import { createBenchMemory } 
from '../memory-factory.js' import { mergeAssociationsIntoScored } from '../merge-associations.js' +import { wipeBenchGraph } from '../bench-graph.js' export class LoCoMoAdapter { async loadDataset(dataPath: string): Promise { @@ -252,7 +253,12 @@ export class LoCoMoAdapter { conv: LoCoMoConversationFile, opts?: BenchmarkOpts, ): Promise<{ result: LoCoMoConversationResult; ingestMs: number; evalMs: number }> { - const memory = await createBenchMemory(opts) + const { memory, config } = await createBenchMemory(opts) + + // Per-conversation graph isolation: Neo4j is shared, so wipe before ingest + // or the previous conversation's nodes pollute this one's spreading + // activation (matching the per-conv fresh :memory: SQLite invariant). + if (config.graph) await wipeBenchGraph(config.graph) const ingestStart = Date.now() const { episodesIngested, sessionsCreated } = await this.ingestConversation(conv, memory, { diff --git a/packages/bench/src/locomo/forensics/local-recall-sweep.ts b/packages/bench/src/locomo/forensics/local-recall-sweep.ts index 3588e83..a3ad6e8 100644 --- a/packages/bench/src/locomo/forensics/local-recall-sweep.ts +++ b/packages/bench/src/locomo/forensics/local-recall-sweep.ts @@ -88,7 +88,7 @@ async function main(): Promise { const convStart = Date.now() console.log(`[${i + 1}/${conversations.length}] ${convId} — fresh memory + ingest`) - const memory = await createBenchMemory(benchOpts) + const { memory } = await createBenchMemory(benchOpts) try { const ingestStart = Date.now() diff --git a/packages/bench/src/locomo/judge-adapter.ts b/packages/bench/src/locomo/judge-adapter.ts index 047ad08..8fabe9e 100644 --- a/packages/bench/src/locomo/judge-adapter.ts +++ b/packages/bench/src/locomo/judge-adapter.ts @@ -363,7 +363,7 @@ async function benchConversation( const nQs = opts.smoke ? (opts.smokeQuestions ?? 
5) : qas.length console.log(` [engram-mem] Conv ${convIdx} (${sid}): ingesting...`) - const memory = await createBenchMemory({ + const { memory } = await createBenchMemory({ graph: opts.graph ?? false, ...(opts.rerankerBackend ? { rerankerBackend: opts.rerankerBackend } : {}), ...(opts.onnxRerankerModel ? { onnxRerankerModel: opts.onnxRerankerModel } : {}), diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts index 9afff54..d8fb8dd 100644 --- a/packages/bench/src/longmemeval/adapter.ts +++ b/packages/bench/src/longmemeval/adapter.ts @@ -23,6 +23,7 @@ import type { import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js' import { createBenchMemory } from '../memory-factory.js' import { mergeAssociationsIntoScored } from '../merge-associations.js' +import { wipeBenchGraph } from '../bench-graph.js' export class LongMemEvalAdapter { /** @@ -136,10 +137,14 @@ export class LongMemEvalAdapter { ingestMs: number evalMs: number }> { - const memory = await createBenchMemory(opts) + const { memory, config } = await createBenchMemory(opts) const topK = opts?.topK ?? 10 try { + // Per-question graph isolation: Neo4j is a shared external process (unlike + // the per-call fresh :memory: SQLite), so wipe it before ingest or prior + // questions' nodes pollute this question's spreading activation. + if (config.graph) await wipeBenchGraph(config.graph) const ingestStart = Date.now() const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory) const ingestMs = Date.now() - ingestStart diff --git a/packages/bench/src/longmemeval/forensics/recall-sweep.ts b/packages/bench/src/longmemeval/forensics/recall-sweep.ts index a53c506..9ba0adf 100644 --- a/packages/bench/src/longmemeval/forensics/recall-sweep.ts +++ b/packages/bench/src/longmemeval/forensics/recall-sweep.ts @@ -86,7 +86,7 @@ async function main(): Promise { // BUT — runQuestion currently slices to topK before computing recall@K. 
// For the sweep we want a fuller view: retrieve max(K_VALUES) once, then // compute recall@K from the same list. We need a slightly different path. - const memory = await createBenchMemory(benchOpts) + const { memory } = await createBenchMemory(benchOpts) let episodes = 0 let ingestMs = 0 let evalMs = 0 diff --git a/packages/bench/src/memory-factory.ts b/packages/bench/src/memory-factory.ts index 57ed250..bc1295e 100644 --- a/packages/bench/src/memory-factory.ts +++ b/packages/bench/src/memory-factory.ts @@ -1,10 +1,16 @@ import { SqliteStorageAdapter } from '@engram-mem/sqlite' import { openaiIntelligence } from '@engram-mem/openai' import { createMemory } from '@engram-mem/core' -import type { Memory, IntelligenceAdapter } from '@engram-mem/core' +import type { IntelligenceAdapter } from '@engram-mem/core' import { createOnnxReranker, type OnnxReranker } from '@engram-mem/rerank-onnx' import type { BenchmarkOpts } from './types.js' import { tryCreateBenchGraph } from './bench-graph.js' +import type { BenchMemoryHandle } from './bench-memory-handle.js' + +// Re-exported so existing importers of these from memory-factory keep working; +// the definitions live in the dependency-light bench-memory-handle module. +export type { BenchMemoryConfig, BenchMemoryHandle } from './bench-memory-handle.js' +export { requireGraph } from './bench-memory-handle.js' /** * Create an in-memory SQLite-backed Memory instance for benchmark use. @@ -30,7 +36,7 @@ import { tryCreateBenchGraph } from './bench-graph.js' * 'onnx' → local mxbai-rerank ONNX model * 'none' → rerank disabled (same as noRerank) */ -export async function createBenchMemory(opts?: BenchmarkOpts): Promise { +export async function createBenchMemory(opts?: BenchmarkOpts): Promise { const storage = new SqliteStorageAdapter(':memory:') const apiKey = opts?.openaiApiKey ?? 
process.env['OPENAI_API_KEY'] @@ -51,7 +57,11 @@ export async function createBenchMemory(opts?: BenchmarkOpts): Promise { }) await memory.initialize() - return memory + return { + memory, + config: { graph, rerankerBackend: backend }, + graphActuallyWired: graph !== null, + } } function resolveBackend(opts?: BenchmarkOpts): 'openai' | 'onnx' | 'none' { diff --git a/packages/bench/test/require-graph.test.ts b/packages/bench/test/require-graph.test.ts new file mode 100644 index 0000000..e667908 --- /dev/null +++ b/packages/bench/test/require-graph.test.ts @@ -0,0 +1,33 @@ +/** + * Phase 0 — requireGraph hard-fail guard. + * + * A graph matrix cell that runs without a real bench Neo4j would silently fall + * back to SQL-only and report a SQL delta as a "graph" result — the exact + * measurement trap Phase 0 exists to kill. requireGraph converts that silent + * fallback into a loud throw. (The success path needs a live NeuralGraph and is + * exercised by the matrix runner against a real bench Neo4j.) 
+ */ +import { describe, it, expect } from 'vitest' +import { requireGraph, type BenchMemoryHandle } from '../src/bench-memory-handle.js' + +const fakeMemory = {} as BenchMemoryHandle['memory'] + +describe('requireGraph', () => { + it('throws when the graph was never wired (no silent SQL-only fallback)', () => { + const handle: BenchMemoryHandle = { + memory: fakeMemory, + config: { graph: null, rerankerBackend: 'none' }, + graphActuallyWired: false, + } + expect(() => requireGraph(handle)).toThrow(/Neo4j is not wired/) + }) + + it('throws defensively when graphActuallyWired is true but the handle is null', () => { + const handle: BenchMemoryHandle = { + memory: fakeMemory, + config: { graph: null, rerankerBackend: 'openai' }, + graphActuallyWired: true, + } + expect(() => requireGraph(handle)).toThrow() + }) +}) From dd9c6c4b53d7919f03073cbf28d2831f51991e6f Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 04:53:30 +0500 Subject: [PATCH 7/9] feat(bench): recall-structure classifier + graphEffect metric + cat 2/3 gate filter (Phase 0, units 6-8b) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - classifyRecallStructure: deterministic label {lookup, multi_hop, temporal, aggregation} from dataset signals (LoCoMo category, LongMemEval ability) with a gold-cardinality + temporal-token heuristic fallback. GRAPH_RELEVANT = {multi_hop, temporal} — the split where spreading activation should help. - computeGraphEffect: recall@K(merge ON) − recall@K(merge OFF) on the graph-relevant split (or the stronger graph-visible split when per-question graphCouldContribute is supplied). This is the scale-independent lift that feeds graphVerdict; an empty split returns zero effect, so with the n<100 power gate no decision is ever fabricated. - LoCoMo categories filter (BenchmarkOpts.categories): score only the requested categories (e.g. 
// Phase 0 — label a question's recall STRUCTURE so graphEffect is measured on
// the graph-relevant split (multi_hop/temporal, where spreading activation
// should help) instead of the saturated aggregate. Deterministic by design:
// no LLM in the gate path, so the committed labels are reproducible.

/** The recall structures a benchmark question can be labelled with. */
export type RecallStructure = 'lookup' | 'multi_hop' | 'temporal' | 'aggregation'

/** Everything the classifier is allowed to look at for one question. */
export interface QuestionContext {
  question: string
  goldAnswer: string
  /** Gold evidence ids: LoCoMo dia ids, or LongMemEval answer_session_ids. */
  goldIds: string[]
  /** LoCoMo category if known: 1=single_hop 2=multi_hop 3=temporal 4=open_domain 5=adversarial. */
  category?: number
  /** LongMemEval ability if known: temporal_reasoning, multi_session_reasoning, ... */
  ability?: string
}

/** Classifier output: the label plus how it was reached. */
export interface RecallStructureLabel {
  type: RecallStructure
  /** Fixed per-rule score in [0, 1]; dataset signals score higher than heuristics. */
  confidence: number
  /** Human-readable description of the rule that fired. */
  reasoning: string
}

/** The structures where graph spreading activation is expected to add lift. */
export const GRAPH_RELEVANT: ReadonlySet<RecallStructure> = new Set<RecallStructure>(['multi_hop', 'temporal'])

// Low-confidence fallback signal only (used when neither category nor ability
// is available). Years, month names, and ordering/relative-time words.
//
// Month names are matched as WHOLE words (abbreviation or full name). The
// previous `(jan|feb|mar|...)[a-z]*` form matched any word that merely starts
// with a month abbreviation ("Mark", "market", "decide", "novel", "mayor"),
// labelling ordinary lookups as temporal. Bare "may" is still ambiguous with
// the modal verb under the case-insensitive flag; that residual noise is
// accepted for a 0.55-confidence heuristic.
const TEMPORAL_RE =
  /\b(?:19|20)\d{2}\b|\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\b|\b(?:yesterday|today|tomorrow|week|month|year|date|when|before|after|since|until|earlier|later|ago|first|last|recent)\b/i

/**
 * Classify a question's recall structure. Authoritative dataset signals win:
 * LoCoMo `category` first, then LongMemEval `ability`. Only when neither is
 * present (or neither is recognised) do we fall back to structural heuristics
 * (gold cardinality + a temporal-token scan).
 *
 * @param ctx question text, gold answer/ids and optional dataset signals
 * @returns the structural label with a fixed per-rule confidence and the
 *          reasoning string naming the rule that fired
 */
export function classifyRecallStructure(ctx: QuestionContext): RecallStructureLabel {
  // 1. LoCoMo category — authoritative. A category outside 1..5 is not an
  //    error: it simply falls through to the next signal.
  if (ctx.category != null) {
    switch (ctx.category) {
      case 2: return { type: 'multi_hop', confidence: 0.9, reasoning: 'LoCoMo category 2 (multi_hop)' }
      case 3: return { type: 'temporal', confidence: 0.9, reasoning: 'LoCoMo category 3 (temporal)' }
      case 1: return { type: 'lookup', confidence: 0.9, reasoning: 'LoCoMo category 1 (single_hop)' }
      case 4: return { type: 'lookup', confidence: 0.7, reasoning: 'LoCoMo category 4 (open_domain) -> lookup' }
      case 5: return { type: 'lookup', confidence: 0.6, reasoning: 'LoCoMo category 5 (adversarial) -> lookup' }
    }
  }

  // 2. LongMemEval ability — authoritative. Separators are normalised so the
  //    hyphenated and underscored spellings of every ability are treated alike
  //    (previously only multi-session accepted both forms).
  if (ctx.ability) {
    const a = ctx.ability.toLowerCase().replace(/[\s-]+/g, '_')
    if (a.includes('temporal')) return { type: 'temporal', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
    if (a.includes('multi_session')) return { type: 'multi_hop', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
    if (a.includes('knowledge_update')) return { type: 'multi_hop', confidence: 0.7, reasoning: `ability=${ctx.ability} (updates link sessions)` }
    if (a.includes('information_extraction')) return { type: 'lookup', confidence: 0.8, reasoning: `ability=${ctx.ability}` }
    if (a.includes('abstention')) return { type: 'lookup', confidence: 0.7, reasoning: `ability=${ctx.ability}` }
  }

  // 3. Heuristic fallback. Order matters: >=3 gold ids beats the temporal scan,
  //    which in turn beats the 2-gold-id multi_hop rule.
  const text = `${ctx.question} ${ctx.goldAnswer}`
  if (ctx.goldIds.length >= 3) {
    return { type: 'aggregation', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> synthesis` }
  }
  if (TEMPORAL_RE.test(text)) {
    return { type: 'temporal', confidence: 0.55, reasoning: 'temporal token in question/answer' }
  }
  if (ctx.goldIds.length >= 2) {
    return { type: 'multi_hop', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> cross-session` }
  }
  return { type: 'lookup', confidence: 0.5, reasoning: 'single gold id, no temporal signal' }
}
// Phase 0 — graphEffect: the scale-independent recall@K lift the graph buys,
// measured on the split where the graph could actually help. Feeds graphVerdict.
import { GRAPH_RELEVANT, type RecallStructure } from '../classification/classify-recall-structure.js'

/** One question's paired result across the merge-off and merge-on cells. */
export interface QuestionOutcome {
  id: string
  /** recall@K hit with the graph channel left out (memories only). */
  recallAtKMergeOff: boolean
  /** recall@K hit with the graph channel merged in (memories + associations). */
  recallAtKMergeOn: boolean
  /** Structural label, as produced by classifyRecallStructure. */
  structure: RecallStructure
  /**
   * Optional, stronger split signal: whether the graph channel surfaced the
   * gold id at all (in either cell). As soon as ANY outcome defines this field
   * the split is restricted to outcomes where it is `true` (graph-VISIBLE);
   * when no outcome defines it the split is the graph-RELEVANT structural
   * label (multi_hop/temporal).
   */
  graphCouldContribute?: boolean
}

/** Aggregate lift over the selected split. */
export interface GraphEffectResult {
  /** mergeOnRecall − mergeOffRecall on the split. */
  graphEffect: number
  /** Number of outcomes in the split — the n the power gate (>=100) checks. */
  graphVisibleN: number
  mergeOnRecall: number
  mergeOffRecall: number
  splitDefinition: 'graph-relevant' | 'graph-visible'
}

/**
 * Compute graphEffect over the appropriate split: graph-VISIBLE when any
 * outcome carries `graphCouldContribute`, otherwise the graph-RELEVANT
 * structural split. An empty split yields zero recall in both cells, zero
 * effect and n=0 rather than NaN.
 *
 * @param outcomes paired per-question hits for the merge-off / merge-on cells
 * @returns recall in each cell, their difference, the split size and which
 *          split definition was applied
 */
export function computeGraphEffect(outcomes: QuestionOutcome[]): GraphEffectResult {
  const hasVisibilitySignal = outcomes.some((outcome) => outcome.graphCouldContribute !== undefined)
  const belongsToSplit = hasVisibilitySignal
    ? (outcome: QuestionOutcome): boolean => outcome.graphCouldContribute === true
    : (outcome: QuestionOutcome): boolean => GRAPH_RELEVANT.has(outcome.structure)

  // Single pass: count split members and the hits in each cell.
  let splitSize = 0
  let hitsMergeOn = 0
  let hitsMergeOff = 0
  for (const outcome of outcomes) {
    if (!belongsToSplit(outcome)) continue
    splitSize += 1
    if (outcome.recallAtKMergeOn) hitsMergeOn += 1
    if (outcome.recallAtKMergeOff) hitsMergeOff += 1
  }

  // Guard the division so an empty split reports 0, not NaN.
  const mergeOnRecall = splitSize === 0 ? 0 : hitsMergeOn / splitSize
  const mergeOffRecall = splitSize === 0 ? 0 : hitsMergeOff / splitSize

  return {
    graphEffect: mergeOnRecall - mergeOffRecall,
    graphVisibleN: splitSize,
    mergeOnRecall,
    mergeOffRecall,
    splitDefinition: hasVisibilitySignal ? 'graph-visible' : 'graph-relevant',
  }
}
+ */ +import { describe, it, expect } from 'vitest' +import { + classifyRecallStructure, + GRAPH_RELEVANT, + type QuestionContext, +} from '../src/classification/classify-recall-structure.js' + +const base: QuestionContext = { question: 'q', goldAnswer: 'a', goldIds: ['x'] } + +describe('classifyRecallStructure', () => { + it('maps LoCoMo categories authoritatively', () => { + expect(classifyRecallStructure({ ...base, category: 2 }).type).toBe('multi_hop') + expect(classifyRecallStructure({ ...base, category: 3 }).type).toBe('temporal') + expect(classifyRecallStructure({ ...base, category: 1 }).type).toBe('lookup') + expect(classifyRecallStructure({ ...base, category: 4 }).type).toBe('lookup') + expect(classifyRecallStructure({ ...base, category: 5 }).type).toBe('lookup') + }) + + it('maps LongMemEval abilities authoritatively', () => { + expect(classifyRecallStructure({ ...base, ability: 'temporal_reasoning' }).type).toBe('temporal') + expect(classifyRecallStructure({ ...base, ability: 'multi_session_reasoning' }).type).toBe('multi_hop') + expect(classifyRecallStructure({ ...base, ability: 'knowledge_updates' }).type).toBe('multi_hop') + expect(classifyRecallStructure({ ...base, ability: 'information_extraction' }).type).toBe('lookup') + expect(classifyRecallStructure({ ...base, ability: 'abstention' }).type).toBe('lookup') + }) + + it('category wins over ability and heuristics', () => { + const label = classifyRecallStructure({ ...base, category: 2, ability: 'temporal_reasoning', goldIds: ['1', '2', '3', '4'] }) + expect(label.type).toBe('multi_hop') + }) + + it('falls back to cardinality + temporal heuristics when no signal', () => { + expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: ['1', '2', '3', '4'] }).type).toBe('aggregation') + expect(classifyRecallStructure({ question: 'when did X move', goldAnswer: 'in 2021', goldIds: ['1'] }).type).toBe('temporal') + expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: 
['1', '2'] }).type).toBe('multi_hop') + expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: ['1'] }).type).toBe('lookup') + }) + + it('GRAPH_RELEVANT is exactly {multi_hop, temporal}', () => { + expect([...GRAPH_RELEVANT].sort()).toEqual(['multi_hop', 'temporal']) + }) +}) diff --git a/packages/bench/test/graph-effect.test.ts b/packages/bench/test/graph-effect.test.ts new file mode 100644 index 0000000..b0baa2b --- /dev/null +++ b/packages/bench/test/graph-effect.test.ts @@ -0,0 +1,42 @@ +/** + * Phase 0 — graphEffect metric. Deterministic; pins the split selection and the + * recall@K lift computation that feeds the symmetric kill criterion. + */ +import { describe, it, expect } from 'vitest' +import { computeGraphEffect, type QuestionOutcome } from '../src/metrics/graph-effect.js' + +describe('computeGraphEffect', () => { + it('measures merge-on minus merge-off recall on the graph-relevant split', () => { + const outcomes: QuestionOutcome[] = [ + { id: '1', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'multi_hop' }, // rescued + { id: '2', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'temporal' }, // unchanged + { id: '3', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'lookup' }, // excluded + ] + const r = computeGraphEffect(outcomes) + expect(r.splitDefinition).toBe('graph-relevant') + expect(r.graphVisibleN).toBe(2) // multi_hop + temporal only + expect(r.mergeOffRecall).toBe(0.5) + expect(r.mergeOnRecall).toBe(1.0) + expect(r.graphEffect).toBe(0.5) + }) + + it('uses the graph-visible split when graphCouldContribute is present', () => { + const outcomes: QuestionOutcome[] = [ + { id: '1', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'lookup', graphCouldContribute: true }, + { id: '2', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'multi_hop', graphCouldContribute: false }, + ] + const r = computeGraphEffect(outcomes) + 
expect(r.splitDefinition).toBe('graph-visible') + expect(r.graphVisibleN).toBe(1) + expect(r.graphEffect).toBe(1.0) + }) + + it('returns zero effect and n=0 on an empty split (never fabricates a decision)', () => { + const r = computeGraphEffect([ + { id: '1', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'lookup' }, + ]) + expect(r.graphVisibleN).toBe(0) + expect(r.graphEffect).toBe(0) + expect(r.mergeOnRecall).toBe(0) + }) +}) From 037ef728b7c7d43898feb9273fd301c4cb32c12c Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 05:11:48 +0500 Subject: [PATCH 8/9] feat(bench): 4-cell {graph}x{rerank} ablation matrix runner (Phase 0, unit 5) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes the Phase 0 measurement harness. compareMatrix runs the 4 cells, each graph-on cell with mergeAssociationsIntoTopK so the graph channel is visible to recall@K, and computes graphEffect as the recall@K lift on the graph-relevant split by pairing each graph-on cell's per-question outcomes against its same-rerank graph-off sibling (one recall per question — no double-scoring). - requireGraph hard-fails before any graph cell runs when no bench Neo4j is wired, so a SQL-only fallback can never be reported as a graph result. - extract{LongMemEval,LoCoMo}Outcomes live in a dependency-light matrix-outcomes module (no adapter/onnx import) so the pairing + classification stays unit-testable without native binaries; chained through computeGraphEffect. - BaselineProvenance: git HEAD + corpus sha256 + flags + Neo4j-gate-state, written to results/gates/graph-eval-baseline.json (gitignore switched to results/* + !results/gates/ so the baseline can be committed). - CLI: --matrix, --require-graph, --categories 2,3. Pure orchestration unit-tested; the adapter-running wrapper + the live baseline are validated against the bench runtime on the server. bench typecheck clean, 23/23. 
--- .gitignore | 5 +- packages/bench/bin/engram-bench.ts | 64 ++++++++++++++ packages/bench/src/index.ts | 3 + packages/bench/src/runner/compare-matrix.ts | 93 ++++++++++++++++++++ packages/bench/src/runner/matrix-outcomes.ts | 61 +++++++++++++ packages/bench/src/types.ts | 29 ++++++ packages/bench/test/matrix-outcomes.test.ts | 69 +++++++++++++++ 7 files changed, 323 insertions(+), 1 deletion(-) create mode 100644 packages/bench/src/runner/compare-matrix.ts create mode 100644 packages/bench/src/runner/matrix-outcomes.ts create mode 100644 packages/bench/test/matrix-outcomes.test.ts diff --git a/.gitignore b/.gitignore index 2a7ccdc..8974fc4 100644 --- a/.gitignore +++ b/.gitignore @@ -19,7 +19,10 @@ supabase/.temp/ .understand-anything/ # Bench / forensics output (locally generated, not committed) -results/ +results/* +# …except committed Phase 0 gate baselines (results/gates/graph-eval-baseline.json). +# Uses results/* (not results/) so this negation can re-include the subdir. +!results/gates/ data/ # Build artifacts (prevent leaking compiled files into src/) diff --git a/packages/bench/bin/engram-bench.ts b/packages/bench/bin/engram-bench.ts index 0aff14c..07cbc6c 100644 --- a/packages/bench/bin/engram-bench.ts +++ b/packages/bench/bin/engram-bench.ts @@ -10,9 +10,12 @@ import * as fs from 'node:fs/promises' import * as path from 'node:path' +import { createHash } from 'node:crypto' +import { execFileSync } from 'node:child_process' import { LoCoMoAdapter } from '../src/locomo/adapter.js' import { LongMemEvalAdapter } from '../src/longmemeval/adapter.js' import { compareLoCoMo, compareLongMemEval } from '../src/runner/compare.js' +import { compareMatrix } from '../src/runner/compare-matrix.js' import { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from '../src/metrics/table.js' import type { BenchmarkOpts } from '../src/types.js' @@ -27,6 +30,8 @@ function parseArgs(argv: string[]) { if (arg === '--consolidate') { args['consolidate'] = true; 
/**
 * sha256 fingerprint of the benchmark corpus, recorded as baseline provenance.
 *
 * - Regular file: digest of the file's bytes.
 * - Directory: digest of the sorted `name:size\n` listing of its immediate
 *   entries. Entry contents are NOT read, so a same-size content change inside
 *   a directory corpus does not alter the fingerprint.
 *
 * Best-effort by design: any filesystem error (missing path, permission
 * denied, entry removed mid-scan) yields the literal string 'unknown' instead
 * of throwing.
 *
 * @param p path to the corpus file or directory
 * @returns lowercase hex sha256 digest, or 'unknown' on any fs error
 */
async function hashCorpus(p: string): Promise<string> {
  try {
    const st = await fs.stat(p)
    const hash = createHash('sha256')
    if (st.isDirectory()) {
      const entries = (await fs.readdir(p)).sort()
      // Stat every entry concurrently instead of one await per loop turn;
      // Promise.all preserves input order, so the digest is unchanged.
      const sizes = await Promise.all(
        entries.map(async (entry) => (await fs.stat(path.join(p, entry))).size),
      )
      entries.forEach((entry, i) => hash.update(`${entry}:${sizes[i]}\n`))
    } else {
      hash.update(await fs.readFile(p))
    }
    return hash.digest('hex')
  } catch {
    return 'unknown'
  }
}
'ON' : 'OFF'}`) console.log('') + if (args.matrix) { + console.log('Running 4-cell {graph}x{rerank} ablation matrix...') + if (args.requireGraph) console.log('requireGraph: ON (a graph cell without a bench Neo4j will hard-fail)') + let commit = 'unknown' + try { commit = execFileSync('git', ['rev-parse', 'HEAD']).toString().trim() } catch { /* not a git checkout */ } + const corpusSha256 = await hashCorpus(args.dataPath) + + const result = await compareMatrix( + args.benchmark as 'locomo' | 'longmemeval', + args.dataPath, + { + consolidate: args.consolidate, + topK: args.topK, + limit: args.limit > 0 ? args.limit : undefined, + ...(args.categories ? { categories: args.categories } : {}), + }, + { requireGraph: args.requireGraph, commit, corpusSha256 }, + ) + + for (const cell of result.cells) { + console.log( + ` graph=${cell.graph ? 'ON ' : 'OFF'} rerank=${cell.rerank ? 'ON ' : 'OFF'}` + + ` graphEffect=${cell.graphEffect.toFixed(4)} (n=${cell.graphVisibleN})`, + ) + } + + const gatesDir = path.resolve('./results/gates') + await fs.mkdir(gatesDir, { recursive: true }) + const outFile = path.join(gatesDir, 'graph-eval-baseline.json') + await fs.writeFile(outFile, JSON.stringify(result, null, 2), 'utf8') + console.log(`\nMatrix baseline written to: ${outFile}`) + console.log(`Provenance: commit=${commit.slice(0, 8)} corpus=${corpusSha256.slice(0, 12)} gate=${result.provenance.neo4jGateState}`) + return + } + if (args.compare) { console.log('Running comparison mode...') let comparisonResult diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts index 9c12a76..dab2146 100644 --- a/packages/bench/src/index.ts +++ b/packages/bench/src/index.ts @@ -1,6 +1,9 @@ export { LoCoMoAdapter } from './locomo/adapter.js' export { LongMemEvalAdapter } from './longmemeval/adapter.js' export { compareLoCoMo, compareLongMemEval } from './runner/compare.js' +export { compareMatrix } from './runner/compare-matrix.js' +export { extractLoCoMoOutcomes, 
extractLongMemEvalOutcomes } from './runner/matrix-outcomes.js' +export type { ComparisonMatrixResult, MatrixCell, BaselineProvenance } from './types.js' export { computeRetrievalF1, recallAtK } from './metrics/f1.js' export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js' export { createBenchMemory, requireGraph } from './memory-factory.js' diff --git a/packages/bench/src/runner/compare-matrix.ts b/packages/bench/src/runner/compare-matrix.ts new file mode 100644 index 0000000..937c645 --- /dev/null +++ b/packages/bench/src/runner/compare-matrix.ts @@ -0,0 +1,93 @@ +// Phase 0 — 4-cell {graph}×{rerank} ablation matrix. The graph-on cell of each +// rerank row runs with mergeAssociationsIntoTopK; graphEffect is the recall@K +// lift on the graph-relevant split, computed by pairing that cell's per-question +// outcomes against its same-rerank graph-off sibling. requireGraph hard-fails a +// graph cell that has no real bench Neo4j, so a SQL-only fallback can never be +// reported as a graph result. +import { LoCoMoAdapter } from '../locomo/adapter.js' +import { LongMemEvalAdapter } from '../longmemeval/adapter.js' +import { createBenchMemory } from '../memory-factory.js' +import { requireGraph } from '../bench-memory-handle.js' +import { computeGraphEffect } from '../metrics/graph-effect.js' +import { extractLoCoMoOutcomes, extractLongMemEvalOutcomes } from './matrix-outcomes.js' +import type { + BenchmarkOpts, + ComparisonMatrixResult, + MatrixCell, + BaselineProvenance, + LoCoMoResult, + LongMemEvalResult, +} from '../types.js' + +type MatrixOpts = Omit + +export interface MatrixHooks { + /** Hard-fail (throw) if a graph cell would run without a real bench Neo4j. */ + requireGraph?: boolean + /** Provenance: git rev-parse HEAD (computed by the caller — keeps this pure of child_process). */ + commit?: string + /** Provenance: sha256 of the corpus file(s). */ + corpusSha256?: string + /** Provenance timestamp (ISO). 
*/ + timestamp?: string +} + +/** + * Run the 4-cell matrix. The graph-on cells set mergeAssociationsIntoTopK so the + * graph channel is visible to recall@K; rerank is toggled via rerankerBackend + * ('none' off, the requested/openai backend on). Returns each cell plus the + * graphEffect on the graph-relevant split and full provenance. + */ +export async function compareMatrix( + benchmark: 'locomo' | 'longmemeval', + dataPath: string, + opts: MatrixOpts = {}, + hooks: MatrixHooks = {}, +): Promise { + // Hard-fail BEFORE any work if a graph run is requested without a bench Neo4j. + if (hooks.requireGraph) { + const probe = await createBenchMemory({ ...opts, graph: true }) + try { + requireGraph(probe) + } finally { + await probe.memory.dispose().catch(() => { /* probe cleanup non-fatal */ }) + } + } + + const runCell = (graph: boolean, rerank: boolean): Promise => { + const cellOpts: BenchmarkOpts = { + ...opts, + graph, + mergeAssociationsIntoTopK: graph, + rerankerBackend: rerank ? (opts.rerankerBackend ?? 'openai') : 'none', + } + return benchmark === 'locomo' + ? new LoCoMoAdapter().run(dataPath, cellOpts) + : new LongMemEvalAdapter().run(dataPath, cellOpts) + } + + const cells: MatrixCell[] = [] + for (const rerank of [true, false]) { + const off = await runCell(false, rerank) + const on = await runCell(true, rerank) + const outcomes = + benchmark === 'locomo' + ? extractLoCoMoOutcomes(on as LoCoMoResult, off as LoCoMoResult) + : extractLongMemEvalOutcomes(on as LongMemEvalResult, off as LongMemEvalResult) + const effect = computeGraphEffect(outcomes) + cells.push({ graph: false, rerank, result: off, graphEffect: 0, graphVisibleN: 0 }) + cells.push({ graph: true, rerank, result: on, graphEffect: effect.graphEffect, graphVisibleN: effect.graphVisibleN }) + } + + const provenance: BaselineProvenance = { + flags: { ...opts, requireGraph: hooks.requireGraph ?? false }, + corpusPath: dataPath, + corpusSha256: hooks.corpusSha256 ?? 
'unknown', + commit: hooks.commit ?? 'unknown', + neo4jGateState: 'forgotten-gate-on', + mergeAssociationsIntoTopK: true, + timestamp: hooks.timestamp ?? new Date().toISOString(), + } + + return { benchmark, cells, provenance } +} diff --git a/packages/bench/src/runner/matrix-outcomes.ts b/packages/bench/src/runner/matrix-outcomes.ts new file mode 100644 index 0000000..ee2e111 --- /dev/null +++ b/packages/bench/src/runner/matrix-outcomes.ts @@ -0,0 +1,61 @@ +// Dependency-light outcome extraction for the ablation matrix. Kept separate +// from compare-matrix.ts (which imports the onnx-heavy adapters) so the pairing +// + classification logic stays unit-testable without native binaries. Pure: +// imports only the classifier and types. +import { classifyRecallStructure } from '../classification/classify-recall-structure.js' +import type { QuestionOutcome } from '../metrics/graph-effect.js' +import type { LoCoMoResult, LongMemEvalResult } from '../types.js' + +/** Pair LongMemEval predictions (graph-on vs graph-off) into classified outcomes. */ +export function extractLongMemEvalOutcomes( + on: LongMemEvalResult, + off: LongMemEvalResult, +): QuestionOutcome[] { + const offById = new Map(off.predictions.map((p) => [p.questionId, p])) + const outcomes: QuestionOutcome[] = [] + for (const onP of on.predictions) { + const offP = offById.get(onP.questionId) + if (!offP) continue + const structure = classifyRecallStructure({ + question: onP.question, + goldAnswer: onP.goldAnswer, + goldIds: onP.goldSessionIds, + ability: onP.ability, + }).type + outcomes.push({ + id: onP.questionId, + recallAtKMergeOff: offP.recallAt5, + recallAtKMergeOn: onP.recallAt5, + structure, + }) + } + return outcomes +} + +/** Pair LoCoMo qa predictions (graph-on vs graph-off) into classified outcomes. 
/**
 * Pair LoCoMo QA predictions from the graph-on run with their graph-off
 * counterparts (matched on `qaId` across all conversations) and label each
 * pair by LoCoMo category.
 *
 * A graph-on prediction with no graph-off counterpart is dropped. The
 * classifier is given an empty gold-id list alongside the category.
 *
 * @param on  result of the graph-on cell
 * @param off result of the same-rerank graph-off cell
 * @returns one outcome per paired question, in graph-on iteration order
 */
export function extractLoCoMoOutcomes(on: LoCoMoResult, off: LoCoMoResult): QuestionOutcome[] {
  type QaPrediction = LoCoMoResult['conversations'][number]['qaPredictions'][number]

  // Index the graph-off cell by qaId; a later duplicate id overwrites an earlier one.
  const baselineByQaId = new Map<QaPrediction['qaId'], QaPrediction>(
    off.conversations.flatMap((conversation) =>
      conversation.qaPredictions.map((qa): [QaPrediction['qaId'], QaPrediction] => [qa.qaId, qa]),
    ),
  )

  return on.conversations
    .flatMap((conversation) => conversation.qaPredictions)
    .flatMap((graphOnQa): QuestionOutcome[] => {
      const graphOffQa = baselineByQaId.get(graphOnQa.qaId)
      if (!graphOffQa) return []
      const { type: structure } = classifyRecallStructure({
        question: graphOnQa.question,
        goldAnswer: graphOnQa.goldAnswer,
        goldIds: [],
        category: graphOnQa.category,
      })
      return [
        {
          id: graphOnQa.qaId,
          recallAtKMergeOff: graphOffQa.recallAtK,
          recallAtKMergeOn: graphOnQa.recallAtK,
          structure,
        },
      ]
    })
}
*/ + neo4jGateState: string + mergeAssociationsIntoTopK: boolean + timestamp: string +} + +export interface ComparisonMatrixResult { + benchmark: 'locomo' | 'longmemeval' + cells: MatrixCell[] + provenance: BaselineProvenance +} diff --git a/packages/bench/test/matrix-outcomes.test.ts b/packages/bench/test/matrix-outcomes.test.ts new file mode 100644 index 0000000..34d3a33 --- /dev/null +++ b/packages/bench/test/matrix-outcomes.test.ts @@ -0,0 +1,69 @@ +/** + * Phase 0 — matrix outcome extraction. Pure: pairs a graph-on result's + * per-question predictions against its graph-off sibling, classifies each, and + * (chained with computeGraphEffect) yields the graphEffect the matrix reports. + * No adapters, no Neo4j, no onnx. + */ +import { describe, it, expect } from 'vitest' +import { + extractLongMemEvalOutcomes, + extractLoCoMoOutcomes, +} from '../src/runner/matrix-outcomes.js' +import { computeGraphEffect } from '../src/metrics/graph-effect.js' +import type { LongMemEvalResult, LoCoMoResult } from '../src/types.js' + +function lmePred(id: string, recallAt5: boolean, ability: string) { + return { + questionId: id, question: 'q', goldAnswer: 'a', goldSessionIds: ['s'], + prediction: '', recalledSessionIds: [], recallAt5, recallAt10: recallAt5, ability, + } +} +const lme = (preds: ReturnType[]) => + ({ predictions: preds } as unknown as LongMemEvalResult) + +function locomoQa(id: string, recallAtK: boolean, category: number) { + return { qaId: id, question: 'q', goldAnswer: 'a', prediction: '', retrievalF1: 0, recallAtK, category } +} +const locomo = (convs: ReturnType[][]) => + ({ conversations: convs.map((qaPredictions) => ({ qaPredictions })) } as unknown as LoCoMoResult) + +describe('extractLongMemEvalOutcomes', () => { + it('pairs by question id, classifies by ability, and feeds graphEffect', () => { + const on = lme([lmePred('q1', true, 'multi_session_reasoning'), lmePred('q2', true, 'information_extraction')]) + const off = lme([lmePred('q1', false, 
'multi_session_reasoning'), lmePred('q2', true, 'information_extraction')]) + + const outcomes = extractLongMemEvalOutcomes(on, off) + expect(outcomes).toHaveLength(2) + const q1 = outcomes.find((o) => o.id === 'q1')! + expect(q1.structure).toBe('multi_hop') // multi_session_reasoning + expect(q1.recallAtKMergeOff).toBe(false) + expect(q1.recallAtKMergeOn).toBe(true) + + // graph rescued q1 (multi_hop, graph-relevant); q2 (lookup) is excluded. + const effect = computeGraphEffect(outcomes) + expect(effect.graphVisibleN).toBe(1) + expect(effect.graphEffect).toBe(1.0) + }) + + it('drops questions missing from the graph-off cell', () => { + expect(extractLongMemEvalOutcomes(lme([lmePred('q1', true, 'temporal_reasoning')]), lme([]))).toHaveLength(0) + }) +}) + +describe('extractLoCoMoOutcomes', () => { + it('pairs by qaId across conversations and classifies by category', () => { + const on = locomo([[locomoQa('c:q1', true, 2), locomoQa('c:q2', true, 1)]]) + const off = locomo([[locomoQa('c:q1', false, 2), locomoQa('c:q2', true, 1)]]) + + const outcomes = extractLoCoMoOutcomes(on, off) + expect(outcomes).toHaveLength(2) + const q1 = outcomes.find((o) => o.id === 'c:q1')! + expect(q1.structure).toBe('multi_hop') // category 2 + expect(q1.recallAtKMergeOff).toBe(false) + expect(q1.recallAtKMergeOn).toBe(true) + + const effect = computeGraphEffect(outcomes) + expect(effect.graphVisibleN).toBe(1) // only the cat-2 question is graph-relevant + expect(effect.graphEffect).toBe(1.0) + }) +}) From 5dff19579e8c6d9faa035e62f7959ca7bf032553 Mon Sep 17 00:00:00 2001 From: muhammadkh4n Date: Sun, 7 Jun 2026 14:11:31 +0500 Subject: [PATCH 9/9] fix(bench): flush pending graph writes before eval (Phase 0 correctness) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Graph decomposition in memory.ingest() is fire-and-forget (pushed to _pendingWrites). 
Neither bench adapter awaited it, so recall ran against a half-built graph — the graph cells produced empty/sparse associations and graphEffect was spuriously ~0. This is exactly the measurement bug that makes "the graph doesn't help" look true when the graph was never given a chance. Call memory.flushPendingWrites() at the ingest→eval boundary in both adapters (LoCoMo runConversation after consolidation; LongMemEval runQuestion after ingest) so the graph is fully built before recall. bench typecheck clean, 23/23. --- packages/bench/src/locomo/adapter.ts | 4 ++++ packages/bench/src/longmemeval/adapter.ts | 3 +++ 2 files changed, 7 insertions(+) diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts index 38a8728..669a768 100644 --- a/packages/bench/src/locomo/adapter.ts +++ b/packages/bench/src/locomo/adapter.ts @@ -273,6 +273,10 @@ export class LoCoMoAdapter { await memory.consolidate('light') await memory.consolidate('deep') } + // Drain fire-and-forget graph decomposition (+ consolidation) writes before + // eval. Without this, recall runs against a half-built graph and the graph + // cells produce empty associations — spuriously zeroing graphEffect. + await memory.flushPendingWrites() const ingestMs = Date.now() - ingestStart const evalStart = Date.now() diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts index d8fb8dd..bcc5cb9 100644 --- a/packages/bench/src/longmemeval/adapter.ts +++ b/packages/bench/src/longmemeval/adapter.ts @@ -147,6 +147,9 @@ export class LongMemEvalAdapter { if (config.graph) await wipeBenchGraph(config.graph) const ingestStart = Date.now() const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory) + // Drain fire-and-forget graph decomposition writes before recall, or the + // graph cell recalls against a half-built graph (spurious graphEffect=0). 
+ await memory.flushPendingWrites() const ingestMs = Date.now() - ingestStart const evalStart = Date.now()