From f4d125d45e960823527298a2d751fe55aced4465 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sat, 6 Jun 2026 05:13:44 +0500
Subject: [PATCH 1/9] fix(core,sqlite): forget() tombstones instead of
 inverting (Phase 1, offline path)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The shipped forget() was inverted: on episodes it called recordAccess()
(access_count++), a term recall ranking REWARDS via accessBoost — so
forgetting a memory RAISED its recall rank; on semantic it floored a
confidence value no recall path reads; procedural was a no-op. Net: forget
was at best a no-op and at worst a boost, which is why ~7,200 "forgotten"
memories resurfaced across 19 manual gardening sessions.

This lands the offline-testable core of the fix (core contract + SQLite +
PostgREST adapter; the PostgREST schema/RPC + Neo4j gate follow):

- storage.ts: add markForgotten(ids): Promise<number> to Episode/Semantic/
  Procedural storage — a tombstone that sets forgotten_at and touches
  NEITHER access_count NOR confidence.
- memory.ts forget(): rewrite the confirm path to call markForgotten per
  tier; drop the recordAccess/recordAccessAndBoost calls and the confidence
  floor (the forgotten_at tombstone is the single source of truth). Remove
  the now-unused CONFIDENCE_FLOOR.
- sqlite: migration v5 adds forgotten_at (+ partial index) to episodes/
  semantic/procedural; markForgotten impls; an `AND forgotten_at IS NULL`
  gate cloned onto every recall path (vectorSearch + textBoost + the
  per-store hybrid/BM25/vector fallbacks), mirroring the proven
  superseded_by gate. The semantic textBoost FTS query also gains the
  superseded_by IS NULL gate it was previously missing.
- postgrest adapters: markForgotten via table PATCH (schema lands next).
- tests: forget-e2e (recall gate excludes a tombstoned memory while the
  sibling survives; forget removes matched content; access_count NOT bumped;
  confirm=false no-op; idempotent at both levels) — would fail on old code.
  migration test updated to v5 + a forgotten_at column assertion.

Tests: sqlite 106/106, core 495/495 passing; typecheck clean.
---
 packages/core/src/adapters/storage.ts        |  19 +++
 packages/core/src/memory.ts                  |  39 +++----
 packages/core/test/retrieval/mock-storage.ts |   3 +
 packages/postgrest/src/episodes.ts           |  13 +++
 packages/postgrest/src/procedural.ts         |  13 +++
 packages/postgrest/src/semantic.ts           |  13 +++
 packages/sqlite/src/adapter.ts               |  20 ++--
 packages/sqlite/src/episodes.ts              |  15 ++-
 packages/sqlite/src/migrations.ts            |  16 +++
 packages/sqlite/src/procedural.ts            |  17 ++-
 packages/sqlite/src/semantic.ts              |  14 ++-
 packages/sqlite/test/forget-e2e.test.ts      | 116 +++++++++++++++++++
 packages/sqlite/test/migrations.test.ts      |  14 ++-
 13 files changed, 268 insertions(+), 44 deletions(-)
 create mode 100644 packages/sqlite/test/forget-e2e.test.ts
diff --git a/packages/core/src/adapters/storage.ts b/packages/core/src/adapters/storage.ts
index 1675e7c..0a97fff 100644
--- a/packages/core/src/adapters/storage.ts
+++ b/packages/core/src/adapters/storage.ts
@@ -25,6 +25,13 @@ export interface EpisodeStorage {
   getUnconsolidatedSessions(): Promise<string[]>
   markConsolidated(ids: string[]): Promise<void>
   recordAccess(id: string): Promise<void>
+  /**
+   * Tombstone the given memories (sets forgotten_at). Forgotten memories are
+   * excluded from every recall path but retained for audit/undo. Distinct
+   * from recordAccess — does NOT touch access_count. Returns the number of
+   * rows newly tombstoned. Idempotent.
+   */
+  markForgotten(ids: string[]): Promise<number>
   /** Find earliest created_at across episodes referenced by the given digest IDs */
   findEarliestInDigests?(digestIds: string[]): Promise<{ createdAt: Date } | null>
   /** Fast COUNT(*) for stats(). Falls back to N-scan when not implemented. */
@@ -49,6 +56,12 @@ export interface SemanticStorage {
   getUnaccessed(days: number): Promise<SemanticMemory[]>
   recordAccessAndBoost(id: string, confidenceBoost: number): Promise<void>
   markSuperseded(id: string, supersededBy: string): Promise<void>
+  /**
+   * Tombstone the given memories (sets forgotten_at). Forgotten memories are
+   * excluded from every recall path but retained for audit/undo. Does NOT
+   * touch confidence or access_count. Returns rows newly tombstoned. Idempotent.
+   */
+  markForgotten(ids: string[]): Promise<number>
   batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise<number>
   /** Per-ID gradient decay (PageRank-modulated). Falls back to batchDecay when not implemented. */
   batchDecayGradient?(updates: Array<{ id: string; effectiveDecayRate: number; daysThreshold: number }>): Promise<number>
@@ -74,6 +87,12 @@ export interface ProceduralStorage {
   search(query: string, opts?: SearchOptions): Promise<SearchResult<ProceduralMemory>[]>
   searchByTrigger(activity: string, opts?: SearchOptions): Promise<SearchResult<ProceduralMemory>[]>
   recordAccess(id: string): Promise<void>
+  /**
+   * Tombstone the given memories (sets forgotten_at). Excluded from recall,
+   * retained for audit/undo. Does NOT touch access_count. Returns rows newly
+   * tombstoned. Idempotent.
+   */
+  markForgotten(ids: string[]): Promise<number>
   incrementObservation(id: string): Promise<void>
   batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise<number>
   /** Per-ID gradient decay (PageRank-modulated). Falls back to batchDecay when not implemented. */
diff --git a/packages/core/src/memory.ts b/packages/core/src/memory.ts
index 549c6b4..15ab317 100644
--- a/packages/core/src/memory.ts
+++ b/packages/core/src/memory.ts
@@ -93,7 +93,6 @@ export interface SessionHandle {
 // ---------------------------------------------------------------------------
 
 const DEFAULT_SESSION_ID = 'default'
-const CONFIDENCE_FLOOR = 0.05
 // Minimum relevance required for a memory to count as "affected" by forget().
 // computeScore() sums cosine similarity + bm25 boost + recency + access + role
 // bumps; a typical strong match lands around 0.6–1.1, weak semantic adjacency
@@ -836,9 +835,9 @@ export class Memory {
   // ---------------------------------------------------------------------------
 
   /**
-   * Deprioritize memories (lossless — sets confidence to 0.05, marks
-   * metadata.forgotten). Returns a preview by default; pass confirm=true
-   * to actually apply.
+   * Forget memories — tombstones them (forgotten_at) so they are excluded
+   * from every recall path while retained for audit/undo. Lossless and
+   * idempotent. Returns a preview by default; pass confirm=true to apply.
    */
   async forget(
     query: string,
@@ -890,27 +889,23 @@ export class Memory {
       return { count: filtered.length, previewed: filtered }
     }
 
-    // Apply forgetting: lossless deprioritization
+    // Apply forgetting: tombstone the matched memories (forgotten_at). They are
+    // excluded from every recall path but retained for audit/undo. Idempotent.
+    // Deliberately does NOT touch access_count or confidence: the old behavior
+    // called recordAccess/recordAccessAndBoost, which incremented access_count —
+    // a term recall ranking REWARDS — so forgetting an episode raised its recall
+    // rank, and the floored confidence was a value no recall path ever read.
+    const idsByType: Record<'episode' | 'semantic' | 'procedural', string[]> = {
+      episode: [], semantic: [], procedural: [],
+    }
     for (const memory of filtered) {
-      if (memory.type === 'semantic') {
-        await this.storage.semantic.recordAccessAndBoost(
-          memory.id,
-          CONFIDENCE_FLOOR - 1 // set to floor by applying a large negative boost
-        )
-        // Mark metadata.forgotten by re-inserting with updated metadata is
-        // not directly supported; we apply the confidence floor via available API.
-        // The storage interface supports recordAccessAndBoost but not direct update.
-        // We use a large negative boost to drive confidence toward floor.
-      } else if (memory.type === 'procedural') {
-        // No direct confidence update API for procedural; mark via the
-        // observationCount mechanism — no decay is available without batchDecay.
-        // We skip procedural direct update as the interface doesn't support it.
-      } else if (memory.type === 'episode') {
-        // Episodes are lossless; we can mark via metadata but there's no update
-        // API on EpisodeStorage. We record access to at least touch the episode.
-        await this.storage.episodes.recordAccess(memory.id)
+      if (memory.type === 'episode' || memory.type === 'semantic' || memory.type === 'procedural') {
+        idsByType[memory.type].push(memory.id)
       }
     }
+    if (idsByType.semantic.length > 0) await this.storage.semantic.markForgotten(idsByType.semantic)
+    if (idsByType.procedural.length > 0) await this.storage.procedural.markForgotten(idsByType.procedural)
+    if (idsByType.episode.length > 0) await this.storage.episodes.markForgotten(idsByType.episode)
 
     return { count: filtered.length, previewed: filtered }
   }
diff --git a/packages/core/test/retrieval/mock-storage.ts b/packages/core/test/retrieval/mock-storage.ts
index 33b03f4..94d6bb1 100644
--- a/packages/core/test/retrieval/mock-storage.ts
+++ b/packages/core/test/retrieval/mock-storage.ts
@@ -198,6 +198,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter
     getUnconsolidatedSessions: vi.fn().mockResolvedValue([]),
     markConsolidated: vi.fn().mockResolvedValue(undefined),
     recordAccess: vi.fn().mockResolvedValue(undefined),
+    markForgotten: vi.fn().mockResolvedValue(0),
   }
 
   const digests: DigestStorage = {
@@ -214,6 +215,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter
     getUnaccessed: vi.fn().mockResolvedValue([]),
     recordAccessAndBoost: vi.fn().mockResolvedValue(undefined),
     markSuperseded: vi.fn().mockResolvedValue(undefined),
+    markForgotten: vi.fn().mockResolvedValue(0),
     batchDecay: vi.fn().mockResolvedValue(0),
   }
 
@@ -222,6 +224,7 @@ export function createMockStorage(opts: MockStorageOptions = {}): StorageAdapter
     search: vi.fn().mockResolvedValue(proceduralResults),
     searchByTrigger: vi.fn().mockResolvedValue([]),
     recordAccess: vi.fn().mockResolvedValue(undefined),
+    markForgotten: vi.fn().mockResolvedValue(0),
     incrementObservation: vi.fn().mockResolvedValue(undefined),
     batchDecay: vi.fn().mockResolvedValue(0),
   }
diff --git a/packages/postgrest/src/episodes.ts b/packages/postgrest/src/episodes.ts
index aa6b6ca..db0dbac 100644
--- a/packages/postgrest/src/episodes.ts
+++ b/packages/postgrest/src/episodes.ts
@@ -222,6 +222,19 @@ export class PostgRestEpisodeStorage implements EpisodeStorage {
     if (error) throw new Error(`Episode recordAccess failed: ${error.message}`)
   }
 
+
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const { data, error } = await this.client
+      .from('memory_episodes')
+      .update({ forgotten_at: new Date().toISOString() })
+      .in('id', ids)
+      .is('forgotten_at', null)
+      .select('id')
+    if (error) throw new Error(`Episode markForgotten failed: ${error.message}`)
+    return (data ?? []).length
+  }
+
   async findEarliestInDigests(digestIds: string[]): Promise<{ createdAt: Date } | null> {
     if (digestIds.length === 0) return null
     const { data: digests, error: dErr } = await this.client
diff --git a/packages/postgrest/src/procedural.ts b/packages/postgrest/src/procedural.ts
index 30ccd1b..292e5cc 100644
--- a/packages/postgrest/src/procedural.ts
+++ b/packages/postgrest/src/procedural.ts
@@ -133,6 +133,19 @@ export class PostgRestProceduralStorage implements ProceduralStorage {
     if (error) throw new Error(`Procedural recordAccess failed: ${error.message}`)
   }
 
+
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const { data, error } = await this.client
+      .from('memory_procedural')
+      .update({ forgotten_at: new Date().toISOString() })
+      .in('id', ids)
+      .is('forgotten_at', null)
+      .select('id')
+    if (error) throw new Error(`Procedural markForgotten failed: ${error.message}`)
+    return (data ?? []).length
+  }
+
   async incrementObservation(id: string): Promise<void> {
     const { data: current, error: fetchErr } = await this.client
       .from('memory_procedural')
diff --git a/packages/postgrest/src/semantic.ts b/packages/postgrest/src/semantic.ts
index 0bf6876..e4c8762 100644
--- a/packages/postgrest/src/semantic.ts
+++ b/packages/postgrest/src/semantic.ts
@@ -141,6 +141,19 @@ export class PostgRestSemanticStorage implements SemanticStorage {
     if (err2) throw new Error(`Semantic markSuperseded (new) failed: ${err2.message}`)
   }
 
+
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const { data, error } = await this.client
+      .from('memory_semantic')
+      .update({ forgotten_at: new Date().toISOString() })
+      .in('id', ids)
+      .is('forgotten_at', null)
+      .select('id')
+    if (error) throw new Error(`Semantic markForgotten failed: ${error.message}`)
+    return (data ?? []).length
+  }
+
   async batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise<number> {
     // Call engram_decay_pass and extract semantic_decayed count
     const { data, error } = await this.client.rpc('engram_decay_pass', {
diff --git a/packages/sqlite/src/adapter.ts b/packages/sqlite/src/adapter.ts
index 56f46ab..cc30f73 100644
--- a/packages/sqlite/src/adapter.ts
+++ b/packages/sqlite/src/adapter.ts
@@ -206,10 +206,10 @@ export class SqliteStorageAdapter implements StorageAdapter {
       let sql: string
       let params: unknown[]
       if (opts?.sessionId) {
-        sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND session_id = ?${projectFilter} LIMIT ?`
+        sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND forgotten_at IS NULL AND session_id = ?${projectFilter} LIMIT ?`
         params = [opts.sessionId, ...projectParams, scanLimit]
       } else {
-        sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL${projectFilter} LIMIT ?`
+        sql = `SELECT * FROM episodes WHERE embedding IS NOT NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?`
         params = [...projectParams, scanLimit]
       }
       const rows = db.prepare(sql).all(...params) as EpisodeRow[]
@@ -242,7 +242,7 @@ export class SqliteStorageAdapter implements StorageAdapter {
 
     if (tiers.includes('semantic')) {
       const rows = db.prepare(
-        `SELECT * FROM semantic WHERE embedding IS NOT NULL AND superseded_by IS NULL${projectFilter} LIMIT ?`
+        `SELECT * FROM semantic WHERE embedding IS NOT NULL AND superseded_by IS NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?`
       ).all(...projectParams, scanLimit) as SemanticRow[]
       for (const row of rows) {
         if (!row.embedding) continue
@@ -256,7 +256,7 @@ export class SqliteStorageAdapter implements StorageAdapter {
 
     if (tiers.includes('procedural')) {
       const rows = db.prepare(
-        `SELECT * FROM procedural WHERE embedding IS NOT NULL${projectFilter} LIMIT ?`
+        `SELECT * FROM procedural WHERE embedding IS NOT NULL AND forgotten_at IS NULL${projectFilter} LIMIT ?`
       ).all(...projectParams, scanLimit) as ProceduralRow[]
       for (const row of rows) {
         if (!row.embedding) continue
@@ -288,10 +288,10 @@ export class SqliteStorageAdapter implements StorageAdapter {
     const projectId = opts?.projectId
 
     try {
-      let sql = 'SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? ORDER BY rank LIMIT ?'
+      let sql = 'SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL ORDER BY rank LIMIT ?'
       const params: unknown[] = [ftsQuery, limit]
       if (projectId) {
-        sql = `SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND (e.project_id = ? OR e.project_id IS NULL) ORDER BY rank LIMIT ?`
+        sql = `SELECT e.id, rank FROM episodes_fts f JOIN episodes e ON e.rowid = f.rowid WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL AND (e.project_id = ? OR e.project_id IS NULL) ORDER BY rank LIMIT ?`
         params.splice(1, 0, projectId)
       }
       const epRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }>
@@ -310,10 +310,10 @@ export class SqliteStorageAdapter implements StorageAdapter {
     } catch { /* FTS5 table may not exist */ }
 
     try {
-      let sql = 'SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? ORDER BY rank LIMIT ?'
+      let sql = 'SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND s.superseded_by IS NULL AND s.forgotten_at IS NULL ORDER BY rank LIMIT ?'
       const params: unknown[] = [ftsQuery, limit]
       if (projectId) {
-        sql = `SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND (s.project_id = ? OR s.project_id IS NULL) ORDER BY rank LIMIT ?`
+        sql = `SELECT s.id, rank FROM semantic_fts f JOIN semantic s ON s.rowid = f.rowid WHERE semantic_fts MATCH ? AND s.superseded_by IS NULL AND s.forgotten_at IS NULL AND (s.project_id = ? OR s.project_id IS NULL) ORDER BY rank LIMIT ?`
         params.splice(1, 0, projectId)
       }
       const smRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }>
@@ -321,10 +321,10 @@ export class SqliteStorageAdapter implements StorageAdapter {
     } catch { /* FTS5 table may not exist */ }
 
     try {
-      let sql = 'SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? ORDER BY rank LIMIT ?'
+      let sql = 'SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL ORDER BY rank LIMIT ?'
       const params: unknown[] = [ftsQuery, limit]
       if (projectId) {
-        sql = `SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND (p.project_id = ? OR p.project_id IS NULL) ORDER BY rank LIMIT ?`
+        sql = `SELECT p.id, rank FROM procedural_fts f JOIN procedural p ON p.rowid = f.rowid WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL AND (p.project_id = ? OR p.project_id IS NULL) ORDER BY rank LIMIT ?`
         params.splice(1, 0, projectId)
       }
       const prRows = db.prepare(sql).all(...params) as Array<{ id: string; rank: number }>
diff --git a/packages/sqlite/src/episodes.ts b/packages/sqlite/src/episodes.ts
index 7f967dd..94f90df 100644
--- a/packages/sqlite/src/episodes.ts
+++ b/packages/sqlite/src/episodes.ts
@@ -74,7 +74,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage {
               SELECT e.*, -episodes_fts.rank AS bm25_score
               FROM episodes_fts
               JOIN episodes e ON episodes_fts.rowid = e.rowid
-              WHERE episodes_fts MATCH ?
+              WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL
             `
             const params: unknown[] = [ftsQuery]
             if (sessionId) {
@@ -86,7 +86,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage {
           },
           recentVectorSql: `
             SELECT id, embedding FROM episodes
-            WHERE embedding IS NOT NULL
+            WHERE embedding IS NOT NULL AND forgotten_at IS NULL
             ORDER BY created_at DESC
             LIMIT ?
           `,
@@ -105,7 +105,7 @@ export class SqliteEpisodeStorage implements EpisodeStorage {
       SELECT e.*, -episodes_fts.rank AS bm25_score
       FROM episodes_fts
       JOIN episodes e ON episodes_fts.rowid = e.rowid
-      WHERE episodes_fts MATCH ?
+      WHERE episodes_fts MATCH ? AND e.forgotten_at IS NULL
     `
     const params: unknown[] = [ftsQuery]
 
@@ -189,6 +189,15 @@ export class SqliteEpisodeStorage implements EpisodeStorage {
       .run(id)
   }
 
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const placeholders = ids.map(() => '?').join(',')
+    const res = this.db
+      .prepare(`UPDATE episodes SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`)
+      .run(...ids)
+    return res.changes
+  }
+
   async findEarliestInDigests(digestIds: string[]): Promise<{ createdAt: Date } | null> {
     if (digestIds.length === 0) return null
     const placeholders = digestIds.map(() => '?').join(',')
diff --git a/packages/sqlite/src/migrations.ts b/packages/sqlite/src/migrations.ts
index 1d654dd..01ee9bd 100644
--- a/packages/sqlite/src/migrations.ts
+++ b/packages/sqlite/src/migrations.ts
@@ -348,4 +348,20 @@ export function runMigrations(db: Database.Database): void {
 
     db.pragma('user_version = 4')
   }
+
+  if (currentVersion < 5) {
+    // V5: forgotten_at tombstone on the three recallable memory tables.
+    // forget() stamps forgotten_at; every recall path filters forgotten_at IS NULL.
+    // Distinct from superseded_by (supersession lineage) — this is explicit user/GC
+    // forgetting. Rows are retained for audit/undo, never destroyed here.
+    const tables = ['episodes', 'semantic', 'procedural'] as const
+    for (const table of tables) {
+      const cols = db.prepare(`PRAGMA table_info(${table})`).all() as Array<{ name: string }>
+      if (!cols.some(c => c.name === 'forgotten_at')) {
+        db.exec(`ALTER TABLE ${table} ADD COLUMN forgotten_at REAL`)
+        db.exec(`CREATE INDEX IF NOT EXISTS idx_${table}_forgotten ON ${table}(forgotten_at) WHERE forgotten_at IS NOT NULL`)
+      }
+    }
+    db.pragma('user_version = 5')
+  }
 }
diff --git a/packages/sqlite/src/procedural.ts b/packages/sqlite/src/procedural.ts
index 596a1cb..15268e3 100644
--- a/packages/sqlite/src/procedural.ts
+++ b/packages/sqlite/src/procedural.ts
@@ -69,13 +69,13 @@ export class SqliteProceduralStorage implements ProceduralStorage {
                 `SELECT p.*, -procedural_fts.rank AS bm25_score
                  FROM procedural_fts
                  JOIN procedural p ON procedural_fts.rowid = p.rowid
-                 WHERE procedural_fts MATCH ?
+                 WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL
                  ORDER BY rank LIMIT 50`
               )
               .all(ftsQuery) as Array<ProceduralRow & { bm25_score: number }>,
           recentVectorSql: `
             SELECT id, embedding FROM procedural
-            WHERE embedding IS NOT NULL
+            WHERE embedding IS NOT NULL AND forgotten_at IS NULL
             ORDER BY created_at DESC
             LIMIT ?
           `,
@@ -102,7 +102,7 @@ export class SqliteProceduralStorage implements ProceduralStorage {
         `SELECT p.*, -procedural_fts.rank AS bm25_score
          FROM procedural_fts
          JOIN procedural p ON procedural_fts.rowid = p.rowid
-         WHERE procedural_fts MATCH ?
+         WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL
          ORDER BY rank LIMIT ?`
       )
       .all(ftsQuery, limit) as (ProceduralRow & { bm25_score: number })[]
@@ -131,7 +131,7 @@ export class SqliteProceduralStorage implements ProceduralStorage {
         `SELECT p.*, -procedural_fts.rank AS bm25_score
          FROM procedural_fts
          JOIN procedural p ON procedural_fts.rowid = p.rowid
-         WHERE procedural_fts MATCH ?
+         WHERE procedural_fts MATCH ? AND p.forgotten_at IS NULL
          ORDER BY rank LIMIT ?`
       )
       .all(columnQuery, limit) as (ProceduralRow & { bm25_score: number })[]
@@ -156,6 +156,15 @@ export class SqliteProceduralStorage implements ProceduralStorage {
       .run(id)
   }
 
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const placeholders = ids.map(() => '?').join(',')
+    const res = this.db
+      .prepare(`UPDATE procedural SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`)
+      .run(...ids)
+    return res.changes
+  }
+
   async incrementObservation(id: string): Promise<void> {
     this.db
       .prepare(
diff --git a/packages/sqlite/src/semantic.ts b/packages/sqlite/src/semantic.ts
index 52f110b..47e7b08 100644
--- a/packages/sqlite/src/semantic.ts
+++ b/packages/sqlite/src/semantic.ts
@@ -67,7 +67,7 @@ export class SqliteSemanticStorage implements SemanticStorage {
                  FROM semantic_fts
                  JOIN semantic s ON semantic_fts.rowid = s.rowid
                  WHERE semantic_fts MATCH ?
-                   AND s.superseded_by IS NULL
+                   AND s.superseded_by IS NULL AND s.forgotten_at IS NULL
                  ORDER BY rank LIMIT 50`
               )
               .all(ftsQuery) as Array<SemanticRow & { bm25_score: number }>,
@@ -75,6 +75,7 @@ export class SqliteSemanticStorage implements SemanticStorage {
             SELECT id, embedding FROM semantic
             WHERE embedding IS NOT NULL
               AND superseded_by IS NULL
+              AND forgotten_at IS NULL
             ORDER BY created_at DESC
             LIMIT ?
           `,
@@ -102,7 +103,7 @@ export class SqliteSemanticStorage implements SemanticStorage {
          FROM semantic_fts
          JOIN semantic s ON semantic_fts.rowid = s.rowid
          WHERE semantic_fts MATCH ?
-           AND s.superseded_by IS NULL
+           AND s.superseded_by IS NULL AND s.forgotten_at IS NULL
          ORDER BY rank LIMIT ?`
       )
       .all(ftsQuery, limit) as (SemanticRow & { bm25_score: number })[]
@@ -147,6 +148,15 @@ export class SqliteSemanticStorage implements SemanticStorage {
     txn()
   }
 
+  async markForgotten(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const placeholders = ids.map(() => '?').join(',')
+    const res = this.db
+      .prepare(`UPDATE semantic SET forgotten_at = julianday('now') WHERE id IN (${placeholders}) AND forgotten_at IS NULL`)
+      .run(...ids)
+    return res.changes
+  }
+
   async batchDecay(opts: { daysThreshold: number; decayRate: number }): Promise<number> {
     const result = this.db
       .prepare(
diff --git a/packages/sqlite/test/forget-e2e.test.ts b/packages/sqlite/test/forget-e2e.test.ts
new file mode 100644
index 0000000..a341cf7
--- /dev/null
+++ b/packages/sqlite/test/forget-e2e.test.ts
@@ -0,0 +1,116 @@
+/**
+ * forget() is a real tombstone, not an inverted boost.
+ *
+ * The shipped bug (pre-overhaul): forget() called recordAccess/
+ * recordAccessAndBoost, which incremented access_count — a term recall
+ * ranking REWARDS — so "forgetting" an episode RAISED its recall rank, while
+ * the floored confidence was a value no recall path ever read. Net: forget
+ * did nothing useful (or worse). The fix tombstones forgotten_at and gates
+ * every recall path on it, touching neither access_count nor confidence.
+ *
+ * The fake intelligence routes content/queries onto disjoint embedding axes by
+ * keyword so a forget query matches the intended memory and not its sibling.
+ */
+import { describe, it, expect, beforeEach, afterEach } from 'vitest'
+import { createMemory, type IntelligenceAdapter, type Memory } from '@engram-mem/core'
+import { SqliteStorageAdapter } from '../src/adapter.js'
+
+const DIM = 1536
+function embedText(text: string): number[] {
+  const v = new Array(DIM).fill(0)
+  if (/deploy|key|rotat|staging|monday/i.test(text)) v[0] = 1
+  else if (/billing|cron|midnight|nightly/i.test(text)) v[1] = 1
+  else v[2] = 1
+  return v
+}
+const intel: IntelligenceAdapter = {
+  async embed(text: string): Promise<number[]> {
+    return embedText(text)
+  },
+  dimensions(): number {
+    return DIM
+  },
+}
+
+const DEPLOY = 'the staging deploy key must be rotated every monday'
+const BILLING = 'the billing cron job runs at midnight nightly'
+
+describe('forget() tombstone', () => {
+  let storage: SqliteStorageAdapter
+  let mem: Memory
+
+  beforeEach(async () => {
+    storage = new SqliteStorageAdapter(':memory:')
+    await storage.initialize()
+    mem = createMemory({ storage, intelligence: intel })
+    await mem.initialize()
+    await mem.ingest({ role: 'user', content: DEPLOY })
+    await mem.ingest({ role: 'user', content: BILLING })
+    await mem.flushPendingWrites?.()
+  })
+  afterEach(async () => {
+    await storage.dispose()
+  })
+
+  async function recallHas(query: string, needle: string): Promise<boolean> {
+    const r = await mem.recall(query)
+    return r.memories.some((m) => m.content.includes(needle))
+  }
+  async function deployEpisodeId(): Promise<string> {
+    const rows = await storage.vectorSearch(embedText('deploy key'), { limit: 10 })
+    const ep = rows.find(
+      (r) => r.item.type === 'episode' && (r.item.data as { content: string }).content.includes('deploy key'),
+    )
+    if (!ep) throw new Error('deploy episode not found')
+    return (ep.item.data as { id: string }).id
+  }
+
+  it('recall gate: a tombstoned memory is excluded; its sibling survives', async () => {
+    // Tombstone exactly one memory and prove the recall gate honors it while
+    // leaving the non-forgotten sibling fully recallable (the core of the fix).
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true)
+    expect(await recallHas('when does the billing cron run?', 'billing cron')).toBe(true)
+
+    const id = await deployEpisodeId()
+    expect(await storage.episodes.markForgotten([id])).toBe(1)
+
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(false)
+    expect(await recallHas('when does the billing cron run?', 'billing cron')).toBe(true)
+  })
+
+  it('forget(confirm=true) removes matched content from recall', async () => {
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true)
+    const res = await mem.forget('deploy key rotation', { confirm: true })
+    expect(res.count).toBeGreaterThanOrEqual(1)
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(false)
+  })
+
+  it('markForgotten does NOT touch access_count (the inversion regression)', async () => {
+    const id = await deployEpisodeId()
+    const before = (await storage.episodes.getByIds([id]))[0]!.accessCount
+    const n = await storage.episodes.markForgotten([id])
+    expect(n).toBe(1)
+    const after = (await storage.episodes.getByIds([id]))[0]!.accessCount
+    expect(after).toBe(before) // old recordAccess would have bumped this by 1
+  })
+
+  it('confirm=false is a preview no-op', async () => {
+    const preview = await mem.forget('deploy key rotation') // confirm defaults false
+    expect(preview.count).toBeGreaterThanOrEqual(1)
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(true)
+  })
+
+  it('is idempotent — re-forgetting a tombstoned memory is a no-op', async () => {
+    await mem.forget('deploy key rotation', { confirm: true })
+    const second = await mem.forget('deploy key rotation', { confirm: true })
+    expect(second.count).toBe(0) // already gated out of recall, nothing left to match
+    expect(await recallHas('what is the deploy key rotation policy?', 'deploy key')).toBe(false)
+  })
+
+  it('markForgotten is idempotent at the storage level', async () => {
+    const id = await deployEpisodeId()
+    expect(await storage.episodes.markForgotten([id])).toBe(1)
+    expect(await storage.episodes.markForgotten([id])).toBe(0) // already tombstoned
+    expect(await storage.episodes.markForgotten([])).toBe(0)
+  })
+})
diff --git a/packages/sqlite/test/migrations.test.ts b/packages/sqlite/test/migrations.test.ts
index 5549217..386c236 100644
--- a/packages/sqlite/test/migrations.test.ts
+++ b/packages/sqlite/test/migrations.test.ts
@@ -42,9 +42,17 @@ describe('SQLite migrations', () => {
     expect(tables).toContain('procedural_fts')
   })
 
-  it('sets schema version to 4 after all migrations', () => {
+  it('sets schema version to 5 after all migrations', () => {
     runMigrations(db)
-    expect(getSchemaVersion(db)).toBe(4)
+    expect(getSchemaVersion(db)).toBe(5)
+  })
+
+  it('v5 adds forgotten_at to the recallable memory tables', () => {
+    runMigrations(db)
+    for (const table of ['episodes', 'semantic', 'procedural']) {
+      const columnNames = (db.prepare(`PRAGMA table_info(${table})`).all() as Array<{ name: string }>).map((column) => column.name)
+      expect(columnNames.includes('forgotten_at')).toBe(true)
+    }
   })
 
   it('creates episode_parts table (dual-storage architecture)', () => {
@@ -61,7 +69,7 @@ describe('SQLite migrations', () => {
   it('is idempotent (running twice does not error)', () => {
     runMigrations(db)
     runMigrations(db)
-    expect(getSchemaVersion(db)).toBe(4)
+    expect(getSchemaVersion(db)).toBe(5)
   })
 
   it('enforces foreign keys on memories table', () => {

From 7ef97f983a82f60faee155a82c7cf1abbb61e3c7 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:01:20 +0500
Subject: [PATCH 2/9] feat(postgrest): gate recall RPCs on forgotten_at;
 engram_mark_forgotten (Phase 1 production path)

Carry the forget() tombstone into the PostgREST schema so forget removes
content from every recall path on the production (Postgres+pgvector) backend,
matching the SQLite v5 offline path.

- forgotten_at timestamptz on memory_episodes/semantic/procedural, added both
  in the CREATE TABLE bodies and via idempotent ADD COLUMN IF NOT EXISTS so
  re-applying onto an already-provisioned DB actually adds the column.
- engram_mark_forgotten(p_memory_type, p_ids): stamps forgotten_at and touches
  neither access_count nor confidence (writing access_count was the inverted-
  forget bug; flooring confidence was dead). Idempotent, returns rows stamped.
- AND forgotten_at IS NULL gate in all 4 recall RPCs (engram_hybrid_recall,
  engram_recall, engram_text_boost, engram_vector_search) across the episode,
  semantic and procedural branches. Digests are not forgettable.
- partial indexes on tombstoned rows (lockstep with the SQLite v5 indexes).
- EOF post-apply smoke executes every recall RPC + engram_mark_forgotten so a
  missing column or broken gate surfaces at apply time. The dump emits
  functions before tables and relies on check_function_bodies=false, so the
  forgotten_at columns live in the table section and the smoke is the
  call-time guard; there is no migration runner.

Verified on Postgres 17 + pgvector: forget round-trip across all three types,
sibling survives, forgotten row's access_count unchanged. New schema-gate test
pins the predicates; postgrest suite 71/71, typecheck clean.
---
 packages/postgrest/schema.sql                 | 127 +++++++++++++++++-
 .../postgrest/test/forgotten-at-gate.test.ts  | 104 ++++++++++++++
 2 files changed, 227 insertions(+), 4 deletions(-)
 create mode 100644 packages/postgrest/test/forgotten-at-gate.test.ts

diff --git a/packages/postgrest/schema.sql b/packages/postgrest/schema.sql
index c600ae6..ef64a37 100644
--- a/packages/postgrest/schema.sql
+++ b/packages/postgrest/schema.sql
@@ -49,6 +49,27 @@ SET row_security = off;
 -- pgvector extension required for vector(1536) columns
 CREATE EXTENSION IF NOT EXISTS vector;
 
+-- =============================================================================
+-- forget() tombstone — within-file ordering note
+-- -----------------------------------------------------------------------------
+-- Phase 1 adds a `forgotten_at timestamptz` tombstone to memory_episodes /
+-- memory_semantic / memory_procedural. forget() stamps it; every recall RPC
+-- below gates on `forgotten_at IS NULL` (a 1:1 clone of the proven
+-- `superseded_by IS NULL` gate). It is intentionally NOT added to
+-- memory_digests (consolidation artifacts are not directly forgettable).
+--
+-- This file is a pg_dump: functions are emitted ABOVE the tables they read,
+-- which is only valid because `SET check_function_bodies = false` (above)
+-- defers body validation to call time. The forgotten_at columns are therefore
+-- added in the TABLE section (CREATE TABLE bodies + an idempotent
+-- `ADD COLUMN IF NOT EXISTS` block for already-provisioned DBs, since
+-- CREATE TABLE IF NOT EXISTS is a no-op there) and the partial indexes in the
+-- INDEX section — both physically before the only call sites in this file: the
+-- post-apply smoke at EOF, which EXECUTES every recall RPC so a missing column
+-- or broken gate fails LOUDLY at apply time. There is no migration runner; the
+-- single sequential `psql -f schema.sql` apply is the ordering guarantee.
+-- =============================================================================
+
 --
 -- Name: public; Type: SCHEMA; Schema: -; Owner: -
 --
@@ -130,6 +151,7 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer
       SELECT me.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(me.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix
       FROM memory_episodes me
       WHERE p_include_episodes AND me.fts @@ websearch_to_tsquery('english', p_query_text)
+        AND me.forgotten_at IS NULL
         AND (p_session_id IS NULL OR me.session_id = p_session_id)
         AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL)
       LIMIT p_match_count * 2
@@ -138,6 +160,7 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer
       SELECT me.id, ROW_NUMBER() OVER (ORDER BY me.embedding <=> p_query_embedding) AS rank_ix
       FROM memory_episodes me
       WHERE p_include_episodes AND me.embedding IS NOT NULL
+        AND me.forgotten_at IS NULL
         AND (p_session_id IS NULL OR me.session_id = p_session_id)
         AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL)
       ORDER BY me.embedding <=> p_query_embedding LIMIT p_match_count * 2
@@ -178,12 +201,12 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer
   SELECT * FROM (
     WITH ft AS (
       SELECT ms.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(ms.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix
-      FROM memory_semantic ms WHERE p_include_semantic AND ms.fts @@ websearch_to_tsquery('english', p_query_text) AND ms.superseded_by IS NULL
+      FROM memory_semantic ms WHERE p_include_semantic AND ms.fts @@ websearch_to_tsquery('english', p_query_text) AND ms.superseded_by IS NULL AND ms.forgotten_at IS NULL
         AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL) LIMIT p_match_count * 2
     ),
     vs AS (
       SELECT ms.id, ROW_NUMBER() OVER (ORDER BY ms.embedding <=> p_query_embedding) AS rank_ix
-      FROM memory_semantic ms WHERE p_include_semantic AND ms.embedding IS NOT NULL AND ms.superseded_by IS NULL
+      FROM memory_semantic ms WHERE p_include_semantic AND ms.embedding IS NOT NULL AND ms.superseded_by IS NULL AND ms.forgotten_at IS NULL
         AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL)
       ORDER BY ms.embedding <=> p_query_embedding LIMIT p_match_count * 2
     )
@@ -200,12 +223,12 @@ CREATE OR REPLACE FUNCTION public.engram_hybrid_recall(p_query_text text, p_quer
   SELECT * FROM (
     WITH ft AS (
       SELECT mp.id, ROW_NUMBER() OVER (ORDER BY ts_rank_cd(mp.fts, websearch_to_tsquery('english', p_query_text)) DESC) AS rank_ix
-      FROM memory_procedural mp WHERE p_include_procedural AND mp.fts @@ websearch_to_tsquery('english', p_query_text)
+      FROM memory_procedural mp WHERE p_include_procedural AND mp.fts @@ websearch_to_tsquery('english', p_query_text) AND mp.forgotten_at IS NULL
         AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL) LIMIT p_match_count * 2
     ),
     vs AS (
       SELECT mp.id, ROW_NUMBER() OVER (ORDER BY mp.embedding <=> p_query_embedding) AS rank_ix
-      FROM memory_procedural mp WHERE p_include_procedural AND mp.embedding IS NOT NULL
+      FROM memory_procedural mp WHERE p_include_procedural AND mp.embedding IS NOT NULL AND mp.forgotten_at IS NULL
         AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL)
       ORDER BY mp.embedding <=> p_query_embedding LIMIT p_match_count * 2
     )
@@ -236,6 +259,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding public.vector,
            (1-(embedding<=>p_query_embedding))::float AS similarity, entities
     FROM memory_episodes
     WHERE p_include_episodes AND embedding IS NOT NULL
+      AND forgotten_at IS NULL
       AND (p_session_id IS NULL OR session_id = p_session_id)
       AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL)
       AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity
@@ -257,6 +281,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding public.vector,
            (1-(embedding<=>p_query_embedding))::float, ARRAY[]::text[]
     FROM memory_semantic
     WHERE p_include_semantic AND embedding IS NOT NULL AND superseded_by IS NULL
+      AND forgotten_at IS NULL
       AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL)
       AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity
     ORDER BY embedding<=>p_query_embedding LIMIT p_match_count
@@ -267,6 +292,7 @@ CREATE OR REPLACE FUNCTION public.engram_recall(p_query_embedding public.vector,
            (1-(embedding<=>p_query_embedding))::float, ARRAY[]::text[]
     FROM memory_procedural
     WHERE p_include_procedural AND embedding IS NOT NULL
+      AND forgotten_at IS NULL
       AND (p_project_id IS NULL OR project_id = p_project_id OR project_id IS NULL)
       AND (1-(embedding<=>p_query_embedding)) >= p_min_similarity
     ORDER BY embedding<=>p_query_embedding LIMIT p_match_count
@@ -295,6 +321,38 @@ BEGIN
 END; $$;
 
 
+--
+-- Name: engram_mark_forgotten(text, uuid[]); Type: FUNCTION; Schema: public; Owner: -
+--
+
+-- Tombstone primitive for forget(). Stamps forgotten_at = now() on the table chosen by p_memory_type
+-- ('episode' | 'semantic' | 'procedural'; any other value raises) and touches NOTHING else (deliberately no
+-- access_count / confidence write — that was the inverted-forget() bug). Idempotent: the `forgotten_at IS NULL`
+-- guard makes a repeat forget a no-op (returns 0). Returns ROW_COUNT of the one UPDATE that ran, i.e. the rows
+-- newly tombstoned (the per-store markForgotten contract). NOTE(review): SECURITY DEFINER — confirm EXECUTE grants.
+CREATE OR REPLACE FUNCTION public.engram_mark_forgotten(p_memory_type text, p_ids uuid[]) RETURNS integer
+    LANGUAGE plpgsql SECURITY DEFINER
+    SET search_path TO 'public'
+    AS $$
+DECLARE v_count integer;
+BEGIN
+  IF p_memory_type = 'episode' THEN
+    UPDATE memory_episodes SET forgotten_at = now()
+      WHERE id = ANY(p_ids) AND forgotten_at IS NULL;
+  ELSIF p_memory_type = 'semantic' THEN
+    UPDATE memory_semantic SET forgotten_at = now()
+      WHERE id = ANY(p_ids) AND forgotten_at IS NULL;
+  ELSIF p_memory_type = 'procedural' THEN
+    UPDATE memory_procedural SET forgotten_at = now()
+      WHERE id = ANY(p_ids) AND forgotten_at IS NULL;
+  ELSE
+    RAISE EXCEPTION 'engram_mark_forgotten: unknown memory_type %', p_memory_type;
+  END IF;
+  GET DIAGNOSTICS v_count = ROW_COUNT;
+  RETURN v_count;
+END; $$;
+
+
 --
 -- Name: engram_text_boost(text, integer, text); Type: FUNCTION; Schema: public; Owner: -
 --
@@ -312,6 +370,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_
       ts_rank_cd(me.fts, to_tsquery('english', p_query_terms))::float AS rank_score
     FROM memory_episodes me
     WHERE me.fts @@ to_tsquery('english', p_query_terms)
+      AND me.forgotten_at IS NULL
       AND (p_session_id IS NULL OR me.session_id = p_session_id)
       AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL)
 
@@ -330,6 +389,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_
     FROM memory_semantic ms
     WHERE ms.fts @@ to_tsquery('english', p_query_terms)
       AND ms.superseded_by IS NULL
+      AND ms.forgotten_at IS NULL
       AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL)
 
     UNION ALL
@@ -338,6 +398,7 @@ CREATE OR REPLACE FUNCTION public.engram_text_boost(p_query_terms text, p_match_
       ts_rank_cd(mp.fts, to_tsquery('english', p_query_terms))::float
     FROM memory_procedural mp
     WHERE mp.fts @@ to_tsquery('english', p_query_terms)
+      AND mp.forgotten_at IS NULL
       AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL)
   ) combined
   ORDER BY rank_score DESC
@@ -380,6 +441,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public.
     me.entities, me.metadata
   FROM memory_episodes me
   WHERE me.embedding IS NOT NULL
+    AND me.forgotten_at IS NULL
     AND (p_session_id IS NULL OR me.session_id = p_session_id)
     AND (p_project_id IS NULL OR me.project_id = p_project_id OR me.project_id IS NULL)
 
@@ -405,6 +467,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public.
     ARRAY[]::text[], ms.metadata
   FROM memory_semantic ms
   WHERE ms.embedding IS NOT NULL AND ms.superseded_by IS NULL
+    AND ms.forgotten_at IS NULL
     AND (p_project_id IS NULL OR ms.project_id = p_project_id OR ms.project_id IS NULL)
 
   UNION ALL
@@ -417,6 +480,7 @@ CREATE OR REPLACE FUNCTION public.engram_vector_search(p_query_embedding public.
     ARRAY[]::text[], mp.metadata
   FROM memory_procedural mp
   WHERE mp.embedding IS NOT NULL
+    AND mp.forgotten_at IS NULL
     AND (p_project_id IS NULL OR mp.project_id = p_project_id OR mp.project_id IS NULL)
 
   ORDER BY similarity DESC
@@ -633,6 +697,7 @@ CREATE TABLE IF NOT EXISTS public.memory_episodes (
     searchable_content text,
     fts tsvector GENERATED ALWAYS AS (to_tsvector('english'::regconfig, content)) STORED,
     project_id text,
+    forgotten_at timestamp with time zone,
     CONSTRAINT memory_episodes_role_check CHECK ((role = ANY (ARRAY['user'::text, 'assistant'::text, 'system'::text])))
 );
 
@@ -678,6 +743,7 @@ CREATE TABLE IF NOT EXISTS public.memory_procedural (
     updated_at timestamp with time zone DEFAULT now() NOT NULL,
     fts tsvector GENERATED ALWAYS AS (to_tsvector('english'::regconfig, ((trigger_text || ' '::text) || procedure))) STORED,
     project_id text,
+    forgotten_at timestamp with time zone,
     CONSTRAINT memory_procedural_category_check CHECK ((category = ANY (ARRAY['workflow'::text, 'preference'::text, 'habit'::text, 'pattern'::text, 'convention'::text]))),
     CONSTRAINT memory_procedural_confidence_check CHECK (((confidence >= (0.0)::double precision) AND (confidence <= (1.0)::double precision))),
     CONSTRAINT memory_procedural_decay_rate_check CHECK (((decay_rate > (0.0)::double precision) AND (decay_rate <= (1.0)::double precision)))
@@ -708,6 +774,7 @@ CREATE TABLE IF NOT EXISTS public.memory_semantic (
     valid_from timestamp with time zone,
     valid_until timestamp with time zone,
     project_id text,
+    forgotten_at timestamp with time zone,
     CONSTRAINT memory_knowledge_confidence_check CHECK (((confidence >= (0)::double precision) AND (confidence <= (1)::double precision)))
 );
 
@@ -750,6 +817,18 @@ CREATE TABLE IF NOT EXISTS public.sensory_snapshots (
 );
 
 
+--
+-- forget() tombstone columns — idempotent ADD COLUMN for already-provisioned
+-- DBs (CREATE TABLE IF NOT EXISTS above is a no-op there, so the column in the
+-- table body never lands on an existing DB). Placed after the CREATE TABLEs and
+-- before the partial indexes / post-apply smoke that read it. See the ordering
+-- note at the top of this file. NOT added to memory_digests by design.
+--
+ALTER TABLE public.memory_episodes ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone;
+ALTER TABLE public.memory_semantic ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone;
+ALTER TABLE public.memory_procedural ADD COLUMN IF NOT EXISTS forgotten_at timestamp with time zone;
+
+
 --
 -- Name: community_summaries community_summaries_pkey; Type: CONSTRAINT; Schema: public; Owner: -
 --
@@ -1115,6 +1194,19 @@ CREATE INDEX IF NOT EXISTS idx_write_buffer_created ON public.memory_write_buffe
 CREATE INDEX IF NOT EXISTS idx_write_buffer_status ON public.memory_write_buffer USING btree (status);
 
 
+--
+-- forget() tombstone partial indexes: index only the (rare) tombstoned rows so
+-- forgotten-row enumeration (Phase 2 reclamation / audit) is cheap. The hot
+-- `forgotten_at IS NULL` recall predicate matches the majority of rows and is
+-- driven by the vector/fts indexes; it needs no index of its own. Mirrors the
+-- SQLite v5 `WHERE forgotten_at IS NOT NULL` partial indexes (lockstep).
+--
+
+CREATE INDEX IF NOT EXISTS idx_episodes_forgotten ON public.memory_episodes USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL);
+CREATE INDEX IF NOT EXISTS idx_semantic_forgotten ON public.memory_semantic USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL);
+CREATE INDEX IF NOT EXISTS idx_procedural_forgotten ON public.memory_procedural USING btree (forgotten_at) WHERE (forgotten_at IS NOT NULL);
+
+
 --
 -- Name: episode_parts episode_parts_episode_id_fkey; Type: FK CONSTRAINT; Schema: public; Owner: -
 --
@@ -1283,6 +1375,33 @@ DROP POLICY IF EXISTS service_role_all ON public.sensory_snapshots;
 CREATE POLICY service_role_all ON public.sensory_snapshots TO service_role USING (true) WITH CHECK (true);
 
 
+--
+-- Post-apply smoke (no migration runner exists to enforce column-before-function
+-- ordering). Executes every recall RPC + the forget primitive against the just-
+-- applied schema so a missing forgotten_at column or a broken gate fails HERE: it
+-- aborts the apply under `psql -v ON_ERROR_STOP=1`, and otherwise surfaces as a
+-- loud ERROR line in the apply log. Read-only except engram_mark_forgotten on the
+-- nil UUID (matches nothing -> returns 0). Idempotent and safe to re-run. All
+-- names schema-qualified because the dump sets search_path = ''.
+--
+
+DO $smoke$
+DECLARE
+  v_unit public.vector := ('[1' || repeat(',0', 1535) || ']')::public.vector; -- 1536 components: a 1 followed by 1535 zeros
+  v_n integer; -- receives engram_mark_forgotten's row count; assigned but never read
+BEGIN
+  PERFORM public.engram_recall(v_unit, NULL, 1);
+  PERFORM public.engram_hybrid_recall('smoke', v_unit, 1);
+  PERFORM public.engram_text_boost('smoke', 1);
+  PERFORM public.engram_vector_search(v_unit, 1);
+  v_n := public.engram_mark_forgotten('episode', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]);
+  v_n := public.engram_mark_forgotten('semantic', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]);
+  v_n := public.engram_mark_forgotten('procedural', ARRAY['00000000-0000-0000-0000-000000000000']::uuid[]);
+  RAISE NOTICE 'engram schema smoke OK: 4 recall RPCs + engram_mark_forgotten callable; forgotten_at gate live';
+END;
+$smoke$;
+
+
 --
 -- PostgreSQL database dump complete
 --
diff --git a/packages/postgrest/test/forgotten-at-gate.test.ts b/packages/postgrest/test/forgotten-at-gate.test.ts
new file mode 100644
index 0000000..099c82b
--- /dev/null
+++ b/packages/postgrest/test/forgotten-at-gate.test.ts
@@ -0,0 +1,104 @@
+/**
+ * Phase 1 (forget tombstone) — schema-level regression gate.
+ *
+ * schema.sql is GENERATED from a production pg_dump. A future re-dump that
+ * forgets to carry the `forgotten_at IS NULL` predicate would silently make
+ * forget() leak again (the exact class of the inverted-forget bug). These
+ * assertions pin the invariant in the committed file. The runtime behaviour
+ * (forget removes from every recall path, sibling survives, access_count
+ * unchanged) is proven against live Postgres+pgvector; here we pin the source.
+ *
+ * Counts are exact on purpose: a dropped gate lowers a count; gating a wrong
+ * table (e.g. memory_digests, which must NOT be forgettable) raises it.
+ */
+import { describe, it, expect } from 'vitest'
+import { readFileSync } from 'node:fs'
+
+const schema = readFileSync(new URL('../schema.sql', import.meta.url), 'utf8')
+
+/** Returns the text between `AS $$` and the closing `$$;` of `public.<name>(...)` in schema.sql; throws if it is absent. */
+function functionBody(name: string): string {
+  const re = new RegExp(
+    `CREATE OR REPLACE FUNCTION public\\.${name}\\([\\s\\S]*?AS \\$\\$([\\s\\S]*?)\\$\\$;`,
+  )
+  const body = schema.match(re)?.[1] // string | undefined: narrowed below so this typechecks under noUncheckedIndexedAccess
+  if (body === undefined) throw new Error(`function ${name} not found in schema.sql`)
+  return body
+}
+/** Counts non-overlapping occurrences of `needle` in `haystack`: splitting on it yields one more piece than matches. */
+function count(haystack: string, needle: string): number {
+  return haystack.split(needle).length - 1
+}
+
+// Branch counts per recall function: hybrid has ft+vs per type (2 each),
+// the others have one branch per type. Digests are intentionally NOT gated.
+const RECALL_FUNCTIONS = [
+  { name: 'engram_hybrid_recall', forgottenGates: 6, supersededGates: 2 },
+  { name: 'engram_recall', forgottenGates: 3, supersededGates: 1 },
+  { name: 'engram_text_boost', forgottenGates: 3, supersededGates: 1 },
+  { name: 'engram_vector_search', forgottenGates: 3, supersededGates: 1 },
+] as const
+
+describe('schema.sql forgotten_at recall gates', () => {
+  for (const { name, forgottenGates, supersededGates } of RECALL_FUNCTIONS) {
+    it(`${name} gates episode+semantic+procedural on forgotten_at IS NULL (x${forgottenGates})`, () => {
+      const sql = functionBody(name)
+      expect(count(sql, 'forgotten_at IS NULL')).toBe(forgottenGates)
+    })
+
+    it(`${name} still carries the semantic superseded_by gate (x${supersededGates})`, () => {
+      const sql = functionBody(name)
+      expect(count(sql, 'superseded_by IS NULL')).toBe(supersededGates)
+    })
+
+    it(`${name} does NOT gate the digest branch (digests are not forgettable)`, () => {
+      const sql = functionBody(name)
+      // Only the `md.`-qualified form is matched (presumably the digest alias — confirm); an unqualified
+      // forgotten_at added to a digest branch is caught by the exact-count assertion above instead.
+      expect(sql).not.toMatch(/md\.forgotten_at/)
+    })
+  }
+})
+
+describe('schema.sql engram_mark_forgotten primitive', () => {
+  const primitiveSql = functionBody('engram_mark_forgotten')
+
+  it('stamps forgotten_at for all three forgettable types', () => {
+    expect(primitiveSql).toMatch(/UPDATE memory_episodes SET forgotten_at = now\(\)/)
+    expect(primitiveSql).toMatch(/UPDATE memory_semantic SET forgotten_at = now\(\)/)
+    expect(primitiveSql).toMatch(/UPDATE memory_procedural SET forgotten_at = now\(\)/)
+  })
+
+  it('is idempotent: only stamps rows not already forgotten', () => {
+    expect(count(primitiveSql, 'forgotten_at IS NULL')).toBe(3)
+  })
+
+  it('touches NEITHER access_count NOR confidence (the inversion fix)', () => {
+    // Forget must stay a pure tombstone. Writing access_count rewarded the
+    // forgotten memory via accessBoost; writing confidence collides with decay.
+    // Neither column name may appear anywhere in the function body.
+    for (const forbidden of [/access_count/, /confidence/]) expect(primitiveSql).not.toMatch(forbidden)
+  })
+})
+
+describe('schema.sql forgotten_at columns + indexes', () => {
+  it('adds an idempotent forgotten_at column to the 3 forgettable tables', () => {
+    for (const table of ['memory_episodes', 'memory_semantic', 'memory_procedural']) {
+      // IF NOT EXISTS is what lets the column land on an already-provisioned DB.
+      const addColumn = new RegExp(`ALTER TABLE public\\.${table} ADD COLUMN IF NOT EXISTS forgotten_at`)
+      expect(schema).toMatch(addColumn)
+    }
+  })
+
+  it('does NOT add forgotten_at to memory_digests', () => {
+    expect(schema).not.toMatch(/ALTER TABLE public\.memory_digests ADD COLUMN IF NOT EXISTS forgotten_at/)
+  })
+
+  it('creates a partial index on tombstoned rows for each forgettable table', () => {
+    for (const idx of ['idx_episodes_forgotten', 'idx_semantic_forgotten', 'idx_procedural_forgotten']) {
+      // `[^;]*?` keeps the match inside ONE statement; a lazy `[\s\S]*?` could borrow a later index's WHERE clause.
+      const partialIndex = new RegExp(`CREATE INDEX IF NOT EXISTS ${idx} [^;]*?WHERE \\(forgotten_at IS NOT NULL\\)`)
+      expect(schema).toMatch(partialIndex)
+    }
+  })
+})

From 45700f4d476e9b4ea8c834774362bd0db3526958 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:01:37 +0500
Subject: [PATCH 3/9] feat(graph,core): tombstone Neo4j Memory nodes so forget
 cannot leak through the graph
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After the SQL forget gate, a forgotten memory could still surface through graph
spreading activation — and would leak into authoritative recall once
associations are merged into the scored pool. Close the graph channel.

- GraphPort.forgetMemories?(ids): optional, capability-guarded port method.
- NeuralGraph.forgetMemories stamps forgottenAt on :Memory {id} nodes
  (idempotent, returns count). Memory nodes are uniformly :Memory regardless of
  memoryType, so one match covers episode/semantic/procedural ids.
- spreading-activation Cypher gates traversal on
  coalesce(n.forgottenAt, n.deletedAt) IS NULL (NULL-permissive), which also
  prunes paths that pass through a forgotten node.
- Memory.forget() calls graph.forgetMemories after the SQL tombstone,
  capability-guarded and non-fatal: SQL stays the source of truth, and a graph
  hiccup or absent Neo4j never fails the forget.

Verified on Neo4j 5: spreading-activation 12/12 incl. endpoint exclusion,
path-through pruning, sibling survival, idempotency (1 then 0) and cross-type
semantic gating. core 495/495, sqlite 106/106, typecheck clean.
---
 packages/core/src/adapters/graph.ts           | 12 +++++
 packages/core/src/memory.ts                   | 17 +++++++
 packages/graph/src/neural-graph.ts            | 29 +++++++++++
 packages/graph/src/spreading-activation.ts    |  3 ++
 .../graph/test/spreading-activation.test.ts   | 50 +++++++++++++++++++
 5 files changed, 111 insertions(+)

diff --git a/packages/core/src/adapters/graph.ts b/packages/core/src/adapters/graph.ts
index ddc73cd..4f58940 100644
--- a/packages/core/src/adapters/graph.ts
+++ b/packages/core/src/adapters/graph.ts
@@ -112,6 +112,18 @@ export interface GraphPort {
   lookupEntityNodes(names: string[]): Promise<GraphEntitySeedResult[]>
   spreadActivation(opts: GraphSpreadActivationOpts): Promise<GraphActivatedNode[]>
   strengthenTraversedEdges(pairs: Array<[string, string]>): Promise<void>
+
+  /**
+   * Phase 1 (forget tombstone): stamp `forgottenAt` on the Neo4j Memory nodes
+   * with these ids, so spreading activation excludes them — its path filter
+   * gates on `coalesce(n.forgottenAt, n.deletedAt) IS NULL`. Without this,
+   * forget() hides a memory in SQL recall but it can still surface through the
+   * graph association channel. Optional (only NeuralGraph implements it); core
+   * calls it capability-guarded and treats failure as non-fatal. Returns the
+   * number of nodes newly tombstoned.
+   */
+  forgetMemories?(ids: string[]): Promise<number>
+
   // Wave 3: raw Cypher execution for consolidation operations
   // Returns the driver-native result type — consolidation code accesses
   // .records and .summary.counters via the GraphQueryResult shape, but
diff --git a/packages/core/src/memory.ts b/packages/core/src/memory.ts
index 15ab317..23e9752 100644
--- a/packages/core/src/memory.ts
+++ b/packages/core/src/memory.ts
@@ -907,6 +907,23 @@ export class Memory {
     if (idsByType.procedural.length > 0) await this.storage.procedural.markForgotten(idsByType.procedural)
     if (idsByType.episode.length > 0) await this.storage.episodes.markForgotten(idsByType.episode)
 
+    // Also tombstone the Neo4j Memory nodes so forget cannot leak back through
+    // the graph association channel: spreading activation gates traversal on
+    // coalesce(forgottenAt, deletedAt) IS NULL. SQL remains the source of truth —
+    // a graph hiccup (or absent Neo4j) must never fail the forget. Capability-
+    // guarded because forgetMemories is optional on GraphPort.
+    const graph = this._graph
+    if (graph && typeof graph.forgetMemories === 'function') {
+      const forgottenIds = [...idsByType.episode, ...idsByType.semantic, ...idsByType.procedural]
+      if (forgottenIds.length > 0) {
+        try {
+          await graph.forgetMemories(forgottenIds)
+        } catch (err) {
+          console.warn('[engram] forget: graph tombstone failed (non-fatal):', err)
+        }
+      }
+    }
+
     return { count: filtered.length, previewed: filtered }
   }
 
diff --git a/packages/graph/src/neural-graph.ts b/packages/graph/src/neural-graph.ts
index 95d30d7..8c0a786 100644
--- a/packages/graph/src/neural-graph.ts
+++ b/packages/graph/src/neural-graph.ts
@@ -221,6 +221,35 @@ export class NeuralGraph {
     }
   }
 
+  /**
+   * Phase 1 (forget tombstone): sets `forgottenAt` to the current ISO-8601
+   * timestamp on every `:Memory` node whose `id` is in `ids`, inside one write
+   * transaction, so spreading activation's `forgottenAt` path filter skips them.
+   * Nodes that already carry `forgottenAt` are left untouched, which makes a
+   * repeat call idempotent. The match uses only the `:Memory` label and the id,
+   * so the memory's type plays no part. An empty `ids` returns 0 without opening
+   * a session. Returns how many nodes were newly stamped.
+   */
+  async forgetMemories(ids: string[]): Promise<number> {
+    if (ids.length === 0) return 0
+    const stampedAt = new Date().toISOString()
+    const session = this.driver.session()
+    try {
+      const outcome = await session.executeWrite(async (tx: ManagedTransaction) =>
+        tx.run(
+          `MATCH (m:Memory)
+           WHERE m.id IN $ids AND m.forgottenAt IS NULL
+           SET m.forgottenAt = $now
+           RETURN count(m) AS forgotten`,
+          { ids, now: stampedAt }
+        )
+      )
+      const tally = outcome.records[0]?.get('forgotten')
+      return !neo4j.isInt(tally) ? Number(tally ?? 0) : (tally as { toNumber(): number }).toNumber()
+    } finally {
+      await session.close()
+    }
+  }
+
   async addPersonNode(input: PersonNodeInput): Promise<string> {
     const id = `person:${normalizeForId(input.name)}`
     const now = new Date().toISOString()
diff --git a/packages/graph/src/spreading-activation.ts b/packages/graph/src/spreading-activation.ts
index bab0485..fd9775d 100644
--- a/packages/graph/src/spreading-activation.ts
+++ b/packages/graph/src/spreading-activation.ts
@@ -43,6 +43,9 @@ export class SpreadingActivation {
                 OR NOT n:Memory
                 OR n.projectId = $projectId
                 OR n.projectId IS NULL)
+          AND ALL(n IN nodes(path) WHERE
+                NOT n:Memory
+                OR coalesce(n.forgottenAt, n.deletedAt) IS NULL)
         WITH neighbor,
              reduce(
                activation = 1.0,
diff --git a/packages/graph/test/spreading-activation.test.ts b/packages/graph/test/spreading-activation.test.ts
index d382b36..6557d77 100644
--- a/packages/graph/test/spreading-activation.test.ts
+++ b/packages/graph/test/spreading-activation.test.ts
@@ -189,4 +189,54 @@ describe.skipIf(!neo4jReady)('SpreadingActivation (integration)', () => {
     })
     expect(unscoped.map((r) => r.nodeId)).toContain('ep-beta')
   })
+
+  // Phase 1 — forget tombstone must close the graph channel. After forget,
+  // SQL recall hides the memory; without this gate the same memory still
+  // surfaces through graph spreading activation (and would leak once
+  // associations are merged into authoritative recall).
+  it('forgetMemories excludes the node and prunes paths through it; siblings survive', async () => {
+    // Chain a -> b -> c (all :Memory). Sibling a -> d, not through b.
+    await graph.addMemoryNode({ id: 'm-a', memoryType: 'episode', label: 'A' })
+    await graph.addMemoryNode({ id: 'm-b', memoryType: 'episode', label: 'B' })
+    await graph.addMemoryNode({ id: 'm-c', memoryType: 'episode', label: 'C' })
+    await graph.addMemoryNode({ id: 'm-d', memoryType: 'episode', label: 'D' })
+    await graph.addEdge('m-a', 'm-b', 'TEMPORAL', 0.9)
+    await graph.addEdge('m-b', 'm-c', 'TEMPORAL', 0.9)
+    await graph.addEdge('m-a', 'm-d', 'TEMPORAL', 0.9)
+
+    const before = await activation.activate(['m-a'], {
+      maxHops: 2, decayPerHop: 0.8, minActivation: 0.001, maxNodes: 50,
+    })
+    const beforeIds = before.map((r) => r.nodeId)
+    expect(beforeIds).toContain('m-b') // hop 1
+    expect(beforeIds).toContain('m-c') // hop 2, only reachable through b
+    expect(beforeIds).toContain('m-d') // sibling
+
+    expect(await graph.forgetMemories(['m-b'])).toBe(1)
+
+    const after = await activation.activate(['m-a'], {
+      maxHops: 2, decayPerHop: 0.8, minActivation: 0.001, maxNodes: 50,
+    })
+    const afterIds = after.map((r) => r.nodeId)
+    expect(afterIds).not.toContain('m-b') // forgotten endpoint excluded
+    expect(afterIds).not.toContain('m-c') // path through forgotten b is pruned
+    expect(afterIds).toContain('m-d')      // independent path intact
+  })
+
+  it('forgetMemories is idempotent and gates a forgotten semantic node', async () => {
+    await graph.addMemoryNode({ id: 'm-x', memoryType: 'episode', label: 'X' })
+    await graph.addMemoryNode({ id: 'm-y', memoryType: 'semantic', label: 'Y' })
+    await graph.addEdge('m-x', 'm-y', 'TEMPORAL', 0.9)
+
+    // NULL-permissive: an unforgotten node activates normally.
+    const before = await activation.activate(['m-x'], { maxHops: 1, decayPerHop: 0.8, minActivation: 0.001 })
+    expect(before.map((r) => r.nodeId)).toContain('m-y')
+
+    expect(await graph.forgetMemories(['m-y'])).toBe(1) // first stamp
+    expect(await graph.forgetMemories(['m-y'])).toBe(0) // already forgotten — idempotent
+    expect(await graph.forgetMemories([])).toBe(0)       // empty no-op
+
+    const after = await activation.activate(['m-x'], { maxHops: 1, decayPerHop: 0.8, minActivation: 0.001 })
+    expect(after.map((r) => r.nodeId)).not.toContain('m-y')
+  })
 })

From e539bfcf3d966a7a9ed48cb3de70c2c032592e46 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:18:22 +0500
Subject: [PATCH 4/9] =?UTF-8?q?feat(bench):=20make=20the=20metric=20see=20?=
 =?UTF-8?q?the=20graph=20=E2=80=94=20mergeAssociationsIntoScored=20(Phase?=
 =?UTF-8?q?=200,=20unit=201)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Discovery #1: both bench adapters scored only recallResult.memories, but graph
spreading-activation output lands in the separate recallResult.associations
channel. So graph:true vs graph:false mathematically could not move recall@K —
"the graph doesn't help" was a measurement-instrumentation bug, not a verdict.

- mergeAssociationsIntoScored(recallResult, flag): when the flag is set, unions
  associations after the memory channel; otherwise returns memories unchanged.
- BenchmarkOpts.mergeAssociationsIntoTopK (default false → byte-identical runs).
- Wired into both LongMemEval (runQuestion) and LoCoMo (evaluateDataset) scoring.
- Score-scale-safe by construction: both adapters score by gold-id set-membership
  over the deduped top-K, not score magnitude. So unioning the graph-relevance-
  ranked associations after the MMR/cross-encoder-ranked memories cannot be
  confounded by the scale mismatch — a gold id is either in the first K deduped
  ids or it is not. Memory-first ordering means associations can only RESCUE a
  gold id the memory channel missed, never displace one.
- associations-visible-to-scored.test.ts: deterministic invariant (no Neo4j, no
  LLM, no dataset) — gold present in the scored pool with merge ON, absent OFF.
  This is the "associations-visible" invariant the symmetric kill criterion
  (later unit) depends on.
- Adds packages/bench/vitest.config.ts so the gate test runs under turbo/CI.

Default off → zero behaviour change to existing runs. bench typecheck clean, 4/4.
---
 packages/bench/src/index.ts                   |  2 +
 packages/bench/src/locomo/adapter.ts          |  7 +-
 packages/bench/src/longmemeval/adapter.ts     |  5 +-
 packages/bench/src/merge-associations.ts      | 42 +++++++++
 packages/bench/src/types.ts                   |  9 ++
 .../associations-visible-to-scored.test.ts    | 92 +++++++++++++++++++
 packages/bench/vitest.config.ts               |  9 ++
 7 files changed, 163 insertions(+), 3 deletions(-)
 create mode 100644 packages/bench/src/merge-associations.ts
 create mode 100644 packages/bench/test/associations-visible-to-scored.test.ts
 create mode 100644 packages/bench/vitest.config.ts

diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index e3393e8..8cd2dbe 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -4,6 +4,8 @@ export { compareLoCoMo, compareLongMemEval } from './runner/compare.js'
 export { computeRetrievalF1, recallAtK } from './metrics/f1.js'
 export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js'
 export { createBenchMemory } from './memory-factory.js'
+export { mergeAssociationsIntoScored } from './merge-associations.js'
+export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
 export type {
   BenchmarkOpts, BenchmarkMetrics,
   LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics,
diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts
index 14e460a..da384d9 100644
--- a/packages/bench/src/locomo/adapter.ts
+++ b/packages/bench/src/locomo/adapter.ts
@@ -8,6 +8,7 @@ import type {
 import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js'
 import { computeRetrievalF1 } from '../metrics/f1.js'
 import { createBenchMemory } from '../memory-factory.js'
+import { mergeAssociationsIntoScored } from '../merge-associations.js'
 
 export class LoCoMoAdapter {
   async loadDataset(dataPath: string): Promise<LoCoMoConversationFile[]> {
@@ -184,7 +185,7 @@ export class LoCoMoAdapter {
   async evaluateDataset(
     conversations: LoCoMoConversationFile[],
     memory: Memory,
-    opts?: Pick<BenchmarkOpts, 'topK'>,
+    opts?: Pick<BenchmarkOpts, 'topK' | 'mergeAssociationsIntoTopK'>,
   ): Promise<LoCoMoConversationResult[]> {
     const topK = opts?.topK ?? 10
     const convResults: LoCoMoConversationResult[] = []
@@ -195,7 +196,9 @@ export class LoCoMoAdapter {
 
       for (const qa of conv.qa) {
         const recallResult = await memory.recall(qa.question)
-        const topMemories = recallResult.memories.slice(0, topK)
+        const topMemories = mergeAssociationsIntoScored(
+          recallResult, opts?.mergeAssociationsIntoTopK,
+        ).slice(0, topK)
 
         const prediction = topMemories
           .map(m => m.content)
diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts
index 87b028b..9afff54 100644
--- a/packages/bench/src/longmemeval/adapter.ts
+++ b/packages/bench/src/longmemeval/adapter.ts
@@ -22,6 +22,7 @@ import type {
 } from '../types.js'
 import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js'
 import { createBenchMemory } from '../memory-factory.js'
+import { mergeAssociationsIntoScored } from '../merge-associations.js'
 
 export class LongMemEvalAdapter {
   /**
@@ -145,7 +146,9 @@ export class LongMemEvalAdapter {
 
       const evalStart = Date.now()
       const recallResult = await memory.recall(question.question)
-      const topMemories = recallResult.memories.slice(0, topK)
+      const topMemories = mergeAssociationsIntoScored(
+        recallResult, opts?.mergeAssociationsIntoTopK,
+      ).slice(0, topK)
 
       // Deduplicate retrieved sessions in rank order
       const seen = new Set<string>()
diff --git a/packages/bench/src/merge-associations.ts b/packages/bench/src/merge-associations.ts
new file mode 100644
index 0000000..d20b05f
--- /dev/null
+++ b/packages/bench/src/merge-associations.ts
@@ -0,0 +1,42 @@
+import type { Memory } from '@engram-mem/core'
+
+// Derived from Memory.recall()'s return type — core does not re-export
+// RecallResult / RetrievedMemory by name, so we pin the shape structurally.
+// This keeps the bench decoupled from core's internal module paths.
+export type BenchRecallResult = Awaited<ReturnType<Memory['recall']>>
+export type BenchScoredMemory = BenchRecallResult['memories'][number]
+
+/**
+ * Build the candidate pool a bench adapter scores.
+ *
+ * Flag falsy (the default): returns `recallResult.memories` itself — the same
+ * array reference, so runs are byte-identical to pre-Phase-0 behaviour.
+ *
+ * Flag true: returns a new array holding every entry of
+ * `recallResult.memories` followed by every entry of
+ * `recallResult.associations` (the graph spreading-activation channel), which
+ * is what makes that channel visible to recall@K.
+ *
+ * Why plain concatenation is score-scale-safe: both adapters decide a hit by
+ * gold-id SET-MEMBERSHIP over the pool they are handed (LongMemEval matches
+ * answer_session_ids against metadata.lmeSessionId; LoCoMo matches
+ * qa.evidence against metadata.locomoDiaId), never by score magnitude. So
+ * the differing relevance scales of the two channels cannot confound the
+ * metric, and no cross-encoder re-run over the merged pool is needed.
+ *
+ * Ordering consequence: every association ranks after every memory, so the
+ * graph channel can only RESCUE a gold id the memory channel missed; it can
+ * never push a memory-channel entry out of a caller's top-K slice. The flip
+ * side: a caller that slices to K sees associations only when `memories`
+ * holds fewer than K entries.
+ *
+ * NOTE(review): no de-duplication by id happens here, and the gold-id keys
+ * are assumed to ride through on each association's `metadata` — confirm.
+ */
+export function mergeAssociationsIntoScored(
+  recallResult: BenchRecallResult,
+  mergeAssociationsIntoTopK: boolean | undefined,
+): BenchScoredMemory[] {
+  const { memories, associations } = recallResult
+  return mergeAssociationsIntoTopK ? [...memories, ...associations] : memories
+}
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
index db39517..02bea19 100644
--- a/packages/bench/src/types.ts
+++ b/packages/bench/src/types.ts
@@ -7,6 +7,15 @@ export interface BenchmarkOpts {
   topK?: number          // default 10
   limit?: number         // max conversations to evaluate (default: all)
   noRerank?: boolean     // disable cross-encoder reranking for A/B comparison
+  /**
+   * Phase 0: merge the graph spreading-activation channel
+   * (`recallResult.associations`) into the scored top-K pool before recall@K
+   * is computed. Default false → byte-identical to pre-Phase-0 runs. The
+   * adapters score by gold-id set-membership (scale-independent), so unioning
+   * the graph-relevance-ranked associations after the MMR/rerank'd memories is
+   * safe. This is what makes graph:true vs graph:false able to move the metric.
+   */
+  mergeAssociationsIntoTopK?: boolean
   /**
    * Cross-encoder backend. 'openai' (default) uses LLM pointwise scoring via
    * gpt-4o-mini; 'onnx' uses a local mxbai-rerank ONNX model (no API cost,
diff --git a/packages/bench/test/associations-visible-to-scored.test.ts b/packages/bench/test/associations-visible-to-scored.test.ts
new file mode 100644
index 0000000..fdef1f5
--- /dev/null
+++ b/packages/bench/test/associations-visible-to-scored.test.ts
@@ -0,0 +1,92 @@
+/**
+ * Phase 0 invariant — the metric MUST be able to see the graph channel.
+ *
+ * Discovery #1: both bench adapters scored only `recallResult.memories`, but
+ * graph spreading-activation output lands in the separate
+ * `recallResult.associations` channel. So `graph:true` vs `graph:false`
+ * mathematically could not move recall@K — the graph was never measured.
+ *
+ * This pins the fix: with the merge flag ON, a gold id that lives ONLY in
+ * associations becomes visible to the scored top-K; with it OFF, the scored
+ * pool is the untouched `memories` array (pre-Phase-0 behaviour). This is the
+ * "associations-visible invariant" the symmetric kill criterion (Unit 8) relies on.
+ *
+ * Deterministic: a hand-built recall result, no dataset, no Neo4j, no LLM.
+ */
+import { describe, it, expect } from 'vitest'
+import {
+  mergeAssociationsIntoScored,
+  type BenchRecallResult,
+  type BenchScoredMemory,
+} from '../src/merge-associations.js'
+
+const GOLD_SESSION = 'gold-sess'
+
+function mem(id: string, lmeSessionId: string): BenchScoredMemory {
+  return {
+    id,
+    type: 'episode',
+    content: `content-${id}`,
+    relevance: 0.9, // strong channel — post MMR/rerank
+    source: 'recall',
+    metadata: { lmeSessionId },
+  }
+}
+
+function assoc(id: string, lmeSessionId: string): BenchScoredMemory {
+  return {
+    id,
+    type: 'episode',
+    content: `content-${id}`,
+    relevance: 0.12, // graph-relevance scale — deliberately lower than memories
+    source: 'association',
+    metadata: { lmeSessionId, activationSource: 'spreading_activation' },
+  }
+}
+
+// Gold session lives ONLY in the association channel; the memory channel holds
+// only noise. This is the case the graph is supposed to rescue.
+function makeFixture(): BenchRecallResult {
+  return {
+    memories: [mem('m1', 'noise-a'), mem('m2', 'noise-b')],
+    associations: [assoc('a1', GOLD_SESSION)],
+    intent: {} as BenchRecallResult['intent'],
+    primed: [],
+    estimatedTokens: 0,
+    formatted: '',
+  }
+}
+
+// Same membership check the adapters apply: look only at the first `k` pool
+// entries and report whether any of them carries the gold session id.
+function goldInTopK(pool: BenchScoredMemory[], k = 10): boolean {
+  const head = pool.slice(0, k)
+  const sessionIds = head
+    .map((entry) => entry.metadata?.['lmeSessionId'] as string | undefined)
+    .filter((sid): sid is string => Boolean(sid))
+  return new Set(sessionIds).has(GOLD_SESSION)
+}
+
+describe('associations are visible to the scored pool iff merge is ON', () => {
+  it('gold session is ABSENT from the scored pool when merge is OFF', () => {
+    const scored = mergeAssociationsIntoScored(makeFixture(), false)
+    expect(goldInTopK(scored)).toBe(false)
+  })
+
+  it('gold session is PRESENT in the scored pool when merge is ON', () => {
+    const scored = mergeAssociationsIntoScored(makeFixture(), true)
+    expect(goldInTopK(scored)).toBe(true)
+  })
+
+  it('merge OFF returns the memories array unchanged (no behaviour drift)', () => {
+    const fixture = makeFixture()
+    expect(mergeAssociationsIntoScored(fixture, false)).toBe(fixture.memories)
+    expect(mergeAssociationsIntoScored(fixture, undefined)).toBe(fixture.memories)
+  })
+
+  it('merge ON appends associations after memories (memory-first ordering)', () => {
+    const fixture = makeFixture()
+    const scored = mergeAssociationsIntoScored(fixture, true)
+    expect(scored.map((m) => m.id)).toEqual(['m1', 'm2', 'a1'])
+  })
+})
diff --git a/packages/bench/vitest.config.ts b/packages/bench/vitest.config.ts
new file mode 100644
index 0000000..c03431f
--- /dev/null
+++ b/packages/bench/vitest.config.ts
@@ -0,0 +1,9 @@
+import { defineConfig } from 'vitest/config'
+
+export default defineConfig({
+  test: {
+    include: ['test/**/*.test.ts'],
+    environment: 'node',
+    testTimeout: 10000,
+  },
+})

From 1d7012a427b337fc2d04dc4979cb625aa0afdec4 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:20:17 +0500
Subject: [PATCH 5/9] feat(bench): symmetric kill criterion for the graph bet
 (Phase 0)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Encodes the red-team rule that prevents the historical mistake — concluding
"kill the graph" from the saturated LongMemEval-S aggregate (~98.8% recall@5,
where nothing has headroom to move). graphVerdict() demands POSITIVE evidence
of no-effect before a kill:

- kill ONLY when the primary aggregate delta is null/negative AND graphEffect is
  flat (≤ epsilon) on a graph-visible split with n ≥ 100, and the
  associations-visible invariant is green. Never the aggregate alone.
- keep as soon as graphEffect clears epsilon on a powered, visible split — even
  when the saturated aggregate is flat or negative.
- insufficient_power when underpowered (n<100), when the invariant is red, or in
  the ambiguous aggregate-positive-but-flat-effect case.

Pure and deterministic; 6 table-driven cases pin the asymmetry, the power gate,
and the invariant dependency. bench typecheck clean, 10/10.
---
 packages/bench/src/index.ts                 |  2 +
 packages/bench/src/metrics/graph-verdict.ts | 70 ++++++++++++++++++
 packages/bench/test/graph-verdict.test.ts   | 78 +++++++++++++++++++++
 3 files changed, 150 insertions(+)
 create mode 100644 packages/bench/src/metrics/graph-verdict.ts
 create mode 100644 packages/bench/test/graph-verdict.test.ts

diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index 8cd2dbe..5b25b78 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -6,6 +6,8 @@ export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from
 export { createBenchMemory } from './memory-factory.js'
 export { mergeAssociationsIntoScored } from './merge-associations.js'
 export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
+export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js'
+export type { GraphVerdict, GraphVerdictInput } from './metrics/graph-verdict.js'
 export type {
   BenchmarkOpts, BenchmarkMetrics,
   LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics,
diff --git a/packages/bench/src/metrics/graph-verdict.ts b/packages/bench/src/metrics/graph-verdict.ts
new file mode 100644
index 0000000..7ce9505
--- /dev/null
+++ b/packages/bench/src/metrics/graph-verdict.ts
@@ -0,0 +1,70 @@
+// Phase 0 — symmetric kill criterion for the graph bet.
+//
+// The historical failure: concluding "the graph doesn't help, kill it" from a
+// SATURATED aggregate (LongMemEval-S sits at ~98.8% recall@5 — there is almost
+// no headroom for ANY change to move it). A null delta there means "no signal",
+// not "no value". This module encodes the red-team rule that makes killing the
+// graph require POSITIVE evidence of no-effect on a split where an effect could
+// actually show up, with enough samples to trust it.
+
+export type GraphVerdict = 'kill' | 'keep' | 'insufficient_power'
+
+export interface GraphVerdictInput {
+  /**
+   * Primary aggregate delta: recall@K(graph) − recall@K(no-graph) on the full
+   * (saturated) corpus, e.g. LongMemEval-S. Near-zero is expected even if the
+   * graph helps, because the corpus is saturated — so this alone never decides.
+   */
+  primaryAggregateDelta: number
+  /**
+   * graphEffect: recall@K(merge ON) − recall@K(merge OFF) on the GRAPH-VISIBLE
+   * split (questions where the graph channel could plausibly contribute). This
+   * is the scale-independent set-membership lift — the signal that matters.
+   */
+  graphEffect: number
+  /** Size of the graph-visible split. Below MIN_POWER_N, no verdict is allowed. */
+  graphVisibleN: number
+  /**
+   * The associations-visible-to-scored invariant must be green: if the metric
+   * structurally cannot see the graph channel, every delta is measurement noise
+   * and no kill/keep verdict is trustworthy.
+   */
+  associationsVisibleInvariantGreen: boolean
+  /** Equivalence margin below which graphEffect counts as "flat". */
+  epsilon?: number
+}
+
+/** Minimum graph-visible sample size to render any verdict. Below this → no decision. */
+export const MIN_POWER_N = 100
+/** Default flatness margin for graphEffect. */
+export const DEFAULT_EPSILON = 0.005
+
+/**
+ * Render a verdict on the graph bet.
+ *
+ * - `keep`               — graphEffect > ε on a powered (n≥100), graph-visible split.
+ * - `kill`               — BOTH a null/negative aggregate delta AND a flat
+ *                          (≤ ε) graphEffect, on a powered split, with the
+ *                          invariant green. Never the aggregate alone.
+ * - `insufficient_power` — invariant red, OR n < 100, OR a NaN input, OR the
+ *                          ambiguous case (aggregate positive, graphEffect flat).
+ */
+export function graphVerdict(input: GraphVerdictInput): GraphVerdict {
+  const epsilon = input.epsilon ?? DEFAULT_EPSILON
+
+  // The metric must be able to see the graph at all — else any delta is noise.
+  if (!input.associationsVisibleInvariantGreen) return 'insufficient_power'
+
+  // Underpowered → never decide. Written as !(n >= MIN) so a NaN n fails the gate.
+  if (!(input.graphVisibleN >= MIN_POWER_N)) return 'insufficient_power'
+
+  // The graph demonstrably helps the visible split → keep.
+  if (input.graphEffect > epsilon) return 'keep'
+
+  // Kill needs POSITIVE evidence of BOTH: a flat (≤ ε) graphEffect AND a null/
+  // negative aggregate. The explicit ≤ checks are false for NaN → no kill.
+  if (input.graphEffect <= epsilon && input.primaryAggregateDelta <= 0) return 'kill'
+
+  // Aggregate positive but graphEffect flat (or an input was NaN) → no decision.
+  return 'insufficient_power'
+}
diff --git a/packages/bench/test/graph-verdict.test.ts b/packages/bench/test/graph-verdict.test.ts
new file mode 100644
index 0000000..01fc770
--- /dev/null
+++ b/packages/bench/test/graph-verdict.test.ts
@@ -0,0 +1,78 @@
+/**
+ * Phase 0 — symmetric kill criterion regression gate.
+ *
+ * Locks the rule that prevents the historical mistake: never conclude "kill the
+ * graph" from the saturated aggregate alone. Killing requires BOTH a null/
+ * negative aggregate delta AND a flat graphEffect on a powered (n≥100),
+ * graph-visible split, with the associations-visible invariant green.
+ */
+import { describe, it, expect } from 'vitest'
+import {
+  graphVerdict,
+  MIN_POWER_N,
+  DEFAULT_EPSILON,
+  type GraphVerdictInput,
+} from '../src/metrics/graph-verdict.js'
+
+const powered: Pick<GraphVerdictInput, 'graphVisibleN' | 'associationsVisibleInvariantGreen'> = {
+  graphVisibleN: 150,
+  associationsVisibleInvariantGreen: true,
+}
+
+describe('graphVerdict — symmetric kill criterion', () => {
+  it('KEEP when graphEffect clears epsilon on a powered, visible split', () => {
+    expect(graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: 0.04 })).toBe('keep')
+    // Keep holds even when the saturated aggregate is flat/negative.
+    expect(graphVerdict({ ...powered, primaryAggregateDelta: -0.002, graphEffect: 0.03 })).toBe('keep')
+  })
+
+  it('KILL only when BOTH the aggregate is null/negative AND graphEffect is flat', () => {
+    expect(graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: 0 })).toBe('kill')
+    expect(graphVerdict({ ...powered, primaryAggregateDelta: -0.01, graphEffect: 0.002 })).toBe('kill')
+  })
+
+  it('does NOT kill on a flat graphEffect when the aggregate is POSITIVE (the key asymmetry)', () => {
+    expect(graphVerdict({ ...powered, primaryAggregateDelta: 0.01, graphEffect: 0 })).toBe('insufficient_power')
+  })
+
+  it('never decides below the power threshold, even with a flat effect + negative aggregate', () => {
+    expect(
+      graphVerdict({
+        graphVisibleN: MIN_POWER_N - 1,
+        associationsVisibleInvariantGreen: true,
+        primaryAggregateDelta: -0.05,
+        graphEffect: 0,
+      }),
+    ).toBe('insufficient_power')
+    // Exactly at the threshold is enough to decide.
+    expect(
+      graphVerdict({
+        graphVisibleN: MIN_POWER_N,
+        associationsVisibleInvariantGreen: true,
+        primaryAggregateDelta: 0,
+        graphEffect: 0,
+      }),
+    ).toBe('kill')
+  })
+
+  it('never decides when the associations-visible invariant is red', () => {
+    // Would otherwise be a clear KEEP, but the metric cannot see the graph.
+    expect(
+      graphVerdict({
+        graphVisibleN: 500,
+        associationsVisibleInvariantGreen: false,
+        primaryAggregateDelta: 0.1,
+        graphEffect: 0.2,
+      }),
+    ).toBe('insufficient_power')
+  })
+
+  it('treats graphEffect exactly at epsilon as flat (not a keep)', () => {
+    expect(
+      graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: DEFAULT_EPSILON }),
+    ).toBe('kill')
+    expect(
+      graphVerdict({ ...powered, primaryAggregateDelta: 0, graphEffect: DEFAULT_EPSILON + 1e-6 }),
+    ).toBe('keep')
+  })
+})

From ed1e49b852552e57a6373cce32efd36517c1338a Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:49:28 +0500
Subject: [PATCH 6/9] feat(bench): requireGraph guard + per-unit graph
 isolation (Phase 0, units 3-4)

createBenchMemory now returns a {memory, config, graphActuallyWired} handle
instead of a bare Memory, so graph cells can reach the graph handle and hard-
fail when it is absent.

- requireGraph(handle): throws if a graph cell runs without a real bench Neo4j,
  killing the silent SQL-only fallback that would otherwise report a SQL delta
  as a graph result (the "graph was never measured" trap). Lives in a
  dependency-light bench-memory-handle module so the guard + types are
  unit-testable without loading the ONNX native binding.
- wipeBenchGraph wired into LongMemEval runQuestion and LoCoMo runConversation
  before ingest: Neo4j is a shared external process (unlike the per-call fresh
  :memory: SQLite), so each question/conversation must start with a clean graph
  or the previous unit's nodes pollute spreading activation. (wipeBenchGraph
  existed but was called nowhere.)
- Migrated all 5 createBenchMemory callers to destructure the handle.

bench typecheck clean, 12/12.
---
 packages/bench/src/bench-memory-handle.ts     | 39 +++++++++++++++++++
 packages/bench/src/index.ts                   |  4 +-
 packages/bench/src/locomo/adapter.ts          |  8 +++-
 .../locomo/forensics/local-recall-sweep.ts    |  2 +-
 packages/bench/src/locomo/judge-adapter.ts    |  2 +-
 packages/bench/src/longmemeval/adapter.ts     |  7 +++-
 .../src/longmemeval/forensics/recall-sweep.ts |  2 +-
 packages/bench/src/memory-factory.ts          | 16 ++++++--
 packages/bench/test/require-graph.test.ts     | 33 ++++++++++++++++
 9 files changed, 104 insertions(+), 9 deletions(-)
 create mode 100644 packages/bench/src/bench-memory-handle.ts
 create mode 100644 packages/bench/test/require-graph.test.ts

diff --git a/packages/bench/src/bench-memory-handle.ts b/packages/bench/src/bench-memory-handle.ts
new file mode 100644
index 0000000..8b95f4f
--- /dev/null
+++ b/packages/bench/src/bench-memory-handle.ts
@@ -0,0 +1,39 @@
+// Dependency-light home for the bench memory handle + the requireGraph guard.
+// Kept separate from memory-factory.ts (which pulls in heavy runtime deps like
+// the ONNX reranker) so the guard and its types stay unit-testable without
+// loading native binaries. All imports here are type-only → erased at runtime.
+import type { Memory } from '@engram-mem/core'
+import type { NeuralGraph } from '@engram-mem/graph'
+import type { RerankerBackend } from './types.js'
+
+/** What createBenchMemory wired up — exposed so graph cells can reach the graph. */
+export interface BenchMemoryConfig {
+  graph: NeuralGraph | null
+  rerankerBackend: RerankerBackend
+}
+
+export interface BenchMemoryHandle {
+  memory: Memory
+  config: BenchMemoryConfig
+  /** True iff a real bench Neo4j was wired (env present AND reachable). */
+  graphActuallyWired: boolean
+}
+
+/**
+ * Return the bench graph handle, or throw if none was really wired.
+ *
+ * Passes only when `graphActuallyWired` is true AND `config.graph` is
+ * non-null. Anything else would let a graph cell quietly run SQL-only and
+ * report that number as a graph result, so it is turned into a loud failure.
+ */
+export function requireGraph(handle: BenchMemoryHandle): NeuralGraph {
+  // Happy path first: flag and handle must both be present.
+  if (handle.graphActuallyWired && handle.config.graph) return handle.config.graph
+
+  throw new Error(
+    '[engram-bench] requireGraph: a graph cell was requested but the bench ' +
+    'Neo4j is not wired. Set ENGRAM_BENCH_NEO4J_URI + ENGRAM_BENCH_NEO4J_PASSWORD ' +
+    '(a bench-specific Neo4j, NOT the production NEO4J_URI). Refusing to report ' +
+    'a SQL-only result as a graph result.',
+  )
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index 5b25b78..7f7e8c9 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -3,7 +3,9 @@ export { LongMemEvalAdapter } from './longmemeval/adapter.js'
 export { compareLoCoMo, compareLongMemEval } from './runner/compare.js'
 export { computeRetrievalF1, recallAtK } from './metrics/f1.js'
 export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js'
-export { createBenchMemory } from './memory-factory.js'
+export { createBenchMemory, requireGraph } from './memory-factory.js'
+export type { BenchMemoryHandle, BenchMemoryConfig } from './memory-factory.js'
+export { wipeBenchGraph, tryCreateBenchGraph } from './bench-graph.js'
 export { mergeAssociationsIntoScored } from './merge-associations.js'
 export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
 export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js'
diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts
index da384d9..34798fd 100644
--- a/packages/bench/src/locomo/adapter.ts
+++ b/packages/bench/src/locomo/adapter.ts
@@ -9,6 +9,7 @@ import type { LoCoMoConversationFile, LoCoMoTurn } from './types.js'
 import { computeRetrievalF1 } from '../metrics/f1.js'
 import { createBenchMemory } from '../memory-factory.js'
 import { mergeAssociationsIntoScored } from '../merge-associations.js'
+import { wipeBenchGraph } from '../bench-graph.js'
 
 export class LoCoMoAdapter {
   async loadDataset(dataPath: string): Promise<LoCoMoConversationFile[]> {
@@ -252,7 +253,12 @@ export class LoCoMoAdapter {
     conv: LoCoMoConversationFile,
     opts?: BenchmarkOpts,
   ): Promise<{ result: LoCoMoConversationResult; ingestMs: number; evalMs: number }> {
-    const memory = await createBenchMemory(opts)
+    const { memory, config } = await createBenchMemory(opts)
+
+    // Per-conversation graph isolation: Neo4j is shared, so wipe before ingest
+    // or the previous conversation's nodes pollute this one's spreading
+    // activation (matching the per-conv fresh :memory: SQLite invariant).
+    if (config.graph) await wipeBenchGraph(config.graph)
 
     const ingestStart = Date.now()
     const { episodesIngested, sessionsCreated } = await this.ingestConversation(conv, memory, {
diff --git a/packages/bench/src/locomo/forensics/local-recall-sweep.ts b/packages/bench/src/locomo/forensics/local-recall-sweep.ts
index 3588e83..a3ad6e8 100644
--- a/packages/bench/src/locomo/forensics/local-recall-sweep.ts
+++ b/packages/bench/src/locomo/forensics/local-recall-sweep.ts
@@ -88,7 +88,7 @@ async function main(): Promise<void> {
     const convStart = Date.now()
     console.log(`[${i + 1}/${conversations.length}] ${convId} — fresh memory + ingest`)
 
-    const memory = await createBenchMemory(benchOpts)
+    const { memory } = await createBenchMemory(benchOpts)
 
     try {
       const ingestStart = Date.now()
diff --git a/packages/bench/src/locomo/judge-adapter.ts b/packages/bench/src/locomo/judge-adapter.ts
index 047ad08..8fabe9e 100644
--- a/packages/bench/src/locomo/judge-adapter.ts
+++ b/packages/bench/src/locomo/judge-adapter.ts
@@ -363,7 +363,7 @@ async function benchConversation(
   const nQs = opts.smoke ? (opts.smokeQuestions ?? 5) : qas.length
 
   console.log(`  [engram-mem] Conv ${convIdx} (${sid}): ingesting...`)
-  const memory = await createBenchMemory({
+  const { memory } = await createBenchMemory({
     graph: opts.graph ?? false,
     ...(opts.rerankerBackend ? { rerankerBackend: opts.rerankerBackend } : {}),
     ...(opts.onnxRerankerModel ? { onnxRerankerModel: opts.onnxRerankerModel } : {}),
diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts
index 9afff54..d8fb8dd 100644
--- a/packages/bench/src/longmemeval/adapter.ts
+++ b/packages/bench/src/longmemeval/adapter.ts
@@ -23,6 +23,7 @@ import type {
 import type { LongMemEvalQuestion, LongMemEvalQuestionType } from './types.js'
 import { createBenchMemory } from '../memory-factory.js'
 import { mergeAssociationsIntoScored } from '../merge-associations.js'
+import { wipeBenchGraph } from '../bench-graph.js'
 
 export class LongMemEvalAdapter {
   /**
@@ -136,10 +137,14 @@ export class LongMemEvalAdapter {
     ingestMs: number
     evalMs: number
   }> {
-    const memory = await createBenchMemory(opts)
+    const { memory, config } = await createBenchMemory(opts)
     const topK = opts?.topK ?? 10
 
     try {
+      // Per-question graph isolation: Neo4j is a shared external process (unlike
+      // the per-call fresh :memory: SQLite), so wipe it before ingest or prior
+      // questions' nodes pollute this question's spreading activation.
+      if (config.graph) await wipeBenchGraph(config.graph)
       const ingestStart = Date.now()
       const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory)
       const ingestMs = Date.now() - ingestStart
diff --git a/packages/bench/src/longmemeval/forensics/recall-sweep.ts b/packages/bench/src/longmemeval/forensics/recall-sweep.ts
index a53c506..9ba0adf 100644
--- a/packages/bench/src/longmemeval/forensics/recall-sweep.ts
+++ b/packages/bench/src/longmemeval/forensics/recall-sweep.ts
@@ -86,7 +86,7 @@ async function main(): Promise<void> {
     // BUT — runQuestion currently slices to topK before computing recall@K.
     // For the sweep we want a fuller view: retrieve max(K_VALUES) once, then
     // compute recall@K from the same list. We need a slightly different path.
-    const memory = await createBenchMemory(benchOpts)
+    const { memory } = await createBenchMemory(benchOpts)
     let episodes = 0
     let ingestMs = 0
     let evalMs = 0
diff --git a/packages/bench/src/memory-factory.ts b/packages/bench/src/memory-factory.ts
index 57ed250..bc1295e 100644
--- a/packages/bench/src/memory-factory.ts
+++ b/packages/bench/src/memory-factory.ts
@@ -1,10 +1,16 @@
 import { SqliteStorageAdapter } from '@engram-mem/sqlite'
 import { openaiIntelligence } from '@engram-mem/openai'
 import { createMemory } from '@engram-mem/core'
-import type { Memory, IntelligenceAdapter } from '@engram-mem/core'
+import type { IntelligenceAdapter } from '@engram-mem/core'
 import { createOnnxReranker, type OnnxReranker } from '@engram-mem/rerank-onnx'
 import type { BenchmarkOpts } from './types.js'
 import { tryCreateBenchGraph } from './bench-graph.js'
+import type { BenchMemoryHandle } from './bench-memory-handle.js'
+
+// Re-exported so existing importers of these from memory-factory keep working;
+// the definitions live in the dependency-light bench-memory-handle module.
+export type { BenchMemoryConfig, BenchMemoryHandle } from './bench-memory-handle.js'
+export { requireGraph } from './bench-memory-handle.js'
 
 /**
  * Create an in-memory SQLite-backed Memory instance for benchmark use.
@@ -30,7 +36,7 @@ import { tryCreateBenchGraph } from './bench-graph.js'
  *     'onnx'                                    → local mxbai-rerank ONNX model
  *     'none'                                    → rerank disabled (same as noRerank)
  */
-export async function createBenchMemory(opts?: BenchmarkOpts): Promise<Memory> {
+export async function createBenchMemory(opts?: BenchmarkOpts): Promise<BenchMemoryHandle> {
   const storage = new SqliteStorageAdapter(':memory:')
 
   const apiKey = opts?.openaiApiKey ?? process.env['OPENAI_API_KEY']
@@ -51,7 +57,11 @@ export async function createBenchMemory(opts?: BenchmarkOpts): Promise<Memory> {
   })
 
   await memory.initialize()
-  return memory
+  return {
+    memory,
+    config: { graph, rerankerBackend: backend },
+    graphActuallyWired: graph !== null,
+  }
 }
 
 function resolveBackend(opts?: BenchmarkOpts): 'openai' | 'onnx' | 'none' {
diff --git a/packages/bench/test/require-graph.test.ts b/packages/bench/test/require-graph.test.ts
new file mode 100644
index 0000000..e667908
--- /dev/null
+++ b/packages/bench/test/require-graph.test.ts
@@ -0,0 +1,33 @@
+/**
+ * Phase 0 — requireGraph hard-fail guard.
+ *
+ * A graph matrix cell that runs without a real bench Neo4j would silently fall
+ * back to SQL-only and report a SQL delta as a "graph" result — the exact
+ * measurement trap Phase 0 exists to kill. requireGraph converts that silent
+ * fallback into a loud throw. (The success path needs a live NeuralGraph and is
+ * exercised by the matrix runner against a real bench Neo4j.)
+ */
+import { describe, it, expect } from 'vitest'
+import { requireGraph, type BenchMemoryHandle } from '../src/bench-memory-handle.js'
+
+const fakeMemory = {} as BenchMemoryHandle['memory']
+
+describe('requireGraph', () => {
+  it('throws when the graph was never wired (no silent SQL-only fallback)', () => {
+    const handle: BenchMemoryHandle = {
+      memory: fakeMemory,
+      config: { graph: null, rerankerBackend: 'none' },
+      graphActuallyWired: false,
+    }
+    expect(() => requireGraph(handle)).toThrow(/Neo4j is not wired/)
+  })
+
+  it('throws defensively when graphActuallyWired is true but the handle is null', () => {
+    const handle: BenchMemoryHandle = {
+      memory: fakeMemory,
+      config: { graph: null, rerankerBackend: 'openai' },
+      graphActuallyWired: true,
+    }
+    expect(() => requireGraph(handle)).toThrow()
+  })
+})

From dd9c6c4b53d7919f03073cbf28d2831f51991e6f Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 04:53:30 +0500
Subject: [PATCH 7/9] feat(bench): recall-structure classifier + graphEffect
 metric + cat 2/3 gate filter (Phase 0, units 6-8b)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- classifyRecallStructure: deterministic label {lookup, multi_hop, temporal,
  aggregation} from dataset signals (LoCoMo category, LongMemEval ability) with
  a gold-cardinality + temporal-token heuristic fallback. GRAPH_RELEVANT =
  {multi_hop, temporal} — the split where spreading activation should help.
- computeGraphEffect: recall@K(merge ON) − recall@K(merge OFF) on the
  graph-relevant split (or the stronger graph-visible split when per-question
  graphCouldContribute is supplied). This is the scale-independent lift that
  feeds graphVerdict; an empty split returns zero effect, so with the n<100
  power gate no decision is ever fabricated.
- LoCoMo categories filter (BenchmarkOpts.categories): score only the requested
  categories (e.g. [2,3]) while ingesting the corpus whole — filters the metric,
  not the graph the recall traverses. Canonical category map locked from
  judge-adapter: 1=single_hop 2=multi_hop 3=temporal 4=open_domain 5=adversarial.

bench typecheck clean, 20/20.
---
 .../classify-recall-structure.ts              | 73 +++++++++++++++++++
 packages/bench/src/index.ts                   |  4 +
 packages/bench/src/locomo/adapter.ts          |  6 +-
 packages/bench/src/metrics/graph-effect.ts    | 56 ++++++++++++++
 packages/bench/src/types.ts                   |  9 +++
 .../test/classify-recall-structure.test.ts    | 46 ++++++++++++
 packages/bench/test/graph-effect.test.ts      | 42 +++++++++++
 7 files changed, 235 insertions(+), 1 deletion(-)
 create mode 100644 packages/bench/src/classification/classify-recall-structure.ts
 create mode 100644 packages/bench/src/metrics/graph-effect.ts
 create mode 100644 packages/bench/test/classify-recall-structure.test.ts
 create mode 100644 packages/bench/test/graph-effect.test.ts

diff --git a/packages/bench/src/classification/classify-recall-structure.ts b/packages/bench/src/classification/classify-recall-structure.ts
new file mode 100644
index 0000000..e956c82
--- /dev/null
+++ b/packages/bench/src/classification/classify-recall-structure.ts
@@ -0,0 +1,73 @@
+// Phase 0 — label a question's recall STRUCTURE so graphEffect is measured on
+// the graph-relevant split (multi_hop/temporal, where spreading activation
+// should help) instead of the saturated aggregate. Deterministic by design:
+// no LLM in the gate path, so the committed labels are reproducible.
+
+export type RecallStructure = 'lookup' | 'multi_hop' | 'temporal' | 'aggregation'
+
+export interface QuestionContext {
+  question: string
+  goldAnswer: string
+  /** Gold evidence ids: LoCoMo dia ids, or LongMemEval answer_session_ids. */
+  goldIds: string[]
+  /** LoCoMo category if known: 1=single_hop 2=multi_hop 3=temporal 4=open_domain 5=adversarial. */
+  category?: number
+  /** LongMemEval ability if known: temporal_reasoning, multi_session_reasoning, ... */
+  ability?: string
+}
+
+export interface RecallStructureLabel {
+  type: RecallStructure
+  confidence: number
+  reasoning: string
+}
+
+/** The structures where graph spreading activation is expected to add lift. */
+export const GRAPH_RELEVANT: ReadonlySet<RecallStructure> = new Set(['multi_hop', 'temporal'])
+
+// Low-confidence fallback signal only (used when no category/ability rule matched).
+// Years, month names as whole words (a bare 3-letter prefix would also hit "decided", "market", "novel"), and ordering/relative-time words.
+const TEMPORAL_RE =
+  /\b(19|20)\d{2}\b|\b(jan(uary)?|feb(ruary)?|mar(ch)?|apr(il)?|may|june?|july?|aug(ust)?|sep(t(ember)?)?|oct(ober)?|nov(ember)?|dec(ember)?)\b|\b(yesterday|today|tomorrow|week|month|year|date|when|before|after|since|until|earlier|later|ago|first|last|recent)\b/i
+
+/**
+ * Classify a question's recall structure. Authoritative dataset signals win:
+ * LoCoMo `category` first, then LongMemEval `ability` (matched case-insensitively,
+ * with hyphens/whitespace folded to `_`, so `knowledge-update` == `knowledge_update`).
+ * An unrecognised category or ability falls through to the next rule; the last
+ * resort is structural heuristics (gold cardinality + a temporal-token scan).
+ */
+export function classifyRecallStructure(ctx: QuestionContext): RecallStructureLabel {
+  // 1. LoCoMo category — authoritative.
+  if (ctx.category != null) {
+    switch (ctx.category) {
+      case 2: return { type: 'multi_hop', confidence: 0.9, reasoning: 'LoCoMo category 2 (multi_hop)' }
+      case 3: return { type: 'temporal', confidence: 0.9, reasoning: 'LoCoMo category 3 (temporal)' }
+      case 1: return { type: 'lookup', confidence: 0.9, reasoning: 'LoCoMo category 1 (single_hop)' }
+      case 4: return { type: 'lookup', confidence: 0.7, reasoning: 'LoCoMo category 4 (open_domain) -> lookup' }
+      case 5: return { type: 'lookup', confidence: 0.6, reasoning: 'LoCoMo category 5 (adversarial) -> lookup' }
+    }
+  }
+
+  // 2. LongMemEval ability — authoritative; separators are normalised so hyphenated labels match every rule.
+  if (ctx.ability) {
+    const a = ctx.ability.toLowerCase().replace(/[\s-]+/g, '_')
+    if (a.includes('temporal')) return { type: 'temporal', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('multi_session')) return { type: 'multi_hop', confidence: 0.85, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('knowledge_update')) return { type: 'multi_hop', confidence: 0.7, reasoning: `ability=${ctx.ability} (updates link sessions)` }
+    if (a.includes('information_extraction')) return { type: 'lookup', confidence: 0.8, reasoning: `ability=${ctx.ability}` }
+    if (a.includes('abstention')) return { type: 'lookup', confidence: 0.7, reasoning: `ability=${ctx.ability}` }
+  }
+
+  // 3. Heuristic fallback: cardinality >= 3 wins over the temporal scan, which wins over cardinality 2.
+  const text = `${ctx.question} ${ctx.goldAnswer}`
+  if (ctx.goldIds.length >= 3) {
+    return { type: 'aggregation', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> synthesis` }
+  }
+  if (TEMPORAL_RE.test(text)) {
+    return { type: 'temporal', confidence: 0.55, reasoning: 'temporal token in question/answer' }
+  }
+  if (ctx.goldIds.length >= 2) {
+    return { type: 'multi_hop', confidence: 0.6, reasoning: `${ctx.goldIds.length} gold ids -> cross-session` }
+  }
+  return { type: 'lookup', confidence: 0.5, reasoning: 'single gold id, no temporal signal' }
+}
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index 7f7e8c9..9c12a76 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -10,6 +10,10 @@ export { mergeAssociationsIntoScored } from './merge-associations.js'
 export type { BenchRecallResult, BenchScoredMemory } from './merge-associations.js'
 export { graphVerdict, MIN_POWER_N, DEFAULT_EPSILON } from './metrics/graph-verdict.js'
 export type { GraphVerdict, GraphVerdictInput } from './metrics/graph-verdict.js'
+export { classifyRecallStructure, GRAPH_RELEVANT } from './classification/classify-recall-structure.js'
+export type { RecallStructure, QuestionContext, RecallStructureLabel } from './classification/classify-recall-structure.js'
+export { computeGraphEffect } from './metrics/graph-effect.js'
+export type { QuestionOutcome, GraphEffectResult } from './metrics/graph-effect.js'
 export type {
   BenchmarkOpts, BenchmarkMetrics,
   LoCoMoCategory, LoCoMoQAPrediction, LoCoMoCategoryMetrics,
diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts
index 34798fd..38a8728 100644
--- a/packages/bench/src/locomo/adapter.ts
+++ b/packages/bench/src/locomo/adapter.ts
@@ -186,7 +186,7 @@ export class LoCoMoAdapter {
   async evaluateDataset(
     conversations: LoCoMoConversationFile[],
     memory: Memory,
-    opts?: Pick<BenchmarkOpts, 'topK' | 'mergeAssociationsIntoTopK'>,
+    opts?: Pick<BenchmarkOpts, 'topK' | 'mergeAssociationsIntoTopK' | 'categories'>,
   ): Promise<LoCoMoConversationResult[]> {
     const topK = opts?.topK ?? 10
     const convResults: LoCoMoConversationResult[] = []
@@ -196,6 +196,10 @@ export class LoCoMoAdapter {
       const qaPredictions: LoCoMoQAPrediction[] = []
 
       for (const qa of conv.qa) {
+        // Gate-corpus filter: score only the requested categories (e.g. [2,3]
+        // multi-hop/temporal). The conversation was already ingested whole, so
+        // the graph the recall traverses is unaffected — only scoring narrows.
+        if (opts?.categories && !opts.categories.includes(qa.category)) continue
         const recallResult = await memory.recall(qa.question)
         const topMemories = mergeAssociationsIntoScored(
           recallResult, opts?.mergeAssociationsIntoTopK,
diff --git a/packages/bench/src/metrics/graph-effect.ts b/packages/bench/src/metrics/graph-effect.ts
new file mode 100644
index 0000000..494538e
--- /dev/null
+++ b/packages/bench/src/metrics/graph-effect.ts
@@ -0,0 +1,56 @@
+// Phase 0 — graphEffect: the scale-independent recall@K lift the graph buys,
+// measured on the split where the graph could actually help. Feeds graphVerdict.
+import { GRAPH_RELEVANT, type RecallStructure } from '../classification/classify-recall-structure.js'
+
+export interface QuestionOutcome {
+  id: string
+  /** recall@K with the graph channel merged OUT (memories only). */
+  recallAtKMergeOff: boolean
+  /** recall@K with the graph channel merged IN (memories + associations). */
+  recallAtKMergeOn: boolean
+  /** Structural label (from classifyRecallStructure). */
+  structure: RecallStructure
+  /**
+   * Optional, stronger split signal: did the graph channel surface the gold id
+   * at all (in either cell)? If ANY outcome sets it (true or false), the split keeps only
+   * outcomes where it is exactly `true` — graph-VISIBLE questions (a question the graph never
+   * touched cannot move and only dilutes n); `false`/unset outcomes are dropped. If NO outcome
+   * sets it, the split falls back to the graph-RELEVANT structural label (multi_hop/temporal).
+   */
+  graphCouldContribute?: boolean
+}
+
+export interface GraphEffectResult {
+  /** mergeOnRecall − mergeOffRecall on the split. */
+  graphEffect: number
+  /** Size of the split — the n the power gate (>=100) checks. */
+  graphVisibleN: number
+  mergeOnRecall: number
+  mergeOffRecall: number
+  splitDefinition: 'graph-relevant' | 'graph-visible'
+}
+
+/**
+ * Recall@K lift of merge-ON over merge-OFF, measured on one split of `outcomes`.
+ * If any outcome defines `graphCouldContribute`, the split is the outcomes where it
+ * is `true` ('graph-visible'); otherwise it is the outcomes whose `structure` is in
+ * GRAPH_RELEVANT ('graph-relevant').
+ */
+export function computeGraphEffect(outcomes: QuestionOutcome[]): GraphEffectResult {
+  const visibleMode = outcomes.some((item) => item.graphCouldContribute !== undefined)
+  const inSplit = (item: QuestionOutcome): boolean =>
+    visibleMode ? item.graphCouldContribute === true : GRAPH_RELEVANT.has(item.structure)
+  const members = outcomes.filter(inSplit)
+  const size = members.length
+  // Fraction of the split that hit; an empty split reports 0 instead of dividing by zero.
+  const rate = (hits: number): number => (size === 0 ? 0 : hits / size)
+  const mergeOnRecall = rate(members.filter((item) => item.recallAtKMergeOn).length)
+  const mergeOffRecall = rate(members.filter((item) => item.recallAtKMergeOff).length)
+  return {
+    graphEffect: mergeOnRecall - mergeOffRecall,
+    graphVisibleN: size,
+    mergeOnRecall,
+    mergeOffRecall,
+    splitDefinition: visibleMode ? 'graph-visible' : 'graph-relevant',
+  }
+}
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
index 02bea19..2e4b84a 100644
--- a/packages/bench/src/types.ts
+++ b/packages/bench/src/types.ts
@@ -16,6 +16,15 @@ export interface BenchmarkOpts {
    * safe. This is what makes graph:true vs graph:false able to move the metric.
    */
   mergeAssociationsIntoTopK?: boolean
+  /**
+   * Phase 0: restrict LoCoMo SCORING to these QA categories
+   * (1=single_hop, 2=multi_hop, 3=temporal, 4=open_domain, 5=adversarial).
+   * The corpus is still ingested WHOLE — only the metric is filtered — so
+   * spreading activation keeps the full graph to traverse. Use [2,3]
+   * (multi-hop + temporal) for the non-saturated graph-relevant gate corpus.
+   * Undefined = score every category (current behaviour).
+   */
+  categories?: number[]
   /**
    * Cross-encoder backend. 'openai' (default) uses LLM pointwise scoring via
    * gpt-4o-mini; 'onnx' uses a local mxbai-rerank ONNX model (no API cost,
diff --git a/packages/bench/test/classify-recall-structure.test.ts b/packages/bench/test/classify-recall-structure.test.ts
new file mode 100644
index 0000000..70ef033
--- /dev/null
+++ b/packages/bench/test/classify-recall-structure.test.ts
@@ -0,0 +1,46 @@
+/**
+ * Phase 0 — recall-structure classifier (deterministic, no LLM).
+ * Locks the dataset-signal precedence and the graph-relevant set.
+ */
+import { describe, it, expect } from 'vitest'
+import {
+  classifyRecallStructure,
+  GRAPH_RELEVANT,
+  type QuestionContext,
+} from '../src/classification/classify-recall-structure.js'
+
+const base: QuestionContext = { question: 'q', goldAnswer: 'a', goldIds: ['x'] }
+
+describe('classifyRecallStructure', () => {
+  it('maps LoCoMo categories authoritatively', () => {
+    expect(classifyRecallStructure({ ...base, category: 2 }).type).toBe('multi_hop')
+    expect(classifyRecallStructure({ ...base, category: 3 }).type).toBe('temporal')
+    expect(classifyRecallStructure({ ...base, category: 1 }).type).toBe('lookup')
+    expect(classifyRecallStructure({ ...base, category: 4 }).type).toBe('lookup')
+    expect(classifyRecallStructure({ ...base, category: 5 }).type).toBe('lookup')
+  })
+
+  it('maps LongMemEval abilities authoritatively', () => {
+    expect(classifyRecallStructure({ ...base, ability: 'temporal_reasoning' }).type).toBe('temporal')
+    expect(classifyRecallStructure({ ...base, ability: 'multi_session_reasoning' }).type).toBe('multi_hop')
+    expect(classifyRecallStructure({ ...base, ability: 'knowledge_updates' }).type).toBe('multi_hop')
+    expect(classifyRecallStructure({ ...base, ability: 'information_extraction' }).type).toBe('lookup')
+    expect(classifyRecallStructure({ ...base, ability: 'abstention' }).type).toBe('lookup')
+  })
+
+  it('category wins over ability and heuristics', () => {
+    const label = classifyRecallStructure({ ...base, category: 2, ability: 'temporal_reasoning', goldIds: ['1', '2', '3', '4'] })
+    expect(label.type).toBe('multi_hop')
+  })
+
+  it('falls back to cardinality + temporal heuristics when no signal', () => {
+    expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: ['1', '2', '3', '4'] }).type).toBe('aggregation')
+    expect(classifyRecallStructure({ question: 'when did X move', goldAnswer: 'in 2021', goldIds: ['1'] }).type).toBe('temporal')
+    expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: ['1', '2'] }).type).toBe('multi_hop')
+    expect(classifyRecallStructure({ question: 'q', goldAnswer: 'a', goldIds: ['1'] }).type).toBe('lookup')
+  })
+
+  it('GRAPH_RELEVANT is exactly {multi_hop, temporal}', () => {
+    expect([...GRAPH_RELEVANT].sort()).toEqual(['multi_hop', 'temporal'])
+  })
+})
diff --git a/packages/bench/test/graph-effect.test.ts b/packages/bench/test/graph-effect.test.ts
new file mode 100644
index 0000000..b0baa2b
--- /dev/null
+++ b/packages/bench/test/graph-effect.test.ts
@@ -0,0 +1,42 @@
+/**
+ * Phase 0 — graphEffect metric. Deterministic; pins the split selection and the
+ * recall@K lift computation that feeds the symmetric kill criterion.
+ */
+import { describe, it, expect } from 'vitest'
+import { computeGraphEffect, type QuestionOutcome } from '../src/metrics/graph-effect.js'
+
+describe('computeGraphEffect', () => {
+  it('measures merge-on minus merge-off recall on the graph-relevant split', () => {
+    const outcomes: QuestionOutcome[] = [
+      { id: '1', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'multi_hop' }, // rescued
+      { id: '2', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'temporal' }, // unchanged
+      { id: '3', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'lookup' }, // excluded
+    ]
+    const r = computeGraphEffect(outcomes)
+    expect(r.splitDefinition).toBe('graph-relevant')
+    expect(r.graphVisibleN).toBe(2) // multi_hop + temporal only
+    expect(r.mergeOffRecall).toBe(0.5)
+    expect(r.mergeOnRecall).toBe(1.0)
+    expect(r.graphEffect).toBe(0.5)
+  })
+
+  it('uses the graph-visible split when graphCouldContribute is present', () => {
+    const outcomes: QuestionOutcome[] = [
+      { id: '1', recallAtKMergeOff: false, recallAtKMergeOn: true, structure: 'lookup', graphCouldContribute: true },
+      { id: '2', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'multi_hop', graphCouldContribute: false },
+    ]
+    const r = computeGraphEffect(outcomes)
+    expect(r.splitDefinition).toBe('graph-visible')
+    expect(r.graphVisibleN).toBe(1)
+    expect(r.graphEffect).toBe(1.0)
+  })
+
+  it('returns zero effect and n=0 on an empty split (never fabricates a decision)', () => {
+    const r = computeGraphEffect([
+      { id: '1', recallAtKMergeOff: true, recallAtKMergeOn: true, structure: 'lookup' },
+    ])
+    expect(r.graphVisibleN).toBe(0)
+    expect(r.graphEffect).toBe(0)
+    expect(r.mergeOnRecall).toBe(0)
+  })
+})

From 037ef728b7c7d43898feb9273fd301c4cb32c12c Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 05:11:48 +0500
Subject: [PATCH 8/9] feat(bench): 4-cell {graph}x{rerank} ablation matrix
 runner (Phase 0, unit 5)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Completes the Phase 0 measurement harness. compareMatrix runs the 4 cells, each
graph-on cell with mergeAssociationsIntoTopK so the graph channel is visible to
recall@K, and computes graphEffect as the recall@K lift on the graph-relevant
split by pairing each graph-on cell's per-question outcomes against its
same-rerank graph-off sibling (one recall per question — no double-scoring).

- requireGraph hard-fails before any graph cell runs when no bench Neo4j is
  wired, so a SQL-only fallback can never be reported as a graph result.
- extract{LongMemEval,LoCoMo}Outcomes live in a dependency-light matrix-outcomes
  module (no adapter/onnx import) so the pairing + classification stays
  unit-testable without native binaries; chained through computeGraphEffect.
- BaselineProvenance: git HEAD + corpus sha256 + flags + Neo4j-gate-state,
  written to results/gates/graph-eval-baseline.json (gitignore switched to
  results/* + !results/gates/ so the baseline can be committed).
- CLI: --matrix, --require-graph, --categories 2,3.

Pure orchestration unit-tested; the adapter-running wrapper + the live baseline
are validated against the bench runtime on the server. bench typecheck clean, 23/23.
---
 .gitignore                                   |  5 +-
 packages/bench/bin/engram-bench.ts           | 64 ++++++++++++++
 packages/bench/src/index.ts                  |  3 +
 packages/bench/src/runner/compare-matrix.ts  | 93 ++++++++++++++++++++
 packages/bench/src/runner/matrix-outcomes.ts | 61 +++++++++++++
 packages/bench/src/types.ts                  | 29 ++++++
 packages/bench/test/matrix-outcomes.test.ts  | 69 +++++++++++++++
 7 files changed, 323 insertions(+), 1 deletion(-)
 create mode 100644 packages/bench/src/runner/compare-matrix.ts
 create mode 100644 packages/bench/src/runner/matrix-outcomes.ts
 create mode 100644 packages/bench/test/matrix-outcomes.test.ts

diff --git a/.gitignore b/.gitignore
index 2a7ccdc..8974fc4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,7 +19,10 @@ supabase/.temp/
 .understand-anything/
 
 # Bench / forensics output (locally generated, not committed)
-results/
+results/*
+# …except committed Phase 0 gate baselines (results/gates/graph-eval-baseline.json).
+# Uses results/* (not results/) so this negation can re-include the subdir.
+!results/gates/
 data/
 
 # Build artifacts (prevent leaking compiled files into src/)
diff --git a/packages/bench/bin/engram-bench.ts b/packages/bench/bin/engram-bench.ts
index 0aff14c..07cbc6c 100644
--- a/packages/bench/bin/engram-bench.ts
+++ b/packages/bench/bin/engram-bench.ts
@@ -10,9 +10,12 @@
 
 import * as fs from 'node:fs/promises'
 import * as path from 'node:path'
+import { createHash } from 'node:crypto'
+import { execFileSync } from 'node:child_process'
 import { LoCoMoAdapter } from '../src/locomo/adapter.js'
 import { LongMemEvalAdapter } from '../src/longmemeval/adapter.js'
 import { compareLoCoMo, compareLongMemEval } from '../src/runner/compare.js'
+import { compareMatrix } from '../src/runner/compare-matrix.js'
 import { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from '../src/metrics/table.js'
 import type { BenchmarkOpts } from '../src/types.js'
 
@@ -27,6 +30,8 @@ function parseArgs(argv: string[]) {
     if (arg === '--consolidate') { args['consolidate'] = true; continue }
     if (arg === '--graph') { args['graph'] = true; continue }
     if (arg === '--compare') { args['compare'] = true; continue }
+    if (arg === '--matrix') { args['matrix'] = true; continue }
+    if (arg === '--require-graph') { args['requireGraph'] = true; continue }
     if (arg === '--verbose') { args['verbose'] = true; continue }
     if (arg.startsWith('--')) {
       const key = arg.slice(2)
@@ -51,6 +56,11 @@ function parseArgs(argv: string[]) {
     consolidate: args['consolidate'] !== false,
     graph: args['graph'] !== false,
     compare: args['compare'] === true,
+    matrix: args['matrix'] === true,
+    requireGraph: args['requireGraph'] === true,
+    categories: typeof args['categories'] === 'string'
+      ? (args['categories'] as string).split(',').map((s) => parseInt(s.trim(), 10)).filter((n) => !Number.isNaN(n))
+      : undefined,
     topK: parseInt(args['top-k'] as string ?? '10', 10) || 10,
     limit: parseInt(args['limit'] as string ?? '0', 10) || 0,
     noRerank: args['noRerank'] === true,
@@ -58,6 +68,25 @@ function parseArgs(argv: string[]) {
   }
 }
 
+/** Corpus fingerprint: sha256 over a file's bytes, or over a directory's sorted `name:size` lines; 'unknown' on any error. */
+async function hashCorpus(p: string): Promise<string> {
+  try {
+    const info = await fs.stat(p)
+    const digest = createHash('sha256')
+    if (!info.isDirectory()) {
+      digest.update(await fs.readFile(p))
+      return digest.digest('hex')
+    }
+    for (const name of (await fs.readdir(p)).sort()) {
+      const { size } = await fs.stat(path.join(p, name))
+      digest.update(`${name}:${size}\n`)
+    }
+    return digest.digest('hex')
+  } catch {
+    return 'unknown'
+  }
+}
+
 async function main() {
   const args = parseArgs(process.argv.slice(2))
 
@@ -77,6 +106,41 @@ async function main() {
   console.log(`Consolidation: ${args.consolidate ? 'ON' : 'OFF'}`)
   console.log('')
 
+  if (args.matrix) {
+    console.log('Running 4-cell {graph}x{rerank} ablation matrix...')
+    if (args.requireGraph) console.log('requireGraph: ON (a graph cell without a bench Neo4j will hard-fail)')
+    let commit = 'unknown'
+    try { commit = execFileSync('git', ['rev-parse', 'HEAD']).toString().trim() } catch { /* not a git checkout */ }
+    const corpusSha256 = await hashCorpus(args.dataPath)
+
+    const result = await compareMatrix(
+      args.benchmark as 'locomo' | 'longmemeval',
+      args.dataPath,
+      {
+        consolidate: args.consolidate,
+        topK: args.topK,
+        limit: args.limit > 0 ? args.limit : undefined,
+        ...(args.categories ? { categories: args.categories } : {}),
+      },
+      { requireGraph: args.requireGraph, commit, corpusSha256 },
+    )
+
+    for (const cell of result.cells) {
+      console.log(
+        `  graph=${cell.graph ? 'ON ' : 'OFF'} rerank=${cell.rerank ? 'ON ' : 'OFF'}` +
+        `  graphEffect=${cell.graphEffect.toFixed(4)} (n=${cell.graphVisibleN})`,
+      )
+    }
+
+    const gatesDir = path.resolve('./results/gates')
+    await fs.mkdir(gatesDir, { recursive: true })
+    const outFile = path.join(gatesDir, 'graph-eval-baseline.json')
+    await fs.writeFile(outFile, JSON.stringify(result, null, 2), 'utf8')
+    console.log(`\nMatrix baseline written to: ${outFile}`)
+    console.log(`Provenance: commit=${commit.slice(0, 8)} corpus=${corpusSha256.slice(0, 12)} gate=${result.provenance.neo4jGateState}`)
+    return
+  }
+
   if (args.compare) {
     console.log('Running comparison mode...')
     let comparisonResult
diff --git a/packages/bench/src/index.ts b/packages/bench/src/index.ts
index 9c12a76..dab2146 100644
--- a/packages/bench/src/index.ts
+++ b/packages/bench/src/index.ts
@@ -1,6 +1,9 @@
 export { LoCoMoAdapter } from './locomo/adapter.js'
 export { LongMemEvalAdapter } from './longmemeval/adapter.js'
 export { compareLoCoMo, compareLongMemEval } from './runner/compare.js'
+export { compareMatrix } from './runner/compare-matrix.js'
+export { extractLoCoMoOutcomes, extractLongMemEvalOutcomes } from './runner/matrix-outcomes.js'
+export type { ComparisonMatrixResult, MatrixCell, BaselineProvenance } from './types.js'
 export { computeRetrievalF1, recallAtK } from './metrics/f1.js'
 export { formatLoCoMoTable, formatLongMemEvalTable, formatComparisonTable } from './metrics/table.js'
 export { createBenchMemory, requireGraph } from './memory-factory.js'
diff --git a/packages/bench/src/runner/compare-matrix.ts b/packages/bench/src/runner/compare-matrix.ts
new file mode 100644
index 0000000..937c645
--- /dev/null
+++ b/packages/bench/src/runner/compare-matrix.ts
@@ -0,0 +1,93 @@
+// Phase 0 — 4-cell {graph}×{rerank} ablation matrix. The graph-on cell of each
+// rerank row runs with mergeAssociationsIntoTopK; graphEffect is the recall@K
+// lift on the graph-relevant split, computed by pairing that cell's per-question
+// outcomes against its same-rerank graph-off sibling. requireGraph hard-fails a
+// graph cell that has no real bench Neo4j, so a SQL-only fallback can never be
+// reported as a graph result.
+import { LoCoMoAdapter } from '../locomo/adapter.js'
+import { LongMemEvalAdapter } from '../longmemeval/adapter.js'
+import { createBenchMemory } from '../memory-factory.js'
+import { requireGraph } from '../bench-memory-handle.js'
+import { computeGraphEffect } from '../metrics/graph-effect.js'
+import { extractLoCoMoOutcomes, extractLongMemEvalOutcomes } from './matrix-outcomes.js'
+import type {
+  BenchmarkOpts,
+  ComparisonMatrixResult,
+  MatrixCell,
+  BaselineProvenance,
+  LoCoMoResult,
+  LongMemEvalResult,
+} from '../types.js'
+
+type MatrixOpts = Omit<BenchmarkOpts, 'graph' | 'noRerank' | 'mergeAssociationsIntoTopK'> // compareMatrix sets graph/merge per cell and toggles rerank via rerankerBackend
+
+export interface MatrixHooks { // caller-supplied switch + provenance values consumed by compareMatrix
+  /** Hard-fail (throw) before any cell runs if a graph cell would run without a real bench Neo4j. */
+  requireGraph?: boolean
+  /** Provenance: git rev-parse HEAD (computed by the caller — keeps this pure of child_process). Recorded as 'unknown' when omitted. */
+  commit?: string
+  /** Provenance: sha256 of the corpus file(s). Recorded as 'unknown' when omitted. */
+  corpusSha256?: string
+  /** Provenance timestamp (ISO). Defaults to new Date().toISOString() when omitted. */
+  timestamp?: string
+}
+
+/**
+ * Run the 4-cell matrix. Graph-on cells set mergeAssociationsIntoTopK so the graph channel
+ * is visible to recall@K; rerank-off cells use rerankerBackend 'none', rerank-on cells use
+ * opts.rerankerBackend ('openai' if unset or 'none', so a rerank=true cell always reranks).
+ * Returns each cell plus graphEffect on the graph-relevant split and full provenance.
+ */
+export async function compareMatrix(
+  benchmark: 'locomo' | 'longmemeval',
+  dataPath: string,
+  opts: MatrixOpts = {},
+  hooks: MatrixHooks = {},
+): Promise<ComparisonMatrixResult> {
+  // Hard-fail BEFORE any work if a graph run is requested without a bench Neo4j.
+  if (hooks.requireGraph) {
+    const probe = await createBenchMemory({ ...opts, graph: true })
+    try {
+      requireGraph(probe)
+    } finally {
+      await probe.memory.dispose().catch(() => { /* probe cleanup non-fatal */ })
+    }
+  }
+
+  const rerankOnBackend = opts.rerankerBackend === 'none' ? 'openai' : (opts.rerankerBackend ?? 'openai')
+  const runCell = (graph: boolean, rerank: boolean): Promise<LoCoMoResult | LongMemEvalResult> => {
+    const cellOpts: BenchmarkOpts = {
+      ...opts,
+      graph,
+      mergeAssociationsIntoTopK: graph,
+      rerankerBackend: rerank ? rerankOnBackend : 'none', // the ON backend is never 'none'
+    }
+    return benchmark === 'locomo'
+      ? new LoCoMoAdapter().run(dataPath, cellOpts)
+      : new LongMemEvalAdapter().run(dataPath, cellOpts)
+  }
+
+  const cells: MatrixCell[] = []
+  for (const rerank of [true, false]) {
+    const off = await runCell(false, rerank)
+    const on = await runCell(true, rerank)
+    const outcomes = benchmark === 'locomo'
+      ? extractLoCoMoOutcomes(on as LoCoMoResult, off as LoCoMoResult)
+      : extractLongMemEvalOutcomes(on as LongMemEvalResult, off as LongMemEvalResult)
+    const effect = computeGraphEffect(outcomes)
+    cells.push({ graph: false, rerank, result: off, graphEffect: 0, graphVisibleN: 0 })
+    cells.push({ graph: true, rerank, result: on, graphEffect: effect.graphEffect, graphVisibleN: effect.graphVisibleN })
+  }
+
+  const provenance: BaselineProvenance = {
+    flags: { ...opts, requireGraph: hooks.requireGraph ?? false },
+    corpusPath: dataPath,
+    corpusSha256: hooks.corpusSha256 ?? 'unknown',
+    commit: hooks.commit ?? 'unknown',
+    neo4jGateState: 'forgotten-gate-on',
+    mergeAssociationsIntoTopK: true,
+    timestamp: hooks.timestamp ?? new Date().toISOString(),
+  }
+
+  return { benchmark, cells, provenance }
+}
diff --git a/packages/bench/src/runner/matrix-outcomes.ts b/packages/bench/src/runner/matrix-outcomes.ts
new file mode 100644
index 0000000..ee2e111
--- /dev/null
+++ b/packages/bench/src/runner/matrix-outcomes.ts
@@ -0,0 +1,61 @@
+// Dependency-light outcome extraction for the ablation matrix. Kept separate
+// from compare-matrix.ts (which imports the onnx-heavy adapters) so the pairing
+// + classification logic stays unit-testable without native binaries. Pure:
+// imports only the classifier and types.
+import { classifyRecallStructure } from '../classification/classify-recall-structure.js'
+import type { QuestionOutcome } from '../metrics/graph-effect.js'
+import type { LoCoMoResult, LongMemEvalResult } from '../types.js'
+
+/**
+ * Pair LongMemEval predictions (graph-on vs graph-off) into classified outcomes.
+ * Predictions are matched by questionId; a graph-on question with no graph-off
+ * counterpart is skipped. recallAt5 of each run supplies the merge-off/merge-on
+ * value, and the structure comes from classifyRecallStructure(...).type.
+ */
+export function extractLongMemEvalOutcomes(
+  on: LongMemEvalResult,
+  off: LongMemEvalResult,
+): QuestionOutcome[] {
+  const baselineById = new Map(off.predictions.map((pred) => [pred.questionId, pred]))
+  return on.predictions.flatMap((hit): QuestionOutcome[] => {
+    const baseline = baselineById.get(hit.questionId)
+    if (!baseline) return []
+    const { type: structure } = classifyRecallStructure({
+      question: hit.question,
+      goldAnswer: hit.goldAnswer,
+      goldIds: hit.goldSessionIds,
+      ability: hit.ability,
+    })
+    return [
+      { id: hit.questionId, recallAtKMergeOff: baseline.recallAt5, recallAtKMergeOn: hit.recallAt5, structure },
+    ]
+  })
+}
+
+/**
+ * Pair LoCoMo QA predictions (graph-on vs graph-off) into classified outcomes.
+ * QAs are matched by qaId across every conversation; a graph-on QA with no
+ * graph-off counterpart is skipped. Each run's own recallAtK supplies the
+ * merge-off/merge-on value; classification uses question, gold answer, category.
+ */
+export function extractLoCoMoOutcomes(on: LoCoMoResult, off: LoCoMoResult): QuestionOutcome[] {
+  const baselineByQa = new Map<string, { recallAtK: boolean }>()
+  off.conversations
+    .flatMap((conv) => conv.qaPredictions)
+    .forEach((qa) => baselineByQa.set(qa.qaId, qa))
+  return on.conversations
+    .flatMap((conv) => conv.qaPredictions)
+    .flatMap((qa): QuestionOutcome[] => {
+      const baseline = baselineByQa.get(qa.qaId)
+      if (!baseline) return []
+      const { type: structure } = classifyRecallStructure({
+        question: qa.question,
+        goldAnswer: qa.goldAnswer,
+        goldIds: [],
+        category: qa.category,
+      })
+      return [
+        { id: qa.qaId, recallAtKMergeOff: baseline.recallAtK, recallAtKMergeOn: qa.recallAtK, structure },
+      ]
+    })
+}
diff --git a/packages/bench/src/types.ts b/packages/bench/src/types.ts
index 2e4b84a..b7209c4 100644
--- a/packages/bench/src/types.ts
+++ b/packages/bench/src/types.ts
@@ -154,3 +154,32 @@ export interface ComparisonDelta {
   evalTimeDeltaMs: number
   tokensDelta: number
 }
+
+// Phase 0 — 4-cell {graph}×{rerank} ablation matrix.
+export interface MatrixCell { // one {graph, rerank} configuration of the ablation matrix plus its result
+  graph: boolean // whether this cell ran with the graph enabled
+  rerank: boolean // whether this cell belongs to the rerank-on row
+  result: LoCoMoResult | LongMemEvalResult // the adapter's result for this cell
+  /** recall@K lift on the graph-relevant split vs the same-rerank graph-off cell. 0 for graph-off cells. */
+  graphEffect: number
+  /** Size of the split graphEffect was computed over (the power gate checks >=100). 0 for graph-off cells. */
+  graphVisibleN: number
+}
+
+export interface BaselineProvenance { // records how a matrix baseline was produced
+  flags: Record<string, unknown> // options the run was invoked with (compareMatrix stores opts + requireGraph)
+  corpusPath: string // path of the benchmark data file that was run
+  corpusSha256: string // hash of the corpus; compareMatrix writes 'unknown' when the caller supplies none
+  /** git rev-parse HEAD at run time. */
+  commit: string
+  /** Whether the Neo4j forgotten/valid_until gates were active during the run. */
+  neo4jGateState: string
+  mergeAssociationsIntoTopK: boolean // whether graph associations were merged into the top-K
+  timestamp: string // ISO timestamp of the run
+}
+
+export interface ComparisonMatrixResult { // full output of one matrix run
+  benchmark: 'locomo' | 'longmemeval' // which benchmark every cell ran
+  cells: MatrixCell[] // one entry per {graph, rerank} combination
+  provenance: BaselineProvenance // how and from what inputs the cells were produced
+}
diff --git a/packages/bench/test/matrix-outcomes.test.ts b/packages/bench/test/matrix-outcomes.test.ts
new file mode 100644
index 0000000..34d3a33
--- /dev/null
+++ b/packages/bench/test/matrix-outcomes.test.ts
@@ -0,0 +1,69 @@
+/**
+ * Phase 0 — matrix outcome extraction. Pure: pairs a graph-on result's
+ * per-question predictions against its graph-off sibling, classifies each, and
+ * (chained with computeGraphEffect) yields the graphEffect the matrix reports.
+ * No adapters, no Neo4j, no onnx.
+ */
+import { describe, it, expect } from 'vitest'
+import {
+  extractLongMemEvalOutcomes,
+  extractLoCoMoOutcomes,
+} from '../src/runner/matrix-outcomes.js'
+import { computeGraphEffect } from '../src/metrics/graph-effect.js'
+import type { LongMemEvalResult, LoCoMoResult } from '../src/types.js'
+
+function lmePred(id: string, recallAt5: boolean, ability: string) {
+  // Minimal LongMemEval prediction fixture: fixed text fields, recallAt10 mirrors recallAt5.
+  const fixedFields = { questionId: id, question: 'q', goldAnswer: 'a', goldSessionIds: ['s'], prediction: '' }
+  const recallFields = { recalledSessionIds: [], recallAt5, recallAt10: recallAt5 }
+  return { ...fixedFields, ...recallFields, ability }
+}
+const lme = (preds: ReturnType<typeof lmePred>[]) => // wraps fixtures as a result; only `predictions` is populated
+  ({ predictions: preds } as unknown as LongMemEvalResult)
+
+function locomoQa(id: string, recallAtK: boolean, category: number) { // minimal LoCoMo QA prediction fixture
+  return { qaId: id, question: 'q', goldAnswer: 'a', prediction: '', retrievalF1: 0, recallAtK, category }
+}
+const locomo = (convs: ReturnType<typeof locomoQa>[][]) => // one inner array per conversation; only qaPredictions is populated
+  ({ conversations: convs.map((qaPredictions) => ({ qaPredictions })) } as unknown as LoCoMoResult)
+
+describe('extractLongMemEvalOutcomes', () => { // pairing + classification of LongMemEval predictions
+  it('pairs by question id, classifies by ability, and feeds graphEffect', () => {
+    const on = lme([lmePred('q1', true, 'multi_session_reasoning'), lmePred('q2', true, 'information_extraction')])
+    const off = lme([lmePred('q1', false, 'multi_session_reasoning'), lmePred('q2', true, 'information_extraction')])
+
+    const outcomes = extractLongMemEvalOutcomes(on, off)
+    expect(outcomes).toHaveLength(2) // q1 and q2 both appear in the on and off runs
+    const q1 = outcomes.find((o) => o.id === 'q1')!
+    expect(q1.structure).toBe('multi_hop') // multi_session_reasoning
+    expect(q1.recallAtKMergeOff).toBe(false) // recallAt5 of the graph-off prediction
+    expect(q1.recallAtKMergeOn).toBe(true) // recallAt5 of the graph-on prediction
+
+    // graph rescued q1 (multi_hop, graph-relevant); q2 (lookup) is excluded.
+    const effect = computeGraphEffect(outcomes)
+    expect(effect.graphVisibleN).toBe(1)
+    expect(effect.graphEffect).toBe(1.0)
+  })
+
+  it('drops questions missing from the graph-off cell', () => {
+    expect(extractLongMemEvalOutcomes(lme([lmePred('q1', true, 'temporal_reasoning')]), lme([]))).toHaveLength(0) // empty off run: nothing to pair q1 with
+  })
+})
+
+describe('extractLoCoMoOutcomes', () => { // pairing + classification of LoCoMo QA predictions
+  it('pairs by qaId across conversations and classifies by category', () => {
+    const on = locomo([[locomoQa('c:q1', true, 2), locomoQa('c:q2', true, 1)]])
+    const off = locomo([[locomoQa('c:q1', false, 2), locomoQa('c:q2', true, 1)]])
+
+    const outcomes = extractLoCoMoOutcomes(on, off)
+    expect(outcomes).toHaveLength(2) // both qaIds appear in the on and off runs
+    const q1 = outcomes.find((o) => o.id === 'c:q1')!
+    expect(q1.structure).toBe('multi_hop') // category 2
+    expect(q1.recallAtKMergeOff).toBe(false) // recallAtK of the graph-off QA
+    expect(q1.recallAtKMergeOn).toBe(true) // recallAtK of the graph-on QA
+
+    const effect = computeGraphEffect(outcomes)
+    expect(effect.graphVisibleN).toBe(1) // only the cat-2 question is graph-relevant
+    expect(effect.graphEffect).toBe(1.0)
+  })
+})

From 5dff19579e8c6d9faa035e62f7959ca7bf032553 Mon Sep 17 00:00:00 2001
From: muhammadkh4n <muhammadkh4n@gmail.com>
Date: Sun, 7 Jun 2026 14:11:31 +0500
Subject: [PATCH 9/9] fix(bench): flush pending graph writes before eval (Phase
 0 correctness)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Graph decomposition in memory.ingest() is fire-and-forget (pushed to
_pendingWrites). Neither bench adapter awaited it, so recall ran against a
half-built graph — the graph cells produced empty/sparse associations and
graphEffect was spuriously ~0. This is exactly the measurement bug that makes
"the graph doesn't help" look true when the graph was never given a chance.

Call memory.flushPendingWrites() at the ingest→eval boundary in both adapters
(LoCoMo runConversation after consolidation; LongMemEval runQuestion after
ingest) so the graph is fully built before recall.

bench typecheck clean, 23/23.
---
 packages/bench/src/locomo/adapter.ts      | 4 ++++
 packages/bench/src/longmemeval/adapter.ts | 3 +++
 2 files changed, 7 insertions(+)

diff --git a/packages/bench/src/locomo/adapter.ts b/packages/bench/src/locomo/adapter.ts
index 38a8728..669a768 100644
--- a/packages/bench/src/locomo/adapter.ts
+++ b/packages/bench/src/locomo/adapter.ts
@@ -273,6 +273,10 @@ export class LoCoMoAdapter {
       await memory.consolidate('light')
       await memory.consolidate('deep')
     }
+    // Drain fire-and-forget graph decomposition (+ consolidation) writes before
+    // eval. Without this, recall runs against a half-built graph and the graph
+    // cells produce empty associations — spuriously zeroing graphEffect.
+    await memory.flushPendingWrites()
     const ingestMs = Date.now() - ingestStart
 
     const evalStart = Date.now()
diff --git a/packages/bench/src/longmemeval/adapter.ts b/packages/bench/src/longmemeval/adapter.ts
index d8fb8dd..bcc5cb9 100644
--- a/packages/bench/src/longmemeval/adapter.ts
+++ b/packages/bench/src/longmemeval/adapter.ts
@@ -147,6 +147,9 @@ export class LongMemEvalAdapter {
       if (config.graph) await wipeBenchGraph(config.graph)
       const ingestStart = Date.now()
       const { episodesIngested, sessionsCreated } = await this.ingestQuestion(question, memory)
+      // Drain fire-and-forget graph decomposition writes before recall, or the
+      // graph cell recalls against a half-built graph (spurious graphEffect=0).
+      await memory.flushPendingWrites()
       const ingestMs = Date.now() - ingestStart
 
       const evalStart = Date.now()