From 427986fe0ed1db4797e5e76278cce0870d2328e9 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 00:24:10 +0700 Subject: [PATCH 1/9] schema-driven preset merge engine to future-proof new features --- backend/database/preset_schema.py | 80 ++ backend/presets.py | 884 ++++++++++-------- .../test_preset_schema_coverage.py | 221 +++++ tests/integration/test_presets.py | 8 +- 4 files changed, 781 insertions(+), 412 deletions(-) create mode 100644 backend/database/preset_schema.py create mode 100644 tests/integration/test_preset_schema_coverage.py diff --git a/backend/database/preset_schema.py b/backend/database/preset_schema.py new file mode 100644 index 0000000..d96b357 --- /dev/null +++ b/backend/database/preset_schema.py @@ -0,0 +1,80 @@ +"""Product / security policy for the preset engine -- the single source of truth. + +The merge engine in ``backend/presets.py`` derives *all* of its mechanics (merge +order, id remapping, FK rewrite, child-replace scope) from the live schema via +``PRAGMA`` introspection, so adding a child table or a new FK column needs **zero** +edits there. What is *not* a schema fact -- which root table belongs to which +user-facing domain, which machinery tables to ignore, which columns carry secrets -- +lives here and only here. ``tests/integration/test_preset_schema_coverage.py`` fails +loudly the moment a freshly-migrated table or a sensitive-looking column is not +accounted for below. +""" + +from __future__ import annotations + +# Root table -> user-facing domain. A *root* owns no other table (it has no +# ``ON DELETE CASCADE`` foreign key pointing at a parent). Every non-root table +# auto-joins its root's domain by following ownership edges upward, so only the +# roots need listing here. This is the schema-driven replacement for the old +# hand-maintained DOMAIN_TABLES map. 
+DOMAIN_ROOTS: dict[str, str] = { + "conversations": "chats", + "character_cards": "characters", + "worlds": "lorebooks", + "mood_fragments": "fragments", + "interactive_fragments": "fragments", + "phrase_bank": "phrase_bank", + "settings": "configs", + "endpoints": "configs", + "user_personas": "configs", +} + +# Machinery / legacy tables the engine neither exports nor merges: +# * orb_preset_meta -- the preset's own descriptor row +# * schema_migrations -- migration bookkeeping (stamped separately) +# * message_attachments -- always empty post-0020 (migration moves its rows to +# user_attachments and the table is retained only as a fresh-install artefact) +EXCLUDED_TABLES: frozenset[str] = frozenset({"orb_preset_meta", "schema_migrations", "message_attachments"}) + +# Secret / personal columns blanked when the ``configs`` domain is *not* exported, +# so a shared preset never leaks an API key, the user's identity, or their prompts. +# Maps ``(table, column) -> replacement value``. This is a security decision, not a +# schema fact, so it is declared rather than derived. Entries on a non-singleton +# table (e.g. endpoints.api_key) are moot for the export scrub -- those rows are +# deleted wholesale -- but are listed so the coverage check sees every key column +# accounted for, and so the key-stripping export path can find them generically. +SECRET_COLUMNS: dict[tuple[str, str], str] = { + ("settings", "api_key"): "", + ("settings", "user_name"): "User", + ("settings", "user_description"): "", + ("settings", "system_prompt"): "", + ("settings", "shared_system_prompt"): "", + ("settings", "agent_shared_system_prompt"): "", + ("endpoints", "api_key"): "", +} + +# Exporting one domain implies exporting another (a product rule, not a schema +# fact): chats are meaningless without their character cards. 
+IMPLIED_DOMAINS: dict[str, frozenset[str]] = { + "chats": frozenset({"characters"}), +} + +# Columns carried across the settings-singleton overwrite on import rather than +# taken from the file: they describe local ``workflow_attachments`` rows that an +# import retains, not a user-facing config (see bootstrap.reset_to_defaults). +PRESERVED_COLUMNS: dict[str, tuple[str, ...]] = { + "settings": ("attachment_cache_budget_bytes", "attachment_access_counter"), +} + +# Markers that flag a column as security-sensitive. The coverage check fails on any +# matching column not present in SECRET_COLUMNS, so a newly added secret cannot slip +# into a shared preset unnoticed. Matched as *suffixes* (plus the "secret" substring) +# rather than loose substrings, so ``api_key`` / ``auth_token`` are caught while +# innocuous names like ``max_tokens`` or ``top_k`` are not. +SENSITIVE_SUFFIXES: tuple[str, ...] = ("_key", "password", "token") +SENSITIVE_SUBSTRINGS: tuple[str, ...] = ("secret",) + + +def is_sensitive_column(name: str) -> bool: + c = name.lower() + return c.endswith(SENSITIVE_SUFFIXES) or any(s in c for s in SENSITIVE_SUBSTRINGS) diff --git a/backend/presets.py b/backend/presets.py index a97df17..7af6856 100644 --- a/backend/presets.py +++ b/backend/presets.py @@ -27,6 +27,7 @@ from __future__ import annotations +import dataclasses import datetime import json import os @@ -34,34 +35,219 @@ import shutil import sqlite3 +from .database import preset_schema as ps from .database.migrations import MIGRATIONS, run_pending META_TABLE = "orb_preset_meta" -# Domain -> tables it owns. Order within a domain is informational; the merge -# order across domains is fixed in apply_preset(). 
-DOMAIN_TABLES: dict[str, list[str]] = { - "characters": ["character_cards"], - "chats": [ - "conversations", - "messages", - "director_state", - "conversation_logs", - "user_attachments", - "workflow_attachments", - ], - "lorebooks": ["worlds", "lorebook_entries"], - "fragments": ["mood_fragments", "interactive_fragments"], - "phrase_bank": ["phrase_bank"], - "configs": ["settings", "endpoints", "model_configs", "user_personas"], -} -ALL_DOMAINS = list(DOMAIN_TABLES.keys()) +# The set of user-facing domains, derived from the declared roots. Order is +# informational (meta stores them sorted); the actual merge order is the +# schema-derived topological sort in _build_schema_model(). +ALL_DOMAINS: list[str] = sorted(set(ps.DOMAIN_ROOTS.values())) class PresetError(Exception): """Raised for caller-facing preset failures (bad file, version skew, etc.).""" +# ── schema model (derived from the live schema, zero hand-maintenance) ─────── +# +# Everything the merge engine needs -- table classification, the FK graph, a +# safe insert order, which edges to defer -- is read from the live database with +# PRAGMA. Adding a table or an FK column therefore requires no edit here: the +# model simply grows. The only hand-declared inputs are the product/security +# policy in backend/database/preset_schema.py. + + +@dataclasses.dataclass +class _FK: + """One foreign-key edge of a table, classified by what it means for a merge.""" + + table: str # the child table the column lives on + from_col: str + parent: str + to_col: str + on_delete: str # 'CASCADE' | 'SET NULL' | 'NO ACTION' | ... + notnull: bool # is from_col declared NOT NULL? + + @property + def is_self(self) -> bool: + return self.parent == self.table + + @property + def kind(self) -> str: + # ownership = "this row is part of that entity" (deleting the parent + # deletes the child); crossref = "soft pointer to another entity". 
+ if self.is_self: + return "self" + return "ownership" if self.on_delete == "CASCADE" else "crossref" + + +@dataclasses.dataclass +class _Table: + name: str + cols: list[str] + pk: list[str] + kind: str # 'singleton' | 'stable' | 'surrogate' + fks: list[_FK] + owner_fk: _FK | None # the single ownership (CASCADE, non-self) parent edge + + def fk(self, col: str) -> _FK | None: + for f in self.fks: + if f.from_col == col: + return f + return None + + +@dataclasses.dataclass +class _Schema: + tables: dict[str, _Table] + order: list[str] # topological insert order (parents before children) + deferred: set[tuple[str, str]] # (table, from_col) edges set NULL on insert, fixed up after + + def root_of(self, table: str) -> _Table: + """Climb ownership edges to the entity root (the table with no owner).""" + t = self.tables[table] + while t.owner_fk is not None: + t = self.tables[t.owner_fk.parent] + return t + + def domain_of(self, table: str) -> str | None: + return ps.DOMAIN_ROOTS.get(self.root_of(table).name) + + def domain_tables(self, domain: str) -> list[str]: + """Tables belonging to *domain*, in topological (parent-first) order.""" + return [t for t in self.order if self.domain_of(t) == domain] + + +_SINGLETON_RE = re.compile(r"check\s*\(\s*id\s*=\s*1\s*\)", re.IGNORECASE) + + +def _build_schema_model(conn: sqlite3.Connection) -> _Schema: + """Introspect the live schema (the ``main`` database) into an in-memory model. + + Classification is read straight from PRAGMA + the stored DDL: + * *singleton* -- a ``CHECK (id = 1)`` table (settings): updated in place. + * *surrogate* -- a lone INTEGER primary key (an autoincrement rowid): its id + is not portable, so rows reinsert under fresh ids with an old->new map. + * *stable* -- everything else (a TEXT primary key, or a PK that is itself + a foreign key like director_state.conversation_id): identity is portable, + so rows upsert by primary key. 
+ """ + rows = conn.execute("SELECT name, sql FROM sqlite_master WHERE type = 'table'").fetchall() + ddl = {name: (sql or "") for name, sql in rows} + names = [n for n in ddl if n not in ps.EXCLUDED_TABLES and not n.startswith("sqlite_")] + + tables: dict[str, _Table] = {} + for name in names: + info = conn.execute(f"PRAGMA table_info({name})").fetchall() + cols = [r[1] for r in info] + types = {r[1]: (r[2] or "").upper() for r in info} + notnull = {r[1]: bool(r[3]) for r in info} + pk = [r[1] for r in sorted((r for r in info if r[5]), key=lambda r: r[5])] + + fks: list[_FK] = [] + for r in conn.execute(f"PRAGMA foreign_key_list({name})").fetchall(): + parent, from_col, to_col, on_delete = r[2], r[3], r[4], r[6] + if to_col is None: # implicit reference to the parent's primary key + to_col = tables[parent].pk[0] if parent in tables else "id" + fks.append(_FK(name, from_col, parent, to_col, on_delete, notnull.get(from_col, False))) + + if _SINGLETON_RE.search(ddl[name]): + kind = "singleton" + elif len(pk) == 1 and types.get(pk[0], "").startswith("INTEGER"): + kind = "surrogate" + else: + kind = "stable" + + owner = next((f for f in fks if f.kind == "ownership"), None) + tables[name] = _Table(name, cols, pk, kind, fks, owner) + + order, deferred = _topo_order(tables) + return _Schema(tables, order, deferred) + + +def _topo_order(tables: dict[str, _Table]) -> tuple[list[str], set[tuple[str, str]]]: + """Order tables so every non-deferred FK's parent is inserted before its child. + + Self edges are deferred from the start (a row references its own table, which + cannot exist yet). Genuine cycles -- conversations.active_leaf_id <-> messages, + endpoints.active_model_config_id <-> model_configs -- are broken by deferring a + *crossref* edge inside the cycle (never an ownership edge, which defines the + tree). Deferred columns are inserted NULL and fixed up once every id-map exists. 
+ """ + deferred: set[tuple[str, str]] = set() + for t in tables.values(): + for f in t.fks: + if f.is_self: + deferred.add((t.name, f.from_col)) + + placed: set[str] = set() + order: list[str] = [] + while len(placed) < len(tables): + progressed = False + for name, t in tables.items(): + if name in placed: + continue + unmet = any( + f.parent in tables and f.parent not in placed and not f.is_self and (name, f.from_col) not in deferred + for f in t.fks + ) + if not unmet: + order.append(name) + placed.add(name) + progressed = True + if progressed: + continue + # Stalled: a cycle remains. Break it by deferring one crossref edge whose + # parent is still unplaced. + broke = False + for name, t in tables.items(): + if name in placed: + continue + for f in t.fks: + if f.kind == "crossref" and f.parent not in placed and (name, f.from_col) not in deferred: + deferred.add((name, f.from_col)) + broke = True + break + if broke: + break + if not broke: + raise PresetError("Unbreakable foreign-key cycle in schema") + return order, deferred + + +def schema_coverage_problems(conn: sqlite3.Connection) -> list[str]: + """Return human-readable reasons the live schema is not fully covered by the + declared preset policy -- empty when everything is accounted for. + + The drift backstop: a new table that no DOMAIN_ROOT owns, a foreign key whose + parent the engine never classified, or a secret-looking column missing from + SECRET_COLUMNS each surfaces here (and fails the coverage test) the moment the + schema changes, instead of silently dropping data or aborting an import later. 
+ """ + schema = _build_schema_model(conn) + problems: list[str] = [] + for name, t in schema.tables.items(): + if schema.domain_of(name) is None: + problems.append( + f"table {name!r} reaches no DOMAIN_ROOT via ownership; assign its root " + f"a domain in DOMAIN_ROOTS or add {name!r} to EXCLUDED_TABLES" + ) + for fk in t.fks: + if fk.parent not in schema.tables: + problems.append( + f"{name}.{fk.from_col} references unclassified parent {fk.parent!r} " f"(excluded or unknown table)" + ) + for col in t.cols: + if ps.is_sensitive_column(col) and (name, col) not in ps.SECRET_COLUMNS: + problems.append( + f"column {name}.{col} looks secret but is not in SECRET_COLUMNS; " + f"add it (with its scrub value) or rename it" + ) + return problems + + # ── paths ─────────────────────────────────────────────────────────────────── @@ -189,46 +375,28 @@ def _cols(conn: sqlite3.Connection, table: str) -> list[str]: return [r[1] for r in conn.execute(f"PRAGMA table_info({table})").fetchall()] -def _upsert(conn: sqlite3.Connection, table: str) -> int: - """INSERT OR REPLACE every preset row into main, keyed by the table's PK.""" - cols = _cols(conn, table) - collist = ",".join(cols) - ph = ",".join("?" * len(cols)) - rows = conn.execute(f"SELECT {collist} FROM preset.{table}").fetchall() - for row in rows: - conn.execute(f"INSERT OR REPLACE INTO main.{table} ({collist}) VALUES ({ph})", row) - return len(rows) - - -def _insert_no_id(conn: sqlite3.Connection, table: str, where: str = "") -> int: - """Insert preset rows into main, dropping the autoincrement ``id``.""" - cols = [c for c in _cols(conn, table) if c != "id"] - collist = ",".join(cols) - ph = ",".join("?" 
* len(cols)) - rows = conn.execute(f"SELECT {collist} FROM preset.{table} {where}").fetchall() - for row in rows: - conn.execute(f"INSERT INTO main.{table} ({collist}) VALUES ({ph})", row) - return len(rows) - - # ── export ────────────────────────────────────────────────────────────────── -def _scrub_configs(conn: sqlite3.Connection) -> None: +def _scrub_configs(conn: sqlite3.Connection, schema: _Schema) -> None: """Strip personal config + secrets when 'configs' is not exported. - Deleting endpoints/personas auto-nulls their references on the settings row - via the schema's ON DELETE SET NULL (and cascades model_configs). The free - text fields are blanked so a shared preset never leaks prompts or identity. - Import ignores configs in such a preset anyway (gated by meta), so exact - default values do not matter -- only that nothing personal remains. + Deleting the configs domain's non-singleton roots (endpoints, user_personas) + auto-nulls their references on the settings row via the schema's ON DELETE SET + NULL and cascades model_configs (this runs FK-on, on the export clone). The + singleton's secret/free-text columns are then blanked per SECRET_COLUMNS so a + shared preset never leaks a key, identity, or prompts. Import ignores configs + in such a preset anyway (gated by meta), so exact default values do not matter + -- only that nothing personal remains. Both the set of configs roots and the + blanked columns are derived/declared, so a new configs table or secret column + is covered without editing here. 
""" - conn.execute("DELETE FROM endpoints") # cascades model_configs; SET NULL on settings refs - conn.execute("DELETE FROM user_personas") # SET NULL on settings.active_persona_id - conn.execute( - "UPDATE settings SET api_key = '', user_name = 'User', user_description = '', " - "system_prompt = '', shared_system_prompt = '', agent_shared_system_prompt = ''" - ) + for root, domain in ps.DOMAIN_ROOTS.items(): + if domain == "configs" and schema.tables[root].kind != "singleton": + conn.execute(f"DELETE FROM {root}") + for (table, col), blank in ps.SECRET_COLUMNS.items(): + if schema.tables[table].kind == "singleton": + conn.execute(f"UPDATE {table} SET {col} = ?", (blank,)) def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: @@ -239,8 +407,9 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: unknown = selected - set(ALL_DOMAINS) if unknown: raise PresetError(f"Unknown domains: {sorted(unknown)}") - if "chats" in selected: - selected.add("characters") # chats are meaningless without their character + for trigger, implied in ps.IMPLIED_DOMAINS.items(): + if trigger in selected: + selected |= implied # e.g. 
chats are meaningless without their character if not selected: raise PresetError("Select at least one domain to export") @@ -255,22 +424,23 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: c = sqlite3.connect(tmp, isolation_level=None) try: c.execute("PRAGMA foreign_keys=ON") - if "chats" not in selected: - c.execute("DELETE FROM conversations") # cascades messages/logs/attachments/director_state - if "characters" not in selected: - c.execute("DELETE FROM character_cards") - if "lorebooks" not in selected: - c.execute("DELETE FROM worlds") # cascades lorebook_entries; SET NULL character_cards.world_id - if "fragments" not in selected: - c.execute("DELETE FROM mood_fragments") - c.execute("DELETE FROM interactive_fragments") - if "phrase_bank" not in selected: - c.execute("DELETE FROM phrase_bank") - if "configs" not in selected: - _scrub_configs(c) - elif strip_keys: - c.execute("UPDATE settings SET api_key = ''") - c.execute("UPDATE endpoints SET api_key = ''") + schema = _build_schema_model(c) + # Prune each unselected domain by deleting its root tables: with FK on, a + # CASCADE prunes the owned children and a SET NULL clears soft pointers, so + # no per-child delete is hand-coded. configs is special (it scrubs the + # singleton in place rather than deleting it). 
+ for domain in ALL_DOMAINS: + if domain in selected: + continue + if domain == "configs": + _scrub_configs(c, schema) + continue + for root, root_domain in ps.DOMAIN_ROOTS.items(): + if root_domain == domain and schema.tables[root].kind != "singleton": + c.execute(f"DELETE FROM {root}") + if "configs" in selected and strip_keys: + for table, col in ((t, col) for (t, col) in ps.SECRET_COLUMNS if col == "api_key"): + c.execute(f"UPDATE {table} SET {col} = ''") keys_stripped = True _stamp_migrations(c) _write_meta(c, sorted(selected), label, kind, keys_stripped) @@ -284,312 +454,255 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: # ── merge (apply) ───────────────────────────────────────────────────────── +# +# One generic engine drives every domain. Given the schema model it: +# A. (restore only) wipes each additive domain so it ends up matching the file. +# B. clears the subtree each incoming entity replaces (child-replace scope). +# C. inserts/upserts every covered table in topological order, dropping +# surrogate ids (recording an old->new map) and rewriting FK columns. +# D. fixes up deferred self/cycle back-pointers once every id-map exists. +# E. reconciles soft pointers from *other* domains into any fully-replaced table. +# Adding a child table or an FK column needs no edit here -- the model grows and +# these passes pick it up. + + +def _existing(conn: sqlite3.Connection, cache: dict[str, set], parent: str, to_col: str) -> set: + """Memoised set of a parent table's current key values in ``main``. + + Used for the "keep this value -- it still resolves locally" branch of the FK + rewrite. A parent is always fully inserted/upserted before any child consults + it (topological order), and parents are never re-touched afterwards, so the + set is stable once built. 
+ """ + if parent not in cache: + cache[parent] = {r[0] for r in conn.execute(f"SELECT {to_col} FROM main.{parent}")} + return cache[parent] + + +def _resolve_fk(value, fk: _FK, idmaps: dict[str, dict[int, int]], conn, cache) -> tuple[object, bool]: + """Translate one FK value for a row being merged. Returns ``(new_value, drop)``. + + The single rule that replaces every bespoke remap: + * ``None`` stays ``None``. + * if the parent was surrogate-remapped this merge, the value is portable + only through that map -- in the map -> the new id; not in it -> dangling + (its old surrogate id means nothing locally). + * otherwise (stable/untouched parent) the value is portable as-is -> keep it + if it still resolves in ``main``, else dangling. + * a dangling value is dropped-as-NULL for a SET NULL / nullable column, or + the whole child row is dropped for a NOT NULL ownership (CASCADE) column. + """ + if value is None: + return None, False + pmap = idmaps.get(fk.parent) + if pmap is not None: + if value in pmap: + return pmap[value], False + elif value in _existing(conn, cache, fk.parent, fk.to_col): + return value, False + # dangling + if fk.kind == "ownership" and fk.notnull: + return None, True + return None, False + + +def _scope_clause(schema: _Schema, table: str, root: str) -> str: + """A WHERE clause selecting ``main.table`` rows owned by the *incoming* roots. + + Walks the ownership chain up from ``table`` to ``root``, building nested + subqueries: the final hop targets ``preset.root`` (the entities being + re-imported), the intermediate hops join through ``main``. This generalises + the hand-written "delete this conversation's message tree" prune. 
+ """ + fk = schema.tables[table].owner_fk + assert fk is not None + if fk.parent == root: + return f"{fk.from_col} IN (SELECT {fk.to_col} FROM preset.{root})" + inner = _scope_clause(schema, fk.parent, root) + return f"{fk.from_col} IN (SELECT {fk.to_col} FROM main.{fk.parent} WHERE {inner})" + + +def _merge_table(conn, schema, table, idmaps, cache) -> None: + """Insert/upsert one covered table, rewriting FKs and recording its id-map.""" + t = schema.tables[table] + cols = t.cols + deferred = {c for (tbl, c) in schema.deferred if tbl == table} + fks = {f.from_col: f for f in t.fks} + + if t.kind == "singleton": + # Update the lone row in place; never insert/delete it. PRESERVED_COLUMNS + # keep their local values (cache bookkeeping, not config from the file). + pk = t.pk[0] + keep = set(t.pk) | set(ps.PRESERVED_COLUMNS.get(table, ())) + row = conn.execute(f"SELECT {','.join(cols)} FROM preset.{table} WHERE {pk} = 1").fetchone() + if row is None: + return + sets, vals = [], [] + for c, v in zip(cols, row): + if c in keep: + continue + if c in fks: + v, _ = _resolve_fk(v, fks[c], idmaps, conn, cache) + sets.append(f"{c} = ?") + vals.append(v) + conn.execute(f"UPDATE main.{table} SET {', '.join(sets)} WHERE {pk} = 1", vals) + return - -def _merge_lorebooks(conn: sqlite3.Connection) -> None: - _upsert(conn, "worlds") - for (wid,) in conn.execute("SELECT id FROM preset.worlds").fetchall(): - conn.execute("DELETE FROM main.lorebook_entries WHERE world_id = ?", (wid,)) - _insert_no_id(conn, "lorebook_entries") - - -def _merge_characters(conn: sqlite3.Connection) -> None: - _upsert(conn, "character_cards") - # Drop links to worlds that aren't present locally (FK would otherwise dangle). 
- conn.execute( - "UPDATE main.character_cards SET world_id = NULL " - "WHERE world_id IS NOT NULL AND world_id NOT IN (SELECT id FROM main.worlds)" - ) - - -def _merge_chats(conn: sqlite3.Connection) -> None: - conv_ids = [r[0] for r in conn.execute("SELECT id FROM preset.conversations").fetchall()] - if not conv_ids: + if t.kind == "stable": + # Identity is portable: upsert by primary key (the child-replace in + # phase B already cleared any subtree this row owns). + ph = ",".join("?" * len(cols)) + for row in conn.execute(f"SELECT {','.join(cols)} FROM preset.{table}").fetchall(): + vals = list(row) + for i, c in enumerate(cols): + if c in deferred: + vals[i] = None # fixed up once the referenced rows exist + elif c in fks: + vals[i], _ = _resolve_fk(vals[i], fks[c], idmaps, conn, cache) + conn.execute(f"INSERT OR REPLACE INTO main.{table} ({','.join(cols)}) VALUES ({ph})", vals) return - # 1. Replace each conversation wholesale. apply runs with foreign_keys=OFF, - # so ON DELETE CASCADE does not fire -- clear the old subtree by hand - # (child rows first) or the previous messages/logs/attachments survive - # alongside the freshly imported ones. - conv_cols = _cols(conn, "conversations") - ali = conv_cols.index("active_leaf_id") - collist = ",".join(conv_cols) - ph = ",".join("?" * len(conv_cols)) - conv_ph = ",".join("?" 
* len(conv_ids)) - old_msgs = f"SELECT id FROM main.messages WHERE conversation_id IN ({conv_ph})" - conn.execute(f"DELETE FROM main.workflow_attachments WHERE message_id IN ({old_msgs})", conv_ids) - conn.execute(f"DELETE FROM main.user_attachments WHERE message_id IN ({old_msgs})", conv_ids) - conn.execute(f"DELETE FROM main.conversation_logs WHERE conversation_id IN ({conv_ph})", conv_ids) - conn.execute(f"DELETE FROM main.director_state WHERE conversation_id IN ({conv_ph})", conv_ids) - conn.execute(f"DELETE FROM main.messages WHERE conversation_id IN ({conv_ph})", conv_ids) - conn.execute(f"DELETE FROM main.conversations WHERE id IN ({conv_ph})", conv_ids) - for row in conn.execute(f"SELECT {collist} FROM preset.conversations").fetchall(): - vals = list(row) - vals[ali] = None # set after messages exist - conn.execute(f"INSERT INTO main.conversations ({collist}) VALUES ({ph})", vals) - - # 2. Messages: remap integer ids, inserting parents before children. - msg_cols = _cols(conn, "messages") - id_i = msg_cols.index("id") - par_i = msg_cols.index("parent_id") - ins_cols = [c for c in msg_cols if c != "id"] - ins_par = ins_cols.index("parent_id") - ins_sql = f"INSERT INTO main.messages ({','.join(ins_cols)}) VALUES ({','.join('?' 
* len(ins_cols))})" - rows = conn.execute(f"SELECT {','.join(msg_cols)} FROM preset.messages").fetchall() - msg_map: dict[int, int] = {} - pending = list(rows) - progressed = True - while pending and progressed: - progressed = False - still = [] - for r in pending: - parent = r[par_i] - if parent is None or parent in msg_map: - vals = [r[msg_cols.index(c)] for c in ins_cols] - vals[ins_par] = msg_map[parent] if parent is not None else None - cur = conn.execute(ins_sql, vals) - assert cur.lastrowid is not None - msg_map[r[id_i]] = cur.lastrowid - progressed = True - else: - still.append(r) - pending = still - for r in pending: # orphaned/cyclic parent: attach to root - vals = [r[msg_cols.index(c)] for c in ins_cols] - vals[ins_par] = None - cur = conn.execute(ins_sql, vals) - assert cur.lastrowid is not None - msg_map[r[id_i]] = cur.lastrowid - - # 3. director_state keyed by conversation_id (cleared above). - _insert_no_id_keep_all(conn, "director_state") - - # 4. conversation_logs: drop id, remap nullable message_id. - _insert_remap_message(conn, "conversation_logs", msg_map, nullable=True) - - # 5. user_attachments: drop id, remap NOT NULL message_id. - _insert_remap_message(conn, "user_attachments", msg_map, nullable=False) - - # 6. workflow_attachments: drop id, remap message_id + self-refs (two-pass). - _merge_workflow_attachments(conn, msg_map) - - # 7. Point each conversation at its remapped active leaf. - for (cid,) in [(c,) for c in conv_ids]: - leaf = conn.execute("SELECT active_leaf_id FROM preset.conversations WHERE id = ?", (cid,)).fetchone() - old = leaf[0] if leaf else None - if old is not None and old in msg_map: - conn.execute("UPDATE main.conversations SET active_leaf_id = ? WHERE id = ?", (msg_map[old], cid)) - - -def _insert_no_id_keep_all(conn: sqlite3.Connection, table: str) -> None: - """Copy rows whose PK is not autoincrement (e.g. director_state).""" - cols = _cols(conn, table) - collist = ",".join(cols) - ph = ",".join("?" 
* len(cols)) - for row in conn.execute(f"SELECT {collist} FROM preset.{table}").fetchall(): - conn.execute(f"INSERT OR REPLACE INTO main.{table} ({collist}) VALUES ({ph})", row) - - -def _insert_remap_message(conn: sqlite3.Connection, table: str, msg_map: dict[int, int], nullable: bool) -> None: - cols = [c for c in _cols(conn, table) if c != "id"] - mi = cols.index("message_id") - ph = ",".join("?" * len(cols)) + # surrogate: reinsert dropping the autoincrement id, record old->new. + (pk,) = t.pk + ins_cols = [c for c in cols if c != pk] + ph = ",".join("?" * len(ins_cols)) + idmap: dict[int, int] = {} for row in conn.execute(f"SELECT {','.join(cols)} FROM preset.{table}").fetchall(): - vals = list(row) - old = vals[mi] - if old is None: - new = None - elif old in msg_map: - new = msg_map[old] - else: - if not nullable: - continue # message wasn't imported; drop the orphan attachment/log - new = None - vals[mi] = new - conn.execute(f"INSERT INTO main.{table} ({','.join(cols)}) VALUES ({ph})", vals) - - -def _merge_workflow_attachments(conn: sqlite3.Connection, msg_map: dict[int, int]) -> None: - table = "workflow_attachments" - all_cols = _cols(conn, table) - id_i = all_cols.index("id") - cols = [c for c in all_cols if c != "id"] - mi = cols.index("message_id") - par_i = cols.index("parent_attachment_id") - sib_i = cols.index("active_sibling_id") - ph = ",".join("?" 
* len(cols)) - attach_map: dict[int, int] = {} - deferred: list[tuple[int, int | None, int | None]] = [] # (new_id, old_parent, old_sibling) - for row in conn.execute(f"SELECT {','.join(all_cols)} FROM preset.{table}").fetchall(): - old_id = row[id_i] - vals = [row[all_cols.index(c)] for c in cols] - old_msg = vals[mi] - if old_msg not in msg_map: - continue # message not imported - vals[mi] = msg_map[old_msg] - old_parent, old_sib = vals[par_i], vals[sib_i] - vals[par_i] = None - vals[sib_i] = None - cur = conn.execute(f"INSERT INTO main.{table} ({','.join(cols)}) VALUES ({ph})", vals) - assert cur.lastrowid is not None - attach_map[old_id] = cur.lastrowid - deferred.append((cur.lastrowid, old_parent, old_sib)) - for new_id, old_parent, old_sib in deferred: - conn.execute( - f"UPDATE main.{table} SET parent_attachment_id = ?, active_sibling_id = ? WHERE id = ?", - ( - attach_map.get(old_parent) if old_parent is not None else None, - attach_map.get(old_sib) if old_sib is not None else None, - new_id, - ), - ) - - -def _merge_configs(conn: sqlite3.Connection) -> dict[int, int]: - # Preserve attachment-cache bookkeeping across the settings overwrite (see - # the rationale in bootstrap.reset_to_defaults). - cur = conn.execute( - "SELECT attachment_cache_budget_bytes, attachment_access_counter FROM main.settings WHERE id = 1" - ).fetchone() - - # apply runs with foreign_keys=OFF, so deleting endpoints does NOT cascade to - # model_configs -- clear them by hand or the old rows are left orphaned (their - # endpoint gone), which trips the foreign_key_check at the end of apply. - conn.execute("DELETE FROM main.model_configs") - conn.execute("DELETE FROM main.endpoints") - conn.execute("DELETE FROM main.user_personas") - - # personas - persona_map: dict[int, int] = {} - p_cols = _cols(conn, "user_personas") - p_id = p_cols.index("id") - p_ins = [c for c in p_cols if c != "id"] - p_ph = ",".join("?" 
* len(p_ins)) - for row in conn.execute(f"SELECT {','.join(p_cols)} FROM preset.user_personas").fetchall(): - vals = [row[p_cols.index(c)] for c in p_ins] - new = conn.execute(f"INSERT INTO main.user_personas ({','.join(p_ins)}) VALUES ({p_ph})", vals).lastrowid - assert new is not None - persona_map[row[p_id]] = new - - # endpoints first, with model-config back-refs nulled - endpoint_map: dict[int, int] = {} - e_cols = _cols(conn, "endpoints") - e_id = e_cols.index("id") - e_ins = [c for c in e_cols if c != "id"] - e_amc = e_ins.index("active_model_config_id") - e_agmc = e_ins.index("agent_active_model_config_id") - e_ph = ",".join("?" * len(e_ins)) - for row in conn.execute(f"SELECT {','.join(e_cols)} FROM preset.endpoints").fetchall(): - vals = [row[e_cols.index(c)] for c in e_ins] - vals[e_amc] = None - vals[e_agmc] = None - new = conn.execute(f"INSERT INTO main.endpoints ({','.join(e_ins)}) VALUES ({e_ph})", vals).lastrowid - assert new is not None - endpoint_map[row[e_id]] = new - - # model_configs with remapped endpoint_id - mc_map: dict[int, int] = {} - m_cols = _cols(conn, "model_configs") - m_id = m_cols.index("id") - m_ins = [c for c in m_cols if c != "id"] - m_ep = m_ins.index("endpoint_id") - m_ph = ",".join("?" 
* len(m_ins)) - for row in conn.execute(f"SELECT {','.join(m_cols)} FROM preset.model_configs").fetchall(): - vals = [row[m_cols.index(c)] for c in m_ins] - vals[m_ep] = endpoint_map.get(row[m_cols.index("endpoint_id")]) - new = conn.execute(f"INSERT INTO main.model_configs ({','.join(m_ins)}) VALUES ({m_ph})", vals).lastrowid + rowd = dict(zip(cols, row)) + vals, drop = [], False + for c in ins_cols: + v = rowd[c] + if c in deferred: + v = None + elif c in fks: + v, drop = _resolve_fk(v, fks[c], idmaps, conn, cache) + if drop: + break + vals.append(v) + if drop: + continue # an owning parent did not survive the import; drop the orphan + new = conn.execute(f"INSERT INTO main.{table} ({','.join(ins_cols)}) VALUES ({ph})", vals).lastrowid assert new is not None - mc_map[row[m_id]] = new + idmap[rowd[pk]] = new + idmaps[table] = idmap - # fix endpoint -> model_config back-refs - for row in conn.execute("SELECT id, active_model_config_id, agent_active_model_config_id FROM preset.endpoints").fetchall(): - conn.execute( - "UPDATE main.endpoints SET active_model_config_id = ?, agent_active_model_config_id = ? 
WHERE id = ?", - (mc_map.get(row[1]), mc_map.get(row[2]), endpoint_map[row[0]]), - ) - # settings: overwrite the singleton, remapping its FK refs, keeping cache cols - s_cols = _cols(conn, "settings") - ps = conn.execute(f"SELECT {','.join(s_cols)} FROM preset.settings WHERE id = 1").fetchone() - if ps: - sets, vals = [], [] - for i, c in enumerate(s_cols): - if c in ("id", "attachment_cache_budget_bytes", "attachment_access_counter"): - continue - v = ps[i] - if c in ("active_endpoint_id", "agent_endpoint_id"): - v = endpoint_map.get(v) if v is not None else None - elif c == "active_persona_id": - v = persona_map.get(v) if v is not None else None - sets.append(f"{c} = ?") - vals.append(v) - conn.execute(f"UPDATE main.settings SET {', '.join(sets)} WHERE id = 1", vals) - if cur is not None: - conn.execute( - "UPDATE main.settings SET attachment_cache_budget_bytes = ?, attachment_access_counter = ? WHERE id = 1", - (cur[0], cur[1]), - ) - return persona_map - - -def _reconcile_persona_locks(conn: sqlite3.Connection, persona_map: dict[int, int]) -> None: - """Realign character_cards/conversations.persona_lock_id after a merge. - - persona_lock_id mirrors a user_persona, the same way world_id mirrors a - world -- and like world_id it must be remapped or cleared on import or the - final foreign_key_check aborts the whole apply. Two things can leave it - stale: (1) freshly imported characters/chats carry the *file's* persona ids, - and when configs travelled along those personas were reinserted under new - ids (persona_map), so remap; (2) anything still unresolved -- the file - didn't carry the persona, or a configs replace removed the persona a - pre-existing local lock pointed at -- is nulled, mirroring the dangling - world_id treatment in _merge_characters. 
- - The remap keys off the lock value alone, so a pre-existing local lock that - happens to share a numeric id with a file persona is repointed at that file - persona rather than nulled; harmless, since a configs replace wipes the - local personas those locks referenced anyway. +def _fixup_deferred(conn, schema, table, from_col, idmaps, cache) -> None: + """Resolve a deferred (self or cycle) FK column once every id-map exists. + + The column was inserted NULL; now translate the file's original value through + the same rule and write it back, keyed by the row's new identity. Covers + messages.parent_id, the workflow-attachment self refs, conversations' + active_leaf_id, and the endpoints<->model_configs back-pointers in one pass. """ - tables = ("character_cards", "conversations") - if persona_map: - conn.execute("CREATE TEMP TABLE _persona_remap (old INTEGER PRIMARY KEY, new INTEGER)") - conn.executemany("INSERT INTO _persona_remap (old, new) VALUES (?, ?)", list(persona_map.items())) - for table in tables: - # single-pass remap (no UPDATE chaining) via the lookup table - conn.execute( - f"UPDATE main.{table} SET persona_lock_id = " - "(SELECT new FROM _persona_remap WHERE old = persona_lock_id) " - "WHERE persona_lock_id IN (SELECT old FROM _persona_remap)" - ) - conn.execute("DROP TABLE _persona_remap") - for table in tables: + t = schema.tables[table] + fk = t.fk(from_col) + assert fk is not None + pk = t.pk[0] + own_map = idmaps.get(table) # surrogate tables only + for row in conn.execute(f"SELECT {pk}, {from_col} FROM preset.{table}").fetchall(): + old_pk, old_val = row[0], row[1] + new_pk = own_map[old_pk] if own_map is not None else old_pk + if own_map is not None and old_pk not in own_map: + continue # row was dropped during insert + new_val, _ = _resolve_fk(old_val, fk, idmaps, conn, cache) + conn.execute(f"UPDATE main.{table} SET {from_col} = ? 
WHERE {pk} = ?", (new_val, new_pk)) + + +def _reconcile_crossref(conn, schema, fk: _FK, idmaps, cache) -> None: + """Realign every row of a soft-pointer column after its parent was *fully* + replaced, including rows the import never touched. + + A full table replace (re-keying user_personas, or wiping worlds on a restore) + can orphan pointers held by rows in *other* domains -- a pre-existing + character's persona_lock_id, a stale world link. This is the generalised + successor to _reconcile_persona_locks and the world_id null-out: remap through + the parent's old->new map where one exists, then NULL whatever still dangles. + (Same-domain children are not reconciled here: the domain's own replace already + rebuilt them, and their surrogate parent ids are not portable across it.) + """ + table, col = fk.table, fk.from_col + pmap = idmaps.get(fk.parent) + if pmap: + conn.execute("CREATE TEMP TABLE _fk_remap (old INTEGER PRIMARY KEY, new INTEGER)") + conn.executemany("INSERT INTO _fk_remap (old, new) VALUES (?, ?)", list(pmap.items())) conn.execute( - f"UPDATE main.{table} SET persona_lock_id = NULL " - "WHERE persona_lock_id IS NOT NULL " - "AND persona_lock_id NOT IN (SELECT id FROM main.user_personas)" + f"UPDATE main.{table} SET {col} = (SELECT new FROM _fk_remap WHERE old = {col}) " + f"WHERE {col} IN (SELECT old FROM _fk_remap)" ) + conn.execute("DROP TABLE _fk_remap") + conn.execute( + f"UPDATE main.{table} SET {col} = NULL " + f"WHERE {col} IS NOT NULL AND {col} NOT IN (SELECT {fk.to_col} FROM main.{fk.parent})" + ) -# Domains whose apply-merge is additive (upsert / per-parent replace). A -# domain-scoped *restore* must empty these before merging so the file's rows -# land in an empty domain and the domain ends up exactly matching the file. -# `configs` (overwrites the settings singleton in place) and `phrase_bank` -# (its merge already deletes first) are full replacements on apply already, so -# they are deliberately absent. 
-_REPLACE_WIPE_DOMAINS = ("characters", "chats", "lorebooks", "fragments") - - -def _replace_wipe(conn: sqlite3.Connection, included: set[str]) -> None: - """Empty each covered additive domain ahead of its merge (restore only). - - Deletes child-first (``reversed(DOMAIN_TABLES)``) to stay correct even if - foreign keys are ever on; apply runs FK-off, so the final - ``foreign_key_check`` is what actually guards the committed state. - """ - for domain in _REPLACE_WIPE_DOMAINS: - if domain in included: - for table in reversed(DOMAIN_TABLES[domain]): - conn.execute(f"DELETE FROM main.{table}") +def _merge(conn: sqlite3.Connection, included: set[str], replace: bool) -> dict[str, int]: + schema = _build_schema_model(conn) + inc = [t for t in schema.order if schema.domain_of(t) in included] + fully_replaced: set[str] = set() + + # A. Restore only: empty each additive domain (one whose entity root is a + # stable-key table that apply merges by upsert) so it ends up matching the + # file exactly. Domains with no stable root -- configs (singleton + replaced + # surrogate roots), phrase_bank -- are already full replacements on apply, so + # they are left to phase B. + if replace: + for domain in included: + roots = [r for r, d in ps.DOMAIN_ROOTS.items() if d == domain] + if roots and all(schema.tables[r].kind == "stable" for r in roots): + for table in reversed(schema.domain_tables(domain)): + conn.execute(f"DELETE FROM main.{table}") + fully_replaced.add(table) + + # B. Child-replace: clear the subtree each incoming entity supersedes, child + # first. A table whose entity root is stable is replaced per-root (scoped to + # the incoming ids); the stable root itself is left for the upsert. A table + # whose root is surrogate (endpoints/model_configs, user_personas, + # phrase_bank) has no portable identity, so it is wiped wholesale. 
+ for table in reversed(inc): + t = schema.tables[table] + if t.kind == "singleton" or table in fully_replaced: + continue + root = schema.root_of(table) + if root.kind == "stable": + if table != root.name: + conn.execute(f"DELETE FROM main.{table} WHERE {_scope_clause(schema, table, root.name)}") + else: + conn.execute(f"DELETE FROM main.{table}") + fully_replaced.add(table) + + # C. Insert/upsert in topological order so every parent precedes its children. + cache: dict[str, set] = {} + idmaps: dict[str, dict[int, int]] = {} + for table in inc: + _merge_table(conn, schema, table, idmaps, cache) + + # D. Fix up deferred self/cycle back-pointers now that every id-map exists. + for table, col in schema.deferred: + if schema.domain_of(table) in included: + _fixup_deferred(conn, schema, table, col, idmaps, cache) + + # E. Reconcile cross-domain soft pointers into any fully-replaced parent. + for t in schema.tables.values(): + for fk in t.fks: + if ( + fk.kind == "crossref" + and (t.name, fk.from_col) not in schema.deferred + and fk.parent in fully_replaced + and schema.domain_of(t.name) != schema.domain_of(fk.parent) + ): + _reconcile_crossref(conn, schema, fk, idmaps, cache) + + # Row counts per merged domain (configs, anchored on its singleton, reports 1). + summary: dict[str, int] = {} + for domain in included: + roots = [r for r, d in ps.DOMAIN_ROOTS.items() if d == domain] + if any(schema.tables[r].kind == "singleton" for r in roots): + summary[domain] = 1 + else: + summary[domain] = sum(conn.execute(f"SELECT COUNT(*) FROM preset.{r}").fetchone()[0] for r in roots) + return summary def apply_preset(preset_path: str, *, replace: bool = False) -> dict: @@ -609,48 +722,7 @@ def apply_preset(preset_path: str, *, replace: bool = False) -> dict: conn.execute("PRAGMA foreign_keys=OFF") conn.execute("ATTACH DATABASE ? 
AS preset", (preset_path,)) conn.execute("BEGIN") - if replace: - _replace_wipe(conn, included) - if "lorebooks" in included: - _merge_lorebooks(conn) - summary["lorebooks"] = conn.execute("SELECT COUNT(*) FROM preset.worlds").fetchone()[0] - if "fragments" in included: - _upsert(conn, "mood_fragments") - _upsert(conn, "interactive_fragments") - summary["fragments"] = ( - conn.execute("SELECT COUNT(*) FROM preset.mood_fragments").fetchone()[0] - + conn.execute("SELECT COUNT(*) FROM preset.interactive_fragments").fetchone()[0] - ) - if "characters" in included: - _merge_characters(conn) - summary["characters"] = conn.execute("SELECT COUNT(*) FROM preset.character_cards").fetchone()[0] - if "chats" in included: - _merge_chats(conn) - summary["chats"] = conn.execute("SELECT COUNT(*) FROM preset.conversations").fetchone()[0] - if "phrase_bank" in included: - conn.execute("DELETE FROM main.phrase_bank") - _insert_no_id(conn, "phrase_bank") - summary["phrase_bank"] = conn.execute("SELECT COUNT(*) FROM preset.phrase_bank").fetchone()[0] - persona_map: dict[int, int] = {} - if "configs" in included: - persona_map = _merge_configs(conn) - summary["configs"] = 1 - - # persona_lock_id points into user_personas (configs domain); realign or - # clear it whenever a domain that carries it was touched, so a re-keyed - # or absent persona doesn't dangle the FK and abort the import. - if included & {"characters", "chats", "configs"}: - _reconcile_persona_locks(conn, persona_map) - - if replace and "lorebooks" in included: - # Worlds were replaced wholesale; null any character link to a world - # the file didn't carry. (When characters was also covered, - # _merge_characters already ran this; harmless to repeat.) 
- conn.execute( - "UPDATE main.character_cards SET world_id = NULL " - "WHERE world_id IS NOT NULL AND world_id NOT IN (SELECT id FROM main.worlds)" - ) - + summary = _merge(conn, included, replace) problems = conn.execute("PRAGMA foreign_key_check").fetchall() if problems: conn.execute("ROLLBACK") diff --git a/tests/integration/test_preset_schema_coverage.py b/tests/integration/test_preset_schema_coverage.py new file mode 100644 index 0000000..3107f6a --- /dev/null +++ b/tests/integration/test_preset_schema_coverage.py @@ -0,0 +1,221 @@ +"""Drift backstop + end-to-end exercise for the schema-driven preset engine. + +The merge engine derives its mechanics from the live schema, so adding a table or +an FK column needs no edit in ``presets.py``. The price of that is a loud check that +nothing new escapes the *policy* declared in ``preset_schema.py``: every table must +belong to a domain (or be excluded), every FK must resolve, and no secret-looking +column may be unaccounted for. These tests are that check, plus a full round-trip +that drives the generic engine across every domain at once. +""" + +from __future__ import annotations + +import sqlite3 + +from backend import presets +from backend.database.schema import CREATE_TABLES_SQL + + +def _fresh_schema_db(tmp_path, extra_sql: str = "") -> sqlite3.Connection: + """An in-memory-equivalent DB with the current fresh-install schema (+extras).""" + conn = sqlite3.connect(str(tmp_path / "schema.db")) + conn.executescript(CREATE_TABLES_SQL) + if extra_sql: + conn.executescript(extra_sql) + return conn + + +# ── drift check ────────────────────────────────────────────────────────────── + + +def test_live_schema_is_fully_covered(tmp_path): + """Every current table maps to a domain, every FK resolves, every secret column + is declared. 
This is the test that fails the day someone adds a table or a + sensitive column without updating preset_schema.py.""" + conn = _fresh_schema_db(tmp_path) + try: + assert presets.schema_coverage_problems(conn) == [] + finally: + conn.close() + + +def test_every_nonexcluded_table_resolves_to_one_domain(tmp_path): + conn = _fresh_schema_db(tmp_path) + try: + schema = presets._build_schema_model(conn) + for name in schema.tables: + assert schema.domain_of(name) is not None, name + # The excluded set is exactly machinery -- never something with a domain. + for excluded in presets.ps.EXCLUDED_TABLES: + assert excluded not in schema.tables + finally: + conn.close() + + +def test_coverage_flags_a_rogue_root_table(tmp_path): + """A new top-level table with no DOMAIN_ROOT entry must be reported, naming it.""" + conn = _fresh_schema_db(tmp_path, "CREATE TABLE widgets (id TEXT PRIMARY KEY, label TEXT NOT NULL);") + try: + problems = presets.schema_coverage_problems(conn) + assert any("widgets" in p for p in problems), problems + finally: + conn.close() + + +def test_coverage_flags_an_undeclared_secret_column(tmp_path): + conn = _fresh_schema_db(tmp_path, "ALTER TABLE settings ADD COLUMN refresh_token TEXT NOT NULL DEFAULT '';") + try: + problems = presets.schema_coverage_problems(conn) + assert any("refresh_token" in p for p in problems), problems + finally: + conn.close() + + +def test_new_cascade_child_is_handled_with_zero_edits(tmp_path): + """The whole point of the refactor: a brand-new child table hung off an existing + entity via ON DELETE CASCADE is classified, domained, ordered and covered purely + from the schema -- no edit to presets.py or preset_schema.py.""" + conn = _fresh_schema_db( + tmp_path, + "CREATE TABLE message_notes (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE," + " note TEXT NOT NULL" + ");", + ) + try: + schema = presets._build_schema_model(conn) + t = 
schema.tables["message_notes"] + assert t.kind == "surrogate" # autoincrement id -> reinsert with remap + assert schema.domain_of("message_notes") == "chats" # joins messages -> conversations + assert schema.root_of("message_notes").name == "conversations" + # ordered after its owner, and fully covered. + assert schema.order.index("message_notes") > schema.order.index("messages") + assert presets.schema_coverage_problems(conn) == [] + finally: + conn.close() + + +# ── full round-trip across every domain ──────────────────────────────────────── + + +def _insert_conv_tree(path: str, cid: str, persona_id: int | None) -> None: + conn = sqlite3.connect(path) + try: + ts = "2024-01-01T00:00:00" + conn.execute( + "INSERT INTO conversations (id, title, created_at, persona_lock_id) VALUES (?, ?, ?, ?)", + (cid, f"Chat {cid}", ts, persona_id), + ) + m1 = conn.execute( + "INSERT INTO messages (conversation_id, role, content, turn_index, parent_id, created_at) " + "VALUES (?, 'user', 'hello', 0, NULL, ?)", + (cid, ts), + ).lastrowid + m2 = conn.execute( + "INSERT INTO messages (conversation_id, role, content, turn_index, parent_id, created_at) " + "VALUES (?, 'assistant', 'world', 1, ?, ?)", + (cid, m1, ts), + ).lastrowid + conn.execute("UPDATE conversations SET active_leaf_id = ? WHERE id = ?", (m2, cid)) + conn.execute("INSERT INTO director_state (conversation_id, active_moods) VALUES (?, '[]')", (cid,)) + conn.commit() + finally: + conn.close() + + +def _signature(path: str) -> dict: + """Canonical, surrogate-id-independent content of every data domain. + + Surrogate ids (messages, personas, …) are never compared directly; references + to them are resolved to the parent's portable identity (a persona's name, a + leaf message's content) so two databases that differ only by autoincrement + renumbering produce the same signature. 
+ """ + conn = sqlite3.connect(path) + conn.row_factory = sqlite3.Row + + def q(sql): + return sorted(tuple(r) for r in conn.execute(sql).fetchall()) + + try: + return { + "characters": q( + "SELECT cc.name, cc.world_id, up.name FROM character_cards cc " + "LEFT JOIN user_personas up ON cc.persona_lock_id = up.id" + ), + "conversations": q("SELECT id, title FROM conversations"), + "conv_persona": q( + "SELECT c.id, up.name FROM conversations c " "LEFT JOIN user_personas up ON c.persona_lock_id = up.id" + ), + "messages": q("SELECT conversation_id, turn_index, role, content FROM messages"), + "active_leaf": q("SELECT c.id, m.content FROM conversations c " "LEFT JOIN messages m ON c.active_leaf_id = m.id"), + "director_state": q("SELECT conversation_id FROM director_state"), + "worlds": q("SELECT id, name FROM worlds"), + "lorebook_entries": q("SELECT world_id, name, content FROM lorebook_entries"), + "personas": q("SELECT name, description FROM user_personas"), + "phrase_bank": q("SELECT variants, kind, pattern FROM phrase_bank"), + "fragments": q("SELECT id, label FROM mood_fragments"), + } + finally: + conn.close() + + +async def test_full_round_trip_is_identity_modulo_surrogate_ids(client, db_path): + """Seed every domain, export a full preset, scramble the live DB, then apply the + file with replace=True: the database must come back row-for-row identical + (ignoring autoincrement renumbering). One assertion exercises the entire generic + engine -- topo order, surrogate remap, FK rewrite, self/cycle fixup, child-replace + and cross-domain reconcile -- across all domains together.""" + path = str(db_path) + + # personas, worlds + entries, characters (one world-linked, one persona-locked). 
+ p1 = (await client.post("/api/user-personas", json={"name": "Ada"})).json()["id"] + w1 = (await client.post("/api/worlds", json={"name": "Mythos"})).json()["id"] + await client.post(f"/api/worlds/{w1}/entries", json={"name": "Lore A", "content": "alpha"}) + await client.post(f"/api/worlds/{w1}/entries", json={"name": "Lore B", "content": "beta"}) + linked = (await client.post("/api/characters", json={"name": "Linked"})).json()["id"] + await client.put(f"/api/characters/{linked}", json={"world_id": w1}) + locked = (await client.post("/api/characters", json={"name": "Locked"})).json()["id"] + await client.put(f"/api/characters/{locked}", json={"persona_lock_id": p1}) + + # a chat tree, persona-locked, with an active leaf to remap. + _insert_conv_tree(path, "conv-keep", p1) + + # configs touch + a phrase-bank row, to cover those domains too. + await client.put("/api/settings", json={"user_name": "Ada", "api_key": "sk-keep"}) + + before = _signature(path) + + name = ( + await client.post( + "/api/presets/export", + json={"domains": list(presets.ALL_DOMAINS), "strip_keys": False, "label": "roundtrip"}, + ) + ).json()["name"] + preset_path = presets._library_path(name) + + # Scramble the live DB across domains: delete, edit, and add rows everywhere. + await client.delete(f"/api/characters/{linked}") + await client.put(f"/api/characters/{locked}", json={"name": "Renamed"}) + await client.post("/api/characters", json={"name": "Intruder"}) + _insert_conv_tree(path, "conv-extra", None) + w2 = (await client.post("/api/worlds", json={"name": "Junk"})).json()["id"] + await client.post(f"/api/worlds/{w2}/entries", json={"name": "noise", "content": "x"}) + await client.put("/api/settings", json={"user_name": "Eve"}) + + # Drive the generic engine on a full-coverage file (replace = restore semantics). 
+ import asyncio + + summary = await asyncio.to_thread(presets.apply_preset, preset_path, replace=True) + + after = _signature(path) + assert after == before, {k: (before[k], after[k]) for k in before if before[k] != after[k]} + assert summary["chats"] == 1 and summary["characters"] == 2 and summary["configs"] == 1 + + # And the committed state has no dangling foreign keys. + conn = sqlite3.connect(path) + try: + assert conn.execute("PRAGMA foreign_key_check").fetchall() == [] + finally: + conn.close() diff --git a/tests/integration/test_presets.py b/tests/integration/test_presets.py index 7af3961..c7a6068 100644 --- a/tests/integration/test_presets.py +++ b/tests/integration/test_presets.py @@ -343,9 +343,7 @@ async def test_apply_remaps_persona_lock_when_configs_included(client, db): pid = (await client.post("/api/user-personas", json={"name": "Pinned"})).json()["id"] ch = (await client.post("/api/characters", json={"name": "Locked"})).json()["id"] await client.put(f"/api/characters/{ch}", json={"persona_lock_id": pid}) - name = ( - await client.post("/api/presets/export", json={"domains": ["characters", "configs"]}) - ).json()["name"] + name = (await client.post("/api/presets/export", json={"domains": ["characters", "configs"]})).json()["name"] resp = await client.post(f"/api/presets/{name}/apply", json={}) assert resp.status_code == 200, resp.json() @@ -367,9 +365,7 @@ async def test_apply_remaps_conversation_persona_lock_when_configs_included(clie await _make_conv_with_tree(db) await db.execute("UPDATE conversations SET persona_lock_id = ? 
WHERE id = 'conv-1'", (pid,)) await db.commit() - name = ( - await client.post("/api/presets/export", json={"domains": ["chats", "configs"]}) - ).json()["name"] + name = (await client.post("/api/presets/export", json={"domains": ["chats", "configs"]})).json()["name"] resp = await client.post(f"/api/presets/{name}/apply", json={}) assert resp.status_code == 200, resp.json() From 729128eeee7a0c0841f4cd78deaf018ca0c84f97 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 00:46:06 +0700 Subject: [PATCH 2/9] update AGENTS.md to reflect change --- AGENTS.md | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 8e9a9a1..2810dac 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -65,6 +65,8 @@ Orb/ │ │ │ # depends on nothing else (see Data Contracts below) │ │ ├── connection.py # DB_PATH, get_db() async context manager, _build_set_clause │ │ ├── schema.py # CREATE TABLES script +│ │ ├── preset_schema.py # Preset engine product/security policy (single source of +│ │ │ # truth): DOMAIN_ROOTS, EXCLUDED/SECRET/PRESERVED cols │ │ ├── seeds.py # SEED_* / DEFAULT_* constants │ │ ├── bootstrap.py # init_db() (schema + inline ALTERs + seed inserts), reset_to_defaults() │ │ ├── queries/ # Per-domain CRUD modules (one file per table group) @@ -79,7 +81,9 @@ Orb/ │ ├── macros.py # Macro resolution ({{user}}, {{char}}, {{roll}}, etc.) │ ├── kv_tracker.py # Debug: logs messages/tools to JSON for inspection │ ├── presets.py # Preset/backup engine: selective export, merge-import, -│ │ # full snapshots/restore (sqlite ATTACH + VACUUM INTO) +│ │ # full snapshots/restore (sqlite ATTACH + VACUUM INTO). 
+│ │ # Schema-driven: mechanics derived from live schema via +│ │ # PRAGMA; policy declared in database/preset_schema.py │ ├── locks.py # Cross-module asyncio locks (workflow_state / character_state / config / maintenance) │ ├── utils.py # Shared utilities │ ├── passes/ @@ -239,22 +243,39 @@ grouped into coarse **domains** (`characters`, `chats`, `lorebooks`, `fragments` `phrase_bank`, `configs`); a *preset* carries a chosen subset, a *snapshot* is a full-domain preset, and both live in one on-disk library described by an `orb_preset_meta` row. Two ways data crosses back in: **apply** (merge by -identity — UUID rows upsert, child collections replace wholesale, integer-PK rows -reinsert with remapped references) and **restore** (roll back to the file — a -full-coverage file is swapped in whole via `restore_full`; a partial file is -restored *domain-scoped* via `restore_partial`/`apply_preset(replace=True)`, -which empties each covered domain before the merge so those domains match the -file exactly while uncovered ones are untouched). Both work on any library file — -imported ones included; restore's auto-snapshot makes the overwrite reversible. -**Import** is non-destructive: it just lands an external `.db` in the library -(the user then applies or restores it). Destructive ops auto-snapshot first. - -The single source of truth for *which tables belong to which domain* is the -`DOMAIN_TABLES` map at the top of `presets.py`. **When you add a table** (or a -table sprouts a cross-domain FK), update that map and the per-domain merge logic -there — keep the domain grouping current rather than expanding this section. Runs -synchronously off the event loop via `asyncio.to_thread` under -`backend.locks.maintenance_lock`. 
+identity) and **restore** (roll back to the file — a full-coverage file is swapped +in whole via `restore_full`; a partial file is restored *domain-scoped* via +`restore_partial`/`apply_preset(replace=True)`, which empties each covered domain +before the merge so those domains match the file exactly while uncovered ones are +untouched). Both work on any library file — imported ones included; restore's +auto-snapshot makes the overwrite reversible. **Import** is non-destructive: it +just lands an external `.db` in the library (the user then applies or restores it). +Destructive ops auto-snapshot first. + +**The merge engine is schema-driven.** It introspects the live schema with +`PRAGMA` (`_build_schema_model()`) to derive *all* of its mechanics — per-table +classification (`singleton` = a `CHECK (id = 1)` table updated in place; `stable` += portable identity, upserted by PK; `surrogate` = autoincrement rowid, reinserted +under fresh ids with an old→new map), the FK graph, the topological insert order, +and which edges to defer (self refs + cross-table cycles, inserted NULL then fixed +up). Ownership (`ON DELETE CASCADE`) edges define the entity tree and the +child-replace scope; non-CASCADE edges are soft cross-references, reconciled after +a full replace. **Adding a child table or an FK column needs zero edits in +`presets.py`** — the model just grows. + +The *only* hand-maintained input is the product/security **policy** in +`backend/database/preset_schema.py`: `DOMAIN_ROOTS` (root table → user-facing +domain; every non-root auto-joins its root's domain by climbing ownership edges), +`EXCLUDED_TABLES`, `SECRET_COLUMNS` (blanked when `configs` isn't exported), +`IMPLIED_DOMAINS` (e.g. `chats` pulls in `characters`), `PRESERVED_COLUMNS` +(local-only cols kept across the settings-singleton overwrite), and the +`SENSITIVE_*` markers. **When you add a table** that introduces a new entity root, +or a column that looks secret, update that file. 
A drift backstop — +`schema_coverage_problems()`, asserted by `tests/integration/test_preset_schema_coverage.py` +— fails loudly the moment a freshly-migrated table maps to no domain, an FK +references an unclassified parent, or a sensitive-looking column is missing from +`SECRET_COLUMNS`. Runs synchronously off the event loop via `asyncio.to_thread` +under `backend.locks.maintenance_lock`. ## Data Contracts (the model layer) From 760b901a3379f49c3cd4fbafd980ee8d3ea040c1 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 01:22:48 +0700 Subject: [PATCH 3/9] more test cases, fix persona silent double remap, KeyError in _fixup_deferred, remove cycling message healing --- backend/presets.py | 88 ++++++++-- .../test_preset_schema_coverage.py | 164 +++++++++++++++++- 2 files changed, 234 insertions(+), 18 deletions(-) diff --git a/backend/presets.py b/backend/presets.py index 7af6856..7d371cd 100644 --- a/backend/presets.py +++ b/backend/presets.py @@ -46,6 +46,11 @@ ALL_DOMAINS: list[str] = sorted(set(ps.DOMAIN_ROOTS.values())) +def _roots_for(domain: str) -> list[str]: + """The root tables belonging to ``domain`` (reverse of ps.DOMAIN_ROOTS).""" + return [r for r, d in ps.DOMAIN_ROOTS.items() if d == domain] + + class PresetError(Exception): """Raised for caller-facing preset failures (bad file, version skew, etc.).""" @@ -149,8 +154,8 @@ def _build_schema_model(conn: sqlite3.Connection) -> _Schema: fks: list[_FK] = [] for r in conn.execute(f"PRAGMA foreign_key_list({name})").fetchall(): parent, from_col, to_col, on_delete = r[2], r[3], r[4], r[6] - if to_col is None: # implicit reference to the parent's primary key - to_col = tables[parent].pk[0] if parent in tables else "id" + # to_col may be None (implicit reference to the parent's PK); it is + # resolved in a second pass below, once every table's PK is known. 
fks.append(_FK(name, from_col, parent, to_col, on_delete, notnull.get(from_col, False))) if _SINGLETON_RE.search(ddl[name]): @@ -163,6 +168,14 @@ def _build_schema_model(conn: sqlite3.Connection) -> _Schema: owner = next((f for f in fks if f.kind == "ownership"), None) tables[name] = _Table(name, cols, pk, kind, fks, owner) + # Second pass: resolve implicit FK targets now that every PK is known, so the + # fallback never depends on sqlite_master order (a child read before its parent + # used to silently get "id" instead of the parent's real PK). + for t in tables.values(): + for f in t.fks: + if f.to_col is None: + f.to_col = tables[f.parent].pk[0] if f.parent in tables else "id" + order, deferred = _topo_order(tables) return _Schema(tables, order, deferred) @@ -245,6 +258,17 @@ def schema_coverage_problems(conn: sqlite3.Connection) -> list[str]: f"column {name}.{col} looks secret but is not in SECRET_COLUMNS; " f"add it (with its scrub value) or rename it" ) + # Every deferred edge is inserted NULL and fixed up afterwards (FK checks are + # off during the merge, but a NOT NULL constraint still fires on insert). A + # future NOT NULL self-FK, or a NOT NULL crossref caught inside a broken cycle, + # would therefore raise IntegrityError on every merge -- surface it here. + for table, col in schema.deferred: + fk = schema.tables[table].fk(col) + if fk is not None and fk.notnull: + problems.append( + f"{table}.{col} is a deferred FK edge (inserted NULL, fixed up after) " + f"but is declared NOT NULL; a merge would fail its constraint. Make it nullable." 
+ ) return problems @@ -368,13 +392,6 @@ def read_meta(path: str) -> dict | None: } -# ── small sql helpers ─────────────────────────────────────────────────────── - - -def _cols(conn: sqlite3.Connection, table: str) -> list[str]: - return [r[1] for r in conn.execute(f"PRAGMA table_info({table})").fetchall()] - - # ── export ────────────────────────────────────────────────────────────────── @@ -435,8 +452,8 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: if domain == "configs": _scrub_configs(c, schema) continue - for root, root_domain in ps.DOMAIN_ROOTS.items(): - if root_domain == domain and schema.tables[root].kind != "singleton": + for root in _roots_for(domain): + if schema.tables[root].kind != "singleton": c.execute(f"DELETE FROM {root}") if "configs" in selected and strip_keys: for table, col in ((t, col) for (t, col) in ps.SECRET_COLUMNS if col == "api_key"): @@ -587,6 +604,27 @@ def _merge_table(conn, schema, table, idmaps, cache) -> None: idmaps[table] = idmap +def _break_self_cycles(pointer: dict) -> None: + """Null the closing edge of every cycle in a self-FK pointer map (in place). + + The merge re-establishes the file's parent links faithfully in the new id + space, so a self-parented or otherwise cyclic chain in the source (a malformed + import: ``messages.parent_id`` looping, a workflow-attachment self ref) would + survive as a loop the app's tree-walk can spin on. Walk each chain and, the + moment it revisits a node, null that node's pointer so the chain reaches root + -- matching the old engine, which attached such messages to the root. 
+ """ + for start in pointer: + seen: set = set() + cur = start + while cur is not None and cur in pointer: + if cur in seen: + pointer[cur] = None # break the cycle here -> this node becomes a root + break + seen.add(cur) + cur = pointer[cur] + + def _fixup_deferred(conn, schema, table, from_col, idmaps, cache) -> None: """Resolve a deferred (self or cycle) FK column once every id-map exists. @@ -594,22 +632,29 @@ def _fixup_deferred(conn, schema, table, from_col, idmaps, cache) -> None: the same rule and write it back, keyed by the row's new identity. Covers messages.parent_id, the workflow-attachment self refs, conversations' active_leaf_id, and the endpoints<->model_configs back-pointers in one pass. + For a *self* edge the resolved links are cycle-broken first (see + _break_self_cycles) so a malformed source tree cannot import a loop. """ t = schema.tables[table] fk = t.fk(from_col) assert fk is not None pk = t.pk[0] own_map = idmaps.get(table) # surrogate tables only + resolved: dict = {} # new_pk -> new_val, in the post-merge id space for row in conn.execute(f"SELECT {pk}, {from_col} FROM preset.{table}").fetchall(): old_pk, old_val = row[0], row[1] - new_pk = own_map[old_pk] if own_map is not None else old_pk if own_map is not None and old_pk not in own_map: continue # row was dropped during insert + new_pk = own_map[old_pk] if own_map is not None else old_pk new_val, _ = _resolve_fk(old_val, fk, idmaps, conn, cache) + resolved[new_pk] = new_val + if fk.is_self: + _break_self_cycles(resolved) + for new_pk, new_val in resolved.items(): conn.execute(f"UPDATE main.{table} SET {from_col} = ? WHERE {pk} = ?", (new_val, new_pk)) -def _reconcile_crossref(conn, schema, fk: _FK, idmaps, cache) -> None: +def _reconcile_crossref(conn, schema, fk: _FK, idmaps, cache, remap: bool) -> None: """Realign every row of a soft-pointer column after its parent was *fully* replaced, including rows the import never touched. 
@@ -620,10 +665,17 @@ def _reconcile_crossref(conn, schema, fk: _FK, idmaps, cache) -> None: the parent's old->new map where one exists, then NULL whatever still dangles. (Same-domain children are not reconciled here: the domain's own replace already rebuilt them, and their surrogate parent ids are not portable across it.) + + ``remap`` is False when the child *table's own domain was merged this pass*: + phase C already resolved those rows' pointers into the new id space via + _resolve_fk, so re-running the file old->new map would double-remap them (and + silently corrupt a row whose freshly-assigned new id collides with a file old + id). Only the NULL-out runs in that case, catching pre-existing rows the merge + left untouched whose now-stale local pointer no longer resolves. """ table, col = fk.table, fk.from_col pmap = idmaps.get(fk.parent) - if pmap: + if remap and pmap: conn.execute("CREATE TEMP TABLE _fk_remap (old INTEGER PRIMARY KEY, new INTEGER)") conn.executemany("INSERT INTO _fk_remap (old, new) VALUES (?, ?)", list(pmap.items())) conn.execute( @@ -649,7 +701,7 @@ def _merge(conn: sqlite3.Connection, included: set[str], replace: bool) -> dict[ # they are left to phase B. if replace: for domain in included: - roots = [r for r, d in ps.DOMAIN_ROOTS.items() if d == domain] + roots = _roots_for(domain) if roots and all(schema.tables[r].kind == "stable" for r in roots): for table in reversed(schema.domain_tables(domain)): conn.execute(f"DELETE FROM main.{table}") @@ -692,12 +744,14 @@ def _merge(conn: sqlite3.Connection, included: set[str], replace: bool) -> dict[ and fk.parent in fully_replaced and schema.domain_of(t.name) != schema.domain_of(fk.parent) ): - _reconcile_crossref(conn, schema, fk, idmaps, cache) + # Rows of a merged child domain were already FK-rewritten in phase + # C; only remap the parent map for child tables left untouched. 
+ _reconcile_crossref(conn, schema, fk, idmaps, cache, remap=schema.domain_of(t.name) not in included) # Row counts per merged domain (configs, anchored on its singleton, reports 1). summary: dict[str, int] = {} for domain in included: - roots = [r for r, d in ps.DOMAIN_ROOTS.items() if d == domain] + roots = _roots_for(domain) if any(schema.tables[r].kind == "singleton" for r in roots): summary[domain] = 1 else: diff --git a/tests/integration/test_preset_schema_coverage.py b/tests/integration/test_preset_schema_coverage.py index 3107f6a..433166f 100644 --- a/tests/integration/test_preset_schema_coverage.py +++ b/tests/integration/test_preset_schema_coverage.py @@ -96,6 +96,158 @@ def test_new_cascade_child_is_handled_with_zero_edits(tmp_path): conn.close() +def test_coverage_flags_a_not_null_deferred_edge(tmp_path): + """A deferred FK edge (self ref, or a crossref broken to break a cycle) is + inserted NULL during the merge, so a NOT NULL one would fail every import. The + coverage check must surface that the moment such a column is added.""" + conn = _fresh_schema_db( + tmp_path, + "CREATE TABLE tree_nodes (" + " id INTEGER PRIMARY KEY AUTOINCREMENT," + " conversation_id TEXT NOT NULL REFERENCES conversations(id) ON DELETE CASCADE," + " parent_id INTEGER NOT NULL REFERENCES tree_nodes(id) ON DELETE CASCADE" + ");", + ) + try: + schema = presets._build_schema_model(conn) + assert ("tree_nodes", "parent_id") in schema.deferred # a self edge -> deferred + problems = presets.schema_coverage_problems(conn) + assert any("tree_nodes.parent_id" in p and "NOT NULL" in p for p in problems), problems + finally: + conn.close() + + +# ── merge regressions (PR #90 audit) ──────────────────────────────────────────── + + +def _seed(path: str, sql_pairs: list[tuple[str, tuple]]) -> None: + conn = sqlite3.connect(path) + try: + conn.execute("PRAGMA foreign_keys=OFF") # may seed deliberately malformed source rows + for sql, params in sql_pairs: + conn.execute(sql, params) + 
conn.commit() + finally: + conn.close() + + +def _merge(main_path: str, preset_path: str, included: set, *, replace: bool = False) -> None: + conn = sqlite3.connect(main_path, isolation_level=None) + try: + conn.execute("PRAGMA foreign_keys=OFF") + conn.execute("ATTACH DATABASE ? AS preset", (preset_path,)) + conn.execute("BEGIN") + presets._merge(conn, included, replace) + assert conn.execute("PRAGMA foreign_key_check").fetchall() == [] + conn.execute("COMMIT") + finally: + conn.execute("DETACH DATABASE preset") + conn.close() + + +def test_persona_lock_survives_gapped_persona_ids(tmp_path): + """Regression: a character locked to a file persona whose ids have gaps used to + be double-remapped (phase C resolves it, phase E remapped it again through the + same map), silently re-pointing it at the wrong persona. The lock must survive.""" + main, preset = str(tmp_path / "main.db"), str(tmp_path / "preset.db") + for p in (main, preset): + c = sqlite3.connect(p) + c.executescript(CREATE_TABLES_SQL) + c.commit() + c.close() + ts = "2024-01-01" + # File personas have a gap: ids {2, 5} reinsert as {1, 2}, so new id 2 collides + # with file old id 2 -- the trigger for the double remap. 
+ _seed( + preset, + [ + ("INSERT INTO user_personas (id, name, created_at, updated_at) VALUES (2, 'Alice', ?, ?)", (ts, ts)), + ("INSERT INTO user_personas (id, name, created_at, updated_at) VALUES (5, 'Bob', ?, ?)", (ts, ts)), + ( + "INSERT INTO character_cards (id, name, created_at, updated_at, persona_lock_id) " + "VALUES ('char-1', 'Locked', ?, ?, 5)", + (ts, ts), + ), + ], + ) + _merge(main, preset, {"characters", "configs"}) + conn = sqlite3.connect(main) + try: + locked = conn.execute( + "SELECT cc.name, up.name FROM character_cards cc " "LEFT JOIN user_personas up ON cc.persona_lock_id = up.id" + ).fetchall() + finally: + conn.close() + assert locked == [("Locked", "Bob")], locked + + +def test_orphan_surrogate_row_is_dropped_not_crashed(tmp_path): + """Regression: a surrogate row dropped during insert (an external preset whose + workflow_attachment points at an absent message) used to raise KeyError in the + deferred fixup and abort the whole apply. It must be skipped instead.""" + main, preset = str(tmp_path / "main.db"), str(tmp_path / "preset.db") + for p in (main, preset): + c = sqlite3.connect(p) + c.executescript(CREATE_TABLES_SQL) + c.commit() + c.close() + ts = "2024-01-01" + _seed( + preset, + [ + ("INSERT INTO conversations (id, title, created_at) VALUES ('c1', 't', ?)", (ts,)), + ( + "INSERT INTO messages (id, conversation_id, role, content, turn_index, created_at) " + "VALUES (10, 'c1', 'user', 'hi', 0, ?)", + (ts,), + ), + ( + "INSERT INTO workflow_attachments (id, message_id, mime_type, data_b64, created_at, workflow_id) " + "VALUES (7, 999, 'image/png', 'AAA', ?, 'wf')", + (ts,), + ), + ], + ) + _merge(main, preset, {"chats", "characters"}) # must not raise + conn = sqlite3.connect(main) + try: + assert conn.execute("SELECT COUNT(*) FROM workflow_attachments").fetchone()[0] == 0 + assert conn.execute("SELECT COUNT(*) FROM messages").fetchone()[0] == 1 + finally: + conn.close() + + +def test_self_parented_message_is_healed_to_root(tmp_path): 
+ """Regression: a self-parented (or cyclic) message in the source used to import + as a faithful loop the app's tree-walk can spin on. The fixup must null the + closing edge so the chain reaches root.""" + main, preset = str(tmp_path / "main.db"), str(tmp_path / "preset.db") + for p in (main, preset): + c = sqlite3.connect(p) + c.executescript(CREATE_TABLES_SQL) + c.commit() + c.close() + ts = "2024-01-01" + _seed( + preset, + [ + ("INSERT INTO conversations (id, title, created_at) VALUES ('c1', 't', ?)", (ts,)), + ( + "INSERT INTO messages (id, conversation_id, role, content, turn_index, parent_id, created_at) " + "VALUES (10, 'c1', 'user', 'self', 0, 10, ?)", + (ts,), + ), + ], + ) + _merge(main, preset, {"chats", "characters"}) + conn = sqlite3.connect(main) + try: + parents = conn.execute("SELECT parent_id FROM messages").fetchall() + finally: + conn.close() + assert parents == [(None,)], parents + + # ── full round-trip across every domain ──────────────────────────────────────── @@ -182,8 +334,18 @@ async def test_full_round_trip_is_identity_modulo_surrogate_ids(client, db_path) # a chat tree, persona-locked, with an active leaf to remap. _insert_conv_tree(path, "conv-keep", p1) - # configs touch + a phrase-bank row, to cover those domains too. + # configs touch, plus a phrase-bank row (surrogate full-replace path) and a + # mood fragment (stable upsert) so those domains carry real data round-trip. 
+ await client.put("/api/settings", json={"user_name": "Ada", "api_key": "sk-keep"}) + seed = sqlite3.connect(path) + try: + seed.execute("INSERT INTO phrase_bank (variants, kind, pattern) VALUES ('[\"hi\"]', 'literal', NULL)") + seed.execute( + "INSERT INTO mood_fragments (id, label, description, prompt_text) VALUES ('frag-1', 'Calm', 'desc', 'be calm')" + ) + seed.commit() + finally: + seed.close() before = _signature(path) From 4d16a5a64570221deed6a4ee71d2b7d18f77bf04 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 10:55:24 +0700 Subject: [PATCH 4/9] clearer comments in preset_schema.py for future development --- backend/database/preset_schema.py | 97 +++++++++++++++++++------------ 1 file changed, 61 insertions(+), 36 deletions(-) diff --git a/backend/database/preset_schema.py b/backend/database/preset_schema.py index d96b357..a7bedb8 100644 --- a/backend/database/preset_schema.py +++ b/backend/database/preset_schema.py @@ -1,22 +1,41 @@ -"""Product / security policy for the preset engine -- the single source of truth. +"""Preset engine policy -- the human-decided facts the schema can't tell the engine. -The merge engine in ``backend/presets.py`` derives *all* of its mechanics (merge -order, id remapping, FK rewrite, child-replace scope) from the live schema via -``PRAGMA`` introspection, so adding a child table or a new FK column needs **zero** -edits there. What is *not* a schema fact -- which root table belongs to which -user-facing domain, which machinery tables to ignore, which columns carry secrets -- -lives here and only here. ``tests/integration/test_preset_schema_coverage.py`` fails -loudly the moment a freshly-migrated table or a sensitive-looking column is not -accounted for below. +The merge engine in ``backend/presets.py`` reads the live SQLite schema and derives +every *mechanical* decision itself (merge order, id remapping, FK rewrite, +child-replace scope), so most schema changes need **no edit here**.
This file holds +only the handful of facts no ``PRAGMA`` can reveal: + + which domain a table belongs to -> DOMAIN_ROOTS + which tables to ignore entirely -> EXCLUDED_TABLES + which columns are secret/personal -> SECRET_COLUMNS (tripwire: SENSITIVE_*) + product rules layered on top -> IMPLIED_DOMAINS, PRESERVED_COLUMNS + +You don't have to remember when to touch them: ``tests/integration/ +test_preset_schema_coverage.py`` fails the moment a migration adds a table or a +secret-looking column that isn't accounted for, and names the constant to fix. Each +section below opens with a "Touch when:" line saying exactly what to change. + +Three edits the coverage test can NOT catch -- they corrupt presets *silently*: + * Renaming a domain value. Domains are baked into every exported file + (``orb_preset_meta.included_domains``); a renamed domain no longer matches on + import, so that data is silently skipped for every preset already out there. + Add domains freely; never rename one. + * Parking a real data table in ``EXCLUDED_TABLES`` to quiet the test -- excluded + tables are invisible to export *and* merge, so the data vanishes from backups. + * Narrowing ``SENSITIVE_*`` to clear a flagged column -- declare the column in + ``SECRET_COLUMNS`` instead, or the secret ships in shared presets. """ from __future__ import annotations -# Root table -> user-facing domain. A *root* owns no other table (it has no -# ``ON DELETE CASCADE`` foreign key pointing at a parent). Every non-root table -# auto-joins its root's domain by following ownership edges upward, so only the -# roots need listing here. This is the schema-driven replacement for the old -# hand-maintained DOMAIN_TABLES map. +# Touch when: you add a brand-new top-level entity -- map its table to a user-facing +# domain. A child table hung off an existing entity needs no entry; it inherits its +# root's domain automatically. 
Reuse a domain or mint a new value (a new value mints +# a new exportable domain -- ALL_DOMAINS is derived from these); never rename one +# (see header). +# +# A *root* owns no other table: nothing points at it via ``ON DELETE CASCADE``. +# Non-root tables join their root's domain by following ownership edges upward. DOMAIN_ROOTS: dict[str, str] = { "conversations": "chats", "character_cards": "characters", @@ -29,20 +48,21 @@ "user_personas": "configs", } -# Machinery / legacy tables the engine neither exports nor merges: -# * orb_preset_meta -- the preset's own descriptor row -# * schema_migrations -- migration bookkeeping (stamped separately) -# * message_attachments -- always empty post-0020 (migration moves its rows to -# user_attachments and the table is retained only as a fresh-install artefact) +# Touch when: you add a table the engine must never export or merge -- bookkeeping, +# caches, or migration-only artefacts. The coverage test forces the choice for every +# new table: give it a domain, or exclude it here. Current entries: +# * orb_preset_meta -- the preset's own descriptor row +# * schema_migrations -- migration bookkeeping (stamped separately) +# * message_attachments -- empty post-0020; retained only as a fresh-install artefact EXCLUDED_TABLES: frozenset[str] = frozenset({"orb_preset_meta", "schema_migrations", "message_attachments"}) -# Secret / personal columns blanked when the ``configs`` domain is *not* exported, -# so a shared preset never leaks an API key, the user's identity, or their prompts. -# Maps ``(table, column) -> replacement value``. This is a security decision, not a -# schema fact, so it is declared rather than derived. Entries on a non-singleton -# table (e.g. endpoints.api_key) are moot for the export scrub -- those rows are -# deleted wholesale -- but are listed so the coverage check sees every key column -# accounted for, and so the key-stripping export path can find them generically. 
+# Touch when: a migration adds a column holding a key, the user's identity, or their +# prompts (the coverage test will fail and point you here); drop an entry only when +# its column leaves the schema. Map ``(table, column) -> the value to blank it to``. +# These are wiped when the ``configs`` domain is *not* exported, so a shared preset +# never leaks secrets. Columns on a non-singleton table (e.g. endpoints.api_key) are +# deleted with their whole row on export -- list them anyway so the coverage check +# and the generic key-strip path both see them. SECRET_COLUMNS: dict[tuple[str, str], str] = { ("settings", "api_key"): "", ("settings", "user_name"): "User", @@ -53,24 +73,29 @@ ("endpoints", "api_key"): "", } -# Exporting one domain implies exporting another (a product rule, not a schema -# fact): chats are meaningless without their character cards. +# Touch when: exporting one domain only makes sense alongside another (a product +# rule, not a schema fact). Maps a domain to the domains dragged in with it. Today: +# chats are meaningless without their character cards. IMPLIED_DOMAINS: dict[str, frozenset[str]] = { "chats": frozenset({"characters"}), } -# Columns carried across the settings-singleton overwrite on import rather than -# taken from the file: they describe local ``workflow_attachments`` rows that an -# import retains, not a user-facing config (see bootstrap.reset_to_defaults). +# Touch when: a singleton table (overwritten in place on import, like ``settings``) +# gains a column describing *local machine state* the import must keep rather than +# take from the file -- e.g. attachment-cache bookkeeping, not user-facing config. +# Maps ``table -> columns to leave untouched`` during the overwrite. PRESERVED_COLUMNS: dict[str, tuple[str, ...]] = { "settings": ("attachment_cache_budget_bytes", "attachment_access_counter"), } -# Markers that flag a column as security-sensitive. 
The coverage check fails on any -# matching column not present in SECRET_COLUMNS, so a newly added secret cannot slip -# into a shared preset unnoticed. Matched as *suffixes* (plus the "secret" substring) -# rather than loose substrings, so ``api_key`` / ``auth_token`` are caught while -# innocuous names like ``max_tokens`` or ``top_k`` are not. +# The tripwire behind the SECRET_COLUMNS check: any column whose name ends with one +# of these suffixes (or contains "secret") must appear in SECRET_COLUMNS, or the +# coverage test fails -- so a new secret can't slip into a shared preset unnoticed. +# Touch when: a real secret evades every pattern (e.g. ``credentials_blob``) -- add a +# pattern so it's caught. To clear a *false* positive, declare the column in +# SECRET_COLUMNS, never narrow these (see header). Suffix-matched (not loose +# substring) so ``api_key`` / ``auth_token`` are caught while ``max_tokens`` / +# ``top_k`` are not. SENSITIVE_SUFFIXES: tuple[str, ...] = ("_key", "password", "token") SENSITIVE_SUBSTRINGS: tuple[str, ...] 
= ("secret",) From 70eba9bba7ab0d2584b6b4335540d8f1f95c5b03 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 11:44:25 +0700 Subject: [PATCH 5/9] guardrails for presets, reconcile persona_lock_id FK drift --- backend/database/__init__.py | 3 +- backend/database/bootstrap.py | 32 ++ .../0027_rebuild_persona_lock_fks.py | 71 +++++ ...28_drop_vestigial_settings_model_config.py | 43 +++ backend/database/preset_schema.py | 16 +- backend/database/schema.py | 28 ++ backend/main.py | 12 + backend/presets.py | 270 ++++++++++++++-- .../test_preset_schema_coverage.py | 292 +++++++++++++++++- 9 files changed, 734 insertions(+), 33 deletions(-) create mode 100644 backend/database/migrations/0027_rebuild_persona_lock_fks.py create mode 100644 backend/database/migrations/0028_drop_vestigial_settings_model_config.py diff --git a/backend/database/__init__.py b/backend/database/__init__.py index eb5cb9a..0dae539 100644 --- a/backend/database/__init__.py +++ b/backend/database/__init__.py @@ -6,7 +6,7 @@ from __future__ import annotations -from .bootstrap import init_db, reset_to_defaults +from .bootstrap import init_db, reset_to_defaults, schema_safety_problems from .connection import DB_PATH, get_db from .queries.character_cards import ( create_character_card, @@ -198,6 +198,7 @@ "get_world", "get_world_by_name", "get_worlds", + "schema_safety_problems", "init_db", "insert_alternate_greeting_swipes", "insert_workflow_attachment_row", diff --git a/backend/database/bootstrap.py b/backend/database/bootstrap.py index 0655360..12e8b9c 100644 --- a/backend/database/bootstrap.py +++ b/backend/database/bootstrap.py @@ -1,6 +1,7 @@ from __future__ import annotations import json +import sqlite3 from .connection import get_db from .schema import CREATE_TABLES_SQL @@ -13,6 +14,37 @@ ) +def schema_safety_problems() -> list[str]: + """Return why the live DB schema is unsafe for the preset engine, or ``[]`` if safe.
+ + Call right after ``run_pending`` so any developer schema change the preset engine + cannot safely handle -- a new uncovered table, a stale policy constant, a migration + that leaves a table unlike ``CREATE_TABLES_SQL`` (the 0026 persona_lock_id / + 0008 vestigial-column class of bug) -- surfaces at boot, naming the constant or + migration to fix. + + Non-fatal by design: the check guards *preset/backup* operations, not normal app + queries, so a schema quirk must warn loudly rather than brick the whole app at + boot (a single missed cleanup migration would otherwise refuse every real + install's startup). The preset ops themselves still call + ``presets.assert_schema_safe`` and fail hard on the same problems, so no backup is + ever built or applied against an unsafe schema. + + ``backend.presets`` is imported lazily here because it pulls in the migration + runner and would otherwise close an import cycle through this package. ``DB_PATH`` + is read off the ``connection`` module at call time (not the import-time binding) + so a monkeypatched path in tests resolves correctly. + """ + from .. import presets + from . import connection + + conn = sqlite3.connect(connection.DB_PATH) + try: + return presets.schema_safety_problems(conn) + finally: + conn.close() + + async def init_db(): """Create the latest schema for fresh installs and seed empty tables. diff --git a/backend/database/migrations/0027_rebuild_persona_lock_fks.py b/backend/database/migrations/0027_rebuild_persona_lock_fks.py new file mode 100644 index 0000000..e5ca028 --- /dev/null +++ b/backend/database/migrations/0027_rebuild_persona_lock_fks.py @@ -0,0 +1,71 @@ +"""0027_rebuild_persona_lock_fks — give persona_lock_id a real foreign key on +databases migrated through 0026. 
+ +0026 added ``persona_lock_id`` to ``conversations`` and ``character_cards`` as a +bare ``INTEGER`` (an ALTER-added column cannot carry an enforced REFERENCES +clause), while fresh installs declare it +``INTEGER REFERENCES user_personas(id) ON DELETE SET NULL`` +(see backend/database/schema.py). The preset engine builds its merge/FK model +from the *live* ``PRAGMA foreign_key_list``, so on a migrated DB those columns +were invisible to the FK machinery: the merge copied lock ids verbatim instead +of remapping them through the personas id-map, and an export that drops the +configs domain never SET-NULLed them. This rebuilds the two tables to the +canonical DDL so the live schema matches a fresh install. + +Idempotent: a table whose ``persona_lock_id`` edge already exists (every fresh +install, and any DB already through 0027) is skipped. Run with foreign keys +OFF for the duration — the standard SQLite "other kinds of schema changes" +recipe — so dropping the old table neither cascades into ``messages`` nor trips +a constraint. Both tables have TEXT primary keys, so child references +(``messages.conversation_id`` …) keep resolving across the drop/rename. + +The rebuilt DDL is derived from ``schema.table_create_sql`` rather than pasted, +so this migration can never disagree with the schema-equivalence gate. 
+""" + +from __future__ import annotations + +import sqlite3 + +from backend.database import schema + +_TABLES = ("conversations", "character_cards") + + +def _has_persona_lock_fk(conn: sqlite3.Connection, table: str) -> bool: + # PRAGMA foreign_key_list row: (id, seq, parent_table, from, to, on_update, on_delete, match) + for row in conn.execute(f"PRAGMA foreign_key_list({table})").fetchall(): + if row[3] == "persona_lock_id" and row[2] == "user_personas": + return True + return False + + +def _rebuild(conn: sqlite3.Connection, table: str) -> None: + block = schema.table_create_sql(table) + new_ddl = block.replace(f"CREATE TABLE IF NOT EXISTS {table}", f"CREATE TABLE {table}_new", 1) + conn.execute(new_ddl) + new_cols = [r[1] for r in conn.execute(f"PRAGMA table_info({table}_new)").fetchall()] + old_cols = {r[1] for r in conn.execute(f"PRAGMA table_info({table})").fetchall()} + cols = ", ".join(c for c in new_cols if c in old_cols) + conn.execute(f"INSERT INTO {table}_new ({cols}) SELECT {cols} FROM {table}") + conn.execute(f"DROP TABLE {table}") + conn.execute(f"ALTER TABLE {table}_new RENAME TO {table}") + print(f"[migrations] 0027: rebuilt {table} with the persona_lock_id foreign key") + + +def migrate(conn: sqlite3.Connection) -> None: + # PRAGMA foreign_keys is a no-op inside a transaction, and DROP/RENAME under + # FK enforcement could cascade or fail; the runner has committed before this + # call, so close any stray transaction, flip FKs off for the rebuild, then + # restore the prior state. 
+ conn.commit() + had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] + conn.execute("PRAGMA foreign_keys=OFF") + try: + for table in _TABLES: + if not _has_persona_lock_fk(conn, table): + _rebuild(conn, table) + conn.commit() + finally: + if had_fk: + conn.execute("PRAGMA foreign_keys=ON") diff --git a/backend/database/migrations/0028_drop_vestigial_settings_model_config.py b/backend/database/migrations/0028_drop_vestigial_settings_model_config.py new file mode 100644 index 0000000..98b0fa1 --- /dev/null +++ b/backend/database/migrations/0028_drop_vestigial_settings_model_config.py @@ -0,0 +1,43 @@ +"""0028_drop_vestigial_settings_model_config — drop the dead +``settings.active_model_config_id`` column. + +Migration 0008 added ``active_model_config_id`` to ``settings`` for the original +single-active-model design. 0010 moved the active-model concept onto +``endpoints`` (``endpoints.active_model_config_id``), and the column was removed +from the fresh-install DDL (backend/database/schema.py) — but no migration ever +dropped it from databases that ran 0008, so every migrated install (including a +fresh one, which still runs 0008's ALTER) carries a vestigial, never-read +``settings.active_model_config_id``. + +That left the live schema diverging from ``CREATE_TABLES_SQL`` by exactly this +column. Dropping it reconciles the two so the fresh-vs-migrated schema-equivalence +gate (backend/presets.py ``assert_schema_safe``) holds. The column is confirmed +unreferenced by application code; only ``endpoints.active_model_config_id`` is used. + +Idempotent: skipped when the column is already absent (a DB never through 0008, or +already through 0028). ``ALTER TABLE … DROP COLUMN`` is the same mechanism migration +0016 uses; foreign keys are flipped off for the change since the column carries a +``REFERENCES model_configs(id)`` clause. 
+""" + +from __future__ import annotations + +import sqlite3 + + +def migrate(conn: sqlite3.Connection) -> None: + cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} + if "active_model_config_id" not in cols: + return + # PRAGMA foreign_keys is a no-op inside a transaction; the runner has committed + # before this call. Flip FKs off for the column drop, then restore prior state. + conn.commit() + had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] + conn.execute("PRAGMA foreign_keys=OFF") + try: + conn.execute("ALTER TABLE settings DROP COLUMN active_model_config_id") + conn.commit() + print("[migrations] 0028: dropped vestigial settings.active_model_config_id") + finally: + if had_fk: + conn.execute("PRAGMA foreign_keys=ON") diff --git a/backend/database/preset_schema.py b/backend/database/preset_schema.py index a7bedb8..e204612 100644 --- a/backend/database/preset_schema.py +++ b/backend/database/preset_schema.py @@ -15,15 +15,25 @@ secret-looking column that isn't accounted for, and names the constant to fix. Each section below opens with a "Touch when:" line saying exactly what to change. -Three edits the coverage test can NOT catch -- they corrupt presets *silently*: +Three edits that once corrupted presets *silently* -- each now has a dedicated +tripwire, so they fail loudly instead: * Renaming a domain value. Domains are baked into every exported file (``orb_preset_meta.included_domains``); a renamed domain no longer matches on import, so that data is silently skipped for every preset already out there. - Add domains freely; never rename one. + Add domains freely; never rename one. CAUGHT BY: a frozen-literal assertion on + ``presets.ALL_DOMAINS`` in the coverage test -- a rename fails CI; an addition + is a deliberate one-line test edit. * Parking a real data table in ``EXCLUDED_TABLES`` to quiet the test -- excluded tables are invisible to export *and* merge, so the data vanishes from backups. 
+ CAUGHT BY: a runtime tripwire in ``build_preset`` that raises if any excluded + table other than the meta/migration bookkeeping holds rows, plus a test that + every excluded data table is empty in the fresh schema. * Narrowing ``SENSITIVE_*`` to clear a flagged column -- declare the column in - ``SECRET_COLUMNS`` instead, or the secret ships in shared presets. + ``SECRET_COLUMNS`` instead, or the secret ships in shared presets. CAUGHT BY: + a secret-canary test that seeds a unique sentinel into every secret column, + exports without ``configs`` (and with ``strip_keys``), and greps the produced + file's raw bytes for any surviving canary -- a generic leak check, not just the + declared columns' happy path. """ from __future__ import annotations diff --git a/backend/database/schema.py b/backend/database/schema.py index 1ebd5dc..6965648 100644 --- a/backend/database/schema.py +++ b/backend/database/schema.py @@ -1,5 +1,7 @@ from __future__ import annotations +import re + CREATE_TABLES_SQL = """ CREATE TABLE IF NOT EXISTS settings ( id INTEGER PRIMARY KEY CHECK (id = 1), @@ -241,3 +243,29 @@ ); """ + + +def table_create_sql(table: str) -> str: + """Return the ``CREATE TABLE IF NOT EXISTS ( ... )`` block for *table*, + sliced out of ``CREATE_TABLES_SQL``. + + This is the single source of truth for a table's canonical fresh-install shape. + Rebuild migrations (e.g. 0027) and the schema-equivalence gate both derive the + canonical DDL from here rather than pasting a copy, so a rebuild can never drift + from the shape the equivalence check enforces. Parentheses are balanced (column + ``REFERENCES`` and ``CHECK`` clauses nest), so the block ends at the matching + close paren, not the first one. 
+ """ + m = re.search(rf"CREATE TABLE IF NOT EXISTS {re.escape(table)}\s*\(", CREATE_TABLES_SQL) + if not m: + raise KeyError(f"no CREATE TABLE block for {table!r} in CREATE_TABLES_SQL") + depth = 0 + for i in range(m.end() - 1, len(CREATE_TABLES_SQL)): + ch = CREATE_TABLES_SQL[i] + if ch == "(": + depth += 1 + elif ch == ")": + depth -= 1 + if depth == 0: + return CREATE_TABLES_SQL[m.start() : i + 1] + raise ValueError(f"unbalanced parentheses extracting {table!r} from CREATE_TABLES_SQL") diff --git a/backend/main.py b/backend/main.py index c874565..2eac4c1 100644 --- a/backend/main.py +++ b/backend/main.py @@ -21,6 +21,7 @@ from .database.models import ConversationRow from .database import ( DB_PATH, + schema_safety_problems, get_db, get_messages_before, init_db, @@ -196,6 +197,17 @@ async def _conversation_stream_lock(cid: str): async def lifespan(app: FastAPI): await init_db() run_pending(DB_PATH) + # Schema safety check for the preset/backup engine. Non-fatal at startup: it + # guards backup integrity, not normal queries, so a developer schema change that + # left the live schema uncovered or unlike a fresh install must warn loudly + # (naming the constant/migration to fix) rather than block boot. The preset ops + # themselves still call assert_schema_safe and fail hard on the same problems. 
+ problems = schema_safety_problems() + if problems: + logger.error( + "Preset/backup schema safety check failed; exports, snapshots and restores " + "will be refused until this is fixed:\n - " + "\n - ".join(problems) + ) logger.info("Database initialized") yield diff --git a/backend/presets.py b/backend/presets.py index 7d371cd..056d4bb 100644 --- a/backend/presets.py +++ b/backend/presets.py @@ -37,6 +37,7 @@ from .database import preset_schema as ps from .database.migrations import MIGRATIONS, run_pending +from .database.schema import CREATE_TABLES_SQL META_TABLE = "orb_preset_meta" @@ -195,13 +196,21 @@ def _topo_order(tables: dict[str, _Table]) -> tuple[list[str], set[tuple[str, st if f.is_self: deferred.add((t.name, f.from_col)) + # Iterate tables in a fixed (alphabetical) order, never sqlite_master's physical + # order. Both the emitted insert order and the cycle-break choice are then a pure + # function of the schema's *shape*, independent of the order tables were created + # in -- so a table rebuilt by a migration (which moves it to the end of + # sqlite_master) yields the identical model to a fresh install. The + # schema-equivalence gate relies on this determinism. + names = sorted(tables) placed: set[str] = set() order: list[str] = [] while len(placed) < len(tables): progressed = False - for name, t in tables.items(): + for name in names: if name in placed: continue + t = tables[name] unmet = any( f.parent in tables and f.parent not in placed and not f.is_self and (name, f.from_col) not in deferred for f in t.fks @@ -215,10 +224,10 @@ def _topo_order(tables: dict[str, _Table]) -> tuple[list[str], set[tuple[str, st # Stalled: a cycle remains. Break it by deferring one crossref edge whose # parent is still unplaced. 
broke = False - for name, t in tables.items(): + for name in names: if name in placed: continue - for f in t.fks: + for f in tables[name].fks: if f.kind == "crossref" and f.parent not in placed and (name, f.from_col) not in deferred: deferred.add((name, f.from_col)) broke = True @@ -269,9 +278,142 @@ def schema_coverage_problems(conn: sqlite3.Connection) -> list[str]: f"{table}.{col} is a deferred FK edge (inserted NULL, fixed up after) " f"but is declared NOT NULL; a merge would fail its constraint. Make it nullable." ) + + # Reverse direction: every hand-declared policy entry must still match the live + # schema. A stale entry (column dropped, table renamed) would otherwise surface + # only as a raw OperationalError mid-export, or be silently ignored. + known_domains = set(ps.DOMAIN_ROOTS.values()) + for root in ps.DOMAIN_ROOTS: + if root not in schema.tables: + problems.append( + f"DOMAIN_ROOTS key {root!r} is not an existing non-excluded table; " f"fix the name or drop the entry" + ) + continue + owner = schema.tables[root].owner_fk + if owner is not None: + problems.append( + f"DOMAIN_ROOTS key {root!r} is not a true root -- it is owned by " + f"{owner.parent!r} via {owner.from_col} (ON DELETE CASCADE); only roots may " + f"map to a domain, children inherit their root's" + ) + for table, col in ps.SECRET_COLUMNS: + if table not in schema.tables or col not in schema.tables[table].cols: + problems.append( + f"SECRET_COLUMNS entry ({table!r}, {col!r}) does not exist in the schema; " f"drop it or fix the name" + ) + for table, cols in ps.PRESERVED_COLUMNS.items(): + for col in cols: + if table not in schema.tables or col not in schema.tables[table].cols: + problems.append( + f"PRESERVED_COLUMNS entry ({table!r}, {col!r}) does not exist in the schema; " f"drop it or fix the name" + ) + for trigger, implied in ps.IMPLIED_DOMAINS.items(): + if trigger not in known_domains: + problems.append(f"IMPLIED_DOMAINS trigger {trigger!r} is not a known domain") + for dom in 
implied: + if dom not in known_domains: + problems.append(f"IMPLIED_DOMAINS implied domain {dom!r} (for trigger {trigger!r}) is not a known domain") return problems +def _edge_set(t: _Table) -> set[tuple]: + """A table's FK edges as comparable tuples (order-independent).""" + return {(f.from_col, f.parent, f.to_col, f.on_delete, f.notnull) for f in t.fks} + + +def schema_equivalence_problems(conn: sqlite3.Connection) -> list[str]: + """Return reasons the *live* schema diverges from a fresh install's, or []. + + The merge/FK model is read from the live database, so a migration that adds a + column or table in a shape that differs from ``CREATE_TABLES_SQL`` (the exact + 0026 persona_lock_id bug: an ALTER-added bare INTEGER where a fresh install has + an ``ON DELETE SET NULL`` FK) makes the engine silently mis-handle it. This + builds the same in-memory model from the live conn and from a throwaway + canonical DB and reports any per-table difference in columns, primary key, + classification, or FK-edge set, plus any difference in the deferred-edge set. + """ + live = _build_schema_model(conn) + ref = sqlite3.connect(":memory:") + try: + ref.executescript(CREATE_TABLES_SQL) + canon = _build_schema_model(ref) + finally: + ref.close() + + problems: list[str] = [] + live_names, canon_names = set(live.tables), set(canon.tables) + for name in sorted(canon_names - live_names): + problems.append(f"table {name!r} is in the canonical schema but missing from the live DB") + for name in sorted(live_names - canon_names): + problems.append(f"table {name!r} is in the live DB but not the canonical schema (CREATE_TABLES_SQL)") + + for name in sorted(live_names & canon_names): + lt, ct = live.tables[name], canon.tables[name] + # Compare column *sets*, not ordered lists: the merge engine names every + # column explicitly (never relies on position), and an ALTER-added column + # legitimately lands at a different ordinal on an old install than on a fresh + # one. 
A missing or extra column, by contrast, is a real merge hazard. + missing = set(ct.cols) - set(lt.cols) + extra = set(lt.cols) - set(ct.cols) + if missing: + problems.append(f"{name}: live is missing column(s) {sorted(missing)} present in the canonical schema") + if extra: + problems.append( + f"{name}: live has extra column(s) {sorted(extra)} absent from the canonical schema " + f"(a stale column a migration added but never dropped -> write a cleanup migration)" + ) + if lt.pk != ct.pk: + problems.append(f"{name}: primary key differs -- live {lt.pk} vs canonical {ct.pk}") + if lt.kind != ct.kind: + problems.append(f"{name}: merge kind differs -- live {lt.kind!r} vs canonical {ct.kind!r}") + live_edges, canon_edges = _edge_set(lt), _edge_set(ct) + for from_col, parent, to_col, on_delete, _nn in sorted(canon_edges - live_edges): + problems.append( + f"{name}.{from_col}: live has no matching FK, canonical has " + f"{parent}({to_col}) ON DELETE {on_delete} -> write a rebuild migration" + ) + for from_col, parent, to_col, on_delete, _nn in sorted(live_edges - canon_edges): + problems.append( + f"{name}.{from_col}: live has FK {parent}({to_col}) ON DELETE {on_delete} " f"absent from the canonical schema" + ) + + only_canon = canon.deferred - live.deferred + only_live = live.deferred - canon.deferred + if only_canon: + problems.append(f"deferred FK edges in the canonical schema but not live: {sorted(only_canon)}") + if only_live: + problems.append(f"deferred FK edges in the live schema but not canonical: {sorted(only_live)}") + return problems + + +def schema_safety_problems(conn: sqlite3.Connection) -> list[str]: + """Every reason the live schema is unsafe for the preset engine -- a policy gap + (coverage) or a fresh-vs-migrated divergence (equivalence) -- or ``[]`` if safe. 
+ + Split out from ``assert_schema_safe`` so startup can surface these as a loud, + non-fatal warning (the check guards backup integrity, not normal queries, so a + schema quirk must not brick the whole app at boot) while every preset *operation* + still fails hard on the identical list. + """ + return schema_coverage_problems(conn) + schema_equivalence_problems(conn) + + +def assert_schema_safe(conn: sqlite3.Connection) -> None: + """Hard gate: raise ``PresetError`` if the live schema is not fully covered by + the preset policy or diverges from a fresh install. + + Called at the top of every preset op (export/apply/snapshot/restore), where + mis-handling the schema would corrupt a backup. Only a developer schema change can + trip this; the message names the constant or migration to fix. Cheap enough (a + handful of PRAGMA reads plus one in-memory ``CREATE_TABLES_SQL``) to run on every + op. Startup uses the non-fatal ``schema_safety_problems`` instead, so a schema + quirk warns but never blocks boot. + """ + problems = schema_safety_problems(conn) + if problems: + raise PresetError("Preset schema safety check failed:\n - " + "\n - ".join(problems)) + + +# ── paths ─────────────────────────────────────────────────────────────────── @@ -395,6 +537,24 @@ def read_meta(path: str) -> dict | None: # ── export ────────────────────────────────────────────────────────────────── +def _assert_integrity(conn: sqlite3.Connection, what: str) -> None: + """Raise ``PresetError`` unless ``PRAGMA integrity_check`` reports ``ok``. + + Run on a file we just produced (VACUUM INTO) or are about to trust (a restore + target). A truncated or torn disk write yields a structurally broken database + that opens fine but is silently corrupt; this is the tripwire that stops such a + file from becoming the backup the user relies on.
+ """ + row = conn.execute("PRAGMA integrity_check").fetchone() + if not row or row[0] != "ok": + raise PresetError(f"Integrity check failed for {what}: {row[0] if row else 'no result'}") + + +# Excluded tables that may legitimately hold rows (bookkeeping, not domain data). +# Every *other* excluded table must stay empty, or its data would ship in no backup. +_EXCLUDED_MAY_HAVE_ROWS: frozenset[str] = frozenset({META_TABLE, "schema_migrations"}) + + def _scrub_configs(conn: sqlite3.Connection, schema: _Schema) -> None: """Strip personal config + secrets when 'configs' is not exported. @@ -433,6 +593,7 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: tmp = os.path.join(_snapshots_dir(), f".build-{os.getpid()}-{datetime.datetime.now():%H%M%S%f}.tmp") src = sqlite3.connect(_db_path()) try: + assert_schema_safe(src) src.execute("VACUUM INTO ?", (tmp,)) finally: src.close() @@ -440,8 +601,24 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: keys_stripped = False c = sqlite3.connect(tmp, isolation_level=None) try: + _assert_integrity(c, "the exported preset clone") c.execute("PRAGMA foreign_keys=ON") schema = _build_schema_model(c) + # Tripwire: an excluded table that carries data would be invisible to both + # export and merge -- its rows would silently never be backed up. The only + # excluded data table is message_attachments, empty by invariant post-0020; + # this fails loudly the day someone parks a live table in EXCLUDED_TABLES. + for tbl in ps.EXCLUDED_TABLES: + if tbl in _EXCLUDED_MAY_HAVE_ROWS: + continue + if not c.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (tbl,)).fetchone(): + continue + if c.execute(f"SELECT 1 FROM {tbl} LIMIT 1").fetchone(): + raise PresetError( + f"Excluded table {tbl!r} has rows but is invisible to export and merge; " + f"its data would silently never be backed up. Give its root a domain in " + f"DOMAIN_ROOTS, or confirm it must stay excluded." 
+ ) # Prune each unselected domain by deleting its root tables: with FK on, a # CASCADE prunes the owned children and a SET NULL clears soft pointers, so # no per-child delete is hand-coded. configs is special (it scrubs the @@ -462,6 +639,7 @@ def build_preset(selected_domains, strip_keys: bool, label: str = "") -> str: _stamp_migrations(c) _write_meta(c, sorted(selected), label, kind, keys_stripped) c.execute("VACUUM") # reclaim pages freed by the deletes + _assert_integrity(c, "the exported preset") finally: c.close() @@ -766,35 +944,53 @@ def apply_preset(preset_path: str, *, replace: bool = False) -> dict: With ``replace=True`` (the partial-restore path) each covered domain is emptied before its merge, so the domain ends up exactly matching the file rather than merged into existing rows; domains the file doesn't carry are - left untouched.""" - check_and_upgrade(preset_path) - included = set(preset_domains(preset_path)) + left untouched. - conn = sqlite3.connect(_db_path(), isolation_level=None) - summary: dict[str, int] = {} + The stored library file is never written: we validate + upgrade + ATTACH a + throwaway ``.``-prefixed copy (which ``list_library`` ignores), so a buggy + migration on ingest can never corrupt the user's backup. ``restore_full`` does + the same with its own temp copy. + """ + work = os.path.join(_snapshots_dir(), f".apply-{os.getpid()}-{datetime.datetime.now():%H%M%S%f}.tmp") + shutil.copyfile(preset_path, work) try: - conn.execute("PRAGMA foreign_keys=OFF") - conn.execute("ATTACH DATABASE ? 
AS preset", (preset_path,)) - conn.execute("BEGIN") - summary = _merge(conn, included, replace) - problems = conn.execute("PRAGMA foreign_key_check").fetchall() - if problems: - conn.execute("ROLLBACK") - raise PresetError(f"Import would corrupt foreign keys ({len(problems)} violations); aborted.") - conn.execute("COMMIT") - except Exception: + check_and_upgrade(work) # quick_check + validate + migrate, all on the copy + included = set(preset_domains(work)) + + conn = sqlite3.connect(_db_path(), isolation_level=None) + summary: dict[str, int] = {} try: - conn.execute("ROLLBACK") - except sqlite3.OperationalError: - pass - raise + assert_schema_safe(conn) + conn.execute("PRAGMA foreign_keys=OFF") + conn.execute("ATTACH DATABASE ? AS preset", (work,)) + conn.execute("BEGIN") + summary = _merge(conn, included, replace) + problems = conn.execute("PRAGMA foreign_key_check").fetchall() + if problems: + conn.execute("ROLLBACK") + raise PresetError(f"Import would corrupt foreign keys ({len(problems)} violations); aborted.") + conn.execute("COMMIT") + except Exception: + try: + conn.execute("ROLLBACK") + except sqlite3.OperationalError: + pass + raise + finally: + try: + conn.execute("DETACH DATABASE preset") + except sqlite3.OperationalError: + pass + conn.close() + return summary finally: - try: - conn.execute("DETACH DATABASE preset") - except sqlite3.OperationalError: - pass - conn.close() - return summary + for sfx in ("", "-wal", "-shm"): + p = work + sfx + if os.path.exists(p): + try: + os.remove(p) + except OSError: + pass def restore_partial(preset_path: str) -> dict: @@ -816,11 +1012,13 @@ def create_snapshot(label: str = "") -> str: dest = os.path.join(_snapshots_dir(), name) src = sqlite3.connect(_db_path()) try: + assert_schema_safe(src) src.execute("VACUUM INTO ?", (dest,)) finally: src.close() c = sqlite3.connect(dest, isolation_level=None) try: + _assert_integrity(c, "the snapshot") _stamp_migrations(c) _write_meta(c, ALL_DOMAINS, label, "auto", False) 
finally: @@ -872,6 +1070,16 @@ def restore_full(name: str) -> None: finally: conn.close() run_pending(tmp) + # The temp copy is about to become the live DB; refuse a structurally + # broken or FK-inconsistent file rather than swapping it in. + chk = sqlite3.connect(tmp) + try: + _assert_integrity(chk, "the restore target") + fk = chk.execute("PRAGMA foreign_key_check").fetchall() + if fk: + raise PresetError(f"Restore target has {len(fk)} foreign-key violations; aborted.") + finally: + chk.close() os.replace(tmp, live) except BaseException: if os.path.exists(tmp): @@ -948,6 +1156,12 @@ def check_and_upgrade(path: str) -> None: schema. Rejects files produced by a newer Orb build.""" conn = sqlite3.connect(path) try: + # quick_check on the upload before we trust it enough to migrate: a torn + # or tampered file that still opens must be rejected, not run through + # run_pending (which would write into a corrupt database). + qc = conn.execute("PRAGMA quick_check").fetchone() + if not qc or qc[0] != "ok": + raise PresetError(f"Uploaded file failed its integrity check: {qc[0] if qc else 'no result'}") tables = {r[0] for r in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()} if "settings" not in tables: raise PresetError("Not an Orb database file.") diff --git a/tests/integration/test_preset_schema_coverage.py b/tests/integration/test_preset_schema_coverage.py index 433166f..b5eac33 100644 --- a/tests/integration/test_preset_schema_coverage.py +++ b/tests/integration/test_preset_schema_coverage.py @@ -10,11 +10,17 @@ from __future__ import annotations +import importlib import sqlite3 +import pytest + from backend import presets +from backend.database import schema from backend.database.schema import CREATE_TABLES_SQL +_mig_0027 = importlib.import_module("backend.database.migrations.0027_rebuild_persona_lock_fks") + def _fresh_schema_db(tmp_path, extra_sql: str = "") -> sqlite3.Connection: """An in-memory-equivalent DB with the current 
fresh-install schema (+extras).""" @@ -117,6 +123,153 @@ def test_coverage_flags_a_not_null_deferred_edge(tmp_path): conn.close() +def test_domain_list_is_frozen(): + """Domains are baked into every exported file's meta, so renaming one silently + breaks import for every preset already out there. This frozen literal turns a + rename into a CI failure; *adding* a domain is a deliberate one-line edit here + (append only).""" + assert presets.ALL_DOMAINS == ["characters", "chats", "configs", "fragments", "lorebooks", "phrase_bank"] + + +# ── reverse policy validation (a stale/typo'd constant must be caught) ─────────── + + +def test_coverage_flags_a_non_root_domain_key(tmp_path): + """A DOMAIN_ROOTS key that is actually an owned child (not a true root) must be + reported -- children inherit their root's domain, they cannot declare one.""" + conn = _fresh_schema_db(tmp_path) + try: + # messages is owned by conversations via ON DELETE CASCADE -> not a root. + monkey = dict(presets.ps.DOMAIN_ROOTS, messages="chats") + orig = presets.ps.DOMAIN_ROOTS + presets.ps.DOMAIN_ROOTS = monkey + try: + problems = presets.schema_coverage_problems(conn) + finally: + presets.ps.DOMAIN_ROOTS = orig + assert any("messages" in p and "not a true root" in p for p in problems), problems + finally: + conn.close() + + +def test_coverage_flags_a_stale_secret_column(tmp_path): + """A SECRET_COLUMNS entry whose column no longer exists must be reported, not + surface later as a raw OperationalError mid-export.""" + conn = _fresh_schema_db(tmp_path) + try: + monkey = dict(presets.ps.SECRET_COLUMNS) + monkey[("settings", "ghost_token")] = "" + orig = presets.ps.SECRET_COLUMNS + presets.ps.SECRET_COLUMNS = monkey + try: + problems = presets.schema_coverage_problems(conn) + finally: + presets.ps.SECRET_COLUMNS = orig + assert any("ghost_token" in p for p in problems), problems + finally: + conn.close() + + +# ── fresh-vs-migrated equivalence (the 0026 class of bug) ──────────────────────── + + +def 
_strip_persona_lock_fk(conn: sqlite3.Connection, table: str) -> None: + """Rebuild *table* with persona_lock_id as a bare INTEGER, mimicking a database + migrated through 0026 but not yet 0027 (the silent-corruption shape).""" + block = schema.table_create_sql(table).replace(" REFERENCES user_personas(id) ON DELETE SET NULL", "") + block = block.replace(f"CREATE TABLE IF NOT EXISTS {table}", f"CREATE TABLE {table}_old", 1) + conn.execute("PRAGMA foreign_keys=OFF") + conn.execute(block) + cols = ",".join(r[1] for r in conn.execute(f"PRAGMA table_info({table}_old)")) + conn.execute(f"INSERT INTO {table}_old ({cols}) SELECT {cols} FROM {table}") + conn.execute(f"DROP TABLE {table}") + conn.execute(f"ALTER TABLE {table}_old RENAME TO {table}") + + +def test_fresh_vs_migrated_equivalence_and_0027_repair(tmp_path): + """The runtime gate must flag the pre-0027 persona_lock_id divergence (the exact + 0026 bug: an ALTER-added bare INTEGER where a fresh install has an FK), and + migration 0027 must repair it so the live schema equals a fresh install again.""" + conn = _fresh_schema_db(tmp_path) + try: + for table in ("conversations", "character_cards"): + _strip_persona_lock_fk(conn, table) + conn.commit() + + # The gate names the divergence before 0027 runs. + before = presets.schema_equivalence_problems(conn) + assert any("conversations.persona_lock_id" in p for p in before), before + assert any("character_cards.persona_lock_id" in p for p in before), before + with pytest.raises(presets.PresetError): + presets.assert_schema_safe(conn) + + # 0027 rebuilds both tables; the live schema then matches the canonical one. 
+ _mig_0027.migrate(conn) + assert presets.schema_equivalence_problems(conn) == [] + for table in ("conversations", "character_cards"): + assert _mig_0027._has_persona_lock_fk(conn, table) + presets.assert_schema_safe(conn) # no longer raises + finally: + conn.close() + + +def test_schema_safety_problems_is_non_fatal_but_preset_ops_stay_fatal(tmp_path): + """The startup gate must not brick the app: ``schema_safety_problems`` reports the + same divergence ``assert_schema_safe`` raises on, but returns it as a list instead + of throwing -- so a schema quirk warns at boot while every preset op still fails + hard on the identical problems.""" + conn = _fresh_schema_db(tmp_path) + try: + _strip_persona_lock_fk(conn, "conversations") + conn.commit() + + # Non-fatal collector: returns the problems, never raises. + problems = presets.schema_safety_problems(conn) + assert any("conversations.persona_lock_id" in p for p in problems), problems + + # The hard gate used by export/apply/snapshot raises on the same list. + with pytest.raises(presets.PresetError) as exc: + presets.assert_schema_safe(conn) + assert "conversations.persona_lock_id" in str(exc.value) + + # A clean schema yields no problems and the gate is silent. + clean_dir = tmp_path / "clean" + clean_dir.mkdir() + clean = _fresh_schema_db(clean_dir) + try: + assert presets.schema_safety_problems(clean) == [] + presets.assert_schema_safe(clean) # must not raise + finally: + clean.close() + finally: + conn.close() + + +def test_fully_migrated_fresh_install_satisfies_gate(tmp_path): + """A real fresh install runs CREATE_TABLES_SQL *then every migration*, so the + fully-migrated schema -- not raw CREATE_TABLES_SQL -- is what production boots + with. It must satisfy the schema-safety gate. 
This is the integration guard that + fails the day a migration leaves the live schema unlike CREATE_TABLES_SQL (a + missing FK like 0026, a stale column like 0008's settings.active_model_config_id) + -- a class the equivalence gate exists to stop reaching production.""" + from backend.database.migrations import run_pending + + db = tmp_path / "fresh.db" + conn = sqlite3.connect(str(db)) + conn.executescript(CREATE_TABLES_SQL) + conn.commit() + conn.close() + run_pending(str(db)) + + conn = sqlite3.connect(str(db)) + try: + assert presets.schema_equivalence_problems(conn) == [] + assert presets.schema_coverage_problems(conn) == [] + presets.assert_schema_safe(conn) # must not raise + finally: + conn.close() + + # ── merge regressions (PR #90 audit) ──────────────────────────────────────────── @@ -276,13 +429,48 @@ def _insert_conv_tree(path: str, cid: str, persona_id: int | None) -> None: conn.close() +# The tables the round-trip's _signature() actually reads (declared explicitly so a +# new table can't silently drop out of round-trip coverage -- see +# test_signature_covers_every_domain_table). +SIGNATURE_TABLES = frozenset( + { + "character_cards", + "user_personas", + "conversations", + "messages", + "director_state", + "worlds", + "lorebook_entries", + "phrase_bank", + "mood_fragments", + "interactive_fragments", + } +) + +# Tables the round-trip deliberately does NOT signature-compare. Documented so the +# self-coverage test forces a conscious choice for every new table. +SIGNATURE_ALLOWLIST = frozenset( + { + # configs domain: round-trip asserts its presence via the summary, not content. + "settings", + "endpoints", + "model_configs", + # pure log / attachment tables: not part of any domain's user-facing identity. + "conversation_logs", + "user_attachments", + "workflow_attachments", + } +) + + def _signature(path: str) -> dict: """Canonical, surrogate-id-independent content of every data domain. 
Surrogate ids (messages, personas, …) are never compared directly; references to them are resolved to the parent's portable identity (a persona's name, a leaf message's content) so two databases that differ only by autoincrement - renumbering produce the same signature. + renumbering produce the same signature. The tables read here are pinned by + ``SIGNATURE_TABLES`` and checked against the live schema below. """ conn = sqlite3.connect(path) conn.row_factory = sqlite3.Row @@ -308,11 +496,29 @@ def q(sql): "personas": q("SELECT name, description FROM user_personas"), "phrase_bank": q("SELECT variants, kind, pattern FROM phrase_bank"), "fragments": q("SELECT id, label FROM mood_fragments"), + "interactive_fragments": q("SELECT id, label FROM interactive_fragments"), } finally: conn.close() +def test_signature_covers_every_domain_table(tmp_path): + """Self-coverage: the tables _signature() reads, plus a documented allowlist of + tables it deliberately skips, must partition the whole schema. 
Adding a table + then fails this until the developer either extends _signature or consciously + allowlists it -- a new table can never silently drop out of round-trip coverage.""" + conn = _fresh_schema_db(tmp_path) + try: + all_tables = set(presets._build_schema_model(conn).tables) + finally: + conn.close() + assert SIGNATURE_TABLES & SIGNATURE_ALLOWLIST == set(), "a table is both signatured and allowlisted" + assert SIGNATURE_TABLES | SIGNATURE_ALLOWLIST == all_tables, { + "unaccounted (extend _signature or allowlist)": all_tables - SIGNATURE_TABLES - SIGNATURE_ALLOWLIST, + "stale (not in schema)": (SIGNATURE_TABLES | SIGNATURE_ALLOWLIST) - all_tables, + } + + async def test_full_round_trip_is_identity_modulo_surrogate_ids(client, db_path): """Seed every domain, export a full preset, scramble the live DB, then apply the file with replace=True: the database must come back row-for-row identical @@ -381,3 +587,87 @@ async def test_full_round_trip_is_identity_modulo_surrogate_ids(client, db_path) assert conn.execute("PRAGMA foreign_key_check").fetchall() == [] finally: conn.close() + + +# ── excluded-table tripwires (data must never hide in EXCLUDED_TABLES) ──────────── + + +def test_excluded_data_tables_are_empty_in_fresh_schema(tmp_path): + """Every excluded table other than the meta/migration bookkeeping must be empty + on a fresh install -- excluded tables are invisible to export and merge, so any + rows they carry would silently never be backed up.""" + conn = _fresh_schema_db(tmp_path) + try: + for tbl in presets.ps.EXCLUDED_TABLES: + if tbl in presets._EXCLUDED_MAY_HAVE_ROWS: + continue + if not conn.execute("SELECT 1 FROM sqlite_master WHERE type='table' AND name=?", (tbl,)).fetchone(): + continue + assert conn.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0] == 0, tbl + finally: + conn.close() + + +async def test_build_preset_rejects_rows_in_excluded_table(client, db_path): + """Runtime tripwire: parking real data in an excluded table must fail the 
export + loudly rather than ship a backup that silently omits it.""" + import asyncio + + path = str(db_path) + await client.post("/api/characters", json={"name": "Keep"}) + seed = sqlite3.connect(path) + try: + seed.execute("PRAGMA foreign_keys=OFF") # message_attachments.message_id NOT NULL; we only need a row to exist + seed.execute( + "INSERT INTO message_attachments (message_id, mime_type, data_b64, created_at) " + "VALUES (1, 'image/png', 'AAA', '2024-01-01')" + ) + seed.commit() + finally: + seed.close() + + with pytest.raises(presets.PresetError) as exc: + await asyncio.to_thread(presets.build_preset, ["characters"], False) + assert "message_attachments" in str(exc.value) + + +# ── secret-canary leak sentinel ────────────────────────────────────────────────── + + +async def test_no_secret_canary_leaks_in_exports(client, db_path): + """Seed a unique sentinel into every SECRET_COLUMNS column, then prove no leak + path ships it: (a) any single domain exported without ``configs`` must contain + no sentinel at all; (b) a full export with ``strip_keys`` must contain no + *api_key* sentinel. A future leak fails this generically, not just for the + declared columns' happy path.""" + path = str(db_path) + + def canary(table: str, col: str) -> bytes: + return f"LEAK-CANARY-{table}-{col}".encode() + + seed = sqlite3.connect(path) + try: + for table, col in presets.ps.SECRET_COLUMNS: + seed.execute(f"UPDATE {table} SET {col} = ?", (canary(table, col).decode(),)) + seed.commit() + finally: + seed.close() + + all_canaries = [canary(t, c) for (t, c) in presets.ps.SECRET_COLUMNS] + api_key_canaries = [canary(t, c) for (t, c) in presets.ps.SECRET_COLUMNS if c == "api_key"] + + # (a) every single domain that does NOT pull in configs -> nothing personal ships. 
+ non_configs = [d for d in presets.ALL_DOMAINS if d != "configs"] + for domain in non_configs: + name = (await client.post("/api/presets/export", json={"domains": [domain], "strip_keys": False})).json()["name"] + blob = open(presets._library_path(name), "rb").read() + leaked = [c.decode() for c in all_canaries if c in blob] + assert leaked == [], (domain, leaked) + + # (b) full export with strip_keys -> only the api_key sentinels must be gone. + name = (await client.post("/api/presets/export", json={"domains": list(presets.ALL_DOMAINS), "strip_keys": True})).json()[ + "name" + ] + blob = open(presets._library_path(name), "rb").read() + leaked_keys = [c.decode() for c in api_key_canaries if c in blob] + assert leaked_keys == [], leaked_keys From 7dceb0d4d5c518da0f2971c62644a030b61e1e80 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 13:02:31 +0700 Subject: [PATCH 6/9] fix vestigial columns detected by previous railguard --- ...29_drop_vestigial_settings_active_agent.py | 53 +++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 backend/database/migrations/0029_drop_vestigial_settings_active_agent.py diff --git a/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py b/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py new file mode 100644 index 0000000..fd5b294 --- /dev/null +++ b/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py @@ -0,0 +1,53 @@ +"""0029_drop_vestigial_settings_active_agent — drop the dead +``settings.active_agent_endpoint_id`` and ``settings.active_agent_model_config_id`` +columns. + +An early version of the agent-endpoint feature (later rewritten into what is now +migration 0013) added an *active agent* pointer pair directly to ``settings``: +``active_agent_endpoint_id`` (REFERENCES endpoints) and +``active_agent_model_config_id`` (REFERENCES model_configs). 
The feature was then +redesigned: the writer/agent split moved onto ``endpoints`` +(``endpoints.agent_active_model_config_id``) and ``settings`` kept only +``agent_endpoint_id`` / ``agent_same_as_writer``. The current 0013 adds that final +shape, and the fresh-install DDL (backend/database/schema.py) never carried the +``active_agent_*`` pair — but no migration ever dropped them from databases that +ran the old 0013, so those installs carry two vestigial, never-read columns. + +That left the live schema diverging from ``CREATE_TABLES_SQL`` by exactly these two +columns (and their FKs), tripping the fresh-vs-migrated schema-equivalence gate +(backend/presets.py ``assert_schema_safe``). Dropping them reconciles the two. Both +columns are confirmed unreferenced by application code; only the surviving +``settings.agent_endpoint_id`` and ``endpoints.agent_active_model_config_id`` are +used. + +Idempotent: each column is skipped when already absent (a DB that ran only the +current 0013, or one already through 0029). ``ALTER TABLE … DROP COLUMN`` is the +same mechanism migrations 0016 and 0028 use; foreign keys are flipped off for the +change since each column carries a ``REFERENCES`` clause. +""" + +from __future__ import annotations + +import sqlite3 + +_VESTIGIAL = ("active_agent_endpoint_id", "active_agent_model_config_id") + + +def migrate(conn: sqlite3.Connection) -> None: + cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} + to_drop = [c for c in _VESTIGIAL if c in cols] + if not to_drop: + return + # PRAGMA foreign_keys is a no-op inside a transaction; the runner has committed + # before this call. Flip FKs off for the column drops, then restore prior state. 
+ conn.commit() + had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] + conn.execute("PRAGMA foreign_keys=OFF") + try: + for col in to_drop: + conn.execute(f"ALTER TABLE settings DROP COLUMN {col}") + conn.commit() + print(f"[migrations] 0029: dropped vestigial settings.{col}") + finally: + if had_fk: + conn.execute("PRAGMA foreign_keys=ON") From 4eb74a39e7f96ee619e7a51c0829a4a3a2575249 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 20:45:06 +0700 Subject: [PATCH 7/9] fix vestigial columns detected by previous railguard 2/? --- ...ial_voice_profiles_and_feedback_columns.py | 60 +++++++++++++++++++ 1 file changed, 60 insertions(+) create mode 100644 backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py diff --git a/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py b/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py new file mode 100644 index 0000000..278faf5 --- /dev/null +++ b/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py @@ -0,0 +1,60 @@ +"""0030_drop_vestigial_voice_profiles_and_feedback_columns — reconcile two pieces of +schema drift the fresh-install DDL (backend/database/schema.py) never carried, so a +migrated DB stops diverging from ``CREATE_TABLES_SQL``. + +Both are the same class of bug 0028/0029 cleaned up: a migration left an artefact +behind that no later migration dropped, tripping the fresh-vs-migrated +schema-equivalence gate (backend/presets.py ``assert_schema_safe``) and so refusing +every preset export/snapshot/restore. + +1. ``voice_profiles`` table. Added by 0015 (legacy TTS storage), then ported into + ``character_cards.workflow_state["tts"]`` + ``settings.workflow_config["tts"]`` and + dropped by 0020 (``_port_tts``). Fresh installs never create it. 
But some databases + reached 0020 with the table already empty / its rows already ported and the legacy + ``settings.tts_*`` columns already gone, and came out the far side still carrying an + orphaned, never-read ``voice_profiles`` table. It is dropped here only when empty: + on any DB that reaches 0030, 0020 has already run, so any real rows were ported + long ago; a non-empty table would mean un-ported data, so we leave it for a human + rather than silently lose it (the equivalence gate keeps complaining, which is the + intended loud signal). + +2. ``conversation_logs.reasoning_feedback`` and ``conversation_logs.feedback_latency_ms``. + An early cut of the feedback sub-step (final form: migration 0024) gave feedback its + own ``reasoning_feedback`` / ``feedback_latency_ms`` columns; that was consolidated + into the single ``feedback`` JSON column 0024 actually ships, and 0024 explicitly + does *not* add the two split columns. Databases that ran the early 0024 keep them as + dead, never-read columns. They are plain TEXT/INTEGER with no FK, so a straight + ``ALTER TABLE … DROP COLUMN`` suffices. + +Idempotent: the table drop is skipped when the table is already absent (fresh installs, +or a DB already through 0030), and each column drop is skipped when already absent. +``ALTER TABLE … DROP COLUMN`` is the same mechanism 0016/0028/0029 use. +""" + +from __future__ import annotations + +import sqlite3 + +_STALE_LOG_COLUMNS = ("reasoning_feedback", "feedback_latency_ms") + + +def migrate(conn: sqlite3.Connection) -> None: + tables = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()} + if "voice_profiles" in tables: + rows = conn.execute("SELECT COUNT(*) FROM voice_profiles").fetchone()[0] + if rows == 0: + conn.execute("DROP TABLE voice_profiles") + print("[migrations] 0030: dropped vestigial empty voice_profiles table") + else: + # Un-ported rows: refuse to drop and lose data. 
The equivalence gate stays + # red on purpose so this surfaces for a human instead of vanishing. + print( + f"[migrations] 0030: voice_profiles has {rows} row(s); leaving it in place " + f"(0020 should have ported and dropped it — investigate before dropping)" + ) + + log_cols = {row[1] for row in conn.execute("PRAGMA table_info(conversation_logs)").fetchall()} + for col in _STALE_LOG_COLUMNS: + if col in log_cols: + conn.execute(f"ALTER TABLE conversation_logs DROP COLUMN {col}") + print(f"[migrations] 0030: dropped vestigial conversation_logs.{col}") From b30f312f79d1f46fbb4220482530f6e8f8c81a69 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 20:55:57 +0700 Subject: [PATCH 8/9] fix vestigial columns detected by previous railguard 3/? --- ...031_drop_vestigial_tts_scripter_columns.py | 40 +++++++++++++++++++ 1 file changed, 40 insertions(+) create mode 100644 backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py diff --git a/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py b/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py new file mode 100644 index 0000000..b2dde4d --- /dev/null +++ b/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py @@ -0,0 +1,40 @@ +"""0031_drop_vestigial_tts_scripter_columns — drop the dead +``settings.tts_scripter_enabled`` and ``settings.tts_scripter_prompt`` columns. + +The detached LLM speech-scripter feature (added by 84bf39e "feat(tts): add voice +sidepanel controls") put these two columns on ``settings`` via the then-monolithic +``backend/database.py`` init path: fresh installs got them in the CREATE TABLE and +existing installs via inline ``ALTER TABLE … ADD COLUMN``. 
The feature was removed +shortly after (16a4288 "refactor(tts): remove detached LLM speech scripter"), which +deleted the DDL and the ALTERs — but no migration ever dropped the columns from +databases that booted in that window, so those installs carry two vestigial, +never-read columns. + +Same class as 0028/0029/0030: the live schema diverges from ``CREATE_TABLES_SQL`` +by exactly these columns, tripping the fresh-vs-migrated schema-equivalence gate +(backend/presets.py ``assert_schema_safe``) and refusing every preset +export/snapshot/restore. Found by simulating a fresh install at every historical +DDL version and migrating it to HEAD; this was the one residual divergence. + +Both columns are confirmed unreferenced by application code (the scripter's runtime +read them from a settings dict that no longer contains them). Plain INTEGER/TEXT, +no FK clause, so a straight ``ALTER TABLE … DROP COLUMN`` suffices — no FK toggle +needed, unlike 0029. + +Idempotent: each column is skipped when already absent (fresh installs, installs +that never booted in the scripter window, or a DB already through 0031). 
+""" + +from __future__ import annotations + +import sqlite3 + +_VESTIGIAL = ("tts_scripter_enabled", "tts_scripter_prompt") + + +def migrate(conn: sqlite3.Connection) -> None: + cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} + for col in _VESTIGIAL: + if col in cols: + conn.execute(f"ALTER TABLE settings DROP COLUMN {col}") + print(f"[migrations] 0031: dropped vestigial settings.{col}") From f330c2551ca3f870b7bf593e0849e2ab84184032 Mon Sep 17 00:00:00 2001 From: Chi Date: Fri, 12 Jun 2026 21:01:34 +0700 Subject: [PATCH 9/9] merge drop vestigial migration scripts --- .../0028_drop_vestigial_schema_artifacts.py | 91 +++++++++++++++++++ ...28_drop_vestigial_settings_model_config.py | 43 --------- ...29_drop_vestigial_settings_active_agent.py | 53 ----------- ...ial_voice_profiles_and_feedback_columns.py | 60 ------------ ...031_drop_vestigial_tts_scripter_columns.py | 40 -------- 5 files changed, 91 insertions(+), 196 deletions(-) create mode 100644 backend/database/migrations/0028_drop_vestigial_schema_artifacts.py delete mode 100644 backend/database/migrations/0028_drop_vestigial_settings_model_config.py delete mode 100644 backend/database/migrations/0029_drop_vestigial_settings_active_agent.py delete mode 100644 backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py delete mode 100644 backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py diff --git a/backend/database/migrations/0028_drop_vestigial_schema_artifacts.py b/backend/database/migrations/0028_drop_vestigial_schema_artifacts.py new file mode 100644 index 0000000..fc92a1d --- /dev/null +++ b/backend/database/migrations/0028_drop_vestigial_schema_artifacts.py @@ -0,0 +1,91 @@ +"""0028_drop_vestigial_schema_artifacts — drop every table/column an earlier build +left behind that the fresh-install DDL (backend/database/schema.py) never carried, +so a migrated DB stops diverging from ``CREATE_TABLES_SQL``. 
+ +All four artefact groups are the same class of bug: a feature (or an early cut of +one) shipped schema via a since-rewritten migration or the old inline ``init_db`` +path, the feature was removed or redesigned, and nothing dropped the leftovers from +databases that booted in the window. Each tripped the fresh-vs-migrated +schema-equivalence gate (backend/presets.py ``assert_schema_safe``), refusing every +preset export/snapshot/restore. The full inventory, found by fresh-installing every +historical DDL version in git history and migrating it to HEAD: + +1. ``settings.active_model_config_id`` — superseded when the active-model pointer + moved to ``endpoints.active_model_config_id`` (migration 0010); the old + settings-level pointer was never read again and never dropped. +2. ``settings.active_agent_endpoint_id`` / ``settings.active_agent_model_config_id`` + — an early version of the agent-endpoint feature (later rewritten into what is + now 0013) put this pointer pair on ``settings``; the redesign kept only + ``settings.agent_endpoint_id`` + ``endpoints.agent_active_model_config_id``. +3. ``voice_profiles`` table and ``conversation_logs.reasoning_feedback`` / + ``conversation_logs.feedback_latency_ms`` — legacy TTS storage (0015) ported and + dropped by 0020, but re-created by bootstrap while the table was still in the + then-current DDL; and an early cut of the feedback sub-step whose split columns + 0024 consolidated into the single ``feedback`` JSON column. +4. ``settings.tts_scripter_enabled`` / ``settings.tts_scripter_prompt`` — the + detached LLM speech scripter (84bf39e), removed by 16a4288, which deleted the + DDL and inline ALTERs but not the columns already on disk. 
+ +``voice_profiles`` is dropped only when empty: on any DB that reaches 0028, 0020 +has already run, so any real rows were ported long ago; a non-empty table would +mean un-ported data, so we leave it for a human rather than silently lose it (the +equivalence gate keeps complaining, which is the intended loud signal). + +Idempotent: every drop is skipped when the table/column is already absent (fresh +installs, or a DB already through 0028). ``ALTER TABLE … DROP COLUMN`` is the same +mechanism migration 0016 uses; foreign keys are flipped off for the ``settings`` +column drops since several carry a ``REFERENCES`` clause. +""" + +from __future__ import annotations + +import sqlite3 + +_VESTIGIAL_SETTINGS_COLUMNS = ( + "active_model_config_id", + "active_agent_endpoint_id", + "active_agent_model_config_id", + "tts_scripter_enabled", + "tts_scripter_prompt", +) +_VESTIGIAL_LOG_COLUMNS = ("reasoning_feedback", "feedback_latency_ms") + + +def migrate(conn: sqlite3.Connection) -> None: + settings_cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} + to_drop = [c for c in _VESTIGIAL_SETTINGS_COLUMNS if c in settings_cols] + if to_drop: + # PRAGMA foreign_keys is a no-op inside a transaction; the runner has + # committed before this call. Flip FKs off for the column drops (several + # carry a REFERENCES clause), then restore prior state. 
+ conn.commit() + had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] + conn.execute("PRAGMA foreign_keys=OFF") + try: + for col in to_drop: + conn.execute(f"ALTER TABLE settings DROP COLUMN {col}") + conn.commit() + print(f"[migrations] 0028: dropped vestigial settings.{col}") + finally: + if had_fk: + conn.execute("PRAGMA foreign_keys=ON") + + tables = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()} + if "voice_profiles" in tables: + rows = conn.execute("SELECT COUNT(*) FROM voice_profiles").fetchone()[0] + if rows == 0: + conn.execute("DROP TABLE voice_profiles") + print("[migrations] 0028: dropped vestigial empty voice_profiles table") + else: + # Un-ported rows: refuse to drop and lose data. The equivalence gate stays + # red on purpose so this surfaces for a human instead of vanishing. + print( + f"[migrations] 0028: voice_profiles has {rows} row(s); leaving it in place " + f"(0020 should have ported and dropped it — investigate before dropping)" + ) + + log_cols = {row[1] for row in conn.execute("PRAGMA table_info(conversation_logs)").fetchall()} + for col in _VESTIGIAL_LOG_COLUMNS: + if col in log_cols: + conn.execute(f"ALTER TABLE conversation_logs DROP COLUMN {col}") + print(f"[migrations] 0028: dropped vestigial conversation_logs.{col}") diff --git a/backend/database/migrations/0028_drop_vestigial_settings_model_config.py b/backend/database/migrations/0028_drop_vestigial_settings_model_config.py deleted file mode 100644 index 98b0fa1..0000000 --- a/backend/database/migrations/0028_drop_vestigial_settings_model_config.py +++ /dev/null @@ -1,43 +0,0 @@ -"""0028_drop_vestigial_settings_model_config — drop the dead -``settings.active_model_config_id`` column. - -Migration 0008 added ``active_model_config_id`` to ``settings`` for the original -single-active-model design. 
0010 moved the active-model concept onto -``endpoints`` (``endpoints.active_model_config_id``), and the column was removed -from the fresh-install DDL (backend/database/schema.py) — but no migration ever -dropped it from databases that ran 0008, so every migrated install (including a -fresh one, which still runs 0008's ALTER) carries a vestigial, never-read -``settings.active_model_config_id``. - -That left the live schema diverging from ``CREATE_TABLES_SQL`` by exactly this -column. Dropping it reconciles the two so the fresh-vs-migrated schema-equivalence -gate (backend/presets.py ``assert_schema_safe``) holds. The column is confirmed -unreferenced by application code; only ``endpoints.active_model_config_id`` is used. - -Idempotent: skipped when the column is already absent (a DB never through 0008, or -already through 0028). ``ALTER TABLE … DROP COLUMN`` is the same mechanism migration -0016 uses; foreign keys are flipped off for the change since the column carries a -``REFERENCES model_configs(id)`` clause. -""" - -from __future__ import annotations - -import sqlite3 - - -def migrate(conn: sqlite3.Connection) -> None: - cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} - if "active_model_config_id" not in cols: - return - # PRAGMA foreign_keys is a no-op inside a transaction; the runner has committed - # before this call. Flip FKs off for the column drop, then restore prior state. 
- conn.commit() - had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] - conn.execute("PRAGMA foreign_keys=OFF") - try: - conn.execute("ALTER TABLE settings DROP COLUMN active_model_config_id") - conn.commit() - print("[migrations] 0028: dropped vestigial settings.active_model_config_id") - finally: - if had_fk: - conn.execute("PRAGMA foreign_keys=ON") diff --git a/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py b/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py deleted file mode 100644 index fd5b294..0000000 --- a/backend/database/migrations/0029_drop_vestigial_settings_active_agent.py +++ /dev/null @@ -1,53 +0,0 @@ -"""0029_drop_vestigial_settings_active_agent — drop the dead -``settings.active_agent_endpoint_id`` and ``settings.active_agent_model_config_id`` -columns. - -An early version of the agent-endpoint feature (later rewritten into what is now -migration 0013) added an *active agent* pointer pair directly to ``settings``: -``active_agent_endpoint_id`` (REFERENCES endpoints) and -``active_agent_model_config_id`` (REFERENCES model_configs). The feature was then -redesigned: the writer/agent split moved onto ``endpoints`` -(``endpoints.agent_active_model_config_id``) and ``settings`` kept only -``agent_endpoint_id`` / ``agent_same_as_writer``. The current 0013 adds that final -shape, and the fresh-install DDL (backend/database/schema.py) never carried the -``active_agent_*`` pair — but no migration ever dropped them from databases that -ran the old 0013, so those installs carry two vestigial, never-read columns. - -That left the live schema diverging from ``CREATE_TABLES_SQL`` by exactly these two -columns (and their FKs), tripping the fresh-vs-migrated schema-equivalence gate -(backend/presets.py ``assert_schema_safe``). Dropping them reconciles the two. 
Both -columns are confirmed unreferenced by application code; only the surviving -``settings.agent_endpoint_id`` and ``endpoints.agent_active_model_config_id`` are -used. - -Idempotent: each column is skipped when already absent (a DB that ran only the -current 0013, or one already through 0029). ``ALTER TABLE … DROP COLUMN`` is the -same mechanism migrations 0016 and 0028 use; foreign keys are flipped off for the -change since each column carries a ``REFERENCES`` clause. -""" - -from __future__ import annotations - -import sqlite3 - -_VESTIGIAL = ("active_agent_endpoint_id", "active_agent_model_config_id") - - -def migrate(conn: sqlite3.Connection) -> None: - cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} - to_drop = [c for c in _VESTIGIAL if c in cols] - if not to_drop: - return - # PRAGMA foreign_keys is a no-op inside a transaction; the runner has committed - # before this call. Flip FKs off for the column drops, then restore prior state. - conn.commit() - had_fk = conn.execute("PRAGMA foreign_keys").fetchone()[0] - conn.execute("PRAGMA foreign_keys=OFF") - try: - for col in to_drop: - conn.execute(f"ALTER TABLE settings DROP COLUMN {col}") - conn.commit() - print(f"[migrations] 0029: dropped vestigial settings.{col}") - finally: - if had_fk: - conn.execute("PRAGMA foreign_keys=ON") diff --git a/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py b/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py deleted file mode 100644 index 278faf5..0000000 --- a/backend/database/migrations/0030_drop_vestigial_voice_profiles_and_feedback_columns.py +++ /dev/null @@ -1,60 +0,0 @@ -"""0030_drop_vestigial_voice_profiles_and_feedback_columns — reconcile two pieces of -schema drift the fresh-install DDL (backend/database/schema.py) never carried, so a -migrated DB stops diverging from ``CREATE_TABLES_SQL``. 
- -Both are the same class of bug 0028/0029 cleaned up: a migration left an artefact -behind that no later migration dropped, tripping the fresh-vs-migrated -schema-equivalence gate (backend/presets.py ``assert_schema_safe``) and so refusing -every preset export/snapshot/restore. - -1. ``voice_profiles`` table. Added by 0015 (legacy TTS storage), then ported into - ``character_cards.workflow_state["tts"]`` + ``settings.workflow_config["tts"]`` and - dropped by 0020 (``_port_tts``). Fresh installs never create it. But some databases - reached 0020 with the table already empty / its rows already ported and the legacy - ``settings.tts_*`` columns already gone, and came out the far side still carrying an - orphaned, never-read ``voice_profiles`` table. It is dropped here only when empty: - on any DB that reaches 0030, 0020 has already run, so any real rows were ported - long ago; a non-empty table would mean un-ported data, so we leave it for a human - rather than silently lose it (the equivalence gate keeps complaining, which is the - intended loud signal). - -2. ``conversation_logs.reasoning_feedback`` and ``conversation_logs.feedback_latency_ms``. - An early cut of the feedback sub-step (final form: migration 0024) gave feedback its - own ``reasoning_feedback`` / ``feedback_latency_ms`` columns; that was consolidated - into the single ``feedback`` JSON column 0024 actually ships, and 0024 explicitly - does *not* add the two split columns. Databases that ran the early 0024 keep them as - dead, never-read columns. They are plain TEXT/INTEGER with no FK, so a straight - ``ALTER TABLE … DROP COLUMN`` suffices. - -Idempotent: the table drop is skipped when the table is already absent (fresh installs, -or a DB already through 0030), and each column drop is skipped when already absent. -``ALTER TABLE … DROP COLUMN`` is the same mechanism 0016/0028/0029 use. 
-""" - -from __future__ import annotations - -import sqlite3 - -_STALE_LOG_COLUMNS = ("reasoning_feedback", "feedback_latency_ms") - - -def migrate(conn: sqlite3.Connection) -> None: - tables = {row[0] for row in conn.execute("SELECT name FROM sqlite_master WHERE type='table'").fetchall()} - if "voice_profiles" in tables: - rows = conn.execute("SELECT COUNT(*) FROM voice_profiles").fetchone()[0] - if rows == 0: - conn.execute("DROP TABLE voice_profiles") - print("[migrations] 0030: dropped vestigial empty voice_profiles table") - else: - # Un-ported rows: refuse to drop and lose data. The equivalence gate stays - # red on purpose so this surfaces for a human instead of vanishing. - print( - f"[migrations] 0030: voice_profiles has {rows} row(s); leaving it in place " - f"(0020 should have ported and dropped it — investigate before dropping)" - ) - - log_cols = {row[1] for row in conn.execute("PRAGMA table_info(conversation_logs)").fetchall()} - for col in _STALE_LOG_COLUMNS: - if col in log_cols: - conn.execute(f"ALTER TABLE conversation_logs DROP COLUMN {col}") - print(f"[migrations] 0030: dropped vestigial conversation_logs.{col}") diff --git a/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py b/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py deleted file mode 100644 index b2dde4d..0000000 --- a/backend/database/migrations/0031_drop_vestigial_tts_scripter_columns.py +++ /dev/null @@ -1,40 +0,0 @@ -"""0031_drop_vestigial_tts_scripter_columns — drop the dead -``settings.tts_scripter_enabled`` and ``settings.tts_scripter_prompt`` columns. - -The detached LLM speech-scripter feature (added by 84bf39e "feat(tts): add voice -sidepanel controls") put these two columns on ``settings`` via the then-monolithic -``backend/database.py`` init path: fresh installs got them in the CREATE TABLE and -existing installs via inline ``ALTER TABLE … ADD COLUMN``. 
The feature was removed -shortly after (16a4288 "refactor(tts): remove detached LLM speech scripter"), which -deleted the DDL and the ALTERs — but no migration ever dropped the columns from -databases that booted in that window, so those installs carry two vestigial, -never-read columns. - -Same class as 0028/0029/0030: the live schema diverges from ``CREATE_TABLES_SQL`` -by exactly these columns, tripping the fresh-vs-migrated schema-equivalence gate -(backend/presets.py ``assert_schema_safe``) and refusing every preset -export/snapshot/restore. Found by simulating a fresh install at every historical -DDL version and migrating it to HEAD; this was the one residual divergence. - -Both columns are confirmed unreferenced by application code (the scripter's runtime -read them from a settings dict that no longer contains them). Plain INTEGER/TEXT, -no FK clause, so a straight ``ALTER TABLE … DROP COLUMN`` suffices — no FK toggle -needed, unlike 0029. - -Idempotent: each column is skipped when already absent (fresh installs, installs -that never booted in the scripter window, or a DB already through 0031). -""" - -from __future__ import annotations - -import sqlite3 - -_VESTIGIAL = ("tts_scripter_enabled", "tts_scripter_prompt") - - -def migrate(conn: sqlite3.Connection) -> None: - cols = {row[1] for row in conn.execute("PRAGMA table_info(settings)").fetchall()} - for col in _VESTIGIAL: - if col in cols: - conn.execute(f"ALTER TABLE settings DROP COLUMN {col}") - print(f"[migrations] 0031: dropped vestigial settings.{col}")