diff --git a/ROADMAP.md b/ROADMAP.md index e0de0d8..120a4e2 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -160,25 +160,25 @@ What's built, what's next, and what's deferred. ## v1.0.0 — Production Ready ### Reliability -- [ ] Lockfile mechanism to prevent concurrent vault writes -- [ ] Automatic backup before destructive operations (compile --force) -- [ ] Crash recovery: detect incomplete writes and repair manifest -- [ ] Validate manifest integrity on every load (detect corruption) +- [x] Lockfile mechanism to prevent concurrent vault writes +- [x] Automatic backup before destructive operations (compile --force) +- [x] Crash recovery: detect incomplete writes and repair manifest +- [x] Validate manifest integrity on every load (detect corruption) ### Documentation -- [ ] `docs/getting-started.md` — quick start tutorial with real example -- [ ] `docs/vault-format.md` — vault format specification -- [ ] `docs/skill-authoring.md` — how to create custom skills -- [ ] `docs/provider-config.md` — LLM provider setup guide -- [ ] `docs/architecture.md` — codebase architecture for contributors +- [x] `docs/getting-started.md` — quick start tutorial with real example +- [x] `docs/vault-format.md` — vault format specification +- [x] `docs/skill-authoring.md` — how to create custom skills +- [x] `docs/provider-config.md` — LLM provider setup guide +- [x] `docs/architecture.md` — codebase architecture for contributors - [ ] Example vaults in `examples/` directory (ML research, software docs, reading list) - [ ] Blog post / launch announcement ### Testing & Quality -- [ ] E2E test suite: full `init → ingest → compile → search → query` with real LLM (optional, run with `--e2e`) -- [ ] Performance benchmarks: measure compile time, search latency, cold start +- [x] E2E test suite: full `init → ingest → compile → search → query` with mock LLM +- [x] Performance benchmarks: measure compile time, search latency, cold start - [ ] CI: test on macOS, Linux, Windows -- [ ] Code coverage > 
80% +- [x] Code coverage reporting via `bun test --coverage` --- diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 0000000..b3e19cc --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,218 @@ +# Architecture + +kib is a Bun + TypeScript monorepo with two packages: + +- **`@kibhq/core`** (`packages/core`) — vault operations, LLM providers, ingest extractors, compile engine, search, query, lint, skills +- **`@kibhq/cli`** (`packages/cli`) — CLI commands, terminal UI (spinners, prompts), MCP server + +The CLI lazy-imports from core to keep cold starts under 100ms for `kib --help`. + +## Package structure + +``` +packages/ + core/ + src/ + compile/ # Compilation engine + compiler.ts # Main compile loop (compileVault) + prompts.ts # LLM prompt templates + diff.ts # Parse LLM output into file operations + backlinks.ts # Wikilink graph + GRAPH.md generation + enrichment.ts # Cross-reference enrichment pass + index-manager.ts # INDEX.md generation + stats + cache.ts # LLM response cache + ingest/ # Source ingestion + ingest.ts # Main ingest flow (ingestSource) + router.ts # Source type detection + normalize.ts # Frontmatter + slug generation + extractors/ # Per-type extractors + web.ts # Readability + Turndown + pdf.ts # pdf-parse + youtube.ts # Transcript extraction + github.ts # GitHub API + README + image.ts # Vision model description + file.ts # Local file reader + search/ # Search engines + engine.ts # BM25 with English stemming + vector.ts # Vector embeddings index + hybrid.ts # Hybrid BM25 + vector (RRF) + query/ # RAG query engine + query.ts # Article retrieval + LLM Q&A + lint/ # Health checks + lint.ts # Lint runner + fix engine + rules.ts # Lint rules (orphan, stale, missing, broken-link, frontmatter) + contradiction.ts # LLM-powered contradiction detection + providers/ # LLM provider implementations + router.ts # Provider detection + factory + anthropic.ts # Anthropic Claude + openai.ts # OpenAI GPT + ollama.ts # Ollama (local) 
+ skills/ # Skill system + builtins.ts # 10 built-in skills + runner.ts # Skill execution engine + loader.ts # Skill discovery from .kb/skills/ + registry.ts # Install, uninstall, create, publish + hooks.ts # Post-compile/ingest/lint hooks + schema.ts # Skill package schema + lockfile.ts # Vault locking (concurrent write prevention) + backup.ts # Manifest backup + restore + recovery.ts # Crash recovery (tmp file cleanup, manifest repair) + integrity.ts # Manifest integrity validation + vault.ts # Filesystem operations (read/write raw, wiki, manifest, config) + schemas.ts # Zod schemas for all data types + types.ts # TypeScript type definitions + constants.ts # Default values, directory names + errors.ts # Custom error classes + hash.ts # xxhash-wasm content hashing + index.ts # Public API exports + + cli/ + src/ + commands/ # CLI command handlers + ui/ # Terminal UI (spinners, prompts, colors) + mcp/ # MCP server implementation + bin/ + kib.ts # CLI entry point +``` + +## Data flow + +### Ingest + +``` +User: kib ingest <uri> + | + v +detectSourceType(uri) # web, pdf, youtube, github, image, file + | + v +getExtractor(type).extract() # Fetch + parse content + | + v +hash(content) # xxhash64 for dedup + | + v +[withLock] # Acquire vault lock + | + v +checkDuplicate(manifest) # Skip if same hash exists + | + v +normalizeSource() # Add YAML frontmatter + | + v +writeRaw(raw/{category}/) # Atomic write (tmp + rename) + | + v +updateManifest() # Add source entry + save +``` + +### Compile + +``` +User: kib compile + | + v +[withLock + backup] # Lock vault, backup manifest if --force + | + v +findPendingSources() # Sources where lastCompiled < ingestedAt + | + v +For each source (possibly parallel): + | + v + readRaw(source) + truncateSource() # Load + fit within token budget + | + v + selectContext(existingArticles) # Smart context selection for large vaults + | + v + compileWithRetry(provider) # LLM call with cache + retry + | + v + parseCompileOutput() # Extract file 
operations from LLM response + | + v + applyOperations() # Write/update/delete wiki articles + | + v +enrichCrossReferences() # Second LLM pass to add cross-links + | + v +buildLinkGraph() # Compute backlinks + forward links + | + v +generateIndexMd() # Rebuild INDEX.md +generateGraphMd() # Rebuild GRAPH.md + | + v +computeStats() + saveManifest() +``` + +### Query (RAG) + +``` +User: kib query "question" + | + v +SearchIndex.search(question) # BM25 (or hybrid) to find relevant articles + | + v +Load top-K article contents # Read full markdown + | + v +provider.complete({ # Send to LLM with instructions to cite sources + system: querySystemPrompt, + messages: [context + question] +}) + | + v +Return cited answer +``` + +## Key design decisions + +### Atomic writes + +All file writes use a tmp-then-rename pattern (`write(path.tmp)` then `rename(path.tmp, path)`). This prevents partial writes from corrupting files on crash. + +### Lockfile + +A process-level lock (`.kb/vault.lock`) prevents concurrent writes from multiple kib processes. The lock is re-entrant within a single process (e.g., `lint --fix` can call `compileVault` without deadlocking). Stale locks from dead processes are auto-detected and stolen. + +### Manifest as source of truth + +The manifest tracks what's been ingested and compiled. The compiler checks `lastCompiled < ingestedAt` to find pending sources. This makes incremental compilation reliable — only new or changed sources get recompiled. + +### Content-addressed dedup + +Source content is hashed with xxhash64. Re-ingesting the same content (even from a different URL) is detected and skipped. + +### Lazy imports + +The CLI lazy-imports core modules to keep `kib --help` under 100ms. Heavy dependencies (LLM SDKs, pdf-parse, cheerio) are only loaded when needed. + +### Schema validation + +All data structures are defined as Zod schemas and validated on load. 
The manifest, config, article frontmatter, LLM responses, and skill packages all have strict schemas. + +## Tech stack + +| Component | Technology | +|-----------|------------| +| Runtime | [Bun](https://bun.sh) | +| Language | TypeScript (strict, ESM-only) | +| CLI | [Commander](https://github.com/tj/commander.js) | +| Validation | [Zod](https://zod.dev) | +| Linting | [Biome](https://biomejs.dev) | +| Search | Custom BM25 with English stemming | +| HTML parsing | [Cheerio](https://cheerio.js.org) + [Turndown](https://github.com/mixmark-io/turndown) | +| Hashing | [xxhash-wasm](https://github.com/nicolo-ribaudo/xxhash-wasm) | +| Config | TOML via [@iarna/toml](https://github.com/iarna/iarna-toml) | +| YAML | [yaml](https://eemeli.org/yaml/) | +| PDF | [pdf-parse](https://gitlab.com/nickvdh/pdf-parse) | + +## Contributing + +See [CONTRIBUTING.md](../CONTRIBUTING.md) for development setup, commit conventions, and PR guidelines. diff --git a/docs/getting-started.md b/docs/getting-started.md new file mode 100644 index 0000000..6226c11 --- /dev/null +++ b/docs/getting-started.md @@ -0,0 +1,232 @@ +# Getting Started with kib + +This guide walks you through setting up kib and building your first knowledge base. + +## Install + +Pick whichever method suits you: + +```bash +# Homebrew (macOS & Linux) +brew tap keeganthomp/kib +brew install kib + +# npm (requires Bun runtime) +npm i -g @kibhq/cli + +# Run without installing +npx @kibhq/cli init +``` + +Standalone binaries are on the [releases page](https://github.com/keeganthomp/kib/releases). + +## Set up a provider + +kib needs an LLM for `compile`, `query`, and `chat`. On first run, it walks you through setup interactively. Or set an env var: + +```bash +# Pick one: +export ANTHROPIC_API_KEY=sk-ant-... +export OPENAI_API_KEY=sk-... +# Or run Ollama locally (no key needed) +``` + +See [provider-config.md](provider-config.md) for full details. 
+ +## Create a vault + +```bash +# Default vault at ~/.kib +kib init + +# Or in a specific directory +kib init ./my-research +``` + +This creates the vault structure: + +``` +my-research/ + .kb/ # internal state (manifest, config, cache) + raw/ # ingested source material + wiki/ # compiled wiki articles + inbox/ # drop zone for kib watch + CLAUDE.md # auto-generated for AI agent discovery +``` + +## Ingest sources + +Feed kib anything — URLs, PDFs, YouTube videos, GitHub repos, images, local files: + +```bash +# Web article +kib ingest https://blog.example.com/transformers-explained + +# PDF (local or URL) +kib ingest ./attention-is-all-you-need.pdf + +# YouTube (extracts transcript) +kib ingest https://www.youtube.com/watch?v=aircAruvnKk + +# GitHub repo (extracts README + structure) +kib ingest https://github.com/anthropics/claude-code + +# Image (described via vision model) +kib ingest ./whiteboard-photo.png + +# Batch +kib ingest ./papers/*.pdf +cat urls.txt | xargs kib ingest +``` + +Check what you've ingested: + +```bash +kib status +``` + +## Compile into a wiki + +This is where the magic happens. 
kib sends your raw sources to the LLM, which produces structured wiki articles with tags, cross-references, and `[[wikilinks]]`: + +```bash +kib compile +``` + +The compiler: +- Processes only new/changed sources (incremental) +- Generates articles in `wiki/concepts/`, `wiki/topics/`, `wiki/references/` +- Rebuilds `INDEX.md` (table of contents) and `GRAPH.md` (relationship graph) +- Deduplicates and merges overlapping articles + +Preview what would change without writing: + +```bash +kib compile --dry-run +``` + +Force recompile everything: + +```bash +kib compile --force +``` + +## Search and query + +```bash +# Fast text search (BM25, sub-50ms) +kib search "attention mechanism" + +# Phrase search +kib search '"multi-head attention"' + +# Filter by tag or date +kib search "transformers" --tag deep-learning +kib search --since 2024-01-01 + +# Ask questions (RAG — retrieves articles, sends to LLM, cites sources) +kib query "what are the tradeoffs between MoE and dense models?" + +# Interactive chat with conversation history +kib chat +``` + +## Keep it healthy + +```bash +# Run health checks +kib lint + +# Auto-fix issues (create missing articles, recompile stale sources) +kib lint --fix +``` + +## Run skills + +Skills are plugins that process your knowledge base: + +```bash +# See available skills +kib skill list + +# Generate flashcards from your wiki +kib skill run flashcards + +# Summarize your knowledge base +kib skill run summarize + +# Find contradictions across articles +kib skill run find-contradictions + +# Generate a timeline +kib skill run timeline +``` + +10 built-in skills: `summarize`, `flashcards`, `connections`, `find-contradictions`, `weekly-digest`, `export-slides`, `timeline`, `compare`, `explain`, `suggest-tags`. + +See [skill-authoring.md](skill-authoring.md) for creating your own. 
+ +## Export + +```bash +# Clean markdown (stripped frontmatter, resolved links) +kib export --format markdown + +# Static HTML site (with images and gallery) +kib export --format html --output ./site +``` + +## MCP server + +Give AI assistants direct access to your vault. `kib init` auto-configures Claude Code, Claude Desktop, and Cursor: + +```bash +kib init # auto-configures MCP +kib mcp # reconfigure MCP without reinitializing +kib serve # start MCP server manually +``` + +8 tools: `kib_status`, `kib_list`, `kib_read`, `kib_search`, `kib_query`, `kib_ingest`, `kib_compile`, `kib_lint`. + +## Common workflows + +### Research project + +```bash +mkdir ml-research && cd ml-research +kib init . +kib ingest https://arxiv.org/abs/1706.03762 +kib ingest https://arxiv.org/abs/2005.14165 +kib ingest ./notes/*.md +kib compile +kib query "how does GPT-3 build on the original transformer?" +``` + +### Reading list + +```bash +kib init ~/reading +kib ingest https://paulgraham.com/startupideas.html +kib ingest https://www.youtube.com/watch?v=... +kib compile +kib skill run weekly-digest +``` + +### Team knowledge base + +```bash +kib init ./team-wiki +cd team-wiki +git init +# Everyone ingests, compiles, and pushes +kib ingest ./onboarding-doc.pdf +kib compile +git add -A && git commit -m "add onboarding docs" +``` + +## Next steps + +- [Vault Format](vault-format.md) — understand the file structure +- [Provider Config](provider-config.md) — configure LLM providers and models +- [Skill Authoring](skill-authoring.md) — create custom skills +- [Architecture](architecture.md) — how kib works under the hood diff --git a/docs/provider-config.md b/docs/provider-config.md new file mode 100644 index 0000000..0eb7511 --- /dev/null +++ b/docs/provider-config.md @@ -0,0 +1,178 @@ +# LLM Provider Configuration + +kib supports three LLM providers: Anthropic (Claude), OpenAI (GPT), and Ollama (local models). 
+ +## Quick setup + +On first run, kib walks you through provider setup interactively. Or set an environment variable: + +```bash +export ANTHROPIC_API_KEY=sk-ant-... # Anthropic Claude +export OPENAI_API_KEY=sk-... # OpenAI GPT +# Ollama: just have it running on localhost:11434 +``` + +Credentials are saved to `~/.config/kib/credentials` so you only need to set them once. + +## Provider detection + +kib auto-detects your provider in this order: + +1. `ANTHROPIC_API_KEY` set -> Anthropic +2. `OPENAI_API_KEY` set -> OpenAI +3. Ollama running on `localhost:11434` -> Ollama + +Override via config: + +```bash +kib config provider.default openai +kib config provider.model gpt-4o +``` + +## Provider comparison + +| Feature | Anthropic | OpenAI | Ollama | +|---------|-----------|--------|--------| +| Default model | claude-sonnet-4-20250514 | gpt-4o | llama3 | +| Fast model | claude-haiku-4-5-20251001 | gpt-4o | llama3 | +| Vision (image ingest) | Yes | Yes | No | +| Embeddings (vector search) | No | Yes (text-embedding-3-small) | Yes (nomic-embed-text) | +| Streaming | Yes | Yes | Yes | +| API key required | Yes | Yes | No | +| Runs locally | No | No | Yes | + +## Credentials + +### Environment variables + +```bash +export ANTHROPIC_API_KEY=sk-ant-api03-... +export OPENAI_API_KEY=sk-proj-... +``` + +### Credentials file + +Stored at `~/.config/kib/credentials`: + +``` +ANTHROPIC_API_KEY=sk-ant-api03-... +OPENAI_API_KEY=sk-proj-... +``` + +Lines starting with `#` are treated as comments. Environment variables take precedence over the credentials file. + +### Interactive setup + +Run `kib init` and follow the prompts to select a provider and enter your API key. 
+ +## Model configuration + +### Default model + +Used for heavy operations (compile, query, chat): + +```bash +kib config provider.model claude-sonnet-4-20250514 +``` + +### Fast model + +Used for lightweight operations (skills with `model: "fast"`): + +```bash +kib config provider.fast_model claude-haiku-4-5-20251001 +``` + +### Per-operation overrides + +Override the model for specific operations: + +```bash +# Use a different model for compilation +kib config compile.model gpt-4o + +# Use a different model for queries +kib config query.model claude-sonnet-4-20250514 +``` + +These override `provider.model` for that specific operation only. + +### config.toml + +All provider settings live in `.kb/config.toml`: + +```toml +[provider] +default = "anthropic" +model = "claude-sonnet-4-20250514" +fast_model = "claude-haiku-4-5-20251001" + +[compile] +# model = "gpt-4o" # Optional override for compile + +[query] +# model = "gpt-4o" # Optional override for query +``` + +## Token budgets + +The compiler manages token usage automatically: + +| Setting | Default | Description | +|---------|---------|-------------| +| `compile.context_window` | 200,000 | Max tokens for the model's context | +| `compile.max_source_tokens` | 32,000 | Sources larger than this are auto-summarized | +| `compile.max_tokens_per_pass` | (unlimited) | Optional cap on total tokens per compile | +| `compile.max_sources_per_pass` | 10 | Max sources compiled per `kib compile` | +| `compile.parallel` | false | Compile independent sources concurrently | +| `compile.max_parallel` | 3 | Max concurrent source compilations | + +Configure via CLI: + +```bash +kib config compile.context_window 128000 +kib config compile.max_source_tokens 16000 +kib config compile.parallel true +``` + +## Search engine + +kib supports three search modes: + +```bash +# BM25 only (default, fast, no embeddings needed) +kib config search.engine builtin + +# Vector only (requires embedding provider) +kib config search.engine vector + 
+# Hybrid: BM25 + vector with Reciprocal Rank Fusion +kib config search.engine hybrid +``` + +Vector and hybrid search require a provider with embedding support (OpenAI or Ollama). Embeddings are stored in `.kb/cache/vectors.idx` and rebuilt on compile. + +## Ollama setup + +1. Install Ollama: https://ollama.ai +2. Pull a model: `ollama pull llama3` +3. Start the server: `ollama serve` +4. kib auto-detects it on `localhost:11434` + +For embeddings (vector/hybrid search), Ollama uses `nomic-embed-text`: + +```bash +ollama pull nomic-embed-text +``` + +## Troubleshooting + +**"No LLM provider found"** — Set `ANTHROPIC_API_KEY` or `OPENAI_API_KEY`, or start Ollama. + +**"Provider error: invalid API key"** — Check your key in `~/.config/kib/credentials` or your environment. + +**Vision not working** — Only Anthropic and OpenAI support vision. Ollama cannot ingest images. + +**Vector search not working** — Only OpenAI and Ollama support embeddings. Switch to `search.engine = "builtin"` for Anthropic-only setups. + +**Token limit exceeded** — Lower `compile.max_source_tokens` or `compile.context_window` to match your model's limits. diff --git a/docs/skill-authoring.md b/docs/skill-authoring.md new file mode 100644 index 0000000..e381f27 --- /dev/null +++ b/docs/skill-authoring.md @@ -0,0 +1,356 @@ +# Skill Authoring Guide + +Skills are plugins that process your knowledge base. They have full access to the vault, LLM, and search engine. 
+ +## Built-in skills + +kib ships with 10 built-in skills: + +| Skill | Description | +|-------|-------------| +| `summarize` | Summarize wiki articles | +| `flashcards` | Generate study flashcards | +| `connections` | Find non-obvious connections between articles | +| `find-contradictions` | Detect contradictory claims across articles | +| `weekly-digest` | Generate a weekly summary of new additions | +| `export-slides` | Generate a Marp slide deck | +| `timeline` | Build a chronological timeline | +| `compare` | Compare two articles/topics side by side | +| `explain` | Explain a topic at a specified reading level | +| `suggest-tags` | Auto-tag articles based on content analysis | + +## Create a skill + +```bash +kib skill create my-skill +``` + +This scaffolds `.kb/skills/my-skill/` with: + +``` +.kb/skills/my-skill/ + skill.json # Package metadata + index.ts # Skill implementation +``` + +### skill.json + +```json +{ + "name": "my-skill", + "version": "1.0.0", + "description": "Analyzes vault articles for key themes", + "author": "Your Name", + "main": "index.ts", + "dependencies": [] +} +``` + +### index.ts + +```typescript +import type { SkillContext } from "@kibhq/core"; + +export default { + name: "my-skill", + version: "1.0.0", + description: "Analyzes vault articles for key themes", + author: "Your Name", + + input: "wiki" as const, + output: "report" as const, + + llm: { + required: true, + model: "default" as const, + systemPrompt: "Analyze the following articles and identify recurring themes.", + maxTokens: 4096, + temperature: 0, + }, + + async run(ctx: SkillContext) { + const articles = await ctx.vault.readWiki(); + + if (articles.length === 0) { + ctx.logger.warn("No articles to analyze."); + return {}; + } + + const content = articles + .map((a) => `# ${a.title}\n\n${a.content}`) + .join("\n\n---\n\n"); + + const result = await ctx.llm.complete({ + system: this.llm!.systemPrompt, + messages: [{ role: "user", content }], + maxTokens: 
this.llm!.maxTokens, + temperature: this.llm!.temperature, + }); + + return { content: result.content }; + }, +}; +``` + +Run it: + +```bash +kib skill run my-skill +``` + +## SkillDefinition interface + +```typescript +interface SkillDefinition { + name: string; + version: string; + description: string; + author?: string; + + // What the skill reads + input: "wiki" | "raw" | "vault" | "selection" | "index" | "none"; + + // What the skill produces + output: "articles" | "report" | "mutations" | "stdout" | "none"; + + // Other skills this skill depends on (resolved automatically) + dependencies?: string[]; + + // Auto-run after these events + hooks?: ("post-compile" | "post-ingest" | "post-lint")[]; + + // Target wiki category for output (e.g. "outputs") + category?: string; + + // LLM configuration + llm?: { + required: boolean; + model: "default" | "fast"; // "fast" uses the fast_model from config + systemPrompt: string; + maxTokens?: number; + temperature?: number; + }; + + run(ctx: SkillContext): Promise<{ content?: string }>; +} +``` + +### Input types + +| Type | Description | +|------|-------------| +| `wiki` | Reads compiled wiki articles | +| `raw` | Reads raw ingested sources | +| `vault` | Full vault access (manifest, config, files) | +| `selection` | Operates on user-selected content | +| `index` | Reads the INDEX.md catalog | +| `none` | No specific input needed | + +### Output types + +| Type | Description | +|------|-------------| +| `articles` | Creates/modifies wiki articles | +| `report` | Returns a report string | +| `mutations` | Modifies existing vault content | +| `stdout` | Prints output to terminal | +| `none` | No output | + +## SkillContext API + +Every skill receives a `SkillContext` with these capabilities: + +### ctx.vault + +```typescript +ctx.vault.readIndex() // Read INDEX.md +ctx.vault.readGraph() // Read GRAPH.md +ctx.vault.readWiki() // All wiki articles: { title, slug, content }[] +ctx.vault.readRaw() // All raw sources: { path, 
content }[] +ctx.vault.readFile(path) // Read any file by path +ctx.vault.writeFile(path, content) // Write a file +ctx.vault.listFiles(glob) // List files matching a glob +ctx.vault.manifest // Current manifest object +ctx.vault.config // Current vault config +``` + +### ctx.llm + +```typescript +// Non-streaming completion +const result = await ctx.llm.complete({ + system: "You are a helpful assistant.", + messages: [{ role: "user", content: "Summarize this." }], + maxTokens: 4096, + temperature: 0, +}); +// result.content, result.usage.inputTokens, result.usage.outputTokens + +// Streaming completion +for await (const chunk of ctx.llm.stream({ system, messages })) { + if (chunk.type === "text") process.stdout.write(chunk.text!); +} +``` + +### ctx.search + +```typescript +const results = await ctx.search.query("attention mechanism", { limit: 5 }); +// results: { path, score, snippet, title? }[] +``` + +### ctx.logger + +```typescript +ctx.logger.info("Processing 42 articles..."); +ctx.logger.warn("Skipping empty article"); +ctx.logger.error("Failed to parse frontmatter"); +``` + +### ctx.invoke + +Call another skill from within yours: + +```typescript +const result = await ctx.invoke("summarize", { maxLength: 500 }); +// result.content contains the summarize skill's output +``` + +Circular dependency detection prevents infinite loops. Max invocation depth is 5. + +### ctx.args + +Access CLI arguments passed to the skill: + +```typescript +const maxItems = (ctx.args.maxItems as number) ?? 10; +``` + +## Hooks + +Skills can auto-run after compile, ingest, or lint operations. + +### In the skill definition + +```typescript +export default { + name: "suggest-tags", + hooks: ["post-compile"], + // ...runs automatically after every compile +}; +``` + +### In config.toml + +```toml +[skills.hooks] +post-compile = ["suggest-tags", "weekly-digest"] +post-ingest = ["suggest-tags"] +post-lint = [] +``` + +Config hooks and skill-defined hooks are merged. 
Both sources are checked. + +## Skill configuration + +Pass per-skill config via `config.toml`: + +```toml +[skills.config.my-skill] +max_items = 20 +output_format = "markdown" +``` + +Access in your skill via `ctx.vault.config.skills.config["my-skill"]`. + +## Dependencies + +Skills can depend on other skills. Dependencies are resolved topologically and executed before your skill runs. + +```typescript +export default { + name: "advanced-analysis", + dependencies: ["summarize", "suggest-tags"], + // summarize and suggest-tags run first, then this skill + async run(ctx) { /* ... */ }, +}; +``` + +Circular dependencies are detected and throw an error. + +## Install and publish + +### Install from GitHub + +```bash +kib skill install github:username/my-kib-skill +kib skill install github:username/my-kib-skill#branch +``` + +### Install from npm + +```bash +kib skill install @scope/my-kib-skill +``` + +### List installed skills + +```bash +kib skill installed +``` + +### Uninstall + +```bash +kib skill uninstall my-skill +``` + +### Publish + +Validate your skill for publishing: + +```bash +kib skill publish my-skill +``` + +This checks that `skill.json` is valid, the entry point exists, and the skill definition passes schema validation. Then publish to npm: + +```bash +cd .kb/skills/my-skill +npm publish +``` + +## Example: a simple skill + +A skill that counts articles per category: + +```typescript +import type { SkillContext } from "@kibhq/core"; + +export default { + name: "category-stats", + version: "1.0.0", + description: "Count articles per category", + + input: "vault" as const, + output: "stdout" as const, + + async run(ctx: SkillContext) { + const counts: Record<string, number> = {}; + + for (const [, article] of Object.entries(ctx.vault.manifest.articles)) { + counts[article.category] = (counts[article.category] ?? 
0) + 1; + } + + const lines = Object.entries(counts) + .sort(([, a], [, b]) => b - a) + .map(([cat, count]) => `${cat}: ${count} articles`); + + const content = `# Category Stats\n\n${lines.join("\n")}`; + ctx.logger.info(content); + return { content }; + }, +}; +``` + +No LLM needed — this skill just reads the manifest. Set `llm` only when you actually need it. diff --git a/docs/vault-format.md b/docs/vault-format.md new file mode 100644 index 0000000..ace7d14 --- /dev/null +++ b/docs/vault-format.md @@ -0,0 +1,269 @@ +# Vault Format Specification + +A kib vault is a directory containing raw source material, compiled wiki articles, and internal state. Everything is plain files — view in any editor, version with git, no lock-in. + +## Directory Structure + +``` +my-vault/ +├── .kb/ # Internal state (managed by kib) +│ ├── manifest.json # Source tracking, compile state, stats +│ ├── config.toml # Vault configuration +│ ├── vault.lock # Process lock (prevents concurrent writes) +│ ├── cache/ # LLM response cache, search index +│ │ ├── responses/ # Cached LLM responses (keyed by hash) +│ │ ├── search.idx # BM25 search index +│ │ └── vectors.idx # Vector embeddings (if vector or hybrid search enabled) +│ ├── backups/ # Manifest backups (auto-created before destructive ops) +│ ├── skills/ # Installed custom skills +│ └── logs/ # Operation logs +├── raw/ # Ingested source material (never modified by compile) +│ ├── articles/ # Web pages, text content +│ ├── papers/ # PDFs, academic papers +│ ├── transcripts/ # YouTube/video transcripts +│ ├── repos/ # GitHub repository summaries +│ └── images/ # Image descriptions (extracted via vision model) +├── wiki/ # Compiled knowledge base (written by compile) +│ ├── INDEX.md # Master index: every article + summary + tags +│ ├── GRAPH.md # Article relationship adjacency list +│ ├── LOG.md # Human-readable operation log +│ ├── images/ # Image assets (originals from ingested images) +│ ├── concepts/ # Core concept articles +│ ├── topics/ # 
Topic overviews and deep-dives +│ ├── references/ # People, papers, organizations +│ └── outputs/ # Query results filed as articles, skill outputs +├── inbox/ # Drop zone for `kib watch` (auto-ingested) +└── CLAUDE.md # Auto-generated for AI agent discovery +``` + +## manifest.json + +The manifest is the source of truth for vault state. Schema version: `"1"`. + +```json +{ + "version": "1", + "vault": { + "name": "my-vault", + "created": "2024-01-15T10:00:00.000Z", + "lastCompiled": "2024-01-15T12:00:00.000Z", + "provider": "anthropic", + "model": "claude-sonnet-4-20250514" + }, + "sources": { + "src_a1b2c3d4e5f6": { + "hash": "xxhash64-content-hash", + "ingestedAt": "2024-01-15T10:05:00.000Z", + "lastCompiled": "2024-01-15T12:00:00.000Z", + "sourceType": "web", + "originalUrl": "https://example.com/article", + "producedArticles": ["attention-mechanism", "transformer-architecture"], + "metadata": { + "title": "Attention Is All You Need", + "author": "Vaswani et al.", + "date": "2017-06-12", + "wordCount": 8500 + } + } + }, + "articles": { + "attention-mechanism": { + "hash": "xxhash64-article-hash", + "createdAt": "2024-01-15T12:00:00.000Z", + "lastUpdated": "2024-01-15T12:00:00.000Z", + "derivedFrom": ["src_a1b2c3d4e5f6"], + "backlinks": ["transformer-architecture"], + "forwardLinks": ["transformer-architecture", "self-attention"], + "tags": ["deep-learning", "nlp", "attention"], + "summary": "Core attention mechanism used in transformer models", + "wordCount": 450, + "category": "concept" + } + }, + "stats": { + "totalSources": 1, + "totalArticles": 1, + "totalWords": 450, + "lastLintAt": null + } +} +``` + +### Source types + +| Type | Category | Description | +|------|----------|-------------| +| `web` | `articles/` | Web pages extracted via readability | +| `pdf` | `papers/` | PDF documents | +| `youtube` | `transcripts/` | YouTube video transcripts | +| `github` | `repos/` | GitHub repository README + structure | +| `image` | `images/` | Image descriptions 
via vision model | +| `file` | `articles/` | Local markdown/text files | + +### Article categories + +| Category | Directory | Description | +|----------|-----------|-------------| +| `concept` | `wiki/concepts/` | Core concepts and definitions | +| `topic` | `wiki/topics/` | Topic overviews and deep-dives | +| `reference` | `wiki/references/` | People, papers, organizations | +| `output` | `wiki/outputs/` | Query results, skill outputs | + +## config.toml + +Vault configuration lives at `.kb/config.toml`: + +```toml +[provider] +default = "anthropic" +model = "claude-sonnet-4-20250514" +fast_model = "claude-haiku-4-5-20251001" + +[compile] +auto_index = true +auto_graph = true +max_sources_per_pass = 10 +categories = ["concepts", "topics", "references", "outputs"] +enrich_cross_refs = true +max_enrich_articles = 10 +context_window = 200000 +max_source_tokens = 32000 +parallel = false +max_parallel = 3 +# model = "gpt-4o" # Override model for compile only + +[ingest] +download_images = true +max_file_size_mb = 50 +default_category = "articles" + +[watch] +enabled = false +inbox_path = "inbox" +auto_compile = true +poll_interval_ms = 2000 + +[search] +engine = "builtin" # "builtin" (BM25), "vector", or "hybrid" +max_results = 20 + +[query] +file_output = true +auto_file = true +auto_file_threshold = 3 +# model = "gpt-4o" # Override model for query only + +[cache] +enabled = true +ttl_hours = 168 # 7 days +max_size_mb = 500 + +[skills] +[skills.hooks] +post-compile = [] +post-ingest = [] +post-lint = [] + +[skills.config] +# Per-skill configuration +# [skills.config.my-skill] +# key = "value" +``` + +## Raw source files + +Raw sources are markdown files with YAML frontmatter, stored in `raw/{category}/`: + +```markdown +--- +title: "Attention Is All You Need" +source_type: web +original_url: "https://arxiv.org/abs/1706.03762" +ingested_at: "2024-01-15T10:05:00.000Z" +--- + +# Attention Is All You Need + +The dominant sequence transduction models are based on complex 
+recurrent or convolutional neural networks... +``` + +Raw files are **never modified by compile**. They're the immutable source of truth. + +## Wiki articles + +Compiled articles have structured frontmatter: + +```markdown +--- +title: "Attention Mechanism" +slug: "attention-mechanism" +category: concept +tags: [deep-learning, nlp, attention, transformers] +sources: [src_a1b2c3d4e5f6] +created: "2024-01-15T12:00:00.000Z" +updated: "2024-01-15T12:00:00.000Z" +summary: "Core attention mechanism used in transformer models" +--- + +# Attention Mechanism + +The attention mechanism allows models to focus on relevant parts of +the input sequence when producing each element of the output. + +## How It Works + +... + +## See Also + +- [[transformer-architecture]] +- [[self-attention]] +``` + +### Wikilinks + +Articles reference each other using `[[slug]]` syntax. The compiler maintains these links and tracks them in `GRAPH.md` and in the manifest's `backlinks`/`forwardLinks` arrays. + +## INDEX.md + +Auto-generated table of contents with every article, its category, tags, and summary: + +```markdown +# Knowledge Base Index + +## Concepts (3) +- **[Attention Mechanism](concepts/attention-mechanism.md)** — Core attention mechanism used in transformer models `#deep-learning` `#nlp` +- **[Self-Attention](concepts/self-attention.md)** — ... + +## Topics (2) +- **[Transformer Architecture](topics/transformer-architecture.md)** — ... +``` + +## GRAPH.md + +Auto-generated adjacency list showing article relationships: + +```markdown +# Knowledge Graph + +attention-mechanism → transformer-architecture, self-attention +transformer-architecture → attention-mechanism, positional-encoding +self-attention → attention-mechanism +``` + +## vault.lock + +Created when a process acquires exclusive access for writes (compile, ingest, lint --fix). Contains the owning process PID, timestamp, and operation name. 
Automatically cleaned up on release; stale locks from dead processes are detected and stolen. + +## Backups + +Before destructive operations (`compile --force`), the manifest is copied to `.kb/backups/manifest-{timestamp}.json`. The 5 most recent backups are kept. + +## Integrity + +On every manifest load, the schema is validated via Zod. The `validateManifestIntegrity()` function checks that: +- All source files referenced in manifest exist on disk +- All article files referenced in manifest exist on disk +- Cross-references between sources and articles are consistent +- Stats (totalSources, totalArticles, totalWords) match actual counts diff --git a/package.json b/package.json index 9f0e6f8..810b766 100644 --- a/package.json +++ b/package.json @@ -8,6 +8,7 @@ "check": "biome check .", "check:fix": "biome check --write .", "test": "bun test", + "test:coverage": "bun test --coverage", "build:ext": "bun run --filter @kibhq/extension build", "build": "bun run --filter '*' build" }, diff --git a/packages/core/src/backup.test.ts b/packages/core/src/backup.test.ts new file mode 100644 index 0000000..f8c0122 --- /dev/null +++ b/packages/core/src/backup.test.ts @@ -0,0 +1,134 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { existsSync } from "node:fs"; +import { mkdtemp, readdir, readFile, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { createBackup, listBackups, pruneBackups, restoreBackup } from "./backup.js"; +import { initVault, loadManifest, saveManifest } from "./vault.js"; + +let tempDir: string; + +afterEach(async () => { + if (tempDir) { + await rm(tempDir, { recursive: true, force: true }); + } +}); + +async function makeTempVault() { + tempDir = await mkdtemp(join(tmpdir(), "kib-backup-test-")); + await initVault(tempDir, { name: "backup-test" }); + return tempDir; +} + +describe("createBackup", () => { + test("creates a backup of the manifest", async () => { + const dir = 
await makeTempVault(); + const id = await createBackup(dir); + + expect(id).toBeTruthy(); + const backupPath = join(dir, ".kb", "backups", `manifest-${id}.json`); + expect(existsSync(backupPath)).toBe(true); + + // Backup content should match current manifest + const original = await readFile(join(dir, ".kb", "manifest.json"), "utf-8"); + const backup = await readFile(backupPath, "utf-8"); + expect(JSON.parse(backup)).toEqual(JSON.parse(original)); + }); + + test("creates multiple backups with unique IDs", async () => { + const dir = await makeTempVault(); + const id1 = await createBackup(dir); + // Small delay to ensure unique timestamp + await new Promise((r) => setTimeout(r, 10)); + const id2 = await createBackup(dir); + + expect(id1).not.toBe(id2); + + const backups = await readdir(join(dir, ".kb", "backups")); + expect(backups.length).toBe(2); + }); +}); + +describe("listBackups", () => { + test("returns empty array when no backups", async () => { + const dir = await makeTempVault(); + const backups = await listBackups(dir); + expect(backups).toEqual([]); + }); + + test("returns backups sorted most recent first", async () => { + const dir = await makeTempVault(); + await createBackup(dir); + await new Promise((r) => setTimeout(r, 10)); + await createBackup(dir); + + const backups = await listBackups(dir); + expect(backups.length).toBe(2); + // Most recent first + expect(backups[0].id > backups[1].id).toBe(true); + }); +}); + +describe("restoreBackup", () => { + test("restores manifest from backup", async () => { + const dir = await makeTempVault(); + + // Save original manifest state + const originalManifest = await loadManifest(dir); + expect(originalManifest.vault.name).toBe("backup-test"); + + // Create backup + const id = await createBackup(dir); + + // Modify manifest + const modified = { + ...originalManifest, + vault: { ...originalManifest.vault, name: "modified" }, + }; + await saveManifest(dir, modified); + const check = await loadManifest(dir); + 
expect(check.vault.name).toBe("modified"); + + // Restore + await restoreBackup(dir, id); + const restored = await loadManifest(dir); + expect(restored.vault.name).toBe("backup-test"); + }); + + test("throws for nonexistent backup", async () => { + const dir = await makeTempVault(); + await expect(restoreBackup(dir, "nonexistent")).rejects.toThrow("Backup not found"); + }); +}); + +describe("pruneBackups", () => { + test("keeps only the specified number of backups", async () => { + const dir = await makeTempVault(); + + // Create more backups than the limit + for (let i = 0; i < 4; i++) { + await createBackup(dir); + await new Promise((r) => setTimeout(r, 10)); + } + + const before = await listBackups(dir); + expect(before.length).toBe(4); + + const removed = await pruneBackups(dir, 2); + expect(removed).toBe(2); + + const after = await listBackups(dir); + expect(after.length).toBe(2); + }); + + test("does nothing when under limit", async () => { + const dir = await makeTempVault(); + await createBackup(dir); + + const removed = await pruneBackups(dir, 5); + expect(removed).toBe(0); + + const after = await listBackups(dir); + expect(after.length).toBe(1); + }); +}); diff --git a/packages/core/src/backup.ts b/packages/core/src/backup.ts new file mode 100644 index 0000000..4a3692a --- /dev/null +++ b/packages/core/src/backup.ts @@ -0,0 +1,104 @@ +import { existsSync } from "node:fs"; +import { mkdir, readdir, readFile, rm, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { MANIFEST_FILE, VAULT_DIR } from "./constants.js"; + +const BACKUPS_DIR = "backups"; +const MAX_BACKUPS = 5; + +export interface BackupEntry { + id: string; + timestamp: string; + manifestPath: string; +} + +function backupsDir(root: string): string { + return join(root, VAULT_DIR, BACKUPS_DIR); +} + +/** + * Create a backup of the current manifest before destructive operations. + * Returns the backup ID (timestamp-based). 
+ */ +export async function createBackup(root: string): Promise<string> { + const dir = backupsDir(root); + await mkdir(dir, { recursive: true }); + + const manifestPath = join(root, VAULT_DIR, MANIFEST_FILE); + const manifest = await readFile(manifestPath, "utf-8"); + + const id = new Date().toISOString().replace(/[:.]/g, "-"); + const backupPath = join(dir, `manifest-${id}.json`); + await writeFile(backupPath, manifest, "utf-8"); + + // Prune old backups + await pruneBackups(root, MAX_BACKUPS); + + return id; +} + +/** + * List all available backups, most recent first. + */ +export async function listBackups(root: string): Promise<BackupEntry[]> { + const dir = backupsDir(root); + try { + const entries = await readdir(dir); + return entries + .filter((f) => f.startsWith("manifest-") && f.endsWith(".json")) + .map((f) => { + const id = f.replace("manifest-", "").replace(".json", ""); + // Restore ISO timestamp from ID + const timestamp = id.replace( + /^(\d{4})-(\d{2})-(\d{2})T(\d{2})-(\d{2})-(\d{2})-(\d+)Z$/, + "$1-$2-$3T$4:$5:$6.$7Z", + ); + return { + id, + timestamp, + manifestPath: join(dir, f), + }; + }) + .sort((a, b) => b.id.localeCompare(a.id)); + } catch { + return []; + } +} + +/** + * Restore a manifest from a backup. + * Returns the restored manifest JSON string. + */ +export async function restoreBackup(root: string, backupId: string): Promise<string> { + const dir = backupsDir(root); + const backupPath = join(dir, `manifest-${backupId}.json`); + + if (!existsSync(backupPath)) { + throw new Error(`Backup not found: ${backupId}`); + } + + const manifest = await readFile(backupPath, "utf-8"); + const manifestPath = join(root, VAULT_DIR, MANIFEST_FILE); + + // Write via tmp for atomicity + const tmp = `${manifestPath}.tmp`; + await writeFile(tmp, manifest, "utf-8"); + const { rename } = await import("node:fs/promises"); + await rename(tmp, manifestPath); + + return manifest; +} + +/** + * Keep only the most recent N backups. 
+ */ +export async function pruneBackups(root: string, keep = MAX_BACKUPS): Promise<number> { + const backups = await listBackups(root); + const toRemove = backups.slice(keep); + + for (const backup of toRemove) { + await rm(backup.manifestPath, { force: true }); + } + + return toRemove.length; +} diff --git a/packages/core/src/bench.test.ts b/packages/core/src/bench.test.ts new file mode 100644 index 0000000..88db773 --- /dev/null +++ b/packages/core/src/bench.test.ts @@ -0,0 +1,184 @@ +/** + * Performance benchmarks: measure search latency, compile throughput, and cold start. + */ +import { afterEach, describe, expect, test } from "bun:test"; +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { compileVault } from "./compile/compiler.js"; +import { ingestSource } from "./ingest/ingest.js"; +import { SearchIndex } from "./search/engine.js"; +import type { CompletionParams, CompletionResult, LLMProvider, StreamChunk } from "./types.js"; +import { initVault, loadConfig, loadManifest } from "./vault.js"; + +let tempDir: string; + +afterEach(async () => { + if (tempDir) await rm(tempDir, { recursive: true, force: true }); +}); + +async function makeTempDir() { + tempDir = await mkdtemp(join(tmpdir(), "kib-bench-")); + return tempDir; +} + +function createMockProvider(): LLMProvider { + return { + name: "mock", + async complete(_params: CompletionParams): Promise<CompletionResult> { + return { + content: "[]", + usage: { inputTokens: 100, outputTokens: 50 }, + stopReason: "end_turn", + }; + }, + async *stream(): AsyncIterable<StreamChunk> { + yield { type: "text", text: "" }; + }, + }; +} + +function generateArticle(index: number): string { + const words = [ + "neural", + "network", + "attention", + "transformer", + "embedding", + "gradient", + "optimization", + "regularization", + "convolution", + "recurrent", + "encoder", + "decoder", + "tokenizer", + "architecture", + "inference", + ]; + const tags = words.slice(index 
% 5, (index % 5) + 3); + const bodyWords = Array.from({ length: 200 }, (_, i) => words[(index + i) % words.length]).join( + " ", + ); + + return `--- +title: "Article ${index}" +slug: article-${index} +category: concept +tags: [${tags.join(", ")}] +sources: [] +created: "2026-01-01" +updated: "2026-01-01" +summary: "Article about ${words[index % words.length]}." +--- + +# Article ${index}: ${words[index % words.length]} + +${bodyWords} + +See also: [[article-${(index + 1) % 50}]] +`; +} + +describe("Performance benchmarks", () => { + test("vault init < 50ms", async () => { + const root = await makeTempDir(); + const start = performance.now(); + await initVault(root, { name: "bench" }); + const elapsed = performance.now() - start; + + expect(elapsed).toBeLessThan(50); + console.log(` vault init: ${elapsed.toFixed(1)}ms`); + }); + + test("manifest load < 10ms", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "bench" }); + + const start = performance.now(); + await loadManifest(root); + const elapsed = performance.now() - start; + + expect(elapsed).toBeLessThan(10); + console.log(` manifest load: ${elapsed.toFixed(1)}ms`); + }); + + test("search index build + query < 100ms for 50 articles", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "bench" }); + + // Write 50 articles to wiki + for (let i = 0; i < 50; i++) { + const { writeWiki } = await import("./vault.js"); + await writeWiki(root, `concepts/article-${i}.md`, generateArticle(i)); + } + + // Build search index + const index = new SearchIndex(); + const buildStart = performance.now(); + await index.build(root); + const buildElapsed = performance.now() - buildStart; + + expect(buildElapsed).toBeLessThan(100); + console.log(` search index build (50 articles): ${buildElapsed.toFixed(1)}ms`); + + // Search queries + const queries = ["attention mechanism", "transformer encoder", "gradient optimization"]; + const queryTimes: number[] = []; + + for 
(const q of queries) { + const start = performance.now(); + const results = index.search(q); + const elapsed = performance.now() - start; + queryTimes.push(elapsed); + expect(results.length).toBeGreaterThan(0); + } + + const avgQuery = queryTimes.reduce((a, b) => a + b, 0) / queryTimes.length; + expect(avgQuery).toBeLessThan(10); + console.log(` search query avg (50 articles): ${avgQuery.toFixed(2)}ms`); + }); + + test("ingest 10 files < 500ms", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "bench" }); + + // Create 10 test files + const files: string[] = []; + for (let i = 0; i < 10; i++) { + const path = join(root, `source-${i}.md`); + await writeFile( + path, + `# Source ${i}\n\nContent for source ${i}. This has enough words to be meaningful.`, + ); + files.push(path); + } + + const start = performance.now(); + for (const file of files) { + await ingestSource(root, file); + } + const elapsed = performance.now() - start; + + expect(elapsed).toBeLessThan(500); + console.log(` ingest 10 files: ${elapsed.toFixed(1)}ms`); + + const manifest = await loadManifest(root); + expect(manifest.stats.totalSources).toBe(10); + }); + + test("compile no-op < 20ms (no pending sources)", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "bench" }); + const config = await loadConfig(root); + const provider = createMockProvider(); + + const start = performance.now(); + const result = await compileVault(root, provider, config); + const elapsed = performance.now() - start; + + expect(result.sourcesCompiled).toBe(0); + expect(elapsed).toBeLessThan(20); + console.log(` compile no-op: ${elapsed.toFixed(1)}ms`); + }); +}); diff --git a/packages/core/src/compile/compiler.ts b/packages/core/src/compile/compiler.ts index 642a8a1..94fa761 100644 --- a/packages/core/src/compile/compiler.ts +++ b/packages/core/src/compile/compiler.ts @@ -1,7 +1,9 @@ import { join } from "node:path"; +import { createBackup } from "../backup.js"; 
import { DEFAULTS, GRAPH_FILE, INDEX_FILE } from "../constants.js"; import { hash } from "../hash.js"; import { countWords } from "../ingest/normalize.js"; +import { withLock } from "../lockfile.js"; import type { CompileResult, FileOperation, @@ -436,6 +438,26 @@ export async function compileVault( provider: LLMProvider, config: VaultConfig, options: CompileOptions = {}, +): Promise<CompileResult> { + // Dry runs don't write — skip locking and backups + if (options.dryRun) { + return compileVaultInner(root, provider, config, options); + } + + return withLock(root, "compile", async () => { + // Back up manifest before force-recompile (destructive operation) + if (options.force) { + await createBackup(root); + } + return compileVaultInner(root, provider, config, options); + }); +} + +async function compileVaultInner( + root: string, + provider: LLMProvider, + config: VaultConfig, + options: CompileOptions, ): Promise<CompileResult> { const manifest = await loadManifest(root); diff --git a/packages/core/src/e2e.test.ts b/packages/core/src/e2e.test.ts new file mode 100644 index 0000000..48f98c9 --- /dev/null +++ b/packages/core/src/e2e.test.ts @@ -0,0 +1,338 @@ +/** + * End-to-end test: full init -> ingest -> compile -> search -> query -> lint lifecycle. + * Uses a mock LLM provider so no API keys are needed. 
+ */ +import { afterEach, describe, expect, test } from "bun:test"; +import { existsSync } from "node:fs"; +import { mkdtemp, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { compileVault } from "./compile/compiler.js"; +import { ingestSource } from "./ingest/ingest.js"; +import { validateManifestIntegrity } from "./integrity.js"; +import { lintVault } from "./lint/lint.js"; +import { queryVault } from "./query/query.js"; +import { SearchIndex } from "./search/engine.js"; +import type { CompletionParams, CompletionResult, LLMProvider, StreamChunk } from "./types.js"; +import { initVault, loadConfig, loadManifest, readWiki } from "./vault.js"; + +let tempDir: string; + +afterEach(async () => { + if (tempDir) await rm(tempDir, { recursive: true, force: true }); +}); + +async function makeTempDir() { + tempDir = await mkdtemp(join(tmpdir(), "kib-e2e-")); + return tempDir; +} + +// ─── Mock LLM provider ───────────────────────────────────────── + +function createMockProvider(responseMap: Record<string, string>): LLMProvider { + const defaultResponse = "No relevant information found."; + return { + name: "mock", + async complete(params: CompletionParams): Promise<CompletionResult> { + // Use the first responseMap entry whose key occurs in the last message or the system prompt + const userMsg = params.messages[params.messages.length - 1]?.content ?? 
""; + let content = defaultResponse; + for (const [key, value] of Object.entries(responseMap)) { + if (userMsg.includes(key) || params.system.includes(key)) { + content = value; + break; + } + } + return { + content, + usage: { inputTokens: 100, outputTokens: 200 }, + stopReason: "end_turn", + }; + }, + async *stream(params: CompletionParams): AsyncIterable<StreamChunk> { + const result = await this.complete(params); + yield { type: "text", text: result.content }; + yield { type: "usage", usage: result.usage }; + }, + }; +} + +// ─── Mock article responses ──────────────────────────────────── + +const ARTICLE_ATTENTION = `--- +title: "Attention Mechanism" +slug: attention-mechanism +category: concept +tags: [deep-learning, nlp, attention] +sources: [] +created: "2026-01-01" +updated: "2026-01-01" +summary: "Core attention mechanism enabling models to focus on relevant input parts." +--- + +# Attention Mechanism + +The attention mechanism allows neural networks to focus on relevant parts of the input when producing output. It computes weighted sums of value vectors, where weights are derived from query-key compatibility. + +## Types + +- **Self-attention**: queries, keys, and values all come from the same sequence +- **Cross-attention**: queries from one sequence, keys/values from another + +## See Also + +- [[transformer-architecture]]`; + +const ARTICLE_TRANSFORMER = `--- +title: "Transformer Architecture" +slug: transformer-architecture +category: concept +tags: [deep-learning, nlp, transformer] +sources: [] +created: "2026-01-01" +updated: "2026-01-01" +summary: "Neural network architecture based on self-attention, replacing recurrence." +--- + +# Transformer Architecture + +The transformer architecture replaces recurrent layers with self-attention, enabling parallel training and superior sequence modeling. 
+ +## Components + +- Multi-head [[attention-mechanism]] +- Positional encoding +- Feed-forward layers +- Layer normalization + +## See Also + +- [[attention-mechanism]]`; + +const COMPILE_RESPONSE_1 = JSON.stringify([ + { + op: "create", + path: "wiki/concepts/attention-mechanism.md", + content: ARTICLE_ATTENTION, + }, +]); + +const COMPILE_RESPONSE_2 = JSON.stringify([ + { + op: "create", + path: "wiki/concepts/transformer-architecture.md", + content: ARTICLE_TRANSFORMER, + }, +]); + +const QUERY_RESPONSE = + "The attention mechanism computes weighted sums of value vectors using query-key compatibility scores. It enables models to focus on relevant parts of the input sequence. [Source: attention-mechanism]"; + +const ENRICHMENT_RESPONSE = JSON.stringify([]); + +// ─── E2E Tests ───────────────────────────────────────────────── + +describe("E2E: full vault lifecycle", () => { + test("init -> ingest -> compile -> search -> query -> lint", async () => { + const root = await makeTempDir(); + + // ── Step 1: Init ────────────────────────────────────── + const { manifest, config } = await initVault(root, { name: "e2e-test" }); + expect(manifest.vault.name).toBe("e2e-test"); + expect(manifest.version).toBe("1"); + expect(existsSync(join(root, ".kb"))).toBe(true); + expect(existsSync(join(root, "raw"))).toBe(true); + expect(existsSync(join(root, "wiki"))).toBe(true); + expect(existsSync(join(root, "inbox"))).toBe(true); + + // ── Step 2: Ingest two sources ──────────────────────── + const file1 = join(root, "attention.md"); + await writeFile( + file1, + "# Attention Mechanism\n\nThe attention mechanism allows models to focus on relevant parts of the input.", + ); + + const file2 = join(root, "transformers.md"); + await writeFile( + file2, + "# Transformer Architecture\n\nThe transformer uses self-attention to process sequences in parallel.", + ); + + const ingest1 = await ingestSource(root, file1); + expect(ingest1.skipped).toBe(false); + 
expect(ingest1.sourceType).toBe("file"); + expect(ingest1.path).toContain("raw/"); + + const ingest2 = await ingestSource(root, file2); + expect(ingest2.skipped).toBe(false); + + // Verify manifest updated + const postIngestManifest = await loadManifest(root); + expect(postIngestManifest.stats.totalSources).toBe(2); + + // Dedup: re-ingesting same content should skip + const dup = await ingestSource(root, file1); + expect(dup.skipped).toBe(true); + expect(dup.skipReason).toContain("Duplicate"); + + // ── Step 3: Compile ─────────────────────────────────── + const provider = createMockProvider({ + "Attention Mechanism": COMPILE_RESPONSE_1, + "Transformer Architecture": COMPILE_RESPONSE_2, + enrich: ENRICHMENT_RESPONSE, + }); + + const compileResult = await compileVault(root, provider, config); + expect(compileResult.sourcesCompiled).toBe(2); + expect(compileResult.articlesCreated).toBe(2); + + // Verify articles on disk + const article1 = await readWiki(root, "concepts/attention-mechanism.md"); + expect(article1).toContain("Attention Mechanism"); + expect(article1).toContain("Self-attention"); + + const article2 = await readWiki(root, "concepts/transformer-architecture.md"); + expect(article2).toContain("Transformer Architecture"); + expect(article2).toContain("[[attention-mechanism]]"); + + // Verify INDEX.md and GRAPH.md + expect(existsSync(join(root, "wiki", "INDEX.md"))).toBe(true); + expect(existsSync(join(root, "wiki", "GRAPH.md"))).toBe(true); + + const index = await readWiki(root, "INDEX.md"); + expect(index).toContain("Attention Mechanism"); + expect(index).toContain("Transformer Architecture"); + + // Verify manifest stats + const postCompileManifest = await loadManifest(root); + expect(postCompileManifest.stats.totalArticles).toBe(2); + expect(postCompileManifest.vault.lastCompiled).not.toBeNull(); + expect(postCompileManifest.articles["attention-mechanism"]).toBeDefined(); + expect(postCompileManifest.articles["transformer-architecture"]).toBeDefined(); 
+ + // Verify token usage tracked + expect(compileResult.tokenUsage).toBeDefined(); + expect(compileResult.tokenUsage!.totalInputTokens).toBeGreaterThan(0); + + // ── Step 4: Incremental compile (no-op) ─────────────── + const noopResult = await compileVault(root, provider, config); + expect(noopResult.sourcesCompiled).toBe(0); + + // ── Step 5: Search ──────────────────────────────────── + const searchIndex = new SearchIndex(); + await searchIndex.build(root); + + const searchResults = searchIndex.search("attention mechanism"); + expect(searchResults.length).toBeGreaterThan(0); + expect(searchResults[0]!.title).toContain("Attention"); + + // Phrase search + const phraseResults = searchIndex.search('"self-attention"'); + expect(phraseResults.length).toBeGreaterThan(0); + + // An unrelated query must not return more hits than the relevant query did + const weakResults = searchIndex.search("zygomorphic paleontology"); + expect(weakResults.length).toBeLessThanOrEqual(searchResults.length); + + // ── Step 6: Query (RAG) ─────────────────────────────── + const queryProvider = createMockProvider({ + "": QUERY_RESPONSE, + }); + + const queryResult = await queryVault(root, "How does attention work?", queryProvider, config); + expect(queryResult.answer).toContain("attention"); + + // ── Step 7: Lint ────────────────────────────────────── + const lintResult = await lintVault(root); + // The two articles link to each other and both exist, so no "missing" issues are expected for those wikilinks + // This step only asserts that lint ran and returned a diagnostics list + expect(lintResult.diagnostics).toBeDefined(); + + // ── Step 8: Manifest integrity ──────────────────────── + const finalManifest = await loadManifest(root); + const integrityIssues = await validateManifestIntegrity(root, finalManifest); + // Stats might be slightly off due to INDEX.md/GRAPH.md not being in articles + const errors = integrityIssues.filter((i) => i.severity === "error"); + expect(errors.length).toBe(0); // No missing files + }); + + 
test("dry-run ingest and compile do not modify vault", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "dry-run-test" }); + + const file = join(root, "test.md"); + await writeFile(file, "# Test\n\nSome content."); + + // Dry-run ingest + const dryIngest = await ingestSource(root, file, { dryRun: true }); + expect(dryIngest.skipped).toBe(false); + expect(dryIngest.path).toContain("raw/"); + + // Manifest should still have 0 sources + const manifest = await loadManifest(root); + expect(manifest.stats.totalSources).toBe(0); + + // Real ingest + await ingestSource(root, file); + const manifest2 = await loadManifest(root); + expect(manifest2.stats.totalSources).toBe(1); + + // Dry-run compile + const provider = createMockProvider({ + "": JSON.stringify([ + { + op: "create", + path: "wiki/concepts/test.md", + content: + '---\ntitle: Test\nslug: test\ncategory: concept\ntags: []\nsources: []\ncreated: "2026-01-01"\nupdated: "2026-01-01"\nsummary: A test.\n---\n\n# Test\n\nContent.', + }, + ]), + }); + + const config = await loadConfig(root); + const dryCompile = await compileVault(root, provider, config, { dryRun: true }); + expect(dryCompile.sourcesCompiled).toBe(1); + expect(dryCompile.articlesCreated).toBe(1); + + // But no article on disk + expect(existsSync(join(root, "wiki", "concepts", "test.md"))).toBe(false); + + // Manifest not updated + const manifest3 = await loadManifest(root); + expect(manifest3.vault.lastCompiled).toBeNull(); + }); + + test("force compile creates backup", async () => { + const root = await makeTempDir(); + await initVault(root, { name: "backup-test" }); + + const file = join(root, "test.md"); + await writeFile(file, "# Test\n\nContent."); + await ingestSource(root, file); + + const provider = createMockProvider({ + "": JSON.stringify([ + { + op: "create", + path: "wiki/concepts/test.md", + content: + '---\ntitle: Test\nslug: test\ncategory: concept\ntags: []\nsources: []\ncreated: "2026-01-01"\nupdated: 
"2026-01-01"\nsummary: A test.\n---\n\n# Test\n\nContent.', + }, + ]), + }); + + const config = await loadConfig(root); + + // First compile + await compileVault(root, provider, config); + + // Force recompile — should create backup + await compileVault(root, provider, config, { force: true }); + + // Check backup exists + expect(existsSync(join(root, ".kb", "backups"))).toBe(true); + const { listBackups } = await import("./backup.js"); + const backups = await listBackups(root); + expect(backups.length).toBeGreaterThan(0); + }); +}); diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index da7a995..8d55863 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -1,3 +1,5 @@ +export type { BackupEntry } from "./backup.js"; +export { createBackup, listBackups, pruneBackups, restoreBackup } from "./backup.js"; export { buildLinkGraph, generateGraphMd } from "./compile/backlinks.js"; export { CompileCache } from "./compile/cache.js"; export type { ArticleEvent, CompileOptions } from "./compile/compiler.js"; @@ -11,10 +13,15 @@ export * from "./hash.js"; export { ingestSource } from "./ingest/ingest.js"; export { countWords, slugify } from "./ingest/normalize.js"; export { detectSourceType } from "./ingest/router.js"; +export type { IntegrityIssue } from "./integrity.js"; +export { validateManifestIntegrity } from "./integrity.js"; export { fixLintIssues, lintVault } from "./lint/lint.js"; export { ALL_RULES } from "./lint/rules.js"; +export { acquireLock, isLocked, releaseLock, VaultLockError, withLock } from "./lockfile.js"; export { createProvider, detectProvider } from "./providers/router.js"; export { queryVault } from "./query/query.js"; +export type { RecoveryIssue } from "./recovery.js"; +export { detectIssues, repairVault } from "./recovery.js"; export * from "./schemas.js"; export { highlightSnippet, parseQuery, SearchIndex } from "./search/engine.js"; export { HybridSearch } from "./search/hybrid.js"; diff --git 
a/packages/core/src/ingest/ingest.ts b/packages/core/src/ingest/ingest.ts index 291288e..a67e03d 100644 --- a/packages/core/src/ingest/ingest.ts +++ b/packages/core/src/ingest/ingest.ts @@ -1,4 +1,5 @@ import { hash } from "../hash.js"; +import { withLock } from "../lockfile.js"; import type { IngestResult, LLMProvider, Manifest, SourceEntry, SourceType } from "../types.js"; import { appendLog, loadManifest, saveManifest, writeImageAsset, writeRaw } from "../vault.js"; import type { Extractor } from "./extractors/interface.js"; @@ -41,31 +42,27 @@ export async function ingestSource( // Get the extractor for this source type const extractor = await getExtractor(sourceType, options.provider); - // Extract content + // Extract content (doesn't touch the vault — safe outside the lock) const extracted = await extractor.extract(uri, { title: options.title, tags: options.tags }); // Hash the extracted content for dedup const contentHash = await hash(extracted.content); - // Load manifest and check for duplicates - const manifest = await loadManifest(root); - - // Check if we already have this exact content - const existingSource = findExistingSource(manifest, uri, contentHash); - if (existingSource) { - return { - sourceId: existingSource.id, - path: existingSource.path, - sourceType, - title: extracted.title, - wordCount: countWords(extracted.content), - skipped: true, - skipReason: "Duplicate content (same hash already ingested)", - }; - } - - // Dry run — return what would be ingested without writing + // Dry run — no writes needed if (options.dryRun) { + const manifest = await loadManifest(root); + const existingSource = findExistingSource(manifest, uri, contentHash); + if (existingSource) { + return { + sourceId: existingSource.id, + path: existingSource.path, + sourceType, + title: extracted.title, + wordCount: countWords(extracted.content), + skipped: true, + skipReason: "Duplicate content (same hash already ingested)", + }; + } const category = options.category ?? 
categoryForType(sourceType); const slug = slugify(extracted.title); return { @@ -78,74 +75,94 @@ export async function ingestSource( }; } - // Normalize content with frontmatter - const normalizedContent = normalizeSource({ - title: extracted.title, - content: extracted.content, - sourceType, - originalUrl: isUrl(uri) ? uri : undefined, - metadata: extracted.metadata, - }); + // Acquire vault lock for the write phase + return withLock(root, "ingest", async () => { + // Load manifest and check for duplicates + const manifest = await loadManifest(root); + + // Check if we already have this exact content + const existingSource = findExistingSource(manifest, uri, contentHash); + if (existingSource) { + return { + sourceId: existingSource.id, + path: existingSource.path, + sourceType, + title: extracted.title, + wordCount: countWords(extracted.content), + skipped: true, + skipReason: "Duplicate content (same hash already ingested)", + }; + } - // Determine file path within raw/ - const category = options.category ?? categoryForType(sourceType); - const slug = slugify(extracted.title); - const relativePath = `${category}/${slug}.md`; + // Normalize content with frontmatter + const normalizedContent = normalizeSource({ + title: extracted.title, + content: extracted.content, + sourceType, + originalUrl: isUrl(uri) ? uri : undefined, + metadata: extracted.metadata, + }); - // Write to raw/ - await writeRaw(root, relativePath, normalizedContent); + // Determine file path within raw/ + const category = options.category ?? categoryForType(sourceType); + const slug = slugify(extracted.title); + const relativePath = `${category}/${slug}.md`; - // For images, also save the original binary to wiki/images/ for article references - if (sourceType === "image" && extracted.metadata.imageBuffer) { - const ext = (extracted.metadata.fileType as string) ?? 
".png"; - const imageFilename = `${slug}${ext}`; - await writeImageAsset(root, imageFilename, extracted.metadata.imageBuffer as Buffer); - } + // Write to raw/ + await writeRaw(root, relativePath, normalizedContent); - // Generate a source ID - const sourceId = `src_${contentHash.slice(0, 12)}`; + // For images, also save the original binary to wiki/images/ for article references + if (sourceType === "image" && extracted.metadata.imageBuffer) { + const ext = (extracted.metadata.fileType as string) ?? ".png"; + const imageFilename = `${slug}${ext}`; + await writeImageAsset(root, imageFilename, extracted.metadata.imageBuffer as Buffer); + } - // Update manifest - const now = new Date().toISOString(); - const wordCount = countWords(extracted.content); + // Generate a source ID + const sourceId = `src_${contentHash.slice(0, 12)}`; - // Build metadata, including image asset path for image sources - const sourceMetadata: SourceEntry["metadata"] = { - title: extracted.title, - author: extracted.metadata.author as string | undefined, - date: extracted.metadata.date as string | undefined, - wordCount, - }; + // Update manifest + const now = new Date().toISOString(); + const wordCount = countWords(extracted.content); - if (sourceType === "image" && extracted.metadata.fileType) { - const ext = extracted.metadata.fileType as string; - sourceMetadata.imageAsset = `images/${slug}${ext}`; - } + // Build metadata, including image asset path for image sources + const sourceMetadata: SourceEntry["metadata"] = { + title: extracted.title, + author: extracted.metadata.author as string | undefined, + date: extracted.metadata.date as string | undefined, + wordCount, + }; + + if (sourceType === "image" && extracted.metadata.fileType) { + const ext = extracted.metadata.fileType as string; + sourceMetadata.imageAsset = `images/${slug}${ext}`; + } + + const sourceEntry: SourceEntry = { + hash: contentHash, + ingestedAt: now, + lastCompiled: null, + sourceType, + originalUrl: isUrl(uri) ? 
uri : undefined, + producedArticles: [], + metadata: sourceMetadata, + }; - const sourceEntry: SourceEntry = { - hash: contentHash, - ingestedAt: now, - lastCompiled: null, - sourceType, - originalUrl: isUrl(uri) ? uri : undefined, - producedArticles: [], - metadata: sourceMetadata, - }; - - manifest.sources[sourceId] = sourceEntry; - manifest.stats.totalSources = Object.keys(manifest.sources).length; - - await saveManifest(root, manifest); - await appendLog(root, "ingest", `"${extracted.title}" (${sourceType}) → raw/${relativePath}`); - - return { - sourceId, - path: `raw/${relativePath}`, - sourceType, - title: extracted.title, - wordCount, - skipped: false, - }; + manifest.sources[sourceId] = sourceEntry; + manifest.stats.totalSources = Object.keys(manifest.sources).length; + + await saveManifest(root, manifest); + await appendLog(root, "ingest", `"${extracted.title}" (${sourceType}) → raw/${relativePath}`); + + return { + sourceId, + path: `raw/${relativePath}`, + sourceType, + title: extracted.title, + wordCount, + skipped: false, + }; + }); } async function getExtractor(sourceType: SourceType, provider?: LLMProvider): Promise { diff --git a/packages/core/src/integrity.test.ts b/packages/core/src/integrity.test.ts new file mode 100644 index 0000000..39a3053 --- /dev/null +++ b/packages/core/src/integrity.test.ts @@ -0,0 +1,179 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { validateManifestIntegrity } from "./integrity.js"; +import { initVault, loadManifest, saveManifest, writeRaw, writeWiki } from "./vault.js"; + +let tempDir: string; + +afterEach(async () => { + if (tempDir) { + await rm(tempDir, { recursive: true, force: true }); + } +}); + +async function makeTempVault() { + tempDir = await mkdtemp(join(tmpdir(), "kib-integrity-test-")); + await initVault(tempDir, { name: "integrity-test" }); + return tempDir; 
+} + +describe("validateManifestIntegrity", () => { + test("returns empty for consistent empty vault", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + const issues = await validateManifestIntegrity(dir, manifest); + expect(issues).toEqual([]); + }); + + test("detects missing source file", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + // Add a source entry without a file on disk + manifest.sources.src_ghost123 = { + hash: "abc123", + ingestedAt: new Date().toISOString(), + lastCompiled: null, + sourceType: "web", + producedArticles: [], + metadata: { title: "Ghost Article", wordCount: 100 }, + }; + manifest.stats.totalSources = 1; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + expect(issues.some((i) => i.category === "missing_file" && i.severity === "error")).toBe(true); + }); + + test("detects missing article file", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + manifest.articles["ghost-article"] = { + hash: "abc123", + createdAt: new Date().toISOString(), + lastUpdated: new Date().toISOString(), + derivedFrom: [], + backlinks: [], + forwardLinks: [], + tags: [], + summary: "A ghost article", + wordCount: 100, + category: "concept", + }; + manifest.stats.totalArticles = 1; + manifest.stats.totalWords = 100; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + expect( + issues.some((i) => i.category === "missing_file" && i.message.includes("ghost-article")), + ).toBe(true); + }); + + test("detects broken source→article reference", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + // Add a source that references a non-existent article + await writeRaw(dir, "articles/real-source.md", "# Real Source"); + manifest.sources.src_real123 = { + hash: "abc123", + 
ingestedAt: new Date().toISOString(), + lastCompiled: null, + sourceType: "web", + producedArticles: ["nonexistent-article"], + metadata: { title: "Real Source", wordCount: 50 }, + }; + manifest.stats.totalSources = 1; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + expect( + issues.some( + (i) => i.category === "broken_reference" && i.message.includes("nonexistent-article"), + ), + ).toBe(true); + }); + + test("detects broken article→source reference", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + await writeWiki(dir, "concepts/real-article.md", "# Real Article"); + manifest.articles["real-article"] = { + hash: "abc123", + createdAt: new Date().toISOString(), + lastUpdated: new Date().toISOString(), + derivedFrom: ["src_nonexistent"], + backlinks: [], + forwardLinks: [], + tags: [], + summary: "Real article", + wordCount: 50, + category: "concept", + }; + manifest.stats.totalArticles = 1; + manifest.stats.totalWords = 50; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + expect( + issues.some((i) => i.category === "broken_reference" && i.message.includes("nonexistent")), + ).toBe(true); + }); + + test("detects stats mismatch", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + manifest.stats.totalSources = 5; + manifest.stats.totalArticles = 10; + manifest.stats.totalWords = 5000; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + const mismatches = issues.filter((i) => i.category === "stats_mismatch"); + expect(mismatches.length).toBe(3); + }); + + test("passes for consistent vault with files", async () => { + const dir = await makeTempVault(); + const manifest = await loadManifest(dir); + + // Source with a real file — the file path is derived from title + await writeRaw(dir, 
"articles/test-source.md", "# Test Source\n\nSome content here."); + manifest.sources.src_test123 = { + hash: "abc123", + ingestedAt: new Date().toISOString(), + lastCompiled: new Date().toISOString(), + sourceType: "web", + producedArticles: ["test-article"], + metadata: { title: "Test Source", wordCount: 5 }, + }; + + await writeWiki(dir, "concepts/test-article.md", "# Test Article\n\nCompiled content."); + manifest.articles["test-article"] = { + hash: "def456", + createdAt: new Date().toISOString(), + lastUpdated: new Date().toISOString(), + derivedFrom: ["src_test123"], + backlinks: [], + forwardLinks: [], + tags: ["test"], + summary: "A test article", + wordCount: 3, + category: "concept", + }; + + manifest.stats.totalSources = 1; + manifest.stats.totalArticles = 1; + manifest.stats.totalWords = 3; + await saveManifest(dir, manifest); + + const issues = await validateManifestIntegrity(dir, manifest); + expect(issues).toEqual([]); + }); +}); diff --git a/packages/core/src/integrity.ts b/packages/core/src/integrity.ts new file mode 100644 index 0000000..1ef5d25 --- /dev/null +++ b/packages/core/src/integrity.ts @@ -0,0 +1,155 @@ +import { existsSync } from "node:fs"; +import { readdir } from "node:fs/promises"; +import { join } from "node:path"; +import { RAW_DIR, WIKI_DIR } from "./constants.js"; +import type { Manifest } from "./types.js"; + +export interface IntegrityIssue { + severity: "error" | "warning"; + category: "missing_file" | "orphan_file" | "stats_mismatch" | "broken_reference"; + message: string; + path?: string; +} + +/** + * Validate manifest integrity against the actual filesystem state. + * Checks that all referenced files exist, stats are correct, and + * there are no orphaned entries. 
/**
 * Validate manifest integrity against the actual filesystem state.
 *
 * Checks, in this order:
 *  1. each source entry has a raw file on disk (error) and every article it
 *     lists in `producedArticles` is present in the manifest (warning);
 *  2. each article entry has a `<slug>.md` file in some wiki category
 *     directory (error) and every id in `derivedFrom` is present in the
 *     manifest (warning);
 *  3. the cached `stats` counters agree with the manifest contents (warning).
 *
 * The expected raw path is re-derived from the source's title and type via
 * `deriveRawPath`; it is not read from the manifest entry.
 *
 * @param root - Vault root directory.
 * @param manifest - Manifest to check. It is only read, never modified.
 * @returns Every issue found, in the order above; an empty array when the
 *   manifest is consistent.
 */
export async function validateManifestIntegrity(
  root: string,
  manifest: Manifest,
): Promise<IntegrityIssue[]> {
  const issues: IntegrityIssue[] = [];

  // Sources: raw file on disk + forward references into `articles`.
  for (const [sourceId, source] of Object.entries(manifest.sources)) {
    const rawPath = deriveRawPath(source.metadata.title ?? sourceId, source.sourceType);
    const sourcePath = join(root, RAW_DIR, rawPath);
    if (!existsSync(sourcePath)) {
      issues.push({
        severity: "error",
        category: "missing_file",
        message: `Source file missing from disk: ${sourceId} (expected ${rawPath})`,
        path: sourcePath,
      });
    }

    for (const articleSlug of source.producedArticles) {
      // Own-property check: a slug such as "constructor" must not be
      // satisfied by Object.prototype.
      if (!hasOwnEntry(manifest.articles, articleSlug)) {
        issues.push({
          severity: "warning",
          category: "broken_reference",
          message: `Source "${sourceId}" references article "${articleSlug}" which doesn't exist in manifest`,
          path: sourceId,
        });
      }
    }
  }

  // Articles: file on disk + back references into `sources`.
  // The category directories are listed once instead of once per article.
  const categoryDirs = await listWikiCategoryDirs(root);
  for (const [slug, article] of Object.entries(manifest.articles)) {
    const found = await findArticleFile(root, slug, categoryDirs);
    if (!found) {
      issues.push({
        severity: "error",
        category: "missing_file",
        message: `Article file missing from disk: ${slug}`,
        path: slug,
      });
    }

    for (const sourceId of article.derivedFrom) {
      if (!hasOwnEntry(manifest.sources, sourceId)) {
        issues.push({
          severity: "warning",
          category: "broken_reference",
          message: `Article "${slug}" references source "${sourceId}" which doesn't exist in manifest`,
          path: slug,
        });
      }
    }
  }

  // Stats: cached counters must match what the manifest actually contains.
  const actualSourceCount = Object.keys(manifest.sources).length;
  const actualArticleCount = Object.keys(manifest.articles).length;
  const actualWordCount = Object.values(manifest.articles).reduce((sum, a) => sum + a.wordCount, 0);

  if (manifest.stats.totalSources !== actualSourceCount) {
    issues.push({
      severity: "warning",
      category: "stats_mismatch",
      message: `Stats say ${manifest.stats.totalSources} sources, but manifest has ${actualSourceCount}`,
    });
  }

  if (manifest.stats.totalArticles !== actualArticleCount) {
    issues.push({
      severity: "warning",
      category: "stats_mismatch",
      message: `Stats say ${manifest.stats.totalArticles} articles, but manifest has ${actualArticleCount}`,
    });
  }

  if (manifest.stats.totalWords !== actualWordCount) {
    issues.push({
      severity: "warning",
      category: "stats_mismatch",
      message: `Stats say ${manifest.stats.totalWords} words, but article entries sum to ${actualWordCount}`,
    });
  }

  return issues;
}

// ─── Helpers ────────────────────────────────────────────────────

/** Raw-directory category per source type; any type not listed maps to "articles". */
const RAW_CATEGORY_BY_TYPE: ReadonlyMap<string, string> = new Map([
  ["pdf", "papers"],
  ["youtube", "transcripts"],
  ["github", "repos"],
  ["image", "images"],
]);

/** True when `key` is an own property of `record` with a truthy value. */
function hasOwnEntry(record: Readonly<Record<string, unknown>>, key: string): boolean {
  return Object.prototype.hasOwnProperty.call(record, key) && Boolean(record[key]);
}

/**
 * Derive the raw file path (relative to the raw directory) for a source.
 *
 * The slug is the lower-cased title with everything except `a-z`, `0-9`,
 * whitespace and `-` removed, whitespace runs turned into `-`, repeated `-`
 * collapsed, one leading/trailing `-` stripped, then cut to 80 characters.
 *
 * NOTE(review): this is meant to mirror ingest's naming; ingest's own
 * slug/category helpers are not visible here, so keep the two in sync.
 *
 * @param title - Source title (or the source id when no title is stored).
 * @param sourceType - Source type; unknown types fall back to "articles".
 * @returns `<category>/<slug>.md`.
 */
function deriveRawPath(title: string, sourceType: string): string {
  const slug = title
    .toLowerCase()
    .replace(/[^a-z0-9\s-]/g, "")
    .replace(/\s+/g, "-")
    .replace(/-+/g, "-")
    .replace(/^-|-$/g, "")
    .slice(0, 80);

  const category = RAW_CATEGORY_BY_TYPE.get(sourceType) ?? "articles";
  return `${category}/${slug}.md`;
}

/**
 * List the category subdirectories of the wiki directory as full paths.
 * Returns an empty list when the wiki directory cannot be read (for example
 * because it does not exist yet).
 */
async function listWikiCategoryDirs(root: string): Promise<string[]> {
  const wikiDir = join(root, WIKI_DIR);
  try {
    const entries = await readdir(wikiDir, { withFileTypes: true });
    return entries.filter((entry) => entry.isDirectory()).map((entry) => join(wikiDir, entry.name));
  } catch {
    // Best effort: an unreadable or missing wiki dir simply has no articles.
    return [];
  }
}

/**
 * Find an article file by slug. Articles can be in any category subdirectory.
 *
 * @param root - Vault root directory.
 * @param slug - Article slug; the file looked for is `<slug>.md`.
 * @param categoryDirs - Optional pre-listed category directories (full
 *   paths). When omitted the wiki directory is listed on this call.
 * @returns Full path of the first match, or `null` when no category
 *   directory contains the file.
 */
async function findArticleFile(
  root: string,
  slug: string,
  categoryDirs?: readonly string[],
): Promise<string | null> {
  const dirs = categoryDirs ?? (await listWikiCategoryDirs(root));
  const filename = `${slug}.md`;

  for (const dir of dirs) {
    const filePath = join(dir, filename);
    if (existsSync(filePath)) return filePath;
  }

  return null;
}
{ + const match = d.message.match(/^"([^"]+)"/); + if (!match) continue; - const slug = match[1]!; - const title = slug - .split("-") - .map((w) => w.charAt(0).toUpperCase() + w.slice(1)) - .join(" "); + const slug = match[1]!; + const title = slug + .split("-") + .map((w) => w.charAt(0).toUpperCase() + w.slice(1)) + .join(" "); - const stub = `--- + const stub = `--- title: "${title}" slug: "${slug}" category: topic @@ -104,51 +110,55 @@ sources: [] *This article was auto-generated by \`kib lint --fix\`. Add content by ingesting sources about this topic and running \`kib compile\`.* `; - try { - await writeWiki(root, `topics/${slug}.md`, stub); - const now = new Date().toISOString(); - manifest.articles[slug] = { - hash: "", - createdAt: now, - lastUpdated: now, - derivedFrom: [], - backlinks: [], - forwardLinks: [], - tags: [], - summary: "", - wordCount: 0, - category: "topic", - }; - result.fixed++; - } catch (err) { - result.errors.push(`Failed to create ${slug}: ${(err as Error).message}`); + try { + await writeWiki(root, `topics/${slug}.md`, stub); + const now = new Date().toISOString(); + manifest.articles[slug] = { + hash: "", + createdAt: now, + lastUpdated: now, + derivedFrom: [], + backlinks: [], + forwardLinks: [], + tags: [], + summary: "", + wordCount: 0, + category: "topic", + }; + result.fixed++; + } catch (err) { + result.errors.push(`Failed to create ${slug}: ${(err as Error).message}`); + } } - } - manifest.stats.totalArticles = Object.keys(manifest.articles).length; - await saveManifest(root, manifest); - } + manifest.stats.totalArticles = Object.keys(manifest.articles).length; + await saveManifest(root, manifest); + } - // Fix stale: recompile pending sources - if (staleIssues.length > 0) { - if (!provider || !config) { - result.skipped += staleIssues.length; - result.errors.push("Skipped stale fixes — LLM provider required. 
Set an API key and retry."); - } else { - try { - const { compileVault } = await import("../compile/compiler.js"); - await compileVault(root, provider, config, { force: false }); - result.fixed += staleIssues.length; - } catch (err) { + // Fix stale: recompile pending sources + if (staleIssues.length > 0) { + if (!provider || !config) { result.skipped += staleIssues.length; - result.errors.push(`Compile failed: ${(err as Error).message}`); + result.errors.push( + "Skipped stale fixes — LLM provider required. Set an API key and retry.", + ); + } else { + try { + const { compileVault } = await import("../compile/compiler.js"); + // Lock is re-entrant — compileVault will share our lock + await compileVault(root, provider, config, { force: false }); + result.fixed += staleIssues.length; + } catch (err) { + result.skipped += staleIssues.length; + result.errors.push(`Compile failed: ${(err as Error).message}`); + } } } - } - if (result.fixed > 0) { - await appendLog(root, "lint-fix", `${result.fixed} issues fixed`); - } + if (result.fixed > 0) { + await appendLog(root, "lint-fix", `${result.fixed} issues fixed`); + } - return result; + return result; + }); } diff --git a/packages/core/src/lockfile.test.ts b/packages/core/src/lockfile.test.ts new file mode 100644 index 0000000..5cf616e --- /dev/null +++ b/packages/core/src/lockfile.test.ts @@ -0,0 +1,209 @@ +import { afterEach, describe, expect, test } from "bun:test"; +import { existsSync } from "node:fs"; +import { mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { acquireLock, isLocked, releaseLock, withLock } from "./lockfile.js"; +import { initVault } from "./vault.js"; + +let tempDir: string; + +afterEach(async () => { + if (tempDir) { + await rm(tempDir, { recursive: true, force: true }); + } +}); + +async function makeTempVault() { + tempDir = await mkdtemp(join(tmpdir(), "kib-lock-test-")); + await initVault(tempDir, { name: 
"lock-test" }); + return tempDir; +} + +describe("acquireLock", () => { + test("creates a lock file", async () => { + const dir = await makeTempVault(); + await acquireLock(dir, "test"); + + const lockPath = join(dir, ".kb", "vault.lock"); + expect(existsSync(lockPath)).toBe(true); + + const info = JSON.parse(await readFile(lockPath, "utf-8")); + expect(info.pid).toBe(process.pid); + expect(info.operation).toBe("test"); + + await releaseLock(dir); + }); + + test("is re-entrant for same process", async () => { + const dir = await makeTempVault(); + await acquireLock(dir, "first"); + + // Same process — should succeed (re-entrant) + await acquireLock(dir, "second"); + + // First release just decrements depth + await releaseLock(dir); + const lockPath = join(dir, ".kb", "vault.lock"); + expect(existsSync(lockPath)).toBe(true); // Still locked (depth > 0) + + // Second release actually removes the lock + await releaseLock(dir); + expect(existsSync(lockPath)).toBe(false); + }); + + test("steals lock from dead process", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + // Write a lock with a PID that definitely doesn't exist + await writeFile( + lockPath, + JSON.stringify({ pid: 999999999, timestamp: new Date().toISOString(), operation: "dead" }), + ); + + // Should succeed by stealing the stale lock + await acquireLock(dir, "steal"); + const info = JSON.parse(await readFile(lockPath, "utf-8")); + expect(info.pid).toBe(process.pid); + expect(info.operation).toBe("steal"); + + await releaseLock(dir); + }); + + test("steals lock older than threshold", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + // Write a lock with an old timestamp from current process + const oldTime = new Date(Date.now() - 10 * 60 * 1000).toISOString(); // 10 min ago + await writeFile( + lockPath, + JSON.stringify({ pid: process.pid, timestamp: oldTime, operation: "old" }), + ); + + // 
Should succeed because lock is stale + await acquireLock(dir, "new"); + const info = JSON.parse(await readFile(lockPath, "utf-8")); + expect(info.operation).toBe("new"); + + await releaseLock(dir); + }); +}); + +describe("releaseLock", () => { + test("removes the lock file", async () => { + const dir = await makeTempVault(); + await acquireLock(dir, "test"); + + const lockPath = join(dir, ".kb", "vault.lock"); + expect(existsSync(lockPath)).toBe(true); + + await releaseLock(dir); + expect(existsSync(lockPath)).toBe(false); + }); + + test("does nothing when no lock exists", async () => { + const dir = await makeTempVault(); + await releaseLock(dir); // Should not throw + }); + + test("does not remove lock from another process", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + await writeFile( + lockPath, + JSON.stringify({ pid: 999999999, timestamp: new Date().toISOString(), operation: "other" }), + ); + + await releaseLock(dir); + // Lock should still be there — it belongs to another process + expect(existsSync(lockPath)).toBe(true); + }); +}); + +describe("isLocked", () => { + test("returns false when no lock", async () => { + const dir = await makeTempVault(); + const result = await isLocked(dir); + expect(result.locked).toBe(false); + }); + + test("returns true when locked by live process", async () => { + const dir = await makeTempVault(); + await acquireLock(dir, "check"); + + const result = await isLocked(dir); + expect(result.locked).toBe(true); + expect(result.info?.operation).toBe("check"); + + await releaseLock(dir); + }); + + test("returns false for stale lock", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + await writeFile( + lockPath, + JSON.stringify({ pid: 999999999, timestamp: new Date().toISOString(), operation: "dead" }), + ); + + const result = await isLocked(dir); + expect(result.locked).toBe(false); + 
expect(result.info).toBeDefined(); + }); +}); + +describe("withLock", () => { + test("acquires and releases lock around function", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + const result = await withLock(dir, "wrapped", async () => { + expect(existsSync(lockPath)).toBe(true); + return 42; + }); + + expect(result).toBe(42); + expect(existsSync(lockPath)).toBe(false); + }); + + test("releases lock even on error", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + try { + await withLock(dir, "error", async () => { + expect(existsSync(lockPath)).toBe(true); + throw new Error("boom"); + }); + } catch (err) { + expect((err as Error).message).toBe("boom"); + } + + expect(existsSync(lockPath)).toBe(false); + }); + + test("supports nested withLock (re-entrant)", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + + const result = await withLock(dir, "outer", async () => { + expect(existsSync(lockPath)).toBe(true); + + const inner = await withLock(dir, "inner", async () => { + expect(existsSync(lockPath)).toBe(true); + return "inner-result"; + }); + + // Lock should still be held after inner withLock releases + expect(existsSync(lockPath)).toBe(true); + return `outer-${inner}`; + }); + + expect(result).toBe("outer-inner-result"); + expect(existsSync(lockPath)).toBe(false); + }); +}); diff --git a/packages/core/src/lockfile.ts b/packages/core/src/lockfile.ts new file mode 100644 index 0000000..32e7b7a --- /dev/null +++ b/packages/core/src/lockfile.ts @@ -0,0 +1,147 @@ +import { mkdir, readFile, unlink, writeFile } from "node:fs/promises"; +import { dirname, join } from "node:path"; +import { VAULT_DIR } from "./constants.js"; + +const LOCK_FILE = "vault.lock"; +const STALE_THRESHOLD_MS = 5 * 60 * 1000; // 5 minutes + +// Re-entrancy counter: tracks nested withLock calls within the same process +let lockDepth 
/**
 * Re-entrancy depth of the lock held by THIS process, keyed by lock file path.
 * Keyed per vault so that nesting on one vault is not disturbed by locking a
 * different vault in the same process. The key is the `join`-normalised lock
 * path, so the same vault must be addressed consistently (relative vs.
 * absolute roots are distinct keys).
 */
const lockDepths = new Map<string, number>();

/** Upper bound on create/inspect rounds in `acquireLock` before giving up. */
const MAX_ACQUIRE_ATTEMPTS = 5;

/** JSON payload stored in the lock file. */
interface LockInfo {
  pid: number;
  timestamp: string;
  operation: string;
}

/** Thrown by `acquireLock` when a different, live holder owns a fresh lock. */
export class VaultLockError extends Error {
  constructor(public readonly lockInfo: LockInfo) {
    super(
      `Vault is locked by process ${lockInfo.pid} (${lockInfo.operation}, started ${lockInfo.timestamp}). ` +
        "If this is stale, remove .kb/vault.lock manually.",
    );
    this.name = "VaultLockError";
  }
}

/** Full path of the lock file for the vault at `root`. */
function lockPath(root: string): string {
  return join(root, VAULT_DIR, LOCK_FILE);
}

/** Extract a string `code` property from a thrown value, if it has one. */
function errnoCode(err: unknown): string | undefined {
  if (typeof err !== "object" || err === null || !("code" in err)) return undefined;
  const code = (err as { code?: unknown }).code;
  return typeof code === "string" ? code : undefined;
}

/**
 * Probe whether a process exists by sending signal 0.
 * `EPERM` means the process exists but belongs to someone we may not signal,
 * so it counts as alive; every other failure counts as not alive.
 */
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return errnoCode(err) === "EPERM";
  }
}

/** Runtime shape check for parsed lock file contents. */
function isLockInfo(value: unknown): value is LockInfo {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as Record<string, unknown>;
  return (
    typeof candidate.pid === "number" &&
    typeof candidate.timestamp === "string" &&
    typeof candidate.operation === "string"
  );
}

/**
 * Read and parse the lock file.
 * @returns The lock info, or `null` when the file is missing, unreadable,
 *   not valid JSON, or not shaped like a `LockInfo`.
 */
async function readLock(root: string): Promise<LockInfo | null> {
  try {
    const parsed: unknown = JSON.parse(await readFile(lockPath(root), "utf-8"));
    return isLockInfo(parsed) ? parsed : null;
  } catch {
    return null;
  }
}

/**
 * A lock is live when it is younger than the stale threshold AND its owner
 * process still exists. An unparseable timestamp yields a NaN age, which
 * fails the comparison and therefore counts as stale.
 */
function isLive(info: LockInfo): boolean {
  const age = Date.now() - new Date(info.timestamp).getTime();
  return age < STALE_THRESHOLD_MS && isProcessAlive(info.pid);
}

/** Delete the lock file; a file that is already gone is not an error. */
async function removeLockFile(path: string): Promise<void> {
  try {
    await unlink(path);
  } catch (err) {
    if (errnoCode(err) !== "ENOENT") {
      throw err;
    }
  }
}

/**
 * Acquire an exclusive lock on the vault.
 *
 * The lock file is created with the exclusive `wx` flag, so when no lock
 * exists only one contender can create it. If the file already exists:
 *  - held by this process (depth > 0 and same pid): re-entrant, depth + 1;
 *  - live lock of another holder: throws `VaultLockError`;
 *  - stale lock (dead pid or older than the threshold): removed and retried;
 *  - unreadable contents: retried once (it may be mid-write), then removed.
 *
 * Stealing a stale lock is still remove-then-create and therefore not atomic
 * between two processes that both decide to steal at the same moment.
 *
 * @param root - Vault root directory.
 * @param operation - Label stored in the lock file for diagnostics.
 * @throws VaultLockError when another live holder owns a fresh lock.
 * @throws Error when the lock cannot be created within the attempt budget,
 *   or on any filesystem error other than the file already existing.
 */
export async function acquireLock(root: string, operation = "unknown"): Promise<void> {
  const path = lockPath(root);
  const info: LockInfo = {
    pid: process.pid,
    timestamp: new Date().toISOString(),
    operation,
  };
  const payload = JSON.stringify(info, null, 2);

  await mkdir(dirname(path), { recursive: true });

  for (let attempt = 0; attempt < MAX_ACQUIRE_ATTEMPTS; attempt++) {
    try {
      await writeFile(path, payload, { encoding: "utf-8", flag: "wx" });
      lockDepths.set(path, 1);
      return;
    } catch (err) {
      if (errnoCode(err) !== "EEXIST") throw err;
    }

    const existing = await readLock(root);
    if (existing === null) {
      // Vanished or unreadable. Give a concurrent writer one round to finish
      // before treating the file as corrupt and removing it.
      if (attempt > 0) await removeLockFile(path);
      continue;
    }

    const depth = lockDepths.get(path) ?? 0;
    if (existing.pid === process.pid && depth > 0) {
      // Re-entrant regardless of age: we know we are still holding it.
      lockDepths.set(path, depth + 1);
      return;
    }

    if (isLive(existing)) {
      throw new VaultLockError(existing);
    }

    // Stale lock — steal it on the next round.
    await removeLockFile(path);
  }

  throw new Error(
    `Could not acquire vault lock at ${path} after ${MAX_ACQUIRE_ATTEMPTS} attempts`,
  );
}

/**
 * Release the vault lock. Only removes the lock if it belongs to the current
 * process (or is missing/unreadable). Re-entrant: nested holders only
 * decrement the depth; the file is removed by the outermost release.
 *
 * @param root - Vault root directory.
 */
export async function releaseLock(root: string): Promise<void> {
  const path = lockPath(root);
  const existing = await readLock(root);
  if (existing && existing.pid !== process.pid) {
    return; // Not our lock
  }

  const depth = lockDepths.get(path) ?? 0;
  if (depth > 1) {
    lockDepths.set(path, depth - 1);
    return;
  }

  lockDepths.delete(path);
  await removeLockFile(path);
}

/**
 * Check if the vault is currently locked.
 *
 * @returns `locked: true` with the lock info for a live lock;
 *   `locked: false` with the info for a stale lock; `locked: false` without
 *   info when there is no readable lock file.
 */
export async function isLocked(root: string): Promise<{ locked: boolean; info?: LockInfo }> {
  const info = await readLock(root);
  if (!info) return { locked: false };
  return { locked: isLive(info), info };
}

/**
 * Run a function while holding the vault lock.
 * The lock is released in a `finally`, so it is released whether `fn`
 * resolves or rejects.
 *
 * @param root - Vault root directory.
 * @param operation - Label stored in the lock file for diagnostics.
 * @param fn - Work to perform while the lock is held.
 * @returns Whatever `fn` resolves to.
 */
export async function withLock<T>(
  root: string,
  operation: string,
  fn: () => Promise<T>,
): Promise<T> {
  await acquireLock(root, operation);
  try {
    return await fn();
  } finally {
    await releaseLock(root);
  }
}
"test.md.tmp"), "partial"); + + const issues = await detectIssues(dir); + expect(issues.some((i) => i.type === "tmp_file")).toBe(true); + }); + + test("detects missing manifest with tmp present", async () => { + const dir = await makeTempVault(); + const manifestPath = join(dir, ".kb", "manifest.json"); + const content = await readFile(manifestPath, "utf-8"); + + // Simulate interrupted write: tmp exists but manifest is gone + await writeFile(`${manifestPath}.tmp`, content); + await rm(manifestPath); + + const issues = await detectIssues(dir); + expect(issues.some((i) => i.type === "missing_manifest")).toBe(true); + }); + + test("detects corrupt manifest", async () => { + const dir = await makeTempVault(); + const manifestPath = join(dir, ".kb", "manifest.json"); + await writeFile(manifestPath, "not valid json {{{"); + + const issues = await detectIssues(dir); + expect(issues.some((i) => i.type === "corrupt_manifest")).toBe(true); + }); + + test("detects stale lock from dead process", async () => { + const dir = await makeTempVault(); + await writeFile( + join(dir, ".kb", "vault.lock"), + JSON.stringify({ pid: 999999999, timestamp: new Date().toISOString(), operation: "dead" }), + ); + + const issues = await detectIssues(dir); + expect(issues.some((i) => i.type === "stale_lock")).toBe(true); + }); +}); + +describe("repairVault", () => { + test("removes stale tmp files", async () => { + const dir = await makeTempVault(); + const tmpPath = join(dir, ".kb", "config.toml.tmp"); + await writeFile(tmpPath, "partial"); + + const issues = await repairVault(dir); + const tmpIssue = issues.find((i) => i.type === "tmp_file"); + expect(tmpIssue?.repaired).toBe(true); + expect(existsSync(tmpPath)).toBe(false); + }); + + test("promotes tmp to manifest when manifest is missing", async () => { + const dir = await makeTempVault(); + const manifestPath = join(dir, ".kb", "manifest.json"); + const content = await readFile(manifestPath, "utf-8"); + + await 
writeFile(`${manifestPath}.tmp`, content); + await rm(manifestPath); + + const issues = await repairVault(dir); + const missingIssue = issues.find((i) => i.type === "missing_manifest"); + expect(missingIssue?.repaired).toBe(true); + expect(existsSync(manifestPath)).toBe(true); + + const restored = await readFile(manifestPath, "utf-8"); + expect(JSON.parse(restored)).toEqual(JSON.parse(content)); + }); + + test("removes stale lock", async () => { + const dir = await makeTempVault(); + const lockPath = join(dir, ".kb", "vault.lock"); + await writeFile( + lockPath, + JSON.stringify({ pid: 999999999, timestamp: new Date().toISOString(), operation: "dead" }), + ); + + const issues = await repairVault(dir); + const lockIssue = issues.find((i) => i.type === "stale_lock"); + expect(lockIssue?.repaired).toBe(true); + expect(existsSync(lockPath)).toBe(false); + }); + + test("restores corrupt manifest from backup", async () => { + const dir = await makeTempVault(); + const manifestPath = join(dir, ".kb", "manifest.json"); + const goodManifest = await readFile(manifestPath, "utf-8"); + + // Create a backup + const backupsDir = join(dir, ".kb", "backups"); + await mkdir(backupsDir, { recursive: true }); + await writeFile(join(backupsDir, "manifest-2024-01-01T00-00-00-000Z.json"), goodManifest); + + // Corrupt the manifest + await writeFile(manifestPath, "corrupted {{{"); + + const issues = await repairVault(dir); + const corruptIssue = issues.find((i) => i.type === "corrupt_manifest"); + expect(corruptIssue?.repaired).toBe(true); + + const restored = await readFile(manifestPath, "utf-8"); + expect(JSON.parse(restored)).toEqual(JSON.parse(goodManifest)); + }); + + test("returns unrepaired for corrupt manifest with no backup", async () => { + const dir = await makeTempVault(); + const manifestPath = join(dir, ".kb", "manifest.json"); + await writeFile(manifestPath, "corrupted {{{"); + + const issues = await repairVault(dir); + const corruptIssue = issues.find((i) => i.type === 
/**
 * Scan a vault for leftovers of interrupted writes and other recoverable damage.
 *
 * Checks run in this order, and the returned array preserves it:
 *  1. `missing_manifest` — the manifest file is absent while `<manifest>.tmp` exists.
 *  2. `tmp_file` — `*.tmp` entries directly inside the vault directory (the manifest's
 *     `.tmp` is skipped when it was already reported under 1), then recursively
 *     under the raw and wiki directories when those exist.
 *  3. `corrupt_manifest` — the manifest exists but cannot be read or parsed as JSON.
 *  4. `stale_lock` — `vault.lock` exists but is unreadable, carries no usable PID,
 *     or names a PID for which no process exists.
 *
 * Nothing on disk is modified; every issue is returned with `repaired: false`.
 *
 * @param root - Vault root directory.
 * @returns The detected issues (empty when none were found).
 */
export async function detectIssues(root: string): Promise<RecoveryIssue[]> {
  const issues: RecoveryIssue[] = [];
  const kbDir = join(root, VAULT_DIR);

  // Check manifest existence first — a .tmp next to a missing manifest is handled
  // separately and must not also be flagged as a generic tmp_file.
  const manifestPath = join(kbDir, MANIFEST_FILE);
  const manifestTmp = `${manifestPath}.tmp`;
  const manifestMissing = !existsSync(manifestPath) && existsSync(manifestTmp);

  if (manifestMissing) {
    issues.push({
      type: "missing_manifest",
      path: manifestTmp,
      message: "Manifest is missing but a .tmp file exists — likely an interrupted write",
      repaired: false,
    });
  }

  // .tmp files directly in the vault directory (skip manifest.tmp if already flagged above)
  await scanTmpFiles(kbDir, issues, manifestMissing ? manifestTmp : undefined);

  // .tmp files anywhere under raw/ and wiki/
  const rawDir = join(root, RAW_DIR);
  const wikiDir = join(root, WIKI_DIR);
  if (existsSync(rawDir)) await scanTmpFilesRecursive(rawDir, issues);
  if (existsSync(wikiDir)) await scanTmpFilesRecursive(wikiDir, issues);

  // Manifest must at least be parseable JSON
  if (existsSync(manifestPath)) {
    try {
      JSON.parse(await readFile(manifestPath, "utf-8"));
    } catch {
      issues.push({
        type: "corrupt_manifest",
        path: manifestPath,
        message: "Manifest file contains invalid JSON",
        repaired: false,
      });
    }
  }

  // Lock file left behind by a process that no longer exists
  const lockPath = join(kbDir, "vault.lock");
  if (existsSync(lockPath)) {
    const pid = await readLockPid(lockPath);
    if (pid === undefined) {
      issues.push({
        type: "stale_lock",
        path: lockPath,
        message: "Lock file is corrupt or unreadable",
        repaired: false,
      });
    } else if (!isProcessAlive(pid)) {
      issues.push({
        type: "stale_lock",
        path: lockPath,
        message: `Stale lock from dead process ${pid}`,
        repaired: false,
      });
    }
  }

  return issues;
}

/**
 * Read the owning PID out of a lock file.
 *
 * @returns The PID when the file parses as a JSON object whose `pid` is a positive
 *   integer; `undefined` when the file cannot be read, is not valid JSON, or holds
 *   any other `pid` value.
 */
async function readLockPid(lockPath: string): Promise<number | undefined> {
  try {
    const parsed: unknown = JSON.parse(await readFile(lockPath, "utf-8"));
    if (typeof parsed !== "object" || parsed === null) return undefined;
    const pid = (parsed as { pid?: unknown }).pid;
    // Only positive integers identify a single process: 0 and negative values address
    // process groups when passed to kill, so they would always look "alive".
    return typeof pid === "number" && Number.isInteger(pid) && pid > 0 ? pid : undefined;
  } catch {
    return undefined;
  }
}

/**
 * Report whether a process with the given PID exists, probing with signal 0
 * (an existence/permission check that delivers no signal).
 */
function isProcessAlive(pid: number): boolean {
  try {
    process.kill(pid, 0);
    return true;
  } catch (err: unknown) {
    // EPERM: the process exists but we are not allowed to signal it — still alive.
    // Anything else (notably ESRCH) is treated as "no such process".
    const code = err instanceof Error ? (err as Error & { code?: unknown }).code : undefined;
    return code === "EPERM";
  }
}
/**
 * Detect issues with {@link detectIssues} and attempt to repair each one in place.
 *
 * - `tmp_file`: the `.tmp` is deleted (its target was never replaced, so it is intact).
 * - `missing_manifest`: the `.tmp` is promoted to the manifest, but only if it parses
 *   as JSON; otherwise the manifest is restored from a backup when one is usable.
 * - `stale_lock`: the lock file is deleted.
 * - `corrupt_manifest`: the manifest is restored from the newest backup that parses
 *   as JSON (older backups are tried when newer ones are unusable).
 *
 * Individual repair failures are not thrown; the affected issue is simply returned
 * with `repaired: false`.
 *
 * @param root - Vault root directory.
 * @returns Every detected issue, with `repaired` set for those that were fixed and
 *   `message` extended when a backup was used.
 */
export async function repairVault(root: string): Promise<RecoveryIssue[]> {
  const issues = await detectIssues(root);

  for (const issue of issues) {
    switch (issue.type) {
      case "tmp_file": {
        // .tmp exists alongside the real file — interrupted atomic write.
        // The real file is intact (the rename never happened), so drop the .tmp.
        try {
          await unlink(issue.path);
          issue.repaired = true;
        } catch {
          // Could not remove — leave it and report as unrepaired
        }
        break;
      }

      case "missing_manifest": {
        // The .tmp is the only copy. Promote it only when it is complete enough to
        // parse; promoting a truncated write would just create a corrupt manifest.
        const target = issue.path.replace(/\.tmp$/, "");
        try {
          JSON.parse(await readFile(issue.path, "utf-8"));
          await rename(issue.path, target);
          issue.repaired = true;
        } catch {
          const restoredFrom = await restoreManifestFromBackup(root);
          if (restoredFrom !== undefined) {
            issue.repaired = true;
            issue.message += ` — temporary file was unusable; restored from backup ${restoredFrom}`;
          }
        }
        break;
      }

      case "stale_lock": {
        try {
          await unlink(issue.path);
          issue.repaired = true;
        } catch {
          // Could not remove
        }
        break;
      }

      case "corrupt_manifest": {
        const restoredFrom = await restoreManifestFromBackup(root);
        if (restoredFrom !== undefined) {
          issue.repaired = true;
          issue.message += ` — restored from backup ${restoredFrom}`;
        }
        break;
      }
    }
  }

  return issues;
}

/**
 * Replace the manifest with the newest usable backup from the vault's `backups` directory.
 *
 * Candidates are files named `manifest-*.json`, tried in descending name order
 * (the names embed a timestamp, so this is newest first — assumes the
 * `manifest-<ISO timestamp>.json` naming seen in the accompanying tests).
 * A candidate is usable when it can be read and parses as JSON. The content is
 * written to `<manifest>.tmp` and then renamed over the manifest.
 *
 * @returns The file name of the backup that was restored, or `undefined` when no
 *   backup could be restored.
 */
async function restoreManifestFromBackup(root: string): Promise<string | undefined> {
  const backupsDir = join(root, VAULT_DIR, "backups");

  let entries: string[];
  try {
    entries = await readdir(backupsDir);
  } catch {
    // No backups directory (or it cannot be listed) — nothing to restore from
    return undefined;
  }

  const candidates = entries
    .filter((name) => name.startsWith("manifest-") && name.endsWith(".json"))
    .sort()
    .reverse();

  const manifestPath = join(root, VAULT_DIR, MANIFEST_FILE);
  const tmp = `${manifestPath}.tmp`;
  // Local import: writeFile is not among this file's top-level fs imports.
  const { writeFile } = await import("node:fs/promises");

  for (const name of candidates) {
    try {
      const backup = await readFile(join(backupsDir, name), "utf-8");
      JSON.parse(backup); // Verify it's valid before touching the manifest
      await writeFile(tmp, backup, "utf-8");
      await rename(tmp, manifestPath);
      return name;
    } catch {
      // This backup is corrupt or unreadable — fall through to the next older one
    }
  }

  return undefined;
}
/** Build the `tmp_file` issue record for a leftover temporary file. */
function leftoverTmpIssue(fullPath: string, entryName: string): RecoveryIssue {
  return {
    type: "tmp_file",
    path: fullPath,
    message: `Leftover temporary file: ${entryName}`,
    repaired: false,
  };
}

/**
 * Append a `tmp_file` issue for every entry directly inside `dir` whose name ends
 * in `.tmp`. Subdirectories are not descended into.
 *
 * @param dir - Directory to list.
 * @param issues - Array the issues are appended to (mutated in place).
 * @param excludePath - Full path of one entry to skip, compared against
 *   `join(dir, entry)`.
 */
async function scanTmpFiles(
  dir: string,
  issues: RecoveryIssue[],
  excludePath?: string,
): Promise<void> {
  try {
    for (const entry of await readdir(dir)) {
      if (!entry.endsWith(".tmp")) continue;
      const fullPath = join(dir, entry);
      if (excludePath && fullPath === excludePath) continue;
      issues.push(leftoverTmpIssue(fullPath, entry));
    }
  } catch {
    // Best effort: the directory might not exist
  }
}

/**
 * Append a `tmp_file` issue for every non-directory entry under `dir`, at any depth,
 * whose name ends in `.tmp`. Directories are descended into rather than reported,
 * whatever their name.
 *
 * @param dir - Directory tree to walk.
 * @param issues - Array the issues are appended to (mutated in place).
 */
async function scanTmpFilesRecursive(dir: string, issues: RecoveryIssue[]): Promise<void> {
  try {
    const entries = await readdir(dir, { withFileTypes: true });
    for (const entry of entries) {
      const fullPath = join(dir, entry.name);
      if (entry.isDirectory()) {
        await scanTmpFilesRecursive(fullPath, issues);
      } else if (entry.name.endsWith(".tmp")) {
        issues.push(leftoverTmpIssue(fullPath, entry.name));
      }
    }
  } catch {
    // Best effort: the directory might not exist
  }
}