diff --git a/ROADMAP.md b/ROADMAP.md index 788ebd1..b8f331c 100644 --- a/ROADMAP.md +++ b/ROADMAP.md @@ -195,10 +195,10 @@ kib should silently learn from everything you read without you thinking about it ### Instant Value Without Compile Most of kib's value is locked behind `kib compile`. That's wrong — value should be immediate on ingest. -- [ ] Search + query over raw sources directly (no compile required) -- [ ] Compile becomes an optional enrichment step, not a prerequisite -- [ ] Incremental indexing: search index updates on ingest, not compile -- [ ] "Ask about this source" — query a single raw source without compiling the whole vault +- [x] Search + query over raw sources directly (no compile required) +- [x] Compile becomes an optional enrichment step, not a prerequisite +- [x] Incremental indexing: search index updates on ingest, not compile +- [x] "Ask about this source" — query a single raw source without compiling the whole vault ### Beyond CLI CLI-only means developer-only. The knowledge is valuable to everyone. diff --git a/packages/cli/src/commands/ingest.ts b/packages/cli/src/commands/ingest.ts index b1e340b..03c919e 100644 --- a/packages/cli/src/commands/ingest.ts +++ b/packages/cli/src/commands/ingest.ts @@ -106,7 +106,7 @@ export async function ingest(sources: string[], opts: IngestOpts) { log.dim("(dry run — no files were written)"); } else { log.blank(); - log.dim("run kib compile to update the wiki"); + log.dim("search and query ready — run kib compile to enrich into wiki articles"); } } else if (skipped > 0) { log.dim(`All ${skipped} source${skipped === 1 ? "" : "s"} already ingested`); diff --git a/packages/cli/src/commands/query.ts b/packages/cli/src/commands/query.ts index 6b96136..3554dd6 100644 --- a/packages/cli/src/commands/query.ts +++ b/packages/cli/src/commands/query.ts @@ -1,3 +1,4 @@ +import { resolve } from "node:path"; import type { LLMProvider } from "@kibhq/core"; import { createProvider, @@ -14,6 +15,7 @@ import { createSpinner } from "../ui/spinner.js"; interface QueryOpts { file?: boolean; sources?: boolean; + source?: string; json?: boolean; } @@ -47,13 +49,19 @@ export async function query(question: string, opts: QueryOpts) { const { queryVault } = await import("@kibhq/core"); + // Resolve --source path to absolute + const sourcePath = opts.source ? resolve(opts.source) : undefined; + debug(`vault root: ${root}`); debug(`provider: ${config.provider.default}, model: ${config.provider.model}`); debug(`question: "${question}"`); + if (sourcePath) debug(`source: ${sourcePath}`); - log.header("querying knowledge base"); + log.header(sourcePath ? "querying source" : "querying knowledge base"); - const spinner = createSpinner("Searching and generating answer..."); + const spinner = createSpinner( + sourcePath ? "Reading source and generating answer..." : "Searching and generating answer...", + ); spinner.start(); const endQuery = debugTime("queryVault"); @@ -62,6 +70,7 @@ export async function query(question: string, opts: QueryOpts) { const result = await queryVault(root, question, provider, { autoFile, autoFileThreshold: config.query.auto_file_threshold, + source: sourcePath, }); endQuery(); debug(`sources used: ${result.sourcePaths.length}`); diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts index 3590019..d4510f9 100644 --- a/packages/cli/src/index.ts +++ b/packages/cli/src/index.ts @@ -96,6 +96,7 @@ program .option("--file", "auto-file to wiki/outputs/") .option("--no-file", "never file") .option("--sources", "show which articles were used") + .option("--source ", "query a specific source file directly") .option("--json", "JSON output") .action(async (question, opts) => { const { query } = await import("./commands/query.js"); diff --git a/packages/cli/src/mcp/server.ts b/packages/cli/src/mcp/server.ts index ea3db96..ce7e448 100644 --- a/packages/cli/src/mcp/server.ts +++ b/packages/cli/src/mcp/server.ts @@ -1,3 +1,4 @@ +import { join } from "node:path"; import { compileVault, createProvider, @@ -247,7 +248,7 @@ export function createMcpServer(root: string) { server.tool( "kib_query", - "Ask a question against the knowledge base using RAG (retrieval-augmented generation). Requires a configured LLM provider.", + "Ask a question against the knowledge base using RAG (retrieval-augmented generation). Searches both raw sources and compiled wiki articles, so it works immediately after ingest — no compile needed. Requires a configured LLM provider.", { question: z.string().describe("Question to ask"), max_articles: z @@ -256,16 +257,24 @@ export function createMcpServer(root: string) { .positive() .max(10) .default(5) - .describe("Max articles to use as context"), + .describe("Max sources/articles to use as context"), + source: z + .string() + .optional() + .describe( + "Path to a specific source to query against (e.g. 'raw/articles/my-source.md'). Skips search and uses only this source as context.", + ), }, - async ({ question, max_articles }) => { + async ({ question, max_articles, source }) => { try { const provider = await ctx.getProvider(); const config = await ctx.getConfig(); + const sourcePath = source ? join(root, source) : undefined; const result = await queryVault(root, question, provider, { maxArticles: max_articles, autoFile: config.query.auto_file, autoFileThreshold: config.query.auto_file_threshold, + source: sourcePath, }); const filed = result.filedTo ? `\nFiled to: ${result.filedTo}` : ""; return ok( @@ -281,7 +290,7 @@ export function createMcpServer(root: string) { server.tool( "kib_ingest", - "Ingest a source (URL or file path) into the knowledge base. No API key needed for ingestion. Auto-compiles after ingest if an LLM provider is configured; otherwise sources are saved but not compiled.", + "Ingest a source (URL or file path) into the knowledge base. No API key needed for ingestion. Sources are immediately searchable and queryable after ingest. Auto-compiles into wiki articles if an LLM provider is configured.", { source: z.string().describe("URL or file path to ingest"), category: z @@ -332,9 +341,10 @@ export function createMcpServer(root: string) { title: result.title, wordCount: result.wordCount, skipped: result.skipped, + searchable: true, compiled: null, compileError: isProviderErr - ? "No LLM provider configured. The source was saved but not compiled. Tell the user to set ANTHROPIC_API_KEY, OPENAI_API_KEY, or start Ollama, then run `kib compile`." + ? "No LLM provider configured. Source is searchable and queryable immediately. To compile into wiki articles, set ANTHROPIC_API_KEY, OPENAI_API_KEY, or start Ollama, then run `kib compile`." : msg, }); } @@ -347,6 +357,7 @@ export function createMcpServer(root: string) { wordCount: result.wordCount, skipped: result.skipped, skipReason: result.skipReason, + searchable: !result.skipped, compiled: compiled ? { articlesCreated: compiled.articlesCreated, diff --git a/packages/core/src/ingest/ingest.test.ts b/packages/core/src/ingest/ingest.test.ts index 3ef7e2b..e096c95 100644 --- a/packages/core/src/ingest/ingest.test.ts +++ b/packages/core/src/ingest/ingest.test.ts @@ -3,6 +3,8 @@ import { existsSync } from "node:fs"; import { mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { SearchIndex } from "../search/engine.js"; +import { VectorIndex } from "../search/vector.js"; import type { LLMProvider } from "../types.js"; import { initVault, listImageAssets, loadManifest } from "../vault.js"; import { ingestSource } from "./ingest.js"; @@ -228,6 +230,133 @@ describe("ingestSource", () => { expect(rawContent).toContain("word_count:"); }); + test("ingest updates search index so source is immediately searchable", async () => { + const root = await makeTempVault(); + + const testFile = join(root, "quantum-computing.md"); + await writeFile( + testFile, + "# Quantum Computing\n\nQuantum computers use qubits and superposition to solve problems.", + ); + + await ingestSource(root, testFile); + + // Load the search index and verify the source is searchable + const index = new SearchIndex(); + const loaded = await index.load(root); + expect(loaded).toBe(true); + expect(index.documentCount).toBeGreaterThan(0); + + const results = index.search("quantum computing"); + expect(results.length).toBeGreaterThan(0); + expect(results[0]!.title).toBe("Quantum Computing"); + }); + + test("multiple ingests build up the search index incrementally", async () => { + const root = await makeTempVault(); + + const file1 = join(root, "first.md"); + const file2 = join(root, "second.md"); + await writeFile(file1, "# Machine Learning\n\nML uses statistical models to learn from data."); + await writeFile( + file2, + "# Deep Learning\n\nDeep learning uses neural networks with many layers.", + ); + + await ingestSource(root, file1); + await ingestSource(root, file2); + + const index = new SearchIndex(); + const loaded = await index.load(root); + expect(loaded).toBe(true); + expect(index.documentCount).toBe(2); + + // Both should be searchable + const mlResults = index.search("machine learning"); + expect(mlResults.length).toBeGreaterThan(0); + + const dlResults = index.search("deep learning neural"); + expect(dlResults.length).toBeGreaterThan(0); + expect(dlResults[0]!.title).toBe("Deep Learning"); + }); + + test("ingest updates vector index when provider has embed()", async () => { + const root = await makeTempVault(); + + // Mock provider with embed support + const embedProvider: LLMProvider = { + name: "mock-embed", + async complete() { + return { + content: "", + usage: { inputTokens: 0, outputTokens: 0 }, + stopReason: "end_turn" as const, + }; + }, + async *stream() {}, + async embed(texts: string[]): Promise { + return texts.map((text) => { + const vec = new Float32Array(32); + const lower = text.toLowerCase(); + for (let i = 0; i < 32; i++) { + const char = String.fromCharCode(97 + (i % 26)); + vec[i] = (lower.match(new RegExp(char, "g")) ?? []).length / lower.length; + } + return vec; + }); + }, + }; + + const testFile = join(root, "quantum-ml.md"); + await writeFile( + testFile, + "# Quantum Machine Learning\n\nQuantum computing applied to machine learning tasks.", + ); + + await ingestSource(root, testFile, { provider: embedProvider }); + + // Verify the vector index was updated + const vectorIndex = new VectorIndex(); + const loaded = await vectorIndex.load(root); + expect(loaded).toBe(true); + expect(vectorIndex.documentCount).toBe(1); + + const results = await vectorIndex.search("quantum machine learning", embedProvider); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("Quantum Machine Learning"); + }); + + test("ingest skips vector index when provider lacks embed()", async () => { + const root = await makeTempVault(); + + // Provider without embed + const noEmbedProvider: LLMProvider = { + name: "no-embed", + async complete() { + return { + content: "", + usage: { inputTokens: 0, outputTokens: 0 }, + stopReason: "end_turn" as const, + }; + }, + async *stream() {}, + }; + + const testFile = join(root, "test-no-embed.md"); + await writeFile(testFile, "# No Embed\n\nContent without embeddings."); + + await ingestSource(root, testFile, { provider: noEmbedProvider }); + + // BM25 index should exist + const bm25 = new SearchIndex(); + expect(await bm25.load(root)).toBe(true); + expect(bm25.documentCount).toBe(1); + + // Vector index should NOT exist (no embed support) + const vectorIndex = new VectorIndex(); + expect(await vectorIndex.load(root)).toBe(false); + }); + test("image ingest saves binary to wiki/images/", async () => { const root = await makeTempVault(); diff --git a/packages/core/src/ingest/ingest.ts b/packages/core/src/ingest/ingest.ts index a67e03d..5b38986 100644 --- a/packages/core/src/ingest/ingest.ts +++ b/packages/core/src/ingest/ingest.ts @@ -1,5 +1,9 @@ +import { join } from "node:path"; +import { RAW_DIR } from "../constants.js"; import { hash } from "../hash.js"; import { withLock } from "../lockfile.js"; +import { SearchIndex } from "../search/engine.js"; +import { VectorIndex } from "../search/vector.js"; import type { IngestResult, LLMProvider, Manifest, SourceEntry, SourceType } from "../types.js"; import { appendLog, loadManifest, saveManifest, writeImageAsset, writeRaw } from "../vault.js"; import type { Extractor } from "./extractors/interface.js"; @@ -154,6 +158,38 @@ export async function ingestSource( await saveManifest(root, manifest); await appendLog(root, "ingest", `"${extracted.title}" (${sourceType}) → raw/${relativePath}`); + // Incrementally update search indexes so the source is immediately searchable + const docPath = join(root, RAW_DIR, relativePath); + try { + const index = new SearchIndex(); + await index.load(root); + index.addDocument({ + path: docPath, + title: extracted.title, + content: extracted.content, + tags: options.tags, + date: now, + }); + await index.save(root); + } catch { + // BM25 index update is best-effort — don't fail the ingest + } + + // Incrementally update vector index if a provider with embed() is available + if (options.provider?.embed) { + try { + const vectorIndex = new VectorIndex(); + await vectorIndex.load(root); + await vectorIndex.addDocument( + { path: docPath, title: extracted.title, content: extracted.content }, + options.provider, + ); + await vectorIndex.save(root); + } catch { + // Vector index update is best-effort + } + } + return { sourceId, path: `raw/${relativePath}`, diff --git a/packages/core/src/query/query.test.ts b/packages/core/src/query/query.test.ts index 3847dad..88f17b0 100644 --- a/packages/core/src/query/query.test.ts +++ b/packages/core/src/query/query.test.ts @@ -10,7 +10,7 @@ import type { Message, StreamChunk, } from "../types.js"; -import { initVault, writeWiki } from "../vault.js"; +import { initVault, writeRaw, writeWiki } from "../vault.js"; import { queryVault } from "./query.js"; let tempDir: string; @@ -168,4 +168,74 @@ describe("queryVault", () => { expect(receivedMessages[1]!.content).toBe("previous answer"); expect(receivedMessages[2]!.content).toContain("follow up question"); }); + + test("queries raw sources when no wiki articles exist", async () => { + const root = await makeTempVault(); + + // Write a raw source (no wiki articles) + await writeRaw( + root, + "articles/quantum.md", + "---\ntitle: Quantum Computing\nslug: quantum-computing\nsource_type: file\n---\n\n# Quantum Computing\n\nQuantum computers use qubits.", + ); + + // Build search index over all (raw + wiki) + const index = new SearchIndex(); + await index.build(root, "all"); + await index.save(root); + + const provider = mockProvider( + "Quantum computers use qubits for computation [Quantum Computing].", + ); + + const result = await queryVault(root, "How do quantum computers work?", provider); + + expect(result.answer).toContain("qubits"); + expect(result.sourcePaths.length).toBeGreaterThan(0); + expect(result.sourcePaths[0]).toContain("raw/"); + }); + + test("single-source query loads specific file directly", async () => { + const root = await makeTempVault(); + + // Write two raw sources + await writeRaw( + root, + "articles/ml.md", + "---\ntitle: Machine Learning\nslug: machine-learning\n---\n\n# Machine Learning\n\nML uses statistical models.", + ); + await writeRaw( + root, + "articles/dl.md", + "---\ntitle: Deep Learning\nslug: deep-learning\n---\n\n# Deep Learning\n\nDeep learning uses neural networks.", + ); + + // Track what context gets sent to the LLM + let receivedContent = ""; + const provider: LLMProvider = { + name: "mock", + async complete(params: CompletionParams): Promise { + receivedContent = params.messages[params.messages.length - 1]!.content; + return { + content: "Deep learning uses neural networks.", + usage: { inputTokens: 100, outputTokens: 50 }, + stopReason: "end_turn", + }; + }, + async *stream(): AsyncIterable { + yield { type: "text", text: "stream" }; + }, + }; + + const sourcePath = join(root, "raw/articles/dl.md"); + const result = await queryVault(root, "What is deep learning?", provider, { + source: sourcePath, + }); + + // Should only include the specified source, not ML + expect(receivedContent).toContain("Deep Learning"); + expect(receivedContent).not.toContain("Machine Learning"); + expect(result.sourcePaths).toHaveLength(1); + expect(result.sourcePaths[0]).toContain("dl.md"); + }); }); diff --git a/packages/core/src/query/query.ts b/packages/core/src/query/query.ts index 760630f..6fbba12 100644 --- a/packages/core/src/query/query.ts +++ b/packages/core/src/query/query.ts @@ -18,6 +18,10 @@ export interface QueryOptions { autoFile?: boolean; /** Minimum sources cited to trigger auto-file (default 3) */ autoFileThreshold?: number; + /** Search scope: "wiki" for compiled articles only, "raw" for sources only, "all" for both (default: "all") */ + scope?: "wiki" | "raw" | "all"; + /** Path to a specific source file to query (skips search, loads this file directly) */ + source?: string; } export interface QueryResult { @@ -28,12 +32,12 @@ export interface QueryResult { filedTo?: string; } -const QUERY_SYSTEM_PROMPT = `You are a knowledge assistant for a personal wiki. Answer questions using ONLY the information provided in the articles below. +const QUERY_SYSTEM_PROMPT = `You are a knowledge assistant for a personal knowledge base. Answer questions using ONLY the information provided in the sources below. RULES: -- Base your answer strictly on the provided articles -- Cite sources using [Article Title] notation when referencing specific information -- If the answer is not in the provided articles, say so clearly +- Base your answer strictly on the provided sources +- Cite sources using [Source Title] notation when referencing specific information +- If the answer is not in the provided sources, say so clearly - Be concise and direct - Use markdown formatting for readability`; @@ -51,64 +55,83 @@ export async function queryVault( options: QueryOptions = {}, ): Promise { const maxArticles = options.maxArticles ?? 5; + const scope = options.scope ?? "all"; - // Determine search engine from vault config - let searchEngine: "builtin" | "vector" | "hybrid" = "builtin"; - try { - const config = await loadConfig(root); - searchEngine = config.search.engine; - } catch { - // Default to builtin - } - - // Search for relevant articles - let searchResults: SearchResult[]; - - if (searchEngine === "hybrid" || searchEngine === "vector") { - const bm25 = new SearchIndex(); - const vector = new VectorIndex(); - const hybrid = new HybridSearch(bm25, vector); - const loaded = await hybrid.load(root); - if (!loaded.bm25) { - await hybrid.build(root, provider, "wiki"); - await hybrid.save(root); - } - searchResults = await hybrid.search(question, provider, { limit: maxArticles }); - } else { - const index = new SearchIndex(); - const loaded = await index.load(root); - if (!loaded) { - await index.build(root, "wiki"); - } - searchResults = index.search(question, { limit: maxArticles }); - } - - // Load the full articles + // Load the full source/article documents const articles: { title: string; path: string; content: string }[] = []; - for (const result of searchResults) { + if (options.source) { + // Single-source query: skip search, load the specified file directly try { - const content = await readFile(result.path, "utf-8"); + const content = await readFile(options.source, "utf-8"); const { frontmatter, body } = parseFrontmatter(content); articles.push({ - title: (frontmatter.title as string) ?? result.title ?? result.path, - path: result.path, + title: + (frontmatter.title as string) ?? + options.source.split("/").pop()?.replace(/\.md$/, "") ?? + options.source, + path: options.source, content: body, }); } catch { - // File might have been deleted + // File not found — fall through to empty context handling + } + } else { + // Determine search engine from vault config + let searchEngine: "builtin" | "vector" | "hybrid" = "builtin"; + try { + const config = await loadConfig(root); + searchEngine = config.search.engine; + } catch { + // Default to builtin } - } - // If no articles found, try using INDEX.md as fallback context - if (articles.length === 0) { - const indexContent = await readIndex(root); - if (indexContent) { - articles.push({ - title: "Knowledge Base Index", - path: "wiki/INDEX.md", - content: indexContent, - }); + // Search for relevant sources/articles + let searchResults: SearchResult[]; + + if (searchEngine === "hybrid" || searchEngine === "vector") { + const bm25 = new SearchIndex(); + const vector = new VectorIndex(); + const hybrid = new HybridSearch(bm25, vector); + const loaded = await hybrid.load(root); + if (!loaded.bm25) { + await hybrid.build(root, provider, scope); + await hybrid.save(root); + } + searchResults = await hybrid.search(question, provider, { limit: maxArticles }); + } else { + const index = new SearchIndex(); + const loaded = await index.load(root); + if (!loaded) { + await index.build(root, scope); + } + searchResults = index.search(question, { limit: maxArticles }); + } + + for (const result of searchResults) { + try { + const content = await readFile(result.path, "utf-8"); + const { frontmatter, body } = parseFrontmatter(content); + articles.push({ + title: (frontmatter.title as string) ?? result.title ?? result.path, + path: result.path, + content: body, + }); + } catch { + // File might have been deleted + } + } + + // If no articles found, try using INDEX.md as fallback context + if (articles.length === 0) { + const indexContent = await readIndex(root); + if (indexContent) { + articles.push({ + title: "Knowledge Base Index", + path: "wiki/INDEX.md", + content: indexContent, + }); + } } } diff --git a/packages/core/src/search/engine.test.ts b/packages/core/src/search/engine.test.ts index 5308608..da9deab 100644 --- a/packages/core/src/search/engine.test.ts +++ b/packages/core/src/search/engine.test.ts @@ -229,6 +229,127 @@ describe("SearchIndex", () => { }); }); +// ─── Incremental Indexing (addDocument) ───────────────────────── + +describe("addDocument", () => { + test("adds a new document to an empty index", () => { + const index = new SearchIndex(); + index.addDocument({ + path: "/tmp/test/raw/articles/test.md", + title: "Machine Learning", + content: "Machine learning is a subset of artificial intelligence.", + }); + + expect(index.documentCount).toBe(1); + const results = index.search("machine learning"); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("Machine Learning"); + }); + + test("adds to an existing index built from files", async () => { + const root = await makeTempVault(); + await writeWiki( + root, + "concepts/transformers.md", + articleMd("Transformer Architecture", "The transformer is a neural network architecture."), + ); + + const index = new SearchIndex(); + await index.build(root, "wiki"); + expect(index.documentCount).toBe(1); + + // Add a new document incrementally + index.addDocument({ + path: join(root, "raw/articles/attention.md"), + title: "Attention Mechanisms", + content: "Attention mechanisms compute weighted sums over value vectors.", + }); + + expect(index.documentCount).toBe(2); + + // Both documents should be searchable + const transformerResults = index.search("transformer"); + expect(transformerResults.length).toBeGreaterThan(0); + expect(transformerResults[0]!.title).toBe("Transformer Architecture"); + + const attentionResults = index.search("attention"); + expect(attentionResults.length).toBeGreaterThan(0); + expect(attentionResults[0]!.title).toBe("Attention Mechanisms"); + }); + + test("replaces existing document with same path", () => { + const index = new SearchIndex(); + const path = "/tmp/test/raw/articles/test.md"; + + index.addDocument({ + path, + title: "Old Title", + content: "Old content about quantum physics.", + }); + expect(index.documentCount).toBe(1); + + // Re-add same path with different content + index.addDocument({ + path, + title: "New Title", + content: "New content about machine learning.", + }); + + expect(index.documentCount).toBe(1); + const results = index.search("machine learning"); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("New Title"); + + // Old content should not be found + const oldResults = index.search("quantum physics"); + expect(oldResults).toHaveLength(0); + }); + + test("preserves tags and date", () => { + const index = new SearchIndex(); + index.addDocument({ + path: "/tmp/test/raw/articles/test.md", + title: "Tagged Doc", + content: "Some content about testing.", + tags: ["ml", "nlp"], + date: "2025-06-01", + }); + + // Tag filter should work + const tagResults = index.search("testing", { tag: "ml" }); + expect(tagResults.length).toBe(1); + + // Date filter should work + const dateResults = index.search("testing", { since: "2025-01-01" }); + expect(dateResults.length).toBe(1); + + // Non-matching tag should filter out + const noResults = index.search("testing", { tag: "vision" }); + expect(noResults).toHaveLength(0); + }); + + test("save and load round-trip after addDocument", async () => { + const root = await makeTempVault(); + + const index1 = new SearchIndex(); + index1.addDocument({ + path: join(root, "raw/articles/test.md"), + title: "Incremental Doc", + content: "This document was added incrementally without a full build.", + }); + await index1.save(root); + + const index2 = new SearchIndex(); + const loaded = await index2.load(root); + expect(loaded).toBe(true); + expect(index2.documentCount).toBe(1); + + const results = index2.search("incrementally"); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("Incremental Doc"); + }); +}); + // ─── Fuzzy Matching ───────────────────────────────────────────── describe("editDistance1", () => { diff --git a/packages/core/src/search/engine.ts b/packages/core/src/search/engine.ts index 544e81f..b9d8b55 100644 --- a/packages/core/src/search/engine.ts +++ b/packages/core/src/search/engine.ts @@ -459,6 +459,73 @@ export class SearchIndex { } } + /** + * Add a single document to the index incrementally (no full rebuild needed). + * Recomputes IDF after insertion. Call save() to persist. + */ + addDocument(opts: { + path: string; + title: string; + content: string; + tags?: string[]; + date?: string | null; + }): void { + // Remove existing document with same path (re-ingest of same source) + this.documents = this.documents.filter((d) => d.path !== opts.path); + + const tokens = tokenize(`${opts.title} ${opts.title} ${opts.content}`); + const termFreqs = new Map(); + for (const token of tokens) { + termFreqs.set(token, (termFreqs.get(token) ?? 0) + 1); + } + + const tags = opts.tags?.map((t) => t.toLowerCase()) ?? []; + const date = + opts.date && !Number.isNaN(Date.parse(String(opts.date))) ? String(opts.date) : null; + + this.documents.push({ + path: opts.path, + title: opts.title, + content: opts.content, + tokens, + tokenCount: tokens.length, + termFreqs, + tags, + date, + }); + + // Recompute IDF with the updated document set + this.recomputeIdf(); + } + + /** + * Recompute IDF values and average document length from current documents. + */ + private recomputeIdf(): void { + this.idf.clear(); + const N = this.documents.length; + const docFreq = new Map(); + + for (const doc of this.documents) { + const seen = new Set(); + for (const token of doc.tokens) { + if (!seen.has(token)) { + docFreq.set(token, (docFreq.get(token) ?? 0) + 1); + seen.add(token); + } + } + } + + for (const [term, df] of docFreq) { + this.idf.set(term, Math.log((N - df + 0.5) / (df + 0.5) + 1)); + } + + this.avgDl = + this.documents.length > 0 + ? this.documents.reduce((sum, d) => sum + d.tokenCount, 0) / this.documents.length + : 0; + } + get documentCount(): number { return this.documents.length; } diff --git a/packages/core/src/search/vector.test.ts b/packages/core/src/search/vector.test.ts index 7f672b6..5dcb900 100644 --- a/packages/core/src/search/vector.test.ts +++ b/packages/core/src/search/vector.test.ts @@ -293,6 +293,103 @@ describe("VectorIndex", () => { expect(result.total).toBe(1); }); + test("addDocument adds to an empty index", async () => { + const root = await makeTempVault(); + const provider = createMockEmbedProvider(); + + const index = new VectorIndex(); + await index.addDocument( + { + path: join(root, "raw/articles/ml.md"), + title: "Machine Learning", + content: "Machine learning uses neural networks for deep learning tasks.", + }, + provider, + ); + + expect(index.documentCount).toBe(1); + const results = await index.search("machine learning neural", provider); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("Machine Learning"); + }); + + test("addDocument adds to an existing built index", async () => { + const root = await makeTempVault(); + const provider = createMockEmbedProvider(); + + await writeWiki( + root, + "concepts/transformers.md", + articleMd("Transformer Architecture", "The transformer uses self-attention mechanisms."), + ); + + const index = new VectorIndex(); + await index.build(root, provider, "wiki"); + expect(index.documentCount).toBe(1); + + await index.addDocument( + { + path: join(root, "raw/articles/attention.md"), + title: "Attention Mechanisms", + content: "Attention mechanisms compute weighted sums in neural network layers.", + }, + provider, + ); + + expect(index.documentCount).toBe(2); + + const results = await index.search("attention neural", provider); + expect(results.length).toBe(2); + expect(results[0]!.title).toBe("Attention Mechanisms"); + }); + + test("addDocument replaces existing document with same path", async () => { + const provider = createMockEmbedProvider(); + const path = "/tmp/test/raw/articles/test.md"; + + const index = new VectorIndex(); + await index.addDocument( + { path, title: "Old", content: "Old content about quantum physics." }, + provider, + ); + expect(index.documentCount).toBe(1); + + await index.addDocument( + { path, title: "New", content: "New content about neural network deep learning." }, + provider, + ); + expect(index.documentCount).toBe(1); + + const results = await index.search("neural network", provider); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("New"); + }); + + test("addDocument save/load round-trip", async () => { + const root = await makeTempVault(); + const provider = createMockEmbedProvider(); + + const index1 = new VectorIndex(); + await index1.addDocument( + { + path: join(root, "raw/articles/test.md"), + title: "Incremental Doc", + content: "Neural network deep learning transformer architecture.", + }, + provider, + ); + await index1.save(root); + + const index2 = new VectorIndex(); + const loaded = await index2.load(root); + expect(loaded).toBe(true); + expect(index2.documentCount).toBe(1); + + const results = await index2.search("neural transformer", provider); + expect(results.length).toBe(1); + expect(results[0]!.title).toBe("Incremental Doc"); + }); + test("throws when provider lacks embed", async () => { const root = await makeTempVault(); const provider: LLMProvider = { diff --git a/packages/core/src/search/vector.ts b/packages/core/src/search/vector.ts index 95ef1de..b250d18 100644 --- a/packages/core/src/search/vector.ts +++ b/packages/core/src/search/vector.ts @@ -310,6 +310,41 @@ export class VectorIndex { } } + /** + * Add a single document to the vector index incrementally. + * Requires a provider with embed() support. Call save() to persist. + */ + async addDocument( + opts: { path: string; title: string; content: string }, + provider: LLMProvider, + ): Promise { + if (!provider.embed) { + throw new Error(`Provider "${provider.name}" does not support embeddings`); + } + + // Remove existing document with same path (re-ingest) + this.documents = this.documents.filter((d) => d.path !== opts.path); + + const contentHash = await hash(opts.content); + const text = `${opts.title}\n\n${opts.content}`; + const chunks = chunkText(text); + const [embedding] = await provider.embed([chunks[0]!]); + if (!embedding) return; + normalize(embedding); + + if (this.dimensions === 0) { + this.dimensions = embedding.length; + } + + this.documents.push({ + path: opts.path, + title: opts.title, + snippet: opts.content.slice(0, 200), + hash: contentHash, + embedding, + }); + } + get documentCount(): number { return this.documents.length; } diff --git a/packages/core/src/vault.ts b/packages/core/src/vault.ts index 5802cd2..9b856ce 100644 --- a/packages/core/src/vault.ts +++ b/packages/core/src/vault.ts @@ -157,20 +157,20 @@ kib ingests sources (URLs, PDFs, YouTube, GitHub repos, files, images) and compi **Work immediately (no API key needed):** - \`kib_status\` — vault state, provider status, and setup instructions -- \`kib_search\` — full-text BM25 search across all articles +- \`kib_search\` — full-text search across all sources and articles (works immediately after ingest) - \`kib_list\` — list wiki articles or raw sources - \`kib_read\` — read a specific article or source -- \`kib_ingest\` — ingest URLs, files, PDFs, YouTube, repos, images (saves to raw/) +- \`kib_ingest\` — ingest URLs, files, PDFs, YouTube, repos, images (immediately searchable) - \`kib_export\` — export wiki as markdown or HTML - \`kib_lint\` — health checks on the wiki - \`kib_config\` — get/set vault configuration **Require an LLM API key:** -- \`kib_compile\` — compile raw sources into wiki articles via LLM -- \`kib_query\` — ask questions with RAG (retrieval-augmented generation) +- \`kib_compile\` — compile raw sources into wiki articles via LLM (optional enrichment) +- \`kib_query\` — ask questions with RAG — works over raw sources and wiki articles, use \`source\` param to query a specific file - \`kib_skill\` — run skills (summarize, flashcards, connections, etc.) -Note: \`kib_ingest\` auto-compiles after ingesting if a provider is configured. Without a key, sources are saved but not compiled. +Note: Sources are searchable and queryable immediately after ingest — no compile needed. \`kib_ingest\` auto-compiles if a provider is configured, creating enriched wiki articles. ${apiKeySection} ## Vault Layout