diff --git a/ROADMAP.md b/ROADMAP.md
index ddd2b5f..b3b544a 100644
--- a/ROADMAP.md
+++ b/ROADMAP.md
@@ -116,18 +116,18 @@ What's built, what's next, and what's deferred.
## v0.7.0 — Advanced Search
### Vector/Semantic Search
-- [ ] Optional embedding-based search alongside BM25
-- [ ] Embedding provider: OpenAI `text-embedding-3-small`, or local via Ollama
-- [ ] Hybrid scoring: combine BM25 + cosine similarity
-- [ ] Store embeddings in `.kb/cache/embeddings.bin`
-- [ ] Rebuild embeddings on compile
+- [x] Optional embedding-based search alongside BM25
+- [x] Embedding provider: OpenAI `text-embedding-3-small`, or local via Ollama
+- [x] Hybrid scoring: combine BM25 + cosine similarity (Reciprocal Rank Fusion)
+- [x] Store embeddings in `.kb/cache/vectors.idx` (binary Float32Array format)
+- [x] Rebuild embeddings on compile
### Search Improvements
-- [ ] Fuzzy matching for typo tolerance
-- [ ] Phrase search with quotes: `kib search '"attention mechanism"'`
-- [ ] Tag-based filtering: `kib search "transformers" --tag deep-learning`
-- [ ] Date range filtering: `kib search --since 2024-01-01`
-- [ ] Search result highlighting in terminal (bold matched terms)
+- [x] Fuzzy matching for typo tolerance (edit distance ≤ 1 for tokens ≥ 4 chars)
+- [x] Phrase search with quotes: `kib search '"attention mechanism"'`
+- [x] Tag-based filtering: `kib search "transformers" --tag deep-learning`
+- [x] Date range filtering: `kib search --since 2024-01-01`
+- [x] Search result highlighting in terminal (bold matched terms)
---
diff --git a/bun.lock b/bun.lock
index ed78507..c89da5b 100644
--- a/bun.lock
+++ b/bun.lock
@@ -1,5 +1,6 @@
{
"lockfileVersion": 1,
+ "configVersion": 0,
"workspaces": {
"": {
"name": "kib-monorepo",
diff --git a/packages/cli/src/commands/search.ts b/packages/cli/src/commands/search.ts
index 7f6665e..fcd0c0a 100644
--- a/packages/cli/src/commands/search.ts
+++ b/packages/cli/src/commands/search.ts
@@ -10,6 +10,8 @@ interface SearchOpts {
limit?: number;
json?: boolean;
engine?: "builtin" | "vector" | "hybrid";
+ tag?: string[];
+ since?: string;
}
export async function search(term: string, opts: SearchOpts) {
@@ -30,6 +32,8 @@ export async function search(term: string, opts: SearchOpts) {
const scope = opts.wiki ? "wiki" : opts.raw ? "raw" : "all";
const limit = opts.limit ?? 20;
+ // `--tag` is registered with a default of `[]`, and an empty array is truthy.
+ // Collapse an empty list to `undefined` so "no tag filter" can be detected
+ // by the truthiness checks further down (debug output, hybrid post-filter).
+ const tags = opts.tag?.length ? opts.tag : undefined;
+ const since = opts.since ?? undefined;
// Determine search engine
let engine = opts.engine;
@@ -44,6 +48,8 @@ export async function search(term: string, opts: SearchOpts) {
debug(`vault root: ${root}`);
debug(`scope: ${scope}, limit: ${limit}, engine: ${engine}, term: "${term}"`);
+ if (tags) debug(`tag filter: ${tags.join(", ")}`);
+ if (since) debug(`since filter: ${since}`);
const spinner = createSpinner("Searching...");
spinner.start();
@@ -51,6 +57,8 @@ export async function search(term: string, opts: SearchOpts) {
let results: SearchResult[];
let elapsed: number;
+ const searchOpts = { limit, tag: tags, since, highlight: !opts.json };
+
if (engine === "hybrid" || engine === "vector") {
const endIndex = debugTime("load/build hybrid index");
const bm25 = new SearchIndex();
@@ -83,6 +91,15 @@ export async function search(term: string, opts: SearchOpts) {
const start = performance.now();
results = await hybrid.search(term, provider, { limit });
elapsed = Math.round(performance.now() - start);
+
+ // Apply tag/date filters post-hoc for hybrid: the fused ranking is produced
+ // without filters, so re-run BM25 with the filters and keep only the fused
+ // hits whose paths survive.
+ // Guard on a non-empty tag list: `[]` is truthy, and running the intersection
+ // for an unfiltered search would drop every hit BM25 did not also return.
+ if (tags?.length || since) {
+   const filteredBm25 = bm25.search(term, searchOpts);
+   const filteredPaths = new Set(filteredBm25.map((r) => r.path));
+   results = results.filter((r) => filteredPaths.has(r.path));
+ }
} else {
endIndex();
// Fallback path
@@ -94,7 +111,7 @@ export async function search(term: string, opts: SearchOpts) {
await index.save(root);
}
const start = performance.now();
- results = index.search(term, { limit });
+ results = index.search(term, searchOpts);
elapsed = Math.round(performance.now() - start);
}
} else {
@@ -112,7 +129,7 @@ export async function search(term: string, opts: SearchOpts) {
endIndex();
const start = performance.now();
- results = index.search(term, { limit });
+ results = index.search(term, searchOpts);
elapsed = Math.round(performance.now() - start);
}
@@ -143,14 +160,13 @@ export async function search(term: string, opts: SearchOpts) {
console.log(` ${num}. ${title} ${score}`);
console.log(` ${dimPath(r.path)}`);
if (r.snippet) {
- console.log(` ${truncate(r.snippet, 80)}`);
+ console.log(` ${truncate(r.snippet, 120)}`);
}
console.log();
}
}
function dimPath(path: string): string {
- // Import chalk dynamically to keep lazy loading
return `\x1b[2m${path}\x1b[0m`;
}
diff --git a/packages/cli/src/index.ts b/packages/cli/src/index.ts
index 0df3a58..0390dd2 100644
--- a/packages/cli/src/index.ts
+++ b/packages/cli/src/index.ts
@@ -76,6 +76,13 @@ program
.option("--limit ", "max results", Number.parseInt)
.option("--json", "JSON output")
.option("--engine ", "search engine: builtin, vector, hybrid")
+ .option(
+ "--tag ",
+ "filter by tag (repeatable)",
+ (val: string, prev: string[]) => [...prev, val],
+ [] as string[],
+ )
+ .option("--since ", "filter to articles dated on or after (YYYY-MM-DD)")
.action(async (term, opts) => {
const { search } = await import("./commands/search.js");
await search(term, opts);
diff --git a/packages/cli/src/mcp/export-helper.ts b/packages/cli/src/mcp/export-helper.ts
new file mode 100644
index 0000000..eb636e1
--- /dev/null
+++ b/packages/cli/src/mcp/export-helper.ts
@@ -0,0 +1,178 @@
+import { copyFile, mkdir, readFile, writeFile } from "node:fs/promises";
+import { join, relative } from "node:path";
+import { listImageAssets, listWiki, parseFrontmatter, WIKI_DIR } from "@kibhq/core";
+
+/**
+ * Export the vault's wiki in the requested format.
+ *
+ * @param root - Vault root directory.
+ * @param format - `"markdown"` or `"html"`.
+ * @param output - Destination directory; defaults to `export` inside `root`.
+ * @returns The format, the directory written to, and the number of files exported.
+ * @throws Error when `format` is neither supported value.
+ */
+export async function exportVault(
+  root: string,
+  format: "markdown" | "html",
+  output?: string,
+): Promise<{ format: string; output: string; files: number }> {
+  const target = output ?? join(root, "export");
+
+  if (format === "markdown") {
+    const written = await exportMarkdown(root, target);
+    return { format, output: target, files: written };
+  }
+
+  if (format === "html") {
+    const written = await exportHtml(root, target);
+    return { format, output: target, files: written };
+  }
+
+  // Only reachable when a caller bypasses the static type of `format`.
+  throw new Error(`Unsupported format: ${format}. Use 'markdown' or 'html'.`);
+}
+
+/**
+ * Write every wiki article to `outputDir` as plain markdown.
+ *
+ * Image assets are copied first. Each article keeps its path relative to the
+ * wiki directory, has a leading `---` … `---` frontmatter block removed, and
+ * has `[[slug]]` wikilinks rewritten to `[slug](slug.md)`.
+ *
+ * @param root - Vault root directory.
+ * @param outputDir - Destination directory for the exported files.
+ * @returns The number of wiki files processed.
+ */
+async function exportMarkdown(root: string, outputDir: string): Promise<number> {
+  const wikiDir = join(root, WIKI_DIR);
+  const files = await listWiki(root);
+
+  await copyImageAssets(root, outputDir);
+
+  for (const filePath of files) {
+    const content = await readFile(filePath, "utf-8");
+    const relPath = relative(wikiDir, filePath);
+    const outPath = join(outputDir, relPath);
+
+    // join(outPath, "..") resolves to the file's parent directory.
+    await mkdir(join(outPath, ".."), { recursive: true });
+
+    // Strip the first frontmatter block (non-greedy up to the closing `---`).
+    const cleaned = content.replace(/^---[\s\S]*?---\s*\n/, "");
+    const resolved = cleaned.replace(
+      /\[\[([^\]]+)\]\]/g,
+      (_, slug: string) => `[${slug}](${slug}.md)`,
+    );
+
+    await writeFile(outPath, resolved, "utf-8");
+  }
+
+  return files.length;
+}
+
+// Stylesheet text for exported pages: body typography/width, links, inline and
+// block code, the nav bar, and images.
+// NOTE(review): presumably inlined by exportHtml — its use is not visible in this view.
+const SHARED_CSS = `
+ body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", sans-serif; max-width: 720px; margin: 2rem auto; padding: 0 1rem; line-height: 1.6; color: #1a1a1a; }
+ a { color: #0066cc; }
+ code { background: #f4f4f4; padding: 0.2em 0.4em; border-radius: 3px; font-size: 0.9em; }
+ pre { background: #f4f4f4; padding: 1rem; border-radius: 6px; overflow-x: auto; }
+ pre code { background: none; padding: 0; }
+ nav { margin-bottom: 2rem; padding-bottom: 1rem; border-bottom: 1px solid #eee; }
+ nav a { margin-right: 1rem; }
+ img { max-width: 100%; height: auto; border-radius: 6px; margin: 1rem 0; }`;
+
+/**
+ * Export the wiki as a static HTML site under `outputDir`.
+ *
+ * Copies image assets, renders one `.html` page per wiki file (title taken
+ * from frontmatter, falling back to the relative path without `.md`), then
+ * writes an `index.html` listing the articles sorted by title.
+ * Returns the number of article pages written.
+ *
+ * NOTE(review): the HTML markup inside the two template literals is not
+ * visible in this view — verify the templates against the real file.
+ */
+async function exportHtml(root: string, outputDir: string): Promise {
+ const wikiDir = join(root, WIKI_DIR);
+ const files = await listWiki(root);
+
+ await mkdir(outputDir, { recursive: true });
+ // NOTE(review): imageFiles is not referenced again in the code visible here.
+ const imageFiles = await copyImageAssets(root, outputDir);
+
+ const articles: { title: string; htmlPath: string }[] = [];
+
+ for (const filePath of files) {
+ const content = await readFile(filePath, "utf-8");
+ const relPath = relative(wikiDir, filePath);
+ const { frontmatter, body } = parseFrontmatter(content);
+ const title = (frontmatter.title as string) ?? relPath.replace(/\.md$/, "");
+ const htmlPath = relPath.replace(/\.md$/, ".html");
+
+ // Directory depth of the page, used to build a "../" prefix back to the export root.
+ // NOTE(review): depth is counted by splitting on "/"; confirm relPath never uses "\" separators.
+ const depth = htmlPath.split("/").length - 1;
+ const prefix = depth > 0 ? "../".repeat(depth) : "";
+
+ const html = simpleMarkdownToHtml(body, prefix);
+
+ const page = `
+
+
+
+
+ ${escapeHtml(title)}
+
+
+
+
+ ${escapeHtml(title)}
+ ${html}
+
+`;
+
+ const outPath = join(outputDir, htmlPath);
+ await mkdir(join(outPath, ".."), { recursive: true });
+ await writeFile(outPath, page, "utf-8");
+
+ articles.push({ title, htmlPath });
+ }
+
+ // Index page; note that sort() below reorders `articles` in place.
+ const indexHtml = `
+
+
+
+
+ Knowledge Base
+
+
+
+ Knowledge Base
+ ${articles.length} articles
+
+ ${articles
+ .sort((a, b) => a.title.localeCompare(b.title))
+ .map((a) => `- ${escapeHtml(a.title)}
`)
+ .join("\n ")}
+
+
+`;
+
+ await writeFile(join(outputDir, "index.html"), indexHtml, "utf-8");
+
+ return articles.length;
+}
+
+/**
+ * Copy the vault's image assets into an `images` directory under `outputDir`.
+ *
+ * Files are read from the `images` directory inside the wiki directory. When
+ * there are no image assets nothing is created and an empty list is returned.
+ *
+ * @param root - Vault root directory.
+ * @param outputDir - Export destination directory.
+ * @returns The image filenames that were copied.
+ */
+async function copyImageAssets(root: string, outputDir: string): Promise<string[]> {
+  const imageFiles = await listImageAssets(root);
+  if (imageFiles.length === 0) return [];
+
+  const srcDir = join(root, WIKI_DIR, "images");
+  const destDir = join(outputDir, "images");
+  await mkdir(destDir, { recursive: true });
+
+  for (const filename of imageFiles) {
+    await copyFile(join(srcDir, filename), join(destDir, filename));
+  }
+
+  return imageFiles;
+}
+
+/**
+ * Minimal regex-based markdown conversion used for the HTML export.
+ *
+ * Replacements run in order: fenced code blocks, headings (### before ## before #),
+ * bold before italic, inline code, images under `images/`, http(s) images,
+ * `[[wikilinks]]`, markdown links, `- ` list items, then blank-line paragraph breaks.
+ *
+ * NOTE(review): the HTML in the replacement strings is not visible in this view
+ * (several literals appear truncated) — verify against the real file.
+ * NOTE(review): imagePrefix is presumably prepended to local image paths; the
+ * callback bodies that would use it are not visible here.
+ */
+function simpleMarkdownToHtml(md: string, imagePrefix = ""): string {
+ return md
+ .replace(/```(\w*)\n([\s\S]*?)```/g, "$2
")
+ .replace(/^### (.+)$/gm, "$1
")
+ .replace(/^## (.+)$/gm, "$1
")
+ .replace(/^# (.+)$/gm, "$1
")
+ .replace(/\*\*(.+?)\*\*/g, "$1")
+ .replace(/\*(.+?)\*/g, "$1")
+ .replace(/`([^`]+)`/g, "$1")
+ .replace(
+ /!\[([^\]]*)\]\((images\/[^)]+)\)/g,
+ (_, alt: string, src: string) => `
`,
+ )
+ .replace(
+ /!\[([^\]]*)\]\((https?:\/\/[^)]+)\)/g,
+ (_, alt: string, src: string) => `
`,
+ )
+ .replace(/\[\[([^\]]+)\]\]/g, '$1')
+ .replace(/\[([^\]]+)\]\(([^)]+)\)/g, '$1')
+ .replace(/^- (.+)$/gm, "$1")
+ .replace(/\n\n/g, "
")
+ .replace(/^/, "
")
+ .replace(/$/, "
")
+ .replace(//g, "- ")
+ .replace(/<\/li><\/p>/g, "
");
+}
+
+/**
+ * Escape the characters that are significant in HTML text and in
+ * double-quoted attribute values.
+ *
+ * `&` is replaced first so the entities produced by the later replacements
+ * are not escaped a second time.
+ *
+ * @param str - Untrusted text (e.g. an article title).
+ * @returns The text with `&`, `<`, `>` and `"` replaced by HTML entities.
+ */
+function escapeHtml(str: string): string {
+  return str
+    .replace(/&/g, "&amp;")
+    .replace(/</g, "&lt;")
+    .replace(/>/g, "&gt;")
+    .replace(/"/g, "&quot;");
+}
diff --git a/packages/cli/src/mcp/server.test.ts b/packages/cli/src/mcp/server.test.ts
index c3f68bd..bf20651 100644
--- a/packages/cli/src/mcp/server.test.ts
+++ b/packages/cli/src/mcp/server.test.ts
@@ -291,6 +291,267 @@ describe("MCP server", () => {
});
});
+ // ── kib_search advanced ───────────────────────────────────
+
+ describe("kib_search advanced", () => {
+ // Build a wiki article with title/slug/tags/date frontmatter, an H1 heading and a body.
+ // The slug is the lower-cased title with whitespace runs replaced by "-".
+ function taggedArticle(title: string, tags: string[], content: string): string {
+   const slug = title.toLowerCase().replace(/\s+/g, "-");
+   const tagList = tags.join(", ");
+   return `---\ntitle: ${title}\nslug: ${slug}\ntags: [${tagList}]\ndate: 2025-06-01\n---\n\n# ${title}\n\n${content}`;
+ }
+
+ test("filters by tag", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/transformer.md",
+ taggedArticle("Transformer", ["nlp", "deep-learning"], "A neural network."),
+ );
+ await writeWiki(
+ root,
+ "concepts/cnn.md",
+ taggedArticle("CNN", ["vision"], "A convolutional neural network."),
+ );
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_search",
+ arguments: { query: "neural network", tag: "nlp" },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const hits = JSON.parse(textOf(result));
+ expect(hits.length).toBe(1);
+ expect(hits[0].path).toContain("transformer.md");
+ });
+
+ test("filters by since date", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/old.md",
+ `---\ntitle: Old\nslug: old\ndate: 2023-01-01\n---\n\n# Old\n\nNeural network.`,
+ );
+ await writeWiki(
+ root,
+ "concepts/new.md",
+ `---\ntitle: New\nslug: new\ndate: 2025-06-01\n---\n\n# New\n\nNeural network.`,
+ );
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_search",
+ arguments: { query: "neural", since: "2025-01-01" },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const hits = JSON.parse(textOf(result));
+ expect(hits.length).toBe(1);
+ expect(hits[0].path).toContain("new.md");
+ });
+
+ test("scopes search to wiki only", async () => {
+ const root = await makeTempVault();
+ await writeWiki(root, "concepts/wiki.md", articleMd("Wiki Article", "Neural network."));
+ await writeRaw(root, "articles/raw.md", "# Raw Article\n\nNeural network.");
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_search",
+ arguments: { query: "neural", scope: "wiki" },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const hits = JSON.parse(textOf(result));
+ expect(hits.every((h: { path: string }) => h.path.includes("wiki/"))).toBe(true);
+ });
+ });
+
+ // ── kib_config ─────────────────────────────────────────────
+
+ describe("kib_config", () => {
+ test("lists all config", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({ name: "kib_config", arguments: {} });
+ expect(result.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(result));
+ expect(data.provider).toBeDefined();
+ expect(data.search).toBeDefined();
+ });
+
+ test("reads a specific config key", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_config",
+ arguments: { key: "search.engine" },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(result));
+ expect(data["search.engine"]).toBeDefined();
+ });
+
+ test("sets a config value", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const setResult = await client.callTool({
+ name: "kib_config",
+ arguments: { key: "search.engine", value: "hybrid" },
+ });
+ expect(setResult.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(setResult));
+ expect(data["search.engine"]).toBe("hybrid");
+ expect(data.saved).toBe(true);
+
+ // Verify it persisted
+ const getResult = await client.callTool({
+ name: "kib_config",
+ arguments: { key: "search.engine" },
+ });
+ const readBack = JSON.parse(textOf(getResult));
+ expect(readBack["search.engine"]).toBe("hybrid");
+ });
+
+ test("returns error for unknown key", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_config",
+ arguments: { key: "nonexistent.key" },
+ });
+ expect(result.isError).toBe(true);
+ });
+ });
+
+ // ── kib_skill ──────────────────────────────────────────────
+
+ describe("kib_skill", () => {
+ test("lists built-in skills", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_skill",
+ arguments: { action: "list" },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const skills = JSON.parse(textOf(result));
+ expect(Array.isArray(skills)).toBe(true);
+ expect(skills.length).toBeGreaterThan(0);
+ expect(skills[0].name).toBeDefined();
+ expect(skills[0].description).toBeDefined();
+ });
+
+ test("returns error when running without name", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_skill",
+ arguments: { action: "run" },
+ });
+ expect(result.isError).toBe(true);
+ expect(textOf(result)).toContain("name is required");
+ });
+
+ test("returns error for nonexistent skill", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_skill",
+ arguments: { action: "run", name: "nonexistent-skill" },
+ });
+ expect(result.isError).toBe(true);
+ expect(textOf(result)).toContain("not found");
+ });
+ });
+
+ // ── kib_export ─────────────────────────────────────────────
+
+ describe("kib_export", () => {
+ test("exports wiki as markdown", async () => {
+ const root = await makeTempVault();
+ await writeWiki(root, "concepts/test.md", articleMd("Test", "Test content."));
+ const client = await createClient(root);
+
+ const outputDir = join(tempDir, "export-test");
+ const result = await client.callTool({
+ name: "kib_export",
+ arguments: { format: "markdown", output: outputDir },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(result));
+ expect(data.format).toBe("markdown");
+ expect(data.files).toBeGreaterThan(0);
+ expect(data.output).toBe(outputDir);
+ });
+
+ test("exports wiki as html", async () => {
+ const root = await makeTempVault();
+ await writeWiki(root, "concepts/test.md", articleMd("Test", "Test content."));
+ const client = await createClient(root);
+
+ const outputDir = join(tempDir, "export-html-test");
+ const result = await client.callTool({
+ name: "kib_export",
+ arguments: { format: "html", output: outputDir },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(result));
+ expect(data.format).toBe("html");
+ expect(data.files).toBeGreaterThan(0);
+ });
+ });
+
+ // ── kib_compile with max ──────────────────────────────────
+
+ describe("kib_compile params", () => {
+ test("accepts dry_run and max params", async () => {
+ const root = await makeTempVault();
+ const client = await createClient(root);
+
+ // dry_run compile on an empty vault (may report an error when no provider is configured)
+ const result = await client.callTool({
+ name: "kib_compile",
+ arguments: { dry_run: true, max: 5 },
+ });
+ // Will error due to no provider, which is expected
+ // The point is it doesn't crash on the new params
+ expect(result).toBeDefined();
+ });
+ });
+
+ // ── kib_ingest dry_run ────────────────────────────────────
+
+ describe("kib_ingest dry_run", () => {
+ test("dry run returns preview without writing", async () => {
+ const root = await makeTempVault();
+ const filePath = join(tempDir, "dry-run-input.txt");
+ await writeFile(filePath, "Test document for dry run verification.");
+ const client = await createClient(root);
+
+ const result = await client.callTool({
+ name: "kib_ingest",
+ arguments: { source: filePath, dry_run: true },
+ });
+ expect(result.isError).toBeFalsy();
+
+ const data = JSON.parse(textOf(result));
+ expect(data.dryRun).toBe(true);
+ expect(data.path).toBeTruthy();
+ });
+ });
+
// ── Tool listing ───────────────────────────────────────────
describe("tool listing", () => {
@@ -302,12 +563,15 @@ describe("MCP server", () => {
const names = tools.map((t) => t.name).sort();
expect(names).toEqual([
"kib_compile",
+ "kib_config",
+ "kib_export",
"kib_ingest",
"kib_lint",
"kib_list",
"kib_query",
"kib_read",
"kib_search",
+ "kib_skill",
"kib_status",
]);
});
diff --git a/packages/cli/src/mcp/server.ts b/packages/cli/src/mcp/server.ts
index bd149d6..dabd0fe 100644
--- a/packages/cli/src/mcp/server.ts
+++ b/packages/cli/src/mcp/server.ts
@@ -1,6 +1,7 @@
import {
compileVault,
createProvider,
+ fixLintIssues,
ingestSource,
type LLMProvider,
lintVault,
@@ -15,6 +16,7 @@ import {
readRaw,
readWiki,
SearchIndex,
+ saveConfig,
type VaultConfig,
} from "@kibhq/core";
import { McpServer } from "@modelcontextprotocol/sdk/server/mcp.js";
@@ -147,7 +149,7 @@ export function createMcpServer(root: string) {
},
);
- // ── kib_read ──────────────────────────────────────────────
+ // ── kib_read ──────────────────────────────────────────────
server.tool(
"kib_read",
@@ -170,15 +172,35 @@ export function createMcpServer(root: string) {
server.tool(
"kib_search",
- "Search the knowledge base using full-text BM25 search",
+ "Search the knowledge base using full-text BM25 search. Supports fuzzy matching, phrase search (wrap in quotes), tag filtering, and date filtering.",
{
- query: z.string().describe("Search query"),
+ query: z
+ .string()
+ .describe(
+ 'Search query. Wrap phrases in quotes for exact match, e.g. "attention mechanism"',
+ ),
limit: z.number().int().positive().max(50).default(10).describe("Max results"),
+ tag: z
+ .union([z.string(), z.array(z.string())])
+ .optional()
+ .describe("Filter by frontmatter tag(s). Single tag or array for AND logic."),
+ since: z
+ .string()
+ .optional()
+ .describe("Filter to articles dated on or after this date (YYYY-MM-DD)"),
+ scope: z
+ .enum(["wiki", "raw", "all"])
+ .default("all")
+ .describe("Search scope: wiki articles, raw sources, or all"),
},
- async ({ query, limit }) => {
+ async ({ query, limit, tag, since, scope }) => {
try {
- const index = await ctx.getSearchIndex();
- const results = index.search(query, { limit });
+ // Rebuild index with requested scope if not 'all'
+ const index = scope === "all" ? await ctx.getSearchIndex() : new SearchIndex();
+ if (scope !== "all") {
+ await index.build(root, scope);
+ }
+ const results = index.search(query, { limit, tag, since });
const prefix = `${root}/`;
return json(
results.map((r) => ({
@@ -240,14 +262,29 @@ export function createMcpServer(root: string) {
.optional()
.describe("Raw subdirectory override (e.g. 'papers', 'articles')"),
tags: z.string().optional().describe("Comma-separated tags"),
+ dry_run: z
+ .boolean()
+ .default(false)
+ .describe("Preview what would be ingested without writing"),
},
- async ({ source, category, tags }) => {
+ async ({ source, category, tags, dry_run }) => {
try {
const result = await ingestSource(root, source, {
category,
tags: tags?.split(",").map((t) => t.trim()),
+ dryRun: dry_run,
});
+ if (dry_run) {
+ return json({
+ dryRun: true,
+ path: result.path,
+ title: result.title,
+ wordCount: result.wordCount,
+ skipped: result.skipped,
+ });
+ }
+
// Auto-compile after ingest so content is immediately queryable
let compiled = null;
if (!result.skipped) {
@@ -305,8 +342,14 @@ export function createMcpServer(root: string) {
force: z.boolean().default(false).describe("Recompile all sources"),
source: z.string().optional().describe("Compile only a specific source"),
dry_run: z.boolean().default(false).describe("Preview without writing"),
+ max: z
+ .number()
+ .int()
+ .positive()
+ .optional()
+ .describe("Limit number of sources to compile per pass"),
},
- async ({ force, source, dry_run }) => {
+ async ({ force, source, dry_run, max }) => {
try {
const provider = await ctx.getProvider();
const config = await ctx.getConfig();
@@ -314,6 +357,7 @@ export function createMcpServer(root: string) {
force,
dryRun: dry_run,
sourceFilter: source,
+ maxSources: max,
});
ctx.invalidateSearch();
return json({
@@ -333,7 +377,7 @@ export function createMcpServer(root: string) {
server.tool(
"kib_lint",
- "Run health checks on the wiki and report issues",
+ "Run health checks on the wiki and report issues. Use fix=true to auto-fix fixable issues (recompile stale sources, create missing articles).",
{
rule: z
.string()
@@ -341,11 +385,44 @@ export function createMcpServer(root: string) {
.describe(
"Run only a specific rule: orphan, stale, missing, broken-link, frontmatter, contradiction",
),
+ fix: z
+ .boolean()
+ .default(false)
+ .describe("Auto-fix fixable issues (recompile stale, create missing articles)"),
},
- async ({ rule }) => {
+ async ({ rule, fix }) => {
try {
const provider = await ctx.getProvider().catch(() => undefined);
const result = await lintVault(root, { ruleFilter: rule, provider });
+
+ if (fix) {
+ const fixable = result.diagnostics.filter((d) => d.fixable);
+ if (fixable.length > 0) {
+ let fixProvider: LLMProvider | undefined;
+ let config: VaultConfig | undefined;
+ const hasStale = fixable.some((d) => d.rule === "stale");
+ if (hasStale) {
+ try {
+ config = await ctx.getConfig();
+ fixProvider = await ctx.getProvider();
+ } catch {
+ // Provider not available — stale fixes will be skipped
+ }
+ }
+ const fixResult = await fixLintIssues(root, result.diagnostics, fixProvider, config);
+ ctx.invalidateSearch();
+ return json({
+ diagnostics: result.diagnostics,
+ errors: result.errors,
+ warnings: result.warnings,
+ infos: result.infos,
+ fixed: fixResult.fixed,
+ fixSkipped: fixResult.skipped,
+ fixErrors: fixResult.errors,
+ });
+ }
+ }
+
return json({
errors: result.errors,
warnings: result.warnings,
@@ -358,6 +435,114 @@ export function createMcpServer(root: string) {
},
);
+ // ── kib_config ────────────────────────────────────────────
+
+ // kib_config: no key → dump the whole config; key only → read one value;
+ // key + value → parse the value, set it, and persist the config.
+ // An empty-string `value` is falsy and is therefore handled as a read.
+ server.tool(
+   "kib_config",
+   "Get or set vault configuration. Call with no arguments to list all config. Pass key to read a value, pass key+value to set it.",
+   {
+     key: z
+       .string()
+       .optional()
+       .describe(
+         "Dot-separated config key (e.g. 'provider.default', 'provider.model', 'search.engine')",
+       ),
+     value: z.string().optional().describe("Value to set. Omit to read the current value."),
+   },
+   async ({ key, value }) => {
+     try {
+       const current = await loadConfig(root);
+
+       if (!key) {
+         return json(current);
+       }
+
+       if (value) {
+         // Write path: setNestedValue mutates `current` in place and refuses unknown keys.
+         const nextValue = parseConfigValue(value);
+         if (!setNestedValue(current, key, nextValue)) {
+           return err(`Unknown config key: ${key}`);
+         }
+         await saveConfig(root, current);
+         return json({ [key]: nextValue, saved: true });
+       }
+
+       // Read path.
+       const found = getNestedValue(current, key);
+       return found === undefined ? err(`Unknown config key: ${key}`) : json({ [key]: found });
+     } catch (e) {
+       return err((e as Error).message);
+     }
+   },
+ );
+
+ // ── kib_skill ─────────────────────────────────────────────
+
+ // kib_skill: "list" returns name/description for every loaded skill; "run"
+ // looks a skill up by name, creates a provider only when the skill declares
+ // it needs an LLM, and returns the skill's content (or null).
+ server.tool(
+   "kib_skill",
+   "List or run vault skills. Skills are reusable LLM-powered operations (summarize, flashcards, connections, etc).",
+   {
+     action: z
+       .enum(["list", "run"])
+       .describe("'list' to see available skills, 'run' to execute one"),
+     name: z.string().optional().describe("Skill name to run (required when action is 'run')"),
+   },
+   async ({ action, name }) => {
+     try {
+       // Loaded lazily, inside the try so an import failure is reported as a tool error.
+       const core = await import("@kibhq/core");
+
+       if (action === "list") {
+         const available = await core.loadSkills(root);
+         return json(
+           available.map((entry) => ({ name: entry.name, description: entry.description })),
+         );
+       }
+
+       // From here on the action is "run".
+       if (!name) {
+         return err("Skill name is required. Use action='list' to see available skills.");
+       }
+
+       const skill = await core.findSkill(root, name);
+       if (!skill) {
+         return err(`Skill "${name}" not found. Use action='list' to see available skills.`);
+       }
+
+       let provider: LLMProvider | undefined;
+       if (skill.llm?.required) {
+         const config = await ctx.getConfig();
+         // Skills asking for the "fast" model use `fast_model`; everything else uses `model`.
+         const modelKey = skill.llm.model === "fast" ? "fast_model" : "model";
+         const model = config.provider[modelKey as keyof typeof config.provider] as string;
+         provider = await createProvider(config.provider.default, model);
+       }
+
+       const outcome = await core.runSkill(root, skill, { provider });
+       return json({ skill: skill.name, content: outcome.content ?? null });
+     } catch (e) {
+       return err((e as Error).message);
+     }
+   },
+ );
+
+ // ── kib_export ────────────────────────────────────────────
+
+ // kib_export: delegates to exportVault (loaded lazily) and returns its
+ // { format, output, files } result; failures are reported as tool errors.
+ server.tool(
+   "kib_export",
+   "Export the wiki as a clean markdown bundle or static HTML site. Returns the output directory path and file count.",
+   {
+     format: z
+       .enum(["markdown", "html"])
+       .default("markdown")
+       .describe("Export format: 'markdown' (clean, no frontmatter) or 'html' (static site)"),
+     // exportVault falls back to join(root, "export"), i.e. inside the vault —
+     // not the filesystem-root "/export" the previous description suggested.
+     output: z
+       .string()
+       .optional()
+       .describe("Output directory path. Defaults to an 'export' directory inside the vault root"),
+   },
+   async ({ format, output }) => {
+     try {
+       const { exportVault } = await import("./export-helper.js");
+       const result = await exportVault(root, format, output);
+       return json(result);
+     } catch (e) {
+       return err((e as Error).message);
+     }
+   },
+ );
+
// ── Resources ─────────────────────────────────────────────
server.resource("wiki-index", "wiki://index", { mimeType: "text/markdown" }, async () => {
@@ -385,6 +570,45 @@ export function createMcpServer(root: string) {
return server;
}
+// ─── Config Helpers ─────────────────────────────────────────────
+
+/**
+ * Resolve a dot-separated key path (e.g. `"search.engine"`) against a config object.
+ *
+ * Only own properties are followed, so inherited names such as `constructor`
+ * or `__proto__` resolve to `undefined` like any other unknown key.
+ *
+ * @param obj - Object to read from.
+ * @param path - Dot-separated key path.
+ * @returns The value at `path`, or `undefined` when any segment is missing
+ *   or a non-object is reached before the last segment.
+ */
+function getNestedValue(obj: Record<string, unknown>, path: string): unknown {
+  let current: unknown = obj;
+  for (const part of path.split(".")) {
+    if (
+      current == null ||
+      typeof current !== "object" ||
+      !Object.prototype.hasOwnProperty.call(current, part)
+    ) {
+      return undefined;
+    }
+    current = (current as Record<string, unknown>)[part];
+  }
+  return current;
+}
+
+/**
+ * Overwrite an existing value at a dot-separated key path, mutating `obj` in place.
+ *
+ * Every segment must be an own property of the object it is looked up on.
+ * Unknown keys are rejected rather than created, and paths that reach through
+ * inherited members (`__proto__`, `constructor`, …) are refused so a
+ * caller-supplied key cannot modify shared prototypes.
+ *
+ * @param obj - Object to update.
+ * @param path - Dot-separated key path of an existing entry.
+ * @param value - New value to store.
+ * @returns `true` when the value was written, `false` when the path does not exist.
+ */
+function setNestedValue(obj: Record<string, unknown>, path: string, value: unknown): boolean {
+  const parents = path.split(".");
+  // split() always yields at least one element; the guard only satisfies the type checker.
+  const lastKey = parents.pop();
+  if (lastKey === undefined) return false;
+
+  let current: unknown = obj;
+  for (const part of [...parents, lastKey]) {
+    if (
+      current == null ||
+      typeof current !== "object" ||
+      !Object.prototype.hasOwnProperty.call(current, part)
+    ) {
+      return false;
+    }
+    // Stop descending at the container of the final key.
+    if (part === lastKey && parents.length === 0) break;
+    if (parents.length > 0) {
+      current = (current as Record<string, unknown>)[part];
+      parents.shift();
+    }
+  }
+
+  (current as Record<string, unknown>)[lastKey] = value;
+  return true;
+}
+
+/**
+ * Convert a string supplied for a config value into a typed value.
+ *
+ * `"true"` / `"false"` become booleans, anything `Number()` can parse (other
+ * than blank strings) becomes a number, and everything else is returned unchanged.
+ */
+function parseConfigValue(val: string): unknown {
+  switch (val) {
+    case "true":
+      return true;
+    case "false":
+      return false;
+  }
+
+  // Number("") and Number("  ") are 0, so blank input must stay a string.
+  if (val.trim() === "") return val;
+
+  const asNumber = Number(val);
+  return Number.isNaN(asNumber) ? val : asNumber;
+}
+
export async function startMcpServer(root: string) {
const server = createMcpServer(root);
const transport = new StdioServerTransport();
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index a85e233..86b1480 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -16,7 +16,7 @@ export { ALL_RULES } from "./lint/rules.js";
export { createProvider, detectProvider } from "./providers/router.js";
export { queryVault } from "./query/query.js";
export * from "./schemas.js";
-export { SearchIndex } from "./search/engine.js";
+export { highlightSnippet, parseQuery, SearchIndex } from "./search/engine.js";
export { HybridSearch } from "./search/hybrid.js";
export { VectorIndex } from "./search/vector.js";
export { findSkill, loadSkills } from "./skills/loader.js";
diff --git a/packages/core/src/search/engine.test.ts b/packages/core/src/search/engine.test.ts
index e2398f5..5308608 100644
--- a/packages/core/src/search/engine.test.ts
+++ b/packages/core/src/search/engine.test.ts
@@ -3,7 +3,7 @@ import { mkdtemp, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { initVault, writeWiki } from "../vault.js";
-import { SearchIndex } from "./engine.js";
+import { editDistance1, highlightSnippet, parseQuery, SearchIndex } from "./engine.js";
let tempDir: string;
@@ -228,3 +228,379 @@ describe("SearchIndex", () => {
expect(results[0]!.title).toBe("Transformer");
});
});
+
+// ─── Fuzzy Matching ─────────────────────────────────────────────
+
+describe("editDistance1", () => { // Unit tests: editDistance1 is true only when the two strings are exactly one edit apart.
+ test("returns true for single substitution", () => {
+ expect(editDistance1("cat", "bat")).toBe(true);
+ expect(editDistance1("hello", "hallo")).toBe(true);
+ });
+
+ test("returns true for single insertion", () => {
+ expect(editDistance1("cat", "cart")).toBe(true);
+ expect(editDistance1("test", "teset")).toBe(true);
+ });
+
+ test("returns true for single deletion", () => {
+ expect(editDistance1("cart", "cat")).toBe(true);
+ expect(editDistance1("hello", "helo")).toBe(true);
+ });
+
+ test("returns false for identical strings", () => { // distance 0 is not a match: the helper means "exactly one edit"
+ expect(editDistance1("same", "same")).toBe(false);
+ });
+
+ test("returns false for distance > 1", () => {
+ expect(editDistance1("cat", "dog")).toBe(false);
+ expect(editDistance1("hello", "world")).toBe(false);
+ expect(editDistance1("abc", "abcde")).toBe(false); // lengths differ by 2
+ });
+
+ test("returns false for empty vs 2+ chars", () => {
+ expect(editDistance1("", "ab")).toBe(false);
+ });
+
+ test("handles single char edge cases", () => {
+ expect(editDistance1("a", "b")).toBe(true); // substitution
+ expect(editDistance1("a", "ab")).toBe(true); // insertion
+ expect(editDistance1("ab", "a")).toBe(true); // deletion
+ });
+});
+
+describe("fuzzy search", () => { // Integration tests: SearchIndex.search tolerates a one-character typo, but not for short query tokens.
+ test("finds results with typos (edit distance 1)", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/transformer.md",
+ articleMd(
+ "Transformer Architecture",
+ "The transformer is a neural network architecture based on self-attention mechanisms.",
+ ),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ // "transfomer" is a common typo (missing 'r')
+ const results = index.search("transfomer");
+ expect(results.length).toBeGreaterThan(0);
+ expect(results[0]!.title).toBe("Transformer Architecture");
+ });
+
+ test("does not fuzzy match short tokens (< 4 chars)", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/ai.md",
+ articleMd("AI Basics", "AI is artificial intelligence."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ // "ax" is short — should not fuzzy-match "ai"
+ const results = index.search("ax");
+ expect(results).toHaveLength(0); // no exact hit and no fuzzy fallback, so nothing is returned
+ });
+});
+
+// ─── Phrase Search ──────────────────────────────────────────────
+
+describe("parseQuery", () => { // Unit tests: parseQuery splits a raw query into quoted phrases and the remaining whitespace-separated terms.
+ test("extracts quoted phrases", () => {
+ const result = parseQuery('"attention mechanism" transformer');
+ expect(result.phrases).toEqual(["attention mechanism"]); // quotes are stripped from the phrase
+ expect(result.terms).toEqual(["transformer"]);
+ });
+
+ test("handles multiple quoted phrases", () => {
+ const result = parseQuery('"hello world" foo "bar baz"');
+ expect(result.phrases).toEqual(["hello world", "bar baz"]); // phrases keep their order of appearance
+ expect(result.terms).toEqual(["foo"]);
+ });
+
+ test("handles no quotes", () => {
+ const result = parseQuery("simple search query");
+ expect(result.phrases).toEqual([]);
+ expect(result.terms).toEqual(["simple", "search", "query"]);
+ });
+
+ test("handles only quotes", () => {
+ const result = parseQuery('"exact phrase"');
+ expect(result.phrases).toEqual(["exact phrase"]);
+ expect(result.terms).toEqual([]);
+ });
+
+ test("handles empty query", () => {
+ const result = parseQuery("");
+ expect(result.phrases).toEqual([]);
+ expect(result.terms).toEqual([]);
+ });
+});
+
+describe("phrase search", () => { // Integration tests: a quoted phrase acts as a hard filter on SearchIndex.search results.
+ test("exact phrase matches rank and filter correctly", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/attention.md",
+ articleMd(
+ "Attention Mechanisms",
+ "Self-attention mechanisms compute weighted sums. Attention is all you need.",
+ ),
+ );
+ await writeWiki(
+ root,
+ "concepts/rnn.md",
+ articleMd(
+ "Recurrent Networks",
+ "RNNs process sequences. Some use attention over hidden states.",
+ ),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ // Phrase search should only match articles containing the exact phrase
+ const results = index.search('"attention is all you need"'); // the article capitalizes "Attention", so this also covers case-insensitive matching
+ expect(results.length).toBe(1);
+ expect(results[0]!.title).toBe("Attention Mechanisms");
+ });
+
+ test("phrase search with additional terms", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/transformer.md",
+ articleMd("Transformer", "The transformer uses self-attention mechanisms for processing."),
+ );
+ await writeWiki(
+ root,
+ "concepts/cnn.md",
+ articleMd("CNN", "CNNs use convolutional layers for self-attention on images. Not really."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ // Only the transformer article has "self-attention mechanisms" as a phrase
+ const results = index.search('"self-attention mechanisms" transformer'); // one quoted phrase plus one loose term
+ expect(results.length).toBe(1);
+ expect(results[0]!.title).toBe("Transformer");
+ });
+});
+
+// ─── Tag Filtering ──────────────────────────────────────────────
+
+describe("tag filtering", () => { // Integration tests: the `tag` option of SearchIndex.search restricts results by frontmatter tags.
+ function taggedArticle(title: string, tags: string[], content: string): string { // builds markdown whose frontmatter has a title-derived slug and an inline tags list
+ return `---\ntitle: ${title}\nslug: ${title.toLowerCase().replace(/\s+/g, "-")}\ntags: [${tags.join(", ")}]\n---\n\n# ${title}\n\n${content}`;
+ }
+
+ test("filters results by single tag", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/transformer.md",
+ taggedArticle("Transformer", ["deep-learning", "nlp"], "Neural network architecture."),
+ );
+ await writeWiki(
+ root,
+ "concepts/cnn.md",
+ taggedArticle("CNN", ["deep-learning", "vision"], "Convolutional neural network."),
+ );
+ await writeWiki(
+ root,
+ "concepts/bert.md",
+ taggedArticle("BERT", ["nlp"], "Bidirectional encoder from transformers. A neural network."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ const results = index.search("neural network", { tag: "nlp" });
+ // Only transformer and BERT have the nlp tag
+ expect(results.every((r) => r.title === "Transformer" || r.title === "BERT")).toBe(true);
+ expect(results.some((r) => r.title === "CNN")).toBe(false); // NOTE(review): both checks also pass when results is empty; consider asserting results.length > 0 as well
+ });
+
+ test("filters results by multiple tags (AND logic)", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/transformer.md",
+ taggedArticle("Transformer", ["deep-learning", "nlp"], "A neural architecture."),
+ );
+ await writeWiki(
+ root,
+ "concepts/bert.md",
+ taggedArticle("BERT", ["nlp"], "Bidirectional encoder. A neural architecture."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ const results = index.search("neural", { tag: ["deep-learning", "nlp"] }); // an array of tags means the document must carry every one of them
+ // Only transformer has both tags
+ expect(results.length).toBe(1);
+ expect(results[0]!.title).toBe("Transformer");
+ });
+
+ test("returns empty when no docs match tag", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/test.md",
+ taggedArticle("Test", ["misc"], "Some content about testing."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ const results = index.search("test", { tag: "nonexistent" }); // the query text matches, but the tag filter removes the only document
+ expect(results).toHaveLength(0);
+ });
+});
+
+// ─── Date Filtering ─────────────────────────────────────────────
+
+describe("date filtering", () => { // Integration tests: the `since` option of SearchIndex.search drops documents dated before the cutoff.
+ function datedArticle(title: string, date: string, content: string): string { // builds markdown whose frontmatter has a title-derived slug and a `date` field
+ return `---\ntitle: ${title}\nslug: ${title.toLowerCase().replace(/\s+/g, "-")}\ndate: ${date}\n---\n\n# ${title}\n\n${content}`;
+ }
+
+ test("filters results by --since date", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/old.md",
+ datedArticle("Old Article", "2023-01-15", "Neural networks from the past."),
+ );
+ await writeWiki(
+ root,
+ "concepts/new.md",
+ datedArticle("New Article", "2025-06-01", "Recent neural network research."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ const results = index.search("neural", { since: "2025-01-01" }); // both articles match the text; only the 2025 one survives the cutoff
+ expect(results.length).toBe(1);
+ expect(results[0]!.title).toBe("New Article");
+ });
+
+ test("includes articles on the exact --since date", async () => { // the cutoff is inclusive
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/exact.md",
+ datedArticle("Exact Date", "2025-03-15", "Neural network content."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ const results = index.search("neural", { since: "2025-03-15" });
+ expect(results.length).toBe(1);
+ });
+
+ test("includes articles with no date when using --since", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/nodated.md",
+ articleMd("No Date", "Neural network with no date frontmatter."),
+ );
+
+ const index = new SearchIndex();
+ await index.build(root, "wiki");
+
+ // Articles with no date should not be excluded
+ const results = index.search("neural", { since: "2025-01-01" });
+ expect(results.length).toBe(1);
+ });
+});
+
+// ─── Highlighting ───────────────────────────────────────────────
+
+describe("highlightSnippet", () => { // Unit tests: highlightSnippet wraps matching words in ANSI bold and leaves all other text untouched.
+ test("bolds matched words", () => {
+ const result = highlightSnippet("The transformer architecture is powerful", ["transform"]);
+ expect(result).toContain("\x1b[1mtransformer\x1b[22m"); // the whole original word is wrapped, not just the part equal to the token
+ expect(result).toContain("The");
+ expect(result).toContain("is powerful");
+ });
+
+ test("highlights multiple terms", () => {
+ const result = highlightSnippet("Neural networks use attention mechanisms", [
+ "neural",
+ "attention",
+ ]);
+ expect(result).toContain("\x1b[1mNeural\x1b[22m"); // matching ignores case; the original casing is kept in the output
+ expect(result).toContain("\x1b[1mattention\x1b[22m");
+ });
+
+ test("returns unchanged snippet when no tokens match", () => {
+ const snippet = "No matches here";
+ const result = highlightSnippet(snippet, ["quantum"]);
+ expect(result).toBe(snippet);
+ });
+
+ test("returns unchanged snippet with empty tokens", () => {
+ const snippet = "Some text";
+ expect(highlightSnippet(snippet, [])).toBe(snippet);
+ });
+});
+
+// ─── Save/Load with tags and date ───────────────────────────────
+
+describe("index serialization with metadata", () => { // Round-trip test: tags and dates must survive SearchIndex.save followed by load.
+ function taggedDatedArticle( // builds markdown whose frontmatter has a title-derived slug, an inline tags list and a `date` field
+ title: string,
+ tags: string[],
+ date: string,
+ content: string,
+ ): string {
+ return `---\ntitle: ${title}\nslug: ${title.toLowerCase().replace(/\s+/g, "-")}\ntags: [${tags.join(", ")}]\ndate: ${date}\n---\n\n# ${title}\n\n${content}`;
+ }
+
+ test("save and load preserves tag and date filtering", async () => {
+ const root = await makeTempVault();
+ await writeWiki(
+ root,
+ "concepts/tagged.md",
+ taggedDatedArticle(
+ "Tagged Article",
+ ["ml", "nlp"],
+ "2025-06-01",
+ "Machine learning content.",
+ ),
+ );
+ await writeWiki(
+ root,
+ "concepts/other.md",
+ taggedDatedArticle("Other Article", ["vision"], "2024-01-01", "Computer vision content."),
+ );
+
+ const index1 = new SearchIndex();
+ await index1.build(root, "wiki");
+ await index1.save(root);
+
+ const index2 = new SearchIndex(); // fresh instance: everything it knows comes from the saved cache
+ const loaded = await index2.load(root);
+ expect(loaded).toBe(true);
+
+ // Tag filter should still work after load
+ const tagResults = index2.search("content", { tag: "ml" }); // both articles contain "content"; only one is tagged "ml"
+ expect(tagResults.length).toBe(1);
+ expect(tagResults[0]!.title).toBe("Tagged Article");
+
+ // Date filter should still work after load
+ const dateResults = index2.search("content", { since: "2025-01-01" }); // the other article is dated 2024-01-01
+ expect(dateResults.length).toBe(1);
+ expect(dateResults[0]!.title).toBe("Tagged Article");
+ });
+});
diff --git a/packages/core/src/search/engine.ts b/packages/core/src/search/engine.ts
index 8e2db56..544e81f 100644
--- a/packages/core/src/search/engine.ts
+++ b/packages/core/src/search/engine.ts
@@ -165,16 +165,20 @@ interface Document {
tokens: string[];
tokenCount: number;
termFreqs: Map;
+ tags: string[];
+ date: string | null;
}
interface SerializedIndex {
- version: 1;
+ version: 2;
documents: {
path: string;
title: string;
snippet: string;
tokenCount: number;
termFreqs: [string, number][];
+ tags: string[];
+ date: string | null;
}[];
idf: [string, number][];
avgDl: number;
@@ -210,6 +214,20 @@ export class SearchIndex {
const title =
(frontmatter.title as string) ?? filePath.split("/").pop()?.replace(/\.md$/, "") ?? "";
+ // Extract tags from frontmatter
+ const rawTags = frontmatter.tags;
+ const tags: string[] = Array.isArray(rawTags)
+ ? rawTags.map((t: unknown) => String(t).toLowerCase())
+ : [];
+
+ // Extract date from frontmatter (try common field names)
+ const rawDate =
+ (frontmatter.date as string) ??
+ (frontmatter.created as string) ??
+ (frontmatter.ingested as string) ??
+ null;
+ const date = rawDate && !Number.isNaN(Date.parse(String(rawDate))) ? String(rawDate) : null;
+
const tokens = tokenize(`${title} ${title} ${body}`); // title gets extra weight
const termFreqs = new Map();
for (const token of tokens) {
@@ -223,6 +241,8 @@ export class SearchIndex {
tokens,
tokenCount: tokens.length,
termFreqs,
+ tags,
+ date,
});
}
@@ -254,34 +274,92 @@ export class SearchIndex {
}
/**
- * Search the index using BM25 scoring.
+ * Search the index using BM25 scoring with fuzzy matching, phrase search,
+ * tag filtering, date filtering, and optional highlighting.
*/
- search(query: string, opts: { limit?: number; threshold?: number } = {}): SearchResult[] {
+ search(
+ query: string,
+ opts: {
+ limit?: number;
+ threshold?: number;
+ tag?: string | string[];
+ since?: string;
+ highlight?: boolean;
+ } = {},
+ ): SearchResult[] {
const limit = opts.limit ?? 20;
const threshold = opts.threshold ?? 0;
- const queryTokens = tokenize(query);
+ const highlight = opts.highlight ?? false;
+
+ // Parse tag filter
+ const tagFilter: string[] | null = opts.tag
+ ? (Array.isArray(opts.tag) ? opts.tag : [opts.tag]).map((t) => t.toLowerCase())
+ : null;
- if (queryTokens.length === 0 || this.documents.length === 0) {
+ // Parse date filter
+ const sinceTs = opts.since ? Date.parse(opts.since) : null;
+
+ // Parse phrases (quoted strings) and remaining terms
+ const { phrases, terms } = parseQuery(query);
+ const queryTokens = terms.flatMap((t) => tokenize(t));
+
+ if ((queryTokens.length === 0 && phrases.length === 0) || this.documents.length === 0) {
return [];
}
const scores: { doc: Document; score: number }[] = [];
for (const doc of this.documents) {
+ // Tag filter: skip docs that don't have all required tags
+ if (tagFilter && !tagFilter.every((t) => doc.tags.includes(t))) {
+ continue;
+ }
+
+ // Date filter: skip docs older than --since
+ if (sinceTs && doc.date) {
+ const docTs = Date.parse(doc.date);
+ if (!Number.isNaN(docTs) && docTs < sinceTs) continue;
+ }
+
+ // Phrase filter: skip docs that don't contain all exact phrases
+ if (phrases.length > 0) {
+ const lowerContent = `${doc.title} ${doc.content}`.toLowerCase();
+ if (!phrases.every((p) => lowerContent.includes(p.toLowerCase()))) {
+ continue;
+ }
+ }
+
let score = 0;
const dl = doc.tokenCount;
for (const qt of queryTokens) {
- const tf = doc.termFreqs.get(qt) ?? 0;
+ // Exact match first
+ let tf = doc.termFreqs.get(qt) ?? 0;
+
+ // Fuzzy match: if no exact hit, check edit distance ≤ 1 for tokens ≥ 4 chars
+ if (tf === 0 && qt.length >= 4) {
+ for (const [docToken, freq] of doc.termFreqs) {
+ if (editDistance1(qt, docToken)) {
+ tf = Math.ceil(freq * 0.8); // discount fuzzy matches slightly
+ break;
+ }
+ }
+ }
+
if (tf === 0) continue;
- const idfVal = this.idf.get(qt) ?? 0;
+ const idfVal = this.idf.get(qt) ?? this.computeFuzzyIdf(qt);
const tfNorm =
(tf * (this.k1 + 1)) / (tf + this.k1 * (1 - this.b + this.b * (dl / this.avgDl)));
score += idfVal * tfNorm;
}
+ // Give a bonus for phrase matches (phrases already filtered above)
+ if (phrases.length > 0) {
+ score += phrases.length * 2.0;
+ }
+
if (score > threshold) {
scores.push({ doc, score });
}
@@ -290,26 +368,48 @@ export class SearchIndex {
// Sort by score descending
scores.sort((a, b) => b.score - a.score);
+ // Collect all terms for highlighting (query tokens + phrase words)
+ const highlightTerms = highlight
+ ? [...queryTokens, ...phrases.flatMap((p) => tokenize(p))]
+ : [];
+
return scores.slice(0, limit).map(({ doc, score }) => ({
path: doc.path,
score: Math.round(score * 100) / 100,
- snippet: extractSnippet(doc.content, queryTokens),
+ snippet: highlight
+ ? highlightSnippet(
+ extractSnippet(doc.content, [...queryTokens, ...phrases]),
+ highlightTerms,
+ )
+ : extractSnippet(doc.content, [...queryTokens, ...phrases]),
title: doc.title || undefined,
}));
}
+ /**
+ * Compute approximate IDF for a fuzzy-matched term by finding the closest known term.
+ */
+ private computeFuzzyIdf(token: string): number {
+ for (const [term, idf] of this.idf) {
+ if (editDistance1(token, term)) return idf * 0.8;
+ }
+ return 0;
+ }
+
/**
* Serialize the index for caching.
*/
serialize(): string {
const data: SerializedIndex = {
- version: 1,
+ version: 2,
documents: this.documents.map((d) => ({
path: d.path,
title: d.title,
snippet: d.content.slice(0, 200),
tokenCount: d.tokens.length,
termFreqs: [...d.termFreqs.entries()],
+ tags: d.tags,
+ date: d.date,
})),
idf: [...this.idf.entries()],
avgDl: this.avgDl,
@@ -336,9 +436,9 @@ export class SearchIndex {
try {
const raw = await readFile(path, "utf-8");
- const data = JSON.parse(raw) as SerializedIndex;
+ const data = JSON.parse(raw) as SerializedIndex & { version: number };
- if (data.version !== 1) return false;
+ if (data.version !== 1 && data.version !== 2) return false;
this.documents = data.documents.map((d) => ({
path: d.path,
@@ -347,6 +447,8 @@ export class SearchIndex {
tokens: [], // Not needed for search — termFreqs is enough
tokenCount: d.tokenCount,
termFreqs: new Map(d.termFreqs),
+ tags: (d as { tags?: string[] }).tags ?? [],
+ date: (d as { date?: string | null }).date ?? null,
}));
this.idf = new Map(data.idf);
this.avgDl = data.avgDl;
@@ -388,3 +490,83 @@ function extractSnippet(content: string, queryTokens: string[], maxLength = 150)
return snippet;
}
+
+// ─── Query Parser ───────────────────────────────────────────────
+
/**
 * Split a raw search query into exact phrases and loose terms.
 *
 * Text between a balanced pair of double quotes becomes one entry in
 * `phrases` (quotes removed, surrounding whitespace trimmed). Everything else
 * is split on whitespace into `terms`.
 *
 * Example: `"attention mechanism" transformer`
 *   → phrases: ["attention mechanism"], terms: ["transformer"]
 *
 * @param query - Raw query string.
 * @returns Phrases and terms, each in order of appearance; both arrays are
 *          empty for a blank query.
 */
export function parseQuery(query: string): { phrases: string[]; terms: string[] } {
  const phrases: string[] = [];

  const remaining = query.replace(/"([^"]+)"/g, (_match, phrase: string) => {
    const trimmed = phrase.trim();
    // A whitespace-only phrase such as `" "` would match any text that
    // contains a space, so it is dropped instead of becoming a filter.
    if (trimmed !== "") phrases.push(trimmed);
    // Leave a separator behind so `foo"bar baz"qux` yields the terms "foo"
    // and "qux" rather than a fused "fooqux".
    return " ";
  });

  const terms = remaining
    .split(/\s+/)
    // Remove quotes left over from an unbalanced `"` or an empty `""` pair.
    .map((t) => t.replace(/"/g, ""))
    .filter(Boolean);

  return { phrases, terms };
}
+
+// ─── Fuzzy Matching ─────────────────────────────────────────────
+
/**
 * Report whether `a` and `b` are exactly one edit apart, where an edit is a
 * single-character substitution, insertion, or deletion.
 *
 * Identical strings are distance 0 and therefore return `false`.
 * No DP matrix is built: a length check rules out most pairs, and the rest
 * are settled in a single pass.
 *
 * @param a - First string.
 * @param b - Second string.
 * @returns `true` only when the edit distance between `a` and `b` is 1.
 */
export function editDistance1(a: string, b: string): boolean {
  if (Math.abs(a.length - b.length) > 1) return false;

  if (a.length === b.length) {
    // Same length: the only possible single edit is one substitution.
    let mismatches = 0;
    for (let k = 0; k < a.length && mismatches < 2; k++) {
      if (a[k] !== b[k]) mismatches++;
    }
    return mismatches === 1;
  }

  // Lengths differ by one: the longer string must equal the shorter one with
  // a single extra character somewhere.
  const [long, short] = a.length > b.length ? [a, b] : [b, a];
  let split = 0;
  while (split < short.length && long[split] === short[split]) split++;
  // Drop the extra character at `split`; the two tails must then agree.
  return long.slice(split + 1) === short.slice(split);
}
+
+// ─── Highlighting ───────────────────────────────────────────────
+
/**
 * Highlight matched terms in a snippet using ANSI bold.
 *
 * Every run of ASCII letters/digits in `snippet` is lower-cased and stemmed,
 * then compared with the query tokens. A word is wrapped in
 * `\x1b[1m … \x1b[22m` when its stem equals a token, or when it is exactly
 * one edit away from a token that is at least 4 characters long.
 *
 * The 4-character floor mirrors the fuzzy-match rule in `SearchIndex.search`.
 * Without it a short token such as "ai" would also bold unrelated words like
 * "a", "is" or "at".
 *
 * @param snippet - Plain-text snippet to decorate.
 * @param queryTokens - Tokens to match against word stems (`search` passes
 *                      the output of `tokenize`).
 * @returns The snippet with matching words wrapped in ANSI bold; returned
 *          unchanged when `queryTokens` is empty or nothing matches.
 */
export function highlightSnippet(snippet: string, queryTokens: string[]): string {
  if (queryTokens.length === 0) return snippet;

  const minFuzzyLength = 4;
  // Hoisted out of the per-word callback: exact lookups go through a Set and
  // only tokens long enough for fuzzy matching are scanned.
  const exactTokens = new Set(queryTokens);
  const fuzzyTokens = queryTokens.filter((qt) => qt.length >= minFuzzyLength);

  return snippet.replace(/[a-zA-Z0-9]+/g, (word) => {
    const stemmed = stem(word.toLowerCase());
    const matched =
      exactTokens.has(stemmed) || fuzzyTokens.some((qt) => editDistance1(stemmed, qt));
    // Wrap the original word (not the stem) so casing and suffixes are kept.
    return matched ? `\x1b[1m${word}\x1b[22m` : word;
  });
}