From aeacf065519fed17fd053da6ccda1f57f189fab0 Mon Sep 17 00:00:00 2001 From: Timo Runge Date: Wed, 3 Jun 2026 23:36:14 +0200 Subject: [PATCH 1/2] build(deps): bump the all group across 1 directory with 8 updates Mirrors Dependabot PR #4 so the dependency bump and feature work share one CI run. --- Cargo.lock | 96 +++++++++++++++++++++----------------------------- Cargo.toml | 2 +- cli/Cargo.toml | 4 +-- 3 files changed, 43 insertions(+), 59 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b390c04..d9faf96 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1180,9 +1180,9 @@ dependencies = [ [[package]] name = "calamine" -version = "0.34.0" +version = "0.35.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20ae05a4e39297eecf9a994210d27501318c37a9318201f8e11050add82bb6f0" +checksum = "8822fe6253ca47aa5ad9a3be09f6fe7cd20c6a74e41b0aa42e8f4e3d523508df" dependencies = [ "atoi_simd", "byteorder", @@ -1231,9 +1231,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.61" +version = "1.2.63" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d16d90359e986641506914ba71350897565610e87ce0ad9e6f28569db3dd5c6d" +checksum = "556e016178bb5662a08681bbe0f00f8e17631781a4dfc8c45e466e4b185ec27f" dependencies = [ "find-msvc-tools", "jobserver", @@ -2549,9 +2549,9 @@ dependencies = [ [[package]] name = "html-to-markdown-rs" -version = "3.3.3" +version = "3.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9451a842dd9316c9762a1c2c1d496df9db4db29e702e22f7a3fca210636ed4f9" +checksum = "6cbbfb183e8634cb956309c6bbd781d9ddae068d376fb9eb1451ac49cf4cbba7" dependencies = [ "ahash", "astral-tl", @@ -2559,7 +2559,7 @@ dependencies = [ "html-escape", "html5ever", "image", - "lru 0.17.0", + "lru 0.18.0", "memchr", "once_cell", "regex", @@ -3263,9 +3263,9 @@ dependencies = [ [[package]] name = "kreuzberg" -version = "4.9.7" +version = "4.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9b6e7118097d06f882f62427e6d8722eef31bc20bd0b2f434a8b8785d05c30c" +checksum = "ef08d77d5b72b5c424acb86e297ea13fd9f3db9372d570dd4468011dfc8c3d53" dependencies = [ "ahash", "async-trait", @@ -3313,7 +3313,7 @@ dependencies = [ "pastey 0.2.2", "pkg-config", "pulldown-cmark", - "quick-xml 0.39.4", + "quick-xml 0.40.1", "rayon", "regex", "rmp-serde", @@ -3365,9 +3365,9 @@ dependencies = [ [[package]] name = "kreuzberg-tesseract" -version = "4.9.7" +version = "4.9.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "550d1aedb28114013aa51767a1566318a7c6409b56605851c55c2ecd433ccf1b" +checksum = "5f896204110548ad886f059e42bcc8b839dba64b95db2572ed2252acd72f22dc" dependencies = [ "cc", "cmake", @@ -3550,7 +3550,7 @@ dependencies = [ "mime_guess", "notify", "proptest", - "quick-xml 0.40.0", + "quick-xml 0.40.1", "regex", "reqwest", "rmcp", @@ -3618,9 +3618,9 @@ dependencies = [ [[package]] name = "lru" -version = "0.17.0" +version = "0.18.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0e0b564323a0fb6d54b864f625ae139de9612e27edb944dda37c109f05aac531" +checksum = "8a860605968fce16869fd239cf4237a82f3ac470723415db603b0e8b6c8d4fb9" dependencies = [ "hashbrown 0.17.0", ] @@ -4480,16 +4480,16 @@ checksum = "cdcc8dd4e2f670d309a5f0e83fe36dfdc05af317008fea29144da1a2ac858e5e" dependencies = [ "encoding_rs", "memchr", - "serde", ] [[package]] name = "quick-xml" -version = "0.40.0" +version = "0.40.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b7315c86b26aaef0321fba33c9dcc160da659c6a9d278f0f6a5656d6561c03b" +checksum = "2474bd2e5029e7ccb6abb2ba48cf2383a333851dedf495901544281590c7da7f" dependencies = [ "memchr", + "serde", ] [[package]] @@ -4798,9 +4798,9 @@ checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" [[package]] name = "reqwest" -version = "0.13.3" +version = "0.13.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62e0021ea2c22aed41653bc7e1419abb2c97e038ff2c33d0e1309e49a97deec0" +checksum = "219c5811de6525e5416c7d5d53bb656d3afdbc6c5af816e0802bcfa42dbdc1c3" dependencies = [ "base64", "bytes", @@ -4865,9 +4865,9 @@ dependencies = [ [[package]] name = "rmcp" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e12ca9067b5ebfbd5b3fcdc4acfceb81aa7d5ab2a879dff7cb75d22434276aad" +checksum = "0810a9f717d9828f475fe1f629f4c305c8464b7f496c3a854b58d29e65f4058e" dependencies = [ "async-trait", "base64", @@ -4898,9 +4898,9 @@ dependencies = [ [[package]] name = "rmcp-macros" -version = "1.6.0" +version = "1.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7caa6743cc0888e433105fe1bc551a7f607940b126a37bc97b478e86064627eb" +checksum = "6aefac48c364756e97f04c0401ba3231e8607882c7c1d92da0437dc16307904d" dependencies = [ "darling", "proc-macro2", @@ -5091,15 +5091,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "scc" -version = "2.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "46e6f046b7fef48e2660c57ed794263155d713de679057f2d0c169bfc6e756cc" -dependencies = [ - "sdd", -] - [[package]] name = "schannel" version = "0.1.29" @@ -5141,12 +5132,6 @@ version = "1.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" -[[package]] -name = "sdd" -version = "3.0.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "490dcfcbfef26be6800d11870ff2df8774fa6e86d047e3e8c8a76b25655e41ca" - [[package]] name = "security-framework" version = "3.7.0" @@ -5219,9 +5204,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.149" +version = "1.0.150" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +checksum = "e8014e44b4736ed0538adeecded0fce2a272f22dc9578a7eb6b2d9993c74cfb9" dependencies = [ "indexmap", "itoa", @@ -5294,24 +5279,23 @@ dependencies = [ [[package]] name = "serial_test" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "911bd979bf1070a3f3aa7b691a3b3e9968f339ceeec89e08c280a8a22207a32f" +checksum = "699f4197115b8a7e7ff19c9a315a4bd6fffec26cc4626ef45ecaea389e081c6d" dependencies = [ "futures-executor", "futures-util", "log", "once_cell", "parking_lot", - "scc", "serial_test_derive", ] [[package]] name = "serial_test_derive" -version = "3.4.0" +version = "3.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0a7d91949b85b0d2fb687445e448b40d322b6b3e4af6b44a29b21d9a5f33e6d9" +checksum = "94e153fc76e1c6a068703d6d29c508a0b15c061c4b7e43da59cc097bc342673c" dependencies = [ "proc-macro2", "quote", @@ -5373,9 +5357,9 @@ dependencies = [ [[package]] name = "shlex" -version = "1.3.0" +version = "2.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" +checksum = "f8fadd59c855ef2080decdef8ff161eb6661b86933c9d82e5ba29dc602a55aba" [[package]] name = "signal-hook-registry" @@ -5806,9 +5790,9 @@ checksum = "55937e1799185b12863d447f42597ed69d9928686b8d88a1df17376a097d8369" [[package]] name = "tar" -version = "0.4.45" +version = "0.4.46" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +checksum = "3f6221d9a6003c78398e3b239969f352578258df48c8eb051caadae0015bc840" dependencies = [ "filetime", "libc", @@ -5943,9 +5927,9 @@ dependencies = [ [[package]] name = "tikv-jemalloc-sys" -version = "0.6.1+5.3.0-1-ge13ca993e8ccb9ba9847cc330696e02839f328f7" +version = "0.7.1+5.3.1-0-g81034ce1f1373e37dc865038e1bc8eeecf559ce8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd8aa5b2ab86a2cefa406d889139c162cbb230092f7d1d7cbc1716405d852a3b" +checksum = "1a2825c78386b4ae0314074867860ba9577875de945f05992c38815cbec327f0" dependencies = [ "cc", "libc", @@ -5953,9 +5937,9 @@ dependencies = [ [[package]] name = "tikv-jemallocator" -version = "0.6.1" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0359b4327f954e0567e69fb191cf1436617748813819c94b8cd4a431422d053a" +checksum = "249f09e49ab1609436f34c776e84231bead18d6a955f119f939bdc1d847561bd" dependencies = [ "libc", "tikv-jemalloc-sys", @@ -6283,9 +6267,9 @@ checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" [[package]] name = "tree-sitter-language-pack" -version = "1.7.0" +version = "1.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "925d3fdf2a7f2be09d354463119556ac092832d807a9533708ffd3b18bb712f1" +checksum = "208658b27011901bb099b638e9ba25ad0a2063d5d6b7fcc6c07af0dcc94ffe10" dependencies = [ "ahash", "cc", diff --git a/Cargo.toml b/Cargo.toml index 782f31a..639269e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -108,7 +108,7 @@ flate2 = { version = "1", optional = true } zstd = { version = "0.13", optional = true } bzip2 = { version = "0.6", optional = true } xz2 = { version = "0.1", optional = true } -rmcp = { version = "1.4", features = ["client", "transport-child-process", "transport-streamable-http-client-reqwest"], optional = true } +rmcp = { version = "1.7", features = ["client", "transport-child-process", "transport-streamable-http-client-reqwest"], optional = true } mail-parser = { version = "0.11", optional = true } mime_guess = { version = "2", optional = true } tempfile = "3" diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 820c932..a50f567 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -34,7 +34,7 @@ tokio = { version = "1", features = ["full"] } indicatif = "0.18" tracing-subscriber = { version = "0.3", features = ["env-filter"] } notify = "8" -rmcp = { version = "1.4", features = ["server", "transport-io", "transport-streamable-http-server", "macros"] } +rmcp = { version = "1.7", features = ["server", "transport-io", "transport-streamable-http-server", "macros"] } axum = { version = "0.8", features = ["http1", "tokio"] } tokio-util = "0.7" serde = { version = "1", features = ["derive"] } @@ -54,7 +54,7 @@ tar = "0.4" zip = { version = "8", default-features = false, features = ["deflate"] } [target.'cfg(not(target_env = "msvc"))'.dependencies] -tikv-jemallocator = "0.6" +tikv-jemallocator = "0.7" [target.'cfg(unix)'.dependencies] libc = "0.2" From 673cf7f188c603b6b690be97de9ee7f87b6057d2 Mon Sep 17 00:00:00 2001 From: Timo Runge Date: Wed, 3 Jun 2026 23:36:25 +0200 Subject: [PATCH 2/2] feat: exec output limits, maintain health check, store provenance - exec: add `max_output_bytes` (default 10 MiB); stream stdout/stderr through bounded reads and kill the child process on timeout - cli: `lore maintain health` reports config validity, compiled features, LLM client init, and store reachability - store: stamp chunk-config fingerprint + version in lore.meta and warn at ingest when an existing store was built with different settings - store: federated multi-store search with BM25 score normalization - llm: surface missing-credential warnings via the ingest observer - test: pure chunker boundary cases (empty, short, min-chunk, indices) - chore(lints): enforce missing_errors_doc/missing_panics_doc and document the public API surface --- cli/src/cli/args.rs | 2 + cli/src/cli/ingest.rs | 5 + cli/src/cli/maintain.rs | 146 ++++++++++++++++++++++++ cli/src/cli/mod.rs | 3 +- cli/src/cli/watch.rs | 15 ++- cli/src/main.rs | 8 +- docs/configuration.md | 1 + skills/lore/SKILL.md | 1 + src/cache/mod.rs | 8 ++ src/config/mod.rs | 107 ++++++++++++++++++ src/config/processing.rs | 4 + src/config/source.rs | 19 ++++ src/fmt/format.rs | 4 + src/ingest/chunker.rs | 62 +++++++++++ src/ingest/discover/mod.rs | 1 + src/ingest/loaders/exec.rs | 160 ++++++++++++++++++++++++--- src/ingest/loaders/file.rs | 8 ++ src/ingest/mod.rs | 52 +++++++++ src/ingest/transforms.rs | 4 + src/ingest/watch.rs | 4 + src/lib.rs | 3 + src/llm.rs | 119 ++++++++++++++++++++ src/output/documents.rs | 8 ++ src/output/info.rs | 4 + src/query.rs | 24 ++++ src/store/documents.rs | 10 +- src/store/meta_key.rs | 4 + src/store/mod.rs | 8 ++ src/store/multi.rs | 220 ++++++++++++++++++++++++++++++++++++- src/store/resolve.rs | 8 ++ src/store/search.rs | 12 ++ src/store/test_helpers.rs | 2 + src/store/types.rs | 38 ++++++- src/store/writer.rs | 32 ++++++ src/util/fs.rs | 4 + src/util/marker.rs | 4 + 36 files changed, 1088 insertions(+), 26 deletions(-) diff --git a/cli/src/cli/args.rs b/cli/src/cli/args.rs index 9df1d32..e902fb3 100644 --- a/cli/src/cli/args.rs +++ b/cli/src/cli/args.rs @@ -482,4 +482,6 @@ pub enum MaintainAction { #[arg(default_value = "all", hide_default_value = true)] scope: CacheScope, }, + /// Print a single-screen health report (config, features, LLM, store) + Health, } diff --git a/cli/src/cli/ingest.rs b/cli/src/cli/ingest.rs index 9ccb4d9..cf32752 100644 --- a/cli/src/cli/ingest.rs +++ b/cli/src/cli/ingest.rs @@ -152,6 +152,11 @@ impl IngestObserver for CliIngestObserver { *status = sb; } + fn on_llm_warning(&self, msg: &str) { + let prefix = self.kb_prefix(); + self.println(format!("{prefix}[{} ] {msg}", self.paint.yellow("!"))); + } + fn on_dry_run_notice(&self) { let prefix = self.kb_prefix(); self.println(format!( diff --git a/cli/src/cli/maintain.rs b/cli/src/cli/maintain.rs index 41da9fc..31da602 100644 --- a/cli/src/cli/maintain.rs +++ b/cli/src/cli/maintain.rs @@ -419,6 +419,152 @@ fn fix_issues(store: &store::Store, issues: &[Issue]) -> Result { Ok(fixed) } +/// Print a single-screen health report for this lore setup. +pub fn health( + config_path: &Path, + store_path: &Path, + mode: OutputMode, + prefix: &LinePrefix, +) -> Result<()> { + let mp = MultiProgress::new(); + let paint = crate::terminal::stderr_painter(); + let json = mode == OutputMode::Json; + + if !json { + progress::mp_println( + &mp, + format!("{prefix}[{} ] lore health check", paint.purple(".")), + ); + } + + let mut hard_error = false; + + // 1. Config validity + match lore::config::IngestConfig::from_yaml(config_path) { + Ok(_) => { + if !json { + progress::mp_println( + &mp, + format!( + "{prefix}[{}] config: {} -- ok", + paint.green("+"), + config_path.display() + ), + ); + } + } + Err(e) => { + hard_error = true; + if !json { + progress::mp_println( + &mp, + format!( + "{prefix}[{}] config: {} -- {e:#}", + paint.red("!"), + config_path.display() + ), + ); + } + } + } + + // 2. Compiled features + if !json { + let features: Vec<&str> = [ + #[cfg(feature = "ingest")] + "ingest", + #[cfg(feature = "llm")] + "llm", + #[cfg(feature = "s3")] + "s3", + #[cfg(feature = "mcp")] + "mcp", + #[cfg(feature = "ocr")] + "ocr", + #[cfg(feature = "iwork")] + "iwork", + #[cfg(feature = "tree-sitter")] + "tree-sitter", + ] + .into_iter() + .collect(); + let feat_str = if features.is_empty() { + "none".to_owned() + } else { + features.join(", ") + }; + progress::mp_println( + &mp, + format!("{prefix}[{}] features: {feat_str}", paint.green("+")), + ); + } + + // 3. LLM credential resolution (feature-gated) + #[cfg(feature = "llm")] + { + if let Ok(cfg) = lore::config::IngestConfig::from_yaml(config_path) { + if let Some(ref llm_cfg) = cfg.llm { + match lore::llm::LlmClient::new(llm_cfg) { + Ok(_) => { + if !json { + progress::mp_println( + &mp, + format!("{prefix}[{}] llm: client initialized", paint.green("+")), + ); + } + } + Err(e) => { + if !json { + progress::mp_println( + &mp, + format!( + "{prefix}[{}] llm: could not initialize client: {e}", + paint.yellow("-"), + ), + ); + } + } + } + } else if !json { + progress::mp_println( + &mp, + format!("{prefix}[{}] llm: not configured", paint.blue("i")), + ); + } + } + } + + // 4. Store reachability + let store_path_norm = normalize_path(store_path); + if store_path_norm.is_dir() { + if !json { + progress::mp_println( + &mp, + format!( + "{prefix}[{}] store: {} -- reachable", + paint.green("+"), + store_path_norm.display() + ), + ); + } + } else if !json { + progress::mp_println( + &mp, + format!( + "{prefix}[{}] store: {} -- not found (run `lore ingest` to create)", + paint.yellow("-"), + store_path_norm.display() + ), + ); + } + + if hard_error { + anyhow::bail!("health check found configuration errors (see above)"); + } + + Ok(()) +} + #[cfg(test)] mod tests { use super::*; diff --git a/cli/src/cli/mod.rs b/cli/src/cli/mod.rs index 7fca33f..1a812bd 100644 --- a/cli/src/cli/mod.rs +++ b/cli/src/cli/mod.rs @@ -133,8 +133,7 @@ pub fn resolve_all_configs(cli_configs: Vec) -> Result, bars: SourceBars, idle_bar: Arc>, + /// Ensures the LLM credential warning prints at most once per watch session. + llm_warned: AtomicBool, } impl WatchIngestObserver { @@ -35,8 +38,10 @@ impl WatchIngestObserver { Self { bars: SourceBars::new(mp.clone(), term_width), mp, + paint: terminal::stderr_painter(), shutdown, idle_bar: Arc::new(std::sync::Mutex::new(idle)), + llm_warned: AtomicBool::new(false), } } @@ -74,6 +79,14 @@ impl IngestObserver for WatchIngestObserver { self.bars.clear(); } + fn on_llm_warning(&self, msg: &str) { + // Credentials come from the environment and don't change across cycles, + // so surface the warning only once per watch session. + if !self.llm_warned.swap(true, Ordering::Relaxed) { + progress::mp_println(&self.mp, format!("[{} ] {msg}", self.paint.yellow("!"))); + } + } + fn on_status(&self, _msg: &str) {} fn on_source_waiting(&self, _index: usize, label: &str) { diff --git a/cli/src/main.rs b/cli/src/main.rs index 7de55ff..79846fe 100644 --- a/cli/src/main.rs +++ b/cli/src/main.rs @@ -265,7 +265,7 @@ fn main() { .config .into_iter() .next() - .map(|p| lore::config::IngestConfig::from_yaml(&p)) + .map(|p| lore::config::load_config_with_hints(&p)) .transpose()?; lore_cli::cli::preview::preview(lore_cli::cli::preview::PreviewOptions { paths: &paths, @@ -354,6 +354,12 @@ fn main() { pfx, ), MaintainAction::Clean { .. } => unreachable!(), + MaintainAction::Health => lore_cli::cli::maintain::health( + &rc.config_path, + &store_path, + lore_cli::terminal::output_mode(false), + pfx, + ), } }, ) diff --git a/docs/configuration.md b/docs/configuration.md index 3ea8e07..29b6f22 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -992,6 +992,7 @@ Runs one or more shell commands via `sh -c`. Supports two output modes: | `topic` | string | -- | Topic label applied to all chunks from this source. | | `tags` | string | -- | Comma-separated tags applied to all documents from this source. | | `processing` | string or object | -- | Processing override: a preset name or an inline profile object. | +| `max_output_bytes` | int | -- | Maximum bytes to read from stdout (default: 10 MiB). Commands that exceed this limit fail with an error; raise this value or narrow the command if needed. | **JSONL mode** (default): each non-empty stdout line must be a JSON object diff --git a/skills/lore/SKILL.md b/skills/lore/SKILL.md index 218ae81..adaa527 100644 --- a/skills/lore/SKILL.md +++ b/skills/lore/SKILL.md @@ -152,6 +152,7 @@ For HTTP transport, multi-KB setups, tool reference, and resource URIs: | `lore maintain repair` | Fix detected consistency issues | | `lore maintain compact` | Merge index segments for lower memory and faster reads | | `lore maintain clean` | Clear cached downloads, git repos, or temporary files | +| `lore maintain health` | Print a single-screen health report (config, features, LLM, store) | ### Key flags diff --git a/src/cache/mod.rs b/src/cache/mod.rs index e1e9f3a..b450716 100644 --- a/src/cache/mod.rs +++ b/src/cache/mod.rs @@ -149,6 +149,10 @@ pub(crate) fn http_cache_dir() -> Result { } /// Return the logs directory under the cache root, creating it if needed. +/// +/// # Errors +/// +/// Returns an error if the cache root cannot be determined or the directory cannot be created. pub fn logs_dir() -> Result { cache_dir(&[CacheScope::LOGS_DIR]) } @@ -160,6 +164,10 @@ pub(crate) fn repo_cache_path(repo_url: &str) -> Result { } /// Clear cached data. Returns `(items_removed, bytes_freed)`. +/// +/// # Errors +/// +/// Returns an error if the cache root cannot be determined or deletion fails. pub fn clear_cache(scope: CacheScope) -> Result<(usize, u64)> { clear_cache_at(&cache_root()?, scope) } diff --git a/src/config/mod.rs b/src/config/mod.rs index ec8527b..5ee90c4 100644 --- a/src/config/mod.rs +++ b/src/config/mod.rs @@ -78,6 +78,10 @@ impl IngestConfig { /// /// Merges global user-level defaults (if present) under the project /// config before deserialization. Precedence: global < project < env vars. + /// + /// # Errors + /// + /// Returns an error if the file cannot be read, the YAML is invalid, or validation fails. pub fn from_yaml(path: &Path) -> Result { let content = std::fs::read_to_string(path) .with_context(|| format!("failed to read config: {}", path.display()))?; @@ -243,6 +247,55 @@ impl Validate for IngestConfig { } } +/// Load a config file with user-friendly error hints on failure. +/// +/// Wraps `IngestConfig::from_yaml` and appends contextual guidance when +/// common error patterns are detected. +/// +/// # Errors +/// +/// Returns an error if the config file cannot be read, parsed, or fails validation. +pub fn load_config_with_hints(path: &Path) -> Result { + IngestConfig::from_yaml(path).map_err(|e| annotate_config_error(e, path)) +} + +fn annotate_config_error(err: anyhow::Error, path: &Path) -> anyhow::Error { + let msg = format!("{err:#}"); + + let hint = if msg.contains("No such file") || msg.contains("not found") { + Some(format!( + "hint: config file not found at {}; run `lore init` to create one", + path.display() + )) + } else if msg.contains("unknown field") { + let field = extract_unknown_field(&msg).unwrap_or("(see above)"); + Some(format!( + "hint: unknown field `{field}` -- check spelling or see docs/configuration.md" + )) + } else if msg.contains("failed to parse config") { + Some("hint: YAML syntax error -- check indentation and quoting".to_owned()) + } else if msg.contains("requires the '") { + Some( + "hint: this source type requires a compile-time feature; \ + reinstall with: cargo install lore --features " + .to_owned(), + ) + } else { + None + }; + + match hint { + Some(h) => err.context(h), + None => err, + } +} + +fn extract_unknown_field(msg: &str) -> Option<&str> { + let start = msg.find("unknown field `")? + "unknown field `".len(); + let end = msg[start..].find('`')? + start; + Some(&msg[start..end]) +} + /// Read an environment variable and parse it into `T`; returns `None` if unset, errors if malformed. fn parse_env(name: &str) -> Result> { let Ok(raw) = std::env::var(name) else { @@ -670,4 +723,58 @@ processing: "LORE_MAX_CHUNK_CHARS should override processing.max_chunk_chars" ); } + + #[test] + fn annotate_config_error_file_not_found() { + let err = anyhow::anyhow!("failed to read config: No such file or directory"); + let annotated = annotate_config_error(err, Path::new("/tmp/missing.yaml")); + let msg = format!("{annotated:#}"); + assert!(msg.contains("hint:"), "should contain hint: {msg}"); + assert!(msg.contains("lore init"), "should suggest lore init: {msg}"); + } + + #[test] + fn annotate_config_error_unknown_field() { + let err = anyhow::anyhow!("failed to parse config: unknown field `typo_field`"); + let annotated = annotate_config_error(err, Path::new("/tmp/lore.yaml")); + let msg = format!("{annotated:#}"); + assert!( + msg.contains("check spelling"), + "should suggest check spelling: {msg}" + ); + assert!( + msg.contains("typo_field"), + "should mention the field name: {msg}" + ); + } + + #[test] + fn annotate_config_error_parse_error() { + let err = anyhow::anyhow!("failed to parse config: expected mapping"); + let annotated = annotate_config_error(err, Path::new("/tmp/lore.yaml")); + let msg = format!("{annotated:#}"); + assert!( + msg.contains("indentation"), + "should mention indentation: {msg}" + ); + } + + #[test] + fn annotate_config_error_passthrough() { + let err = anyhow::anyhow!("something totally unexpected"); + let annotated = annotate_config_error(err, Path::new("/tmp/lore.yaml")); + let msg = format!("{annotated:#}"); + assert!(!msg.contains("hint:"), "should not add hint: {msg}"); + } + + #[test] + fn extract_unknown_field_parses_serde_format() { + let msg = "unknown field `bad_field`, expected one of `name`, `sources`"; + assert_eq!(extract_unknown_field(msg), Some("bad_field")); + } + + #[test] + fn extract_unknown_field_returns_none_for_unrecognized() { + assert_eq!(extract_unknown_field("no backtick pattern here"), None); + } } diff --git a/src/config/processing.rs b/src/config/processing.rs index 846eaf9..55ce444 100644 --- a/src/config/processing.rs +++ b/src/config/processing.rs @@ -105,6 +105,10 @@ impl ProcessingConfig { } /// Resolve a per-source `ProcessingRef` into a concrete profile. + /// + /// # Errors + /// + /// Returns an error if a named preset is referenced but not defined in `self.presets`. pub fn resolve(&self, r: Option<&ProcessingRef>) -> Result { match r { None => Ok(self.default_profile()), diff --git a/src/config/source.rs b/src/config/source.rs index ff89126..e126f0c 100644 --- a/src/config/source.rs +++ b/src/config/source.rs @@ -296,6 +296,14 @@ impl ExecOutputMode { } } +const fn default_max_output_bytes() -> usize { + 10 * 1024 * 1024 +} +#[allow(clippy::trivially_copy_pass_by_ref)] +fn is_default_max_output_bytes(v: &usize) -> bool { + *v == default_max_output_bytes() +} + /// Command(s) to execute. stdout is parsed as JSONL (default) or consumed as raw content. #[derive(Debug, Clone, Deserialize, Serialize, schemars::JsonSchema)] #[serde(deny_unknown_fields)] @@ -334,6 +342,13 @@ pub struct ExecSource { /// Processing override: a preset name or an inline profile object. #[serde(skip_serializing_if = "Option::is_none")] pub processing: Option, + /// Maximum bytes to read from stdout (default: 10 MiB). Commands that exceed this limit fail + /// with an error; raise this value or narrow the command if needed. + #[serde( + default = "default_max_output_bytes", + skip_serializing_if = "is_default_max_output_bytes" + )] + pub max_output_bytes: usize, } /// MCP transport protocol. @@ -767,6 +782,10 @@ impl SourceConfig { } /// Check structural invariants and URL schemes for this source. + /// + /// # Errors + /// + /// Returns an error if any required field is empty, invalid, or uses an unsupported URL scheme. pub fn validate(&self) -> Result<()> { match self { Self::Local(s) => { diff --git a/src/fmt/format.rs b/src/fmt/format.rs index 26ed552..96c5c4f 100644 --- a/src/fmt/format.rs +++ b/src/fmt/format.rs @@ -59,6 +59,10 @@ pub fn format_elapsed(d: Duration) -> String { } /// Serialize a value as pretty-printed JSON. +/// +/// # Errors +/// +/// Returns an error if the value cannot be serialized to JSON. pub fn to_json_pretty(value: &T) -> Result { serde_json::to_string_pretty(value).context("JSON serialization failed") } diff --git a/src/ingest/chunker.rs b/src/ingest/chunker.rs index e73ced2..90d3cf5 100644 --- a/src/ingest/chunker.rs +++ b/src/ingest/chunker.rs @@ -587,6 +587,68 @@ mod tests { ); } + #[test] + fn chunker_boundary_empty_document() { + let doc = LoaderResult::test_doc(""); + let chunks = chunk_markdown(&doc, "test", &ProcessingProfile::default()); + assert!(chunks.is_empty(), "empty doc must produce zero chunks"); + } + + #[test] + fn chunker_boundary_doc_shorter_than_chunk() { + let doc = LoaderResult::test_doc("Short content."); + let cfg = ProcessingProfile { + min_chunk_chars: 0, + max_chunk_chars: 1600, + ..Default::default() + }; + let chunks = chunk_markdown(&doc, "test", &cfg); + assert_eq!( + chunks.len(), + 1, + "short doc should produce exactly one chunk" + ); + assert!(chunks[0].body.contains("Short content")); + assert_eq!(chunks[0].chunk_index, 0); + } + + #[test] + fn chunker_boundary_min_chunk_filters_tiny_pieces() { + let doc = LoaderResult::test_doc("ab\n\ncd\n\nef"); + let cfg = ProcessingProfile { + min_chunk_chars: 10, + max_chunk_chars: 1600, + ..Default::default() + }; + let chunks = chunk_markdown(&doc, "test", &cfg); + for c in &chunks { + assert!( + c.body.len() >= cfg.min_chunk_chars || chunks.len() == 1, + "chunk body shorter than min_chunk_chars={}: {:?}", + cfg.min_chunk_chars, + c.body + ); + } + } + + #[test] + fn chunker_boundary_chunk_indices_are_contiguous() { + let content = "# A\n\nSome text here.\n\n# B\n\nMore text here.\n\n# C\n\nEven more text."; + let doc = LoaderResult::test_doc(content); + let cfg = ProcessingProfile { + min_chunk_chars: 0, + max_chunk_chars: 200, + ..Default::default() + }; + let chunks = chunk_markdown(&doc, "test", &cfg); + for (i, chunk) in chunks.iter().enumerate() { + assert_eq!( + chunk.chunk_index, i as i64, + "chunk_index must equal position in output vec" + ); + } + } + #[test] fn chunking_presplit_no_duplicate_heading() { let repeated = "x ".repeat(30); diff --git a/src/ingest/discover/mod.rs b/src/ingest/discover/mod.rs index bd5303b..2304145 100644 --- a/src/ingest/discover/mod.rs +++ b/src/ingest/discover/mod.rs @@ -668,6 +668,7 @@ async fn discover_exec( s.dir.as_deref(), &s.env, timeout, + s.max_output_bytes, ctx.topic.as_deref(), s.output, s.source_key.as_deref(), diff --git a/src/ingest/loaders/exec.rs b/src/ingest/loaders/exec.rs index 5324af1..5ed91ff 100644 --- a/src/ingest/loaders/exec.rs +++ b/src/ingest/loaders/exec.rs @@ -17,6 +17,7 @@ pub(crate) async fn run_exec( dir: Option<&str>, env: &HashMap, timeout_secs: u64, + max_output_bytes: usize, topic: Option<&str>, output: ExecOutputMode, source_key: Option<&str>, @@ -37,7 +38,8 @@ pub(crate) async fn run_exec( let mut all_docs = Vec::new(); let mut failures = Vec::new(); for cmd in cmds { - let stdout = run_single_cmd(cmd, &effective_dir, env, timeout_secs).await?; + let stdout = + run_single_cmd(cmd, &effective_dir, env, timeout_secs, max_output_bytes).await?; match output { ExecOutputMode::Jsonl => { for line in stdout.lines() { @@ -92,40 +94,81 @@ async fn run_single_cmd( dir: &Path, env: &HashMap, timeout_secs: u64, + max_output_bytes: usize, ) -> Result { - let mut child = tokio::process::Command::new("sh"); - child + const STDERR_CAP: usize = 16 * 1024; + + let mut command = tokio::process::Command::new("sh"); + command .args(["-c", cmd]) .current_dir(dir) + .kill_on_drop(true) .stdin(Stdio::null()) .stdout(Stdio::piped()) .stderr(Stdio::piped()); for (k, v) in env { - child.env(k, v); + command.env(k, v); } - let output = tokio::time::timeout(Duration::from_secs(timeout_secs), child.output()) + let mut child = command.spawn().map_err(|e| { + if e.kind() == std::io::ErrorKind::NotFound { + anyhow::anyhow!("sh is not available; exec sources require a POSIX shell on PATH") + } else { + anyhow::Error::from(e).context("failed to spawn exec command") + } + })?; + + let stdout = child.stdout.take().expect("stdout piped"); + let stderr = child.stderr.take().expect("stderr piped"); + + let (stdout_bytes, stderr_bytes, status) = + tokio::time::timeout(Duration::from_secs(timeout_secs), async { + use tokio::io::AsyncReadExt; + let (stdout_res, stderr_res) = tokio::join!( + async { + let mut buf = Vec::new(); + stdout + .take((max_output_bytes as u64).saturating_add(1)) + .read_to_end(&mut buf) + .await + .map(|_| buf) + }, + async { + let mut buf = Vec::new(); + stderr + .take((STDERR_CAP as u64).saturating_add(1)) + .read_to_end(&mut buf) + .await + .map(|_| buf) + }, + ); + let s_bytes = stdout_res?; + let e_bytes = stderr_res?; + let st = child.wait().await?; + Ok::<_, std::io::Error>((s_bytes, e_bytes, st)) + }) .await .context("exec command timed out")? - .map_err(|e| { - if e.kind() == std::io::ErrorKind::NotFound { - anyhow::anyhow!("sh is not available; exec sources require a POSIX shell on PATH") - } else { - anyhow::Error::from(e).context("failed to spawn exec command") - } - })?; + .map_err(|e| anyhow::Error::from(e).context("failed to read exec command output"))?; - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); + if stdout_bytes.len() > max_output_bytes { + anyhow::bail!( + "exec command exceeded max_output_bytes ({max_output_bytes}); \ + raise exec.max_output_bytes or narrow the command" + ); + } + + if !status.success() { + let stderr = String::from_utf8_lossy(&stderr_bytes); anyhow::bail!( "exec command {:?} exited with {}: {}", cmd, - output.status, + status, stderr.trim() ); } - Ok(String::from_utf8_lossy(&output.stdout).into_owned()) + Ok(String::from_utf8_lossy(&stdout_bytes).into_owned()) } fn parse_jsonl_line( @@ -296,6 +339,7 @@ mod tests { ); } + #[cfg(unix)] #[tokio::test] async fn raw_mode_cmd_is_source() { let (docs, failures) = run_exec( @@ -303,6 +347,7 @@ mod tests { None, &HashMap::new(), 10, + 10 * 1024 * 1024, None, ExecOutputMode::Raw, None, @@ -318,6 +363,7 @@ mod tests { assert_eq!(docs[0].content, "hello"); } + #[cfg(unix)] #[tokio::test] async fn raw_mode_explicit_source_key() { let (docs, _) = run_exec( @@ -325,6 +371,7 @@ mod tests { None, &HashMap::new(), 10, + 10 * 1024 * 1024, None, ExecOutputMode::Raw, Some("my-doc"), @@ -337,6 +384,7 @@ mod tests { assert_eq!(docs[0].source, "my-doc"); } + #[cfg(unix)] #[tokio::test] async fn raw_mode_empty_stdout_yields_failure() { let (docs, failures) = run_exec( @@ -344,6 +392,7 @@ mod tests { None, &HashMap::new(), 10, + 10 * 1024 * 1024, None, ExecOutputMode::Raw, None, @@ -357,6 +406,7 @@ mod tests { assert_eq!(failures.len(), 1); } + #[cfg(unix)] #[tokio::test] async fn raw_mode_topic_and_format() { let (docs, _) = run_exec( @@ -364,6 +414,7 @@ mod tests { None, &HashMap::new(), 10, + 10 * 1024 * 1024, Some("Docs"), ExecOutputMode::Raw, None, @@ -376,4 +427,81 @@ mod tests { assert_eq!(docs[0].topic.as_deref(), Some("Docs")); assert_eq!(docs[0].format.as_deref(), Some("md")); } + + #[cfg(unix)] + #[tokio::test] + async fn exec_output_within_cap_ok() { + let (docs, failures) = run_exec( + &["echo hello".to_owned()], + None, + &HashMap::new(), + 10, + 10 * 1024 * 1024, + None, + ExecOutputMode::Raw, + None, + None, + Path::new("/tmp"), + &ProgressHandle::noop(), + ) + .await + .unwrap(); + assert_eq!(docs.len(), 1); + assert!(failures.is_empty()); + assert_eq!(docs[0].content, "hello"); + } + + #[cfg(unix)] + #[tokio::test] + async fn exec_output_exceeding_cap_errors() { + // Emit 10 bytes via a POSIX-portable command (no bash brace expansion, + // which dash -- Ubuntu's /bin/sh -- does not support). + let result = run_exec( + &["printf 1234567890".to_owned()], + None, + &HashMap::new(), + 10, + 5, // 5-byte cap + None, + ExecOutputMode::Raw, + None, + None, + Path::new("/tmp"), + &ProgressHandle::noop(), + ) + .await; + let err = result.unwrap_err(); + assert!( + err.to_string().contains("max_output_bytes"), + "error should mention max_output_bytes: {err}" + ); + } + + #[cfg(unix)] + #[tokio::test] + async fn exec_respects_configured_cap() { + // 'echo hi' outputs "hi\n" (3 bytes), which exceeds a 2-byte cap. + let result = run_exec( + &["echo hi".to_owned()], + None, + &HashMap::new(), + 10, + 2, // 2-byte cap + None, + ExecOutputMode::Raw, + None, + None, + Path::new("/tmp"), + &ProgressHandle::noop(), + ) + .await; + assert!( + result.is_err(), + "should fail when output exceeds configured cap" + ); + assert!( + result.unwrap_err().to_string().contains("max_output_bytes"), + "error should mention max_output_bytes" + ); + } } diff --git a/src/ingest/loaders/file.rs b/src/ingest/loaders/file.rs index 47080f6..42e69b8 100644 --- a/src/ingest/loaders/file.rs +++ b/src/ingest/loaders/file.rs @@ -217,6 +217,10 @@ fn detect_format(path: &Path, mime: Option<&str>) -> Option { } /// Walk `base` for files matching `pattern` (gitignore-aware), capped at `max` entries. +/// +/// # Errors +/// +/// Returns an error if the glob pattern is invalid or the blocking walk task panics. pub async fn list_files(base: &Path, pattern: &str, max: Option) -> Result> { let base = base.to_path_buf(); let pattern = pattern.to_owned(); @@ -275,6 +279,10 @@ fn list_files_sync(base: &Path, pattern: &str, max: Option) -> Result, diff --git a/src/ingest/mod.rs b/src/ingest/mod.rs index e12e950..1dbd41b 100644 --- a/src/ingest/mod.rs +++ b/src/ingest/mod.rs @@ -54,6 +54,9 @@ pub trait IngestObserver: Send + Sync { /// Ingest run starting. Called once before any sources are processed. fn on_start(&self, _kb_name: &str, _n_sources: usize, _mode: IngestMode, _dry_run: bool) {} + /// LLM configuration warning (missing credentials, likely to fail). + fn on_llm_warning(&self, _msg: &str) {} + /// Dry-run notice. fn on_dry_run_notice(&self) {} @@ -186,6 +189,10 @@ pub struct IngestResult { /// The `observer` controls all user-facing output (progress bars, status /// messages, signal handling). Use [`QuietIngestObserver`] for silent /// operation. +/// +/// # Errors +/// +/// Returns an error if the store cannot be opened or a fatal I/O error occurs during indexing. pub async fn ingest( config: IngestConfig, config_path: PathBuf, @@ -221,6 +228,31 @@ pub async fn ingest( ) .context("failed to open store")?; + // Warn if the existing store was built with different chunking parameters + // or a different lore version. + if store_exists && mode != IngestMode::Recreate { + let current_fingerprint = chunk_config_fingerprint(&config); + if let Some(stored) = store.get_metadata(meta_key::CHUNK_CONFIG) + && stored != current_fingerprint + { + tracing::warn!( + stored_chunk_config = stored, + current_chunk_config = current_fingerprint, + "store was built with different chunking parameters; \ + run `lore ingest --recreate` to rebuild with current settings" + ); + } + if let Some(stored_ver) = store.get_metadata(meta_key::LORE_VERSION) + && stored_ver != crate::VERSION + { + tracing::warn!( + stored_version = stored_ver, + current_version = crate::VERSION, + "store was built with a different lore version" + ); + } + } + // Acquire an exclusive lock to prevent concurrent ingests against the // same store. The lock is acquired after Store::open (which creates the // directory) and after any --recreate destroy, so the lock file lives @@ -350,6 +382,13 @@ pub async fn ingest( }; let store = &*ctx.store; + #[cfg(feature = "llm")] + if let Some(ref client) = ctx.llm_client { + for warning in client.validate() { + observer.on_llm_warning(&warning); + } + } + let mut total_docs = 0u64; let mut total_chunks = 0u64; let mut all_failed_docs: Vec = Vec::new(); @@ -813,6 +852,18 @@ fn cleanup_stale_documents( Ok(()) } +/// Stable fingerprint of the chunking-relevant processing defaults. +/// +/// Captures `max_chunk_chars` and `min_chunk_chars` from the global processing +/// config. When this string changes between ingest runs a warning is emitted so +/// users know their stored chunks may not match the current settings. +fn chunk_config_fingerprint(config: &IngestConfig) -> String { + format!( + "max_chunk_chars={},min_chunk_chars={}", + config.processing.max_chunk_chars, config.processing.min_chunk_chars + ) +} + /// Write run metadata to the store, commit, and optimize if needed. fn write_store_metadata( store: &Store, @@ -841,6 +892,7 @@ fn write_store_metadata( &config.store.writer_heap_mb.to_string(), ); store.set_metadata(meta_key::LANGUAGE, config.store.language.as_str()); + store.set_metadata(meta_key::CHUNK_CONFIG, &chunk_config_fingerprint(config)); if let Some(name) = &config.name { store.set_metadata(meta_key::NAME, name); diff --git a/src/ingest/transforms.rs b/src/ingest/transforms.rs index 63448f6..dc4badb 100644 --- a/src/ingest/transforms.rs +++ b/src/ingest/transforms.rs @@ -38,6 +38,10 @@ pub struct CompiledProfile { impl CompiledProfile { /// Compile a `ProcessingProfile` into an executable form. + /// + /// # Errors + /// + /// Returns an error if any regex pattern in the pipeline is invalid. pub fn compile(profile: &ProcessingProfile) -> Result { let mut pipeline = Vec::with_capacity(profile.pipeline.len()); for (i, step) in profile.pipeline.iter().enumerate() { diff --git a/src/ingest/watch.rs b/src/ingest/watch.rs index 9e37780..6e4e53f 100644 --- a/src/ingest/watch.rs +++ b/src/ingest/watch.rs @@ -34,6 +34,10 @@ pub trait WatchObserver: Send + Sync { /// Watch local source paths and re-ingest on changes, with optional periodic /// polling. +/// +/// # Errors +/// +/// Returns an error if the filesystem watcher cannot be initialised or encounters a fatal I/O failure. pub async fn watch( config: IngestConfig, config_path: PathBuf, diff --git a/src/lib.rs b/src/lib.rs index 8d7aaf7..d7f907d 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,3 +1,6 @@ +// Documentation lints are enforced on the public library surface only (the +// workspace table leaves them allow-by-default so binaries aren't burdened). +#![warn(clippy::missing_errors_doc, clippy::missing_panics_doc)] //! Knowledge base library: ingest, index, search, and serve documents over MCP. /// Infallible `write!` to a `String` -- panics on `Err` (which `String` never returns). diff --git a/src/llm.rs b/src/llm.rs index f2a4408..31731f6 100644 --- a/src/llm.rs +++ b/src/llm.rs @@ -73,6 +73,10 @@ pub struct DocumentEnrichment { impl LlmClient { /// Create a new client from the given LLM config, building the shared HTTP client. + /// + /// # Errors + /// + /// Returns an error if the HTTP client cannot be built (e.g. invalid TLS configuration). pub fn new(config: &LlmConfig) -> Result { let http = reqwest::Client::builder() .timeout(std::time::Duration::from_secs(config.timeout_secs)) @@ -86,6 +90,60 @@ impl LlmClient { }) } + /// Check that required credentials are available for the configured provider. + /// + /// Returns warnings for obvious misconfigurations (missing API keys). + /// Does not perform network calls. + pub fn validate(&self) -> Vec { + Self::validate_credentials( + self.config.provider, + std::env::var("ANTHROPIC_API_KEY").is_ok(), + std::env::var("OPENAI_API_KEY").is_ok(), + ) + } + + /// Pure credential-check logic, separated from environment access so it can + /// be unit-tested without mutating process-global environment variables. + fn validate_credentials( + provider: Option, + has_anthropic: bool, + has_openai: bool, + ) -> Vec { + let mut warnings = Vec::new(); + match provider { + Some(LlmProvider::Anthropic) => { + if !has_anthropic { + warnings.push( + "ANTHROPIC_API_KEY is not set -- LLM enrichment will fail; \ + set the variable or remove the llm config block" + .to_owned(), + ); + } + } + Some(LlmProvider::Openai) => { + if !has_openai { + warnings.push( + "OPENAI_API_KEY is not set -- LLM enrichment will fail; \ + set the variable or remove the llm config block" + .to_owned(), + ); + } + } + Some(LlmProvider::Ollama | LlmProvider::Bedrock) => {} + None => { + if !has_anthropic && !has_openai { + warnings.push( + "no LLM API key found (checked ANTHROPIC_API_KEY, OPENAI_API_KEY) \ + -- enrichment will fall back to Ollama; set a key or configure \ + llm.provider explicitly" + .to_owned(), + ); + } + } + } + warnings + } + /// Detect a topic name for a document using LLM. pub async fn detect_topic(&self, content: &str) -> Option { self.call_simple( @@ -541,6 +599,10 @@ pub async fn enrich_document( } /// Enrich all chunks in a document batch via concurrent LLM calls, filtering by quality threshold. +/// +/// # Panics +/// +/// Panics if the enrichment semaphore is closed before all tasks complete (should never occur during normal operation). pub async fn enrich_chunks( chunks: &mut Vec, llm_client: &LlmClient, @@ -597,3 +659,60 @@ pub async fn enrich_chunks( }) .collect(); } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn validate_anthropic_missing_key() { + let warnings = LlmClient::validate_credentials(Some(LlmProvider::Anthropic), false, false); + assert_eq!(warnings.len(), 1); + assert!(warnings[0].contains("ANTHROPIC_API_KEY")); + } + + #[test] + fn validate_anthropic_key_present() { + let warnings = LlmClient::validate_credentials(Some(LlmProvider::Anthropic), true, false); + assert!(warnings.is_empty()); + } + + #[test] + fn validate_openai_missing_key() { + let warnings = LlmClient::validate_credentials(Some(LlmProvider::Openai), false, false); + assert_eq!(warnings.len(), 1); + assert!(warnings[0].contains("OPENAI_API_KEY")); + } + + #[test] + fn validate_openai_key_present() { + let warnings = LlmClient::validate_credentials(Some(LlmProvider::Openai), false, true); + assert!(warnings.is_empty()); + } + + #[test] + fn validate_ollama_no_warning() { + assert!( + LlmClient::validate_credentials(Some(LlmProvider::Ollama), false, false).is_empty() + ); + } + + #[test] + fn validate_bedrock_no_warning() { + assert!( + LlmClient::validate_credentials(Some(LlmProvider::Bedrock), false, false).is_empty() + ); + } + + #[test] + fn validate_default_provider_warns_without_keys() { + let warnings = LlmClient::validate_credentials(None, false, false); + assert_eq!(warnings.len(), 1); + assert!(warnings[0].contains("no LLM API key")); + } + + #[test] + fn validate_default_provider_silent_with_key() { + assert!(LlmClient::validate_credentials(None, true, false).is_empty()); + } +} diff --git a/src/output/documents.rs b/src/output/documents.rs index 6e5d628..708cef9 100644 --- a/src/output/documents.rs +++ b/src/output/documents.rs @@ -13,6 +13,10 @@ use crate::store::DocDetail; use crate::types::{Chunk, DocMeta}; /// Format a paginated list of document metadata. +/// +/// # Errors +/// +/// Returns an error if JSON serialization fails (only possible in `Json` output mode). pub fn format_document_list( entries: &[DocMeta], total: usize, @@ -157,6 +161,10 @@ fn format_document_list_cli( } /// Format a single document with its chunk content. +/// +/// # Errors +/// +/// Returns an error if JSON serialization fails (only possible in `Json` output mode). pub fn format_document( doc: &DocDetail, offset: usize, diff --git a/src/output/info.rs b/src/output/info.rs index 4b3ff4b..5711519 100644 --- a/src/output/info.rs +++ b/src/output/info.rs @@ -14,6 +14,10 @@ use crate::store::{StoreInfo, TopicStat}; const TOPIC_DISPLAY_LIMIT: usize = 10; /// Format store statistics as CLI text, JSON, or MCP text. +/// +/// # Errors +/// +/// Returns an error if JSON serialization fails (only possible in `Json` output mode). pub fn format_store_info( info: &StoreInfo, store_entries: &[(PathBuf, u64)], diff --git a/src/query.rs b/src/query.rs index 1486bb3..cf2f541 100644 --- a/src/query.rs +++ b/src/query.rs @@ -196,6 +196,10 @@ pub fn resolve_filters( } /// Build a `SearchQuery` from `SearchArgs` and run it against the store set. +/// +/// # Errors +/// +/// Returns an error if the query is empty or the Tantivy search fails. pub fn execute_search( stores: &StoreSet, args: SearchArgs, @@ -223,6 +227,10 @@ pub fn execute_search( } /// Full-text search across indexed chunks with optional filters. +/// +/// # Errors +/// +/// Returns an error if the search query fails or output formatting fails. pub fn search(stores: &StoreSet, args: SearchArgs, mode: OutputMode) -> Result { let offset = args.pagination.offset; let (results, sq, total) = execute_search(stores, args)?; @@ -231,6 +239,10 @@ pub fn search(stores: &StoreSet, args: SearchArgs, mode: OutputMode) -> Result Res } /// Retrieve details for a single topic by name. +/// +/// # Errors +/// +/// Returns an error if the store query fails, the topic is not found, or output formatting fails. // Same by-value rationale as `list_topics`: fields are consumed by the query. #[allow(clippy::needless_pass_by_value)] pub fn get_topic(stores: &StoreSet, args: TopicArgs, mode: OutputMode) -> Result { @@ -276,6 +292,10 @@ pub fn get_topic(stores: &StoreSet, args: TopicArgs, mode: OutputMode) -> Result } /// List indexed documents with optional filtering and pagination. +/// +/// # Errors +/// +/// Returns an error if output formatting fails. pub fn list_documents(stores: &StoreSet, args: DocsArgs, mode: OutputMode) -> Result { let limit = args.pagination.limit; let offset = args.pagination.offset; @@ -298,6 +318,10 @@ pub fn list_documents(stores: &StoreSet, args: DocsArgs, mode: OutputMode) -> Re } /// Retrieve a single document's metadata and chunks by source path. +/// +/// # Errors +/// +/// Returns an error if the store query fails, the document is not found, or output formatting fails. #[allow(clippy::needless_pass_by_value)] pub fn get_document(stores: &StoreSet, args: ReadArgs, mode: OutputMode) -> Result { let (limit, offset) = if args.full { diff --git a/src/store/documents.rs b/src/store/documents.rs index 02ee3b1..a7e0e46 100644 --- a/src/store/documents.rs +++ b/src/store/documents.rs @@ -75,13 +75,17 @@ impl Store { self.dirty.store(true, Ordering::Release); } - #[cfg(all(test, feature = "ingest"))] + #[cfg(feature = "ingest")] pub(crate) fn get_metadata(&self, key: &str) -> Option { let meta = self.read_meta(); meta.get(key).cloned() } /// Convenience wrapper: delete existing chunks, insert new ones, upsert document meta. + /// + /// # Errors + /// + /// Returns an error if deleting old chunks or inserting new chunks into the Tantivy index fails. pub fn replace_document(&self, source_id: &str, chunks: &[Chunk], meta: DocMeta) -> Result<()> { self.delete_chunks_by_source(source_id)?; if !chunks.is_empty() { @@ -92,6 +96,10 @@ impl Store { } /// Fetch metadata and all chunks for a single document by source_id. + /// + /// # Errors + /// + /// Returns an error if the Tantivy index query fails. pub fn get_document(&self, source_id: &str) -> Result> { let Some(meta) = self.read_docs().get(source_id).cloned() else { return Ok(None); diff --git a/src/store/meta_key.rs b/src/store/meta_key.rs index aa99cd6..8ffe877 100644 --- a/src/store/meta_key.rs +++ b/src/store/meta_key.rs @@ -18,3 +18,7 @@ pub(crate) const PHRASE_SEARCH: &str = "phrase_search"; pub(crate) const WRITER_HEAP_MB: &str = "writer_heap_mb"; /// Stemming/stop-word language used to build the tokenizer pipeline. pub(crate) const LANGUAGE: &str = "language"; +/// Fingerprint of the chunker-relevant processing config (max/min chunk chars). +/// Used to warn when a store was built with different chunking parameters. +#[cfg(feature = "ingest")] +pub(crate) const CHUNK_CONFIG: &str = "chunk_config"; diff --git a/src/store/mod.rs b/src/store/mod.rs index 6533ec9..4ce752c 100644 --- a/src/store/mod.rs +++ b/src/store/mod.rs @@ -68,6 +68,10 @@ pub struct Store { impl Store { /// Open a store for reading with default settings (phrase search on, default heap). /// The language is read from lore.meta if present, falling back to English. + /// + /// # Errors + /// + /// Returns an error if the Tantivy index at `path` cannot be opened. pub fn open_readonly(path: &Path) -> Result { use crate::types::IndexLanguage; let preloaded = load_meta_map(path); @@ -80,6 +84,10 @@ impl Store { } /// Open or create a store at `path`. + /// + /// # Errors + /// + /// Returns an error if the Tantivy index cannot be opened or created at `path`. pub fn open( path: &Path, phrase_search: bool, diff --git a/src/store/multi.rs b/src/store/multi.rs index 771ef2b..f9a58dd 100644 --- a/src/store/multi.rs +++ b/src/store/multi.rs @@ -6,8 +6,8 @@ use anyhow::Result; use crate::store::documents::resolve_name_from_terms; use crate::store::types::{avg_words_per_chunk, sort_chunks_natural}; use crate::store::{ - DocDetail, DocFilter, SearchHit, SearchQuery, SourceResolveResult, Store, StoreInfo, - TopicFilter, TopicResult, TopicSort, TopicStat, + DocDetail, DocFilter, SearchHit, SearchQuery, SearchSort, SourceResolveResult, Store, + StoreInfo, TopicFilter, TopicResult, TopicSort, TopicStat, }; use crate::types::{Chunk, DocKind, DocMeta, SourceType}; use crate::util; @@ -22,6 +22,10 @@ pub struct StoreSet { impl StoreSet { /// Wrap multiple stores into a federated set. + /// + /// # Errors + /// + /// Returns an error if `stores` is empty. pub fn new(stores: Vec) -> Result { anyhow::ensure!(!stores.is_empty(), "StoreSet requires at least one store"); Ok(Self { @@ -30,6 +34,10 @@ impl StoreSet { } /// Wrap a single store. + /// + /// # Panics + /// + /// Never panics; the internal `new` call with a single-element vec always succeeds. pub fn single(store: Store) -> Self { Self::new(vec![store]).expect("single-element vec is always non-empty") } @@ -45,6 +53,10 @@ impl StoreSet { } /// Full-text search across all stores, merged by score and re-paginated. + /// + /// # Errors + /// + /// Returns an error if any store's Tantivy search fails. pub fn search( &self, sq: &SearchQuery, @@ -63,11 +75,18 @@ impl StoreSet { let mut all_hits: Vec = Vec::new(); let mut total: usize = 0; + let mut batch_ranges: Vec<(usize, usize)> = Vec::new(); for store in &self.stores { + let start = all_hits.len(); let (hits, store_total) = store.search(&overfetch)?; total = total.saturating_add(store_total); all_hits.extend(hits); + batch_ranges.push((start, all_hits.len())); + } + + if matches!(sq.sort, SearchSort::Score) { + normalize_scores(&mut all_hits, &batch_ranges); } sq.sort.apply(&mut all_hits, sq.reverse); @@ -136,6 +155,10 @@ impl StoreSet { } /// Retrieve topic details, merging chunks from all stores that have the topic. + /// + /// # Errors + /// + /// Returns an error if any store's topic query fails. pub fn get_topic(&self, name: &str) -> Result> { if self.stores.len() == 1 { return self.stores[0].get_topic(name); @@ -213,6 +236,10 @@ impl StoreSet { } /// Retrieve a document's metadata and chunks, resolving the source key first. + /// + /// # Errors + /// + /// Returns an error if any store's document query fails. pub fn get_document(&self, source_id: &str) -> Result> { if self.stores.len() == 1 { return self.stores[0].get_document(source_id); @@ -279,6 +306,32 @@ fn merge_counts(target: &mut HashMap, source: H } } +/// Min-max normalize BM25 scores within each store batch to [0.0, 1.0]. +/// +/// Makes scores from different-sized corpora comparable before merging. +fn normalize_scores(hits: &mut [SearchHit], batch_ranges: &[(usize, usize)]) { + for &(start, end) in batch_ranges { + let batch = &hits[start..end]; + if batch.is_empty() { + continue; + } + let min = batch + .iter() + .fold(f64::INFINITY, |acc, h| acc.min(h.score.unwrap_or(0.0))); + let max = batch + .iter() + .fold(f64::NEG_INFINITY, |acc, h| acc.max(h.score.unwrap_or(0.0))); + let range = max - min; + for h in &mut hits[start..end] { + h.score = Some(if range == 0.0 { + 1.0 + } else { + (h.score.unwrap_or(0.0) - min) / range + }); + } + } +} + /// Accumulate a `TopicStat` into a topic map, merging counts for duplicate names. fn merge_topic_stat(map: &mut HashMap, stat: TopicStat) { let entry = map.entry(stat.name.clone()).or_insert_with(|| TopicStat { @@ -482,4 +535,167 @@ mod tests { drop((d1, d2)); } + + #[test] + fn normalize_scores_multi_store() { + let (set, _d1, _d2) = two_store_set("t1", "t2"); + let sq = SearchQuery::new("content"); + let (hits, _) = set.search(&sq).unwrap(); + for hit in &hits { + let score = hit.score.expect("score should be present"); + assert!( + (0.0..=1.0).contains(&score), + "score {score} should be in [0.0, 1.0]" + ); + } + } + + #[test] + fn normalize_scores_single_hit_per_store() { + let (s1, d1) = temp_store(); + s1.insert_chunks(&[test_chunk("/a.md", "unique alpha", "t1", 0)]) + .unwrap(); + s1.upsert_document(test_meta("/a.md", "t1")); + s1.commit().unwrap(); + + let (s2, d2) = temp_store(); + s2.insert_chunks(&[test_chunk("/b.md", "unique beta", "t2", 0)]) + .unwrap(); + s2.upsert_document(test_meta("/b.md", "t2")); + s2.commit().unwrap(); + + let set = StoreSet::new(vec![s1, s2]).unwrap(); + let sq = SearchQuery::new("unique"); + let (hits, _) = set.search(&sq).unwrap(); + assert_eq!(hits.len(), 2); + for hit in &hits { + assert_eq!( + hit.score, + Some(1.0), + "single hit per store should normalize to 1.0" + ); + } + + drop((d1, d2)); + } + + #[test] + fn normalize_scores_skipped_for_source_sort() { + let (set, _d1, _d2) = two_store_set("t1", "t2"); + + // Same query under both sorts. Each store holds a single matching chunk, + // so normalization (when it runs) pins every score to exactly 1.0. + let scored = set + .search(&SearchQuery { + query: "content".into(), + sort: SearchSort::Score, + ..SearchQuery::default() + }) + .unwrap() + .0; + let by_source = set + .search(&SearchQuery { + query: "content".into(), + sort: SearchSort::Source, + ..SearchQuery::default() + }) + .unwrap() + .0; + + assert_eq!(scored.len(), 2, "query should match one chunk per store"); + assert_eq!(by_source.len(), 2, "query should match one chunk per store"); + + // Score sort normalizes each single-hit store batch to exactly 1.0... + assert!( + scored.iter().all(|h| h.score == Some(1.0)), + "score sort should normalize single-hit stores to 1.0, got {:?}", + scored.iter().map(|h| h.score).collect::>() + ); + // ...whereas source sort skips normalization and preserves the raw BM25 + // scores, which would otherwise be forced to 1.0 if normalization ran. + assert!( + by_source.iter().all(|h| h.score != Some(1.0)), + "source sort must preserve raw BM25 scores, got {:?}", + by_source.iter().map(|h| h.score).collect::>() + ); + } + + #[test] + fn score_sort_ties_break_by_source_not_store_order() { + // Two single-hit stores both normalize to exactly 1.0 -> a cross-store tie. + // Store insertion order is [zebra, apple] (deliberately NOT alphabetical), + // so a store-order tie-break would yield zebra first. The contract is that + // ties resolve by source path, independent of store declaration order. + let (s1, d1) = temp_store(); + s1.insert_chunks(&[test_chunk("/zebra.md", "tie content", "t1", 0)]) + .unwrap(); + s1.upsert_document(test_meta("/zebra.md", "t1")); + s1.commit().unwrap(); + + let (s2, d2) = temp_store(); + s2.insert_chunks(&[test_chunk("/apple.md", "tie content", "t2", 0)]) + .unwrap(); + s2.upsert_document(test_meta("/apple.md", "t2")); + s2.commit().unwrap(); + + let set = StoreSet::new(vec![s1, s2]).unwrap(); + let (hits, _) = set.search(&SearchQuery::new("content")).unwrap(); + + assert_eq!(hits.len(), 2); + assert_eq!( + hits[0].score, + Some(1.0), + "both single-hit stores normalize to 1.0" + ); + assert_eq!(hits[1].score, Some(1.0)); + assert_eq!( + hits[0].chunk.source.as_ref(), + "/apple.md", + "tie must break by source path, not store insertion order" + ); + assert_eq!(hits[1].chunk.source.as_ref(), "/zebra.md"); + + drop((d1, d2)); + } + + #[test] + fn normalize_preserves_relative_order() { + let (s1, d1) = temp_store(); + s1.insert_chunks(&[ + test_chunk("/a.md", "rust programming language guide", "t", 0), + test_chunk("/b.md", "rust rust rust", "t", 0), + test_chunk("/c.md", "brief rust", "t", 0), + ]) + .unwrap(); + s1.commit().unwrap(); + + let (s2, d2) = temp_store(); + s2.insert_chunks(&[test_chunk("/d.md", "unrelated python", "t", 0)]) + .unwrap(); + s2.commit().unwrap(); + + let set = StoreSet::new(vec![s1, s2]).unwrap(); + let sq = SearchQuery { + query: "rust".into(), + limit: 10, + ..SearchQuery::default() + }; + let (hits, _) = set.search(&sq).unwrap(); + + let s1_scores: Vec = hits + .iter() + .filter(|h| h.chunk.source.starts_with('/')) + .filter(|h| h.chunk.source.as_ref() != "/d.md") + .map(|h| h.score.unwrap_or(0.0)) + .collect(); + + for window in s1_scores.windows(2) { + assert!( + window[0] >= window[1], + "scores should be descending within a store: {s1_scores:?}" + ); + } + + drop((d1, d2)); + } } diff --git a/src/store/resolve.rs b/src/store/resolve.rs index 81062b0..edd8738 100644 --- a/src/store/resolve.rs +++ b/src/store/resolve.rs @@ -15,6 +15,10 @@ impl Store { /// Iterates each segment's FAST `source` StrColumn over live docs, /// collecting unique ordinals per segment and computing source_id from /// each unique source path. Skips soft-deleted docs via `doc_ids_alive()`. + /// + /// # Errors + /// + /// Returns an error if a segment's FAST column cannot be accessed. pub fn index_source_keys(&self) -> Result> { let searcher = self.reader.searcher(); let mut keys = HashSet::::new(); @@ -47,6 +51,10 @@ impl Store { /// /// Runs a targeted `BooleanQuery` to find matching doc addresses, then /// reads `source` from the FAST StrColumn and computes `source_id`. + /// + /// # Errors + /// + /// Returns an error if the Tantivy search or FAST field access fails. pub fn count_chunks_for_sources(&self, sources: &[&str]) -> Result> { if sources.is_empty() { return Ok(HashMap::new()); diff --git a/src/store/search.rs b/src/store/search.rs index 5c3290d..286ecd6 100644 --- a/src/store/search.rs +++ b/src/store/search.rs @@ -23,6 +23,10 @@ impl Store { /// /// `total` is the exact Tantivy match count for the query+filters (excluding /// in-memory-only filters like source substring and `max_per_source`). + /// + /// # Errors + /// + /// Returns an error if the query is empty or the Tantivy index search fails. pub fn search( &self, query: &SearchQuery, @@ -204,6 +208,10 @@ impl Store { } /// Build a BooleanQuery that matches any of the given source strings. + /// + /// # Panics + /// + /// Panics if `sources` is empty. pub fn source_term_query(&self, sources: &[impl AsRef]) -> BooleanQuery { assert!( !sources.is_empty(), @@ -225,6 +233,10 @@ impl Store { /// Fetch all chunks for a set of source paths. Returns chunks sorted by /// source then position. + /// + /// # Errors + /// + /// Returns an error if the Tantivy index search or document retrieval fails. pub fn chunks_for_sources(&self, sources: &[impl AsRef]) -> Result> { if sources.is_empty() { return Ok(Vec::new()); diff --git a/src/store/test_helpers.rs b/src/store/test_helpers.rs index 4cb9491..15872a8 100644 --- a/src/store/test_helpers.rs +++ b/src/store/test_helpers.rs @@ -6,11 +6,13 @@ use crate::store::SearchHit; use crate::types::{Chunk, DocKind, DocMeta, SourceId, SourceType}; /// Open a store at an existing path (for tests that need the path to outlive the store). +#[allow(clippy::missing_panics_doc)] pub fn open_test_store(path: &std::path::Path) -> store::Store { store::Store::open(path, true, 256, crate::types::IndexLanguage::default(), 100).unwrap() } /// Create a temporary store backed by a `TempDir`. +#[allow(clippy::missing_panics_doc)] pub fn temp_store() -> (store::Store, tempfile::TempDir) { let dir = tempfile::tempdir().unwrap(); let store = open_test_store(dir.path()); diff --git a/src/store/types.rs b/src/store/types.rs index 5b10e43..d170aa5 100644 --- a/src/store/types.rs +++ b/src/store/types.rs @@ -162,16 +162,36 @@ pub enum SearchSort { impl SearchSort { /// Sort a slice of search hits in-place, optionally reversing. + /// + /// Every arm falls back to `tie_break` so the result is a deterministic + /// total order -- independent of store declaration order, merge order, and the + /// stability of the underlying sort algorithm. pub fn apply(self, hits: &mut [SearchHit], reverse: bool) { match self { SearchSort::Score => { - hits.sort_by(|a, b| b.score.unwrap_or(0.0).total_cmp(&a.score.unwrap_or(0.0))); + hits.sort_by(|a, b| { + b.score + .unwrap_or(0.0) + .total_cmp(&a.score.unwrap_or(0.0)) + .then_with(|| tie_break(a, b)) + }); } SearchSort::Source => { - hits.sort_by(|a, b| a.chunk.source.cmp(&b.chunk.source)); + hits.sort_by(|a, b| { + a.chunk + .source + .cmp(&b.chunk.source) + .then_with(|| tie_break(a, b)) + }); } SearchSort::Topic => { - hits.sort_by(|a, b| a.chunk.topic.as_deref().cmp(&b.chunk.topic.as_deref())); + hits.sort_by(|a, b| { + a.chunk + .topic + .as_deref() + .cmp(&b.chunk.topic.as_deref()) + .then_with(|| tie_break(a, b)) + }); } } if reverse { @@ -180,6 +200,18 @@ impl SearchSort { } } +/// Deterministic, store-order-independent tie-break for hits whose primary sort +/// key is equal: orders by source path, then chunk index. Without it, equal-score +/// hits -- common after per-store score normalization pins every store's top hit to +/// 1.0 -- would fall back to incidental merge/input order, which is fragile under +/// refactors (e.g. switching to `sort_unstable_by` or a parallel store fetch). +fn tie_break(a: &SearchHit, b: &SearchHit) -> std::cmp::Ordering { + a.chunk + .source + .cmp(&b.chunk.source) + .then_with(|| a.chunk.chunk_index.cmp(&b.chunk.chunk_index)) +} + /// Full-text search request with filters, pagination, and sort. #[derive(Clone)] pub struct SearchQuery { diff --git a/src/store/writer.rs b/src/store/writer.rs index b8910e1..49475bf 100644 --- a/src/store/writer.rs +++ b/src/store/writer.rs @@ -31,6 +31,10 @@ impl Store { /// Unlike `commit()`, this keeps the writer alive so that Tantivy's merge /// policy can compact segments in the background between commits (background /// compaction is managed by Tantivy, not controlled directly here). + /// + /// # Errors + /// + /// Returns an error if the Tantivy commit or reader reload fails. pub fn commit_index(&self) -> Result<()> { if !self.dirty.load(Ordering::Acquire) { return Ok(()); @@ -48,6 +52,10 @@ impl Store { /// Full commit: Tantivy index + sidecar files (`lore.meta`, `lore.docs`). /// Used for per-source commits and the final commit. Skips writing when /// no changes have been made since the last commit. + /// + /// # Errors + /// + /// Returns an error if the Tantivy commit or sidecar file writes fail. pub fn commit(&self) -> Result<()> { if !self.dirty.load(Ordering::Acquire) { return Ok(()); @@ -58,11 +66,23 @@ impl Store { /// Unconditional commit: always writes sidecar files regardless of dirty /// flags. Used on interrupt to guarantee persistence even if a prior /// commit already cleared the flags. + /// + /// # Errors + /// + /// Returns an error if the Tantivy commit or sidecar file writes fail. pub fn force_commit(&self) -> Result<()> { self.commit_inner(true) } /// Add chunks to the Tantivy index. + /// + /// # Errors + /// + /// Returns an error if the index writer cannot be initialised or adding a document fails. + /// + /// # Panics + /// + /// Panics if the writer lock is in an unexpected state (should never occur in normal use). pub fn insert_chunks(&self, chunks: &[Chunk]) -> Result<()> { let guard = self.ensure_writer()?; let writer = guard.as_ref().expect("writer initialized by ensure_writer"); @@ -113,6 +133,14 @@ impl Store { } /// Delete all chunks for a given source ID. + /// + /// # Errors + /// + /// Returns an error if the index writer cannot be initialised. + /// + /// # Panics + /// + /// Panics if the writer lock is in an unexpected state (should never occur in normal use). pub fn delete_chunks_by_source(&self, source_id: &str) -> Result<()> { let term = tantivy::Term::from_field_text(self.fields.source_id, source_id); let guard = self.ensure_writer()?; @@ -146,6 +174,10 @@ impl Store { /// Tantivy's GC can delete the old segment files after each merge commit. /// Holding the lock across rounds would not provide additional safety /// because each round is self-contained (commit + reader reload + GC). + /// + /// # Errors + /// + /// Returns an error if listing segments, merging, or committing fails. pub fn optimize(&self, on_progress: impl Fn(usize)) -> Result<()> { // Phase 1: smart merge -- collapse small segments while skipping the // dominant one to avoid a full rewrite when most data is already in a diff --git a/src/util/fs.rs b/src/util/fs.rs index c99da2c..c92598a 100644 --- a/src/util/fs.rs +++ b/src/util/fs.rs @@ -104,6 +104,10 @@ pub fn is_lexically_safe_path(path: &Path) -> bool { /// On Unix, if the destination file already exists its permissions are preserved on /// the new file after the rename. New files receive the default `NamedTempFile` /// permissions (typically 0o600). +/// +/// # Errors +/// +/// Returns an error if the parent directory cannot be created, the temp file cannot be written, or the rename fails. pub fn atomic_write(path: &Path, data: &[u8]) -> Result<()> { let parent = path.parent().context("no parent directory")?; std::fs::create_dir_all(parent).context("failed to create parent directory")?; diff --git a/src/util/marker.rs b/src/util/marker.rs index 796799f..f600afa 100644 --- a/src/util/marker.rs +++ b/src/util/marker.rs @@ -17,6 +17,10 @@ use crate::util::{atomic_write, blake3_hex}; /// directory) this is acceptable: a crash here is no worse than the marker never /// being written. If hard durability is required, callers must fsync the /// parent directory after this function returns. +/// +/// # Errors +/// +/// Returns an error if the marker file cannot be written. pub fn ensure_marker(root: &Path, file_name: &str, salt: &str) -> Result<()> { // `canonicalize` resolves symlinks so the hash is stable across relative-path // variations. The fallback to the raw path (when the directory does not yet