diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..4b6fdce --- /dev/null +++ b/.env.example @@ -0,0 +1,17 @@ +# Copy to .env and fill in your values: +# cp .env.example .env + +# Your personal profile name (loads application-<profile>.yml) +# Create your config: cp scripts/user-config/application-user.yml.example scripts/user-config/application-<profile>.yml +GUIDE_PROFILE=user + +# OpenAI API key (required for embeddings and chat) +OPENAI_API_KEY=sk-proj-your-key-here + +# Neo4j (optional — defaults shown) +# NEO4J_USERNAME=neo4j +# NEO4J_PASSWORD=brahmsian +# NEO4J_URI=bolt://localhost:7687 + +# Discord bot token (optional — only needed for Discord integration) +# DISCORD_TOKEN=your-discord-token diff --git a/.gitignore b/.gitignore index a4053f3..34ec003 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,7 @@ # Gradle .gradle/ **/build/ +**/bin/ # MCP resources embabel-agent-api/src/main/resources/mcp/** @@ -33,6 +34,11 @@ embabel-agent-api/src/main/resources/mcp/** .env .envrc +# Personal application overrides (set GUIDE_PROFILE in .env; default profile is "user") +# Ignore all personal profile files except the checked-in example +scripts/user-config/application-*.yml +!scripts/user-config/application-*.yml.example + # Temporary files *.tmp *.bak diff --git a/README.md b/README.md index 9944b04..fd6fe7e 100644 --- a/README.md +++ b/README.md @@ -35,6 +35,8 @@ curl -X POST http://localhost:1337/api/v1/data/load-references To see stats on data, make a GET request or browse to http://localhost:1337/api/v1/data/stats +RAG content storage uses the `ChunkingContentElementRepository` interface from the `embabel-agent-rag-core` library. The default backend is Neo4j via `DrivineStore`. You can plug in other backends by providing a different `ChunkingContentElementRepository` bean. 
+ ## Viewing and Deleting Data Go to the Neo Browser at http://localhost:7474/browser/ diff --git a/scripts/INGESTION-TESTING.md b/scripts/INGESTION-TESTING.md new file mode 100644 index 0000000..665685d --- /dev/null +++ b/scripts/INGESTION-TESTING.md @@ -0,0 +1,103 @@ +# Testing Guide + +## Run all tests + +```bash +./mvnw test +``` + +Runs all 97 tests (unit + integration). Integration tests use Testcontainers to spin up Neo4j automatically — no local Neo4j needed. + +## Run specific test classes + +```bash +# Single class +./mvnw test -Dtest=IngestionResultTest + +# Multiple classes +./mvnw test -Dtest="IngestionResultTest,IngestionRunnerTest,DataManagerControllerTest" + +# Single method +./mvnw test -Dtest="IngestionRunnerTest#summary banner contains URL results" +``` + +## Test coverage by area + +### Ingestion pipeline (new) + +| Test class | Type | What it covers | +|---|---|---| +| `IngestionResultTest` | Unit | `IngestionResult` record: totals, `hasFailures()`, duration | +| `IngestionRunnerTest` | Unit | `IngestionRunner`: calls `loadReferences`, prints banner with URLs/dirs/stats/port, `formatDuration` | +| `DataManagerControllerTest` | Unit | REST endpoints: `GET /stats`, `POST /load-references` returns `IngestionResult` | +| `DataManagerLoadReferencesIntegrationTest` | Integration | Full pipeline: DataManager → Neo4j. 
Ingests sample directory, verifies structured result + documents/chunks in store | + +Run just these: + +```bash +./mvnw test -Dtest="IngestionResultTest,IngestionRunnerTest,DataManagerControllerTest,DataManagerLoadReferencesIntegrationTest" +``` + +### Other test areas + +| Test class | Type | What it covers | +|---|---|---| +| `GuidePropertiesPathResolutionTest` | Unit | Path resolution (`~/`, absolute, relative) | +| `HubApiControllerTest` | Integration | Hub REST API (register, login, sessions, JWT) | +| `HubServiceTest` | Integration | User registration validation | +| `DrivineGuideUserRepositoryTest` | Integration | Neo4j user repository (Drivine) | +| `GuideUserRepositoryDefaultImplTest` | Integration | Neo4j user repository (GraphView) | +| `GuideUserServiceTest` | Integration | Anonymous web user service | +| `McpSecurityTest` | Integration | MCP endpoints are publicly accessible | + +## Using local Neo4j (faster iteration) + +By default, tests use Testcontainers (slower startup, fully isolated). For faster runs during development: + +1. Start Neo4j: + +```bash +docker compose up neo4j -d +``` + +2. Run tests with local Neo4j: + +```bash +USE_LOCAL_NEO4J=true ./mvnw test +``` + +## Manual testing of fresh-ingest.sh + +To test the full ingestion flow end-to-end: + +1. Set up your `.env` and personal profile (see `scripts/README.md`) +2. Run: + +```bash +./scripts/fresh-ingest.sh +``` + +3. Watch for the **INGESTION COMPLETE** banner with: + - Time elapsed + - Loaded/failed URLs + - Ingested/failed directories + - RAG store stats (documents, chunks, elements) + - Port and MCP endpoint + +4. Verify the REST API: + +```bash +# Stats +curl http://localhost:1337/api/v1/data/stats + +# Trigger ingestion manually (returns JSON IngestionResult) +curl -X POST http://localhost:1337/api/v1/data/load-references +``` + +5. Verify MCP: + +```bash +curl -i --max-time 3 http://localhost:1337/sse +``` + +Should return `Content-Type: text/event-stream`. 
diff --git a/scripts/README.md b/scripts/README.md index b3785e4..4fc233e 100644 --- a/scripts/README.md +++ b/scripts/README.md @@ -1,3 +1,45 @@ # Shell scripts -- `shell.sh` runs the application in interactive shell mode. \ No newline at end of file +| Script | Purpose | +|---|---| +| `fresh-ingest.sh` | Wipes Neo4j RAG data and re-ingests everything from scratch. Use for first-time setup or when you want a clean slate. | +| `append-ingest.sh` | Re-ingests without clearing existing data. Use when you've added new URLs or directories. Comment out already-ingested items in your profile to avoid re-processing them. | +| `shell.sh` | Runs the application in interactive shell mode. | + +Both ingestion scripts start Neo4j in Docker, load your personal profile, and print an **INGESTION COMPLETE** banner when done. + +## Personal profiles + +Both scripts read `GUIDE_PROFILE` from `.env` (default: `user`). +Each developer can have their own Spring profile: + +```bash +cp scripts/user-config/application-user.yml.example scripts/user-config/application-yourname.yml +# Edit to taste, then: +echo 'GUIDE_PROFILE=yourname' >> .env +./scripts/fresh-ingest.sh +``` + +This loads `application-yourname.yml` with your URLs, directories, and settings. +See `scripts/user-config/README.md` for full details. + +## Using append-ingest.sh + +Since `append-ingest.sh` doesn't clear the store, you should comment out URLs and directories that are already ingested in your profile to avoid re-processing them. For example: + +```yaml +guide: + urls: + # - https://docs.embabel.com/embabel-agent/guide/0.3.5-SNAPSHOT/ # already ingested + - https://some-new-url.com # new, will be ingested + directories: + # - ~/github/jmjava/guide # already ingested + - ~/github/jmjava/new-repo # new, will be ingested +``` + +Then run `./scripts/append-ingest.sh`. The new content is added alongside existing data in Neo4j. 
+ +## Tips + +- **If ingestion seems stuck** on a URL: the thread is blocked on fetch -> parse -> embed. Try lowering `embedding-batch-size` to 20, or temporarily remove the slow URL. +- **Speed up ingestion**: increase `embedding-batch-size` (default 50) or `max-chunk-size` (default 4000). diff --git a/scripts/append-ingest.sh b/scripts/append-ingest.sh new file mode 100755 index 0000000..ebac691 --- /dev/null +++ b/scripts/append-ingest.sh @@ -0,0 +1,73 @@ +#!/usr/bin/env bash +# Re-ingest content WITHOUT clearing Neo4j first. +# Existing RAG data is kept; new/updated content is added on top. +# IngestionRunner prints the summary when done. +# +# Set GUIDE_PROFILE in .env to use your own profile (default: "user"). +# e.g. GUIDE_PROFILE=menke → loads application-menke.yml +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GUIDE_ROOT="$(dirname "$SCRIPT_DIR")" +cd "$GUIDE_ROOT" + +if [ -f .env ]; then + echo "Loading .env..." + set -a + source .env + set +a +fi + +GUIDE_PORT="${GUIDE_PORT:-1337}" +EXISTING_PID=$(lsof -ti :"$GUIDE_PORT" 2>/dev/null | head -1) +if [ -n "$EXISTING_PID" ]; then + echo "Killing existing process on port $GUIDE_PORT (PID $EXISTING_PID)..." + kill "$EXISTING_PID" 2>/dev/null || true + sleep 1 + kill -9 "$EXISTING_PID" 2>/dev/null || true + sleep 1 +fi + +echo "Ensuring Neo4j is up (Docker)..." +docker compose up neo4j -d + +NEO4J_BOLT_PORT="${NEO4J_BOLT_PORT:-7687}" +echo "Waiting for Neo4j on port $NEO4J_BOLT_PORT..." +max_wait=60 +elapsed=0 +while [ $elapsed -lt $max_wait ]; do + if docker exec embabel-neo4j cypher-shell -u "${NEO4J_USERNAME:-neo4j}" -p "${NEO4J_PASSWORD:-brahmsian}" "RETURN 1" >/dev/null 2>&1; then + echo "Neo4j is ready." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) + echo " ... ${elapsed}s" +done +if [ $elapsed -ge $max_wait ]; then + echo "Neo4j did not become ready in time." + exit 1 +fi + +echo "Keeping existing RAG data (append mode)." 
+ +GUIDE_PROFILE="${GUIDE_PROFILE:-user}" +export SPRING_PROFILES_ACTIVE="local,${GUIDE_PROFILE}" +export NEO4J_URI="${NEO4J_URI:-bolt://localhost:${NEO4J_BOLT_PORT}}" +export NEO4J_HOST="${NEO4J_HOST:-localhost}" + +# Force ingestion on startup (IngestionRunner prints the summary) +export GUIDE_RELOADCONTENTONSTARTUP=true + +echo "" +echo "Starting Guide with profiles: $SPRING_PROFILES_ACTIVE" +echo "Neo4j: $NEO4J_URI" +echo "" +echo "Ingestion will append to existing data." +echo "Watch for the INGESTION COMPLETE banner." +echo "Press Ctrl+C to stop." +echo "" + +# Run in foreground so Ctrl+C kills it directly +# Include scripts/user-config/ so Spring Boot finds personal profile files +./mvnw -DskipTests spring-boot:run -Dspring-boot.run.arguments="--spring.config.additional-location=file:./scripts/user-config/" diff --git a/scripts/fresh-ingest.sh b/scripts/fresh-ingest.sh new file mode 100755 index 0000000..b236fcb --- /dev/null +++ b/scripts/fresh-ingest.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash +# Wipe Neo4j RAG data and re-ingest everything from scratch. +# Starts Neo4j (Docker), clears all ContentElement nodes, then runs Guide +# with reload-content-on-startup=true. IngestionRunner prints the summary. +# +# Set GUIDE_PROFILE in .env to use your own profile (default: "user"). +# e.g. GUIDE_PROFILE=menke → loads application-menke.yml +set -e + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +GUIDE_ROOT="$(dirname "$SCRIPT_DIR")" +cd "$GUIDE_ROOT" + +if [ -f .env ]; then + echo "Loading .env..." + set -a + source .env + set +a +fi + +GUIDE_PORT="${GUIDE_PORT:-1337}" +EXISTING_PID=$(lsof -ti :"$GUIDE_PORT" 2>/dev/null | head -1) +if [ -n "$EXISTING_PID" ]; then + echo "Killing existing process on port $GUIDE_PORT (PID $EXISTING_PID)..." + kill "$EXISTING_PID" 2>/dev/null || true + sleep 1 + kill -9 "$EXISTING_PID" 2>/dev/null || true + sleep 1 +fi + +echo "Ensuring Neo4j is up (Docker)..." 
+docker compose up neo4j -d + +NEO4J_BOLT_PORT="${NEO4J_BOLT_PORT:-7687}" +echo "Waiting for Neo4j on port $NEO4J_BOLT_PORT..." +max_wait=60 +elapsed=0 +while [ $elapsed -lt $max_wait ]; do + if docker exec embabel-neo4j cypher-shell -u "${NEO4J_USERNAME:-neo4j}" -p "${NEO4J_PASSWORD:-brahmsian}" "RETURN 1" >/dev/null 2>&1; then + echo "Neo4j is ready." + break + fi + sleep 3 + elapsed=$((elapsed + 3)) + echo " ... ${elapsed}s" +done +if [ $elapsed -ge $max_wait ]; then + echo "Neo4j did not become ready in time." + exit 1 +fi + +echo "Clearing RAG content in Neo4j (ContentElement nodes)..." +docker exec embabel-neo4j cypher-shell -u "${NEO4J_USERNAME:-neo4j}" -p "${NEO4J_PASSWORD:-brahmsian}" "MATCH (c:ContentElement) DETACH DELETE c" 2>/dev/null || true +echo "RAG content cleared." + +GUIDE_PROFILE="${GUIDE_PROFILE:-user}" +export SPRING_PROFILES_ACTIVE="local,${GUIDE_PROFILE}" +export NEO4J_URI="${NEO4J_URI:-bolt://localhost:${NEO4J_BOLT_PORT}}" +export NEO4J_HOST="${NEO4J_HOST:-localhost}" + +# Force ingestion on startup (IngestionRunner prints the summary) +export GUIDE_RELOADCONTENTONSTARTUP=true + +echo "" +echo "Starting Guide with profiles: $SPRING_PROFILES_ACTIVE" +echo "Neo4j: $NEO4J_URI" +echo "" +echo "Ingestion will run automatically on startup." +echo "Watch for the INGESTION COMPLETE banner." +echo "Press Ctrl+C to stop." +echo "" + +# Run in foreground so Ctrl+C kills it directly +# Include scripts/user-config/ so Spring Boot finds personal profile files +./mvnw -DskipTests spring-boot:run -Dspring-boot.run.arguments="--spring.config.additional-location=file:./scripts/user-config/" diff --git a/scripts/user-config/README.md b/scripts/user-config/README.md new file mode 100644 index 0000000..a63d403 --- /dev/null +++ b/scripts/user-config/README.md @@ -0,0 +1,39 @@ +# Personal config + +Each developer can have their own Spring profile with personal settings (URLs, directories, paths, etc.). 
+ +## Quick start + +```bash +cp scripts/user-config/application-user.yml.example scripts/user-config/application-myname.yml +# Edit to taste, then: +echo 'GUIDE_PROFILE=myname' >> .env +./scripts/fresh-ingest.sh +``` + +## How it works + +- The scripts (`fresh-ingest.sh`, `append-ingest.sh`) read `GUIDE_PROFILE` from `.env` (default: `user`) +- Spring profiles become `local,<profile>` → loads `application-<profile>.yml` +- The scripts pass `--spring.config.additional-location=file:./scripts/user-config/` so Spring picks up profiles from this directory +- Personal profiles in `scripts/user-config/` are gitignored (only the `.example` is checked in) + +## Ingestion on startup + +The `IngestionRunner` only activates when `guide.reload-content-on-startup` is `true`. The default in `application.yml` is `false`, so normal builds (`./mvnw test`, `./mvnw spring-boot:run`) never trigger ingestion. The ingestion scripts set this flag -- both `fresh-ingest.sh` and `append-ingest.sh` export `GUIDE_RELOADCONTENTONSTARTUP=true` before launching the app -- and the example personal profile also sets `reload-content-on-startup: true`. + +## Failure recovery + +Ingestion is resilient at every level -- a single failure never prevents the remaining items from being processed: + +- **URLs**: each URL is ingested independently. If one times out or returns an error, the rest continue. +- **Directories**: each configured directory is ingested independently. A missing or unreadable directory doesn't block others. +- **Documents within a directory**: each file is written to the store individually. A single unparseable file (e.g. corrupt encoding) doesn't skip the remaining files in that directory. 
+ +All failures are collected with their source and reason into the `IngestionResult`, which is: +- Printed in the **INGESTION COMPLETE** banner (so you can see what failed and why at a glance) +- Returned as JSON from `POST /api/v1/data/load-references` for programmatic inspection + +## MCP tools + +All ingested content -- both URLs and local directories -- is immediately available through the MCP tools (`docs_vectorSearch`, `docs_textSearch`, etc.). The MCP tools and the ingestion pipeline share the same Neo4j store, so there is no separate sync step. Once ingestion completes, MCP clients (Cursor, Claude Desktop, etc.) can search the content right away. diff --git a/scripts/user-config/application-user.yml.example b/scripts/user-config/application-user.yml.example new file mode 100644 index 0000000..e970cf3 --- /dev/null +++ b/scripts/user-config/application-user.yml.example @@ -0,0 +1,28 @@ +# Example personal config. Copy to application-user.yml (gitignored) and customize. +# Or set GUIDE_PROFILE=yourname in .env and name it application-yourname.yml. 
+# +# cp scripts/user-config/application-user.yml.example scripts/user-config/application-user.yml + +guide: + # Set to true to ingest URLs/directories on startup and print a summary + reload-content-on-startup: true + projects-path: ./embabel-projects + default-persona: adaptive + chat-llm: + model: gpt-4.1-mini + content-chunker: + include-section-title-in-chunk: false + max-chunk-size: 4000 + overlap-size: 200 + embedding-batch-size: 50 + urls: + - https://docs.embabel.com/embabel-agent/guide/0.3.5-SNAPSHOT/ + - https://github.com/embabel/embabel-agent + - https://github.com/embabel/embabel-agent-examples + - https://github.com/embabel/dice + # Ingest full local repos (paths relative to working dir or absolute) + directories: + - ./embabel-projects/dice + - ./embabel-projects/embabel-agent + - ./embabel-projects/embabel-agent-examples + tool-groups: diff --git a/src/main/java/com/embabel/guide/GuideProperties.java b/src/main/java/com/embabel/guide/GuideProperties.java index d7bda58..bef18d6 100644 --- a/src/main/java/com/embabel/guide/GuideProperties.java +++ b/src/main/java/com/embabel/guide/GuideProperties.java @@ -19,12 +19,15 @@ * * @param reloadContentOnStartup whether to reload RAG content on startup * @param defaultPersona name of the default persona to use + * @param chatLlm LLM options for chat + * @param projectsPath path to projects root: absolute, or relative to the process working directory (user.dir) * @param chatLlm LLM options for RAG chat (beefy model) * @param fastLlm LLM options for classification and quick responses (lightweight model) * @param projectsPath path under user's home directory where projects are created * @param chunkerConfig chunker configuration for RAG ingestion * @param referencesFile YML files containing LLM references such as GitHub repositories and classpath info * @param urls list of URLs to ingest--for example, documentation and blogs + * @param directories optional list of local directory paths to ingest (full tree); 
resolved like projectsPath * @param toolGroups toolGroups, such as "web", that are allowed */ @Validated @@ -45,6 +48,7 @@ public record GuideProperties( List urls, @DefaultValue("") String toolPrefix, + List directories, Set toolGroups, LlmOptions narratorLlm ) { @@ -54,11 +58,42 @@ public StringTransformer toolNamingStrategy() { } /** - * Returns the root path for projects, combining the user's home directory with the specified projects path. + * Resolves the projects path: if path starts with ~/, expands to user.home; if absolute, uses as-is; + * otherwise resolves relative to user.dir. * - * @return the full path to the projects root directory + * @return the absolute path to the projects root directory */ public String projectRootPath() { - return Path.of(System.getProperty("user.home"), projectsPath).toString(); + return resolvePath(projectsPath); + } + + /** + * Resolves a path: ~/... to user.home, absolute as-is, else relative to user.dir. + */ + public String resolvePath(String path) { + return resolvePath(path, System.getProperty("user.home"), System.getProperty("user.dir")); + } + + /** + * Resolves a path with explicit home and cwd; used for testing. + * + * @param path path to resolve (may be ~/..., absolute, or relative) + * @param userHome value for user.home + * @param userDir value for user.dir (working directory) + * @return resolved absolute path, or path if null/blank + */ + static String resolvePath(String path, String userHome, String userDir) { + if (path == null || path.isBlank()) { + return path; + } + String expanded = path.strip(); + if (expanded.startsWith("~/") || "~".equals(expanded)) { + expanded = expanded.length() == 1 ? 
userHome : Path.of(userHome, expanded.substring(2)).normalize().toString(); + } + Path p = Path.of(expanded); + if (p.isAbsolute()) { + return p.normalize().toAbsolutePath().toString(); + } + return Path.of(userDir, expanded).normalize().toAbsolutePath().toString(); } } \ No newline at end of file diff --git a/src/main/java/com/embabel/guide/rag/DataManager.java b/src/main/java/com/embabel/guide/rag/DataManager.java index 48da38a..b9d9145 100644 --- a/src/main/java/com/embabel/guide/rag/DataManager.java +++ b/src/main/java/com/embabel/guide/rag/DataManager.java @@ -5,7 +5,9 @@ import com.embabel.agent.api.reference.LlmReferenceProviders; import com.embabel.agent.rag.ingestion.*; import com.embabel.agent.rag.ingestion.policy.UrlSpecificContentRefreshPolicy; -import com.embabel.agent.rag.neo.drivine.DrivineStore; +import com.embabel.agent.rag.model.NavigableDocument; +import com.embabel.agent.rag.store.ChunkingContentElementRepository; +import com.embabel.agent.rag.store.ContentElementRepositoryInfo; import com.embabel.agent.tools.file.FileTools; import com.embabel.guide.GuideProperties; import com.google.common.collect.Iterables; @@ -15,25 +17,25 @@ import org.springframework.lang.Nullable; import org.springframework.stereotype.Service; +import java.time.Duration; +import java.time.Instant; +import java.util.ArrayList; import java.util.Collections; import java.util.List; /** - * Exposes references and RAG configuration + * Exposes references and RAG configuration. + * Depends on {@link ChunkingContentElementRepository} from the rag-core library, + * so any backend implementing that interface (e.g. DrivineStore for Neo4j) can be + * plugged in without changes here. 
*/ @Service public class DataManager { - public record Stats( - int chunkCount, - int documentCount, - int contentElementCount) { - } - private final Logger logger = LoggerFactory.getLogger(DataManager.class); private final GuideProperties guideProperties; private final List references; - private final DrivineStore store; + private final ChunkingContentElementRepository store; private final HierarchicalContentReader hierarchicalContentReader = new TikaHierarchicalContentReader(); @@ -43,22 +45,29 @@ public record Stats( ); public DataManager( - DrivineStore store, + ChunkingContentElementRepository store, GuideProperties guideProperties ) { this.store = store; this.guideProperties = guideProperties; this.references = LlmReferenceProviders.fromYmlFile(guideProperties.referencesFile()); store.provision(); - if (guideProperties.reloadContentOnStartup()) { - logger.info("Reloading RAG content on startup"); - loadReferences(); - } + // Ingestion on startup is now handled by IngestionRunner (ApplicationRunner) + // which is activated by guide.reload-content-on-startup=true + } + + public ContentElementRepositoryInfo getStats() { + return store.info(); + } + + /** Convenience accessor (e.g. for Kotlin tests): returns only the document count reported by the store. */ + public int getDocumentCount() { + return store.info().getDocumentCount(); } - public Stats getStats() { - var info = store.info(); - return new Stats(info.getChunkCount(), info.getDocumentCount(), info.getContentElementCount()); + /** Convenience accessor (e.g. for Kotlin tests): returns only the chunk count reported by the store. */ + public int getChunkCount() { + return store.info().getChunkCount(); } @NonNull @@ -77,18 +86,32 @@ public void provisionDatabase() { } /** - * Read all files under this directory on this local machine + * Read all files under this directory on this local machine. + * Each document is written individually so a single failure does not + * prevent the remaining documents from being ingested. 
* - * @param dir absolute path + * @param dir absolute path + * @param failedDocuments collector for per-document failures (mutated) + * @return the parsing result (may still be useful even when some documents failed) */ - public DirectoryParsingResult ingestDirectory(String dir) { + public DirectoryParsingResult ingestDirectory(String dir, List failedDocuments) { var ft = FileTools.readOnly(dir); var directoryParsingResult = new TikaHierarchicalContentReader() .parseFromDirectory(ft, new DirectoryParsingConfig()); for (var root : directoryParsingResult.getContentRoots()) { - logger.info("Parsed root: {} with {} descendants", root.getTitle(), - Iterables.size(root.descendants())); - store.writeAndChunkDocument(root); + String docTitle = "unknown"; + try { + var doc = (NavigableDocument) root; + docTitle = doc.getTitle(); + logger.info("Parsed root: {} with {} descendants", docTitle, + Iterables.size(doc.descendants())); + store.writeAndChunkDocument(doc); + } catch (Throwable t) { + logger.error("Failed to write document '{}' from directory {}: {}", + docTitle, dir, t.getMessage(), t); + failedDocuments.add(IngestionFailure.fromException( + dir + " -> " + docTitle, t)); + } } return directoryParsingResult; } @@ -99,8 +122,7 @@ public DirectoryParsingResult ingestDirectory(String dir) { * @param url the URL to ingest */ public void ingestPage(String url) { - var root = contentRefreshPolicy - .ingestUriIfNeeded(store, hierarchicalContentReader, url); + var root = contentRefreshPolicy.ingestUriIfNeeded(store, hierarchicalContentReader, url); if (root != null) { logger.info("Ingested page: {} with {} descendants", root.getTitle(), @@ -112,26 +134,56 @@ public void ingestPage(String url) { } /** - * Load all referenced URLs from configuration + * Load all referenced URLs and directories from configuration. + * Each item is ingested independently -- a single failure never prevents + * the remaining items from being processed. 
+ * + * @return structured result with loaded/failed URLs and directories (with reasons) */ - public void loadReferences() { - int successCount = 0; - int failureCount = 0; + public IngestionResult loadReferences() { + var start = Instant.now(); + var loadedUrls = new ArrayList(); + var failedUrls = new ArrayList(); + var ingestedDirs = new ArrayList(); + var failedDirs = new ArrayList(); + var failedDocuments = new ArrayList(); for (var url : guideProperties.urls()) { try { - logger.info("⏳Loading URL: {}...", url); + logger.info("⏳ Loading URL: {}...", url); ingestPage(url); logger.info("✅ Loaded URL: {}", url); - successCount++; - + loadedUrls.add(url); } catch (Throwable t) { logger.error("❌ Failure loading URL {}: {}", url, t.getMessage(), t); - failureCount++; + failedUrls.add(IngestionFailure.fromException(url, t)); } } logger.info("Loaded {}/{} URLs successfully ({} failed)", - successCount, guideProperties.urls().size(), failureCount); + loadedUrls.size(), guideProperties.urls().size(), failedUrls.size()); + + List dirs = guideProperties.directories(); + if (dirs != null && !dirs.isEmpty()) { + for (String dir : dirs) { + try { + String absolutePath = guideProperties.resolvePath(dir); + logger.info("⏳ Ingesting directory: {}...", absolutePath); + ingestDirectory(absolutePath, failedDocuments); + logger.info("✅ Ingested directory: {}", absolutePath); + ingestedDirs.add(absolutePath); + } catch (Throwable t) { + logger.error("❌ Failure ingesting directory {}: {}", dir, t.getMessage(), t); + failedDirs.add(IngestionFailure.fromException(dir, t)); + } + } + logger.info("Ingested {}/{} directories ({} dir failures, {} document failures)", + ingestedDirs.size(), dirs.size(), failedDirs.size(), failedDocuments.size()); + } else { + logger.info("No directories configured for ingestion (guide.directories empty or not set)"); + } + + return new IngestionResult(loadedUrls, failedUrls, ingestedDirs, failedDirs, + failedDocuments, Duration.between(start, 
Instant.now())); } } diff --git a/src/main/java/com/embabel/guide/rag/DataManagerController.java b/src/main/java/com/embabel/guide/rag/DataManagerController.java index f9cd4af..c1fb775 100644 --- a/src/main/java/com/embabel/guide/rag/DataManagerController.java +++ b/src/main/java/com/embabel/guide/rag/DataManagerController.java @@ -1,5 +1,6 @@ package com.embabel.guide.rag; +import com.embabel.agent.rag.store.ContentElementRepositoryInfo; import org.springframework.web.bind.annotation.GetMapping; import org.springframework.web.bind.annotation.PostMapping; import org.springframework.web.bind.annotation.RequestMapping; @@ -19,13 +20,12 @@ public DataManagerController(DataManager dataManager) { } @GetMapping("/stats") - public DataManager.Stats getStats() { + public ContentElementRepositoryInfo getStats() { return dataManager.getStats(); } @PostMapping("/load-references") - public String loadReferences() { - dataManager.loadReferences(); - return "References loaded successfully"; + public IngestionResult loadReferences() { + return dataManager.loadReferences(); } } diff --git a/src/main/java/com/embabel/guide/rag/IngestionFailure.java b/src/main/java/com/embabel/guide/rag/IngestionFailure.java new file mode 100644 index 0000000..4f6c5f0 --- /dev/null +++ b/src/main/java/com/embabel/guide/rag/IngestionFailure.java @@ -0,0 +1,25 @@ +package com.embabel.guide.rag; + +/** + * Captures the identity of a failed ingestion item together with + * a human-readable reason so operators can diagnose problems without + * digging through logs. + * + * @param source the URL or directory path that failed + * @param reason short description of what went wrong + */ +public record IngestionFailure( + String source, + String reason +) { + /** + * Build from an exception, using its message (or class name as fallback). 
+ */ + public static IngestionFailure fromException(String source, Throwable t) { + String reason = t.getMessage(); + if (reason == null || reason.isBlank()) { + reason = t.getClass().getSimpleName(); + } + return new IngestionFailure(source, reason); + } +} diff --git a/src/main/java/com/embabel/guide/rag/IngestionResult.java b/src/main/java/com/embabel/guide/rag/IngestionResult.java new file mode 100644 index 0000000..84a0b6f --- /dev/null +++ b/src/main/java/com/embabel/guide/rag/IngestionResult.java @@ -0,0 +1,36 @@ +package com.embabel.guide.rag; + +import java.time.Duration; +import java.util.List; + +/** + * Structured result from a full ingestion run (URLs + directories). + * Failed items carry an {@link IngestionFailure} with a reason so operators + * can diagnose problems from the summary banner alone. + */ +public record IngestionResult( + List loadedUrls, + List failedUrls, + List ingestedDirectories, + List failedDirectories, + /** Per-document failures that occurred inside otherwise-successful directories. 
*/ + List failedDocuments, + Duration elapsed +) { + + public int totalUrls() { + return loadedUrls.size() + failedUrls.size(); + } + + public int totalDirectories() { + return ingestedDirectories.size() + failedDirectories.size(); + } + + public boolean hasFailures() { + return !failedUrls.isEmpty() || !failedDirectories.isEmpty() || !failedDocuments.isEmpty(); + } + + public int totalFailures() { + return failedUrls.size() + failedDirectories.size() + failedDocuments.size(); + } +} diff --git a/src/main/java/com/embabel/guide/rag/IngestionRunner.java b/src/main/java/com/embabel/guide/rag/IngestionRunner.java new file mode 100644 index 0000000..c1ff20c --- /dev/null +++ b/src/main/java/com/embabel/guide/rag/IngestionRunner.java @@ -0,0 +1,115 @@ +package com.embabel.guide.rag; + +import com.embabel.agent.rag.store.ContentElementRepositoryInfo; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.boot.ApplicationArguments; +import org.springframework.boot.ApplicationRunner; +import org.springframework.boot.autoconfigure.condition.ConditionalOnProperty; +import org.springframework.stereotype.Component; + +/** + * Runs ingestion on startup when {@code guide.reload-content-on-startup} is true. + * Prints a structured summary to stdout so the shell script (or human) can see + * exactly what was loaded without parsing log files. 
+ */ +@Component +@ConditionalOnProperty(name = "guide.reload-content-on-startup", havingValue = "true") +public class IngestionRunner implements ApplicationRunner { + + private static final Logger logger = LoggerFactory.getLogger(IngestionRunner.class); + + private final DataManager dataManager; + + @Value("${server.port:8080}") + private int serverPort; + + public IngestionRunner(DataManager dataManager) { + this.dataManager = dataManager; + } + + @Override + public void run(ApplicationArguments args) { + logger.info("IngestionRunner: starting ingestion (reload-content-on-startup=true)"); + + var result = dataManager.loadReferences(); + + var stats = dataManager.getStats(); + printSummary(result, stats); + } + + private void printSummary(IngestionResult result, ContentElementRepositoryInfo stats) { + var sb = new StringBuilder(); + sb.append("\n"); + sb.append("╔══════════════════════════════════════════════════╗\n"); + sb.append("║ INGESTION COMPLETE ║\n"); + sb.append("╚══════════════════════════════════════════════════╝\n"); + sb.append("\n"); + + sb.append(" Time: ").append(formatDuration(result.elapsed())).append("\n\n"); + + sb.append(" ── URLs (").append(result.loadedUrls().size()).append("/") + .append(result.totalUrls()).append(" loaded) ──\n"); + if (!result.loadedUrls().isEmpty()) { + sb.append(" Loaded:\n"); + result.loadedUrls().forEach(u -> sb.append(" ✓ ").append(u).append("\n")); + } + if (!result.failedUrls().isEmpty()) { + sb.append(" Failed:\n"); + result.failedUrls().forEach(f -> + sb.append(" ✗ ").append(f.source()).append("\n") + .append(" reason: ").append(f.reason()).append("\n")); + } + sb.append("\n"); + + if (result.totalDirectories() > 0) { + sb.append(" ── Directories (").append(result.ingestedDirectories().size()).append("/") + .append(result.totalDirectories()).append(" ingested) ──\n"); + if (!result.ingestedDirectories().isEmpty()) { + sb.append(" Ingested:\n"); + result.ingestedDirectories().forEach(d -> sb.append(" ✓ 
").append(d).append("\n")); + } + if (!result.failedDirectories().isEmpty()) { + sb.append(" Failed:\n"); + result.failedDirectories().forEach(f -> + sb.append(" ✗ ").append(f.source()).append("\n") + .append(" reason: ").append(f.reason()).append("\n")); + } + } else { + sb.append(" ── Directories: none configured ──\n"); + } + + if (!result.failedDocuments().isEmpty()) { + sb.append("\n ── Document Failures (").append(result.failedDocuments().size()).append(") ──\n"); + result.failedDocuments().forEach(f -> + sb.append(" ✗ ").append(f.source()).append("\n") + .append(" reason: ").append(f.reason()).append("\n")); + } + + sb.append("\n"); + sb.append(" ── RAG Store ──\n"); + sb.append(" Documents: ").append(stats.getDocumentCount()).append("\n"); + sb.append(" Chunks: ").append(stats.getChunkCount()).append("\n"); + sb.append(" Elements: ").append(stats.getContentElementCount()).append("\n"); + sb.append("\n"); + + sb.append(" Guide is running on port ").append(serverPort).append("\n"); + sb.append(" MCP endpoint: http://localhost:").append(serverPort).append("/sse\n"); + sb.append("\n"); + + // Print to stdout so it's visible even without log-level config + System.out.println(sb); + logger.info("Ingestion summary printed to stdout"); + } + + private static String formatDuration(java.time.Duration d) { + long totalSec = d.getSeconds(); + if (totalSec < 60) { + return totalSec + "s"; + } + long min = totalSec / 60; + long sec = totalSec % 60; + return min + "m " + sec + "s"; + } +} diff --git a/src/main/resources/application.yml b/src/main/resources/application.yml index dd5c01c..3120c03 100644 --- a/src/main/resources/application.yml +++ b/src/main/resources/application.yml @@ -3,10 +3,10 @@ server: guide: - # If true, the Guide chatbot will load content from the projects path on startup - # Snapshots will always be reloaded: other URLs will not + # If true, the Guide chatbot will load content from URLs and directories on startup reload-content-on-startup: false + 
# Projects root: absolute path, or relative to process working directory (e.g. ./embabel-projects) projects-path: ./embabel-projects # Change to any filename from /src/main/resources/prompts/persona, @@ -27,12 +27,18 @@ guide: include-section-title-in-chunk: false max-chunk-size: 4000 overlap-size: 200 + # Chunks per embedding API call. 50 is safe; increase for speed if no timeouts (max 2048 for OpenAI). + embedding-batch-size: 50 + # Ingested via load-references (or on startup if reload-content-on-startup is true) urls: - https://docs.embabel.com/embabel-agent/guide/0.3.5-SNAPSHOT/ - https://medium.com/@springrod/build-better-agents-in-java-vs-python-embabel-vs-langgraph-f7951a0d855c - https://medium.com/@springrod/context-engineering-needs-domain-understanding-b4387e8e4bf8 - - https://github.com/embabel/embabel-agent/blob/main/README.md?plain=1 + + # Local directories to ingest (full tree). Paths are absolute or relative to working directory. + # Example: clone repos then list them here for full-repo RAG. + directories: [] tool-groups: diff --git a/src/test/kotlin/com/embabel/guide/GuidePropertiesPathResolutionTest.kt b/src/test/kotlin/com/embabel/guide/GuidePropertiesPathResolutionTest.kt new file mode 100644 index 0000000..21a2813 --- /dev/null +++ b/src/test/kotlin/com/embabel/guide/GuidePropertiesPathResolutionTest.kt @@ -0,0 +1,73 @@ +/* + * Copyright 2024-2025 Embabel Software, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Unit tests for `GuideProperties.resolvePath`.
 *
 * Every call passes an explicit home directory and working directory, so the
 * expectations are built from those two values. Covers tilde expansion,
 * absolute paths, relative paths, and null/blank input.
 */
class GuidePropertiesPathResolutionTest {

    // A leading empty segment yields a path that starts with the separator.
    private val home = listOf("", "home", "user").joinToString(File.separator)
    private val cwd = listOf("", "work", "guide").joinToString(File.separator)

    @Test
    fun `resolvePath expands tilde to user home`() {
        val resolved = GuideProperties.resolvePath("~", home, cwd)

        assertEquals(home, resolved)
    }

    @Test
    fun `resolvePath expands tilde slash path to user home plus path`() {
        val resolved = GuideProperties.resolvePath("~/github/jmjava", home, cwd)

        val expected = File(home, "github/jmjava").normalize().absolutePath
        assertEquals(expected, resolved)
    }

    @Test
    fun `resolvePath leaves absolute path unchanged apart from normalization`() {
        // Contains a ".." segment so that normalization has something to collapse.
        val absolute = listOf("", "abs", "path", "..", "repo").joinToString(File.separator)

        val resolved = GuideProperties.resolvePath(absolute, home, cwd)

        val expected = File(absolute).normalize().absolutePath
        assertEquals(expected, resolved)
    }

    @Test
    fun `resolvePath resolves relative path against user dir`() {
        val resolved = GuideProperties.resolvePath("./embabel-projects", home, cwd)

        val expected = File(cwd, "embabel-projects").normalize().absolutePath
        assertEquals(expected, resolved)
    }

    @Test
    fun `resolvePath returns null when path is null`() {
        val resolved = GuideProperties.resolvePath(null, home, cwd)

        assertEquals(null, resolved)
    }

    @Test
    fun `resolvePath returns blank when path is blank`() {
        val resolved = GuideProperties.resolvePath(" ", home, cwd)

        assertEquals(" ", resolved)
    }
}

/**
 * Unit tests for `DataManagerController` backed by a Mockito mock of `DataManager`.
 * Each test checks that the controller returns what the manager returned and
 * that the corresponding manager method was invoked.
 */
class DataManagerControllerTest {

    private val dataManager = mock(DataManager::class.java)
    private val controller = DataManagerController(dataManager)

    @Test
    fun `getStats delegates to dataManager`() {
        val expected = ContentElementRepositoryInfoImpl(10, 3, 20, false, true)
        `when`(dataManager.getStats()).thenReturn(expected)

        val actual = controller.getStats()

        assertEquals(expected, actual)
        verify(dataManager).getStats()
    }

    @Test
    fun `loadReferences returns IngestionResult from dataManager`() {
        val loadedUrls = listOf("http://example.com")
        val ingestedDirectories = listOf("/dir")
        val expected = IngestionResult(
            loadedUrls,
            emptyList(),
            ingestedDirectories,
            emptyList(),
            emptyList(),
            Duration.ofSeconds(60)
        )
        `when`(dataManager.loadReferences()).thenReturn(expected)

        val actual = controller.loadReferences()

        assertEquals(expected, actual)
        assertEquals(1, actual.loadedUrls().size)
        assertEquals(1, actual.ingestedDirectories().size)
        verify(dataManager).loadReferences()
    }
}
/**
 * Integration test for `DataManager.loadReferences()` with `guide.directories` configured.
 *
 * The test properties clear `guide.urls` and point `guide.directories[0]` at the
 * sample repository under `src/test/resources`. Neo4j connection settings are
 * supplied by [Neo4jPropertiesInitializer]. The test checks both the returned
 * result and the document/chunk counts reported afterwards.
 */
@SpringBootTest
@ActiveProfiles("test")
@ContextConfiguration(initializers = [Neo4jPropertiesInitializer::class])
@ImportAutoConfiguration(exclude = [McpClientAutoConfiguration::class])
@TestPropertySource(
    properties = [
        "guide.urls=",
        "guide.directories[0]=./src/test/resources/sample-repo-for-ingestion"
    ]
)
class DataManagerLoadReferencesIntegrationTest {

    @Autowired
    private lateinit var dataManager: DataManager

    @Test
    fun `loadReferences ingests configured directory and returns structured result`() {
        val outcome = dataManager.loadReferences()

        // The returned result must describe a clean run.
        assertTrue(outcome.failedUrls().isEmpty(), "No URLs configured so none should fail")
        assertTrue(outcome.failedDirectories().isEmpty(), "Directory ingestion should succeed")
        assertTrue(outcome.ingestedDirectories().isNotEmpty(), "Should have ingested at least one directory")
        assertFalse(outcome.hasFailures(), "Should have no failures")
        assertTrue(outcome.elapsed().toMillis() >= 0, "Elapsed should be non-negative")

        // The counts reported by the manager must show that something was stored.
        val documents = dataManager.getDocumentCount()
        val chunks = dataManager.getChunkCount()
        assertTrue(
            documents >= 1,
            "Expected at least one document after ingesting sample-repo-for-ingestion; got documentCount=$documents"
        )
        assertTrue(
            chunks >= 1,
            "Expected at least one chunk after ingesting; got chunkCount=$chunks"
        )
    }
}

/**
 * Unit tests for `IngestionFailure`: the `fromException` factory, including its
 * fallback to the exception class name, and plain construction.
 */
class IngestionFailureTest {

    @Test
    fun `fromException uses exception message`() {
        val failure = IngestionFailure.fromException(
            "http://example.com",
            RuntimeException("Connection timed out")
        )

        assertEquals("http://example.com", failure.source())
        assertEquals("Connection timed out", failure.reason())
    }

    @Test
    fun `fromException falls back to class name when message is null`() {
        // A NullPointerException created without arguments carries no message.
        val failure = IngestionFailure.fromException("/some/dir", NullPointerException())

        assertEquals("/some/dir", failure.source())
        assertEquals("NullPointerException", failure.reason())
    }

    @Test
    fun `fromException falls back to class name when message is blank`() {
        val failure = IngestionFailure.fromException("/some/dir", RuntimeException(" "))

        assertEquals("/some/dir", failure.source())
        assertEquals("RuntimeException", failure.reason())
    }

    @Test
    fun `record fields are accessible`() {
        val failure = IngestionFailure("src", "reason")

        assertEquals("src", failure.source())
        assertEquals("reason", failure.reason())
    }
}
/**
 * Unit tests for the derived values of `IngestionResult`: `totalUrls()`,
 * `totalDirectories()`, `totalFailures()`, `hasFailures()`, and the stored
 * `elapsed()` duration.
 */
class IngestionResultTest {

    private fun failure(source: String, reason: String = "test error") =
        IngestionFailure(source, reason)

    /** Builds a result in which every component not named by the caller is empty / zero. */
    private fun resultOf(
        loadedUrls: List<String> = emptyList(),
        failedUrls: List<IngestionFailure> = emptyList(),
        ingestedDirectories: List<String> = emptyList(),
        failedDirectories: List<IngestionFailure> = emptyList(),
        failedDocuments: List<IngestionFailure> = emptyList(),
        elapsed: Duration = Duration.ZERO
    ) = IngestionResult(
        loadedUrls, failedUrls, ingestedDirectories, failedDirectories, failedDocuments, elapsed
    )

    @Test
    fun `totalUrls sums loaded and failed`() {
        val result = resultOf(loadedUrls = listOf("a", "b"), failedUrls = listOf(failure("c")))

        assertEquals(3, result.totalUrls())
    }

    @Test
    fun `totalDirectories sums ingested and failed`() {
        val result = resultOf(
            ingestedDirectories = listOf("d1"),
            failedDirectories = listOf(failure("d2"), failure("d3"))
        )

        assertEquals(3, result.totalDirectories())
    }

    @Test
    fun `hasFailures returns true when URLs failed`() {
        val result = resultOf(loadedUrls = listOf("ok"), failedUrls = listOf(failure("bad")))

        assertTrue(result.hasFailures())
    }

    @Test
    fun `hasFailures returns true when directories failed`() {
        val result = resultOf(failedDirectories = listOf(failure("bad-dir")))

        assertTrue(result.hasFailures())
    }

    @Test
    fun `hasFailures returns true when documents failed`() {
        val result = resultOf(
            ingestedDirectories = listOf("dir"),
            failedDocuments = listOf(failure("dir -> doc1"))
        )

        assertTrue(result.hasFailures())
    }

    @Test
    fun `hasFailures returns false when nothing failed`() {
        val result = resultOf(loadedUrls = listOf("ok"), ingestedDirectories = listOf("dir"))

        assertFalse(result.hasFailures())
    }

    @Test
    fun `empty result has zero totals and no failures`() {
        val result = resultOf()

        assertEquals(0, result.totalUrls())
        assertEquals(0, result.totalDirectories())
        assertEquals(0, result.totalFailures())
        assertFalse(result.hasFailures())
    }

    @Test
    fun `totalFailures counts all failure types`() {
        val result = resultOf(
            failedUrls = listOf(failure("u1"), failure("u2")),
            failedDirectories = listOf(failure("d1")),
            failedDocuments = listOf(failure("doc1"), failure("doc2"), failure("doc3"))
        )

        assertEquals(6, result.totalFailures())
    }

    @Test
    fun `elapsed duration is preserved`() {
        val fiveMinutes = Duration.ofMinutes(5)

        val result = resultOf(elapsed = fiveMinutes)

        assertEquals(fiveMinutes, result.elapsed())
    }
}

/**
 * Unit tests for `IngestionRunner` with a mocked `DataManager`.
 *
 * Most tests stub the manager, run the runner while `System.out` is redirected
 * to a buffer, and assert on fragments of the printed summary.
 */
class IngestionRunnerTest {

    private val dataManager = mock(DataManager::class.java)

    private fun failure(source: String, reason: String = "test error") =
        IngestionFailure(source, reason)

    /** Builds a result in which every component not named by the caller is empty. */
    private fun resultOf(
        elapsed: Duration,
        loadedUrls: List<String> = emptyList(),
        failedUrls: List<IngestionFailure> = emptyList(),
        ingestedDirectories: List<String> = emptyList(),
        failedDirectories: List<IngestionFailure> = emptyList(),
        failedDocuments: List<IngestionFailure> = emptyList()
    ) = IngestionResult(
        loadedUrls, failedUrls, ingestedDirectories, failedDirectories, failedDocuments, elapsed
    )

    /** Creates a runner and sets its private `serverPort` field by reflection. */
    private fun createRunner(port: Int = 1337): IngestionRunner {
        val runner = IngestionRunner(dataManager)
        IngestionRunner::class.java.getDeclaredField("serverPort").apply {
            isAccessible = true
            setInt(runner, port)
        }
        return runner
    }

    /** Stubs the two manager calls that `IngestionRunner.run` makes. */
    private fun stubManager(result: IngestionResult, stats: ContentElementRepositoryInfoImpl) {
        `when`(dataManager.loadReferences()).thenReturn(result)
        `when`(dataManager.getStats()).thenReturn(stats)
    }

    /** Stubs the manager, runs a fresh runner, and returns everything it printed to stdout. */
    private fun summaryFor(
        result: IngestionResult,
        stats: ContentElementRepositoryInfoImpl = ContentElementRepositoryInfoImpl(0, 0, 0, false, true),
        port: Int = 1337
    ): String {
        stubManager(result, stats)
        return captureStdout {
            createRunner(port).run(DefaultApplicationArguments())
        }
    }

    @Test
    fun `run calls loadReferences and getStats`() {
        stubManager(
            resultOf(Duration.ofSeconds(10), loadedUrls = listOf("http://example.com")),
            ContentElementRepositoryInfoImpl(5, 2, 10, false, true)
        )

        createRunner().run(DefaultApplicationArguments())

        verify(dataManager).loadReferences()
        verify(dataManager).getStats()
    }

    @Test
    fun `summary banner contains URL results`() {
        val output = summaryFor(
            resultOf(
                Duration.ofSeconds(30),
                loadedUrls = listOf("http://loaded.com"),
                failedUrls = listOf(failure("http://failed.com", "Connection refused"))
            )
        )

        assertTrue(output.contains("INGESTION COMPLETE"), "Should contain banner")
        assertTrue(output.contains("http://loaded.com"), "Should list loaded URL")
        assertTrue(output.contains("http://failed.com"), "Should list failed URL")
        assertTrue(output.contains("Connection refused"), "Should show failure reason")
        assertTrue(output.contains("1/2 loaded"), "Should show URL counts")
    }

    @Test
    fun `summary banner contains directory results`() {
        val output = summaryFor(
            resultOf(
                Duration.ofMinutes(5),
                ingestedDirectories = listOf("/home/user/repo"),
                failedDirectories = listOf(failure("/home/user/bad", "No such directory"))
            ),
            ContentElementRepositoryInfoImpl(100, 10, 200, false, true)
        )

        assertTrue(output.contains("/home/user/repo"), "Should list ingested dir")
        assertTrue(output.contains("/home/user/bad"), "Should list failed dir")
        assertTrue(output.contains("No such directory"), "Should show dir failure reason")
        assertTrue(output.contains("1/2 ingested"), "Should show dir counts")
    }

    @Test
    fun `summary shows document failures`() {
        val output = summaryFor(
            resultOf(
                Duration.ofSeconds(20),
                ingestedDirectories = listOf("/home/user/repo"),
                failedDocuments = listOf(
                    failure("/home/user/repo -> README.md", "Parse error: invalid encoding")
                )
            ),
            ContentElementRepositoryInfoImpl(50, 5, 100, false, true)
        )

        assertTrue(output.contains("Document Failures (1)"), "Should show document failure section")
        assertTrue(output.contains("README.md"), "Should show failed document")
        assertTrue(output.contains("Parse error: invalid encoding"), "Should show doc failure reason")
    }

    @Test
    fun `summary shows no directories configured when empty`() {
        val output = summaryFor(resultOf(Duration.ofSeconds(1)))

        assertTrue(output.contains("none configured"), "Should say no directories")
    }

    @Test
    fun `summary shows RAG store stats`() {
        val output = summaryFor(
            resultOf(Duration.ZERO),
            ContentElementRepositoryInfoImpl(42, 7, 100, false, true)
        )

        assertTrue(output.contains("Documents: 7"), "Should show document count")
        assertTrue(output.contains("Chunks: 42"), "Should show chunk count")
        assertTrue(output.contains("Elements: 100"), "Should show element count")
    }

    @Test
    fun `summary shows port and MCP endpoint`() {
        val output = summaryFor(resultOf(Duration.ZERO), port = 9999)

        assertTrue(output.contains("port 9999"), "Should show port")
        assertTrue(output.contains("http://localhost:9999/sse"), "Should show MCP endpoint")
    }

    @Test
    fun `formatDuration shows seconds for short durations`() {
        val output = summaryFor(resultOf(Duration.ofSeconds(45)))

        assertTrue(output.contains("45s"), "Should format as seconds")
    }

    @Test
    fun `formatDuration shows minutes and seconds for longer durations`() {
        val output = summaryFor(resultOf(Duration.ofSeconds(125)))

        assertTrue(output.contains("2m 5s"), "Should format as minutes + seconds")
    }

    /**
     * Redirects `System.out` to an in-memory buffer while [block] runs and returns
     * what was written. The previous stream is restored even if [block] throws.
     */
    private fun captureStdout(block: () -> Unit): String {
        val previous = System.out
        val buffer = ByteArrayOutputStream()
        System.setOut(PrintStream(buffer))
        try {
            block()
        } finally {
            System.setOut(previous)
        }
        return buffer.toString()
    }
}