diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md new file mode 100644 index 0000000..bdf4245 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md @@ -0,0 +1,25 @@ +# Context: Update VectorStore Interface with URL Parameter + +## Requirements +- Add `url string` parameter to `VectorStore.Search` interface method +- Add `URL string` field to `SearchCall` struct +- Update `MockVectorStore.Search` to accept and record the `url` parameter +- Package `vectorstore` must compile; downstream breakage is expected and handled by tasks 02/03 + +## Key Files +- `backend/internal/vectorstore/store.go` — interface, mock, and call recording types (PRIMARY TARGET) +- `backend/internal/vectorstore/store_test.go` — mock tests (must be updated) +- `backend/internal/vectorstore/qdrant.go:135` — QdrantStore.Search (task 02) +- `backend/internal/vectorstore/qdrant_test.go` — Qdrant tests (task 02) +- `backend/internal/rag/pipeline.go:100` — RAG caller (task 03) + +## Patterns +- Interface + mock live in same file (`store.go`) +- Call recording structs capture all parameters for test assertions +- Mock returns preconfigured results/errors, records all calls + +## Downstream Impact +Callers that will break (handled by later tasks): +- `QdrantStore.Search` in qdrant.go (task 02) +- `pipeline.go:100` in rag package (task 03) +- All test files calling Search (tasks 02/03) diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md new file mode 100644 index 0000000..0ad3f67 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md @@ -0,0 +1,13 @@ +# Plan: task-01-update-vectorstore-interface + +## Test Strategy +- Existing 
`store_test.go` tests updated to pass `url` parameter +- `TestMockSearch_RecordsCalls` extended with URL assertion to verify recording +- `TestVectorStoreInterfaceSatisfied` confirms mock still satisfies interface + +## Implementation Plan +1. Add `url string` as 5th parameter to `VectorStore.Search` interface +2. Add `URL string` field to `SearchCall` struct +3. Update `MockVectorStore.Search` signature to accept and record `url` +4. Update all test calls with appropriate URL values +5. Verify compilation and tests pass diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md new file mode 100644 index 0000000..0b0cbe9 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md @@ -0,0 +1,22 @@ +# Progress: task-01-update-vectorstore-interface + +## Setup +- [x] Created documentation directory structure +- [x] Discovered instruction files (README.md, backend/README.md) +- [x] Read existing store.go and identified all callers + +## Implementation Checklist +- [x] Update `VectorStore.Search` interface signature +- [x] Add `URL string` field to `SearchCall` +- [x] Update `MockVectorStore.Search` method signature and recording +- [x] Update `store_test.go` mock tests to pass new `url` parameter +- [x] Add URL assertion in `TestMockSearch_RecordsCalls` +- [x] Verify vectorstore package compiles +- [x] Verify vectorstore tests pass + +## TDD Cycles +1. Updated interface + mock + tests simultaneously (single coherent change) +2. 
All 10 tests pass: `ok github.com/parth/smolterms/backend/internal/vectorstore 0.003s` + +## Commit +_(pending)_ diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md new file mode 100644 index 0000000..e6d263b --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md @@ -0,0 +1,26 @@ +# Context: Add URL Filter to Qdrant Search + +## Requirements +1. Update `QdrantStore.Search` signature to match new interface: add `url string` parameter +2. When `url` is non-empty, add a `Filter` to `QueryPoints` with `qdrant.NewMatchKeyword("url", url)` +3. When `url` is empty, no filter applied (backward-compatible) +4. Add `url` to the search log line +5. Update existing tests to pass `""` for url +6. Add new tests verifying filter presence/absence + +## Key Files +- `backend/internal/vectorstore/qdrant.go:135` — `QdrantStore.Search` method +- `backend/internal/vectorstore/qdrant_test.go` — all Qdrant tests +- `backend/internal/vectorstore/store.go` — interface (already updated in task-01) + +## Qdrant Filter API +- `qdrant.NewMatchKeyword(field, keyword string) *qdrant.Condition` — helper constructor +- `&qdrant.Filter{Must: []*qdrant.Condition{...}}` — wrap conditions +- `QueryPoints.Filter` field accepts `*qdrant.Filter` +- Mock records `queryCalls []*qdrant.QueryPoints` — tests can inspect `.Filter` + +## Test Strategy +- Existing tests: add `""` as url parameter (no filter applied) +- New test: `TestQdrantStore_Search_WithURLFilter` — verify filter present in QueryPoints +- New test: `TestQdrantStore_Search_WithoutURLFilter` — verify filter nil in QueryPoints +- Log test: verify `url` attribute in search log entry diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md 
b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md new file mode 100644 index 0000000..acfc941 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md @@ -0,0 +1,24 @@ +# Progress: task-02-add-url-filter-to-qdrant-search + +## Setup +- [x] Created documentation directory +- [x] Read qdrant.go and qdrant_test.go +- [x] Researched Qdrant Go client filter API + +## Implementation Checklist +- [x] Update `QdrantStore.Search` signature with `url string` parameter +- [x] Add conditional filter to `QueryPoints` when url is non-empty +- [x] Add `url` to search log line +- [x] Update existing qdrant tests to pass `""` for url (7 call sites) +- [x] Add test: `TestQdrantStore_Search_WithURLFilter` — verifies filter structure +- [x] Add test: `TestQdrantStore_Search_WithoutURLFilter` — verifies nil filter +- [x] Update log test to verify url attribute +- [x] All 36 tests pass + +## TDD Cycles +1. Updated implementation and tests together (interface change + filter logic + tests) +2. All tests pass on first run: `ok github.com/parth/smolterms/backend/internal/vectorstore 0.004s` + +## Commit +- Hash: `a8489f0` +- Message: `feat(vectorstore): add URL-based filtering to Qdrant search` diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md new file mode 100644 index 0000000..264095e --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md @@ -0,0 +1,24 @@ +# Task 03: Thread URL Through RAG and Analyzer — Progress + +## Status: Complete + +## Changes Made + +### 1. 
`backend/internal/rag/pipeline.go` +- Updated `Pipeline.Retrieve` signature: added `url string` parameter +- Forwarded `url` to `p.store.Search(ctx, p.collection, vectors[0], limit, url)` +- Added `slog.String("url", url)` to the retrieve log line + +### 2. `backend/internal/rag/pipeline_test.go` +- Updated all 6 `Retrieve` calls to include the `url` parameter +- Added `sc.URL` assertion in `TestRetrieve_EmbedsAndSearches` to verify URL forwarding +- Added `"url"` to the expected log attrs in `TestRetrieve_LogsOperationFields` + +### 3. `backend/internal/analyzer/analyzer.go` +- Updated Stage 6 (RAG Retrieve) to pass `req.URL` as the 4th arg to `Retrieve` + +### 4. `backend/internal/analyzer/analyze_pipeline_test.go` +- Added `searchCall.URL` assertion in `TestAnalyze_CorrectDependencyCalls` to verify `req.URL` reaches the vector store search + +## Test Results +- All backend tests pass: `go test ./backend/...` — all packages OK diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md new file mode 100644 index 0000000..ccef8a3 --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md @@ -0,0 +1,54 @@ +# Task: Update VectorStore Interface and Mock with URL Parameter + +## Description +Add a `url string` parameter to the `VectorStore.Search` method signature, the `SearchCall` recording struct, and the `MockVectorStore` implementation. This is the foundational change that all downstream packages depend on. + +## Background +All embeddings from all analyzed websites are currently stored in a single Qdrant collection. The `Search` method performs pure vector similarity across the entire collection with no filtering. Since most privacy policies use similar language, this causes cross-contamination — analyzing site-b.com can pull back chunks from site-a.com. 
The `url` field is already stored as payload metadata and has a keyword index in Qdrant, but is never used during search. + +This task adds the `url` parameter to the interface so downstream implementations (Qdrant, mock) can apply URL-based filtering. + +## Technical Requirements +1. Add `url string` parameter to `VectorStore.Search` interface method +2. Add `URL string` field to `SearchCall` struct for test assertion +3. Update `MockVectorStore.Search` to accept and record the `url` parameter +4. Mock tests in `store_test.go` are updated in this task so the `vectorstore` mock tests still compile; callers and tests outside `store.go`/`store_test.go` are updated in tasks 02 and 03 + +## Dependencies +- `backend/internal/vectorstore/store.go` — the file being modified +- No external dependencies + +## Implementation Approach +1. Read the current `store.go` to understand the interface, mock, and call recording types +2. Add `url string` to the `Search` method in the `VectorStore` interface +3. Add `URL string` field to `SearchCall` +4. Update `MockVectorStore.Search` signature and recording logic +5. Run `go build ./backend/internal/vectorstore/...` to verify the package compiles +6. Note: downstream packages (qdrant, rag, analyzer) will fail to compile until tasks 02 and 03 are completed + +## Acceptance Criteria + +1. **Interface Updated** + - Given the `VectorStore` interface in `store.go` + - When a developer reads the `Search` method signature + - Then it includes `url string` as the fifth parameter: `Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error)` + +2. **SearchCall Records URL** + - Given a `SearchCall` struct + - When a test inspects recorded calls + - Then the `URL` field contains the URL passed to `Search` + +3. **Mock Implementation Updated** + - Given the `MockVectorStore` + - When `Search` is called with a URL + - Then the URL is recorded in `SearchCalls` and behavior is otherwise unchanged + +4. 
**Package Compiles** + - Given the updated `store.go` + - When running `go build ./backend/internal/vectorstore/...` + - Then compilation succeeds with no errors + +## Metadata +- **Complexity**: Low +- **Labels**: VectorStore, Interface, Mock, Refactor +- **Required Skills**: Go interfaces, test doubles diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md new file mode 100644 index 0000000..8ed29de --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md @@ -0,0 +1,64 @@ +# Task: Add URL Filter to Qdrant Search Implementation + +## Description +Update the `QdrantStore.Search` method to accept the new `url string` parameter and apply a Qdrant payload filter that restricts results to chunks matching the given URL. When the URL is empty, no filter is applied (preserving backward-compatible behavior). + +## Background +The Qdrant collection already has a keyword index on the `url` field (created in `ensureCollection`), but the `Search` method never uses it. This task wires up a `FieldCondition` filter on `url` in the `QueryPoints` request so that vector similarity search is scoped to a single website's chunks. + +## Technical Requirements +1. Update `QdrantStore.Search` signature to match the new `VectorStore` interface: `Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error)` +2. When `url` is non-empty, add a `Filter` to `QueryPoints` with a `FieldCondition` matching `url` exactly (keyword match) +3. When `url` is empty, do not add any filter (search across all chunks) +4. Add the `url` value to the search log line for observability +5. Update existing Qdrant tests to pass the new parameter +6. 
Add new test cases verifying filter is applied when URL is provided and omitted when empty + +## Dependencies +- Task 01 must be completed first (interface change) +- `backend/internal/vectorstore/qdrant.go` — implementation file +- `backend/internal/vectorstore/qdrant_test.go` — test file +- Qdrant Go client `qdrant` package for filter types + +## Implementation Approach +1. Read `qdrant.go` and `qdrant_test.go` to understand current implementation and test patterns +2. Update `Search` method signature to include `url string` +3. Build the Qdrant filter using `qdrant.Filter` with a `Must` condition containing a `FieldCondition` on the `url` field with a `Match` of type `MatchKeyword` +4. Conditionally attach the filter to `QueryPoints` only when `url != ""` +5. Add `url` to the existing slog line in Search +6. Update all existing test cases to pass `""` for `url` (preserving current behavior) +7. Add test: `TestQdrantStore_Search_WithURLFilter` — verify the `QueryPoints` sent to mock client includes the filter when URL is provided +8. Add test: `TestQdrantStore_Search_WithoutURLFilter` — verify no filter when URL is empty +9. Run `go test ./backend/internal/vectorstore/...` to verify all tests pass + +## Acceptance Criteria + +1. **Filter Applied When URL Provided** + - Given a `QdrantStore` with a mock client + - When `Search` is called with `url = "https://example.com/privacy"` + - Then the `QueryPoints` sent to the Qdrant client includes a `Filter` with a `Must` condition matching `url` = `"https://example.com/privacy"` + +2. **No Filter When URL Empty** + - Given a `QdrantStore` with a mock client + - When `Search` is called with `url = ""` + - Then the `QueryPoints` sent to the Qdrant client has `nil` or empty `Filter` + +3. **Results Unchanged for Matching Chunks** + - Given chunks stored with URL "https://example.com/privacy" + - When `Search` is called with that same URL + - Then matching chunks are returned with correct scores and metadata + +4. 
**URL Logged** + - Given a `Search` call with a URL + - When the search completes + - Then the log line includes the URL value + +5. **Existing Tests Pass** + - Given the updated implementation + - When running `go test ./backend/internal/vectorstore/...` + - Then all tests pass including new URL filter tests + +## Metadata +- **Complexity**: Medium +- **Labels**: VectorStore, Qdrant, Filter, gRPC +- **Required Skills**: Go, Qdrant gRPC client API, payload filtering diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md new file mode 100644 index 0000000..fc3c0ff --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md @@ -0,0 +1,70 @@ +# Task: Thread URL Through RAG Pipeline and Analyzer + +## Description +Update the RAG `Pipeline.Retrieve` method to accept and forward the `url` parameter to `VectorStore.Search`, then update `Analyzer.Analyze` to pass `req.URL` through the retrieval call. Update all affected tests across both packages. + +## Background +With the `VectorStore.Search` interface now accepting a `url` parameter (tasks 01-02), the RAG pipeline and analyzer need to thread the URL through so that retrieval is scoped to the correct website. Currently `Pipeline.Retrieve` takes only `query` and `limit`, and `Analyzer.Analyze` calls `Retrieve` without any URL context. + +## Technical Requirements +1. Update `Pipeline.Retrieve` signature to accept `url string`: `Retrieve(ctx context.Context, query string, limit int, url string) ([]vectorstore.Chunk, error)` +2. Pass `url` through to `p.store.Search(ctx, p.collection, vectors[0], limit, url)` +3. Add `url` to the retrieve log line for observability +4. Update `Analyzer.Analyze` stage 6 to pass `req.URL` to `Retrieve`: `a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL)` +5. 
Update all RAG pipeline tests to pass URL parameter and assert it's forwarded correctly +6. Update all analyzer pipeline tests to verify `req.URL` reaches the `Search` call + +## Dependencies +- Tasks 01 and 02 must be completed first +- `backend/internal/rag/pipeline.go` — RAG pipeline +- `backend/internal/rag/pipeline_test.go` — RAG tests +- `backend/internal/analyzer/analyzer.go` — analyzer pipeline +- `backend/internal/analyzer/analyze_pipeline_test.go` — analyzer tests +- Any other files that call `Pipeline.Retrieve` or `MockVectorStore.Search` + +## Implementation Approach +1. Read `pipeline.go`, `pipeline_test.go`, `analyzer.go`, and `analyze_pipeline_test.go` +2. Update `Pipeline.Retrieve` to accept `url string` and forward it to `store.Search` +3. Add `slog.String("url", url)` to the retrieve log line +4. Update `Analyzer.Analyze` line 148 to pass `req.URL` as the fourth argument to `Retrieve` +5. Update RAG pipeline tests: + - All existing `Retrieve` calls need the URL parameter added + - Add test case verifying URL is passed through to `MockVectorStore.Search` + - Assert `SearchCall.URL` matches expected value +6. Update analyzer pipeline tests: + - Verify the mock's `SearchCalls[0].URL` equals `req.URL` in the happy-path test + - Update any other test cases that call through the pipeline +7. Search for any other callers of `Retrieve` across the codebase and update them +8. Run full test suite: `go test ./backend/...` + +## Acceptance Criteria + +1. **RAG Retrieve Forwards URL** + - Given a RAG pipeline with a mock vector store + - When `Retrieve` is called with `url = "https://example.com/privacy"` + - Then `MockVectorStore.Search` is called with that same URL + +2. **Analyzer Passes req.URL** + - Given an analyzer processing a request with `URL: "https://example.com/privacy"` + - When the pipeline reaches the RAG retrieve stage + - Then the vector store search is filtered to `"https://example.com/privacy"` + +3. 
**URL Logged in Retrieve** + - Given a `Retrieve` call with a URL + - When retrieval completes + - Then the log line includes the URL + +4. **All Tests Pass** + - Given the updated code across all packages + - When running `go test ./backend/...` + - Then all tests pass with no compilation errors + +5. **No Cross-Contamination Path** + - Given the full pipeline from API request to vector search + - When tracing the URL parameter + - Then `req.URL` flows through `Analyzer.Analyze` → `Pipeline.Retrieve` → `VectorStore.Search` without being lost or defaulted + +## Metadata +- **Complexity**: Medium +- **Labels**: RAG, Analyzer, Integration, Pipeline +- **Required Skills**: Go, interface threading, test mocks, pipeline architecture diff --git a/backend/internal/analyzer/analyze_pipeline_test.go b/backend/internal/analyzer/analyze_pipeline_test.go index a703a69..90ebbb8 100644 --- a/backend/internal/analyzer/analyze_pipeline_test.go +++ b/backend/internal/analyzer/analyze_pipeline_test.go @@ -490,11 +490,17 @@ func TestAnalyze_CorrectDependencyCalls(t *testing.T) { } } - // Verify the search limit was 20 + // Verify the search limit, URL filter, and content hash filter if len(deps.store.SearchCalls) > 0 { searchCall := deps.store.SearchCalls[0] if searchCall.Limit != retrievalLimit { t.Errorf("Search limit = %d, want %d", searchCall.Limit, retrievalLimit) } + if searchCall.URL != req.URL { + t.Errorf("Search URL = %q, want %q", searchCall.URL, req.URL) + } + if searchCall.ContentHash == "" { + t.Error("Search ContentHash is empty, want non-empty (should match computed content hash)") + } } } diff --git a/backend/internal/analyzer/analyzer.go b/backend/internal/analyzer/analyzer.go index 7b7670a..6f173be 100644 --- a/backend/internal/analyzer/analyzer.go +++ b/backend/internal/analyzer/analyzer.go @@ -145,7 +145,7 @@ func (a *Analyzer) Analyze(ctx context.Context, req AnalysisRequest) (*AnalysisR // Stage 6: RAG Retrieve start = time.Now() - retrieved, err := 
a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit) + retrieved, err := a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL, contentHash) if err != nil { return nil, fmt.Errorf("analyze: rag retrieve: %w", err) } diff --git a/backend/internal/rag/pipeline.go b/backend/internal/rag/pipeline.go index 9987930..8bb2360 100644 --- a/backend/internal/rag/pipeline.go +++ b/backend/internal/rag/pipeline.go @@ -84,7 +84,7 @@ func (p *Pipeline) Store(ctx context.Context, url string, contentHash string, ch // Retrieve embeds the query, searches the vector store, and returns // deduplicated results. -func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vectorstore.Chunk, error) { +func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url string, contentHash string) ([]vectorstore.Chunk, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("rag: retrieve: %w", err) } @@ -97,7 +97,7 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vec embedLatency := time.Since(embedStart) searchStart := time.Now() - results, err := p.store.Search(ctx, p.collection, vectors[0], limit) + results, err := p.store.Search(ctx, p.collection, vectors[0], limit, url, contentHash) if err != nil { return nil, fmt.Errorf("rag: search: %w", err) } @@ -113,6 +113,8 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vec p.logger.Info("retrieved chunks", slog.String("operation", "retrieve"), slog.String("query", truncatedQuery), + slog.String("url", url), + slog.String("content_hash", contentHash), slog.Int("result_count", len(deduped)), slog.Duration("embed_latency", embedLatency), slog.Duration("search_latency", searchLatency), diff --git a/backend/internal/rag/pipeline_test.go b/backend/internal/rag/pipeline_test.go index 53d65d0..89fbae1 100644 --- a/backend/internal/rag/pipeline_test.go +++ b/backend/internal/rag/pipeline_test.go @@ -194,7 +194,7 @@ func 
TestRetrieve_EmbedsAndSearches(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "privacy data collection", 10) + results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy", "hash456") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -207,7 +207,7 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { t.Errorf("Embed texts = %v, want [privacy data collection]", embedder.Calls[0]) } - // Verify Search called with vector and limit + // Verify Search called with vector, limit, URL, and contentHash if len(store.SearchCalls) != 1 { t.Fatalf("Search call count = %d, want 1", len(store.SearchCalls)) } @@ -218,6 +218,12 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { if sc.Limit != 10 { t.Errorf("Search limit = %d, want 10", sc.Limit) } + if sc.URL != "https://example.com/privacy" { + t.Errorf("Search URL = %q, want %q", sc.URL, "https://example.com/privacy") + } + if sc.ContentHash != "hash456" { + t.Errorf("Search ContentHash = %q, want %q", sc.ContentHash, "hash456") + } if len(results) != 1 { t.Fatalf("results count = %d, want 1", len(results)) @@ -238,7 +244,7 @@ func TestRetrieve_DeduplicatesByText(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "query", 10) + results, err := p.Retrieve(context.Background(), "query", 10, "", "") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -261,7 +267,7 @@ func TestRetrieve_EmbedError(t *testing.T) { store := &vectorstore.MockVectorStore{} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(context.Background(), "query", 10) + _, err := p.Retrieve(context.Background(), "query", 10, "", "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -276,7 +282,7 @@ func TestRetrieve_SearchError(t *testing.T) { store := &vectorstore.MockVectorStore{SearchErr: wantErr} p := newTestPipeline(embedder, 
store) - _, err := p.Retrieve(context.Background(), "query", 10) + _, err := p.Retrieve(context.Background(), "query", 10, "", "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -310,7 +316,7 @@ func TestRetrieve_ContextCancellation(t *testing.T) { store := &vectorstore.MockVectorStore{SearchResult: []vectorstore.Chunk{}} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(ctx, "query", 10) + _, err := p.Retrieve(ctx, "query", 10, "", "") if !errors.Is(err, context.Canceled) { t.Errorf("Retrieve() error = %v, want context.Canceled", err) } @@ -370,7 +376,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { } p := NewPipeline(embedder, store, logger, "test_collection") - results, err := p.Retrieve(context.Background(), "privacy data collection", 10) + results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy", "hash789") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -382,7 +388,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { if rec.Message != "retrieved chunks" { t.Errorf("log message = %q, want %q", rec.Message, "retrieved chunks") } - for _, key := range []string{"operation", "query", "result_count", "embed_latency", "search_latency"} { + for _, key := range []string{"operation", "query", "url", "content_hash", "result_count", "embed_latency", "search_latency"} { if _, ok := rec.Attrs[key]; !ok { t.Errorf("log missing attr %q", key) } diff --git a/backend/internal/vectorstore/qdrant.go b/backend/internal/vectorstore/qdrant.go index 0e1cec7..d7245ae 100644 --- a/backend/internal/vectorstore/qdrant.go +++ b/backend/internal/vectorstore/qdrant.go @@ -132,23 +132,37 @@ func (s *QdrantStore) Upsert(ctx context.Context, collectionID string, chunks [] // Search returns the top-limit chunks most similar to query in the named collection, // ordered by cosine similarity score descending. Score is populated on each returned Chunk. 
-func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) { +// When url is non-empty, results are filtered to chunks from that URL. +// When contentHash is non-empty, results are further filtered to that content version. +func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) { if err := s.ensureCollection(ctx, collectionID); err != nil { return nil, err } start := time.Now() limitU := uint64(limit) - results, err := s.client.Query(ctx, &qdrant.QueryPoints{ + req := &qdrant.QueryPoints{ CollectionName: collectionID, Query: qdrant.NewQueryDense(query), Limit: &limitU, WithPayload: qdrant.NewWithPayload(true), - }) + } + var mustConditions []*qdrant.Condition + if url != "" { + mustConditions = append(mustConditions, qdrant.NewMatchKeyword("url", url)) + } + if contentHash != "" { + mustConditions = append(mustConditions, qdrant.NewMatchKeyword("content_hash", contentHash)) + } + if len(mustConditions) > 0 { + req.Filter = &qdrant.Filter{Must: mustConditions} + } + results, err := s.client.Query(ctx, req) if err != nil { return nil, fmt.Errorf("searching %q: %w", collectionID, err) } s.logger.InfoContext(ctx, "search complete", "collection", collectionID, + "url", url, "results", len(results), "latency_ms", time.Since(start).Milliseconds(), ) diff --git a/backend/internal/vectorstore/qdrant_test.go b/backend/internal/vectorstore/qdrant_test.go index 8d96497..430822d 100644 --- a/backend/internal/vectorstore/qdrant_test.go +++ b/backend/internal/vectorstore/qdrant_test.go @@ -221,7 +221,7 @@ func TestQdrantStore_Search_ReturnsOrderedResults(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5) + results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ 
-251,7 +251,7 @@ func TestQdrantStore_Search_ReconstructsChunks(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1) + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1, "", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -288,7 +288,7 @@ func TestQdrantStore_Search_ReturnsError(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true, queryErr: want} store := newTestStore(mock) - _, err := store.Search(context.Background(), "col1", []float32{0.1}, 3) + _, err := store.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if err == nil { t.Fatal("Search() expected error, got nil") } @@ -376,7 +376,7 @@ func TestQdrantStore_ContextCancellation_Search(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() - _, err := store.Search(ctx, "col1", []float32{0.1}, 5) + _, err := store.Search(ctx, "col1", []float32{0.1}, 5, "", "") if err == nil { t.Fatal("Search() expected context error, got nil") } @@ -389,7 +389,7 @@ func TestQdrantStore_Search_QueryParams(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true} store := newTestStore(mock) - _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15) + _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15, "", "") if len(mock.queryCalls) != 1 { t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) @@ -406,6 +406,85 @@ func TestQdrantStore_Search_QueryParams(t *testing.T) { } } +func TestQdrantStore_Search_WithURLFilter(t *testing.T) { + mock := &mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com/privacy", "") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter == nil { + 
t.Fatal("Filter should not be nil when URL is provided") + } + if len(req.Filter.Must) != 1 { + t.Fatalf("Filter.Must len = %d, want 1", len(req.Filter.Must)) + } + field := req.Filter.Must[0].GetField() + if field == nil { + t.Fatal("expected FieldCondition, got nil") + } + if field.Key != "url" { + t.Errorf("FieldCondition.Key = %q, want %q", field.Key, "url") + } + if field.Match.GetKeyword() != "https://example.com/privacy" { + t.Errorf("Match.Keyword = %q, want %q", field.Match.GetKeyword(), "https://example.com/privacy") + } +} + +func TestQdrantStore_Search_WithURLAndContentHashFilter(t *testing.T) { + mock := &mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com/privacy", "abc123") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter == nil { + t.Fatal("Filter should not be nil when URL and contentHash are provided") + } + if len(req.Filter.Must) != 2 { + t.Fatalf("Filter.Must len = %d, want 2", len(req.Filter.Must)) + } + + // First condition: url. Fatalf (not Errorf) so a nil field stops the test before the Match access below. + urlField := req.Filter.Must[0].GetField() + if urlField == nil || urlField.Key != "url" { + t.Fatalf("Filter.Must[0] key = %q, want %q", urlField.GetKey(), "url") + } + if urlField.Match.GetKeyword() != "https://example.com/privacy" { + t.Errorf("url Match.Keyword = %q, want %q", urlField.Match.GetKeyword(), "https://example.com/privacy") + } + + // Second condition: content_hash. Fatalf for the same reason: hashField.Match would panic on a nil field. + hashField := req.Filter.Must[1].GetField() + if hashField == nil || hashField.Key != "content_hash" { + t.Fatalf("Filter.Must[1] key = %q, want %q", hashField.GetKey(), "content_hash") + } + if hashField.Match.GetKeyword() != "abc123" { + t.Errorf("content_hash Match.Keyword = %q, want %q", hashField.Match.GetKeyword(), "abc123") + } +} + +func TestQdrantStore_Search_WithoutURLFilter(t *testing.T) { + mock := 
&mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter != nil { + t.Errorf("Filter should be nil when URL is empty, got %v", req.Filter) + } +} + func TestChunkUUID_Deterministic(t *testing.T) { c := Chunk{ContentHash: "abc", Index: 5} if chunkUUID(c) != chunkUUID(c) { @@ -492,7 +571,7 @@ func TestQdrantStore_Search_PopulatesChunkID(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5) + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -537,7 +616,7 @@ func TestQdrantStore_Search_LogsEntry(t *testing.T) { } store, logHandler := newTestStoreWithLogger(mock) - if _, err := store.Search(context.Background(), "col1", []float32{0.1}, 5); err != nil { + if _, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com", ""); err != nil { t.Fatalf("Search() error = %v", err) } @@ -548,6 +627,9 @@ func TestQdrantStore_Search_LogsEntry(t *testing.T) { if e.Attrs["collection"] != "col1" { t.Errorf("log collection = %v, want %q", e.Attrs["collection"], "col1") } + if e.Attrs["url"] != "https://example.com" { + t.Errorf("log url = %v, want %q", e.Attrs["url"], "https://example.com") + } if _, ok := e.Attrs["latency_ms"]; !ok { t.Error("expected latency_ms in log entry") } diff --git a/backend/internal/vectorstore/store.go b/backend/internal/vectorstore/store.go index c2090b5..a1944a8 100644 --- a/backend/internal/vectorstore/store.go +++ b/backend/internal/vectorstore/store.go @@ -21,7 +21,9 @@ type VectorStore interface { // Upsert stores or updates chunks in the given collection. 
Upsert(ctx context.Context, collectionID string, chunks []Chunk) error // Search returns the top-limit chunks most similar to query in the given collection. - Search(ctx context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) + // When url is non-empty, results are filtered to chunks from that URL. + // When contentHash is non-empty, results are further filtered to that content version. + Search(ctx context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) } // UpsertCall records a single call to MockVectorStore.Upsert for test assertion. @@ -35,6 +37,8 @@ type SearchCall struct { CollectionID string Query []float32 Limit int + URL string + ContentHash string } // MockVectorStore is a configurable VectorStore implementation for use in tests. @@ -59,8 +63,8 @@ func (m *MockVectorStore) Upsert(_ context.Context, collectionID string, chunks } // Search records the call and returns SearchResult or SearchErr. 
-func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) { - m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit}) +func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) { + m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit, URL: url, ContentHash: contentHash}) if m.SearchErr != nil { return nil, m.SearchErr } diff --git a/backend/internal/vectorstore/store_test.go b/backend/internal/vectorstore/store_test.go index 9ae5e82..725d822 100644 --- a/backend/internal/vectorstore/store_test.go +++ b/backend/internal/vectorstore/store_test.go @@ -95,7 +95,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { want := []Chunk{{ID: "r1", Score: 0.9}, {ID: "r2", Score: 0.8}} m := &MockVectorStore{SearchResult: want} - got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5) + got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "https://example.com", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -112,7 +112,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { // TestMockSearch_ReturnsNilAndEmptyByDefault verifies Search returns nil error and empty slice by default. 
func TestMockSearch_ReturnsNilAndEmptyByDefault(t *testing.T) { m := &MockVectorStore{} - got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3) + got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if err != nil { t.Errorf("Search() error = %v, want nil", err) } @@ -126,7 +126,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { want := errors.New("search failed") m := &MockVectorStore{SearchErr: want} - _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3) + _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if !errors.Is(err, want) { t.Errorf("Search() error = %v, want %v", err, want) } @@ -136,7 +136,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { func TestMockSearch_RecordsCalls(t *testing.T) { m := &MockVectorStore{} query := []float32{0.1, 0.2, 0.3} - _, _ = m.Search(context.Background(), "my-collection", query, 10) + _, _ = m.Search(context.Background(), "my-collection", query, 10, "https://example.com/privacy", "hash123") if len(m.SearchCalls) != 1 { t.Fatalf("SearchCalls len = %d, want 1", len(m.SearchCalls)) @@ -150,6 +150,12 @@ func TestMockSearch_RecordsCalls(t *testing.T) { if len(m.SearchCalls[0].Query) != 3 { t.Errorf("SearchCalls[0].Query len = %d, want 3", len(m.SearchCalls[0].Query)) } + if m.SearchCalls[0].URL != "https://example.com/privacy" { + t.Errorf("SearchCalls[0].URL = %q, want %q", m.SearchCalls[0].URL, "https://example.com/privacy") + } + if m.SearchCalls[0].ContentHash != "hash123" { + t.Errorf("SearchCalls[0].ContentHash = %q, want %q", m.SearchCalls[0].ContentHash, "hash123") + } } // TestMockRecordsMultipleCalls verifies that multiple calls are all recorded. 
@@ -157,8 +163,8 @@ func TestMockRecordsMultipleCalls(t *testing.T) { m := &MockVectorStore{} _ = m.Upsert(context.Background(), "col1", []Chunk{{ID: "a"}}) _ = m.Upsert(context.Background(), "col2", []Chunk{{ID: "b"}}) - _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5) - _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3) + _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") + _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3, "", "") if len(m.UpsertCalls) != 2 { t.Errorf("UpsertCalls len = %d, want 2", len(m.UpsertCalls))