From 05bda1abecace301bebc72e012e0adce06ecd70b Mon Sep 17 00:00:00 2001 From: Parth576 Date: Sun, 1 Mar 2026 20:49:03 -0500 Subject: [PATCH 1/5] add step17 --- ...-update-vectorstore-interface.code-task.md | 54 ++++++++++++++ ...d-url-filter-to-qdrant-search.code-task.md | 64 +++++++++++++++++ ...-url-through-rag-and-analyzer.code-task.md | 70 +++++++++++++++++++ 3 files changed, 188 insertions(+) create mode 100644 .agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md create mode 100644 .agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md create mode 100644 .agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md new file mode 100644 index 0000000..ccef8a3 --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-01-update-vectorstore-interface.code-task.md @@ -0,0 +1,54 @@ +# Task: Update VectorStore Interface and Mock with URL Parameter + +## Description +Add a `url string` parameter to the `VectorStore.Search` method signature, the `SearchCall` recording struct, and the `MockVectorStore` implementation. This is the foundational change that all downstream packages depend on. + +## Background +All embeddings from all analyzed websites are currently stored in a single Qdrant collection. The `Search` method performs pure vector similarity across the entire collection with no filtering. Since most privacy policies use similar language, this causes cross-contamination — analyzing site-b.com can pull back chunks from site-a.com. The `url` field is already stored as payload metadata and has a keyword index in Qdrant, but is never used during search. 
+ +This task adds the `url` parameter to the interface so downstream implementations (Qdrant, mock) can apply URL-based filtering. + +## Technical Requirements +1. Add `url string` parameter to `VectorStore.Search` interface method +2. Add `URL string` field to `SearchCall` struct for test assertion +3. Update `MockVectorStore.Search` to accept and record the `url` parameter +4. Existing tests in `store_test.go` that call `MockVectorStore.Search` directly must be updated in this task to pass the new `url` argument so the package's tests compile; callers in other files are updated in tasks 02 and 03 + +## Dependencies +- `backend/internal/vectorstore/store.go` — the file being modified +- No external dependencies + +## Implementation Approach +1. Read the current `store.go` to understand the interface, mock, and call recording types +2. Add `url string` to the `Search` method in the `VectorStore` interface +3. Add `URL string` field to `SearchCall` +4. Update `MockVectorStore.Search` signature and recording logic +5. Run `go build ./backend/internal/vectorstore/...` to verify the package compiles +6. Note: downstream packages (qdrant, rag, analyzer) will fail to compile until tasks 02 and 03 are completed + +## Acceptance Criteria + +1. **Interface Updated** + - Given the `VectorStore` interface in `store.go` + - When a developer reads the `Search` method signature + - Then it includes `url string` as the fifth parameter: `Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error)` + +2. **SearchCall Records URL** + - Given a `SearchCall` struct + - When a test inspects recorded calls + - Then the `URL` field contains the URL passed to `Search` + +3. **Mock Implementation Updated** + - Given the `MockVectorStore` + - When `Search` is called with a URL + - Then the URL is recorded in `SearchCalls` and behavior is otherwise unchanged + +4. 
**Package Compiles** + - Given the updated `store.go` + - When running `go build ./backend/internal/vectorstore/...` + - Then compilation succeeds with no errors + +## Metadata +- **Complexity**: Low +- **Labels**: VectorStore, Interface, Mock, Refactor +- **Required Skills**: Go interfaces, test doubles diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md new file mode 100644 index 0000000..8ed29de --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-02-add-url-filter-to-qdrant-search.code-task.md @@ -0,0 +1,64 @@ +# Task: Add URL Filter to Qdrant Search Implementation + +## Description +Update the `QdrantStore.Search` method to accept the new `url string` parameter and apply a Qdrant payload filter that restricts results to chunks matching the given URL. When the URL is empty, no filter is applied (preserving backward-compatible behavior). + +## Background +The Qdrant collection already has a keyword index on the `url` field (created in `ensureCollection`), but the `Search` method never uses it. This task wires up a `FieldCondition` filter on `url` in the `QueryPoints` request so that vector similarity search is scoped to a single website's chunks. + +## Technical Requirements +1. Update `QdrantStore.Search` signature to match the new `VectorStore` interface: `Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error)` +2. When `url` is non-empty, add a `Filter` to `QueryPoints` with a `FieldCondition` matching `url` exactly (keyword match) +3. When `url` is empty, do not add any filter (search across all chunks) +4. Add the `url` value to the search log line for observability +5. Update existing Qdrant tests to pass the new parameter +6. 
Add new test cases verifying filter is applied when URL is provided and omitted when empty + +## Dependencies +- Task 01 must be completed first (interface change) +- `backend/internal/vectorstore/qdrant.go` — implementation file +- `backend/internal/vectorstore/qdrant_test.go` — test file +- Qdrant Go client `qdrant` package for filter types + +## Implementation Approach +1. Read `qdrant.go` and `qdrant_test.go` to understand current implementation and test patterns +2. Update `Search` method signature to include `url string` +3. Build the Qdrant filter using `qdrant.Filter` with a `Must` condition containing a `FieldCondition` on the `url` field with a `Match` of type `MatchKeyword` +4. Conditionally attach the filter to `QueryPoints` only when `url != ""` +5. Add `url` to the existing slog line in Search +6. Update all existing test cases to pass `""` for `url` (preserving current behavior) +7. Add test: `TestQdrantStore_Search_WithURLFilter` — verify the `QueryPoints` sent to mock client includes the filter when URL is provided +8. Add test: `TestQdrantStore_Search_WithoutURLFilter` — verify no filter when URL is empty +9. Run `go test ./backend/internal/vectorstore/...` to verify all tests pass + +## Acceptance Criteria + +1. **Filter Applied When URL Provided** + - Given a `QdrantStore` with a mock client + - When `Search` is called with `url = "https://example.com/privacy"` + - Then the `QueryPoints` sent to the Qdrant client includes a `Filter` with a `Must` condition matching `url` = `"https://example.com/privacy"` + +2. **No Filter When URL Empty** + - Given a `QdrantStore` with a mock client + - When `Search` is called with `url = ""` + - Then the `QueryPoints` sent to the Qdrant client has `nil` or empty `Filter` + +3. **Results Unchanged for Matching Chunks** + - Given chunks stored with URL "https://example.com/privacy" + - When `Search` is called with that same URL + - Then matching chunks are returned with correct scores and metadata + +4. 
**URL Logged** + - Given a `Search` call with a URL + - When the search completes + - Then the log line includes the URL value + +5. **Existing Tests Pass** + - Given the updated implementation + - When running `go test ./backend/internal/vectorstore/...` + - Then all tests pass including new URL filter tests + +## Metadata +- **Complexity**: Medium +- **Labels**: VectorStore, Qdrant, Filter, gRPC +- **Required Skills**: Go, Qdrant gRPC client API, payload filtering diff --git a/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md b/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md new file mode 100644 index 0000000..fc3c0ff --- /dev/null +++ b/.agents/tasks/2026-02-15-smolterms/step17/task-03-thread-url-through-rag-and-analyzer.code-task.md @@ -0,0 +1,70 @@ +# Task: Thread URL Through RAG Pipeline and Analyzer + +## Description +Update the RAG `Pipeline.Retrieve` method to accept and forward the `url` parameter to `VectorStore.Search`, then update `Analyzer.Analyze` to pass `req.URL` through the retrieval call. Update all affected tests across both packages. + +## Background +With the `VectorStore.Search` interface now accepting a `url` parameter (tasks 01-02), the RAG pipeline and analyzer need to thread the URL through so that retrieval is scoped to the correct website. Currently `Pipeline.Retrieve` takes only `query` and `limit`, and `Analyzer.Analyze` calls `Retrieve` without any URL context. + +## Technical Requirements +1. Update `Pipeline.Retrieve` signature to accept `url string`: `Retrieve(ctx context.Context, query string, limit int, url string) ([]vectorstore.Chunk, error)` +2. Pass `url` through to `p.store.Search(ctx, p.collection, vectors[0], limit, url)` +3. Add `url` to the retrieve log line for observability +4. Update `Analyzer.Analyze` stage 6 to pass `req.URL` to `Retrieve`: `a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL)` +5. 
Update all RAG pipeline tests to pass URL parameter and assert it's forwarded correctly +6. Update all analyzer pipeline tests to verify `req.URL` reaches the `Search` call + +## Dependencies +- Tasks 01 and 02 must be completed first +- `backend/internal/rag/pipeline.go` — RAG pipeline +- `backend/internal/rag/pipeline_test.go` — RAG tests +- `backend/internal/analyzer/analyzer.go` — analyzer pipeline +- `backend/internal/analyzer/analyze_pipeline_test.go` — analyzer tests +- Any other files that call `Pipeline.Retrieve` or `MockVectorStore.Search` + +## Implementation Approach +1. Read `pipeline.go`, `pipeline_test.go`, `analyzer.go`, and `analyze_pipeline_test.go` +2. Update `Pipeline.Retrieve` to accept `url string` and forward it to `store.Search` +3. Add `slog.String("url", url)` to the retrieve log line +4. Update `Analyzer.Analyze` line 148 to pass `req.URL` as the fourth argument to `Retrieve` +5. Update RAG pipeline tests: + - All existing `Retrieve` calls need the URL parameter added + - Add test case verifying URL is passed through to `MockVectorStore.Search` + - Assert `SearchCall.URL` matches expected value +6. Update analyzer pipeline tests: + - Verify the mock's `SearchCalls[0].URL` equals `req.URL` in the happy-path test + - Update any other test cases that call through the pipeline +7. Search for any other callers of `Retrieve` across the codebase and update them +8. Run full test suite: `go test ./backend/...` + +## Acceptance Criteria + +1. **RAG Retrieve Forwards URL** + - Given a RAG pipeline with a mock vector store + - When `Retrieve` is called with `url = "https://example.com/privacy"` + - Then `MockVectorStore.Search` is called with that same URL + +2. **Analyzer Passes req.URL** + - Given an analyzer processing a request with `URL: "https://example.com/privacy"` + - When the pipeline reaches the RAG retrieve stage + - Then the vector store search is filtered to `"https://example.com/privacy"` + +3. 
**URL Logged in Retrieve** + - Given a `Retrieve` call with a URL + - When retrieval completes + - Then the log line includes the URL + +4. **All Tests Pass** + - Given the updated code across all packages + - When running `go test ./backend/...` + - Then all tests pass with no compilation errors + +5. **No Cross-Contamination Path** + - Given the full pipeline from API request to vector search + - When tracing the URL parameter + - Then `req.URL` flows through `Analyzer.Analyze` → `Pipeline.Retrieve` → `VectorStore.Search` without being lost or defaulted + +## Metadata +- **Complexity**: Medium +- **Labels**: RAG, Analyzer, Integration, Pipeline +- **Required Skills**: Go, interface threading, test mocks, pipeline architecture From 86fa3202a8833ce9eda65d75f9b7f28611b00352 Mon Sep 17 00:00:00 2001 From: Parth576 Date: Sun, 1 Mar 2026 20:52:33 -0500 Subject: [PATCH 2/5] refactor(vectorstore): add url parameter to Search interface Add url string parameter to VectorStore.Search method signature, SearchCall recording struct, and MockVectorStore implementation. This enables URL-based filtering during search to prevent cross-contamination between analyzed websites. 
Assisted by the code-assist SOP --- .../context.md | 25 +++++++++++++++++++ .../plan.md | 13 ++++++++++ .../progress.md | 22 ++++++++++++++++ backend/internal/vectorstore/store.go | 8 +++--- backend/internal/vectorstore/store_test.go | 15 ++++++----- 5 files changed, 74 insertions(+), 9 deletions(-) create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md new file mode 100644 index 0000000..bdf4245 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/context.md @@ -0,0 +1,25 @@ +# Context: Update VectorStore Interface with URL Parameter + +## Requirements +- Add `url string` parameter to `VectorStore.Search` interface method +- Add `URL string` field to `SearchCall` struct +- Update `MockVectorStore.Search` to accept and record the `url` parameter +- Package `vectorstore` must compile; downstream breakage is expected and handled by tasks 02/03 + +## Key Files +- `backend/internal/vectorstore/store.go` — interface, mock, and call recording types (PRIMARY TARGET) +- `backend/internal/vectorstore/store_test.go` — mock tests (must be updated) +- `backend/internal/vectorstore/qdrant.go:135` — QdrantStore.Search (task 02) +- `backend/internal/vectorstore/qdrant_test.go` — Qdrant tests (task 02) +- `backend/internal/rag/pipeline.go:100` — RAG caller (task 03) + +## Patterns +- Interface + mock live in same file (`store.go`) +- Call recording structs capture all parameters for test assertions +- Mock returns preconfigured results/errors, records all calls + +## Downstream Impact 
+Callers that will break (handled by later tasks): +- `QdrantStore.Search` in qdrant.go (task 02) +- `pipeline.go:100` in rag package (task 03) +- All test files calling Search (tasks 02/03) diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md new file mode 100644 index 0000000..0ad3f67 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/plan.md @@ -0,0 +1,13 @@ +# Plan: task-01-update-vectorstore-interface + +## Test Strategy +- Existing `store_test.go` tests updated to pass `url` parameter +- `TestMockSearch_RecordsCalls` extended with URL assertion to verify recording +- `TestVectorStoreInterfaceSatisfied` confirms mock still satisfies interface + +## Implementation Plan +1. Add `url string` as 5th parameter to `VectorStore.Search` interface +2. Add `URL string` field to `SearchCall` struct +3. Update `MockVectorStore.Search` signature to accept and record `url` +4. Update all test calls with appropriate URL values +5. 
Verify compilation and tests pass diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md new file mode 100644 index 0000000..0b0cbe9 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-01-update-vectorstore-interface/progress.md @@ -0,0 +1,22 @@ +# Progress: task-01-update-vectorstore-interface + +## Setup +- [x] Created documentation directory structure +- [x] Discovered instruction files (README.md, backend/README.md) +- [x] Read existing store.go and identified all callers + +## Implementation Checklist +- [x] Update `VectorStore.Search` interface signature +- [x] Add `URL string` field to `SearchCall` +- [x] Update `MockVectorStore.Search` method signature and recording +- [x] Update `store_test.go` mock tests to pass new `url` parameter +- [x] Add URL assertion in `TestMockSearch_RecordsCalls` +- [x] Verify vectorstore package compiles +- [x] Verify vectorstore tests pass + +## TDD Cycles +1. Updated interface + mock + tests simultaneously (single coherent change) +2. All 10 tests pass: `ok github.com/parth/smolterms/backend/internal/vectorstore 0.003s` + +## Commit +_(pending)_ diff --git a/backend/internal/vectorstore/store.go b/backend/internal/vectorstore/store.go index c2090b5..65e842f 100644 --- a/backend/internal/vectorstore/store.go +++ b/backend/internal/vectorstore/store.go @@ -21,7 +21,8 @@ type VectorStore interface { // Upsert stores or updates chunks in the given collection. Upsert(ctx context.Context, collectionID string, chunks []Chunk) error // Search returns the top-limit chunks most similar to query in the given collection. - Search(ctx context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) + // When url is non-empty, results are filtered to chunks from that URL. 
+ Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) } // UpsertCall records a single call to MockVectorStore.Upsert for test assertion. @@ -35,6 +36,7 @@ type SearchCall struct { CollectionID string Query []float32 Limit int + URL string } // MockVectorStore is a configurable VectorStore implementation for use in tests. @@ -59,8 +61,8 @@ func (m *MockVectorStore) Upsert(_ context.Context, collectionID string, chunks } // Search records the call and returns SearchResult or SearchErr. -func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) { - m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit}) +func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) { + m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit, URL: url}) if m.SearchErr != nil { return nil, m.SearchErr } diff --git a/backend/internal/vectorstore/store_test.go b/backend/internal/vectorstore/store_test.go index 9ae5e82..6d3d6c5 100644 --- a/backend/internal/vectorstore/store_test.go +++ b/backend/internal/vectorstore/store_test.go @@ -95,7 +95,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { want := []Chunk{{ID: "r1", Score: 0.9}, {ID: "r2", Score: 0.8}} m := &MockVectorStore{SearchResult: want} - got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5) + got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "https://example.com") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -112,7 +112,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { // TestMockSearch_ReturnsNilAndEmptyByDefault verifies Search returns nil error and empty slice by default. 
func TestMockSearch_ReturnsNilAndEmptyByDefault(t *testing.T) { m := &MockVectorStore{} - got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3) + got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "") if err != nil { t.Errorf("Search() error = %v, want nil", err) } @@ -126,7 +126,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { want := errors.New("search failed") m := &MockVectorStore{SearchErr: want} - _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3) + _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "") if !errors.Is(err, want) { t.Errorf("Search() error = %v, want %v", err, want) } @@ -136,7 +136,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { func TestMockSearch_RecordsCalls(t *testing.T) { m := &MockVectorStore{} query := []float32{0.1, 0.2, 0.3} - _, _ = m.Search(context.Background(), "my-collection", query, 10) + _, _ = m.Search(context.Background(), "my-collection", query, 10, "https://example.com/privacy") if len(m.SearchCalls) != 1 { t.Fatalf("SearchCalls len = %d, want 1", len(m.SearchCalls)) @@ -150,6 +150,9 @@ func TestMockSearch_RecordsCalls(t *testing.T) { if len(m.SearchCalls[0].Query) != 3 { t.Errorf("SearchCalls[0].Query len = %d, want 3", len(m.SearchCalls[0].Query)) } + if m.SearchCalls[0].URL != "https://example.com/privacy" { + t.Errorf("SearchCalls[0].URL = %q, want %q", m.SearchCalls[0].URL, "https://example.com/privacy") + } } // TestMockRecordsMultipleCalls verifies that multiple calls are all recorded. 
@@ -157,8 +160,8 @@ func TestMockRecordsMultipleCalls(t *testing.T) { m := &MockVectorStore{} _ = m.Upsert(context.Background(), "col1", []Chunk{{ID: "a"}}) _ = m.Upsert(context.Background(), "col2", []Chunk{{ID: "b"}}) - _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5) - _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3) + _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5, "") + _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3, "") if len(m.UpsertCalls) != 2 { t.Errorf("UpsertCalls len = %d, want 2", len(m.UpsertCalls)) From a8489f0893a3ad7b0e0cdb0e295f77080dfc8a39 Mon Sep 17 00:00:00 2001 From: Parth576 Date: Sun, 1 Mar 2026 20:57:22 -0500 Subject: [PATCH 3/5] feat(vectorstore): add URL-based filtering to Qdrant search When a non-empty URL is provided, QdrantStore.Search now applies a Qdrant payload filter (keyword match on the "url" field) to restrict results to chunks from that specific website. Empty URL preserves the previous unfiltered behavior. The URL is also logged for observability. Assisted by the code-assist SOP --- .../context.md | 26 ++++++++ .../progress.md | 19 ++++++ backend/internal/vectorstore/qdrant.go | 14 ++++- backend/internal/vectorstore/qdrant_test.go | 60 ++++++++++++++++--- 4 files changed, 109 insertions(+), 10 deletions(-) create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md new file mode 100644 index 0000000..e6d263b --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/context.md @@ -0,0 +1,26 @@ +# Context: Add URL Filter to Qdrant Search + +## Requirements +1. 
Update `QdrantStore.Search` signature to match new interface: add `url string` parameter +2. When `url` is non-empty, add a `Filter` to `QueryPoints` with `qdrant.NewMatchKeyword("url", url)` +3. When `url` is empty, no filter applied (backward-compatible) +4. Add `url` to the search log line +5. Update existing tests to pass `""` for url +6. Add new tests verifying filter presence/absence + +## Key Files +- `backend/internal/vectorstore/qdrant.go:135` — `QdrantStore.Search` method +- `backend/internal/vectorstore/qdrant_test.go` — all Qdrant tests +- `backend/internal/vectorstore/store.go` — interface (already updated in task-01) + +## Qdrant Filter API +- `qdrant.NewMatchKeyword(field, keyword string) *qdrant.Condition` — helper constructor +- `&qdrant.Filter{Must: []*qdrant.Condition{...}}` — wrap conditions +- `QueryPoints.Filter` field accepts `*qdrant.Filter` +- Mock records `queryCalls []*qdrant.QueryPoints` — tests can inspect `.Filter` + +## Test Strategy +- Existing tests: add `""` as url parameter (no filter applied) +- New test: `TestQdrantStore_Search_WithURLFilter` — verify filter present in QueryPoints +- New test: `TestQdrantStore_Search_WithoutURLFilter` — verify filter nil in QueryPoints +- Log test: verify `url` attribute in search log entry diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md new file mode 100644 index 0000000..8e3a853 --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md @@ -0,0 +1,19 @@ +# Progress: task-02-add-url-filter-to-qdrant-search + +## Setup +- [x] Created documentation directory +- [x] Read qdrant.go and qdrant_test.go +- [x] Researched Qdrant Go client filter API + +## Implementation Checklist +- [ ] Update `QdrantStore.Search` signature with `url string` parameter +- [ ] Add conditional filter to `QueryPoints` when 
url is non-empty +- [ ] Add `url` to search log line +- [ ] Update existing qdrant tests to pass `""` for url +- [ ] Add test: filter applied when URL provided +- [ ] Add test: no filter when URL empty +- [ ] Update log test to verify url attribute +- [ ] Verify all tests pass + +## TDD Cycles +_(to be filled during implementation)_ diff --git a/backend/internal/vectorstore/qdrant.go b/backend/internal/vectorstore/qdrant.go index 0e1cec7..f532b84 100644 --- a/backend/internal/vectorstore/qdrant.go +++ b/backend/internal/vectorstore/qdrant.go @@ -132,23 +132,31 @@ func (s *QdrantStore) Upsert(ctx context.Context, collectionID string, chunks [] // Search returns the top-limit chunks most similar to query in the named collection, // ordered by cosine similarity score descending. Score is populated on each returned Chunk. -func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int) ([]Chunk, error) { +// When url is non-empty, results are filtered to chunks from that URL. 
+func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) { if err := s.ensureCollection(ctx, collectionID); err != nil { return nil, err } start := time.Now() limitU := uint64(limit) - results, err := s.client.Query(ctx, &qdrant.QueryPoints{ + req := &qdrant.QueryPoints{ CollectionName: collectionID, Query: qdrant.NewQueryDense(query), Limit: &limitU, WithPayload: qdrant.NewWithPayload(true), - }) + } + if url != "" { + req.Filter = &qdrant.Filter{ + Must: []*qdrant.Condition{qdrant.NewMatchKeyword("url", url)}, + } + } + results, err := s.client.Query(ctx, req) if err != nil { return nil, fmt.Errorf("searching %q: %w", collectionID, err) } s.logger.InfoContext(ctx, "search complete", "collection", collectionID, + "url", url, "results", len(results), "latency_ms", time.Since(start).Milliseconds(), ) diff --git a/backend/internal/vectorstore/qdrant_test.go b/backend/internal/vectorstore/qdrant_test.go index 8d96497..ae80384 100644 --- a/backend/internal/vectorstore/qdrant_test.go +++ b/backend/internal/vectorstore/qdrant_test.go @@ -221,7 +221,7 @@ func TestQdrantStore_Search_ReturnsOrderedResults(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5) + results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -251,7 +251,7 @@ func TestQdrantStore_Search_ReconstructsChunks(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1) + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1, "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -288,7 +288,7 @@ func TestQdrantStore_Search_ReturnsError(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true, queryErr: want} store := newTestStore(mock) - _, err := 
store.Search(context.Background(), "col1", []float32{0.1}, 3) + _, err := store.Search(context.Background(), "col1", []float32{0.1}, 3, "") if err == nil { t.Fatal("Search() expected error, got nil") } @@ -376,7 +376,7 @@ func TestQdrantStore_ContextCancellation_Search(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() - _, err := store.Search(ctx, "col1", []float32{0.1}, 5) + _, err := store.Search(ctx, "col1", []float32{0.1}, 5, "") if err == nil { t.Fatal("Search() expected context error, got nil") } @@ -389,7 +389,7 @@ func TestQdrantStore_Search_QueryParams(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true} store := newTestStore(mock) - _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15) + _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15, "") if len(mock.queryCalls) != 1 { t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) @@ -406,6 +406,49 @@ func TestQdrantStore_Search_QueryParams(t *testing.T) { } } +func TestQdrantStore_Search_WithURLFilter(t *testing.T) { + mock := &mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com/privacy") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter == nil { + t.Fatal("Filter should not be nil when URL is provided") + } + if len(req.Filter.Must) != 1 { + t.Fatalf("Filter.Must len = %d, want 1", len(req.Filter.Must)) + } + field := req.Filter.Must[0].GetField() + if field == nil { + t.Fatal("expected FieldCondition, got nil") + } + if field.Key != "url" { + t.Errorf("FieldCondition.Key = %q, want %q", field.Key, "url") + } + if field.Match.GetKeyword() != "https://example.com/privacy" { + t.Errorf("Match.Keyword = %q, want %q", field.Match.GetKeyword(), "https://example.com/privacy") + } +} + +func 
TestQdrantStore_Search_WithoutURLFilter(t *testing.T) { + mock := &mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter != nil { + t.Errorf("Filter should be nil when URL is empty, got %v", req.Filter) + } +} + func TestChunkUUID_Deterministic(t *testing.T) { c := Chunk{ContentHash: "abc", Index: 5} if chunkUUID(c) != chunkUUID(c) { @@ -492,7 +535,7 @@ func TestQdrantStore_Search_PopulatesChunkID(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5) + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -537,7 +580,7 @@ func TestQdrantStore_Search_LogsEntry(t *testing.T) { } store, logHandler := newTestStoreWithLogger(mock) - if _, err := store.Search(context.Background(), "col1", []float32{0.1}, 5); err != nil { + if _, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com"); err != nil { t.Fatalf("Search() error = %v", err) } @@ -548,6 +591,9 @@ func TestQdrantStore_Search_LogsEntry(t *testing.T) { if e.Attrs["collection"] != "col1" { t.Errorf("log collection = %v, want %q", e.Attrs["collection"], "col1") } + if e.Attrs["url"] != "https://example.com" { + t.Errorf("log url = %v, want %q", e.Attrs["url"], "https://example.com") + } if _, ok := e.Attrs["latency_ms"]; !ok { t.Error("expected latency_ms in log entry") } From f4cedeac5a314fc0c2138145fb07120031293fd2 Mon Sep 17 00:00:00 2001 From: Parth576 Date: Sun, 1 Mar 2026 21:01:45 -0500 Subject: [PATCH 4/5] feat(rag,analyzer): thread URL through RAG pipeline and analyzer Add url parameter to Pipeline.Retrieve and forward it to VectorStore.Search so retrieval is scoped to the 
correct website. Update Analyzer.Analyze to pass req.URL through the retrieval call. Add URL to the retrieve log line for observability. Assisted by the code-assist SOP --- .../progress.md | 23 +++++++++++------- .../progress.md | 24 +++++++++++++++++++ .../analyzer/analyze_pipeline_test.go | 5 +++- backend/internal/analyzer/analyzer.go | 2 +- backend/internal/rag/pipeline.go | 5 ++-- backend/internal/rag/pipeline_test.go | 19 ++++++++------- 6 files changed, 57 insertions(+), 21 deletions(-) create mode 100644 .agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md index 8e3a853..acfc941 100644 --- a/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md +++ b/.agents/scratchpad/2026-02-15-smolterms/task-02-add-url-filter-to-qdrant-search/progress.md @@ -6,14 +6,19 @@ - [x] Researched Qdrant Go client filter API ## Implementation Checklist -- [ ] Update `QdrantStore.Search` signature with `url string` parameter -- [ ] Add conditional filter to `QueryPoints` when url is non-empty -- [ ] Add `url` to search log line -- [ ] Update existing qdrant tests to pass `""` for url -- [ ] Add test: filter applied when URL provided -- [ ] Add test: no filter when URL empty -- [ ] Update log test to verify url attribute -- [ ] Verify all tests pass +- [x] Update `QdrantStore.Search` signature with `url string` parameter +- [x] Add conditional filter to `QueryPoints` when url is non-empty +- [x] Add `url` to search log line +- [x] Update existing qdrant tests to pass `""` for url (7 call sites) +- [x] Add test: `TestQdrantStore_Search_WithURLFilter` — verifies filter structure +- [x] Add test: `TestQdrantStore_Search_WithoutURLFilter` — verifies nil filter +- [x] Update log test to verify url attribute +- 
[x] All 36 tests pass ## TDD Cycles -_(to be filled during implementation)_ +1. Updated implementation and tests together (interface change + filter logic + tests) +2. All tests pass on first run: `ok github.com/parth/smolterms/backend/internal/vectorstore 0.004s` + +## Commit +- Hash: `a8489f0` +- Message: `feat(vectorstore): add URL-based filtering to Qdrant search` diff --git a/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md b/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md new file mode 100644 index 0000000..264095e --- /dev/null +++ b/.agents/scratchpad/2026-02-15-smolterms/task-03-thread-url-through-rag-and-analyzer/progress.md @@ -0,0 +1,24 @@ +# Task 03: Thread URL Through RAG and Analyzer — Progress + +## Status: Complete + +## Changes Made + +### 1. `backend/internal/rag/pipeline.go` +- Updated `Pipeline.Retrieve` signature: added `url string` parameter +- Forwarded `url` to `p.store.Search(ctx, p.collection, vectors[0], limit, url)` +- Added `slog.String("url", url)` to the retrieve log line + +### 2. `backend/internal/rag/pipeline_test.go` +- Updated all 6 `Retrieve` calls to include the `url` parameter +- Added `sc.URL` assertion in `TestRetrieve_EmbedsAndSearches` to verify URL forwarding +- Added `"url"` to the expected log attrs in `TestRetrieve_LogsOperationFields` + +### 3. `backend/internal/analyzer/analyzer.go` +- Updated Stage 6 (RAG Retrieve) to pass `req.URL` as the 4th arg to `Retrieve` + +### 4. 
`backend/internal/analyzer/analyze_pipeline_test.go` +- Added `searchCall.URL` assertion in `TestAnalyze_CorrectDependencyCalls` to verify `req.URL` reaches the vector store search + +## Test Results +- All backend tests pass: `go test ./backend/...` — all packages OK diff --git a/backend/internal/analyzer/analyze_pipeline_test.go b/backend/internal/analyzer/analyze_pipeline_test.go index a703a69..c51a2a5 100644 --- a/backend/internal/analyzer/analyze_pipeline_test.go +++ b/backend/internal/analyzer/analyze_pipeline_test.go @@ -490,11 +490,14 @@ func TestAnalyze_CorrectDependencyCalls(t *testing.T) { } } - // Verify the search limit was 20 + // Verify the search limit and URL filter if len(deps.store.SearchCalls) > 0 { searchCall := deps.store.SearchCalls[0] if searchCall.Limit != retrievalLimit { t.Errorf("Search limit = %d, want %d", searchCall.Limit, retrievalLimit) } + if searchCall.URL != req.URL { + t.Errorf("Search URL = %q, want %q", searchCall.URL, req.URL) + } } } diff --git a/backend/internal/analyzer/analyzer.go b/backend/internal/analyzer/analyzer.go index 7b7670a..9b07ab3 100644 --- a/backend/internal/analyzer/analyzer.go +++ b/backend/internal/analyzer/analyzer.go @@ -145,7 +145,7 @@ func (a *Analyzer) Analyze(ctx context.Context, req AnalysisRequest) (*AnalysisR // Stage 6: RAG Retrieve start = time.Now() - retrieved, err := a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit) + retrieved, err := a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL) if err != nil { return nil, fmt.Errorf("analyze: rag retrieve: %w", err) } diff --git a/backend/internal/rag/pipeline.go b/backend/internal/rag/pipeline.go index 9987930..36c1dba 100644 --- a/backend/internal/rag/pipeline.go +++ b/backend/internal/rag/pipeline.go @@ -84,7 +84,7 @@ func (p *Pipeline) Store(ctx context.Context, url string, contentHash string, ch // Retrieve embeds the query, searches the vector store, and returns // deduplicated results. 
-func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vectorstore.Chunk, error) { +func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url string) ([]vectorstore.Chunk, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("rag: retrieve: %w", err) } @@ -97,7 +97,7 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vec embedLatency := time.Since(embedStart) searchStart := time.Now() - results, err := p.store.Search(ctx, p.collection, vectors[0], limit) + results, err := p.store.Search(ctx, p.collection, vectors[0], limit, url) if err != nil { return nil, fmt.Errorf("rag: search: %w", err) } @@ -113,6 +113,7 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int) ([]vec p.logger.Info("retrieved chunks", slog.String("operation", "retrieve"), slog.String("query", truncatedQuery), + slog.String("url", url), slog.Int("result_count", len(deduped)), slog.Duration("embed_latency", embedLatency), slog.Duration("search_latency", searchLatency), diff --git a/backend/internal/rag/pipeline_test.go b/backend/internal/rag/pipeline_test.go index 53d65d0..652a3ea 100644 --- a/backend/internal/rag/pipeline_test.go +++ b/backend/internal/rag/pipeline_test.go @@ -194,7 +194,7 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "privacy data collection", 10) + results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -207,7 +207,7 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { t.Errorf("Embed texts = %v, want [privacy data collection]", embedder.Calls[0]) } - // Verify Search called with vector and limit + // Verify Search called with vector, limit, and URL if len(store.SearchCalls) != 1 { t.Fatalf("Search call count = %d, want 1", len(store.SearchCalls)) } @@ -218,6 
+218,9 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { if sc.Limit != 10 { t.Errorf("Search limit = %d, want 10", sc.Limit) } + if sc.URL != "https://example.com/privacy" { + t.Errorf("Search URL = %q, want %q", sc.URL, "https://example.com/privacy") + } if len(results) != 1 { t.Fatalf("results count = %d, want 1", len(results)) @@ -238,7 +241,7 @@ func TestRetrieve_DeduplicatesByText(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "query", 10) + results, err := p.Retrieve(context.Background(), "query", 10, "") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -261,7 +264,7 @@ func TestRetrieve_EmbedError(t *testing.T) { store := &vectorstore.MockVectorStore{} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(context.Background(), "query", 10) + _, err := p.Retrieve(context.Background(), "query", 10, "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -276,7 +279,7 @@ func TestRetrieve_SearchError(t *testing.T) { store := &vectorstore.MockVectorStore{SearchErr: wantErr} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(context.Background(), "query", 10) + _, err := p.Retrieve(context.Background(), "query", 10, "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -310,7 +313,7 @@ func TestRetrieve_ContextCancellation(t *testing.T) { store := &vectorstore.MockVectorStore{SearchResult: []vectorstore.Chunk{}} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(ctx, "query", 10) + _, err := p.Retrieve(ctx, "query", 10, "") if !errors.Is(err, context.Canceled) { t.Errorf("Retrieve() error = %v, want context.Canceled", err) } @@ -370,7 +373,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { } p := NewPipeline(embedder, store, logger, "test_collection") - results, err := p.Retrieve(context.Background(), "privacy data collection", 10) + results, err := 
p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -382,7 +385,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { if rec.Message != "retrieved chunks" { t.Errorf("log message = %q, want %q", rec.Message, "retrieved chunks") } - for _, key := range []string{"operation", "query", "result_count", "embed_latency", "search_latency"} { + for _, key := range []string{"operation", "query", "url", "result_count", "embed_latency", "search_latency"} { if _, ok := rec.Attrs[key]; !ok { t.Errorf("log missing attr %q", key) } From 86181df5aa6c72fe295f4470376fcaf38f92b37a Mon Sep 17 00:00:00 2001 From: Parth576 Date: Sun, 1 Mar 2026 21:52:07 -0500 Subject: [PATCH 5/5] feat(vectorstore,rag,analyzer): add content hash filtering to search Add contentHash parameter to VectorStore.Search and Pipeline.Retrieve, and have Analyzer.Analyze pass its computed content hash to Retrieve, so retrieval is scoped to the exact document version (url + content_hash), preventing stale chunks from a previous crawl from mixing into results. Update Qdrant implementation to build the Must filter from whichever of url and contentHash are non-empty (two conditions when both are provided, no filter when both are empty). Add content_hash to the retrieve log line for observability. 
Assisted by the code-assist SOP --- .../analyzer/analyze_pipeline_test.go | 5 +- backend/internal/analyzer/analyzer.go | 2 +- backend/internal/rag/pipeline.go | 5 +- backend/internal/rag/pipeline_test.go | 19 ++++--- backend/internal/vectorstore/qdrant.go | 14 +++-- backend/internal/vectorstore/qdrant_test.go | 54 +++++++++++++++---- backend/internal/vectorstore/store.go | 8 +-- backend/internal/vectorstore/store_test.go | 15 +++--- 8 files changed, 88 insertions(+), 34 deletions(-) diff --git a/backend/internal/analyzer/analyze_pipeline_test.go b/backend/internal/analyzer/analyze_pipeline_test.go index c51a2a5..90ebbb8 100644 --- a/backend/internal/analyzer/analyze_pipeline_test.go +++ b/backend/internal/analyzer/analyze_pipeline_test.go @@ -490,7 +490,7 @@ func TestAnalyze_CorrectDependencyCalls(t *testing.T) { } } - // Verify the search limit and URL filter + // Verify the search limit, URL filter, and content hash filter if len(deps.store.SearchCalls) > 0 { searchCall := deps.store.SearchCalls[0] if searchCall.Limit != retrievalLimit { @@ -499,5 +499,8 @@ func TestAnalyze_CorrectDependencyCalls(t *testing.T) { if searchCall.URL != req.URL { t.Errorf("Search URL = %q, want %q", searchCall.URL, req.URL) } + if searchCall.ContentHash == "" { + t.Error("Search ContentHash is empty, want non-empty (should match computed content hash)") + } } } diff --git a/backend/internal/analyzer/analyzer.go b/backend/internal/analyzer/analyzer.go index 9b07ab3..6f173be 100644 --- a/backend/internal/analyzer/analyzer.go +++ b/backend/internal/analyzer/analyzer.go @@ -145,7 +145,7 @@ func (a *Analyzer) Analyze(ctx context.Context, req AnalysisRequest) (*AnalysisR // Stage 6: RAG Retrieve start = time.Now() - retrieved, err := a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL) + retrieved, err := a.rag.Retrieve(ctx, broadRetrievalQuery, retrievalLimit, req.URL, contentHash) if err != nil { return nil, fmt.Errorf("analyze: rag retrieve: %w", err) } diff --git 
a/backend/internal/rag/pipeline.go b/backend/internal/rag/pipeline.go index 36c1dba..8bb2360 100644 --- a/backend/internal/rag/pipeline.go +++ b/backend/internal/rag/pipeline.go @@ -84,7 +84,7 @@ func (p *Pipeline) Store(ctx context.Context, url string, contentHash string, ch // Retrieve embeds the query, searches the vector store, and returns // deduplicated results. -func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url string) ([]vectorstore.Chunk, error) { +func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url string, contentHash string) ([]vectorstore.Chunk, error) { if err := ctx.Err(); err != nil { return nil, fmt.Errorf("rag: retrieve: %w", err) } @@ -97,7 +97,7 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url st embedLatency := time.Since(embedStart) searchStart := time.Now() - results, err := p.store.Search(ctx, p.collection, vectors[0], limit, url) + results, err := p.store.Search(ctx, p.collection, vectors[0], limit, url, contentHash) if err != nil { return nil, fmt.Errorf("rag: search: %w", err) } @@ -114,6 +114,7 @@ func (p *Pipeline) Retrieve(ctx context.Context, query string, limit int, url st slog.String("operation", "retrieve"), slog.String("query", truncatedQuery), slog.String("url", url), + slog.String("content_hash", contentHash), slog.Int("result_count", len(deduped)), slog.Duration("embed_latency", embedLatency), slog.Duration("search_latency", searchLatency), diff --git a/backend/internal/rag/pipeline_test.go b/backend/internal/rag/pipeline_test.go index 652a3ea..89fbae1 100644 --- a/backend/internal/rag/pipeline_test.go +++ b/backend/internal/rag/pipeline_test.go @@ -194,7 +194,7 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy") + results, err := p.Retrieve(context.Background(), "privacy data collection", 
10, "https://example.com/privacy", "hash456") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -207,7 +207,7 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { t.Errorf("Embed texts = %v, want [privacy data collection]", embedder.Calls[0]) } - // Verify Search called with vector, limit, and URL + // Verify Search called with vector, limit, URL, and contentHash if len(store.SearchCalls) != 1 { t.Fatalf("Search call count = %d, want 1", len(store.SearchCalls)) } @@ -221,6 +221,9 @@ func TestRetrieve_EmbedsAndSearches(t *testing.T) { if sc.URL != "https://example.com/privacy" { t.Errorf("Search URL = %q, want %q", sc.URL, "https://example.com/privacy") } + if sc.ContentHash != "hash456" { + t.Errorf("Search ContentHash = %q, want %q", sc.ContentHash, "hash456") + } if len(results) != 1 { t.Fatalf("results count = %d, want 1", len(results)) @@ -241,7 +244,7 @@ func TestRetrieve_DeduplicatesByText(t *testing.T) { } p := newTestPipeline(embedder, store) - results, err := p.Retrieve(context.Background(), "query", 10, "") + results, err := p.Retrieve(context.Background(), "query", 10, "", "") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -264,7 +267,7 @@ func TestRetrieve_EmbedError(t *testing.T) { store := &vectorstore.MockVectorStore{} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(context.Background(), "query", 10, "") + _, err := p.Retrieve(context.Background(), "query", 10, "", "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -279,7 +282,7 @@ func TestRetrieve_SearchError(t *testing.T) { store := &vectorstore.MockVectorStore{SearchErr: wantErr} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(context.Background(), "query", 10, "") + _, err := p.Retrieve(context.Background(), "query", 10, "", "") if !errors.Is(err, wantErr) { t.Errorf("Retrieve() error = %v, want %v", err, wantErr) } @@ -313,7 +316,7 @@ func TestRetrieve_ContextCancellation(t *testing.T) { store := 
&vectorstore.MockVectorStore{SearchResult: []vectorstore.Chunk{}} p := newTestPipeline(embedder, store) - _, err := p.Retrieve(ctx, "query", 10, "") + _, err := p.Retrieve(ctx, "query", 10, "", "") if !errors.Is(err, context.Canceled) { t.Errorf("Retrieve() error = %v, want context.Canceled", err) } @@ -373,7 +376,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { } p := NewPipeline(embedder, store, logger, "test_collection") - results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy") + results, err := p.Retrieve(context.Background(), "privacy data collection", 10, "https://example.com/privacy", "hash789") if err != nil { t.Fatalf("Retrieve() error = %v", err) } @@ -385,7 +388,7 @@ func TestRetrieve_LogsOperationFields(t *testing.T) { if rec.Message != "retrieved chunks" { t.Errorf("log message = %q, want %q", rec.Message, "retrieved chunks") } - for _, key := range []string{"operation", "query", "url", "result_count", "embed_latency", "search_latency"} { + for _, key := range []string{"operation", "query", "url", "content_hash", "result_count", "embed_latency", "search_latency"} { if _, ok := rec.Attrs[key]; !ok { t.Errorf("log missing attr %q", key) } diff --git a/backend/internal/vectorstore/qdrant.go b/backend/internal/vectorstore/qdrant.go index f532b84..d7245ae 100644 --- a/backend/internal/vectorstore/qdrant.go +++ b/backend/internal/vectorstore/qdrant.go @@ -133,7 +133,8 @@ func (s *QdrantStore) Upsert(ctx context.Context, collectionID string, chunks [] // Search returns the top-limit chunks most similar to query in the named collection, // ordered by cosine similarity score descending. Score is populated on each returned Chunk. // When url is non-empty, results are filtered to chunks from that URL. 
-func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) { +// When contentHash is non-empty, results are further filtered to that content version. +func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) { if err := s.ensureCollection(ctx, collectionID); err != nil { return nil, err } @@ -145,10 +146,15 @@ func (s *QdrantStore) Search(ctx context.Context, collectionID string, query []f Limit: &limitU, WithPayload: qdrant.NewWithPayload(true), } + var mustConditions []*qdrant.Condition if url != "" { - req.Filter = &qdrant.Filter{ - Must: []*qdrant.Condition{qdrant.NewMatchKeyword("url", url)}, - } + mustConditions = append(mustConditions, qdrant.NewMatchKeyword("url", url)) + } + if contentHash != "" { + mustConditions = append(mustConditions, qdrant.NewMatchKeyword("content_hash", contentHash)) + } + if len(mustConditions) > 0 { + req.Filter = &qdrant.Filter{Must: mustConditions} } results, err := s.client.Query(ctx, req) if err != nil { diff --git a/backend/internal/vectorstore/qdrant_test.go b/backend/internal/vectorstore/qdrant_test.go index ae80384..430822d 100644 --- a/backend/internal/vectorstore/qdrant_test.go +++ b/backend/internal/vectorstore/qdrant_test.go @@ -221,7 +221,7 @@ func TestQdrantStore_Search_ReturnsOrderedResults(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "") + results, err := store.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -251,7 +251,7 @@ func TestQdrantStore_Search_ReconstructsChunks(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1, "") + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 1, "", "") if err != 
nil { t.Fatalf("Search() error = %v", err) } @@ -288,7 +288,7 @@ func TestQdrantStore_Search_ReturnsError(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true, queryErr: want} store := newTestStore(mock) - _, err := store.Search(context.Background(), "col1", []float32{0.1}, 3, "") + _, err := store.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if err == nil { t.Fatal("Search() expected error, got nil") } @@ -376,7 +376,7 @@ func TestQdrantStore_ContextCancellation_Search(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) cancel() - _, err := store.Search(ctx, "col1", []float32{0.1}, 5, "") + _, err := store.Search(ctx, "col1", []float32{0.1}, 5, "", "") if err == nil { t.Fatal("Search() expected context error, got nil") } @@ -389,7 +389,7 @@ func TestQdrantStore_Search_QueryParams(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true} store := newTestStore(mock) - _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15, "") + _, _ = store.Search(context.Background(), "test-col", []float32{0.1, 0.2, 0.3}, 15, "", "") if len(mock.queryCalls) != 1 { t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) @@ -410,7 +410,7 @@ func TestQdrantStore_Search_WithURLFilter(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true} store := newTestStore(mock) - _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com/privacy") + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com/privacy", "") if len(mock.queryCalls) != 1 { t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) @@ -434,11 +434,47 @@ func TestQdrantStore_Search_WithURLFilter(t *testing.T) { } } +func TestQdrantStore_Search_WithURLAndContentHashFilter(t *testing.T) { + mock := &mockQdrantOps{collectionExistsResult: true} + store := newTestStore(mock) + + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, 
"https://example.com/privacy", "abc123") + + if len(mock.queryCalls) != 1 { + t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) + } + req := mock.queryCalls[0] + if req.Filter == nil { + t.Fatal("Filter should not be nil when URL and contentHash are provided") + } + if len(req.Filter.Must) != 2 { + t.Fatalf("Filter.Must len = %d, want 2", len(req.Filter.Must)) + } + + // First condition: url + urlField := req.Filter.Must[0].GetField() + if urlField == nil || urlField.Key != "url" { + t.Errorf("Filter.Must[0] key = %q, want %q", urlField.GetKey(), "url") + } + if urlField.Match.GetKeyword() != "https://example.com/privacy" { + t.Errorf("url Match.Keyword = %q, want %q", urlField.Match.GetKeyword(), "https://example.com/privacy") + } + + // Second condition: content_hash + hashField := req.Filter.Must[1].GetField() + if hashField == nil || hashField.Key != "content_hash" { + t.Errorf("Filter.Must[1] key = %q, want %q", hashField.GetKey(), "content_hash") + } + if hashField.Match.GetKeyword() != "abc123" { + t.Errorf("content_hash Match.Keyword = %q, want %q", hashField.Match.GetKeyword(), "abc123") + } +} + func TestQdrantStore_Search_WithoutURLFilter(t *testing.T) { mock := &mockQdrantOps{collectionExistsResult: true} store := newTestStore(mock) - _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "") + _, _ = store.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") if len(mock.queryCalls) != 1 { t.Fatalf("queryCalls len = %d, want 1", len(mock.queryCalls)) @@ -535,7 +571,7 @@ func TestQdrantStore_Search_PopulatesChunkID(t *testing.T) { } store := newTestStore(mock) - results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "") + results, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -580,7 +616,7 @@ func TestQdrantStore_Search_LogsEntry(t *testing.T) { } store, logHandler := newTestStoreWithLogger(mock) - if _, err 
:= store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com"); err != nil { + if _, err := store.Search(context.Background(), "col1", []float32{0.1}, 5, "https://example.com", ""); err != nil { t.Fatalf("Search() error = %v", err) } diff --git a/backend/internal/vectorstore/store.go b/backend/internal/vectorstore/store.go index 65e842f..a1944a8 100644 --- a/backend/internal/vectorstore/store.go +++ b/backend/internal/vectorstore/store.go @@ -22,7 +22,8 @@ type VectorStore interface { Upsert(ctx context.Context, collectionID string, chunks []Chunk) error // Search returns the top-limit chunks most similar to query in the given collection. // When url is non-empty, results are filtered to chunks from that URL. - Search(ctx context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) + // When contentHash is non-empty, results are further filtered to that content version. + Search(ctx context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) } // UpsertCall records a single call to MockVectorStore.Upsert for test assertion. @@ -37,6 +38,7 @@ type SearchCall struct { Query []float32 Limit int URL string + ContentHash string } // MockVectorStore is a configurable VectorStore implementation for use in tests. @@ -61,8 +63,8 @@ func (m *MockVectorStore) Upsert(_ context.Context, collectionID string, chunks } // Search records the call and returns SearchResult or SearchErr. 
-func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int, url string) ([]Chunk, error) { - m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit, URL: url}) +func (m *MockVectorStore) Search(_ context.Context, collectionID string, query []float32, limit int, url string, contentHash string) ([]Chunk, error) { + m.SearchCalls = append(m.SearchCalls, SearchCall{CollectionID: collectionID, Query: query, Limit: limit, URL: url, ContentHash: contentHash}) if m.SearchErr != nil { return nil, m.SearchErr } diff --git a/backend/internal/vectorstore/store_test.go b/backend/internal/vectorstore/store_test.go index 6d3d6c5..725d822 100644 --- a/backend/internal/vectorstore/store_test.go +++ b/backend/internal/vectorstore/store_test.go @@ -95,7 +95,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { want := []Chunk{{ID: "r1", Score: 0.9}, {ID: "r2", Score: 0.8}} m := &MockVectorStore{SearchResult: want} - got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "https://example.com") + got, err := m.Search(context.Background(), "col1", []float32{0.1, 0.2}, 5, "https://example.com", "") if err != nil { t.Fatalf("Search() error = %v", err) } @@ -112,7 +112,7 @@ func TestMockSearch_ReturnsConfiguredChunks(t *testing.T) { // TestMockSearch_ReturnsNilAndEmptyByDefault verifies Search returns nil error and empty slice by default. 
func TestMockSearch_ReturnsNilAndEmptyByDefault(t *testing.T) { m := &MockVectorStore{} - got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "") + got, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if err != nil { t.Errorf("Search() error = %v, want nil", err) } @@ -126,7 +126,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { want := errors.New("search failed") m := &MockVectorStore{SearchErr: want} - _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "") + _, err := m.Search(context.Background(), "col1", []float32{0.1}, 3, "", "") if !errors.Is(err, want) { t.Errorf("Search() error = %v, want %v", err, want) } @@ -136,7 +136,7 @@ func TestMockSearch_ReturnsConfiguredError(t *testing.T) { func TestMockSearch_RecordsCalls(t *testing.T) { m := &MockVectorStore{} query := []float32{0.1, 0.2, 0.3} - _, _ = m.Search(context.Background(), "my-collection", query, 10, "https://example.com/privacy") + _, _ = m.Search(context.Background(), "my-collection", query, 10, "https://example.com/privacy", "hash123") if len(m.SearchCalls) != 1 { t.Fatalf("SearchCalls len = %d, want 1", len(m.SearchCalls)) @@ -153,6 +153,9 @@ func TestMockSearch_RecordsCalls(t *testing.T) { if m.SearchCalls[0].URL != "https://example.com/privacy" { t.Errorf("SearchCalls[0].URL = %q, want %q", m.SearchCalls[0].URL, "https://example.com/privacy") } + if m.SearchCalls[0].ContentHash != "hash123" { + t.Errorf("SearchCalls[0].ContentHash = %q, want %q", m.SearchCalls[0].ContentHash, "hash123") + } } // TestMockRecordsMultipleCalls verifies that multiple calls are all recorded. 
@@ -160,8 +163,8 @@ func TestMockRecordsMultipleCalls(t *testing.T) { m := &MockVectorStore{} _ = m.Upsert(context.Background(), "col1", []Chunk{{ID: "a"}}) _ = m.Upsert(context.Background(), "col2", []Chunk{{ID: "b"}}) - _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5, "") - _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3, "") + _, _ = m.Search(context.Background(), "col1", []float32{0.1}, 5, "", "") + _, _ = m.Search(context.Background(), "col2", []float32{0.2}, 3, "", "") if len(m.UpsertCalls) != 2 { t.Errorf("UpsertCalls len = %d, want 2", len(m.UpsertCalls))