diff --git a/Sources/FetchKit/InMemoryFetchIndex.swift b/Sources/FetchKit/InMemoryFetchIndex.swift index 92b6213..42c9124 100644 --- a/Sources/FetchKit/InMemoryFetchIndex.swift +++ b/Sources/FetchKit/InMemoryFetchIndex.swift @@ -154,9 +154,11 @@ actor InMemoryFetchIndex: FetchIndex { lowercaseQuery: String, kind: FetchSearchKind ) -> SearchMatch? { + var seenTerms = Set<String>() let terms = lowercaseQuery .split(whereSeparator: \.isWhitespace) .map(String.init) + .filter { seenTerms.insert($0).inserted } guard !terms.isEmpty else { return nil } @@ -172,10 +174,74 @@ actor InMemoryFetchIndex: FetchIndex { return SearchMatch( field: field, text: text, - score: boostedScore(base: 0.8 + (0.02 * Double(terms.count)), field: field, kind: kind) + score: boostedScore( + base: allTermsScoreBase(for: terms, in: lowercaseText), + field: field, + kind: kind + ) ) } + private func allTermsScoreBase(for terms: [String], in lowercaseText: String) -> Double { + let base = 0.8 + (0.02 * Double(terms.count)) + guard let compactness = termCompactness(for: terms, in: lowercaseText) else { + return base + } + + return base + (0.12 * compactness) + } + + private func termCompactness(for terms: [String], in lowercaseText: String) -> Double? { + let locationsByTerm = terms.map { term in + termLocations(for: term, in: lowercaseText) + } + guard locationsByTerm.allSatisfy({ !$0.isEmpty }) else { + return nil + } + + let span = smallestCoveringSpan(in: locationsByTerm) + let totalTermLength = terms.reduce(0) { $0 + $1.count } + guard span > 0 else { + return nil + } + + return min(1.0, Double(totalTermLength) / Double(span)) + } + + private func termLocations(for term: String, in lowercaseText: String) -> [Int] { + var locations: [Int] = [] + var searchStart = lowercaseText.startIndex + + while searchStart < lowercaseText.endIndex, + let range = lowercaseText.range(of: term, range: searchStart..<lowercaseText.endIndex) { + locations.append(lowercaseText.distance(from: lowercaseText.startIndex, to: range.lowerBound)) + searchStart = range.upperBound + } + + return locations + } + + private func smallestCoveringSpan(in locationsByTerm: [[Int]]) -> 
Int { + var cursors = Array(repeating: 0, count: locationsByTerm.count) + var smallestSpan = Int.max + + while true { + let currentLocations = locationsByTerm.enumerated().map { termIndex, locations in + (termIndex: termIndex, location: locations[cursors[termIndex]]) + } + guard let minLocation = currentLocations.min(by: { $0.location < $1.location }), + let maxLocation = currentLocations.max(by: { $0.location < $1.location }) else { + return smallestSpan + } + + smallestSpan = min(smallestSpan, maxLocation.location - minLocation.location + 1) + cursors[minLocation.termIndex] += 1 + if cursors[minLocation.termIndex] >= locationsByTerm[minLocation.termIndex].count { + return smallestSpan + } + } + } + private func boostedScore( base: Double, field: FetchSearchField, diff --git a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift index 8d35a08..d8910e4 100644 --- a/Tests/FetchKitTests/FixtureCorpusQualityTests.swift +++ b/Tests/FetchKitTests/FixtureCorpusQualityTests.swift @@ -51,7 +51,10 @@ struct FixtureCorpusQualityTests { limit: 4 ) - #expect(foodStorageResults.map(\.document.id) == ["gutenberg-78430-chapter-1"]) + #expect(foodStorageResults.map(\.document.id).prefix(2) == [ + "gutenberg-78430-chapter-1", + "fixture-botany-near-miss", + ]) #expect(germinationResults.map(\.document.id) == ["gutenberg-78430-chapter-2"]) } @@ -75,6 +78,45 @@ struct FixtureCorpusQualityTests { #expect(!snippet.text.localizedCaseInsensitiveContains("Transcriber's Note")) } + @Test("Fixture corpus ranks focused body evidence over near misses") + func fixtureCorpusRanksFocusedBodyEvidenceOverNearMisses() async throws { + let library = try await indexedFixtureLibrary() + + let results = try await library.search( + "storage food seeds", + kind: .allTerms, + fields: [.body], + limit: 4 + ) + + #expect(results.map(\.document.id).prefix(2) == [ + "gutenberg-78430-chapter-1", + "fixture-botany-near-miss", + ]) + 
#expect(results.first?.snippet?.text.localizedCaseInsensitiveContains("storage of food in seeds") == true) + } + + @Test("Fixture corpus selects useful snippets from longer bodies") + func fixtureCorpusSelectsUsefulSnippetsFromLongerBodies() async throws { + let library = try await indexedFixtureLibrary() + + let results = try await library.search( + "pioneer chores neighbors cooperation", + kind: .allTerms, + fields: [.body], + limit: 4 + ) + let firstResult = try #require(results.first) + let snippet = try #require(firstResult.snippet) + + #expect(firstResult.document.id == "fixture-long-frontier-body") + #expect(snippet.text.localizedCaseInsensitiveContains("pioneer children")) + #expect(snippet.text.localizedCaseInsensitiveContains("cooperation")) + #expect(snippet.text.hasPrefix("…")) + #expect(snippet.text.hasSuffix("…")) + #expect(firstResult.snippetField == .body) + } + private func indexedFixtureLibrary() async throws -> FetchKitLibrary { let library = FetchKitLibrary() try await library.addDocuments(GutenbergMiniCorpus.records) diff --git a/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift b/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift index 46c6183..1dd572a 100644 --- a/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift +++ b/Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift @@ -90,5 +90,39 @@ enum GutenbergMiniCorpus { "fixture.gutenbergID": "78432", ] ), + FetchDocumentRecord( + id: "fixture-botany-near-miss", + title: "Botany Classroom Supply Notes", + body: """ + This note lists classroom supplies for a botany course: labels, trays, hand lenses, jars, and paper envelopes. It mentions seeds as specimens, food labels for classroom bins, and storage cabinets for materials, but it stays focused on supplies rather than seed structure. 
+ """, + kind: .note, + language: "en", + sourceURI: source.url, + metadata: [ + "fixture.dataset": source.datasetID, + "fixture.role": "near-miss", + "fixture.topic": "botany", + ] + ), + FetchDocumentRecord( + id: "fixture-long-frontier-body", + title: "Frontier Field Notes", + body: """ + Opening notes describe travel preparations, camp inventory, river crossings, and weather observations before the main subject appears. The early paragraphs are intentionally broad so snippet selection has to skip unhelpful front matter and move toward the useful passage. + + A later section focuses on pioneer children learning conduct of life through frontier chores, animal care, and cooperation with neighbors. The passage repeats pioneer children and frontier life together because those are the terms a reader would expect a useful search result to explain. + + Closing notes return to general scenery, wagon repairs, and family correspondence, giving the snippet builder material on both sides of the relevant section. 
+ """, + kind: .note, + language: "en", + sourceURI: source.url, + metadata: [ + "fixture.dataset": source.datasetID, + "fixture.role": "long-body", + "fixture.topic": "frontier", + ] + ), ] } diff --git a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift index 6977ef3..371ac93 100644 --- a/Tests/FetchKitTests/SearchKitFetchIndexTests.swift +++ b/Tests/FetchKitTests/SearchKitFetchIndexTests.swift @@ -237,6 +237,41 @@ final class SearchKitFetchIndexTests: XCTestCase { XCTAssertEqual(titleResults.first?.snippet?.text.localizedCaseInsensitiveContains("Transcriber's Note"), false) } + func testSearchKitFetchIndexMatchesFixtureCorpusNearMissAndLongBodyBehavior() async throws { + let index = try SearchKitFetchIndex( + configuration: .init( + storage: .inMemory, + indexNamePrefix: "SearchKitFetchIndexTests-\(UUID().uuidString)" + ) + ) + + try await index.apply( + FetchIndexingChangeset( + GutenbergMiniCorpus.records.map { .upsert($0.indexDocument) } + ) + ) + + let nearMissResults = try await index.search( + FetchSearchQuery("storage food seeds", kind: .allTerms, fields: [.body], limit: 4) + ) + let longBodyResults = try await index.search( + FetchSearchQuery("pioneer chores neighbors cooperation", kind: .allTerms, fields: [.body], limit: 4) + ) + + XCTAssertEqual(nearMissResults.map(\.document.id).prefix(2), [ + "gutenberg-78430-chapter-1", + "fixture-botany-near-miss", + ]) + XCTAssertEqual(nearMissResults.first?.snippet?.text.localizedCaseInsensitiveContains("storage of food in seeds"), true) + + XCTAssertEqual(longBodyResults.first?.document.id, "fixture-long-frontier-body") + XCTAssertEqual(longBodyResults.first?.snippet?.text.localizedCaseInsensitiveContains("pioneer children"), true) + XCTAssertEqual(longBodyResults.first?.snippet?.text.localizedCaseInsensitiveContains("cooperation"), true) + XCTAssertEqual(longBodyResults.first?.snippet?.text.hasPrefix("…"), true) + 
XCTAssertEqual(longBodyResults.first?.snippet?.text.hasSuffix("…"), true) + XCTAssertEqual(longBodyResults.first?.snippetField, .body) + } + func testFetchKitLibraryBuildsPersistentPair() async throws { let temporaryDirectory = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true) .appendingPathComponent(UUID().uuidString, isDirectory: true) diff --git a/docs/maintainers/fixture-corpus.md b/docs/maintainers/fixture-corpus.md index 7c6ae15..334fb03 100644 --- a/docs/maintainers/fixture-corpus.md +++ b/docs/maintainers/fixture-corpus.md @@ -18,7 +18,7 @@ Why this source fits the first pass: - the `chapters` config has chapter titles and chapter text, which is a useful shape for document-search quality tests - the corpus can be inspected through the Hugging Face Dataset Viewer APIs without adding a Swift dependency -The fixture records live in `Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift`. Each record carries dataset, config, split, row, and Gutenberg ID metadata so the sample remains attributable and replaceable. +The fixture records live in `Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift`. Each source-derived record carries dataset, config, split, row, and Gutenberg ID metadata so the sample remains attributable and replaceable. The fixture also includes small synthetic near-miss and longer-body records derived from the same topic shape. Those synthetic records exist to stress ranking and snippet selection without expanding the checked-in corpus into a large text dump. 
## Result Evidence Policy @@ -57,4 +57,5 @@ Use this fixture to keep the settled Milestone 4 result-evidence behavior honest - whether the current ranking and snippet heuristics are enough for ordinary app callers - whether a larger fixture corpus exposes ranking or snippet gaps that the mini corpus cannot show +- whether near-miss records and longer-body records keep ranking and snippet behavior aligned between the default in-memory path and the SearchKit-backed path - whether future extended snippets should be backed by precomputed summaries for larger documents rather than by foreground search-time work