Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
68 changes: 67 additions & 1 deletion Sources/FetchKit/InMemoryFetchIndex.swift
Original file line number Diff line number Diff line change
Expand Up @@ -154,9 +154,11 @@ actor InMemoryFetchIndex: FetchIndex {
lowercaseQuery: String,
kind: FetchSearchKind
) -> SearchMatch? {
var seenTerms = Set<String>()
let terms = lowercaseQuery
.split(whereSeparator: \.isWhitespace)
.map(String.init)
.filter { seenTerms.insert($0).inserted }
guard !terms.isEmpty else {
return nil
}
Expand All @@ -172,10 +174,74 @@ actor InMemoryFetchIndex: FetchIndex {
return SearchMatch(
field: field,
text: text,
score: boostedScore(base: 0.8 + (0.02 * Double(terms.count)), field: field, kind: kind)
score: boostedScore(
base: allTermsScoreBase(for: terms, in: lowercaseText),
field: field,
kind: kind
)
)
}

/// Computes the pre-boost score for a text that matched every query term.
///
/// The score starts at `0.8` and grows by `0.02` per term. When a compactness
/// value is available for the terms in `lowercaseText`, it is scaled by `0.12`
/// and added on top.
///
/// - Parameters:
///   - terms: The query terms that were matched.
///   - lowercaseText: The text the terms were matched against.
/// - Returns: The base score to hand to the field/kind boost.
private func allTermsScoreBase(for terms: [String], in lowercaseText: String) -> Double {
    let termCountScore = 0.8 + (0.02 * Double(terms.count))

    if let compactness = termCompactness(for: terms, in: lowercaseText) {
        // Tightly clustered terms earn a proximity bonus.
        return termCountScore + (0.12 * compactness)
    }

    // No usable compactness measurement: fall back to the term-count score alone.
    return termCountScore
}

/// Measures how tightly the query terms cluster inside `lowercaseText`.
///
/// The value is the combined character length of all terms divided by the
/// smallest span that covers one location of every term, capped at `1.0`.
///
/// - Parameters:
///   - terms: The query terms to locate.
///   - lowercaseText: The text to search.
/// - Returns: The capped ratio, or `nil` when some term has no location in the
///   text or the covering span is not positive.
private func termCompactness(for terms: [String], in lowercaseText: String) -> Double? {
    let locationsByTerm = terms.map { termLocations(for: $0, in: lowercaseText) }
    if locationsByTerm.contains(where: \.isEmpty) {
        return nil
    }

    let span = smallestCoveringSpan(in: locationsByTerm)
    let combinedTermLength = terms.map(\.count).reduce(0, +)
    if span <= 0 {
        return nil
    }

    let ratio = Double(combinedTermLength) / Double(span)
    return min(1.0, ratio)
}

/// Finds every non-overlapping occurrence of `term` in `lowercaseText`.
///
/// The text is scanned left to right, and each search resumes at the end of the
/// previous match, so matches never overlap.
///
/// - Parameters:
///   - term: The term to look for.
///   - lowercaseText: The text to search.
/// - Returns: The `Character` offset of the start of each match, measured from
///   the start of the text, in ascending order. Empty when there is no match.
private func termLocations(for term: String, in lowercaseText: String) -> [Int] {
    var locations: [Int] = []
    var searchStart = lowercaseText.startIndex
    // Character offset of `searchStart`, kept up to date incrementally.
    // `String.distance(from:to:)` walks the characters between its two indices,
    // so measuring every hit from `startIndex` re-walks the whole prefix and
    // becomes quadratic when a common term appears many times in a long text.
    // Measuring only the stretch since the previous match keeps the total walk
    // proportional to the text length.
    var searchStartOffset = 0

    while searchStart < lowercaseText.endIndex,
          let range = lowercaseText.range(of: term, range: searchStart..<lowercaseText.endIndex) {
        let matchOffset = searchStartOffset
            + lowercaseText.distance(from: searchStart, to: range.lowerBound)
        locations.append(matchOffset)

        // Advance past this match, carrying the offset forward with the index.
        searchStartOffset = matchOffset
            + lowercaseText.distance(from: range.lowerBound, to: range.upperBound)
        searchStart = range.upperBound
    }

    return locations
}

/// Finds the width of the narrowest window that contains at least one location
/// from every list in `locationsByTerm`.
///
/// A cursor is kept per list. On each step the window spanned by the current
/// cursor positions is measured, then the cursor sitting on the smallest
/// location is advanced. The scan stops as soon as that cursor runs off the
/// end of its list.
///
/// - Parameter locationsByTerm: One list of locations per term. The sweep
///   assumes each list is in ascending order — TODO confirm for any new caller.
/// - Returns: `max - min + 1` for the narrowest window found, or `Int.max` when
///   there are no lists or any list is empty (no window can cover every term).
private func smallestCoveringSpan(in locationsByTerm: [[Int]]) -> Int {
    // Without this guard an empty inner list would trap on the subscript below.
    guard !locationsByTerm.isEmpty, locationsByTerm.allSatisfy({ !$0.isEmpty }) else {
        return Int.max
    }

    var cursors = Array(repeating: 0, count: locationsByTerm.count)
    var smallestSpan = Int.max

    while true {
        // Single pass over the current cursor positions. The strict `<` keeps
        // the first list among ties as the one that gets advanced.
        var minTermIndex = 0
        var minLocation = locationsByTerm[0][cursors[0]]
        var maxLocation = minLocation
        for termIndex in locationsByTerm.indices.dropFirst() {
            let location = locationsByTerm[termIndex][cursors[termIndex]]
            if location < minLocation {
                minLocation = location
                minTermIndex = termIndex
            }
            if location > maxLocation {
                maxLocation = location
            }
        }

        smallestSpan = min(smallestSpan, maxLocation - minLocation + 1)

        // Only moving the left edge can shrink the window; once that list is
        // exhausted no further window covers every term.
        cursors[minTermIndex] += 1
        if cursors[minTermIndex] >= locationsByTerm[minTermIndex].count {
            return smallestSpan
        }
    }
}

private func boostedScore(
base: Double,
field: FetchSearchField,
Expand Down
44 changes: 43 additions & 1 deletion Tests/FetchKitTests/FixtureCorpusQualityTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,10 @@ struct FixtureCorpusQualityTests {
limit: 4
)

#expect(foodStorageResults.map(\.document.id) == ["gutenberg-78430-chapter-1"])
#expect(foodStorageResults.map(\.document.id).prefix(2) == [
"gutenberg-78430-chapter-1",
"fixture-botany-near-miss",
])
#expect(germinationResults.map(\.document.id) == ["gutenberg-78430-chapter-2"])
}

Expand All @@ -75,6 +78,45 @@ struct FixtureCorpusQualityTests {
#expect(!snippet.text.localizedCaseInsensitiveContains("Transcriber's Note"))
}

/// Searches the fixture corpus bodies for all of "storage food seeds" and checks
/// that the focused chapter is listed ahead of the near-miss supply note, with
/// the top snippet containing the focused phrase.
@Test("Fixture corpus ranks focused body evidence over near misses")
func fixtureCorpusRanksFocusedBodyEvidenceOverNearMisses() async throws {
    let library = try await indexedFixtureLibrary()
    let results = try await library.search("storage food seeds", kind: .allTerms, fields: [.body], limit: 4)

    // Only the two leading positions are pinned; later results are left unconstrained.
    let topTwoIDs = results.map(\.document.id).prefix(2)
    let leadingSnippetText = results.first?.snippet?.text

    #expect(topTwoIDs == ["gutenberg-78430-chapter-1", "fixture-botany-near-miss"])
    #expect(leadingSnippetText?.localizedCaseInsensitiveContains("storage of food in seeds") == true)
}

/// Searches the fixture corpus bodies for all of "pioneer chores neighbors
/// cooperation" and checks that the long frontier record comes first with a
/// body snippet that contains the relevant passage and starts and ends with "…".
@Test("Fixture corpus selects useful snippets from longer bodies")
func fixtureCorpusSelectsUsefulSnippetsFromLongerBodies() async throws {
    let library = try await indexedFixtureLibrary()
    let results = try await library.search(
        "pioneer chores neighbors cooperation", kind: .allTerms, fields: [.body], limit: 4
    )

    // Both requirements must hold before any snippet content can be inspected.
    let topHit = try #require(results.first)
    let topSnippet = try #require(topHit.snippet)
    let snippetText = topSnippet.text

    #expect(topHit.document.id == "fixture-long-frontier-body")
    #expect(snippetText.localizedCaseInsensitiveContains("pioneer children"))
    #expect(snippetText.localizedCaseInsensitiveContains("cooperation"))
    #expect(snippetText.hasPrefix("…"))
    #expect(snippetText.hasSuffix("…"))
    #expect(topHit.snippetField == .body)
}

private func indexedFixtureLibrary() async throws -> FetchKitLibrary {
let library = FetchKitLibrary()
try await library.addDocuments(GutenbergMiniCorpus.records)
Expand Down
34 changes: 34 additions & 0 deletions Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift
Original file line number Diff line number Diff line change
Expand Up @@ -90,5 +90,39 @@ enum GutenbergMiniCorpus {
"fixture.gutenbergID": "78432",
]
),
FetchDocumentRecord(
id: "fixture-botany-near-miss",
title: "Botany Classroom Supply Notes",
body: """
This note lists classroom supplies for a botany course: labels, trays, hand lenses, jars, and paper envelopes. It mentions seeds as specimens, food labels for classroom bins, and storage cabinets for materials, but it stays focused on supplies rather than seed structure.
""",
kind: .note,
language: "en",
sourceURI: source.url,
metadata: [
"fixture.dataset": source.datasetID,
"fixture.role": "near-miss",
"fixture.topic": "botany",
]
),
FetchDocumentRecord(
id: "fixture-long-frontier-body",
title: "Frontier Field Notes",
body: """
Opening notes describe travel preparations, camp inventory, river crossings, and weather observations before the main subject appears. The early paragraphs are intentionally broad so snippet selection has to skip unhelpful front matter and move toward the useful passage.

A later section focuses on pioneer children learning conduct of life through frontier chores, animal care, and cooperation with neighbors. The passage repeats pioneer children and frontier life together because those are the terms a reader would expect a useful search result to explain.

Closing notes return to general scenery, wagon repairs, and family correspondence, giving the snippet builder material on both sides of the relevant section.
""",
kind: .note,
language: "en",
sourceURI: source.url,
metadata: [
"fixture.dataset": source.datasetID,
"fixture.role": "long-body",
"fixture.topic": "frontier",
]
),
]
}
35 changes: 35 additions & 0 deletions Tests/FetchKitTests/SearchKitFetchIndexTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -237,6 +237,41 @@ final class SearchKitFetchIndexTests: XCTestCase {
XCTAssertEqual(titleResults.first?.snippet?.text.localizedCaseInsensitiveContains("Transcriber's Note"), false)
}

/// Indexes the mini corpus into an in-memory `SearchKitFetchIndex` and runs the
/// same near-miss and long-body `.allTerms` body queries that the fixture corpus
/// quality tests use, asserting the same ordering and snippet expectations.
func testSearchKitFetchIndexMatchesFixtureCorpusNearMissAndLongBodyBehavior() async throws {
    let index = try SearchKitFetchIndex(
        configuration: .init(
            storage: .inMemory,
            indexNamePrefix: "SearchKitFetchIndexTests-\(UUID().uuidString)"
        )
    )
    try await index.apply(
        FetchIndexingChangeset(
            GutenbergMiniCorpus.records.map { .upsert($0.indexDocument) }
        )
    )

    let nearMissQuery = FetchSearchQuery("storage food seeds", kind: .allTerms, fields: [.body], limit: 4)
    let nearMissResults = try await index.search(nearMissQuery)

    let longBodyQuery = FetchSearchQuery(
        "pioneer chores neighbors cooperation", kind: .allTerms, fields: [.body], limit: 4
    )
    let longBodyResults = try await index.search(longBodyQuery)

    // Near-miss query: the focused chapter must lead, followed by the supply note.
    let nearMissTopTwoIDs = nearMissResults.map(\.document.id).prefix(2)
    let nearMissSnippetText = nearMissResults.first?.snippet?.text
    XCTAssertEqual(nearMissTopTwoIDs, ["gutenberg-78430-chapter-1", "fixture-botany-near-miss"])
    XCTAssertEqual(nearMissSnippetText?.localizedCaseInsensitiveContains("storage of food in seeds"), true)

    // Long-body query: the top snippet must come from the body and be elided on both sides.
    let longBodyTopHit = longBodyResults.first
    let longBodySnippetText = longBodyTopHit?.snippet?.text
    XCTAssertEqual(longBodyTopHit?.document.id, "fixture-long-frontier-body")
    XCTAssertEqual(longBodySnippetText?.localizedCaseInsensitiveContains("pioneer children"), true)
    XCTAssertEqual(longBodySnippetText?.localizedCaseInsensitiveContains("cooperation"), true)
    XCTAssertEqual(longBodySnippetText?.hasPrefix("…"), true)
    XCTAssertEqual(longBodySnippetText?.hasSuffix("…"), true)
    XCTAssertEqual(longBodyTopHit?.snippetField, .body)
}

func testFetchKitLibraryBuildsPersistentPair() async throws {
let temporaryDirectory = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true)
.appendingPathComponent(UUID().uuidString, isDirectory: true)
Expand Down
3 changes: 2 additions & 1 deletion docs/maintainers/fixture-corpus.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ Why this source fits the first pass:
- the `chapters` config has chapter titles and chapter text, which is a useful shape for document-search quality tests
- the corpus can be inspected through the Hugging Face Dataset Viewer APIs without adding a Swift dependency

The fixture records live in `Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift`. Each record carries dataset, config, split, row, and Gutenberg ID metadata so the sample remains attributable and replaceable.
The fixture records live in `Tests/FetchKitTests/Fixtures/GutenbergMiniCorpus.swift`. Each source-derived record carries dataset, config, split, row, and Gutenberg ID metadata so the sample remains attributable and replaceable. The fixture also includes small synthetic near-miss and longer-body records derived from the same topic shape. Those synthetic records exist to stress ranking and snippet selection without expanding the checked-in corpus into a large text dump.

## Result Evidence Policy

Expand Down Expand Up @@ -57,4 +57,5 @@ Use this fixture to keep the settled Milestone 4 result-evidence behavior honest

- whether the current ranking and snippet heuristics are enough for ordinary app callers
- whether a larger fixture corpus exposes ranking or snippet gaps that the mini corpus cannot show
- whether near-miss records and longer-body records keep ranking and snippet behavior aligned between the default in-memory path and the SearchKit-backed path
- whether future extended snippets should be backed by precomputed summaries for larger documents rather than by foreground search-time work
Loading