Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cotabby/Models/SuggestionEngineModels.swift
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,9 @@ struct DisabledApplicationRule: Codable, Equatable, Identifiable, Sendable {
enum AcceptanceGranularity: String, CaseIterable, Codable, Sendable {
/// One word (with the existing trailing-punctuation policy applied per chunk).
case word
/// Words accumulated until a phrase boundary or the tail runs out: a sentence terminator
/// (`.`, `!`, `?`, CJK `。` `!` `?` `。`, `\n`) or a CJK clause comma (`、` `,`), so space-less
/// scripts advance clause by clause instead of a whole sentence per press.
case phrase
}

Expand Down
14 changes: 11 additions & 3 deletions Cotabby/Support/SentenceBoundaryClassifier.swift
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,12 @@ enum SentenceBoundaryClassifier {
switch text[lastIndex] {
case "!", "?":
return true
// The shared CJK terminator set (see `Character.isCJKSentenceTerminator`): unambiguous, so
// terminal without the period disambiguation below. Without these a Japanese completion never
// registers a sentence end and generation always runs to the token budget, which is why CJK
// suggestions came out so long.
case let character where character.isCJKSentenceTerminator:
return true
case ".":
return isTerminalPeriod(in: text, at: lastIndex)
default:
Expand Down Expand Up @@ -97,10 +103,12 @@ enum SentenceBoundaryClassifier {

private extension Character {
    /// Closing punctuation that may follow a sentence terminator: straight and curly quotes,
    /// parentheses, square brackets, and braces, plus the shared CJK closer set (see
    /// `Character.isCJKClosingPunctuation`). `endsSentence` walks back past a run of these to find
    /// the real terminator underneath, so `"done."`, `(stop!)`, and `終わり。」` register as
    /// sentence ends.
    var isSentenceClosingPunctuation: Bool {
        // Each closer is compared exactly once; the curly quotes are U+201D and U+2019.
        self == "\"" || self == "'" || self == ")" || self == "]" || self == "}"
            || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation
    }
}
154 changes: 130 additions & 24 deletions Cotabby/Support/SuggestionSessionReconciler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,28 @@ enum SuggestionSessionReconciler {
if tokenStart < index,
remainingText[tokenStart].beginsSpacelessScriptWord,
let wordEnd = firstSegmentedWordEnd(in: remainingText, from: tokenStart, notPast: index) {
index = wordEnd
}

// Bind an immediately following CJK punctuation run to the word so one Tab accepts
// "読み、" as a unit. Without this the punctuation would lead the *next* token, and a
// punctuation-led token skips ICU segmentation entirely, so in flat text it would swallow
// everything up to the next whitespace in a single accept.
index = endOfCJKPunctuationRun(in: remainingText, from: wordEnd, notPast: index)
} else if tokenStart < index,
remainingText[tokenStart].bindsToPrecedingSpacelessWord
|| remainingText[tokenStart].isCJKOpeningBracket {
// A token can also begin with CJK punctuation: closers/commas when the previous chunk
// ended exactly at the word (a typed-through advance), and opening brackets always,
// because an opener belongs to the *next* word so the trailing-binding above never
// consumes it. Peel the punctuation run as its own chunk instead of falling through to
// the whitespace scan, which would swallow everything up to the next whitespace.
index = endOfCJKPunctuationRun(in: remainingText, from: tokenStart, notPast: index, includingOpeners: true)
}

// With trailing-punctuation auto-accept off, peel any trailing punctuation (including a CJK
// run just bound above) back off the chunk, so `資料、` accepts as `資料` and the comma waits
// for the next Tab. This intentionally overrides the binding for word granularity; the phrase
// walker re-accumulates the comma regardless, so phrase output is unchanged either way. A
// punctuation-only token survives whole because `wordEndTrimmingTrailingPunctuation` returns
// nil when there is no word character to trim back to, so the peeled chunk is never empty.
if !autoAcceptTrailingPunctuation,
let wordEnd = wordEndTrimmingTrailingPunctuation(in: remainingText, from: tokenStart, to: index) {
index = wordEnd
Expand Down Expand Up @@ -286,10 +305,35 @@ enum SuggestionSessionReconciler {
return min(wordEnd, limit)
}

/// Accepts a full phrase up to the next sentence terminator (`.`, `!`, `?`, `\n`) or the end
/// of the buffered suggestion tail. Composes over `nextAcceptanceChunk` so word-boundary,
/// internal-punctuation, and leading-whitespace policy stay identical across the seams of a
/// multi-word accept.
/// Scans forward from `start` over consecutive CJK punctuation and returns the index just past
/// that run, never advancing once `limit` is reached.
///
/// - Parameters:
///   - text: The string being scanned.
///   - start: Where the scan begins. Returned unchanged when the character there does not
///     qualify, so a caller using this as an extension step gets "no extension".
///   - limit: Upper bound for the scan.
///   - includingOpeners: When true, CJK opening brackets also count as part of the run. When
///     false (the default) the run stops in front of an opener.
/// - Returns: The first index in `start..<limit` whose character is outside the run, or the
///   index at which the scan met `limit`.
private static func endOfCJKPunctuationRun(
    in text: String,
    from start: String.Index,
    notPast limit: String.Index,
    includingOpeners: Bool = false
) -> String.Index {
    var runEnd = start
    while runEnd < limit {
        let candidate = text[runEnd]
        let extendsRun = candidate.bindsToPrecedingSpacelessWord
            || (includingOpeners && candidate.isCJKOpeningBracket)
        if !extendsRun {
            return runEnd
        }
        text.formIndex(after: &runEnd)
    }
    return runEnd
}

/// Accepts a full phrase up to the next phrase boundary or the end of the buffered suggestion
/// tail. Boundaries are sentence terminators (`.`, `!`, `?`, their CJK forms `。` `!` `?` `。`, `\n`)
/// and the CJK clause commas (`、` `,`), so Japanese/Chinese phrase accepts advance clause by
/// clause instead of swallowing a whole space-less sentence in one Tab. Composes over
/// `nextAcceptanceChunk` so word-boundary, internal-punctuation, and leading-whitespace policy
/// stay identical across the seams of a multi-word accept.
///
/// Newlines need an extra rule: `nextAcceptanceChunk` returns leading whitespace as part of
/// the next chunk, so a tail like `Hello\nworld` would surface `\n` as the leading character
Expand Down Expand Up @@ -339,19 +383,20 @@ enum SuggestionSessionReconciler {
accumulated += chunk
working = String(working.dropFirst(chunk.count))

if endsInSentenceTerminator(accumulated) {
if endsAtPhraseBoundary(accumulated) {
return accumulated
}
}

return accumulated
}

/// Tail-end check for sentence terminators that survives closing quotes and brackets, so
/// `"done."` and `(yes!)` are recognized as phrase ends even though their final character is
/// a closer rather than `.!?`. Walks back past any run of closing punctuation, then checks
/// whether the character immediately before that run is a sentence terminator.
private static func endsInSentenceTerminator(_ text: String) -> Bool {
/// Tail-end check for phrase boundaries that survives closing quotes and brackets, so
/// `"done."`, `(yes!)`, and `終わり。」` are recognized as phrase ends even though their final
/// character is a closer rather than the terminator itself. Walks back past any run of closing
/// punctuation, then checks whether the character immediately before that run ends a sentence or
/// a CJK clause.
private static func endsAtPhraseBoundary(_ text: String) -> Bool {
var index = text.endIndex
while index > text.startIndex {
let prev = text.index(before: index)
Expand All @@ -365,12 +410,20 @@ enum SuggestionSessionReconciler {
return false
}
let prev = text.index(before: index)
// The ideographic / fullwidth comma marks a clause boundary in CJK prose. Space-less scripts
// have no whitespace rhythm, so without this stop a Japanese phrase accept swallows an entire
// sentence in one Tab; with it, Tab advances clause by clause. ASCII "," is deliberately NOT
// a boundary, so English phrase cadence is unchanged.
if text[prev].isPhraseClauseBoundary {
return true
}
guard text[prev].isPhraseSentenceTerminator else {
return false
}
// `!` and `?` always end a sentence. A period is ambiguous: decimals, list/ordinal numbers,
// single-letter initials, and common abbreviations are not sentence ends, so consult the
// classifier rather than treating every "." as terminal.
// `!`/`?` and the CJK terminators always end a sentence. An ASCII period is ambiguous:
// decimals, list/ordinal numbers, single-letter initials, and common abbreviations are not
// sentence ends, so consult the classifier rather than treating every "." as terminal. The
// ideographic `。` has no such ambiguity (it never marks decimals or abbreviations).
if text[prev] == "." {
return SentenceBoundaryClassifier.isTerminalPeriod(in: text, at: prev)
}
Expand Down Expand Up @@ -509,6 +562,28 @@ private extension String {
}
}

/// Shared CJK punctuation predicates. They stay at the default (internal) access level because
/// they are the single source of truth for both this file's acceptance policy and
/// `SentenceBoundaryClassifier`'s sentence-end detection: adding a code point here updates phrase
/// boundaries, chunk binding, and the generation stop in one edit.
extension Character {
    /// True for the CJK sentence terminators: ideographic full stop `。` (U+3002), fullwidth
    /// `!` (U+FF01) and `?` (U+FF1F), and the halfwidth ideographic stop `。` (U+FF61). Unlike the
    /// ASCII period these never mark decimals, list numbers, or abbreviations, so consumers treat
    /// them as terminal without classifier disambiguation.
    var isCJKSentenceTerminator: Bool {
        switch self {
        case "\u{3002}", "\u{FF01}", "\u{FF1F}", "\u{FF61}":
            return true
        default:
            return false
        }
    }

    /// True for the CJK closing punctuation: corner brackets `」` (U+300D) and `』` (U+300F), the
    /// halfwidth corner `」` (U+FF63), fullwidth parenthesis `)` (U+FF09), lenticular bracket `】`
    /// (U+3011), and angle brackets `〉` (U+3009) and `》` (U+300B). Walk-backs skip a run of these
    /// to find the real terminator underneath, and chunk binding attaches them to the word they
    /// close.
    var isCJKClosingPunctuation: Bool {
        switch self {
        case "\u{300D}", "\u{300F}", "\u{FF09}", "\u{3011}", "\u{3009}", "\u{300B}", "\u{FF63}":
            return true
        default:
            return false
        }
    }
}

private extension Character {
/// True when the character begins a word of a space-less script (Han, Hiragana, Katakana, Hangul,
/// Thai, Lao, Khmer, Myanmar, ...). These scripts write words without separating spaces, so the
Expand Down Expand Up @@ -544,19 +619,50 @@ private extension Character {
isLetter || isNumber
}

/// Sentence-ending punctuation for phrase mode. `\n` is handled separately because it can
/// appear inside a leading-whitespace prefix of a composed chunk rather than at the chunk's
/// tail end.
/// True for the CJK opening brackets: corner brackets `「` (U+300C) and `『` (U+300E), the
/// halfwidth corner `「` (U+FF62), fullwidth parenthesis `(` (U+FF08), lenticular bracket `【`
/// (U+3010), and angle brackets `〈` (U+3008) and `《` (U+300A). An opener leads the word it
/// quotes, so the trailing-binding rule stops before it while the punctuation-led peel takes it;
/// without the peel a chunk starting at `「` would skip ICU segmentation and swallow the rest of a
/// flat quoted run to the next whitespace.
var isCJKOpeningBracket: Bool {
    switch self {
    case "\u{300C}", "\u{300E}", "\u{FF08}", "\u{3010}", "\u{3008}", "\u{300A}", "\u{FF62}":
        return true
    default:
        return false
    }
}

/// Sentence-ending punctuation for phrase mode, in both ASCII and CJK forms: `.` `!` `?` plus the
/// ideographic full stop `。` (U+3002), fullwidth `!` (U+FF01) and `?` (U+FF1F), and the
/// halfwidth ideographic stop `。` (U+FF61). `\n` is handled separately because it can appear
/// inside a leading-whitespace prefix of a composed chunk rather than at the chunk's tail end.
var isPhraseSentenceTerminator: Bool {
    // A single expression, so the getter's implicit return applies. The CJK forms come from the
    // shared `isCJKSentenceTerminator` set instead of being listed again here.
    self == "." || self == "!" || self == "?" || isCJKSentenceTerminator
}

/// Clause-boundary punctuation for phrase mode: the ideographic comma `、` (U+3001), its
/// halfwidth form `、` (U+FF64), and the fullwidth comma `,` (U+FF0C). CJK prose marks its
/// natural pause points with these rather than whitespace, so phrase acceptance treats them as
/// boundaries and advances clause by clause instead of swallowing a whole sentence per Tab.
/// ASCII "," is deliberately not a member, so space-delimited scripts never stop at a comma.
var isPhraseClauseBoundary: Bool {
    switch self {
    case "\u{3001}", "\u{FF0C}", "\u{FF64}":
        return true
    default:
        return false
    }
}

/// Closing punctuation that may follow a sentence terminator in prose: straight + curly
/// quotes, parentheses, square brackets, and braces, plus the CJK closers (corner brackets,
/// fullwidth parenthesis, lenticular and angle brackets). The phrase scanner walks back past a
/// run of these to find the real sentence terminator underneath, so `"done."` and `終わり。」`
/// stop as complete sentences even though their final character is the closer.
var isPhraseClosingPunctuation: Bool {
    // Each closer is compared exactly once; the curly quotes are U+201D and U+2019.
    self == "\"" || self == "'" || self == ")" || self == "]" || self == "}"
        || self == "\u{201D}" || self == "\u{2019}" || isCJKClosingPunctuation
}

/// True for CJK punctuation that attaches to the space-less word it follows when chunking an
/// accept: clause commas, CJK sentence terminators, and CJK closing brackets/quotes. One Tab
/// then accepts `読み、` as a unit, and a chunk never starts at a punctuation cliff that would
/// swallow the rest of the run. Opening brackets are left out because they belong to the next
/// word. The union is built only from the three CJK predicates below, none of which is
/// documented as containing ASCII punctuation.
var bindsToPrecedingSpacelessWord: Bool {
    if isPhraseClauseBoundary {
        return true
    }
    if isCJKSentenceTerminator {
        return true
    }
    return isCJKClosingPunctuation
}
Comment thread
greptile-apps[bot] marked this conversation as resolved.
}
26 changes: 26 additions & 0 deletions CotabbyTests/SentenceBoundaryClassifierTests.swift
Original file line number Diff line number Diff line change
Expand Up @@ -76,4 +76,30 @@ final class SentenceBoundaryClassifierTests: XCTestCase {
/// An empty string has no final character, so it cannot end a sentence.
func test_endsSentence_falseForEmptyString() {
    let emptyInput = ""
    XCTAssertFalse(SentenceBoundaryClassifier.endsSentence(emptyInput))
}

/// CJK terminators are unambiguous sentence ends. Without these the decode stop policy never
/// fires for Japanese/Chinese text and generation always runs to the token budget, which is why
/// CJK suggestions came out so long.
func test_endsSentence_trueForCJKTerminators() {
    // The terminators are written as Unicode escapes so the fixtures keep the exact CJK code
    // points (U+3002, U+FF01, U+FF1F) even if the file passes through a tool that folds
    // fullwidth forms to ASCII; an ASCII `!` or `?` here would not exercise the CJK set.
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("資料を読む\u{3002}"))
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("すごい\u{FF01}"))
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("いいですか\u{FF1F}"))
}

/// A closing corner bracket after `。` must not hide the terminator underneath it.
func test_endsSentence_walksPastCJKClosingPunctuation() {
    let quotedSentence = "終わり。」"
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence(quotedSentence))
}

/// Halfwidth kana punctuation (legacy SJIS contexts) terminates like its fullwidth counterparts,
/// including the walk past a halfwidth corner bracket.
func test_endsSentence_trueForHalfwidthTerminatorAndCloser() {
    // U+FF61 is the halfwidth ideographic full stop and U+FF63 the halfwidth closing corner
    // bracket. Escapes are used because the two are easy to confuse with (or be folded into)
    // their fullwidth forms U+3002 / U+300D, which other tests already cover.
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり\u{FF61}"))
    XCTAssertTrue(SentenceBoundaryClassifier.endsSentence("終わり\u{FF61}\u{FF63}"))
}

/// `、` marks a clause boundary rather than a sentence end, so generation should continue past
/// it and stop only at a real terminator.
func test_endsSentence_falseForIdeographicComma() {
    let clauseEndingInComma = "資料を読み、"
    XCTAssertFalse(SentenceBoundaryClassifier.endsSentence(clauseEndingInComma))
}
}
Loading