From dac0ddfac32004cd24bc411605b86b37e13288f6 Mon Sep 17 00:00:00 2001 From: David Yonge-Mallo Date: Wed, 6 May 2026 17:39:54 +0200 Subject: [PATCH] fix(inline): coalesce subword splits inside long hex literals --- lua/diffview/scene/inline_diff.lua | 216 ++++++++++++++---- .../tests/functional/inline_diff_spec.lua | 113 ++++++++- 2 files changed, 282 insertions(+), 47 deletions(-) diff --git a/lua/diffview/scene/inline_diff.lua b/lua/diffview/scene/inline_diff.lua index 04d24fd5..4373a8cf 100644 --- a/lua/diffview/scene/inline_diff.lua +++ b/lua/diffview/scene/inline_diff.lua @@ -156,23 +156,13 @@ local function is_word_token(s) end -- Classify a UTF-8 character into a subword class for `tokenize`: --- "lower" (ASCII a-z, plus any non-ASCII byte — see below), "upper" --- (ASCII A-Z), "digit" (ASCII 0-9), "under" (`_`), or `nil` for ASCII --- non-word characters. --- --- ASCII-only classification by design: every multi-byte character and --- every stray high byte from malformed UTF-8 falls into "lower". This --- keeps multi-byte runs joined with adjacent ASCII lowercase rather --- than splitting per-character, and avoids carrying a Unicode category --- table in pure Lua. Known trade-offs that follow from this: --- * Accented uppercase (`Ü`, `É`) does not trigger a lower→upper --- split, so `fooÜber` stays one token. --- * Unicode punctuation/symbols (`—`, emoji) are not treated as --- non-word and stay inside word tokens rather than splitting them. --- ASCII camelCase/PascalCase/snake_case/digit splits — the common case --- in source code — work as expected. Stray high bytes share the --- "lower" bucket so classification stays consistent with --- `is_word_byte`, which treats them as word bytes. +-- "lower" (ASCII a-z), "upper" (ASCII A-Z), "digit" (ASCII 0-9), +-- "under" (`_`), or `nil` for ASCII non-word characters. Multi-byte +-- characters and stray high bytes from malformed UTF-8 bucket as +-- "lower" so non-Latin runs join adjacent ASCII lowercase. Known +-- consequences: accented uppercase (`Ü`, `É`) doesn't trigger a +-- lower→upper split, and Unicode punctuation/symbols stay inside word +-- tokens rather than splitting them. ---@param ch string ---@return "lower"|"upper"|"digit"|"under"|nil local function subword_class(ch) @@ -201,33 +191,172 @@ local function subword_class(ch) return nil end --- Tokenize `s` for subword-level intraline diffing. Splits both on --- ASCII non-word characters AND on subword boundaries within word-char --- runs: camelCase (`fooBar` → `foo`, `Bar`), acronym→word --- (`XMLParser` → `XML`, `Parser`), digit↔letter (`error123abc` → --- `error`, `123`, `abc`), and underscore (`audio_preservation` → --- `audio`, `_`, `preservation`). Each ASCII non-word character becomes --- its own token; each subword run becomes one token. Multi-byte chars --- bucket as lowercase letters; see `subword_class` for the trade-offs. --- Returns the token list and a parallel byte-range map. +-- Minimum length for `coalesce_hex_runs` to consider folding subword +-- splits back into a single hex-literal token. Sized to catch full +-- SHAs and longer abbreviations without lumping in short identifier +-- suffixes; 7-char abbreviations stay subword-split. +local HEX_TOKEN_MIN_LEN = 8 + +-- Upper bound on the longest contiguous letter run permitted in a +-- hex-literal candidate. Random hex averages a max letter run of ~3; +-- pseudo-hex words like `cafe`, `face`, `dead`, `cafef00d`, `decade` +-- exceed the threshold. Hashes that roll a 5+ letter streak skip the +-- coalesce and fall back to the existing `INTRALINE_MAX_HUNKS` gate. +local HEX_TOKEN_MAX_LETTER_RUN = 4 + +-- True when the concatenation of `tokens[i..j]` (with combined byte +-- length `total_len`) looks like a hash or numeric literal rather +-- than a coincidentally-all-hex identifier. Combines four cheap +-- signals in one O(n) pass over the token slice's bytes: +-- * length >= `HEX_TOKEN_MIN_LEN`, +-- * single-case hex (every letter in `[a-f]` or every letter in +-- `[A-F]`), +-- * digit/letter transitions >= `ceil(length / 4)` (rejects +-- word-prefixed candidates like `decade1234567`), +-- * max contiguous letter run <= `HEX_TOKEN_MAX_LETTER_RUN` +-- (rejects dictionary-shaped runs like `cafef00d1234`). +-- Walking the slice directly lets `coalesce_hex_runs` decide before +-- allocating a merged string, so a non-hex word run (e.g. most +-- camelCase identifiers) bails on its first non-[0-9A-Fa-f] byte. +---@param tokens string[] +---@param i integer +---@param j integer +---@param total_len integer +---@return boolean +local function is_hex_run_in_tokens(tokens, i, j, total_len) + if total_len < HEX_TOKEN_MIN_LEN then + return false + end + local has_lo, has_up = false, false + local prev_class = 0 -- 0 = none, 1 = digit, 2 = letter. + local transitions = 0 + local cur_letter_run, max_letter_run = 0, 0 + for k = i, j do + local t = tokens[k] + for p = 1, #t do + local b = t:byte(p) + local class + if b >= 0x30 and b <= 0x39 then + class = 1 + cur_letter_run = 0 + elseif b >= 0x61 and b <= 0x66 then + class = 2 + has_lo = true + cur_letter_run = cur_letter_run + 1 + if cur_letter_run > max_letter_run then + max_letter_run = cur_letter_run + end + elseif b >= 0x41 and b <= 0x46 then + class = 2 + has_up = true + cur_letter_run = cur_letter_run + 1 + if cur_letter_run > max_letter_run then + max_letter_run = cur_letter_run + end + else + return false + end + if prev_class ~= 0 and prev_class ~= class then + transitions = transitions + 1 + end + prev_class = class + end + end + if has_lo and has_up then + return false + end + if max_letter_run > HEX_TOKEN_MAX_LETTER_RUN then + return false + end + if transitions < math.ceil(total_len / 4) then + return false + end + return true +end + +-- Thin string wrapper around `is_hex_run_in_tokens`. The production +-- path (`coalesce_hex_runs`) drives the token-slice variant directly; +-- this wrapper exists so the predicate stays exercisable from tests +-- with a single string input. +---@param s string +---@return boolean +local function is_hex_run(s) + return is_hex_run_in_tokens({ s }, 1, 1, #s) +end + +-- Walk a fresh `tokenize` token list and merge runs of byte-adjacent +-- word tokens whose joined string passes `is_hex_run_in_tokens`. A +-- 40-char hash that would otherwise fragment into ~25 alternating +-- digit/letter subwords folds back into one token, so a 1:1 hash +-- replacement renders as a whole-token swap instead of slipping under +-- `INTRALINE_MAX_HUNKS` from coincidental subword matches between two +-- unrelated hashes. -- --- Per-character tokenization was tried first, but `vim.diff --minimal` --- matches coincidental letters between dissimilar lines (e.g. --- "something." and "any tracked metric." share 't', 'e', 'm', 'i', '.') --- and splits the diff into small hunks. Rendered in overleaf style those --- fragments interleave deleted and inserted text into unreadable --- character-level noise. +-- The walk is bounded by byte adjacency so it can't span a non-word +-- char: `"v1.0-1c9dfb..."` keeps `v`, dots, and dashes as natural +-- boundaries. Within one source word run the walk is all-or-nothing. +---@param tokens string[] +---@param byte_map { byte: integer, byte_len: integer }[] +---@return string[] +---@return { byte: integer, byte_len: integer }[] +local function coalesce_hex_runs(tokens, byte_map) + local out, out_map = {}, {} + local i = 1 + while i <= #tokens do + if not is_word_token(tokens[i]) then + out[#out + 1] = tokens[i] + out_map[#out_map + 1] = byte_map[i] + i = i + 1 + else + local j = i + while + j + 1 <= #tokens + and is_word_token(tokens[j + 1]) + and byte_map[j + 1].byte == byte_map[j].byte + byte_map[j].byte_len + do + j = j + 1 + end + + if j > i then + local merged_len = byte_map[j].byte + byte_map[j].byte_len - byte_map[i].byte + if is_hex_run_in_tokens(tokens, i, j, merged_len) then + out[#out + 1] = table.concat(tokens, "", i, j) + out_map[#out_map + 1] = { + byte = byte_map[i].byte, + byte_len = merged_len, + } + else + for k = i, j do + out[#out + 1] = tokens[k] + out_map[#out_map + 1] = byte_map[k] + end + end + else + out[#out + 1] = tokens[i] + out_map[#out_map + 1] = byte_map[i] + end + i = j + 1 + end + end + return out, out_map +end + +-- Tokenize `s` for intraline diffing. Splits at ASCII non-word +-- characters and at subword boundaries within word-char runs: +-- camelCase (`fooBar` → `foo`, `Bar`), acronym→word (`XMLParser` → +-- `XML`, `Parser`), digit↔letter (`error123abc` → `error`, `123`, +-- `abc`), and underscore (`audio_preservation` → `audio`, `_`, +-- `preservation`). Each non-word char becomes its own token; each +-- subword run becomes one token. Multi-byte chars bucket as +-- lowercase; see `subword_class`. Returns the token list and a +-- parallel byte-range map. -- --- Whole-identifier word tokens fixed that but introduced a second --- failure: a 1:1 token replacement between unrelated identifiers sharing --- only a structural prefix (`EventStateOpen` / `EventStateClose`) --- admitted char-level refinement on the prefix-overlap gate, then --- `vim.diff` latched onto a coincidental letter inside the differing --- tails (`Open` / `Close` share an `e`) and produced interleaved --- per-char noise. Subword tokens close the gap: the divergent subword --- now pairs directly with its replacement, the resulting char-level --- diff has no shared prefix/suffix to admit refinement, and the hunk --- renders as a clean whole-token replacement. +-- Subword granularity keeps `vim.diff --minimal` from latching onto +-- coincidental letters in dissimilar lines or in the divergent tails +-- of structurally similar identifiers, both of which would render as +-- per-char noise in overleaf style. A post-pass (`coalesce_hex_runs`) +-- folds the splits back into one token for long hash-like hex runs +-- so git SHAs and similar literals stay atomic. ---@param s string ---@return string[] tokens ---@return { byte: integer, byte_len: integer }[] byte_map @@ -298,7 +427,7 @@ local function tokenize(s) end flush() - return tokens, byte_map + return coalesce_hex_runs(tokens, byte_map) end -- Decompose `s` into UTF-8 characters with byte offsets. Used to refine a @@ -1713,6 +1842,7 @@ M._test = { is_word_char = is_word_char, is_word_token = is_word_token, subword_class = subword_class, + is_hex_run = is_hex_run, tokenize = tokenize, split_chars = split_chars, diff_units = diff_units, diff --git a/lua/diffview/tests/functional/inline_diff_spec.lua b/lua/diffview/tests/functional/inline_diff_spec.lua index 84d79d4a..4c671b68 100644 --- a/lua/diffview/tests/functional/inline_diff_spec.lua +++ b/lua/diffview/tests/functional/inline_diff_spec.lua @@ -771,6 +771,45 @@ describe("diffview.scene.inline_diff", function() ) end) + it("tokenize coalesces a long single-case hex hash into one token", function() + -- 40-char SHA-1, all-lowercase hex with mixed digit/letter runs: + -- without the post-pass the run subword-splits into ~25 tokens, + -- which fragment the diff against an unrelated hash and admit + -- coincidental subword matches under `INTRALINE_MAX_HUNKS`. + local s = "1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8" + local tokens, m = h.tokenize(s) + assert.are.same({ s }, tokens) + assert.are.equal(0, m[1].byte) + assert.are.equal(40, m[1].byte_len) + end) + + it("tokenize coalesces uppercase hex with alternating digits too", function() + -- All-uppercase run with frequent digit↔letter alternation passes + -- the same predicate. + assert.are.same({ "AB12CD34EF56AB78" }, (h.tokenize("AB12CD34EF56AB78"))) + end) + + it("tokenize leaves short hex-like runs subword-split", function() + -- 7 chars, below `HEX_TOKEN_MIN_LEN`: stays subword-split so a + -- short identifier suffix isn't collapsed into one opaque token. + assert.are.same({ "abc", "1234" }, (h.tokenize("abc1234"))) + end) + + it("tokenize rejects pseudo-hex with low transition density", function() + -- 1-2 transitions across 12-13 chars sit below `ceil(len/4)`, so + -- word-prefixed patterns keep their split and `decade` / + -- `1234567` stay diffable as semantic units. + assert.are.same({ "decade", "1234567" }, (h.tokenize("decade1234567"))) + assert.are.same({ "cafebabe", "1234" }, (h.tokenize("cafebabe1234"))) + assert.are.same({ "face", "1234", "abcd" }, (h.tokenize("face1234abcd"))) + end) + + it("tokenize rejects pseudo-hex with a long letter run", function() + -- 3 transitions hits `ceil(12/4)=3`, but `cafef` letter run = 5 + -- exceeds `HEX_TOKEN_MAX_LETTER_RUN`, so the predicate rejects. + assert.are.same({ "cafef", "00", "d", "1234" }, (h.tokenize("cafef00d1234"))) + end) + it("subword_class buckets ASCII and multi-byte chars correctly", function() assert.are.equal("lower", h.subword_class("a")) assert.are.equal("upper", h.subword_class("Z")) @@ -795,15 +834,51 @@ describe("diffview.scene.inline_diff", function() assert.is_false(h.is_word_token("")) end) + it("is_hex_run accepts long single-case hex with high transition density", function() + assert.is_true(h.is_hex_run("1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8")) + assert.is_true(h.is_hex_run("AB12CD34EF56AB78")) + end) + + it("is_hex_run rejects strings shorter than HEX_TOKEN_MIN_LEN", function() + assert.is_false(h.is_hex_run("abc1234")) + assert.is_false(h.is_hex_run("")) + end) + + it("is_hex_run rejects mixed-case hex", function() + -- `tokenize`'s upper→lower split already separates the cases, so a + -- mixed-case merge candidate never reaches `is_hex_run` in practice; + -- the predicate rejects defensively for correctness. + assert.is_false(h.is_hex_run("AbCd1234")) + end) + + it("is_hex_run rejects strings with non-hex chars", function() + assert.is_false(h.is_hex_run("hello123abc")) + assert.is_false(h.is_hex_run("1234567890_")) + end) + + it("is_hex_run rejects low transition density", function() + assert.is_false(h.is_hex_run("decade1234567")) + assert.is_false(h.is_hex_run("cafebabe1234")) + end) + + it("is_hex_run rejects long letter runs", function() + -- `cafef` = 5-char letter run, exceeds `HEX_TOKEN_MAX_LETTER_RUN`. + assert.is_false(h.is_hex_run("cafef00d1234")) + end) + + it("is_hex_run rejects pure-digit and pure-letter runs", function() + -- These can't reach the predicate from `coalesce_hex_runs` (their + -- source word run is one subword token), but rejecting them on + -- their own merits keeps the contract sharp: zero transitions. + assert.is_false(h.is_hex_run("12345678")) + assert.is_false(h.is_hex_run("abcdefab")) + end) + it("diff_units returns [] when either side is empty", function() assert.are.same({}, h.diff_units({}, { "a" })) assert.are.same({}, h.diff_units({ "a" }, {})) end) - it("INTRALINE_MAX_HUNKS is a positive integer", function() - assert.is_true(h.INTRALINE_MAX_HUNKS >= 1) - end) - it("refinement_safe admits single-hunk sub-diffs regardless of overlap", function() -- One hunk cannot interleave, so even unrelated words (foo/bar) are safe. assert.is_true(h.refinement_safe({ "f", "o", "o" }, { "b", "a", "r" }, 1)) @@ -895,6 +970,36 @@ describe("diffview.scene.inline_diff", function() local vls = virt_line_hls(extmarks(bufnr)) assert.are.same({ "DiffviewDiffDeleteInline" }, vls) end) + + it("renders a JSON commit-hash modification as one whole-token swap", function() + -- A pair of lazy-lock.json-style lines differing only in a 40-char + -- SHA. Without `coalesce_hex_runs`, the two unrelated hashes + -- subword-split into ~17-19 alternating digit/lowercase tokens + -- whose coincidental matches kept some pairs under + -- `INTRALINE_MAX_HUNKS` and rendered as fragmented per-token + -- highlights; after coalescing, the diff is a single 1:1 hash + -- replacement covering exactly the new hash's byte range. + local old_line = ' "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }' + local new_line = ' "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }' + local bufnr = fresh_buf({ new_line }) + inline_diff.render(bufnr, { old_line }, { new_line }) + local cr = char_ranges(extmarks(bufnr)) + assert.are.equal(1, #cr) + local hash_start = string.find(new_line, "cf4c3089", 1, true) - 1 + assert.are.equal(0, cr[1].row) + assert.are.equal(hash_start, cr[1].start) + assert.are.equal(hash_start + 40, cr[1].finish) + end) + + it("emits the entire deleted hash as one inline virt_text in overleaf", function() + local old_line = ' "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }' + local new_line = ' "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }' + local bufnr = fresh_buf({ new_line }) + inline_diff.render(bufnr, { old_line }, { new_line }, { style = "overleaf" }) + local virts = inline_virt_texts(extmarks(bufnr), "DiffviewDiffDeleteInline") + assert.are.equal(1, #virts) + assert.are.equal("ffa44ee9470743a7697d28df3a1a216fdfe2b09d", virts[1].text) + end) end) describe("intraline tokenization", function()