From dac0ddfac32004cd24bc411605b86b37e13288f6 Mon Sep 17 00:00:00 2001
From: David Yonge-Mallo <dlyongemallo@users.noreply.github.com>
Date: Wed, 6 May 2026 17:39:54 +0200
Subject: [PATCH] fix(inline): coalesce subword splits inside long hex literals

---
 lua/diffview/scene/inline_diff.lua            | 216 ++++++++++++++----
 .../tests/functional/inline_diff_spec.lua     | 113 ++++++++-
 2 files changed, 282 insertions(+), 47 deletions(-)

diff --git a/lua/diffview/scene/inline_diff.lua b/lua/diffview/scene/inline_diff.lua
index 04d24fd5..4373a8cf 100644
--- a/lua/diffview/scene/inline_diff.lua
+++ b/lua/diffview/scene/inline_diff.lua
@@ -156,23 +156,13 @@ local function is_word_token(s)
 end
 
 -- Classify a UTF-8 character into a subword class for `tokenize`:
--- "lower" (ASCII a-z, plus any non-ASCII byte — see below), "upper"
--- (ASCII A-Z), "digit" (ASCII 0-9), "under" (`_`), or `nil` for ASCII
--- non-word characters.
---
--- ASCII-only classification by design: every multi-byte character and
--- every stray high byte from malformed UTF-8 falls into "lower". This
--- keeps multi-byte runs joined with adjacent ASCII lowercase rather
--- than splitting per-character, and avoids carrying a Unicode category
--- table in pure Lua. Known trade-offs that follow from this:
---   * Accented uppercase (`Ü`, `É`) does not trigger a lower→upper
---     split, so `fooÜber` stays one token.
---   * Unicode punctuation/symbols (`—`, emoji) are not treated as
---     non-word and stay inside word tokens rather than splitting them.
--- ASCII camelCase/PascalCase/snake_case/digit splits — the common case
--- in source code — work as expected. Stray high bytes share the
--- "lower" bucket so classification stays consistent with
--- `is_word_byte`, which treats them as word bytes.
+-- "lower" (ASCII a-z), "upper" (ASCII A-Z), "digit" (ASCII 0-9),
+-- "under" (`_`), or `nil` for ASCII non-word characters. Multi-byte
+-- characters and stray high bytes from malformed UTF-8 bucket as
+-- "lower" so non-Latin runs join adjacent ASCII lowercase. Known
+-- consequences: accented uppercase (`Ü`, `É`) doesn't trigger a
+-- lower→upper split, and Unicode punctuation/symbols stay inside word
+-- tokens rather than splitting them.
 ---@param ch string
 ---@return "lower"|"upper"|"digit"|"under"|nil
 local function subword_class(ch)
@@ -201,33 +191,172 @@ local function subword_class(ch)
   return nil
 end
 
--- Tokenize `s` for subword-level intraline diffing. Splits both on
--- ASCII non-word characters AND on subword boundaries within word-char
--- runs: camelCase (`fooBar` → `foo`, `Bar`), acronym→word
--- (`XMLParser` → `XML`, `Parser`), digit↔letter (`error123abc` →
--- `error`, `123`, `abc`), and underscore (`audio_preservation` →
--- `audio`, `_`, `preservation`). Each ASCII non-word character becomes
--- its own token; each subword run becomes one token. Multi-byte chars
--- bucket as lowercase letters; see `subword_class` for the trade-offs.
--- Returns the token list and a parallel byte-range map.
+-- Minimum length for `coalesce_hex_runs` to consider folding subword
+-- splits back into a single hex-literal token. Sized to catch full
+-- SHAs and longer abbreviations without lumping in short identifier
+-- suffixes; 7-char abbreviations stay subword-split.
+local HEX_TOKEN_MIN_LEN = 8
+
+-- Upper bound on the longest contiguous letter run permitted in a
+-- hex-literal candidate. Random hex averages a max letter run of ~3;
+-- pseudo-hex words like `cafe`, `face`, `dead`, `cafef00d`, `decade`
+-- exceed the threshold. Hashes that roll a 5+ letter streak skip the
+-- coalesce and fall back to the existing `INTRALINE_MAX_HUNKS` gate.
+local HEX_TOKEN_MAX_LETTER_RUN = 4
+
+-- True when the concatenation of `tokens[i..j]` (with combined byte
+-- length `total_len`) looks like a hash or numeric literal rather
+-- than a coincidentally-all-hex identifier. Combines four cheap
+-- signals in one O(n) pass over the token slice's bytes:
+--   * length >= `HEX_TOKEN_MIN_LEN`,
+--   * single-case hex (every letter in `[a-f]` or every letter in
+--     `[A-F]`),
+--   * digit/letter transitions >= `ceil(length / 4)` (rejects
+--     word-prefixed candidates like `decade1234567`),
+--   * max contiguous letter run <= `HEX_TOKEN_MAX_LETTER_RUN`
+--     (rejects dictionary-shaped runs like `cafef00d1234`).
+-- Walking the slice directly lets `coalesce_hex_runs` decide before
+-- allocating a merged string, so a non-hex word run (e.g. most
+-- camelCase identifiers) bails on its first non-[0-9A-Fa-f] byte.
+---@param tokens string[]
+---@param i integer
+---@param j integer
+---@param total_len integer
+---@return boolean
+local function is_hex_run_in_tokens(tokens, i, j, total_len)
+  if total_len < HEX_TOKEN_MIN_LEN then
+    return false
+  end
+  local has_lo, has_up = false, false
+  local prev_class = 0 -- 0 = none, 1 = digit, 2 = letter.
+  local transitions = 0
+  local cur_letter_run, max_letter_run = 0, 0
+  for k = i, j do
+    local t = tokens[k]
+    for p = 1, #t do
+      local b = t:byte(p)
+      local class
+      if b >= 0x30 and b <= 0x39 then
+        class = 1
+        cur_letter_run = 0
+      elseif b >= 0x61 and b <= 0x66 then
+        class = 2
+        has_lo = true
+        cur_letter_run = cur_letter_run + 1
+        if cur_letter_run > max_letter_run then
+          max_letter_run = cur_letter_run
+        end
+      elseif b >= 0x41 and b <= 0x46 then
+        class = 2
+        has_up = true
+        cur_letter_run = cur_letter_run + 1
+        if cur_letter_run > max_letter_run then
+          max_letter_run = cur_letter_run
+        end
+      else
+        return false
+      end
+      if prev_class ~= 0 and prev_class ~= class then
+        transitions = transitions + 1
+      end
+      prev_class = class
+    end
+  end
+  if has_lo and has_up then
+    return false
+  end
+  if max_letter_run > HEX_TOKEN_MAX_LETTER_RUN then
+    return false
+  end
+  if transitions < math.ceil(total_len / 4) then
+    return false
+  end
+  return true
+end
+
+-- Thin string wrapper around `is_hex_run_in_tokens`. The production
+-- path (`coalesce_hex_runs`) drives the token-slice variant directly;
+-- this wrapper exists so the predicate stays exercisable from tests
+-- with a single string input.
+---@param s string
+---@return boolean
+local function is_hex_run(s)
+  return is_hex_run_in_tokens({ s }, 1, 1, #s)
+end
+
+-- Walk a fresh `tokenize` token list and merge runs of byte-adjacent
+-- word tokens whose joined string passes `is_hex_run_in_tokens`. A
+-- 40-char hash that would otherwise fragment into ~25 alternating
+-- digit/letter subwords folds back into one token, so a 1:1 hash
+-- replacement renders as a whole-token swap instead of slipping under
+-- `INTRALINE_MAX_HUNKS` from coincidental subword matches between two
+-- unrelated hashes.
 --
--- Per-character tokenization was tried first, but `vim.diff --minimal`
--- matches coincidental letters between dissimilar lines (e.g.
--- "something." and "any tracked metric." share 't', 'e', 'm', 'i', '.')
--- and splits the diff into small hunks. Rendered in overleaf style those
--- fragments interleave deleted and inserted text into unreadable
--- character-level noise.
+-- The walk is bounded by byte adjacency so it can't span a non-word
+-- char: `"v1.0-1c9dfb..."` keeps `v`, dots, and dashes as natural
+-- boundaries. Within one source word run the walk is all-or-nothing.
+---@param tokens string[]
+---@param byte_map { byte: integer, byte_len: integer }[]
+---@return string[]
+---@return { byte: integer, byte_len: integer }[]
+local function coalesce_hex_runs(tokens, byte_map)
+  local out, out_map = {}, {}
+  local i = 1
+  while i <= #tokens do
+    if not is_word_token(tokens[i]) then
+      out[#out + 1] = tokens[i]
+      out_map[#out_map + 1] = byte_map[i]
+      i = i + 1
+    else
+      local j = i
+      while
+        j + 1 <= #tokens
+        and is_word_token(tokens[j + 1])
+        and byte_map[j + 1].byte == byte_map[j].byte + byte_map[j].byte_len
+      do
+        j = j + 1
+      end
+
+      if j > i then
+        local merged_len = byte_map[j].byte + byte_map[j].byte_len - byte_map[i].byte
+        if is_hex_run_in_tokens(tokens, i, j, merged_len) then
+          out[#out + 1] = table.concat(tokens, "", i, j)
+          out_map[#out_map + 1] = {
+            byte = byte_map[i].byte,
+            byte_len = merged_len,
+          }
+        else
+          for k = i, j do
+            out[#out + 1] = tokens[k]
+            out_map[#out_map + 1] = byte_map[k]
+          end
+        end
+      else
+        out[#out + 1] = tokens[i]
+        out_map[#out_map + 1] = byte_map[i]
+      end
+      i = j + 1
+    end
+  end
+  return out, out_map
+end
+
+-- Tokenize `s` for intraline diffing. Splits at ASCII non-word
+-- characters and at subword boundaries within word-char runs:
+-- camelCase (`fooBar` → `foo`, `Bar`), acronym→word (`XMLParser` →
+-- `XML`, `Parser`), digit↔letter (`error123abc` → `error`, `123`,
+-- `abc`), and underscore (`audio_preservation` → `audio`, `_`,
+-- `preservation`). Each non-word char becomes its own token; each
+-- subword run becomes one token. Multi-byte chars bucket as
+-- lowercase; see `subword_class`. Returns the token list and a
+-- parallel byte-range map.
 --
--- Whole-identifier word tokens fixed that but introduced a second
--- failure: a 1:1 token replacement between unrelated identifiers sharing
--- only a structural prefix (`EventStateOpen` / `EventStateClose`)
--- admitted char-level refinement on the prefix-overlap gate, then
--- `vim.diff` latched onto a coincidental letter inside the differing
--- tails (`Open` / `Close` share an `e`) and produced interleaved
--- per-char noise. Subword tokens close the gap: the divergent subword
--- now pairs directly with its replacement, the resulting char-level
--- diff has no shared prefix/suffix to admit refinement, and the hunk
--- renders as a clean whole-token replacement.
+-- Subword granularity keeps `vim.diff --minimal` from latching onto
+-- coincidental letters in dissimilar lines or in the divergent tails
+-- of structurally similar identifiers, both of which would render as
+-- per-char noise in overleaf style. A post-pass (`coalesce_hex_runs`)
+-- folds the splits back into one token for long hash-like hex runs
+-- so git SHAs and similar literals stay atomic.
 ---@param s string
 ---@return string[] tokens
 ---@return { byte: integer, byte_len: integer }[] byte_map
@@ -298,7 +427,7 @@ local function tokenize(s)
   end
   flush()
 
-  return tokens, byte_map
+  return coalesce_hex_runs(tokens, byte_map)
 end
 
 -- Decompose `s` into UTF-8 characters with byte offsets. Used to refine a
@@ -1713,6 +1842,7 @@ M._test = {
   is_word_char = is_word_char,
   is_word_token = is_word_token,
   subword_class = subword_class,
+  is_hex_run = is_hex_run,
   tokenize = tokenize,
   split_chars = split_chars,
   diff_units = diff_units,
diff --git a/lua/diffview/tests/functional/inline_diff_spec.lua b/lua/diffview/tests/functional/inline_diff_spec.lua
index 84d79d4a..4c671b68 100644
--- a/lua/diffview/tests/functional/inline_diff_spec.lua
+++ b/lua/diffview/tests/functional/inline_diff_spec.lua
@@ -771,6 +771,45 @@ describe("diffview.scene.inline_diff", function()
       )
     end)
 
+    it("tokenize coalesces a long single-case hex hash into one token", function()
+      -- 40-char SHA-1, all-lowercase hex with mixed digit/letter runs:
+      -- without the post-pass the run subword-splits into ~25 tokens,
+      -- which fragment the diff against an unrelated hash and admit
+      -- coincidental subword matches under `INTRALINE_MAX_HUNKS`.
+      local s = "1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8"
+      local tokens, m = h.tokenize(s)
+      assert.are.same({ s }, tokens)
+      assert.are.equal(0, m[1].byte)
+      assert.are.equal(40, m[1].byte_len)
+    end)
+
+    it("tokenize coalesces uppercase hex with alternating digits too", function()
+      -- All-uppercase run with frequent digit↔letter alternation passes
+      -- the same predicate.
+      assert.are.same({ "AB12CD34EF56AB78" }, (h.tokenize("AB12CD34EF56AB78")))
+    end)
+
+    it("tokenize leaves short hex-like runs subword-split", function()
+      -- 7 chars, below `HEX_TOKEN_MIN_LEN`: stays subword-split so a
+      -- short identifier suffix isn't collapsed into one opaque token.
+      assert.are.same({ "abc", "1234" }, (h.tokenize("abc1234")))
+    end)
+
+    it("tokenize rejects pseudo-hex with low transition density", function()
+      -- 1-2 transitions across 12-13 chars sit below `ceil(len/4)`, so
+      -- word-prefixed patterns keep their split and `decade` /
+      -- `1234567` stay diffable as semantic units.
+      assert.are.same({ "decade", "1234567" }, (h.tokenize("decade1234567")))
+      assert.are.same({ "cafebabe", "1234" }, (h.tokenize("cafebabe1234")))
+      assert.are.same({ "face", "1234", "abcd" }, (h.tokenize("face1234abcd")))
+    end)
+
+    it("tokenize rejects pseudo-hex with a long letter run", function()
+      -- 3 transitions hits `ceil(12/4)=3`, but `cafef` letter run = 5
+      -- exceeds `HEX_TOKEN_MAX_LETTER_RUN`, so the predicate rejects.
+      assert.are.same({ "cafef", "00", "d", "1234" }, (h.tokenize("cafef00d1234")))
+    end)
+
     it("subword_class buckets ASCII and multi-byte chars correctly", function()
       assert.are.equal("lower", h.subword_class("a"))
       assert.are.equal("upper", h.subword_class("Z"))
@@ -795,15 +834,51 @@ describe("diffview.scene.inline_diff", function()
       assert.is_false(h.is_word_token(""))
     end)
 
+    it("is_hex_run accepts long single-case hex with high transition density", function()
+      assert.is_true(h.is_hex_run("1c9dfb261b8be35f689c6d83dfd3e92b7f59ecf8"))
+      assert.is_true(h.is_hex_run("AB12CD34EF56AB78"))
+    end)
+
+    it("is_hex_run rejects strings shorter than HEX_TOKEN_MIN_LEN", function()
+      assert.is_false(h.is_hex_run("abc1234"))
+      assert.is_false(h.is_hex_run(""))
+    end)
+
+    it("is_hex_run rejects mixed-case hex", function()
+      -- `tokenize`'s upper→lower split already separates the cases, so a
+      -- mixed-case merge candidate never reaches `is_hex_run` in practice;
+      -- the predicate rejects defensively for correctness.
+      assert.is_false(h.is_hex_run("AbCd1234"))
+    end)
+
+    it("is_hex_run rejects strings with non-hex chars", function()
+      assert.is_false(h.is_hex_run("hello123abc"))
+      assert.is_false(h.is_hex_run("1234567890_"))
+    end)
+
+    it("is_hex_run rejects low transition density", function()
+      assert.is_false(h.is_hex_run("decade1234567"))
+      assert.is_false(h.is_hex_run("cafebabe1234"))
+    end)
+
+    it("is_hex_run rejects long letter runs", function()
+      -- `cafef` = 5-char letter run, exceeds `HEX_TOKEN_MAX_LETTER_RUN`.
+      assert.is_false(h.is_hex_run("cafef00d1234"))
+    end)
+
+    it("is_hex_run rejects pure-digit and pure-letter runs", function()
+      -- These can't reach the predicate from `coalesce_hex_runs` (their
+      -- source word run is one subword token), but rejecting them on
+      -- their own merits keeps the contract sharp: zero transitions.
+      assert.is_false(h.is_hex_run("12345678"))
+      assert.is_false(h.is_hex_run("abcdefab"))
+    end)
+
     it("diff_units returns [] when either side is empty", function()
       assert.are.same({}, h.diff_units({}, { "a" }))
       assert.are.same({}, h.diff_units({ "a" }, {}))
     end)
 
-    it("INTRALINE_MAX_HUNKS is a positive integer", function()
-      assert.is_true(h.INTRALINE_MAX_HUNKS >= 1)
-    end)
-
     it("refinement_safe admits single-hunk sub-diffs regardless of overlap", function()
       -- One hunk cannot interleave, so even unrelated words (foo/bar) are safe.
       assert.is_true(h.refinement_safe({ "f", "o", "o" }, { "b", "a", "r" }, 1))
@@ -895,6 +970,36 @@ describe("diffview.scene.inline_diff", function()
       local vls = virt_line_hls(extmarks(bufnr))
       assert.are.same({ "DiffviewDiffDeleteInline" }, vls)
     end)
+
+    it("renders a JSON commit-hash modification as one whole-token swap", function()
+      -- A pair of lazy-lock.json-style lines differing only in a 40-char
+      -- SHA. Without `coalesce_hex_runs`, the two unrelated hashes
+      -- subword-split into ~17-19 alternating digit/lowercase tokens
+      -- whose coincidental matches kept some pairs under
+      -- `INTRALINE_MAX_HUNKS` and rendered as fragmented per-token
+      -- highlights; after coalescing, the diff is a single 1:1 hash
+      -- replacement covering exactly the new hash's byte range.
+      local old_line = '  "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }'
+      local new_line = '  "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }'
+      local bufnr = fresh_buf({ new_line })
+      inline_diff.render(bufnr, { old_line }, { new_line })
+      local cr = char_ranges(extmarks(bufnr))
+      assert.are.equal(1, #cr)
+      local hash_start = string.find(new_line, "cf4c3089", 1, true) - 1
+      assert.are.equal(0, cr[1].row)
+      assert.are.equal(hash_start, cr[1].start)
+      assert.are.equal(hash_start + 40, cr[1].finish)
+    end)
+
+    it("emits the entire deleted hash as one inline virt_text in overleaf", function()
+      local old_line = '  "x": { "commit": "ffa44ee9470743a7697d28df3a1a216fdfe2b09d" }'
+      local new_line = '  "x": { "commit": "cf4c30892644f01ebfb1e248eeca9e259856f9dc" }'
+      local bufnr = fresh_buf({ new_line })
+      inline_diff.render(bufnr, { old_line }, { new_line }, { style = "overleaf" })
+      local virts = inline_virt_texts(extmarks(bufnr), "DiffviewDiffDeleteInline")
+      assert.are.equal(1, #virts)
+      assert.are.equal("ffa44ee9470743a7697d28df3a1a216fdfe2b09d", virts[1].text)
+    end)
   end)
 
   describe("intraline tokenization", function()