From bf58853343d212fe472ea2b8794a9b44af15ffb4 Mon Sep 17 00:00:00 2001 From: Akhilesh Arora Date: Sat, 16 May 2026 22:23:45 +0200 Subject: [PATCH] fix(offload): preserve non-BMP characters in sanitizeText UNSAFE_CHAR_RE included the surrogate range [\uD800-\uDFFF] without the `u` flag, so JS treated strings as UTF-16 code units and stripped each half of every well-formed non-BMP code point. sanitizeText and sanitizeJsonLine therefore destroyed emoji, CJK Extension B, math bold, etc. in tool params, tool results, and ref-md archives. Adding the `u` flag makes paired surrogates combine into a single code point before matching, so the [\uD800-\uDFFF] entry now matches only lone (malformed) surrogates, which is the original intent. All other entries (replacement char, C0/C1 controls, zero-width chars, line separators, BOM) keep their behavior. Added a vitest suite covering the preserved and stripped cases. Closes #30 --- src/offload/storage.test.ts | 43 +++++++++++++++++++++++++++++++++++++ src/offload/storage.ts | 2 +- 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 src/offload/storage.test.ts diff --git a/src/offload/storage.test.ts b/src/offload/storage.test.ts new file mode 100644 index 0000000..c22eded --- /dev/null +++ b/src/offload/storage.test.ts @@ -0,0 +1,43 @@ +import { describe, expect, it } from "vitest"; + +import { sanitizeText } from "./storage.js"; + +describe("sanitizeText", () => { + it("preserves plain ASCII", () => { + expect(sanitizeText("hello world")).toBe("hello world"); + }); + + it("preserves emoji and other non-BMP code points", () => { + // 🎉 = U+1F389, 𠮷 = U+20BB7 (CJK Extension B), 𝐀 = U+1D400 (math bold A). + // Each is a surrogate pair in UTF-16. Without the `u` flag, the + // [\uD800-\uDFFF] range in UNSAFE_CHAR_RE would strip each half + // independently and silently destroy these characters. 
+ expect(sanitizeText("emoji \u{1F389} here")).toBe("emoji \u{1F389} here"); + expect(sanitizeText("CJK ext-B \u{20BB7} here")).toBe( + "CJK ext-B \u{20BB7} here", + ); + expect(sanitizeText("math bold \u{1D400} here")).toBe( + "math bold \u{1D400} here", + ); + }); + + it("strips lone (malformed) surrogates", () => { + expect(sanitizeText("lone \uD800 surrogate")).toBe("lone surrogate"); + expect(sanitizeText("lone \uDC00 surrogate")).toBe("lone surrogate"); + }); + + it("strips C0 and C1 control characters", () => { + expect(sanitizeText("ctrlhere")).toBe("ctrlhere"); + expect(sanitizeText("c1ย…here")).toBe("c1here"); + }); + + it("strips zero-width characters and BOM", () => { + expect(sanitizeText("aโ€‹b")).toBe("ab"); + expect(sanitizeText("a๏ปฟb")).toBe("ab"); + }); + + it("returns non-string input unchanged", () => { + // Matches the existing typeof guard in sanitizeText. + expect(sanitizeText(42 as unknown as string)).toBe(42); + }); +}); diff --git a/src/offload/storage.ts b/src/offload/storage.ts index a5cb182..222a66b 100644 --- a/src/offload/storage.ts +++ b/src/offload/storage.ts @@ -162,7 +162,7 @@ export async function listRegisteredSessions( // โ”€โ”€โ”€ JSONL Defense Layer โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€ const UNSAFE_CHAR_RE = - /[\uFFFD\u0000-\u0008\u000B\u000C\u000E-\u001F\u0080-\u009F\uD800-\uDFFF\u200B-\u200F\u2028\u2029\uFEFF]/g; + /[\uFFFD\u0000-\u0008\u000B\u000C\u000E-\u001F\u0080-\u009F\uD800-\uDFFF\u200B-\u200F\u2028\u2029\uFEFF]/gu; /** Layer 0 โ€” Source text sanitize. Strips unsafe characters from arbitrary text. */ export function sanitizeText(text: string): string {