diff --git a/src/offload/storage.test.ts b/src/offload/storage.test.ts
new file mode 100644
index 0000000..c22eded
--- /dev/null
+++ b/src/offload/storage.test.ts
@@ -0,0 +1,46 @@
+import { describe, expect, it } from "vitest";
+
+import { sanitizeText } from "./storage.js";
+
+describe("sanitizeText", () => {
+  it("preserves plain ASCII", () => {
+    expect(sanitizeText("hello world")).toBe("hello world");
+  });
+
+  it("preserves emoji and other non-BMP code points", () => {
+    // 🎉 = U+1F389, 𠮷 = U+20BB7 (CJK Extension B), 𝐀 = U+1D400 (math bold A).
+    // Each is a surrogate pair in UTF-16. Without the `u` flag, the
+    // [\uD800-\uDFFF] range in UNSAFE_CHAR_RE would strip each half
+    // independently and silently destroy these characters.
+    expect(sanitizeText("emoji \u{1F389} here")).toBe("emoji \u{1F389} here");
+    expect(sanitizeText("CJK ext-B \u{20BB7} here")).toBe(
+      "CJK ext-B \u{20BB7} here",
+    );
+    expect(sanitizeText("math bold \u{1D400} here")).toBe(
+      "math bold \u{1D400} here",
+    );
+  });
+
+  it("strips lone (malformed) surrogates", () => {
+    // Expect only the surrogate to be removed; both neighbouring spaces stay.
+    expect(sanitizeText("lone \uD800 surrogate")).toBe("lone  surrogate");
+    expect(sanitizeText("lone \uDC00 surrogate")).toBe("lone  surrogate");
+  });
+
+  it("strips C0 and C1 control characters", () => {
+    // Escapes keep the invisible inputs intact across editors and encodings.
+    expect(sanitizeText("ctrl\u0001here")).toBe("ctrlhere");
+    expect(sanitizeText("c1\u0085here")).toBe("c1here");
+  });
+
+  it("strips zero-width characters and BOM", () => {
+    // U+200B ZERO WIDTH SPACE and U+FEFF BOM, written as escapes so they stay visible.
+    expect(sanitizeText("a\u200Bb")).toBe("ab");
+    expect(sanitizeText("a\uFEFFb")).toBe("ab");
+  });
+
+  it("returns non-string input unchanged", () => {
+    // Matches the existing typeof guard in sanitizeText.
+    expect(sanitizeText(42 as unknown as string)).toBe(42);
+  });
+});
diff --git a/src/offload/storage.ts b/src/offload/storage.ts
index a5cb182..222a66b 100644
--- a/src/offload/storage.ts
+++ b/src/offload/storage.ts
@@ -162,7 +162,7 @@ export async function listRegisteredSessions(
 // ─── JSONL Defense Layer ─────────────────────────────────────────────────────
 
 const UNSAFE_CHAR_RE =
-  /[\uFFFD\u0000-\u0008\u000B\u000C\u000E-\u001F\u0080-\u009F\uD800-\uDFFF\u200B-\u200F\u2028\u2029\uFEFF]/g;
+  /[\uFFFD\u0000-\u0008\u000B\u000C\u000E-\u001F\u0080-\u009F\uD800-\uDFFF\u200B-\u200F\u2028\u2029\uFEFF]/gu;
 
 /** Layer 0 — Source text sanitize. Strips unsafe characters from arbitrary text. */
 export function sanitizeText(text: string): string {