diff --git a/fixtures/forms/pdfjs/annotation-choice-widget.pdf b/fixtures/forms/pdfjs/annotation-choice-widget.pdf new file mode 100644 index 0000000..b557d9a Binary files /dev/null and b/fixtures/forms/pdfjs/annotation-choice-widget.pdf differ diff --git a/fixtures/forms/pdfjs/bug1669099.pdf b/fixtures/forms/pdfjs/bug1669099.pdf new file mode 100644 index 0000000..6c4554b Binary files /dev/null and b/fixtures/forms/pdfjs/bug1669099.pdf differ diff --git a/fixtures/forms/pdfjs/issue15053.pdf b/fixtures/forms/pdfjs/issue15053.pdf new file mode 100755 index 0000000..a0a6f75 Binary files /dev/null and b/fixtures/forms/pdfjs/issue15053.pdf differ diff --git a/fixtures/issues/form-filling/FINTRAC.pdf b/fixtures/issues/form-filling/FINTRAC.pdf new file mode 100644 index 0000000..af584de Binary files /dev/null and b/fixtures/issues/form-filling/FINTRAC.pdf differ diff --git a/src/document/form-integration.test.ts b/src/document/form-integration.test.ts index f5b557c..24d504c 100644 --- a/src/document/form-integration.test.ts +++ b/src/document/form-integration.test.ts @@ -7,6 +7,7 @@ */ import { PDF } from "#src/api/pdf"; +import { DropdownField } from "#src/document/forms/fields/choice-fields"; import { loadFixture, saveTestOutput } from "#src/test-utils"; import { describe, expect, it } from "vitest"; @@ -844,6 +845,147 @@ describe("Form Integration: Edge Cases", () => { expect(savedBytes.length).toBeGreaterThan(0); }); + it("reuses valid registered existing fonts for appearance generation", async () => { + const pdfBytes = await loadFixture("forms", "with_combed_fields.pdf"); + const pdf = await PDF.load(pdfBytes); + const form = pdf.getForm(); + expect(form).not.toBeNull(); + + const targetField = form! + .getTextFields() + .find(field => field.alternateName === "6. Certification. Name."); + + expect(targetField).toBeDefined(); + + targetField!.setValue("Jane Doe"); + form!.updateAppearances(); + + const savedBytes = await pdf.save(); + const pdf2 = await PDF.load(savedBytes); + const field2 = pdf2 + .getForm()! + .getTextFields() + .find(field => field.alternateName === "6. Certification. Name."); + + expect(field2).toBeDefined(); + expect(field2!.getValue()).toBe("Jane Doe"); + + const appearance = field2!.getWidgets()[0].getNormalAppearance(); + expect(appearance).not.toBeNull(); + + const streamContent = new TextDecoder().decode(appearance!.getDecodedData()); + + expect(streamContent).toContain("/HeBo"); + expect(streamContent).not.toContain("/Helv"); + expect(streamContent).toContain("(Jane Doe) Tj"); + }); + + it("skips unusable field fonts and reuses a later registered font", async () => { + const pdfBytes = await loadFixture("forms", "pdfjs/bug1669099.pdf"); + const pdf = await PDF.load(pdfBytes); + const form = pdf.getForm(); + expect(form).not.toBeNull(); + + const targetField = form!.getTextField("_e"); + expect(targetField).not.toBeNull(); + + targetField!.setValue("Visible text"); + form!.updateAppearances(); + + const savedBytes = await pdf.save(); + const pdf2 = await PDF.load(savedBytes); + const field2 = pdf2.getForm()!.getTextField("_e"); + expect(field2).not.toBeNull(); + expect(field2!.getValue()).toBe("Visible text"); + + const appearance = field2!.getWidgets()[0].getNormalAppearance(); + expect(appearance).not.toBeNull(); + + const streamContent = new TextDecoder().decode(appearance!.getDecodedData()); + + // Original /DA uses an unusable anonymous font name ("/"). We should + // skip it and reuse the later registered OpenSans font instead. + expect(streamContent).toContain("/Fo2"); + expect(streamContent).not.toContain("/Helv"); + expect(streamContent).not.toContain("\n/ 12.00000 Tf"); + expect(streamContent).toContain("Visible text"); + }); + + it("reuses existing fonts for choice field appearances", async () => { + const pdfBytes = await loadFixture("forms", "pdfjs/annotation-choice-widget.pdf"); + const pdf = await PDF.load(pdfBytes); + const form = pdf.getForm(); + expect(form).not.toBeNull(); + + const dropdown = form! + .acroForm() + .getFields() + .find(field => field.type === "dropdown" && field.alternateName === "Combo box"); + + expect(dropdown).toBeInstanceOf(DropdownField); + + const comboBox = dropdown as DropdownField; + + comboBox.setValue("Amet"); + form!.updateAppearances(); + + const savedBytes = await pdf.save(); + const pdf2 = await PDF.load(savedBytes); + const dropdown2 = pdf2 + .getForm()! + .acroForm() + .getFields() + .find(field => field.type === "dropdown" && field.alternateName === "Combo box"); + + expect(dropdown2).toBeInstanceOf(DropdownField); + + const appearance = (dropdown2 as DropdownField).getWidgets()[0].getNormalAppearance(); + expect(appearance).not.toBeNull(); + + const streamContent = new TextDecoder().decode(appearance!.getDecodedData()); + + expect(streamContent).toContain("/MyriadPro-Regular"); + expect(streamContent).not.toContain("/Helv"); + expect(streamContent).toContain("(Amet) Tj"); + }); + + it("reuses valid Type0 fonts for button captions", async () => { + const pdfBytes = await loadFixture("forms", "pdfjs/issue15053.pdf"); + const pdf = await PDF.load(pdfBytes); + const form = pdf.getForm(); + expect(form).not.toBeNull(); + + const buttonField = form! + .acroForm() + .getFields() + .find(field => field.type === "button" && field.name === "Button2"); + + expect(buttonField).toBeDefined(); + + buttonField!.setFont(form!.acroForm().getExistingFont("/KozMinPr6N-Regular")!); + buttonField!.needsAppearanceUpdate = true; + form!.updateAppearances(); + + const savedBytes = await pdf.save(); + const pdf2 = await PDF.load(savedBytes); + const buttonField2 = pdf2 + .getForm()! + .acroForm() + .getFields() + .find(field => field.type === "button" && field.name === "Button2"); + + expect(buttonField2).toBeDefined(); + + const appearance = buttonField2!.getWidgets()[0].getNormalAppearance(); + expect(appearance).not.toBeNull(); + + const streamContent = new TextDecoder().decode(appearance!.getDecodedData()); + + expect(streamContent).toContain("/KozMinPr6N-Regular"); + expect(streamContent).not.toContain("/Helv"); + expect(streamContent).toContain("<0042007500740074006F006E0031> Tj"); + }); + it("handles multiline text fields", async () => { const pdfBytes = await loadFixture("forms", "sample_form.pdf"); const pdf = await PDF.load(pdfBytes); @@ -1140,3 +1282,131 @@ describe("Form Integration: Stress Test", () => { } }); }); + +// ───────────────────────────────────────────────────────────────────────────── +// CID Font Form Filling (FINTRAC) +// ───────────────────────────────────────────────────────────────────────────── + +describe("Form Integration: CID Font PDFs", () => { + it("fills FINTRAC form without black rectangles or tofu", async () => { + // This PDF uses a CID font (Type0/Identity-H) for its form fields. + // Previously, filling caused: + // 1. Black rectangles (text color misidentified as background fill) + // 2. Tofu characters (CID font used for single-byte text encoding) + const pdfBytes = await loadFixture("issues", "form-filling/FINTRAC.pdf"); + const pdf = await PDF.load(pdfBytes); + + const form = pdf.getForm(); + expect(form).not.toBeNull(); + + // Fill text fields + const result = form!.fill({ + transaction: "123 main st", + realtor: "No one", + date: "2026-02-02", + full_name: "John Doe", + client_address: "123 Any Street, Toronto, ON, M0M 0M0", + date_of_birth: "1968-09-05", + nature_of_business: "asd", + id_number: "D6101-40706-60905", + issuing_authority: "Ontario", + issuing_country: "Canada", + expiry_date: "2012-11-26", + // Checkboxes + driverslicense_button: true, + passport_button: false, + third_party_no_button: true, + question_1_yes: true, + question_2_no: true, + question_3_no: true, + question_4_no: true, + question_5_yes: true, + relationship_nature_residential: true, + }); + + expect(result.filled.length).toBeGreaterThan(0); + + // Save and reload + const savedBytes = await pdf.save(); + const outputPath = await saveTestOutput("forms/fintrac-filled.pdf", savedBytes); + console.log(` -> Filled output: ${outputPath}`); + + expect(savedBytes.length).toBeGreaterThan(0); + + // Verify text field values round-trip correctly + const pdf2 = await PDF.load(savedBytes); + const form2 = pdf2.getForm()!; + + expect(form2.getTextField("full_name")?.getValue()).toBe("John Doe"); + expect(form2.getTextField("transaction")?.getValue()).toBe("123 main st"); + expect(form2.getTextField("date")?.getValue()).toBe("2026-02-02"); + + // Verify appearance streams don't contain background fill operations + // (the bug was: text color 0.266667 g was drawn as a filled rectangle) + const fullNameField = form2.getTextField("full_name")!; + const widgets = fullNameField.getWidgets(); + const appearance = widgets[0].getNormalAppearance(); + expect(appearance).not.toBeNull(); + + const streamContent = new TextDecoder().decode(appearance!.getDecodedData()); + + // The appearance should contain the text + expect(streamContent).toContain("Tj"); + // The appearance should NOT have a filled background rectangle + // (a "re f" before BT would indicate a background fill) + const preBT = streamContent.slice(0, streamContent.indexOf("BT")); + expect(preBT).not.toMatch(/re\s*\n?\s*f/); + + // The FINTRAC PDF's CID font has stripped glyph outlines (no renderable + // data). The appearance generator should fall back to Helvetica. + expect(streamContent).toContain("/Helv"); + // Text should be encoded as a regular PDF string (not hex for CID) + expect(streamContent).toContain("John Doe"); + }); + + it("flattens FINTRAC form correctly", async () => { + const pdfBytes = await loadFixture("issues", "form-filling/FINTRAC.pdf"); + const pdf = await PDF.load(pdfBytes); + + const form = pdf.getForm()!; + + form.fill({ + transaction: "123 main st", + realtor: "No one", + date: "2026-02-02", + full_name: "John Doe", + client_address: "123 Any Street, Toronto, ON, M0M 0M0", + date_of_birth: "1968-09-05", + nature_of_business: "asd", + id_number: "D6101-40706-60905", + issuing_authority: "Ontario", + issuing_country: "Canada", + expiry_date: "2012-11-26", + // Checkboxes + driverslicense_button: true, + passport_button: false, + third_party_no_button: true, + question_1_yes: true, + question_2_no: true, + question_3_no: true, + question_4_no: true, + question_5_yes: true, + relationship_nature_residential: true, + }); + + form.flatten(); + + const savedBytes = await pdf.save(); + const outputPath = await saveTestOutput("forms/fintrac-flattened.pdf", savedBytes); + console.log(` -> Flattened output: ${outputPath}`); + + expect(savedBytes.length).toBeGreaterThan(0); + + // Form should have no fields after flattening + const pdf2 = await PDF.load(savedBytes); + const form2 = pdf2.getForm(); + if (form2) { + expect(form2.getFields().length).toBe(0); + } + }); +}); diff --git a/src/document/forms/appearance-generator.ts b/src/document/forms/appearance-generator.ts index 43332dd..99eec0d 100644 --- a/src/document/forms/appearance-generator.ts +++ b/src/document/forms/appearance-generator.ts @@ -38,7 +38,7 @@ export { } from "./appearance-utils"; // Import implementation modules -import type { ExtractedAppearanceStyle } from "./appearance-utils"; +import type { AppearanceContext, ExtractedAppearanceStyle } from "./appearance-utils"; import * as ButtonAppearance from "./button-appearance"; import * as ChoiceAppearance from "./choice-appearance"; import * as TextAppearance from "./text-appearance"; @@ -67,9 +67,7 @@ export class AppearanceGenerator { /** * Get the shared context for appearance generation. */ - private getContext(): TextAppearance.TextAppearanceContext & - ButtonAppearance.ButtonAppearanceContext & - ChoiceAppearance.ChoiceAppearanceContext { + private getContext(): AppearanceContext { return { acroForm: this.acroForm, registry: this.registry, diff --git a/src/document/forms/appearance-utils.ts b/src/document/forms/appearance-utils.ts index 33deb6a..c0f1df0 100644 --- a/src/document/forms/appearance-utils.ts +++ b/src/document/forms/appearance-utils.ts @@ -3,6 +3,8 @@ */ import type { Operator } from "#src/content/operators"; +import { ContentStreamParser } from "#src/content/parsing/content-stream-parser"; +import { isParsedOperation, type ContentToken } from "#src/content/parsing/types"; import { closePath, curveTo, @@ -23,8 +25,18 @@ import { import { PdfDict } from "#src/objects/pdf-dict"; import { PdfName } from "#src/objects/pdf-name"; import type { PdfStream } from "#src/objects/pdf-stream"; +import { PdfString } from "#src/objects/pdf-string"; -import { type FormFont, isEmbeddedFont } from "./form-font"; +import type { ObjectRegistry } from "../object-registry"; +import type { AcroForm } from "./acro-form"; +import type { FormField, RgbColor } from "./fields"; +import { + ExistingFont, + type FormFont, + isEmbeddedFont, + isExistingFont, + mapToStandardFont, +} from "./form-font"; /** * Parsed default appearance string components. @@ -68,6 +80,11 @@ export interface FontMetrics { getTextWidth(text: string, fontSize: number): number; } +export interface AppearanceFontSource { + getFont(): FormFont | null; + defaultAppearance?: string | null; +} + /** * Constants for appearance generation. */ @@ -79,118 +96,317 @@ export const DEFAULT_HIGHLIGHT_COLOR = { r: 153 / 255, g: 193 / 255, b: 218 / 25 /** * Extract styling information from an existing appearance stream. * - * Parses the content stream to find colors, fonts, and border widths - * so they can be reused when regenerating the appearance. + * Uses the content stream parser to walk operations while tracking + * the graphics state stack (q/Q). This correctly identifies: + * - Background color: fill color when a rectangle is filled outside text + * - Border color/width: stroke color/width when stroked outside text + * - Text color: fill color at the time text is actually shown + * - Font: the Tf setting active when text is shown + * + * Handles all color spaces: gray (g/G), RGB (rg/RG), and CMYK (k/K). */ export function extractAppearanceStyle(stream: PdfStream): ExtractedAppearanceStyle { const style: ExtractedAppearanceStyle = {}; try { const data = stream.getDecodedData(); + const parser = new ContentStreamParser(data); + const { operations } = parser.parse(); + + // Graphics state tracking + interface GState { + fillColor: number[] | null; + strokeColor: number[] | null; + lineWidth: number | null; + fontName: string | null; + fontSize: number | null; + } - const content = new TextDecoder().decode(data); + const stateStack: GState[] = []; + let state: GState = { + fillColor: null, + strokeColor: null, + lineWidth: null, + fontName: null, + fontSize: null, + }; - // Extract background color (first fill color before any BT block) - // Look for: r g b rg (RGB) or g g (gray) or c m y k k (CMYK) - const btIndex = content.indexOf("BT"); - const preBT = btIndex > 0 ? content.slice(0, btIndex) : content; + let inTextBlock = false; + let hasSeenTextShowOp = false; - // RGB fill: "0.5 0.5 0.5 rg" - const rgMatch = preBT.match(/([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+rg/); + // State captured at time of text showing (most accurate) + let shownTextColor: number[] | null = null; + let shownFontName: string | null = null; + let shownFontSize: number | null = null; - if (rgMatch) { - style.backgroundColor = [ - Number.parseFloat(rgMatch[1]), - Number.parseFloat(rgMatch[2]), - Number.parseFloat(rgMatch[3]), - ]; - } + // Last font/color set inside a text block (fallback for empty fields) + let textBlockFontName: string | null = null; + let textBlockFontSize: number | null = null; + let textBlockFillColor: number[] | null = null; - // Gray fill: "0.5 g" (but not "0 g" which resets) + for (const op of operations) { + if (!isParsedOperation(op)) { + continue; + } - if (!style.backgroundColor) { - const gMatch = preBT.match(/([\d.]+)\s+g(?!\w)/); + const { operator, operands } = op; + + switch (operator) { + // ── Graphics state stack ── + case "q": + stateStack.push({ ...state }); + break; + case "Q": + if (stateStack.length > 0) { + // biome-ignore lint/style/noNonNullAssertion: length check above + state = stateStack.pop()!; + } + break; + + // ── Fill colors (g, rg, k) ── + case "g": + state.fillColor = [num(operands[0])]; + break; + case "rg": + state.fillColor = [num(operands[0]), num(operands[1]), num(operands[2])]; + break; + case "k": + state.fillColor = [ + num(operands[0]), + num(operands[1]), + num(operands[2]), + num(operands[3]), + ]; + break; + + // ── Stroke colors (G, RG, K) ── + case "G": + state.strokeColor = [num(operands[0])]; + break; + case "RG": + state.strokeColor = [num(operands[0]), num(operands[1]), num(operands[2])]; + break; + case "K": + state.strokeColor = [ + num(operands[0]), + num(operands[1]), + num(operands[2]), + num(operands[3]), + ]; + break; + + // ── Line width ── + case "w": + state.lineWidth = num(operands[0]); + break; + + // ── Font ── + case "Tf": + state.fontName = nameStr(operands[0]); + state.fontSize = num(operands[1]); + + if (inTextBlock) { + textBlockFontName = state.fontName; + textBlockFontSize = state.fontSize; + } + break; + + // ── Fill operations (background detection) ── + // Only treat as background if we haven't entered a text block yet + case "f": + case "F": + case "f*": + if (!inTextBlock && !hasSeenTextShowOp && state.fillColor) { + style.backgroundColor = [...state.fillColor]; + } + break; + + // ── Combined fill+stroke ── + case "B": + case "B*": + case "b": + case "b*": + if (!inTextBlock && !hasSeenTextShowOp) { + if (state.fillColor) { + style.backgroundColor = [...state.fillColor]; + } + if (state.strokeColor) { + style.borderColor = [...state.strokeColor]; + } + if (state.lineWidth != null) { + style.borderWidth = state.lineWidth; + } + } + break; + + // ── Stroke operations (border detection) ── + case "S": + case "s": + if (!inTextBlock && !hasSeenTextShowOp) { + if (state.strokeColor) { + style.borderColor = [...state.strokeColor]; + } + if (state.lineWidth != null) { + style.borderWidth = state.lineWidth; + } + } + break; + + // ── Text blocks ── + case "BT": + inTextBlock = true; + break; + case "ET": + inTextBlock = false; + break; + + // ── Text showing operations ── + // The fill color at text-show time IS the text color (render mode 0) + case "Tj": + case "TJ": + case "'": + case '"': + hasSeenTextShowOp = true; + if (state.fillColor) { + shownTextColor = [...state.fillColor]; + } + if (state.fontName) { + shownFontName = state.fontName; + shownFontSize = state.fontSize; + } + break; + } - if (gMatch && Number.parseFloat(gMatch[1]) !== 0) { - style.backgroundColor = [Number.parseFloat(gMatch[1])]; + // Track fill color changes inside text blocks (for empty field fallback) + if ( + inTextBlock && + (operator === "g" || operator === "rg" || operator === "k") && + state.fillColor + ) { + textBlockFillColor = [...state.fillColor]; } } - // Extract border color (stroke color before BT block) - // Only extract if there's actually a stroke operation (S or s) - otherwise the - // stroke color setting wasn't used to draw a visible border - const hasStrokeOp = /\bS\b/.test(preBT); - - if (hasStrokeOp) { - // RGB stroke: "0.5 0.5 0.5 RG" - const RGMatch = preBT.match(/([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+RG/); - - if (RGMatch) { - style.borderColor = [ - Number.parseFloat(RGMatch[1]), - Number.parseFloat(RGMatch[2]), - Number.parseFloat(RGMatch[3]), - ]; + // Assign text color: prefer shown, then text-block, then nothing + if (shownTextColor) { + style.textColor = shownTextColor; + } else if (textBlockFillColor) { + style.textColor = textBlockFillColor; + } + + // Assign font: prefer shown, then text-block, then last seen + const fontName = shownFontName ?? textBlockFontName ?? state.fontName; + const fontSize = shownFontSize ?? textBlockFontSize ?? state.fontSize; + + if (fontName) { + style.fontName = fontName; + if (fontSize != null && fontSize > 0) { + style.fontSize = fontSize; } + } + } catch { + // If parsing fails, return empty style + } - // Gray stroke: "0.5 G" + return style; +} - if (!style.borderColor) { - const GMatch = preBT.match(/([\d.]+)\s+G(?!\w)/); +/** + * Resolve the first font candidate that can be used for the given appearance text. + */ +export function chooseAppearanceFont( + text: string, + candidates: Iterable, +): FormFont { + for (const candidate of candidates) { + if (!candidate) { + continue; + } - if (GMatch) { - style.borderColor = [Number.parseFloat(GMatch[1])]; - } + if (isExistingFont(candidate)) { + if (candidate.canUseForAppearance(text)) { + return candidate; } - // Border width: "2 w" - only meaningful if there's a stroke - const wMatch = preBT.match(/([\d.]+)\s+w/); + continue; + } - if (wMatch) { - style.borderWidth = Number.parseFloat(wMatch[1]); + if (isEmbeddedFont(candidate)) { + if (candidate.canEncode(text)) { + return candidate; } - } - // Extract text color (inside BT...ET block) - const btMatch = content.match(/BT[\s\S]*?ET/); + continue; + } - if (btMatch) { - const btContent = btMatch[0]; + return candidate; + } - // RGB text color - const textRgMatch = btContent.match(/([\d.]+)\s+([\d.]+)\s+([\d.]+)\s+rg/); + return new ExistingFont("Helv", null, null); +} - if (textRgMatch) { - style.textColor = [ - Number.parseFloat(textRgMatch[1]), - Number.parseFloat(textRgMatch[2]), - Number.parseFloat(textRgMatch[3]), - ]; - } +/** + * Resolve the best available font for generating an appearance. + */ +export function resolveAppearanceFont( + acroForm: AcroForm, + field: AppearanceFontSource, + text: string, + existingFontName?: string, +): FormFont { + const candidates: FormFont[] = []; + const fieldFont = field.getFont(); + const defaultFont = acroForm.getDefaultFont(); + + if (fieldFont) { + candidates.push(fieldFont); + } - // Gray text color + if (defaultFont) { + candidates.push(defaultFont); + } - if (!style.textColor) { - const textGMatch = btContent.match(/([\d.]+)\s+g(?!\w)/); + if (existingFontName) { + const existingFont = acroForm.getExistingFont(existingFontName); - if (textGMatch) { - style.textColor = [Number.parseFloat(textGMatch[1])]; - } - } + if (existingFont) { + candidates.push(existingFont); } + } - // Extract font info: "/Helv 12 Tf" - const fontMatch = content.match(/\/(\w+)\s+([\d.]+)\s+Tf/); + const da = field.defaultAppearance ?? acroForm.defaultAppearance; + const daInfo = parseDAString(da); + const daFont = acroForm.getExistingFont(daInfo.fontName); - if (fontMatch) { - style.fontName = fontMatch[1]; - style.fontSize = Number.parseFloat(fontMatch[2]); + if (daFont) { + candidates.push(daFont); + } + + for (const availableFont of acroForm.getAvailableFonts()) { + if (!candidates.includes(availableFont)) { + candidates.push(availableFont); } - } catch { - // If parsing fails, return empty style } - return style; + return chooseAppearanceFont(text, candidates); +} + +/** Extract numeric value from a content token operand. */ +function num(token?: ContentToken): number { + if (token && token.type === "number" && typeof token.value === "number") { + return token.value; + } + + return 0; +} + +/** Extract name string from a content token operand (strips leading slash). */ +function nameStr(token: ContentToken): string { + if (token && token.type === "name" && typeof token.value === "string") { + return token.value; + } + + return ""; } /** @@ -394,20 +610,186 @@ export function getFontMetrics(font: FormFont): FontMetrics { }; } +// ───────────────────────────────────────────────────────────────────────────── +// Shared Appearance Context & Helpers +// ───────────────────────────────────────────────────────────────────────────── + +/** + * Shared context for appearance stream generation. + * + * All appearance generators (text, button, choice) share this context + * to coordinate font resource naming across a generation session. + */ +export interface AppearanceContext { + acroForm: AcroForm; + registry: ObjectRegistry; + fontResourceNames: Map; + fontNameCounter: number; +} + /** - * Map font names to Standard 14 font names. + * Assign a resource name for a font in the current generation session. */ -export function mapToStandardFontName(name: string): string { - const aliases: Record = { - Helv: "Helvetica", - HeBo: "Helvetica-Bold", - TiRo: "Times-Roman", - TiBo: "Times-Bold", - Cour: "Courier", - CoBo: "Courier-Bold", - ZaDb: "ZapfDingbats", - Symb: "Symbol", +export function getFontResourceName( + ctx: AppearanceContext, + font: FormFont, +): { name: string; counter: number } { + if (ctx.fontResourceNames.has(font)) { + return { + // biome-ignore lint/style/noNonNullAssertion: checked above + name: ctx.fontResourceNames.get(font)!, + counter: ctx.fontNameCounter, + }; + } + + let name: string; + + if (isExistingFont(font)) { + name = font.name.startsWith("/") ? font.name : `/${font.name}`; + } else { + ctx.fontNameCounter++; + name = `/F${ctx.fontNameCounter}`; + } + + ctx.fontResourceNames.set(font, name); + + return { + name, + counter: ctx.fontNameCounter, }; +} + +/** + * Resolve the default appearance string from field or form defaults, and parse it. + */ +export function parseDefaultAppearance(ctx: AppearanceContext, field: FormField): ParsedDA { + const da = field.defaultAppearance ?? ctx.acroForm.defaultAppearance ?? ""; + + return parseDAString(da); +} + +/** + * Calculate font size to fit text within given dimensions. + */ +export function calculateAutoFontSize( + text: string, + width: number, + height: number, + font: FormFont, + isMultiline = false, +): number { + const contentWidth = width - 2 * PADDING; + const contentHeight = height - 2 * PADDING; + + if (isMultiline) { + return Math.max(MIN_FONT_SIZE, Math.min(12, contentHeight * 0.15)); + } + + const heightBased = contentHeight * 0.7; + + let fontSize = heightBased; + const metrics = getFontMetrics(font); + let textWidth = metrics.getTextWidth(text || "X", fontSize); + + while (textWidth > contentWidth && fontSize > MIN_FONT_SIZE) { + fontSize -= 1; + textWidth = metrics.getTextWidth(text || "X", fontSize); + } + + return Math.max(MIN_FONT_SIZE, Math.min(fontSize, MAX_FONT_SIZE)); +} + +/** + * Encode text for use in a PDF content stream with the given font. + */ +export function encodeTextForFont(text: string, font: FormFont): PdfString { + if (isEmbeddedFont(font)) { + font.markUsedInForm(); + + if (!font.canEncode(text)) { + const unencodable = font.getUnencodableCharacters(text); + const firstBad = unencodable[0]; + + throw new Error( + `Font cannot encode character '${firstBad}' (U+${firstBad.codePointAt(0)?.toString(16).toUpperCase().padStart(4, "0")})`, + ); + } + + const gids = font.encodeTextToGids(text); + const bytes = new Uint8Array(gids.length * 2); + + for (let i = 0; i < gids.length; i++) { + bytes[i * 2] = (gids[i] >> 8) & 0xff; + bytes[i * 2 + 1] = gids[i] & 0xff; + } - return aliases[name] || name; + return PdfString.fromBytes(bytes); + } + + if (isExistingFont(font) && font.isCIDFont) { + return PdfString.fromBytes(font.encodeTextToBytes(text)); + } + + return PdfString.fromString(text); +} + +/** + * Generate color operators for text rendering. + */ +export function getColorOperators(textColor: RgbColor | null, daInfo: ParsedDA): Operator[] { + if (textColor) { + return [setNonStrokingRGB(textColor.r, textColor.g, textColor.b)]; + } + + switch (daInfo.colorOp) { + case "g": + return [setNonStrokingGray(daInfo.colorArgs[0] ?? 0)]; + case "rg": + return [ + setNonStrokingRGB( + daInfo.colorArgs[0] ?? 0, + daInfo.colorArgs[1] ?? 0, + daInfo.colorArgs[2] ?? 0, + ), + ]; + case "k": + return [ + setNonStrokingCMYK( + daInfo.colorArgs[0] ?? 0, + daInfo.colorArgs[1] ?? 0, + daInfo.colorArgs[2] ?? 0, + daInfo.colorArgs[3] ?? 0, + ), + ]; + default: + return [setNonStrokingGray(0)]; + } +} + +/** + * Build a resources dictionary containing a single font entry. + */ +export function buildFontResources(font: FormFont, fontName: string): PdfDict { + const resources = new PdfDict(); + const fonts = new PdfDict(); + + const cleanName = fontName.startsWith("/") ? fontName.slice(1) : fontName; + + if (isEmbeddedFont(font)) { + fonts.set(cleanName, font.ref); + } else if (isExistingFont(font) && font.ref) { + fonts.set(cleanName, font.ref); + } else { + const fontDict = new PdfDict(); + + fontDict.set("Type", PdfName.of("Font")); + fontDict.set("Subtype", PdfName.of("Type1")); + fontDict.set("BaseFont", PdfName.of(mapToStandardFont(cleanName) ?? cleanName)); + + fonts.set(cleanName, fontDict); + } + + resources.set("Font", fonts); + + return resources; } diff --git a/src/document/forms/button-appearance.ts b/src/document/forms/button-appearance.ts index bae3834..ca181cb 100644 --- a/src/document/forms/button-appearance.ts +++ b/src/document/forms/button-appearance.ts @@ -8,7 +8,6 @@ */ import { ContentStreamBuilder } from "#src/content/content-stream"; -import type { EmbeddedFont } from "#src/fonts/embedded-font"; import { beginText, endText, @@ -25,26 +24,25 @@ import { stroke, } from "#src/helpers/operators"; import { PdfDict } from "#src/objects/pdf-dict"; -import { PdfName } from "#src/objects/pdf-name"; import type { PdfStream } from "#src/objects/pdf-stream"; import { PdfString } from "#src/objects/pdf-string"; -import type { ObjectRegistry } from "../object-registry"; -import type { AcroForm } from "./acro-form"; import { + type AppearanceContext, + buildFontResources, buildZapfDingbatsResources, + calculateAutoFontSize, drawCircle, + encodeTextForFont, generateBackgroundAndBorder, + getColorOperators, getFontMetrics, - MAX_FONT_SIZE, - MIN_FONT_SIZE, - mapToStandardFontName, - PADDING, - type ParsedDA, - parseDAString, + getFontResourceName, + parseDefaultAppearance, + resolveAppearanceFont, } from "./appearance-utils"; -import type { ButtonField, CheckboxField, RadioField, RgbColor } from "./fields"; -import { ExistingFont, type FormFont, isEmbeddedFont, isExistingFont } from "./form-font"; +import type { ButtonField, CheckboxField, RadioField } from "./fields"; +import type { FormFont } from "./form-font"; import type { WidgetAnnotation } from "./widget-annotation"; /** @@ -56,12 +54,7 @@ const ZAPF_CIRCLE = "\x6C"; // "l" = filled circle in ZapfDingbats /** * Context for button appearance generation. */ -export interface ButtonAppearanceContext { - acroForm: AcroForm; - registry: ObjectRegistry; - fontResourceNames: Map; - fontNameCounter: number; -} +export type ButtonAppearanceContext = AppearanceContext; /** * Generate appearance streams for a checkbox. @@ -198,7 +191,7 @@ export function generateButtonAppearance( }; } - const font = resolveFont(ctx, field); + const font = resolveAppearanceFont(ctx.acroForm, field, caption); const { name: fontName, counter } = getFontResourceName(ctx, font); @@ -243,193 +236,10 @@ export function generateButtonAppearance( .add(endText()) .add(popGraphicsState()); - const resources = buildResources(ctx, font, fontName); + const resources = buildFontResources(font, fontName); return { stream: content.toFormXObject([0, 0, width, height], resources), fontNameCounter: ctx.fontNameCounter, }; } - -// ───────────────────────────────────────────────────────────────────────────── -// Helper Functions -// ───────────────────────────────────────────────────────────────────────────── - -function resolveFont( - ctx: ButtonAppearanceContext, - field: { getFont(): FormFont | null; defaultAppearance?: string | null }, -): FormFont { - const fieldFont = field.getFont(); - - if (fieldFont) { - return fieldFont; - } - - const defaultFont = ctx.acroForm.getDefaultFont(); - - if (defaultFont) { - return defaultFont; - } - - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - const daInfo = parseDAString(da); - const existingFont = ctx.acroForm.getExistingFont(daInfo.fontName); - - if (existingFont) { - return existingFont; - } - - return new ExistingFont("Helv", null, null); -} - -function getFontResourceName( - ctx: ButtonAppearanceContext, - font: FormFont, -): { name: string; counter: number } { - if (ctx.fontResourceNames.has(font)) { - return { - // biome-ignore lint/style/noNonNullAssertion: fontResourceNames is guaranteed to have a value - name: ctx.fontResourceNames.get(font)!, - counter: ctx.fontNameCounter, - }; - } - - let name: string; - - if (isExistingFont(font)) { - name = font.name.startsWith("/") ? font.name : `/${font.name}`; - } else { - ctx.fontNameCounter++; - name = `/F${ctx.fontNameCounter}`; - } - - ctx.fontResourceNames.set(font, name); - - return { - name, - counter: ctx.fontNameCounter, - }; -} - -function parseDefaultAppearance( - ctx: ButtonAppearanceContext, - field: { defaultAppearance?: string | null }, -): ParsedDA { - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - - return parseDAString(da); -} - -function calculateAutoFontSize( - text: string, - width: number, - height: number, - font: FormFont, -): number { - const contentWidth = width - 2 * PADDING; - const contentHeight = height - 2 * PADDING; - - const heightBased = contentHeight * 0.7; - - let fontSize = heightBased; - const metrics = getFontMetrics(font); - let textWidth = metrics.getTextWidth(text || "X", fontSize); - - while (textWidth > contentWidth && fontSize > MIN_FONT_SIZE) { - fontSize -= 1; - textWidth = metrics.getTextWidth(text || "X", fontSize); - } - - return Math.max(MIN_FONT_SIZE, Math.min(fontSize, MAX_FONT_SIZE)); -} - -function encodeTextForFont(text: string, font: FormFont): PdfString { - if (isEmbeddedFont(font)) { - font.markUsedInForm(); - - if (!font.canEncode(text)) { - const unencodable = font.getUnencodableCharacters(text); - const firstBad = unencodable[0]; - - throw new Error( - `Font cannot encode character '${firstBad}' (U+${firstBad.codePointAt(0)?.toString(16).toUpperCase().padStart(4, "0")})`, - ); - } - - const gids = font.encodeTextToGids(text); - const bytes = new Uint8Array(gids.length * 2); - - for (let i = 0; i < gids.length; i++) { - bytes[i * 2] = (gids[i] >> 8) & 0xff; - bytes[i * 2 + 1] = gids[i] & 0xff; - } - - return PdfString.fromBytes(bytes); - } - - return PdfString.fromString(text); -} - -import type { Operator } from "#src/content/operators"; -import { setNonStrokingCMYK } from "#src/helpers/operators"; - -function getColorOperators(textColor: RgbColor | null, daInfo: ParsedDA): Operator[] { - if (textColor) { - return [setNonStrokingRGB(textColor.r, textColor.g, textColor.b)]; - } - - switch (daInfo.colorOp) { - case "g": - return [setNonStrokingGray(daInfo.colorArgs[0] ?? 0)]; - case "rg": - return [ - setNonStrokingRGB( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - ), - ]; - case "k": - return [ - setNonStrokingCMYK( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - daInfo.colorArgs[3] ?? 0, - ), - ]; - default: - return [setNonStrokingGray(0)]; - } -} - -function buildResources(ctx: ButtonAppearanceContext, font: FormFont, fontName: string): PdfDict { - const resources = new PdfDict(); - const fonts = new PdfDict(); - - const cleanName = fontName.startsWith("/") ? fontName.slice(1) : fontName; - - if (isEmbeddedFont(font)) { - fonts.set(cleanName, font.ref); - } else if (isExistingFont(font) && font.ref) { - fonts.set(cleanName, font.ref); - } else { - const fontDict = new PdfDict(); - - fontDict.set("Type", PdfName.of("Font")); - fontDict.set("Subtype", PdfName.of("Type1")); - fontDict.set("BaseFont", PdfName.of(mapToStandardFontName(cleanName))); - - fonts.set(cleanName, fontDict); - } - - resources.set("Font", fonts); - - return resources; -} diff --git a/src/document/forms/choice-appearance.ts b/src/document/forms/choice-appearance.ts index 6c339f0..5505b20 100644 --- a/src/document/forms/choice-appearance.ts +++ b/src/document/forms/choice-appearance.ts @@ -7,8 +7,6 @@ */ import { ContentStreamBuilder } from "#src/content/content-stream"; -import type { Operator } from "#src/content/operators"; -import type { EmbeddedFont } from "#src/fonts/embedded-font"; import { beginMarkedContent, beginText, @@ -22,41 +20,33 @@ import { pushGraphicsState, rectangle, setFont, - setNonStrokingCMYK, setNonStrokingGray, setNonStrokingRGB, showText, } from "#src/helpers/operators"; -import { PdfDict } from "#src/objects/pdf-dict"; -import { PdfName } from "#src/objects/pdf-name"; import type { PdfStream } from "#src/objects/pdf-stream"; -import { PdfString } from "#src/objects/pdf-string"; -import type { ObjectRegistry } from "../object-registry"; -import type { AcroForm } from "./acro-form"; import { + type AppearanceContext, + buildFontResources, + calculateAutoFontSize, DEFAULT_HIGHLIGHT_COLOR, + encodeTextForFont, + getColorOperators, getFontMetrics, - MAX_FONT_SIZE, - MIN_FONT_SIZE, - mapToStandardFontName, + getFontResourceName, PADDING, - type ParsedDA, - parseDAString, + parseDefaultAppearance, + resolveAppearanceFont, } from "./appearance-utils"; -import type { DropdownField, ListBoxField, RgbColor } from "./fields"; -import { ExistingFont, type FormFont, isEmbeddedFont, isExistingFont } from "./form-font"; +import type { DropdownField, ListBoxField } from "./fields"; +import type { FormFont } from "./form-font"; import type { WidgetAnnotation } from "./widget-annotation"; /** * Context for choice appearance generation. */ -export interface ChoiceAppearanceContext { - acroForm: AcroForm; - registry: ObjectRegistry; - fontResourceNames: Map; - fontNameCounter: number; -} +export type ChoiceAppearanceContext = AppearanceContext; /** * Generate appearance stream for a dropdown (combo box). @@ -73,7 +63,7 @@ export function generateDropdownAppearance( const selectedOption = options.find(opt => opt.value === value); const displayText = selectedOption?.display ?? value; - const font = resolveFont(ctx, field); + const font = resolveAppearanceFont(ctx.acroForm, field, displayText); const { name: fontName, counter } = getFontResourceName(ctx, font); ctx.fontNameCounter = counter; @@ -108,7 +98,7 @@ export function generateDropdownAppearance( endMarkedContent(), ]); - const resources = buildResources(ctx, font, fontName); + const resources = buildFontResources(font, fontName); return { stream: content.toFormXObject([0, 0, width, height], resources), @@ -128,7 +118,11 @@ export function generateListBoxAppearance( const options = field.getOptions(); const { width, height } = widget; - const font = resolveFont(ctx, field); + const font = resolveAppearanceFont( + ctx.acroForm, + field, + options.map(option => option.display).join(""), + ); const { name: fontName, counter } = getFontResourceName(ctx, font); ctx.fontNameCounter = counter; @@ -222,190 +216,10 @@ export function generateListBoxAppearance( content.add(endText()).add(popGraphicsState()).add(endMarkedContent()); - const resources = buildResources(ctx, font, fontName); + const resources = buildFontResources(font, fontName); return { stream: content.toFormXObject([0, 0, width, height], resources), fontNameCounter: ctx.fontNameCounter, }; } - -// ───────────────────────────────────────────────────────────────────────────── -// Helper Functions -// ───────────────────────────────────────────────────────────────────────────── - -function resolveFont( - ctx: ChoiceAppearanceContext, - field: { getFont(): FormFont | null; defaultAppearance?: string | null }, -): FormFont { - const fieldFont = field.getFont(); - - if (fieldFont) { - return fieldFont; - } - - const defaultFont = ctx.acroForm.getDefaultFont(); - - if (defaultFont) { - return defaultFont; - } - - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - const daInfo = parseDAString(da); - const existingFont = ctx.acroForm.getExistingFont(daInfo.fontName); - - if (existingFont) { - return existingFont; - } - - return new ExistingFont("Helv", null, null); -} - -function getFontResourceName( - ctx: ChoiceAppearanceContext, - font: FormFont, -): { name: string; counter: number } { - if (ctx.fontResourceNames.has(font)) { - return { - // biome-ignore lint/style/noNonNullAssertion: fontResourceNames is guaranteed to have a value - name: ctx.fontResourceNames.get(font)!, - counter: ctx.fontNameCounter, - }; - } - - let name: string; - - if (isExistingFont(font)) { - name = font.name.startsWith("/") ? font.name : `/${font.name}`; - } else { - ctx.fontNameCounter++; - name = `/F${ctx.fontNameCounter}`; - } - - ctx.fontResourceNames.set(font, name); - - return { - name, - counter: ctx.fontNameCounter, - }; -} - -function parseDefaultAppearance( - ctx: ChoiceAppearanceContext, - field: { defaultAppearance?: string | null }, -): ParsedDA { - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - - return parseDAString(da); -} - -function calculateAutoFontSize( - text: string, - width: number, - height: number, - font: FormFont, -): number { - const contentWidth = width - 2 * PADDING; - const contentHeight = height - 2 * PADDING; - - const heightBased = contentHeight * 0.7; - - let fontSize = heightBased; - const metrics = getFontMetrics(font); - let textWidth = metrics.getTextWidth(text || "X", fontSize); - - while (textWidth > contentWidth && fontSize > MIN_FONT_SIZE) { - fontSize -= 1; - textWidth = metrics.getTextWidth(text || "X", fontSize); - } - - return Math.max(MIN_FONT_SIZE, Math.min(fontSize, MAX_FONT_SIZE)); -} - -function encodeTextForFont(text: string, font: FormFont): PdfString { - if (isEmbeddedFont(font)) { - font.markUsedInForm(); - - if (!font.canEncode(text)) { - const unencodable = font.getUnencodableCharacters(text); - const firstBad = unencodable[0]; - - throw new Error( - `Font cannot encode character '${firstBad}' (U+${firstBad.codePointAt(0)?.toString(16).toUpperCase().padStart(4, "0")})`, - ); - } - - const gids = font.encodeTextToGids(text); - const bytes = new Uint8Array(gids.length * 2); - - for (let i = 0; i < gids.length; i++) { - bytes[i * 2] = (gids[i] >> 8) & 0xff; - bytes[i * 2 + 1] = gids[i] & 0xff; - } - - return PdfString.fromBytes(bytes); - } - - return PdfString.fromString(text); -} - -function getColorOperators(textColor: RgbColor | null, daInfo: ParsedDA): Operator[] { - if (textColor) { - return [setNonStrokingRGB(textColor.r, textColor.g, textColor.b)]; - } - - switch (daInfo.colorOp) { - case "g": - return [setNonStrokingGray(daInfo.colorArgs[0] ?? 0)]; - case "rg": - return [ - setNonStrokingRGB( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - ), - ]; - case "k": - return [ - setNonStrokingCMYK( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - daInfo.colorArgs[3] ?? 0, - ), - ]; - default: - return [setNonStrokingGray(0)]; - } -} - -function buildResources(ctx: ChoiceAppearanceContext, font: FormFont, fontName: string): PdfDict { - const resources = new PdfDict(); - const fonts = new PdfDict(); - - const cleanName = fontName.startsWith("/") ? fontName.slice(1) : fontName; - - if (isEmbeddedFont(font)) { - fonts.set(cleanName, font.ref); - } else if (isExistingFont(font) && font.ref) { - fonts.set(cleanName, font.ref); - } else { - const fontDict = new PdfDict(); - - fontDict.set("Type", PdfName.of("Font")); - fontDict.set("Subtype", PdfName.of("Type1")); - fontDict.set("BaseFont", PdfName.of(mapToStandardFontName(cleanName))); - - fonts.set(cleanName, fontDict); - } - - resources.set("Font", fonts); - - return resources; -} diff --git a/src/document/forms/form-font.test.ts b/src/document/forms/form-font.test.ts index 7fd3e95..7fac6aa 100644 --- a/src/document/forms/form-font.test.ts +++ b/src/document/forms/form-font.test.ts @@ -2,10 +2,58 @@ * Tests for form font functionality. */ +import { CIDFont } from "#src/fonts/cid-font"; +import type { FontProgram } from "#src/fonts/font-program"; +import { ToUnicodeMap } from "#src/fonts/to-unicode"; import { describe, expect, it } from "vitest"; import { ExistingFont, isEmbeddedFont, isExistingFont } from "./form-font"; +class StubFontProgram implements FontProgram { + readonly type = "truetype" as const; + readonly numGlyphs = 4; + readonly unitsPerEm = 1000; + readonly bbox = [0, 0, 1000, 1000] as const; + readonly postScriptName = "Stub"; + readonly familyName = "Stub"; + readonly isFixedPitch = false; + readonly italicAngle = 0; + readonly ascent = 800; + readonly descent = -200; + readonly capHeight = 700; + readonly xHeight = 500; + readonly stemV = 80; + + constructor( + private readonly glyphMap: Map, + private readonly renderableGlyphs: Set, + ) {} + + getGlyphId(codePoint: number): number { + return this.glyphMap.get(codePoint) ?? 0; + } + + getAdvanceWidth(_glyphId: number): number { + return 600; + } + + hasGlyph(codePoint: number): boolean { + return this.glyphMap.has(codePoint); + } + + hasRenderableGlyphs(): boolean { + return this.renderableGlyphs.size > 0; + } + + hasRenderableGlyph(glyphId: number): boolean { + return this.renderableGlyphs.has(glyphId); + } + + getData(): Uint8Array { + return new Uint8Array(); + } +} + describe("ExistingFont", () => { describe("constructor", () => { it("should create with name and null ref", () => { @@ -42,6 +90,68 @@ describe("ExistingFont", () => { // Use Unicode escape for CJK character (U+4E16 = ) expect(font.canEncode("\u4E16")).toBe(false); }); + + it("returns true for CID fonts with explicit CIDToGID maps when ToUnicode resolves the code", () => { + const cidFont = new CIDFont({ + subtype: "CIDFontType2", + baseFontName: "StubCID", + cidToGidMap: new Uint16Array([0, 7]), + embeddedProgram: new StubFontProgram(new Map([[0x0041, 7]]), new Set([7])), + toUnicodeMap: new ToUnicodeMap(new Map([[1, "A"]])), + }); + const font = new ExistingFont("F0", null, null, true, cidFont); + + expect(font.canEncode("A")).toBe(true); + expect(Array.from(font.encodeTextToBytes("A"))).toEqual([0x00, 0x01]); + }); + + it("returns false for astral characters in CID fonts", () => { + const cidFont = new CIDFont({ + subtype: "CIDFontType2", + baseFontName: "StubCID", + embeddedProgram: new StubFontProgram(new Map([[0x1f600, 5]]), new Set([5])), + }); + const font = new ExistingFont("F0", null, null, true, cidFont); + + expect(font.canEncode("😀")).toBe(false); + expect(font.canUseForAppearance("😀")).toBe(false); + }); + }); + + describe("canUseForAppearance", () => { + it("returns true for standard 14 fonts with encodable text", () => { + const font = new ExistingFont("Helv", null, null); + + expect(font.canUseForAppearance("Hello World")).toBe(true); + }); + + it("returns false for standard 14 fonts with unencodable text", () => { + const font = new ExistingFont("Helv", null, null); + + expect(font.canUseForAppearance("\u4E16")).toBe(false); + }); + + it("returns false for CID fonts whose mapped glyph is not renderable", () => { + const cidFont = new CIDFont({ + subtype: "CIDFontType2", + baseFontName: "StubCID", + embeddedProgram: new StubFontProgram(new Map([[0x0041, 1]]), new Set()), + }); + const font = new ExistingFont("F0", null, null, true, cidFont); + + expect(font.canUseForAppearance("A")).toBe(false); + }); + + it("returns true for CID fonts whose mapped glyph is renderable", () => { + const cidFont = new CIDFont({ + subtype: "CIDFontType2", + baseFontName: "StubCID", + embeddedProgram: new StubFontProgram(new Map([[0x0041, 1]]), new Set([1])), + }); + const font = new ExistingFont("F0", null, null, true, cidFont); + + expect(font.canUseForAppearance("A")).toBe(true); + }); }); describe("encodeText", () => { diff --git a/src/document/forms/form-font.ts b/src/document/forms/form-font.ts index bd7927f..780f967 100644 --- a/src/document/forms/form-font.ts +++ b/src/document/forms/form-font.ts @@ -8,6 +8,7 @@ * PDF Reference: Section 12.7.3.3 "Variable Text" */ +import { type CIDFont, parseCIDFont } from "#src/fonts/cid-font"; import { EmbeddedFont } from "#src/fonts/embedded-font"; import { parseSimpleFont, type SimpleFont } from "#src/fonts/simple-font"; import { @@ -16,8 +17,9 @@ import { getStandard14GlyphWidth, isStandard14Font, } from "#src/fonts/standard-14"; +import { parseToUnicode, type ToUnicodeMap } from "#src/fonts/to-unicode"; +import type { RefResolver } from "#src/helpers/types"; import { unicodeToGlyphName } from "#src/helpers/unicode"; -import { PdfArray } from "#src/objects/pdf-array"; import { PdfDict } from "#src/objects/pdf-dict"; import { PdfRef } from "#src/objects/pdf-ref"; import { PdfStream } from "#src/objects/pdf-stream"; @@ -35,46 +37,126 @@ export type FormFont = EmbeddedFont | ExistingFont; * This is a lightweight wrapper for fonts already present in the PDF, * typically from the AcroForm's /DR (Default Resources) dictionary. * - * Provides limited metrics based on Standard 14 font data for common fonts. + * Supports both simple fonts (Type1, TrueType) and CID fonts (Type0). + * CID fonts use 2-byte character codes and are commonly used with + * Identity-H/Identity-V encoding for CJK and Unicode text. */ export class ExistingFont { - /** Font name as it appears in the PDF (e.g., "Helv", "ZaDb") */ + /** Font name as it appears in the PDF (e.g., "Helv", "ZaDb", "F0") */ readonly name: string; /** Reference to font object in PDF (may be null for inline Standard 14 fonts) */ readonly ref: PdfRef | null; - /** Underlying SimpleFont if resolved from PDF */ + /** Whether this is a CID-keyed font (Type0 with Identity-H/V encoding) */ + readonly isCIDFont: boolean; + + /** Underlying SimpleFont if resolved from PDF (for non-CID fonts) */ private readonly simpleFont: SimpleFont | null; + /** Underlying CIDFont if resolved from PDF (for CID fonts) */ + private readonly cidFont: CIDFont | null; + /** Standard 14 font name if this maps to one (e.g., "Helvetica" for "Helv") */ private readonly standardFontName: string | null; - constructor(name: string, ref: PdfRef | null, simpleFont: SimpleFont | null = null) { + constructor( + name: string, + ref: PdfRef | null, + simpleFont: SimpleFont | null = null, + isCIDFont = false, + cidFont: CIDFont | null = null, + ) { this.name = name; this.ref = ref; this.simpleFont = simpleFont; + this.isCIDFont = isCIDFont; + this.cidFont = cidFont; // Map common form font names to Standard 14 fonts this.standardFontName = mapToStandardFont(name); } + /** + * Check whether this font can be safely used to generate an appearance for + * the given text. + */ + canUseForAppearance(text: string): boolean { + if (this.isCIDFont) { + if (!this.cidFont) { + return false; + } + + if (!this.cidFont.getEmbeddedProgram()) { + return false; + } + + for (const char of text) { + const codePoint = char.codePointAt(0)!; + + if (codePoint > 0xffff) { + return false; + } + + const charCode = this.cidFont.tryGetCharCodeForUnicode(codePoint); + + if (charCode === null || !this.cidFont.hasRenderableGlyphForCID(charCode)) { + return false; + } + } + + return true; + } + + if (!this.canEncode(text)) { + return false; + } + + if (this.simpleFont) { + return true; + } + + return this.standardFontName !== null; + } + /** * Check if font can encode the given text. * - * For existing fonts, this is always true for ASCII text. - * For non-ASCII, returns false (can't verify without full font data). + * CID fonts with Identity-H/V encoding can encode any BMP character. + * Simple fonts are limited to their encoding (typically Latin-1). */ canEncode(text: string): boolean { + if (this.isCIDFont) { + if (!this.cidFont) { + return false; + } + + if (!this.cidFont.getEmbeddedProgram()) { + return false; + } + + for (const char of text) { + const codePoint = char.codePointAt(0)!; + + if (codePoint > 0xffff) { + return false; + } + + if (this.cidFont.tryGetCharCodeForUnicode(codePoint) === null) { + return false; + } + } + + return true; + } + if (this.simpleFont) { return this.simpleFont.canEncode(text); } - // For Standard 14 fonts, only ASCII is safe + // Standard 14 fallback: only Latin-1 for (const char of text) { - const code = char.charCodeAt(0); - - if (code > 255) { + if (char.charCodeAt(0) > 255) { return false; } } @@ -85,7 +167,8 @@ export class ExistingFont { /** * Encode text to character codes for this font. * - * For existing fonts, uses WinAnsi encoding (0-255). + * For CID fonts: returns 2-byte Unicode code points (Identity-H/V encoding). + * For simple fonts: uses the font's encoding (WinAnsi, custom, etc.). */ encodeText(text: string): number[] { if (this.simpleFont) { @@ -102,95 +185,144 @@ export class ExistingFont { return codes; } + /** + * Encode text as bytes for use in a PdfString. + * + * For CID fonts with Identity-H/V encoding, the character codes in the + * PDF string are CIDs. With Identity CIDToGIDMap, CID = GID, so the + * character code must equal the glyph ID in the font program. + * + * The encoding pipeline is: + * Unicode code point → GID (via font program cmap) + * → character code to write (= CID that maps to that GID) + * + * For Identity CIDToGIDMap: write GID directly (CID = GID) + * For stream CIDToGIDMap: find CID where CIDToGIDMap[CID] = GID + * + * For simple fonts, produces single-byte codes. + */ + encodeTextToBytes(text: string): Uint8Array { + if (this.isCIDFont) { + const codePoints = Array.from(text, char => char.codePointAt(0)!); + const bytes = new Uint8Array(codePoints.length * 2); + + for (let i = 0; i < codePoints.length; i++) { + // Use CIDFont mapping if available, otherwise fall back to raw code point + const charCode = this.cidFont + ? this.cidFont.getCharCodeForUnicode(codePoints[i]) + : codePoints[i]; + bytes[i * 2] = (charCode >> 8) & 0xff; + bytes[i * 2 + 1] = charCode & 0xff; + } + + return bytes; + } + + // Simple font: single-byte encoding + const codes = this.encodeText(text); + + return new Uint8Array(codes); + } + /** * Get width of text in points at a given font size. */ getTextWidth(text: string, fontSize: number): number { - if (this.simpleFont) { - return this.simpleFont.getTextWidth(text, fontSize); + // CID font with parsed width data + if (this.cidFont) { + let totalWidth = 0; + + for (const char of text) { + // CIDFont.getWidth() expects a CID. For Identity-H, CID = character code. + // Use getCharCodeForUnicode() to get the correct CID for width lookup. + const cid = this.cidFont.getCharCodeForUnicode(char.codePointAt(0)!); + totalWidth += this.cidFont.getWidth(cid); + } + + return (totalWidth * fontSize) / 1000; } - if (!this.standardFontName) { - // Approximate for unknown fonts: 0.5 * fontSize per character - return text.length * fontSize * 0.5; + if (this.simpleFont) { + try { + return this.simpleFont.getTextWidth(text, fontSize); + } catch { + // Fall through to approximation if font can't encode the text + } } - let totalWidth = 0; + if (this.standardFontName) { + let totalWidth = 0; + + for (const char of text) { + const glyphName = unicodeToGlyphName(char.charCodeAt(0)); + const width = glyphName + ? (getStandard14GlyphWidth(this.standardFontName, glyphName) ?? + getStandard14DefaultWidth(this.standardFontName)) + : getStandard14DefaultWidth(this.standardFontName); + totalWidth += width; + } - for (const char of text) { - const glyphName = unicodeToGlyphName(char.charCodeAt(0)); - const width = glyphName - ? (getStandard14GlyphWidth(this.standardFontName, glyphName) ?? - getStandard14DefaultWidth(this.standardFontName)) - : getStandard14DefaultWidth(this.standardFontName); - totalWidth += width; + return (totalWidth * fontSize) / 1000; } - return (totalWidth * fontSize) / 1000; + // Approximate for unknown fonts: 0.5 * fontSize per character + return text.length * fontSize * 0.5; } /** * Get ascent in points at a given font size. */ getAscent(fontSize: number): number { - if (this.simpleFont?.descriptor) { - return (this.simpleFont.descriptor.ascent * fontSize) / 1000; - } - - if (this.standardFontName) { - const metrics = FONT_BASIC_METRICS[this.standardFontName]; - - if (metrics) { - return (metrics.ascent * fontSize) / 1000; - } - } - - return fontSize * 0.8; + return this.getMetric("ascent", fontSize, 0.8); } /** * Get descent in points at a given font size (negative value). */ getDescent(fontSize: number): number { - if (this.simpleFont?.descriptor) { - return (this.simpleFont.descriptor.descent * fontSize) / 1000; - } - - if (this.standardFontName) { - const metrics = FONT_BASIC_METRICS[this.standardFontName]; - - if (metrics) { - return (metrics.descent * fontSize) / 1000; - } - } - - return -fontSize * 0.2; + return this.getMetric("descent", fontSize, -0.2); } /** * Get cap height in points at a given font size. */ getCapHeight(fontSize: number): number { - if (this.simpleFont?.descriptor) { - return (this.simpleFont.descriptor.capHeight * fontSize) / 1000; + return this.getMetric("capHeight", fontSize, 0.7); + } + + /** + * Look up a font metric from the descriptor, Standard 14 tables, or a fallback ratio. + */ + private getMetric( + key: "ascent" | "descent" | "capHeight", + fontSize: number, + fallbackRatio: number, + ): number { + const descriptor = this.cidFont?.descriptor ?? this.simpleFont?.descriptor; + + if (descriptor) { + return (descriptor[key] * fontSize) / 1000; } if (this.standardFontName) { const metrics = FONT_BASIC_METRICS[this.standardFontName]; if (metrics) { - return (metrics.capHeight * fontSize) / 1000; + return (metrics[key] * fontSize) / 1000; } } - return fontSize * 0.7; + return fontSize * fallbackRatio; } } /** * Map common form font names to Standard 14 fonts. + * + * Returns the canonical Standard 14 name for common aliases (e.g., "Helv" → "Helvetica"), + * or `null` if the name doesn't map to any Standard 14 font. */ -function mapToStandardFont(name: string): string | null { +export function mapToStandardFont(name: string): string | null { // Remove leading slash if present const cleanName = name.startsWith("/") ? name.slice(1) : name; @@ -225,6 +357,10 @@ function mapToStandardFont(name: string): string | null { /** * Parse an existing font from the PDF's resources. + * + * For simple fonts (Type1, TrueType): parses as SimpleFont for metrics. + * For CID fonts (Type0): parses the DescendantFont's CIDFont for + * accurate metrics and glyph widths, enabling proper 2-byte encoding. */ export function parseExistingFont( name: string, @@ -233,26 +369,94 @@ export function parseExistingFont( ): ExistingFont { let ref: PdfRef | null = null; let simpleFont: SimpleFont | null = null; + let cidFont: CIDFont | null = null; + let isCIDFont = false; if (fontObj instanceof PdfRef) { ref = fontObj; - const resolved = registry.getObject(fontObj); + // Use resolve() to handle fonts stored in object streams (where + // getObject() returns null). resolve() caches the result so + // subsequent getObject() calls will also succeed. + const resolved = registry.resolve(fontObj); if (resolved instanceof PdfDict) { - // Parse as SimpleFont for accurate metrics - try { - simpleFont = parseSimpleFont(resolved, { - resolver: registry.resolve.bind(registry), - }); - } catch (err) { - console.warn(err); - // Ignore parsing errors for existing fonts + const resolver: RefResolver = r => registry.resolve(r); + + // Check if this is a CID-keyed font (Type0 with Identity encoding). + const subtype = resolved.getName("Subtype", resolver)?.value; + const encoding = resolved.getName("Encoding", resolver)?.value; + const hasDescendantFonts = resolved.has("DescendantFonts"); + + if ( + subtype === "Type0" || + hasDescendantFonts || + encoding === "Identity-H" || + encoding === "Identity-V" + ) { + isCIDFont = true; + + // Parse the DescendantFont for metrics and glyph widths. + // Type0 fonts have a DescendantFonts array with exactly one CIDFont. + try { + cidFont = parseCIDFontFromDescendants(resolved, resolver); + } catch { + // Ignore parsing errors — we can still use the font + // with approximate metrics + } + } else { + // Parse as SimpleFont for accurate metrics + try { + simpleFont = parseSimpleFont(resolved, { resolver }); + } catch { + // Ignore parsing errors for existing fonts + } } } } - return new ExistingFont(name, ref, simpleFont); + return new ExistingFont(name, ref, simpleFont, isCIDFont, cidFont); +} + +/** + * Parse the CIDFont from a Type0 font's DescendantFonts array. + */ +function parseCIDFontFromDescendants(type0Dict: PdfDict, resolver: RefResolver): CIDFont | null { + const descendants = type0Dict.getArray("DescendantFonts", resolver); + + if (!descendants || descendants.length === 0) { + return null; + } + + // Get the first (and only) descendant font + let cidFontObj = descendants.at(0); + + if (cidFontObj instanceof PdfRef) { + cidFontObj = resolver(cidFontObj) ?? undefined; + } + + if (!(cidFontObj instanceof PdfDict)) { + return null; + } + + return parseCIDFont(cidFontObj, { + resolver, + toUnicodeMap: parseToUnicodeMap(type0Dict, resolver), + }); +} + +function parseToUnicodeMap(type0Dict: PdfDict, resolver: RefResolver): ToUnicodeMap | null { + const toUnicode = type0Dict.get("ToUnicode", resolver); + + if (!(toUnicode instanceof PdfStream)) { + return null; + } + + try { + return parseToUnicode(toUnicode.getDecodedData()); + } catch { + return null; + } } /** diff --git a/src/document/forms/text-appearance.ts b/src/document/forms/text-appearance.ts index 905c3e4..ddd934e 100644 --- a/src/document/forms/text-appearance.ts +++ b/src/document/forms/text-appearance.ts @@ -8,8 +8,6 @@ */ import { ContentStreamBuilder } from "#src/content/content-stream"; -import type { Operator } from "#src/content/operators"; -import type { EmbeddedFont } from "#src/fonts/embedded-font"; import { beginMarkedContent, beginText, @@ -25,45 +23,35 @@ import { rectangle, setFont, setLeading, - setNonStrokingCMYK, - setNonStrokingGray, - setNonStrokingRGB, setStrokingGray, showText, stroke, } from "#src/helpers/operators"; -import { PdfDict } from "#src/objects/pdf-dict"; -import { PdfName } from "#src/objects/pdf-name"; import type { PdfStream } from "#src/objects/pdf-stream"; -import { PdfString } from "#src/objects/pdf-string"; -import type { ObjectRegistry } from "../object-registry"; -import type { AcroForm } from "./acro-form"; import { + type AppearanceContext, + buildFontResources, + calculateAutoFontSize, + encodeTextForFont, type ExtractedAppearanceStyle, type FontMetrics, generateBackgroundAndBorder, + getColorOperators, getFontMetrics, - MAX_FONT_SIZE, - MIN_FONT_SIZE, - mapToStandardFontName, - PADDING, + getFontResourceName, + parseDefaultAppearance, type ParsedDA, - parseDAString, + resolveAppearanceFont, } from "./appearance-utils"; import type { RgbColor, TextField } from "./fields"; -import { ExistingFont, type FormFont, isEmbeddedFont, isExistingFont } from "./form-font"; +import type { FormFont } from "./form-font"; import type { WidgetAnnotation } from "./widget-annotation"; /** * Context for text appearance generation. */ -export interface TextAppearanceContext { - acroForm: AcroForm; - registry: ObjectRegistry; - fontResourceNames: Map; - fontNameCounter: number; -} +export type TextAppearanceContext = AppearanceContext; /** * Generate appearance stream for a text field widget. @@ -86,7 +74,7 @@ export function generateTextAppearance( } // Resolve font - const font = resolveFont(ctx, field, existingStyle?.fontName); + const font = resolveAppearanceFont(ctx.acroForm, field, value, existingStyle?.fontName); const { name: fontName, counter } = getFontResourceName(ctx, font); ctx.fontNameCounter = counter; @@ -416,114 +404,6 @@ function generateCombAppearance( // Helper Functions // ───────────────────────────────────────────────────────────────────────────── -function resolveFont( - ctx: TextAppearanceContext, - field: { getFont(): FormFont | null; defaultAppearance?: string | null }, - existingFontName?: string, -): FormFont { - const fieldFont = field.getFont(); - - if (fieldFont) { - return fieldFont; - } - - const defaultFont = ctx.acroForm.getDefaultFont(); - - if (defaultFont) { - return defaultFont; - } - - if (existingFontName) { - const existingFont = ctx.acroForm.getExistingFont(existingFontName); - - if (existingFont) { - return existingFont; - } - } - - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - const daInfo = parseDAString(da); - const existingFont = ctx.acroForm.getExistingFont(daInfo.fontName); - - if (existingFont) { - return existingFont; - } - - return new ExistingFont("Helv", null, null); -} - -function getFontResourceName( - ctx: TextAppearanceContext, - font: FormFont, -): { name: string; counter: number } { - if (ctx.fontResourceNames.has(font)) { - return { - // biome-ignore lint/style/noNonNullAssertion: fontResourceNames is guaranteed to have a value - name: ctx.fontResourceNames.get(font)!, - counter: ctx.fontNameCounter, - }; - } - - let name: string; - - if (isExistingFont(font)) { - name = font.name.startsWith("/") ? font.name : `/${font.name}`; - } else { - ctx.fontNameCounter++; - name = `/F${ctx.fontNameCounter}`; - } - - ctx.fontResourceNames.set(font, name); - - return { - name, - counter: ctx.fontNameCounter, - }; -} - -function parseDefaultAppearance( - ctx: TextAppearanceContext, - field: { defaultAppearance?: string | null }, -): ParsedDA { - const da = - "defaultAppearance" in field - ? (field.defaultAppearance ?? ctx.acroForm.defaultAppearance) - : ctx.acroForm.defaultAppearance; - - return parseDAString(da); -} - -function calculateAutoFontSize( - text: string, - width: number, - height: number, - font: FormFont, - isMultiline: boolean, -): number { - const contentWidth = width - 2 * PADDING; - const contentHeight = height - 2 * PADDING; - - if (isMultiline) { - return Math.max(MIN_FONT_SIZE, Math.min(12, contentHeight * 0.15)); - } - - const heightBased = contentHeight * 0.7; - - let fontSize = heightBased; - const metrics = getFontMetrics(font); - let textWidth = metrics.getTextWidth(text || "X", fontSize); - - while (textWidth > contentWidth && fontSize > MIN_FONT_SIZE) { - fontSize -= 1; - textWidth = metrics.getTextWidth(text || "X", fontSize); - } - - return Math.max(MIN_FONT_SIZE, Math.min(fontSize, MAX_FONT_SIZE)); -} - function calculateXPosition( textWidth: number, contentWidth: number, @@ -602,63 +482,6 @@ function wrapText( return lines; } -function encodeTextForFont(text: string, font: FormFont): PdfString { - if (isEmbeddedFont(font)) { - font.markUsedInForm(); - - if (!font.canEncode(text)) { - const unencodable = font.getUnencodableCharacters(text); - const firstBad = unencodable[0]; - - throw new Error( - `Font cannot encode character '${firstBad}' (U+${firstBad.codePointAt(0)?.toString(16).toUpperCase().padStart(4, "0")})`, - ); - } - - const gids = font.encodeTextToGids(text); - const bytes = new Uint8Array(gids.length * 2); - - for (let i = 0; i < gids.length; i++) { - bytes[i * 2] = (gids[i] >> 8) & 0xff; - bytes[i * 2 + 1] = gids[i] & 0xff; - } - - return PdfString.fromBytes(bytes); - } - - return PdfString.fromString(text); -} - -function getColorOperators(textColor: RgbColor | null, daInfo: ParsedDA): Operator[] { - if (textColor) { - return [setNonStrokingRGB(textColor.r, textColor.g, textColor.b)]; - } - - switch (daInfo.colorOp) { - case "g": - return [setNonStrokingGray(daInfo.colorArgs[0] ?? 0)]; - case "rg": - return [ - setNonStrokingRGB( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - ), - ]; - case "k": - return [ - setNonStrokingCMYK( - daInfo.colorArgs[0] ?? 0, - daInfo.colorArgs[1] ?? 0, - daInfo.colorArgs[2] ?? 0, - daInfo.colorArgs[3] ?? 0, - ), - ]; - default: - return [setNonStrokingGray(0)]; - } -} - function buildFormXObject( ctx: TextAppearanceContext, content: ContentStreamBuilder, @@ -668,7 +491,7 @@ function buildFormXObject( fontName: string, widget: WidgetAnnotation, ): PdfStream { - const resources = buildResources(ctx, font, fontName); + const resources = buildFontResources(font, fontName); const mk = widget.getAppearanceCharacteristics(); const rotation = mk?.rotation ?? 0; @@ -678,31 +501,6 @@ function buildFormXObject( return content.toFormXObject([0, 0, width, height], resources, matrix); } -function buildResources(ctx: TextAppearanceContext, font: FormFont, fontName: string): PdfDict { - const resources = new PdfDict(); - const fonts = new PdfDict(); - - const cleanName = fontName.startsWith("/") ? fontName.slice(1) : fontName; - - if (isEmbeddedFont(font)) { - fonts.set(cleanName, font.ref); - } else if (isExistingFont(font) && font.ref) { - fonts.set(cleanName, font.ref); - } else { - const fontDict = new PdfDict(); - - fontDict.set("Type", PdfName.of("Font")); - fontDict.set("Subtype", PdfName.of("Type1")); - fontDict.set("BaseFont", PdfName.of(mapToStandardFontName(cleanName))); - - fonts.set(cleanName, fontDict); - } - - resources.set("Font", fonts); - - return resources; -} - function calculateAppearanceMatrix( width: number, height: number, diff --git a/src/fonts/cid-font.test.ts b/src/fonts/cid-font.test.ts index e5dfda9..3c8b10c 100644 --- a/src/fonts/cid-font.test.ts +++ b/src/fonts/cid-font.test.ts @@ -3,6 +3,7 @@ import { PdfNumber } from "#src/objects/pdf-number"; import { describe, expect, it } from "vitest"; import { CIDFont, CIDWidthMap, parseCIDWidths } from "./cid-font"; +import { ToUnicodeMap } from "./to-unicode"; describe("CIDWidthMap", () => { describe("individual mappings", () => { @@ -185,4 +186,15 @@ describe("CIDFont", () => { expect(font.cidSystemInfo.ordering).toBe("Japan1"); expect(font.cidSystemInfo.supplement).toBe(6); }); + + it("should resolve char codes through ToUnicode when glyph lookup is unavailable", () => { + const font = new CIDFont({ + subtype: "CIDFontType2", + baseFontName: "TestFont", + cidToGidMap: new Uint16Array([0, 2]), + toUnicodeMap: new ToUnicodeMap(new Map([[1, "A"]])), + }); + + expect(font.tryGetCharCodeForUnicode(0x41)).toBe(1); + }); }); diff --git a/src/fonts/cid-font.ts b/src/fonts/cid-font.ts index a4f598b..02ade76 100644 --- a/src/fonts/cid-font.ts +++ b/src/fonts/cid-font.ts @@ -27,7 +27,9 @@ import { PdfStream } from "#src/objects/pdf-stream.ts"; import { type EmbeddedParserOptions, parseEmbeddedProgram } from "./embedded-parser"; import { FontDescriptor } from "./font-descriptor"; +import { isCFFCIDFontProgram } from "./font-program/cff-cid.ts"; import type { FontProgram } from "./font-program/index.ts"; +import type { ToUnicodeMap } from "./to-unicode"; export type CIDFontSubtype = "CIDFontType0" | "CIDFontType2"; @@ -69,9 +71,18 @@ export class CIDFont { /** CID to GID mapping (null = Identity, Uint16Array = explicit map) */ private readonly cidToGidMap: "Identity" | Uint16Array | null; + /** Inverse CID to GID map: GID → CID (built lazily for stream-based maps) */ + private gidToCidMap: Map | null = null; + /** Embedded font program (if available) */ private readonly embeddedProgram: FontProgram | null; + /** ToUnicode map from the parent Type0 font, if available */ + private readonly toUnicodeMap: ToUnicodeMap | null; + + /** Reverse ToUnicode map: Unicode code point -> character code */ + private unicodeToCharCodeMap: Map | null = null; + constructor(options: { subtype: CIDFontSubtype; baseFontName: string; @@ -81,6 +92,7 @@ export class CIDFont { widths?: CIDWidthMap; cidToGidMap?: "Identity" | Uint16Array | null; embeddedProgram?: FontProgram | null; + toUnicodeMap?: ToUnicodeMap | null; }) { this.subtype = options.subtype; this.baseFontName = options.baseFontName; @@ -94,6 +106,7 @@ export class CIDFont { this.widths = options.widths ?? new CIDWidthMap(); this.cidToGidMap = options.cidToGidMap ?? "Identity"; this.embeddedProgram = options.embeddedProgram ?? null; + this.toUnicodeMap = options.toUnicodeMap ?? null; } /** @@ -103,6 +116,29 @@ export class CIDFont { return this.embeddedProgram !== null; } + /** + * Check if the embedded font program has renderable glyph outlines. + * + * Some PDFs embed fonts with metrics and cmap data but stripped outlines. + * These fonts are usable for text extraction but cannot render new text. + */ + get hasRenderableGlyphs(): boolean { + return this.embeddedProgram?.hasRenderableGlyphs() ?? false; + } + + /** + * Check whether the embedded program can render a glyph for the given CID. + */ + hasRenderableGlyphForCID(cid: number): boolean { + if (!this.embeddedProgram) { + return false; + } + + const gid = this.getGid(cid); + + return this.embeddedProgram.hasRenderableGlyph(gid); + } + /** * Get the embedded font program, if available. */ @@ -141,12 +177,113 @@ export class CIDFont { * Used when accessing embedded font data. */ getGid(cid: number): number { + if (this.embeddedProgram && isCFFCIDFontProgram(this.embeddedProgram)) { + return this.embeddedProgram.getGlyphIdForCID(cid); + } + if (this.cidToGidMap === "Identity" || this.cidToGidMap === null) { return cid; } return this.cidToGidMap[cid] ?? 0; } + + /** + * Whether the CIDToGIDMap is Identity (or absent, which defaults to Identity). + */ + get isIdentityCidToGid(): boolean { + return this.cidToGidMap === "Identity" || this.cidToGidMap === null; + } + + /** + * Get the character code (= CID for Identity-H) to write in a PDF string + * for a given Unicode code point. + * + * The encoding pipeline for Identity-H is: + * character code (in PDF string) = CID (because Identity-H maps 1:1) + * CID → GID (via CIDToGIDMap) + * GID → glyph in font file + * + * We need to find the character code such that after the CIDToGIDMap + * transformation, we get the GID corresponding to the desired Unicode + * character in the embedded font program. + * + * For Identity CIDToGIDMap: charCode = CID = GID = fontProgram.getGlyphId(unicode) + * For stream CIDToGIDMap: charCode = CID where CIDToGIDMap[CID] = desired GID + */ + getCharCodeForUnicode(unicode: number): number { + return this.tryGetCharCodeForUnicode(unicode) ?? unicode; + } + + /** + * Try to resolve the character code (= CID for Identity-H) for a Unicode + * code point. Returns null when the mapping cannot be proven. + */ + tryGetCharCodeForUnicode(unicode: number): number | null { + const fontProgram = this.getEmbeddedProgram(); + + if (fontProgram) { + const desiredGid = fontProgram.getGlyphId(unicode); + + if (desiredGid !== 0) { + if (this.isIdentityCidToGid) { + // Identity CIDToGIDMap: CID = GID, so write GID directly. + return desiredGid; + } + + const cid = this.getCharCodeForGid(desiredGid); + + if (cid !== null) { + return cid; + } + } + } + + if (!this.toUnicodeMap) { + return null; + } + + if (!this.unicodeToCharCodeMap) { + this.unicodeToCharCodeMap = new Map(); + const unicodeToCharCodeMap = this.unicodeToCharCodeMap; + + this.toUnicodeMap.forEach((unicodeValue, charCode) => { + const chars = Array.from(unicodeValue); + + if (chars.length !== 1) { + return; + } + + const codePoint = chars[0].codePointAt(0); + + if (codePoint === undefined || unicodeToCharCodeMap.has(codePoint)) { + return; + } + + unicodeToCharCodeMap.set(codePoint, charCode); + }); + } + + return this.unicodeToCharCodeMap.get(unicode) ?? null; + } + + private getCharCodeForGid(gid: number): number | null { + if (!this.gidToCidMap) { + this.gidToCidMap = new Map(); + + if (this.cidToGidMap instanceof Uint16Array) { + for (let cid = 0; cid < this.cidToGidMap.length; cid++) { + const mappedGid = this.cidToGidMap[cid]; + + if (mappedGid !== 0 && !this.gidToCidMap.has(mappedGid)) { + this.gidToCidMap.set(mappedGid, cid); + } + } + } + } + + return this.gidToCidMap.get(gid) ?? null; + } } /** @@ -271,6 +408,7 @@ export function parseCIDFont( dict: PdfDict, options: { resolver?: RefResolver; + toUnicodeMap?: ToUnicodeMap | null; } = {}, ): CIDFont { const subtypeName = dict.getName("Subtype"); @@ -298,18 +436,8 @@ export function parseCIDFont( const defaultWidth = dict.getNumber("DW")?.value ?? 1000; // Parse /W array (can be inline or a ref) - let widths = new CIDWidthMap(); - let w = dict.get("W", options.resolver); - - let wArray: PdfArray | null = null; - - if (w instanceof PdfArray) { - wArray = w; - } - - if (wArray) { - widths = parseCIDWidths(wArray); - } + const w = dict.get("W", options.resolver); + const widths = w instanceof PdfArray ? parseCIDWidths(w) : new CIDWidthMap(); // Parse FontDescriptor and embedded font program let descriptor: FontDescriptor | null = null; @@ -359,5 +487,6 @@ export function parseCIDFont( widths, cidToGidMap, embeddedProgram, + toUnicodeMap: options.toUnicodeMap, }); } diff --git a/src/fonts/composite-font.ts b/src/fonts/composite-font.ts index 8a4a635..8d96899 100644 --- a/src/fonts/composite-font.ts +++ b/src/fonts/composite-font.ts @@ -187,7 +187,7 @@ export function parseCompositeFont( // Parse DescendantFonts (should be array with one CIDFont) // DescendantFonts can be inline array or a ref to an array let cidFont: CIDFont; - let descendants = dict.get("DescendantFonts", options.resolver); + const descendants = dict.get("DescendantFonts", options.resolver); let descendantsArray: PdfArray | null = null; if (descendants instanceof PdfArray) { @@ -198,7 +198,10 @@ export function parseCompositeFont( const firstDescendant = descendantsArray.at(0, options.resolver); if (firstDescendant instanceof PdfDict) { - cidFont = parseCIDFont(firstDescendant, options); + cidFont = parseCIDFont(firstDescendant, { + resolver: options.resolver, + toUnicodeMap: options.toUnicodeMap, + }); } else { cidFont = createDefaultCIDFont(baseFontName); } diff --git a/src/fonts/embedded-font.test.ts b/src/fonts/embedded-font.test.ts index 8537c60..36a2d1a 100644 --- a/src/fonts/embedded-font.test.ts +++ b/src/fonts/embedded-font.test.ts @@ -172,6 +172,13 @@ describe("parseFontProgram", () => { expect(program.numGlyphs).toBeGreaterThan(0); }); + it("treats OpenType CFF fonts as renderable", async () => { + const fontBytes = await loadFixture("fonts", "otf/FoglihtenNo07.otf"); + const program = parseFontProgram(fontBytes); + + expect(program.hasRenderableGlyphs()).toBe(true); + }); + it("should reject invalid data", () => { const invalidData = new Uint8Array([0, 0, 0, 0]); expect(() => parseFontProgram(invalidData)).toThrow(); diff --git a/src/fonts/font-program/base.ts b/src/fonts/font-program/base.ts index 434ebde..3c04306 100644 --- a/src/fonts/font-program/base.ts +++ b/src/fonts/font-program/base.ts @@ -72,6 +72,22 @@ export interface FontProgram { */ hasGlyph(codePoint: number): boolean; + /** + * Check if the font has renderable glyph outlines. + * + * Some PDF subsetted fonts are "crippled" — they contain glyph metrics + * and cmap data but no actual outline data (0 contours for all glyphs). + * These fonts are used only for text extraction and cannot render text. + * + * Returns true if at least some common glyphs have actual outline data. + */ + hasRenderableGlyphs(): boolean; + + /** + * Check if a specific glyph has renderable outlines or charstring data. + */ + hasRenderableGlyph(glyphId: number): boolean; + /** * Get the raw font data. */ diff --git a/src/fonts/font-program/cff-cid.ts b/src/fonts/font-program/cff-cid.ts index c09983c..28feac8 100644 --- a/src/fonts/font-program/cff-cid.ts +++ b/src/fonts/font-program/cff-cid.ts @@ -85,11 +85,15 @@ export class CFFCIDFontProgram implements FontProgram { return 80; } - getGlyphId(_codePoint: number): number { - // CID fonts map CID to GID via charset - // Without a CMap, we can't map Unicode to CID - // Return 0 (notdef) - caller should use CMap - return 0; + getGlyphId(codePoint: number): number { + // CID fonts usually need the parent Type0 font's CMap/ToUnicode to map + // Unicode to character codes. As a fallback, support cases where the CID + // itself is the Unicode BMP code point. + if (codePoint < 0 || codePoint > 0xffff) { + return 0; + } + + return this.getGlyphIdForCID(codePoint); } /** @@ -111,7 +115,24 @@ export class CFFCIDFontProgram implements FontProgram { } hasGlyph(_codePoint: number): boolean { - // CID fonts need CMap for proper lookup + return this.getGlyphId(_codePoint) !== 0; + } + + hasRenderableGlyph(glyphId: number): boolean { + return ( + glyphId > 0 && + glyphId < this.font.charStrings.length && + this.font.charStrings[glyphId].length > 0 + ); + } + + hasRenderableGlyphs(): boolean { + for (let glyphId = 1; glyphId < this.font.charStrings.length; glyphId++) { + if (this.hasRenderableGlyph(glyphId)) { + return true; + } + } + return false; } diff --git a/src/fonts/font-program/cff.ts b/src/fonts/font-program/cff.ts index 0256d08..352e4aa 100644 --- a/src/fonts/font-program/cff.ts +++ b/src/fonts/font-program/cff.ts @@ -125,6 +125,24 @@ export class CFFType1FontProgram implements FontProgram { return this.getGlyphId(codePoint) !== 0; } + hasRenderableGlyph(glyphId: number): boolean { + return ( + glyphId > 0 && + glyphId < this.font.charStrings.length && + this.font.charStrings[glyphId].length > 0 + ); + } + + hasRenderableGlyphs(): boolean { + for (let glyphId = 1; glyphId < this.font.charStrings.length; glyphId++) { + if (this.hasRenderableGlyph(glyphId)) { + return true; + } + } + + return false; + } + getData(): Uint8Array { return this.data; } diff --git a/src/fonts/font-program/truetype.ts b/src/fonts/font-program/truetype.ts index 36db482..b81c316 100644 --- a/src/fonts/font-program/truetype.ts +++ b/src/fonts/font-program/truetype.ts @@ -2,6 +2,7 @@ * TrueType/OpenType font program wrapper. */ +import { parseCFF } from "#src/fontbox/cff/parser.ts"; import type { TrueTypeFont } from "#src/fontbox/ttf/truetype-font.ts"; import type { FontProgram } from "./base.ts"; @@ -86,6 +87,56 @@ export class TrueTypeFontProgram implements FontProgram { return this.font.hasGlyph(codePoint); } + hasRenderableGlyph(glyphId: number): boolean { + if (glyphId <= 0) { + return false; + } + + if (!this.font.glyf) { + const cffData = this.font.getTableBytes("CFF ") ?? this.font.getTableBytes("CFF2"); + + if (!cffData) { + return false; + } + + try { + const [cffFont] = parseCFF(cffData); + + return ( + !!cffFont && + glyphId < cffFont.charStrings.length && + cffFont.charStrings[glyphId].length > 0 + ); + } catch { + return false; + } + } + + const glyph = this.font.glyf?.getGlyph(glyphId); + + if (!glyph) { + return false; + } + + const { description } = glyph; + + if (description.isComposite) { + return (description.components?.length ?? 0) > 0; + } + + return description.numberOfContours !== 0; + } + + hasRenderableGlyphs(): boolean { + for (let glyphId = 1; glyphId < this.font.numGlyphs; glyphId++) { + if (this.hasRenderableGlyph(glyphId)) { + return true; + } + } + + return false; + } + getData(): Uint8Array { return this.data; } diff --git a/src/fonts/font-program/type1.ts b/src/fonts/font-program/type1.ts index dec2c50..a3d4715 100644 --- a/src/fonts/font-program/type1.ts +++ b/src/fonts/font-program/type1.ts @@ -147,6 +147,33 @@ export class Type1FontProgram implements FontProgram { return this.font.hasGlyph(name); } + hasRenderableGlyph(glyphId: number): boolean { + if (glyphId <= 0) { + return false; + } + + const glyphNames = this.font.getGlyphNames(); + const glyphName = glyphNames[glyphId]; + + if (!glyphName) { + return false; + } + + return (this.font.charstrings.get(glyphName)?.length ?? 0) > 0; + } + + hasRenderableGlyphs(): boolean { + const glyphNames = this.font.getGlyphNames(); + + for (let glyphId = 1; glyphId < glyphNames.length; glyphId++) { + if (this.hasRenderableGlyph(glyphId)) { + return true; + } + } + + return false; + } + getData(): Uint8Array { return this.data; } diff --git a/src/fonts/simple-font.ts b/src/fonts/simple-font.ts index 9bd094c..fe8981f 100644 --- a/src/fonts/simple-font.ts +++ b/src/fonts/simple-font.ts @@ -278,7 +278,7 @@ export function parseSimpleFont( const lastChar = dict.getNumber("LastChar", options.resolver)?.value ?? 255; // Parse widths array (can be inline or a ref) - let widthsArray = dict.getArray("Widths", options.resolver); + const widthsArray = dict.getArray("Widths", options.resolver); const widths: number[] = []; diff --git a/src/test-utils.ts b/src/test-utils.ts index 0feaf70..9f420b6 100644 --- a/src/test-utils.ts +++ b/src/test-utils.ts @@ -29,6 +29,7 @@ export type FixtureCategory = | "fonts" | "forms" | "images" + | "issues" | "layers" | "malformed" | "scenarios"