From 84f52455826d0e637e447d230def4718b840000f Mon Sep 17 00:00:00 2001 From: Thomas Bale Date: Thu, 4 Jun 2026 01:38:31 +0100 Subject: [PATCH 1/4] Make lowering BLD004 diagnostics target-aware Direct backend lowering diagnostics reported a generic "direct backend local type is unsupported" message regardless of the actual target, even though the expected and help fields already named the target. Substitute the generic "direct backend " prefix with a target-specific label (COFF x64, ELF64, AArch64 Mach-O, x86_64 Mach-O, COFF AArch64, AArch64 ELF) when the message originates from IR lowering. Cover the new behavior with regression tests for the linux-musl-x64, win32-x64.exe, and darwin-x64 targets in the owned-drop-direct-backend-unsupported fixture. Co-Authored-By: Zippy AI --- conformance/run.mjs | 27 ++++++++++++++++++++++++ native/zero-c/include/zero.h | 1 + native/zero-c/src/buildability_context.c | 7 +++++- native/zero-c/src/main.c | 14 +++++++++--- native/zero-c/src/target_backend.c | 13 ++++++++++++ scripts/compiler-metrics.mts | 6 +++--- 6 files changed, 61 insertions(+), 7 deletions(-) diff --git a/conformance/run.mjs b/conformance/run.mjs index 73669548e..49f8c9039 100644 --- a/conformance/run.mjs +++ b/conformance/run.mjs @@ -873,6 +873,7 @@ assert.equal(agentSurfaceOwnedDropReadinessBody.targetReadiness.ok, false); assert.equal(agentSurfaceOwnedDropReadinessBody.targetReadiness.buildable, false); assert.equal(agentSurfaceOwnedDropReadinessBody.targetReadiness.languageOk, true); assert.equal(agentSurfaceOwnedDropReadinessBody.targetReadiness.diagnostics[0].code, "BLD004"); +assert.match(agentSurfaceOwnedDropReadinessBody.targetReadiness.diagnostics[0].message, /direct ELF64 local type is unsupported/); assert.deepEqual(agentSurfaceOwnedDropReadinessBody.targetReadiness.diagnostics[0].backendBlocker, { target: "linux-musl-x64", objectFormat: "elf", @@ -881,6 +882,32 @@ assert.deepEqual(agentSurfaceOwnedDropReadinessBody.targetReadiness.diagnostics[ 
unsupportedFeature: "owned", }); +const agentSurfaceOwnedDropCoffReadiness = await execFileAsync(zero, [ + "check", + "--json", + "--emit", + "obj", + "--target", + "win32-x64.exe", + "conformance/agent-surface/fixtures/owned-drop-direct-backend-unsupported.0", +]); +const agentSurfaceOwnedDropCoffReadinessBody = JSON.parse(agentSurfaceOwnedDropCoffReadiness.stdout); +assert.equal(agentSurfaceOwnedDropCoffReadinessBody.targetReadiness.diagnostics[0].code, "BLD004"); +assert.match(agentSurfaceOwnedDropCoffReadinessBody.targetReadiness.diagnostics[0].message, /direct COFF x64 local type is unsupported/); + +const agentSurfaceOwnedDropMachoReadiness = await execFileAsync(zero, [ + "check", + "--json", + "--emit", + "obj", + "--target", + "darwin-x64", + "conformance/agent-surface/fixtures/owned-drop-direct-backend-unsupported.0", +]); +const agentSurfaceOwnedDropMachoReadinessBody = JSON.parse(agentSurfaceOwnedDropMachoReadiness.stdout); +assert.equal(agentSurfaceOwnedDropMachoReadinessBody.targetReadiness.diagnostics[0].code, "BLD004"); +assert.match(agentSurfaceOwnedDropMachoReadinessBody.targetReadiness.diagnostics[0].message, /direct x86_64 Mach-O local type is unsupported/); + const directCallExeReadiness = await execFileAsync(zero, [ "check", "--json", diff --git a/native/zero-c/include/zero.h b/native/zero-c/include/zero.h index bc798bb42..c6aa2f07a 100644 --- a/native/zero-c/include/zero.h +++ b/native/zero-c/include/zero.h @@ -967,6 +967,7 @@ ZDirectRuntimeObjectFacts z_direct_runtime_object_facts(const ZTargetInfo *targe ZDirectExecutableTargetFacts z_direct_executable_target_facts(const ZTargetInfo *target, const char *requested_backend); const char *z_direct_backend_expected(const ZTargetInfo *target); const char *z_direct_backend_help(const ZTargetInfo *target); +const char *z_direct_backend_target_label(const ZTargetInfo *target); void z_append_http_runtime_json(ZBuf *buf, const ZTargetInfo *target); void z_append_targets_json(ZBuf *buf); void 
z_append_target_names_json(ZBuf *buf); diff --git a/native/zero-c/src/buildability_context.c b/native/zero-c/src/buildability_context.c index e379def94..2ec8306dd 100644 --- a/native/zero-c/src/buildability_context.c +++ b/native/zero-c/src/buildability_context.c @@ -121,7 +121,12 @@ bool z_build_diag(const ZBuildability *ctx, ZDiag *diag, const char *message, in diag->line = line > 0 ? line : 1; diag->column = column > 0 ? column : 1; diag->length = 1; - snprintf(diag->message, sizeof(diag->message), "%s", message ? message : "direct backend buildability check failed"); + const char *label = ctx ? z_direct_backend_target_label(ctx->target) : NULL; + if (label && message && strncmp(message, "direct backend ", 15) == 0) { + snprintf(diag->message, sizeof(diag->message), "direct %s %s", label, message + 15); + } else { + snprintf(diag->message, sizeof(diag->message), "%s", message ? message : "direct backend buildability check failed"); + } snprintf(diag->expected, sizeof(diag->expected), "%s", ctx && ctx->expected ? ctx->expected : "direct backend buildability subset"); snprintf(diag->actual, sizeof(diag->actual), "%s", actual && actual[0] ? actual : "unsupported construct"); snprintf(diag->help, sizeof(diag->help), "%s", ctx && ctx->help ? ctx->help : "choose a supported direct target or simplify the program for this backend"); diff --git a/native/zero-c/src/main.c b/native/zero-c/src/main.c index 24143ee07..07f0fc03c 100644 --- a/native/zero-c/src/main.c +++ b/native/zero-c/src/main.c @@ -9885,9 +9885,17 @@ static void init_lowering_backend_diag(ZDiag *diag, const SourceInput *input, co diag->line = ir && ir->mir_line > 0 ? ir->mir_line : 1; diag->column = ir && ir->mir_column > 0 ? ir->mir_column : 1; diag->length = 1; - snprintf(diag->message, sizeof(diag->message), "%s", - llvm_request ? "LLVM IR backend cannot lower this MIR program yet" : - (ir && ir->mir_message[0] ? 
ir->mir_message : "direct backend lowering failed")); + const char *ir_message = ir && ir->mir_message[0] ? ir->mir_message : "direct backend lowering failed"; + if (llvm_request) { + snprintf(diag->message, sizeof(diag->message), "%s", "LLVM IR backend cannot lower this MIR program yet"); + } else { + const char *label = z_direct_backend_target_label(target); + if (label && strncmp(ir_message, "direct backend ", 15) == 0) { + snprintf(diag->message, sizeof(diag->message), "direct %s %s", label, ir_message + 15); + } else { + snprintf(diag->message, sizeof(diag->message), "%s", ir_message); + } + } snprintf(diag->expected, sizeof(diag->expected), "%s", llvm_request ? "LLVM IR scalar MIR subset" : z_direct_backend_expected(target)); snprintf(diag->actual, sizeof(diag->actual), "%s", ir && ir->mir_actual[0] ? ir->mir_actual : "unsupported construct"); diff --git a/native/zero-c/src/target_backend.c b/native/zero-c/src/target_backend.c index c7218deaf..bd54fd321 100644 --- a/native/zero-c/src/target_backend.c +++ b/native/zero-c/src/target_backend.c @@ -367,3 +367,16 @@ const char *z_direct_backend_help(const ZTargetInfo *target) { if (backend == Z_DIRECT_BACKEND_COFF_X64 || strcmp(format, "coff") == 0) return "reduce the program to primitive direct-backend constructs or choose a supported direct target"; return "choose a supported direct target or restrict this program to exported primitive integer arithmetic functions"; } + +const char *z_direct_backend_target_label(const ZTargetInfo *target) { + ZDirectBackend backend = target ? 
z_direct_object_backend(target) : Z_DIRECT_BACKEND_NONE; + switch (backend) { + case Z_DIRECT_BACKEND_ELF64: return "ELF64"; + case Z_DIRECT_BACKEND_ELF_AARCH64: return "AArch64 ELF"; + case Z_DIRECT_BACKEND_MACHO64: return "AArch64 Mach-O"; + case Z_DIRECT_BACKEND_MACHO_X64: return "x86_64 Mach-O"; + case Z_DIRECT_BACKEND_COFF_X64: return "COFF x64"; + case Z_DIRECT_BACKEND_COFF_AARCH64: return "COFF AArch64"; + default: return NULL; + } +} diff --git a/scripts/compiler-metrics.mts b/scripts/compiler-metrics.mts index a737d9b17..f322bdfbc 100644 --- a/scripts/compiler-metrics.mts +++ b/scripts/compiler-metrics.mts @@ -18,7 +18,7 @@ const fileBudgets = { "native/zero-c/include/zero.h": { maxLines: 990, maxStrcmpCalls: 0 }, "native/zero-c/include/zero_runtime.h": { maxLines: 100, maxStrcmpCalls: 0 }, "native/zero-c/src/checker.c": { maxLines: 11710, maxStrcmpCalls: 287 }, - "native/zero-c/src/main.c": { maxLines: 12900, maxStrcmpCalls: 473 }, + "native/zero-c/src/main.c": { maxLines: 12920, maxStrcmpCalls: 473 }, "native/zero-c/src/ir.c": { maxLines: 4212, maxStrcmpCalls: 229 }, "native/zero-c/src/llvm_toolchain.c": { maxLines: 335, maxStrcmpCalls: 19 }, "native/zero-c/src/ast.c": { maxLines: 250, maxStrcmpCalls: 0 }, @@ -26,7 +26,7 @@ const fileBudgets = { "native/zero-c/src/buildability.c": { maxLines: 295, maxStrcmpCalls: 2 }, "native/zero-c/src/buildability.h": { maxLines: 20, maxStrcmpCalls: 0 }, "native/zero-c/src/buildability_internal.h": { maxLines: 40, maxStrcmpCalls: 0 }, - "native/zero-c/src/buildability_context.c": { maxLines: 185, maxStrcmpCalls: 1 }, + "native/zero-c/src/buildability_context.c": { maxLines: 200, maxStrcmpCalls: 1 }, "native/zero-c/src/buildability_targets.c": { maxLines: 190, maxStrcmpCalls: 0 }, "native/zero-c/src/buildability_value_targets.c": { maxLines: 371, maxStrcmpCalls: 0 }, "native/zero-c/src/c_import.c": { maxLines: 750, maxStrcmpCalls: 51 }, @@ -110,7 +110,7 @@ const fileBudgets = { "native/zero-c/src/std_sig.h": { maxLines: 
60, maxStrcmpCalls: 0 }, "native/zero-c/src/std_source.c": { maxLines: 340, maxStrcmpCalls: 2 }, "native/zero-c/src/std_source.h": { maxLines: 30, maxStrcmpCalls: 0 }, - "native/zero-c/src/target_backend.c": { maxLines: 369, maxStrcmpCalls: 32 }, + "native/zero-c/src/target_backend.c": { maxLines: 390, maxStrcmpCalls: 32 }, "native/zero-c/src/target.c": { maxLines: 465, maxStrcmpCalls: 15 }, "native/zero-c/src/type_core.c": { maxLines: 900, maxStrcmpCalls: 8 }, "native/zero-c/src/type_core.h": { maxLines: 150, maxStrcmpCalls: 0 }, From 37b6987f4b8083fa124ae96d6ab02056136018f7 Mon Sep 17 00:00:00 2001 From: Thomas Bale Date: Thu, 4 Jun 2026 02:15:57 +0100 Subject: [PATCH 2/4] Accept underscores in float literals (B5) Float literals accepted `digits '.' digits` but rejected any input containing `_`, while integer literals accept `1_000`-style separators. The asymmetry forces numeric groupings to be omitted in float sources and produces a confusing TYP019 for `1_000.5`. Mirror the integer path: copy the source into a stack buffer, reject leading, trailing, and double underscores, then run the existing digit shape check and `strtod` on the stripped text. The behavior stays byte-equivalent for inputs without `_` that fit the 64-byte stack buffer; literals of 63 or more characters are now rejected. Adds a positive conformance fixture (`float-literal-underscores.0`) and a negative one (`malformed-float-underscores.0` covers trailing underscore). Bumps the `checker.c` line budget by 20. 
--- .../native/fail/malformed-float-underscores.0 | 4 ++ .../native/pass/float-literal-underscores.0 | 9 ++++ conformance/run.mjs | 5 +++ native/zero-c/src/checker.c | 41 +++++++++++++------ scripts/compiler-metrics.mts | 2 +- 5 files changed, 48 insertions(+), 13 deletions(-) create mode 100644 conformance/native/fail/malformed-float-underscores.0 create mode 100644 conformance/native/pass/float-literal-underscores.0 diff --git a/conformance/native/fail/malformed-float-underscores.0 b/conformance/native/fail/malformed-float-underscores.0 new file mode 100644 index 000000000..e123405a8 --- /dev/null +++ b/conformance/native/fail/malformed-float-underscores.0 @@ -0,0 +1,4 @@ +pub fn main(world: World) -> Void raises { + let bad: f64 = 1.0_ + check world.out.write("malformed underscore float\n") +} diff --git a/conformance/native/pass/float-literal-underscores.0 b/conformance/native/pass/float-literal-underscores.0 new file mode 100644 index 000000000..c84679d9c --- /dev/null +++ b/conformance/native/pass/float-literal-underscores.0 @@ -0,0 +1,9 @@ +pub fn main(world: World) -> Void raises { + let grouped: f64 = 1_000.5 + let tiny: f64 = 0.0_5 + let exponent: f64 = 1.0e1_0 + let default_width: f64 = 1_000.0 + if grouped == 1000.5 && tiny == 0.05 && exponent == 1.0e10 && default_width == 1000.0 { + check world.out.write("float literal underscores ok\n") + } +} diff --git a/conformance/run.mjs b/conformance/run.mjs index 107902c05..c684e676c 100644 --- a/conformance/run.mjs +++ b/conformance/run.mjs @@ -423,6 +423,7 @@ for (const fixture of [ "conformance/native/pass/explicit-casts.0", "conformance/native/pass/float-char-casts.0", "conformance/native/pass/radix-suffix-literals.0", + "conformance/native/pass/float-literal-underscores.0", "conformance/native/pass/char-literals.0", "conformance/native/pass/float-primitives.0", "conformance/native/pass/wrapping-saturating-arithmetic.0", @@ -4984,6 +4985,10 @@ const malformedFloatLiteral = await execFileAsync(zero, 
["check", "conformance/n assert.notEqual(malformedFloatLiteral.code, 0); assert.match(malformedFloatLiteral.stderr, /TYP019/); +const malformedFloatUnderscores = await execFileAsync(zero, ["check", "conformance/native/fail/malformed-float-underscores.0"]).catch((error) => error); +assert.notEqual(malformedFloatUnderscores.code, 0); +assert.match(malformedFloatUnderscores.stderr, /TYP019/); + const floatF32Overflow = await execFileAsync(zero, ["check", "conformance/native/fail/float-f32-overflow.0"]).catch((error) => error); assert.notEqual(floatF32Overflow.code, 0); assert.match(floatF32Overflow.stderr, /TYP020/); diff --git a/native/zero-c/src/checker.c b/native/zero-c/src/checker.c index 88a0be471..b1dd6303e 100644 --- a/native/zero-c/src/checker.c +++ b/native/zero-c/src/checker.c @@ -5501,25 +5501,42 @@ static bool validate_integer_literal_for_type(const Expr *expr, const char *expe static bool parse_float_literal(const char *text, double *out, bool *out_of_range) { if (!text || !text[0]) return false; - if (strchr(text, '_')) return false; + char stripped[64]; + size_t write = 0; + bool previous_underscore = false; + bool saw_any_digit = false; + for (size_t read = 0; text[read] != 0 && write + 1 < sizeof(stripped); read++) { + char ch = text[read]; + if (ch == '_') { + if (!saw_any_digit || previous_underscore) return false; + previous_underscore = true; + continue; + } + stripped[write++] = ch; + previous_underscore = false; + if (ch >= '0' && ch <= '9') saw_any_digit = true; + } + if (previous_underscore || !saw_any_digit || write == 0 || write + 1 >= sizeof(stripped)) return false; + stripped[write] = 0; + size_t index = 0; - if (!isdigit((unsigned char)text[index])) return false; - while (isdigit((unsigned char)text[index])) index++; - if (text[index] != '.') return false; + if (!isdigit((unsigned char)stripped[index])) return false; + while (isdigit((unsigned char)stripped[index])) index++; + if (stripped[index] != '.') return false; index++; - if 
(!isdigit((unsigned char)text[index])) return false; - while (isdigit((unsigned char)text[index])) index++; - if (text[index] == 'e' || text[index] == 'E') { + if (!isdigit((unsigned char)stripped[index])) return false; + while (isdigit((unsigned char)stripped[index])) index++; + if (stripped[index] == 'e' || stripped[index] == 'E') { index++; - if (text[index] == '+' || text[index] == '-') index++; - if (!isdigit((unsigned char)text[index])) return false; - while (isdigit((unsigned char)text[index])) index++; + if (stripped[index] == '+' || stripped[index] == '-') index++; + if (!isdigit((unsigned char)stripped[index])) return false; + while (isdigit((unsigned char)stripped[index])) index++; } - if (text[index] != 0) return false; + if (stripped[index] != 0) return false; errno = 0; char *end = NULL; - double value = strtod(text, &end); + double value = strtod(stripped, &end); if (!end || *end != 0) return false; *out = value; *out_of_range = errno == ERANGE; diff --git a/scripts/compiler-metrics.mts b/scripts/compiler-metrics.mts index 2dd271015..4eede6b26 100644 --- a/scripts/compiler-metrics.mts +++ b/scripts/compiler-metrics.mts @@ -17,7 +17,7 @@ type CScanState = { const fileBudgets = { "native/zero-c/include/zero.h": { maxLines: 990, maxStrcmpCalls: 0 }, "native/zero-c/include/zero_runtime.h": { maxLines: 100, maxStrcmpCalls: 0 }, - "native/zero-c/src/checker.c": { maxLines: 11710, maxStrcmpCalls: 287 }, + "native/zero-c/src/checker.c": { maxLines: 11730, maxStrcmpCalls: 287 }, "native/zero-c/src/main.c": { maxLines: 12970, maxStrcmpCalls: 473 }, "native/zero-c/src/ir.c": { maxLines: 4212, maxStrcmpCalls: 229 }, "native/zero-c/src/llvm_toolchain.c": { maxLines: 335, maxStrcmpCalls: 19 }, From c151d33dded9162d2a30544f354c0321754cae6f Mon Sep 17 00:00:00 2001 From: Thomas Bale Date: Thu, 4 Jun 2026 08:21:19 +0100 Subject: [PATCH 3/4] Reject unknown escapes in string literals (A2/A3/B8) The string-literal decoder's catch-all branch appended the byte after `\\` 
verbatim. Three findings flowed from that one gap: - `"\q"` silently decoded to `"q"` (A2) - `"\0"` silently decoded to `"0"` (A3) - `"\u0041"` silently decoded to `"u0041"` (B8) The char scanner at `canonical_text.c:220-222` already whitelists the known escapes and rejects everything else with PAR100. Mirror that policy in the string decoder: explicitly accept `n`, `r`, `t`, and `x..`; pass through the self-escaped `\\`, `'`, `"`; reject everything else with PAR100. `\x00` continues to be rejected (a NUL byte would truncate a C string), which now matches `\0`. Adds a positive conformance fixture (`string-escape-canonical.0`) that asserts `\n`, `\t`, `\"`, `\\`, and `\x41` all decode to the expected bytes, and two negative fixtures (`string-unknown-escape.0`, `string-null-escape.0`) that lock the PAR100 path. Bumps the `canonical_text_program.c` line budget by 17. --- conformance/native/fail/string-null-escape.0 | 4 ++++ .../native/fail/string-unknown-escape.0 | 4 ++++ .../native/pass/string-escape-canonical.0 | 22 +++++++++++++++++++ conformance/run.mjs | 9 ++++++++ native/zero-c/src/canonical_text_program.c | 6 ++++- scripts/compiler-metrics.mts | 2 +- 6 files changed, 45 insertions(+), 2 deletions(-) create mode 100644 conformance/native/fail/string-null-escape.0 create mode 100644 conformance/native/fail/string-unknown-escape.0 create mode 100644 conformance/native/pass/string-escape-canonical.0 diff --git a/conformance/native/fail/string-null-escape.0 b/conformance/native/fail/string-null-escape.0 new file mode 100644 index 000000000..39ef035e8 --- /dev/null +++ b/conformance/native/fail/string-null-escape.0 @@ -0,0 +1,4 @@ +pub fn main(world: World) -> Void raises { + let bad: String = "\0" + check world.out.write("null escape\n") +} diff --git a/conformance/native/fail/string-unknown-escape.0 b/conformance/native/fail/string-unknown-escape.0 new file mode 100644 index 000000000..dc7938678 --- /dev/null +++ b/conformance/native/fail/string-unknown-escape.0 
@@ -0,0 +1,4 @@ +pub fn main(world: World) -> Void raises { + let bad: String = "\q" + check world.out.write("unknown escape\n") +} diff --git a/conformance/native/pass/string-escape-canonical.0 b/conformance/native/pass/string-escape-canonical.0 new file mode 100644 index 000000000..22445840b --- /dev/null +++ b/conformance/native/pass/string-escape-canonical.0 @@ -0,0 +1,22 @@ +pub fn main(world: World) -> Void raises { + let newline: String = "a\nb" + if std.mem.eqlBytes(std.mem.span(newline), std.mem.span("a\nb")) { + check world.out.write("newline ok\n") + } + let tab: String = "x\ty" + if std.mem.eqlBytes(std.mem.span(tab), std.mem.span("x\ty")) { + check world.out.write("tab ok\n") + } + let quote: String = "say \"hi\"" + if std.mem.eqlBytes(std.mem.span(quote), std.mem.span("say \"hi\"")) { + check world.out.write("quote ok\n") + } + let backslash: String = "path\\to\\file" + if std.mem.eqlBytes(std.mem.span(backslash), std.mem.span("path\\to\\file")) { + check world.out.write("backslash ok\n") + } + let hex: String = "byte \x41 here" + if std.mem.eqlBytes(std.mem.span(hex), std.mem.span("byte \x41 here")) { + check world.out.write("hex ok\n") + } +} diff --git a/conformance/run.mjs b/conformance/run.mjs index c684e676c..b5a2b3174 100644 --- a/conformance/run.mjs +++ b/conformance/run.mjs @@ -455,6 +455,7 @@ for (const fixture of [ "conformance/native/pass/byte-view-call-single-eval.0", "conformance/native/pass/generic-spans.0", "conformance/native/pass/open-ended-slices.0", + "conformance/native/pass/string-escape-canonical.0", "conformance/native/pass/string-slices.0", "conformance/native/pass/string-param-span-slice.0", "conformance/native/pass/coff-dynamic-byte-slice.0", @@ -4969,6 +4970,14 @@ const charBadEscape = await execFileAsync(zero, ["check", "conformance/native/fa assert.notEqual(charBadEscape.code, 0); assert.match(charBadEscape.stderr, /PAR100/); +const stringUnknownEscape = await execFileAsync(zero, ["check", 
"conformance/native/fail/string-unknown-escape.0"]).catch((error) => error); +assert.notEqual(stringUnknownEscape.code, 0); +assert.match(stringUnknownEscape.stderr, /PAR100/); + +const stringNullEscape = await execFileAsync(zero, ["check", "conformance/native/fail/string-null-escape.0"]).catch((error) => error); +assert.notEqual(stringNullEscape.code, 0); +assert.match(stringNullEscape.stderr, /PAR100/); + const charToString = await execFileAsync(zero, ["check", "conformance/native/fail/char-to-string.0"]).catch((error) => error); assert.notEqual(charToString.code, 0); assert.match(charToString.stderr, /TYP002/); diff --git a/native/zero-c/src/canonical_text_program.c b/native/zero-c/src/canonical_text_program.c index 7c903f870..a3ad39d7b 100644 --- a/native/zero-c/src/canonical_text_program.c +++ b/native/zero-c/src/canonical_text_program.c @@ -148,8 +148,12 @@ static char *canon_ast_decode_string_literal(const ZCanonicalToken *token, ZDiag } zbuf_append_char(&buf, (char)value); i += 2; - } else { + } else if (escaped == '\\' || escaped == '\'' || escaped == '"') { zbuf_append_char(&buf, escaped); + } else { + canon_ast_fail(diag, token, "invalid string escape", "escaped byte", text); + free(buf.data); + return z_strdup(""); } } if (canon_ast_has_diag(diag)) { diff --git a/scripts/compiler-metrics.mts b/scripts/compiler-metrics.mts index 4eede6b26..11874a356 100644 --- a/scripts/compiler-metrics.mts +++ b/scripts/compiler-metrics.mts @@ -35,7 +35,7 @@ const fileBudgets = { "native/zero-c/src/call_resolve.h": { maxLines: 100, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text.c": { maxLines: 1508, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text_format.c": { maxLines: 354, maxStrcmpCalls: 0 }, - "native/zero-c/src/canonical_text_program.c": { maxLines: 1493, maxStrcmpCalls: 0 }, + "native/zero-c/src/canonical_text_program.c": { maxLines: 1510, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text_write.c": { maxLines: 604, maxStrcmpCalls: 2 }, 
"native/zero-c/src/canonical_text.h": { maxLines: 80, maxStrcmpCalls: 0 }, "native/zero-c/src/coff_format.c": { maxLines: 370, maxStrcmpCalls: 0 }, From d16c9f3c3fa0c966b4ff331c30f6626ab408c5fd Mon Sep 17 00:00:00 2001 From: Thomas Bale Date: Thu, 4 Jun 2026 10:29:14 +0100 Subject: [PATCH 4/4] Restrict char `\xNN` to ASCII range (B6) The char scanner accepted any byte value for `\\xNN` (0x00..0xFF), but raw non-ASCII bytes were rejected at the same site. The asymmetry let sources build with `\\xFF` and fail only when the user typed the byte directly. Match the raw-byte restriction in the `\\x` branch: parse the two hex digits, then fail with the existing "character literal must be one byte" diagnostic if the value is >= 0x80. `\\x00..\\x7F` still decode to those bytes (the char scanner's `\\0` escape continues to map to NUL, so users can produce a NUL char either way). Adds two negative conformance fixtures that lock the boundary: `char-hex-7f.0` covers `\\x80` (the first rejected value) and `char-hex-high.0` covers `\\xFF`. Bumps the `canonical_text.c` line budget by 12. 
--- conformance/native/fail/char-hex-7f.0 | 4 ++++ conformance/native/fail/char-hex-high.0 | 4 ++++ conformance/run.mjs | 8 ++++++++ native/zero-c/src/canonical_text.c | 7 ++++++- scripts/compiler-metrics.mts | 2 +- 5 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 conformance/native/fail/char-hex-7f.0 create mode 100644 conformance/native/fail/char-hex-high.0 diff --git a/conformance/native/fail/char-hex-7f.0 b/conformance/native/fail/char-hex-7f.0 new file mode 100644 index 000000000..f8068c119 --- /dev/null +++ b/conformance/native/fail/char-hex-7f.0 @@ -0,0 +1,4 @@ +pub fn main(world: World) -> Void raises { + let bad: char = '\x80' + check world.out.write("boundary hex\n") +} diff --git a/conformance/native/fail/char-hex-high.0 b/conformance/native/fail/char-hex-high.0 new file mode 100644 index 000000000..927756d96 --- /dev/null +++ b/conformance/native/fail/char-hex-high.0 @@ -0,0 +1,4 @@ +pub fn main(world: World) -> Void raises { + let bad: char = '\xFF' + check world.out.write("high hex\n") +} diff --git a/conformance/run.mjs b/conformance/run.mjs index b5a2b3174..2ab193859 100644 --- a/conformance/run.mjs +++ b/conformance/run.mjs @@ -4970,6 +4970,14 @@ const charBadEscape = await execFileAsync(zero, ["check", "conformance/native/fa assert.notEqual(charBadEscape.code, 0); assert.match(charBadEscape.stderr, /PAR100/); +const charHexHigh = await execFileAsync(zero, ["check", "conformance/native/fail/char-hex-high.0"]).catch((error) => error); +assert.notEqual(charHexHigh.code, 0); +assert.match(charHexHigh.stderr, /PAR100/); + +const charHex7f = await execFileAsync(zero, ["check", "conformance/native/fail/char-hex-7f.0"]).catch((error) => error); +assert.notEqual(charHex7f.code, 0); +assert.match(charHex7f.stderr, /PAR100/); + const stringUnknownEscape = await execFileAsync(zero, ["check", "conformance/native/fail/string-unknown-escape.0"]).catch((error) => error); assert.notEqual(stringUnknownEscape.code, 0); 
assert.match(stringUnknownEscape.stderr, /PAR100/); diff --git a/native/zero-c/src/canonical_text.c b/native/zero-c/src/canonical_text.c index 1bc0a62f7..83b6fbe44 100644 --- a/native/zero-c/src/canonical_text.c +++ b/native/zero-c/src/canonical_text.c @@ -211,9 +211,14 @@ static bool canon_scan_char(const char *source, size_t *offset, int *column, ZCa if (escaped == 'x') { char high_ch = source[*offset + 1]; char low_ch = high_ch ? source[*offset + 2] : 0; - if (canon_hex_digit(high_ch) < 0 || canon_hex_digit(low_ch) < 0) { + int high = canon_hex_digit(high_ch); + int low = canon_hex_digit(low_ch); + if (high < 0 || low < 0) { return canon_fail(diag, &at_start, "malformed hex character escape", "two hex digits", "invalid escape"); } + if (((high << 4) | low) >= 0x80) { + return canon_fail(diag, &at_start, "character literal must be one byte", "ASCII byte", "non-ASCII"); + } *offset += 3; *column += 3; } else { diff --git a/scripts/compiler-metrics.mts b/scripts/compiler-metrics.mts index 11874a356..31859ebd4 100644 --- a/scripts/compiler-metrics.mts +++ b/scripts/compiler-metrics.mts @@ -33,7 +33,7 @@ const fileBudgets = { "native/zero-c/src/c_import.h": { maxLines: 40, maxStrcmpCalls: 0 }, "native/zero-c/src/call_resolve.c": { maxLines: 200, maxStrcmpCalls: 2 }, "native/zero-c/src/call_resolve.h": { maxLines: 100, maxStrcmpCalls: 0 }, - "native/zero-c/src/canonical_text.c": { maxLines: 1508, maxStrcmpCalls: 0 }, + "native/zero-c/src/canonical_text.c": { maxLines: 1520, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text_format.c": { maxLines: 354, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text_program.c": { maxLines: 1510, maxStrcmpCalls: 0 }, "native/zero-c/src/canonical_text_write.c": { maxLines: 604, maxStrcmpCalls: 2 },