From 8a9bad371d7ab7c57b3bbf8dc6fd675d85cec256 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 4 Jun 2026 19:28:55 +0200 Subject: [PATCH] fix(regex): keep escaped hyphen \- literal inside a character class (#4425) An escaped hyphen `\-` inside a JS character class is always a literal hyphen, but `push_escaped_literal` emitted a bare `-`. The Rust `regex` crate then read a `-` flanked by members as a range operator, so `[a\- ]` translated to the invalid range `[a- ]` and construction failed with "invalid pattern". When inside a class, preserve the escape so the hyphen stays a literal regardless of position. This unblocks a native build of `marked`, whose GFM table-delimiter regex ` {0,3}\|?(?:[:\- ]*\|)+[\:\- ]*\n` was built at module-init and crashed the binary on boot. --- crates/perry-runtime/src/regex.rs | 20 ++++++++++++++++++++ crates/perry-runtime/src/regex/grammar.rs | 13 ++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/crates/perry-runtime/src/regex.rs b/crates/perry-runtime/src/regex.rs index ba328e12f3..7209705e6b 100644 --- a/crates/perry-runtime/src/regex.rs +++ b/crates/perry-runtime/src/regex.rs @@ -1885,4 +1885,24 @@ mod tests { assert_eq!(string_as_str(result), "hell0 w0rld"); } + + #[test] + fn escaped_hyphen_in_class_stays_literal() { + // #4425: `\-` inside a character class is always a literal hyphen. The + // Rust `regex` crate reads a bare `-` flanked by members as a range + // operator, so the escape must be preserved or `[a\- ]` translates to + // the invalid range `[a- ]`. + assert_eq!(js_regex_to_rust(r"[a\- ]"), r"[a\- ]"); + assert_eq!(js_regex_to_rust(r"[:\- ]"), r"[:\- ]"); + assert_eq!(js_regex_to_rust(r"[\-]"), r"[\-]"); + // Outside a class a hyphen carries no range meaning, so it stays bare. + assert_eq!(js_regex_to_rust(r"a\-b"), "a-b"); + + // The patterns that crashed `marked` at module-init must now compile. + for pat in [r"[a\- ]", r"[:\- ]", r" {0,3}\|?(?:[:\- ]*\|)+[\:\- ]*\n"] { + let flags = make_string(""); + let re = js_regexp_new(make_string(pat), flags); + assert!(!re.is_null(), "pattern failed to construct: {pat}"); + } + } } diff --git a/crates/perry-runtime/src/regex/grammar.rs b/crates/perry-runtime/src/regex/grammar.rs index 83300d52e0..2b0283a8c5 100644 --- a/crates/perry-runtime/src/regex/grammar.rs +++ b/crates/perry-runtime/src/regex/grammar.rs @@ -257,7 +257,18 @@ pub(super) fn js_regex_to_rust(pattern: &str) -> String { } } ch if is_regex_identity_escape(ch) => { - push_escaped_literal(&mut result, ch); + // Inside a character class an escaped hyphen `\-` is always a + // literal hyphen, but the Rust `regex` crate reads a bare `-` + // flanked by members as a range operator (so `[a\- ]` would + // become the invalid range `[a- ]`). Keep the escape so it + // stays a literal regardless of position. `marked`'s GFM + // table-delimiter regex `[:\- ]` relies on this. + if in_class && ch == '-' { + result.push('\\'); + result.push('-'); + } else { + push_escaped_literal(&mut result, ch); + } i += 2; } // Pass through all other backslash sequences as-is. (An escaped