diff --git a/CHANGELOG.md b/CHANGELOG.md index 610505e..ec94249 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,10 +4,16 @@ All notable changes to `@danielx/hera` will be documented in this file. ## [Unreleased] +## [0.9.8] - 2026-05-26 + ### Added - Grammar: `RuleName ::Type` sets a default return type for every untyped choice in that rule. A `::Type` on an individual choice still overrides the rule-level default (#91). +- Grammar: triple-slash regular expressions (`///...///`) support + whitespace-insensitive regex bodies with `//` comments, `${...}` dynamic + interpolation, and `${const ...}` interpolation for values that should be + captured once (#87). ## [0.9.7] - 2026-05-21 diff --git a/README.md b/README.md index efce4d4..cfecc75 100644 --- a/README.md +++ b/README.md @@ -151,6 +151,51 @@ Terminal regular expressions return the array: the matched string, followed by any matched groups (strings or `undefined` for unmatched optional groups). +**Triple-slash regular expression** (`///...///`): +A whitespace-insensitive regular expression, similar to +[Civet heregex syntax](https://civet.dev/reference#regular-expressions). +Whitespace and `//` comments inside the body are ignored, so larger +expressions can be split across lines: + +``` +Identifier + /// + [A-Za-z_] // first character + [A-Za-z0-9_]* + /// +``` + +Triple-slash regular expressions also support interpolation. `${expr}` is +dynamic: the generated parser re-evaluates `expr` when the rule runs, rebuilds +the `RegExp` when the resulting source string changes, and reuses the cached +parser until the source changes. + +``` +Close + /// ${closingPattern} /// -> $0 +``` + +Use `${const expr}` for an interpolation whose value should be captured once. +If every interpolated part is `const`, the generated parser computes the +regular expression once and then uses the same parser as a normal regular +expression. 
In expressions that mix `const` and dynamic parts, complex `const` expressions +are cached once, while identifier-only `const` expressions are emitted directly. + +``` +Open + /// ${const openingPattern} /// -> + closingPattern = $0 === "(" ? "\\)" : "\\]" + return $0 + +Close + /// ${closingPattern} /// -> $0 +``` + +Dynamic interpolation that depends on parser state needs the same care as +handlers that mutate parser state. If result caching is enabled, that state +must be part of the cache key/state, and rules that mutate it should not be +cached. + **Repetition** (`*`, `+`): `...*` means "zero or more expansions of `...`", and `...+` means one or more repetitions of `Choice`. Repetitions return an array of the matches. **Optional** (`?`): `...?` means "zero or one expansion of `...`". If `...` matches, `...?` returns that value directly. Otherwise it succeeds without consuming input and returns `undefined`. Unlike `*` and `+`, `?` does not wrap its result in an array. diff --git a/build/typed-parser-samples b/build/typed-parser-samples index 6039f3b..a985329 100755 --- a/build/typed-parser-samples +++ b/build/typed-parser-samples @@ -27,6 +27,7 @@ civet source/cli-bin.civet --types --module --libPath ./machine.js -o parsers \ source/hera.hera \ samples/code.hera \ samples/regex.hera \ + samples/heregex.hera \ samples/url.hera \ samples/coffee.hera \ samples/structural-mapping.hera \ diff --git a/package.json b/package.json index 3e70b2e..2ce321c 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@danielx/hera", - "version": "0.9.7", + "version": "0.9.8", "description": "Small and fast parsing expression grammars", "devDependencies": { "@danielx/civet": "0.11.6", diff --git a/samples/heregex.hera b/samples/heregex.hera new file mode 100644 index 0000000..0860ca4 --- /dev/null +++ b/samples/heregex.hera @@ -0,0 +1,35 @@ +# Triple-slash regexes support `${const ...}` substitutions that are evaluated +# once and `${...}` substitutions that are dynamic. 
Dynamic regexes +# re-evaluate their dynamic interpolation expressions when the rule runs and +# re-use the cached RegExp while the resulting source string is unchanged. +# +# This example mutates parser state. If result caching is enabled, state like +# `closing` must be part of the cache key/state, and rules that mutate it +# should not be cached. + +``` +const opening = "[\\(\\[]" +let closing = "\\)" +``` + +Main + Open Body Close -> $1 + $2 + $3 + +Open + # Example of constant substitution => static regex + /// ${const opening} /// -> + if ($0 === "(") closing = "\\)" + else closing = "\\]" + return $0 + +Close + # Example of dynamic substitution => dynamic regex + /// ${closing} /// -> $0 + +Body + # Example of no substitution => static regex + /// \s+ | \w+ /// -> $0 + # Example of constant substitution + dynamic substitution => dynamic regex + /// (?! ${const opening} | ${closing}) . /// -> $0 + # Same as above, but the complex `const` expression is cached once + /// (?! ${const opening + opening} | ${closing}) . 
/// -> $0 diff --git a/source/compiler.civet b/source/compiler.civet index 74d1663..9b694ef 100644 --- a/source/compiler.civet +++ b/source/compiler.civet @@ -1,6 +1,6 @@ // Compile a rules json to typescript -import type { HeraAST, HeraRules } from ./hera-types.civet +import type { HeraAST, HeraRules, RegExpExpression, RegExpPart } from ./hera-types.civet export type CompilerOptions = { filename?: string @@ -23,6 +23,7 @@ export type CompilerOptions = { strDefs: string[] := [] reDefs: string[] := [] +dynamicReDefs: RegExpPart[][] := [] /** Define a literal string terminal @@ -54,6 +55,37 @@ defineRe := (re: string) -> return id +defineDynamicRe := (parts: RegExpPart[]) => + index := dynamicReDefs.findIndex((entry) => JSON.stringify(entry) is JSON.stringify(parts)) + let id + + if index >= 0 + id = `$RD${index}` + else + id = `$RD${dynamicReDefs#}` + dynamicReDefs.push parts + + return id + +isIdentifierExpression := (expression: string): boolean => + /^[$_\p{ID_Start}][$\u200C\u200D\p{ID_Continue}]*$/u.test(expression.trim()) + +heregexToExpression := (parts: RegExpPart[], constantPrefix?: string): string => + let constIndex = 0 + `\`${parts.map((part) => + if part \`${name} /\${${id}.source}/\`)` + if defaultHandler + f =`$R$0(${f})` + f } when "/" @@ -382,7 +424,7 @@ compileRuleBodyInline := (options: CompilerOptions, name: string, fnName: string paramTypes.push `typeof $$value[${i}]` callArgs.push `$$value[${i}]` addNamedParameter getNamedVariable(rule[1][i][0]), `typeof $$value[${i}]`, `$$value[${i}]` - else if rule[0] is "R" + else if rule[0] - `const $R${i} = $R${reTupleType(types, r)}(new RegExp(${JSON.stringify(r)}, 'suy'));\n` + `const $R${i} = $R${reTupleType(types, r)}(/${r or "(?:)"}/suy);\n` + + dynamicReDefSource := dynamicReDefs.map (r, i) => + rdType := types ? " as Parser & { source?: string }" : "" + ctxType := types ? ": ParserContext" : "" + stateType := types ? ": ParseState" : "" + nonNullAssert := types ? "!" 
: "" + if r.every((part) => part & { source?: string }" : "" + return [ + `let $RD${i}${rdParserType} = (($$ctx${ctxType}, $$state${stateType}) => {\n` + ` const $$source = ${heregexToExpression(r)};\n` + ` $RD${i} = $R(new RegExp($$source, 'suy'))${rdType};\n` + ` $RD${i}.source = $$source;\n` + ` return $RD${i}($$ctx, $$state);\n` + `})${rdType};\n` + ].join("") + + constantParts := r.flatMap((part): RegExpExpression[] => + if part " + ...constantParts.map((_, index) => `const${index}?: string`) + ].join(", ") : "" + cacheType := types ? ` as Parser & { ${cacheProps} }` : "" + constInits := constantParts.map((part, index) => ` $RD${i}.const${index} ??= String(${part.expression});\n`).join("") + [ + `const $RD${i} = (($$ctx${ctxType}, $$state${stateType}) => {\n` + constInits + ` const $$source = ${heregexToExpression(r, `$RD${i}.const`)};\n` + ` if ($$source !== $RD${i}.source) {\n` + ` $RD${i}.source = $$source;\n` + ` $RD${i}.parser = $R(new RegExp($$source, 'suy'));\n` + " }\n" + ` return $RD${i}.parser${nonNullAssert}($$ctx, $$state);\n` + `})${cacheType};\n` + ].join("") genOpts: GenerateOptions := {} @@ -660,6 +741,8 @@ export function compile(rules: HeraRules, maybeOptions?: CompilerOptions): strin "\n\n" reDefSource "\n\n" + dynamicReDefSource + "\n\n" body "\n\n" tail diff --git a/source/hera-types.civet b/source/hera-types.civet index 880a637..63206e4 100644 --- a/source/hera-types.civet +++ b/source/hera-types.civet @@ -7,14 +7,16 @@ import type { Loc, ParserContext, HeraGrammar, ParserOptions } from "./machine.c export type ParserContext, HeraGrammar, ParserOptions export type Terminal = string | RegExp -export type Handler = { $loc: Loc, f: string, t?: string, inline?: boolean } +export type Handler = { $loc: Loc, f: string, t?: string, inline?: boolean } | { t: string } export type CodeBlockNode = { type: "CodeBlock", $loc: Loc, token: string } -export type TerminalOp = "L" | "R" +export type TerminalOp = "L" | "R" | "RD" export type SequenceOp = 
"S" | "/" export type PrefixOp = "&" | "!" | "$" export type SuffixOp = "+" | "*" | "?" -export type Literal = [TerminalOp, string] -export type TerminalNode = [TerminalOp, string, Handler?] +export type RegExpExpression = { expression: string, constant?: boolean } +export type RegExpPart = string | RegExpExpression +export type Literal = ["L", string] | ["R", string] | ["RD", RegExpPart[]] +export type TerminalNode = [...Literal, Handler?] export type SequenceNode = [SequenceOp, HeraAST[], Handler?] export type PrefixNode = [PrefixOp, HeraAST, Handler?] export type SuffixNode = [SuffixOp, HeraAST, Handler?] diff --git a/source/hera.hera b/source/hera.hera index 1538899..ff2438c 100644 --- a/source/hera.hera +++ b/source/hera.hera @@ -6,6 +6,24 @@ import type { Handler, CodeBlockNode, } from './hera-types' + +const regexEscapes: Record = { + "\n": "\\n", + "\r": "\\r", + "\u2028": "\\u2028", + "\u2029": "\\u2029", + "\t": "\\t", + "\v": "\\v", + "\f": "\\f", + " ": " ", +} + +// \ followed by whitespace is natural in heregexes, +// but forbidden in JS regexes with u flag, so rewrite them. +// Newlines aren't allowed in JS regexes either, so we escape them too. +function normalizeRegExpSource(source: string) { + return source.replace(/\\?([\n\r\u2028\u2029\t\v\f ])/g, (_, char: string) => regexEscapes[char]) +} ``` Grammar @@ -162,10 +180,47 @@ StringLiteral StringValue -> ["L", $1] RegExpLiteral - "/" !Space $RegExpCharacter* "/" -> ["R", $3] - $CharacterClassExpression -> ["R", $1] + TripleSlash HeregexBody:body TripleSlash -> + body = body.map((part) => typeof part === "string" ? normalizeRegExpSource(part) : part) + // Static heregexes use the normal RegExp path; only interpolated ones need RD. + if (body.every((part) => typeof part === "string")) + return ["R", body.join("")] + return ["RD", body] + "/" !Space $RegExpCharacter* "/" -> ["R", normalizeRegExpSource($3)] + $CharacterClassExpression -> ["R", normalizeRegExpSource($1)] "." 
-> ["R", $1] +HeregexBody + HeregexPart* + +HeregexPart + $CharacterClass + # `const` substitution specifies that the expression is constant, + # so only needs to be evaluated once. + "${" Space* "const" Space $HeregexSubstitutionContent*:expression "}" -> { expression, constant: true } + # Otherwise, substitution is treated as a dynamic expression, + # re-evaluated every match. + "${" $HeregexSubstitutionContent*:expression "}" -> { expression } + EscapeSequence + # JS-style // comments + /\/\/(?!\/)[^\n\r]*/ -> "" + # NOTE: CoffeeScript strips out all unescaped whitespace chars + # but Python doesn't strip out whitespace inside character classes + # or inside '(?' groups and assertions. + # We keep spaces only inside character classes, following Civet. + [\s]+ -> "" + # Escape forward slashes (that aren't part of a triple slash) + /\/(?!\/\/)/ -> "\\/" + # Don't swallow up $ which might be interpolations, + # but handle them as single characters if they're not + /[^[\/\s$\\]+|[$]/ -> $0 + +HeregexSubstitutionContent + [^{}"']+ + $( "\"" ( [^"\\]+ / EscapeSequence )* "\"" ) + $( "'" ( [^'\\]+ / EscapeSequence )* "'" ) + $( "{" HeregexSubstitutionContent* "}" ) + CharacterClassExpression CharacterClass+ @@ -192,6 +247,9 @@ Arrow Backslash "\\" +TripleSlash + "///" + OpenBrace /\{\s*/ diff --git a/source/machine.civet b/source/machine.civet index 8b48e9b..964621b 100644 --- a/source/machine.civet +++ b/source/machine.civet @@ -77,6 +77,8 @@ interface Fail { (pos: number, expectation: string): void } +type Expectation = string | (=> string) + /** * A Parser is a function that takes a string and position to check and returns * a result if it matches. @@ -88,12 +90,12 @@ export interface Parser { /** * $EXPECT sets the friendlier `expectation` name. 
*/ -export function $EXPECT(parser: Parser, expectation: string): Parser { +export function $EXPECT(parser: Parser, expectation: Expectation): Parser { return function (ctx, state) { // NOTE: we don't need to use a stack because we're only tracking failures on // string and regex leaf nodes right now. const result = parser(ctx, state); - if (!result) ctx.fail(state.pos, expectation) + if (!result) ctx.fail(state.pos, typeof expectation === "function" ? expectation() : expectation) return result } } diff --git a/source/rules.json b/source/rules.json index 1c58b33..e49bb3f 100644 --- a/source/rules.json +++ b/source/rules.json @@ -2,10 +2,10 @@ "code": [ { "type": "CodeBlock", - "token": "\nimport type {\n HeraAST,\n SequenceNode,\n NameNode,\n Handler,\n CodeBlockNode,\n} from './hera-types'\n", + "token": "\nimport type {\n HeraAST,\n SequenceNode,\n NameNode,\n Handler,\n CodeBlockNode,\n} from './hera-types'\n\nconst regexEscapes: Record = {\n \"\\n\": \"\\\\n\",\n \"\\r\": \"\\\\r\",\n \"\\u2028\": \"\\\\u2028\",\n \"\\u2029\": \"\\\\u2029\",\n \"\\t\": \"\\\\t\",\n \"\\v\": \"\\\\v\",\n \"\\f\": \"\\\\f\",\n \" \": \" \",\n}\n\n// \\ followed by whitespace is natural in heregexes,\n// but forbidden in JS regexes with s flag, so rewrite them.\n// Newlines aren't allowed in JS regexes either, so we escape them too.\nfunction normalizeRegExpSource(source: string) {\n return source.replace(/\\\\?([\\n\\r\\u2028\\u2029\\t\\v\\f ])/g, (_, char: string) => regexEscapes[char])\n}\n", "$loc": { "pos": 3, - "length": 104 + "length": 625 } } ], @@ -16,7 +16,7 @@ { "f": " const code = $1.filter(a => typeof a === \"object\" && a !== null && \"type\" in a && a.type === \"CodeBlock\")\n const rules = Object.fromEntries($1.filter(a => Array.isArray(a)))\n rules[Symbol.for(\"code\")] = code\n return rules", "$loc": { - "pos": 136, + "pos": 657, "length": 236 } } @@ -36,7 +36,7 @@ { "f": "$2", "$loc": { - "pos": 402, + "pos": 923, "length": 3 }, "inline": true @@ -54,7 +54,7 @@ 
{ "f": "$2", "$loc": { - "pos": 420, + "pos": 941, "length": 4 }, "inline": true @@ -72,7 +72,7 @@ { "f": "$2", "$loc": { - "pos": 478, + "pos": 999, "length": 4 }, "inline": true @@ -87,7 +87,7 @@ { "f": "[$1, $2]", "$loc": { - "pos": 506, + "pos": 1027, "length": 10 }, "inline": true @@ -105,7 +105,7 @@ { "f": "$2", "$loc": { - "pos": 562, + "pos": 1083, "length": 4 }, "inline": true @@ -143,7 +143,7 @@ { "f": " var r: HeraAST[] = choices.map((a) => a[1] as HeraAST)\n if (t) r = r.map((a): HeraAST => {\n if (typeof a === \"string\") return [\"S\", [a], { t }] as unknown as HeraAST\n const handler = a[2]\n if (handler) {\n if (\"t\" in handler) return a\n return [a[0], a[1], { ...handler, t }] as unknown as HeraAST\n }\n return [a[0], a[1], { t }] as unknown as HeraAST\n })\n if (r.length === 1) return r[0];\n return [\"/\", r] as HeraAST", "$loc": { - "pos": 644, + "pos": 1165, "length": 471 }, "t": " HeraAST " @@ -158,7 +158,7 @@ { "f": " if ($2 !== undefined) {\n if (!$1.push)\n $1 = [\"S\", [$1], $2]\n else\n $1.push($2)\n }\n return $1", "$loc": { - "pos": 1145, + "pos": 1666, "length": 129 } } @@ -178,7 +178,7 @@ { "f": " $2.unshift($1)\n return [\"S\", $2]", "$loc": { - "pos": 1335, + "pos": 1856, "length": 40 }, "t": " SequenceNode " @@ -196,7 +196,7 @@ { "f": " $2.unshift($1)\n return [\"/\", $2]", "$loc": { - "pos": 1425, + "pos": 1946, "length": 40 }, "t": " SequenceNode " @@ -214,7 +214,7 @@ { "f": "$2", "$loc": { - "pos": 1520, + "pos": 2041, "length": 4 }, "inline": true @@ -234,7 +234,7 @@ { "f": "$4", "$loc": { - "pos": 1573, + "pos": 2094, "length": 4 }, "inline": true @@ -252,7 +252,7 @@ { "f": "$2", "$loc": { - "pos": 1605, + "pos": 2126, "length": 4 }, "inline": true @@ -274,7 +274,7 @@ { "f": " var result = null\n if ($1) result = [$1, $2]\n else result = $2\n if ($3)\n return [{name: $3}, result] as NameNode\n return result as SequenceNode", "$loc": { - "pos": 1690, + "pos": 2211, "length": 166 }, "t": " SequenceNode | NameNode " @@ -296,7 
+296,7 @@ { "f": " if ($2) return [$2, $1]\n else return $1", "$loc": { - "pos": 1916, + "pos": 2437, "length": 48 } } @@ -320,7 +320,7 @@ { "f": "$2", "$loc": { - "pos": 2060, + "pos": 2581, "length": 4 }, "inline": true @@ -346,7 +346,7 @@ { "f": " return undefined", "$loc": { - "pos": 2123, + "pos": 2644, "length": 21 } } @@ -378,7 +378,7 @@ { "f": " if (t) handler = { ...handler, t }\n return handler as Handler", "$loc": { - "pos": 2207, + "pos": 2728, "length": 69 } } @@ -401,7 +401,7 @@ { "f": " return { t } as Handler", "$loc": { - "pos": 2448, + "pos": 2969, "length": 29 } } @@ -424,7 +424,7 @@ { "f": "$2", "$loc": { - "pos": 2683, + "pos": 3204, "length": 49 }, "inline": true @@ -438,7 +438,7 @@ { "f": " return {\n f: $1.trimEnd(),\n $loc,\n inline: true,\n }", "$loc": { - "pos": 2773, + "pos": 3294, "length": 75 }, "t": " Handler " @@ -452,7 +452,7 @@ { "f": " return {\n f: $1.join(\"\").trimEnd(),\n $loc,\n }", "$loc": { - "pos": 2911, + "pos": 3432, "length": 64 }, "t": " Handler " @@ -505,7 +505,7 @@ { "f": " return $1 + $2.join(\"\") + ($3 ?? \"\")", "$loc": { - "pos": 3458, + "pos": 3979, "length": 42 }, "t": " string " @@ -547,7 +547,7 @@ { "f": "$2", "$loc": { - "pos": 3764, + "pos": 4285, "length": 4 }, "inline": true @@ -584,7 +584,7 @@ { "f": "[\"L\", $1]", "$loc": { - "pos": 3884, + "pos": 4405, "length": 11 }, "inline": true @@ -593,6 +593,26 @@ "RegExpLiteral": [ "/", [ + [ + "S", + [ + "TripleSlash", + [ + { + "name": "body" + }, + "HeregexBody" + ], + "TripleSlash" + ], + { + "f": " body = body.map((part) => typeof part === \"string\" ? 
normalizeRegExpSource(part) : part)\n // Static heregexes use the normal RegExp path; only interpolated ones need RD.\n if (body.every((part) => typeof part === \"string\"))\n return [\"R\", body.join(\"\")]\n return [\"RD\", body]", + "$loc": { + "pos": 4476, + "length": 291 + } + } + ], [ "S", [ @@ -617,10 +637,10 @@ ] ], { - "f": "[\"R\", $3]", + "f": "[\"R\", normalizeRegExpSource($3)]", "$loc": { - "pos": 3947, - "length": 10 + "pos": 4805, + "length": 33 }, "inline": true } @@ -629,10 +649,10 @@ "$", "CharacterClassExpression", { - "f": "[\"R\", $1]", + "f": "[\"R\", normalizeRegExpSource($1)]", "$loc": { - "pos": 3988, - "length": 10 + "pos": 4869, + "length": 33 }, "inline": true } @@ -643,7 +663,7 @@ { "f": "[\"R\", $1]", "$loc": { - "pos": 4007, + "pos": 4911, "length": 11 }, "inline": true @@ -651,6 +671,230 @@ ] ] ], + "HeregexBody": [ + "*", + "HeregexPart" + ], + "HeregexPart": [ + "/", + [ + [ + "$", + "CharacterClass" + ], + [ + "S", + [ + [ + "L", + "${" + ], + [ + "*", + "Space" + ], + [ + "L", + "const" + ], + "Space", + [ + { + "name": "expression" + }, + [ + "$", + [ + "*", + "HeregexSubstitutionContent" + ] + ] + ], + [ + "L", + "}" + ] + ], + { + "f": "{ expression, constant: true }", + "$loc": { + "pos": 5163, + "length": 125 + }, + "inline": true + } + ], + [ + "S", + [ + [ + "L", + "${" + ], + [ + { + "name": "expression" + }, + [ + "$", + [ + "*", + "HeregexSubstitutionContent" + ] + ] + ], + [ + "L", + "}" + ] + ], + { + "f": "{ expression }", + "$loc": { + "pos": 5342, + "length": 15 + }, + "inline": true + } + ], + "EscapeSequence", + [ + "R", + "\\/\\/(?!\\/)[^\\n\\r]*", + { + "f": "\"\"", + "$loc": { + "pos": 5425, + "length": 246 + }, + "inline": true + } + ], + [ + "R", + "[\\s]+", + { + "f": "\"\"", + "$loc": { + "pos": 5682, + "length": 67 + }, + "inline": true + } + ], + [ + "R", + "\\/(?!\\/\\/)", + { + "f": "\"\\\\/\"", + "$loc": { + "pos": 5767, + "length": 116 + }, + "inline": true + } + ], + [ + "R", + 
"[^[\\/\\s$\\\\]+|[$]", + { + "f": "$0", + "$loc": { + "pos": 5907, + "length": 4 + }, + "inline": true + } + ] + ] + ], + "HeregexSubstitutionContent": [ + "/", + [ + [ + "R", + "[^{}\"']+" + ], + [ + "$", + [ + "S", + [ + [ + "L", + "\\\"" + ], + [ + "*", + [ + "/", + [ + [ + "R", + "[^\"\\\\]+" + ], + "EscapeSequence" + ] + ] + ], + [ + "L", + "\\\"" + ] + ] + ] + ], + [ + "$", + [ + "S", + [ + [ + "L", + "'" + ], + [ + "*", + [ + "/", + [ + [ + "R", + "[^'\\\\]+" + ], + "EscapeSequence" + ] + ] + ], + [ + "L", + "'" + ] + ] + ] + ], + [ + "$", + [ + "S", + [ + [ + "L", + "{" + ], + [ + "*", + "HeregexSubstitutionContent" + ], + [ + "L", + "}" + ] + ] + ] + ] + ] + ], "CharacterClassExpression": [ "+", "CharacterClass" @@ -721,6 +965,10 @@ "L", "\\\\" ], + "TripleSlash": [ + "L", + "///" + ], "OpenBrace": [ "R", "\\{\\s*" @@ -787,7 +1035,7 @@ { "f": "$2", "$loc": { - "pos": 5178, + "pos": 7266, "length": 4 }, "inline": true @@ -802,7 +1050,7 @@ { "f": " return {\n type: \"CodeBlock\",\n token: $0,\n $loc,\n }", "$loc": { - "pos": 5238, + "pos": 7326, "length": 73 }, "t": " CodeBlockNode " diff --git a/source/util.civet b/source/util.civet index 5b6de00..01f6db9 100644 --- a/source/util.civet +++ b/source/util.civet @@ -73,6 +73,15 @@ toS := (rule: HeraAST, depth=0): string -> v + hToS(h) catch '/' + v + '/' + hToS(h) + when "RD" + "///" + rule[1].map((part) => + if part @@ -137,6 +146,15 @@ ruleToEBNF := (rule: HeraAST, depth=0): string -> quote v catch quote('/' + v + '/') + when "RD" + quote("///" + rule[1].map((part) => + if part diff --git a/test/main.civet b/test/main.civet index b805184..cde8767 100644 --- a/test/main.civet +++ b/test/main.civet @@ -469,6 +469,20 @@ describe "Hera", -> assert.deepEqual parse("ab"), ["ab"] assert.deepEqual parse("ababccbc"), ["ab", "abcc", "bc"] + it "should parse and match empty slash regexes", -> + rules := parse """ + Empty + // + """ + assert.deepEqual rules.Empty, ["R", ""] + assert.match compile(rules), /const \$R0 = 
\$R\(\/\(\?:\)\/suy\);/ + + parser := generate """ + Empty + // -> "empty" + """ + assert.equal parser.parse(""), "empty" + it "should parse bare character classes as regexes", -> rules := parse """ Rule @@ -485,6 +499,9 @@ describe "Hera", -> Quant2 [a]{2} + + EscapedSpace + [\\ ] """ assert.deepEqual rules.Rule, ["R", "[a-z]+[1-9]*"] @@ -492,6 +509,256 @@ describe "Hera", -> assert.deepEqual rules.Name, ["R", "[_a-zA-Z][_a-zA-Z0-9]*"] assert.deepEqual rules.Quants, ["R", "[0-9]{3,4}"] assert.deepEqual rules.Quant2, ["R", "[a]{2}"] + assert.deepEqual rules.EscapedSpace, ["R", "[ ]"] + assert.match compile(rules), /const \$R0 = \$R\(\/\[a-z\]\+\[1-9\]\*\/suy\);/ + + it "should normalize regexp source whitespace for JS literals", -> + tab := "\t" + lineSeparator := "\u2028" + paragraphSeparator := "\u2029" + grammar := ``` + SlashSpace + /\\ / + SlashTab + /\\${tab}/ + SlashLineSeparator + /a${lineSeparator}b/ + ClassSpace + [\\ ] + ClassTab + [\\${tab}] + ClassParagraphSeparator + [a${paragraphSeparator}b] + ``` + rules := parse grammar + + assert.deepEqual rules.SlashSpace, ["R", " "] + assert.deepEqual rules.SlashTab, ["R", "\\t"] + assert.deepEqual rules.SlashLineSeparator, ["R", "a\\u2028b"] + assert.deepEqual rules.ClassSpace, ["R", "[ ]"] + assert.deepEqual rules.ClassTab, ["R", "[\\t]"] + assert.deepEqual rules.ClassParagraphSeparator, ["R", "[a\\u2029b]"] + + parser := generate grammar.replace(/(\n [^\n]+)/g, "$1 -> $0") + assert.equal parser.parse(" ", startRule: "SlashSpace"), " " + assert.equal parser.parse(tab, startRule: "SlashTab"), tab + assert.equal parser.parse("a" + lineSeparator + "b", startRule: "SlashLineSeparator"), "a" + lineSeparator + "b" + assert.equal parser.parse(" ", startRule: "ClassSpace"), " " + assert.equal parser.parse(tab, startRule: "ClassTab"), tab + assert.equal parser.parse(paragraphSeparator, startRule: "ClassParagraphSeparator"), paragraphSeparator + + it "should parse static triple-slash regexes", -> + tab := "\t" + 
lineSeparator := "\u2028" + paragraphSeparator := "\u2029" + grammar := ``` + Rule + /// ab c / [ # ] /// + + EscapedSpace + ///\\ /// + EscapedTab + ///\\${tab}/// + EscapedLineSeparator + ///\\${lineSeparator}/// + EscapedParagraphSeparator + ///\\${paragraphSeparator}/// + UnescapedWhitespace + ///a${tab}b${lineSeparator}c${paragraphSeparator}d/// + + Identifier + /// + [A-Za-z_] + [A-Za-z0-9_]* + /// + ``` + rules := parse grammar + + assert.deepEqual rules.Rule, ["R", "abc\\/[ # ]"] + assert.deepEqual rules.EscapedSpace, ["R", " "] + assert.deepEqual rules.EscapedTab, ["R", "\\t"] + assert.deepEqual rules.EscapedLineSeparator, ["R", "\\u2028"] + assert.deepEqual rules.EscapedParagraphSeparator, ["R", "\\u2029"] + assert.deepEqual rules.UnescapedWhitespace, ["R", "abcd"] + assert.deepEqual rules.Identifier, ["R", "[A-Za-z_][A-Za-z0-9_]*"] + + parser := generate grammar.replace(/(\n [^\n]+)/g, "$1 -> $0") + assert.equal parser.parse(" ", startRule: "EscapedSpace"), " " + assert.equal parser.parse(tab, startRule: "EscapedTab"), tab + assert.equal parser.parse(lineSeparator, startRule: "EscapedLineSeparator"), lineSeparator + assert.equal parser.parse(paragraphSeparator, startRule: "EscapedParagraphSeparator"), paragraphSeparator + assert.equal parser.parse("abcd", startRule: "UnescapedWhitespace"), "abcd" + + empty := parse """ + Empty + ////// + """ + assert.deepEqual empty.Empty, ["R", ""] + + emptyParser := generate """ + Empty + ////// -> "empty" + """ + assert.equal emptyParser.parse(""), "empty" + + it "should compile interpolated triple-slash regexes", -> + {parse} := generate """ + ``` + const shared = "[0-9]+" + ``` + + Rule + /// x y ${shared} /// -> $0 + + Another + /// a b ${shared} /// -> $0 + """ + + assert.equal parse("xy123"), "xy123" + assert.equal parse("ab456", startRule: "Another"), "ab456" + assert.throws -> + parse("xyabc") + , /Rule \/xy\[0-9\]\+\// + + it "should compile const interpolated triple-slash regex parts", -> + (globalThis as 
any).__heraConstPart = "[0-9]+" + (globalThis as any).__heraDynamicPart = "a" + try + {parse} := generate """ + ``` + globalThis.__heraConstPart = "[0-9]+" + globalThis.__heraDynamicPart = "a" + ``` + + Rule + /// token-${ const globalThis.__heraConstPart}-${globalThis.__heraDynamicPart} /// -> $0 + """ + + assert.equal parse("token-123-a"), "token-123-a" + (globalThis as any).__heraConstPart = "[a-z]+" + (globalThis as any).__heraDynamicPart = "b" + assert.equal parse("token-456-b"), "token-456-b" + assert.throws -> + parse("token-abc-b") + , /Rule \/token-\[0-9\]\+-b\// + + {parse: parseLexical} := generate """ + ``` + const lexicalConst = "[a-z]+" + let lexicalDynamic = "x" + ``` + + Rule + Lexical Set Lexical -> $1 + "|" + $3 + + Lexical + /// lexical-${const lexicalConst}-${lexicalDynamic} /// -> $0 + + Set + "" -> + lexicalDynamic = "y" + return "" + """ + assert.equal parseLexical("lexical-abc-xlexical-abc-y"), "lexical-abc-x|lexical-abc-y" + + {parse: parseBalanced} := generate """ + ``` + const dynamic = "b" + ``` + + Rule + /// token-${const ({ pattern: "[0-9]+" }).pattern}-${"}".replace("}", dynamic)} /// -> $0 + """ + assert.equal parseBalanced("token-123-b"), "token-123-b" + finally + delete (globalThis as any).__heraConstPart + delete (globalThis as any).__heraDynamicPart + + it "should reuse duplicated interpolated triple-slash regexes", -> + code := compile parse """ + ``` + const shared = "[a-z]+" + ``` + + Rule + /// ${shared} /// + /// ${shared} /// + """ + + assert.match code, /const \$RD0 = / + assert.doesNotMatch code, /const \$RD1 = / + assert.match code, /\$EXPECT\(\$RD0, \(\) => `Rule \/\$\{\$RD0\.source\}\/`\)/ + constOnlyCode := compile parse """ + Rule + /// ${const shared} /// + """ + assert.match constOnlyCode, /let \$RD0 = / + assert.match constOnlyCode, /\$RD0 = \$R\(new RegExp/ + assert.match constOnlyCode, /return \$RD0\(\$\$ctx, \$\$state\)/ + assert.doesNotMatch constOnlyCode, /const \$\$parser = \$RD0 = / + assert.match 
constOnlyCode, /const \$\$source = `\$\{shared\}`/ + assert.doesNotMatch constOnlyCode, /\$\$const0/ + + mixedConstCode := compile parse """ + Rule + /// ${ const shared }${dynamic} /// + """ + assert.match mixedConstCode, /`\$\{shared\s*\}\$\{dynamic\}`/ + assert.doesNotMatch mixedConstCode, /\$\$const0/ + + unicodeIdentifierConstCode := compile parse """ + ``` + const café = "[a-z]+" + ``` + + Rule + /// ${ const café }${dynamic} /// + """ + assert.match unicodeIdentifierConstCode, /`\$\{café\s*\}\$\{dynamic\}`/ + assert.doesNotMatch unicodeIdentifierConstCode, /\$\$const0/ + + mixedComplexConstCode := compile parse """ + Rule + /// ${const shared.slice(0)}${dynamic} /// + """ + assert.match mixedComplexConstCode, /\$RD0\.const0 \?\?= String\(shared\.slice\(0\)\);[\s\S]*`\$\{\$RD0\.const0\}\$\{dynamic\}`/ + assert.doesNotMatch mixedComplexConstCode, /\(\(\) =>/ + assert.doesNotMatch mixedComplexConstCode, /\$\$const0/ + + typedCode := compile parse(""" + Rule + /// ${shared} /// + """), types: true + assert.match typedCode, /parser\?: Parser/ + assert.match typedCode, /\{ source\?: string, parser\?: Parser \}/ + + typedConstCode := compile parse(""" + Rule + /// ${const shared}${dynamic} /// + """), types: true + assert.doesNotMatch typedConstCode, /\$\$const0/ + + typedComplexConstCode := compile parse(""" + Rule + /// ${const shared.slice(0)}${dynamic} /// + """), types: true + assert.match typedComplexConstCode, /const0\?: string/ + assert.doesNotMatch typedComplexConstCode, /\$\$const0/ + + typedConstOnlyCode := compile parse(""" + Rule + /// ${const shared} /// + """), types: true + assert.match typedConstOnlyCode, /let \$RD0: Parser & \{ source\?: string \}/ + assert.doesNotMatch typedConstOnlyCode, /\$\$const0/ + + it "should treat hash interpolation syntax as static text in triple-slash regexes", -> + {parse} := generate """ + Rule + /// #\\{shared\\} /// + """ + + assert.equal parse("#{shared}"), "#{shared}" it "should parse simple grammars", -> {parse} 
:= generate """ @@ -984,7 +1251,7 @@ describe "Hera", -> , /EOF/ it "should give accurate error message with multiline input", -> - {parse} := generate """ + grammar := """ Rule Line+ Line @@ -992,9 +1259,11 @@ describe "Hera", -> EOL /\r\n|\n/ """ + assert.match compile(parse(grammar)), /\/\\r\\n\|\\n\/suy/ + parser := generate grammar assert.throws -> - parse """ + parser.parse """ aaaa aaaa aaaa diff --git a/test/util.civet b/test/util.civet index 7b7c3e9..3e1ddfc 100644 --- a/test/util.civet +++ b/test/util.civet @@ -109,6 +109,32 @@ describe "util", -> assert.equal decompile(rules), grammar + it "should decompile interpolated triple-slash regexes", -> + rules := parse """ + Rule + /// a ${shared} /// + """ + + assert.equal decompile(rules), """ + Rule + ///a${shared}/// + + """ + assert.equal grammarToEBNF(rules), 'Rule ::= "///a${shared}///"\n' + + it "should decompile const interpolated triple-slash regexes", -> + rules := parse """ + Rule + /// a ${const shared} ${dynamic} /// + """ + + assert.equal decompile(rules), """ + Rule + ///a${const shared}${dynamic}/// + + """ + assert.equal grammarToEBNF(rules), 'Rule ::= "///a${const shared}${dynamic}///"\n' + it "should decompile type annotated rule", -> grammar := """ Rule diff --git a/tsconfig.parsers.json b/tsconfig.parsers.json index 02b5f72..0b52bf6 100644 --- a/tsconfig.parsers.json +++ b/tsconfig.parsers.json @@ -10,7 +10,7 @@ // "disableReferencedProjectLoad": true, /* Reduce the number of projects loaded automatically by TypeScript. */ /* Language and Environment */ // "allowImportingTsExtensions": true, - "target": "es6", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ + "target": "ES2018", /* Set the JavaScript language version for emitted JavaScript and include compatible library declarations. */ // "lib": [ "ES2020", "DOM" ], /* Specify a set of bundled library declaration files that describe the target runtime environment. 
*/ "jsx": "preserve", /* Specify what JSX code is generated. */ // "experimentalDecorators": true, /* Enable experimental support for TC39 stage 2 draft decorators. */