Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,16 @@ All notable changes to `@danielx/hera` will be documented in this file.

## [Unreleased]

## [0.9.8] - 2026-05-26

### Added
- Grammar: `RuleName ::Type` sets a default return type for every untyped
choice in that rule. A `::Type` on an individual choice still overrides
the rule-level default (#91).
- Grammar: triple-slash regular expressions (`///...///`) support
whitespace-insensitive regex bodies with `//` comments, `${...}` dynamic
interpolation, and `${const ...}` interpolation for values that should be
captured once (#87).

## [0.9.7] - 2026-05-21

Expand Down
45 changes: 45 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,51 @@ Terminal regular expressions return the
array: the matched string, followed by any matched groups
(strings or `undefined` for unmatched optional groups).

**Triple-slash regular expression** (`///...///`):
A whitespace-insensitive regular expression, similar to
[Civet heregex syntax](https://civet.dev/reference#regular-expressions).
Whitespace and `//` comments inside the body are ignored, so larger
expressions can be split across lines:

```
Identifier
///
[A-Za-z_] // first character
[A-Za-z0-9_]*
///
```

Triple-slash regular expressions also support interpolation. `${expr}` is
dynamic: the generated parser re-evaluates `expr` when the rule runs, rebuilds
the `RegExp` only when the resulting source string changes, and otherwise
reuses the cached parser.

```
Close
/// ${closingPattern} /// -> $0
```

Use `${const expr}` for an interpolation whose value should be captured once.
If every interpolated part is `const`, the generated parser computes the
regular expression once and then uses the same parser as a normal regular
expression. When `const` and dynamic interpolations are mixed, complex `const`
expressions are evaluated once and cached, while identifier-only `const`
expressions are emitted directly and read each time the source is rebuilt.

```
Open
/// ${const openingPattern} /// ->
closingPattern = $0 === "(" ? "\\)" : "\\]"
return $0

Close
/// ${closingPattern} /// -> $0
```

Dynamic interpolation that depends on parser state needs the same care as
handlers that mutate parser state. If result caching is enabled, that state
must be part of the cache key/state, and rules that mutate it should not be
cached.

**Repetition** (`*`, `+`): `...*` means "zero or more expansions of `...`", and `...+` means "one or more expansions of `...`". Repetitions return an array of the matches.

**Optional** (`?`): `...?` means "zero or one expansion of `...`". If `...` matches, `...?` returns that value directly. Otherwise it succeeds without consuming input and returns `undefined`. Unlike `*` and `+`, `?` does not wrap its result in an array.
Expand Down
1 change: 1 addition & 0 deletions build/typed-parser-samples
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ civet source/cli-bin.civet --types --module --libPath ./machine.js -o parsers \
source/hera.hera \
samples/code.hera \
samples/regex.hera \
samples/heregex.hera \
samples/url.hera \
samples/coffee.hera \
samples/structural-mapping.hera \
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@danielx/hera",
"version": "0.9.7",
"version": "0.9.8",
"description": "Small and fast parsing expression grammars",
"devDependencies": {
"@danielx/civet": "0.11.6",
Expand Down
35 changes: 35 additions & 0 deletions samples/heregex.hera
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# Triple-slash regexes support `${const ...}` substitutions that are evaluated
# once and `${...}` substitutions that are dynamic. Dynamic regexes
# re-evaluate their dynamic interpolation expressions when the rule runs and
# re-use the cached RegExp while the resulting source string is unchanged.
#
# This example mutates parser state. If result caching is enabled, state like
# `closing` must be part of the cache key/state, and rules that mutate it
# should not be cached.

```
const opening = "[\\(\\[]"
let closing = "\\)"
```

Main
Open Body Close -> $1 + $2 + $3

Open
# Example of constant substitution => static regex
/// ${const opening} /// ->
if ($0 === "(") closing = "\\)"
else closing = "\\]"
return $0

Close
# Example of dynamic substitution => dynamic regex
/// ${closing} /// -> $0

Body
# Example of no substitution => static regex
/// \s+ | \w+ /// -> $0
# Example of constant substitution + dynamic substitution => dynamic regex
/// (?! ${const opening} | ${closing}) . /// -> $0
# Example of same with const caching
/// (?! ${const opening + opening} | ${closing}) . /// -> $0
89 changes: 86 additions & 3 deletions source/compiler.civet
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
// Compile a rules json to typescript

import type { HeraAST, HeraRules } from ./hera-types.civet
import type { HeraAST, HeraRules, RegExpExpression, RegExpPart } from ./hera-types.civet

export type CompilerOptions = {
filename?: string
Expand All @@ -23,6 +23,7 @@ export type CompilerOptions = {

strDefs: string[] := []
reDefs: string[] := []
dynamicReDefs: RegExpPart[][] := []

/**
Define a literal string terminal
Expand Down Expand Up @@ -54,6 +55,37 @@ defineRe := (re: string) ->

return id

/**
Define a dynamic (interpolated) regular expression terminal.

Part lists that serialize to the same JSON share a single definition.
Returns the `$RD<n>` identifier, where `n` is the definition's index in
`dynamicReDefs`.
*/
defineDynamicRe := (parts: RegExpPart[]) =>
  // Serialize the incoming parts once rather than once per existing entry.
  key := JSON.stringify(parts)
  let index = dynamicReDefs.findIndex((entry) => JSON.stringify(entry) is key)

  // Not seen before: append it and use the new slot.
  if index < 0
    index = dynamicReDefs.length
    dynamicReDefs.push parts

  return `$RD${index}`

/**
Check whether an interpolation expression is a single bare identifier,
ignoring leading and trailing whitespace.
*/
isIdentifierExpression := (expression: string): boolean =>
  candidate := expression.trim()
  identifierPattern := /^[$_\p{ID_Start}][$\u200C\u200D\p{ID_Continue}]*$/u
  return identifierPattern.test(candidate)

/**
Render heregex parts as the source text of a JavaScript template literal.

Literal text is escaped so backticks, backslashes and `$` survive inside the
template. Interpolations become `${...}` holes. When `constantPrefix` is
given, each `const` interpolation that is not a bare identifier is replaced
by a numbered reference `${<constantPrefix><n>}`; every other interpolation
embeds its expression directly.
*/
heregexToExpression := (parts: RegExpPart[], constantPrefix?: string): string =>
  let nextCacheSlot = 0
  pieces: string[] := []

  for part of parts
    if part <? "string"
      pieces.push part.replace(/[`\\$]/g, "\\$&")
      continue

    if part.constant and constantPrefix?
      unless isIdentifierExpression(part.expression)
        // Complex const expression: point at its numbered cache slot.
        pieces.push "${" + constantPrefix + nextCacheSlot + "}"
        nextCacheSlot += 1
        continue

    pieces.push "${" + part.expression + "}"

  return "`" + pieces.join("") + "`"

/**
Pretty print a string or RegExp literal
*/
Expand Down Expand Up @@ -85,6 +117,16 @@ compileOp := (tuple: HeraAST, name: string, defaultHandler: boolean, types?: boo
if defaultHandler
f =`$R$0(${f})`

f
}
when "RD"
{
args := tuple[1]
id := defineDynamicRe(args)
let f = `$EXPECT(${id}, () => \`${name} /\${${id}.source}/\`)`
if defaultHandler
f =`$R$0(${f})`

f
}
when "/"
Expand Down Expand Up @@ -382,7 +424,7 @@ compileRuleBodyInline := (options: CompilerOptions, name: string, fnName: string
paramTypes.push `typeof $$value[${i}]`
callArgs.push `$$value[${i}]`
addNamedParameter getNamedVariable(rule[1][i][0]), `typeof $$value[${i}]`, `$$value[${i}]`
else if rule[0] is "R"
else if rule[0] <? "string" and rule[0].startsWith "R"
parameters = regExpHandlerParams.slice()
// RegExpMatchArray positional typing (and reTupleType short tuples) don't
// pass cleanly through .reduce/.filter; keep positional args as `any`.
Expand Down Expand Up @@ -602,6 +644,7 @@ export function compile(rules: HeraRules, maybeOptions?: CompilerOptions): strin

strDefs.length = 0
reDefs.length = 0
dynamicReDefs.length = 0

{ language, module } := options
types := options.types || language === 'typescript' || language === 'civet'
Expand All @@ -617,7 +660,45 @@ export function compile(rules: HeraRules, maybeOptions?: CompilerOptions): strin
`const $L${i} = $L("${str}");\n`

reDefSource := reDefs.map (r, i) ->
`const $R${i} = $R${reTupleType(types, r)}(new RegExp(${JSON.stringify(r)}, 'suy'));\n`
`const $R${i} = $R${reTupleType(types, r)}(/${r or "(?:)"}/suy);\n`

// Emit one `$RD<i>` definition (as source text) per dynamic regular expression
// collected in `dynamicReDefs`.
dynamicReDefSource := dynamicReDefs.map (r, i) =>
// Type annotations and casts are only emitted when typed output is requested.
rdType := types ? " as Parser<RegExpMatchArray> & { source?: string }" : ""
ctxType := types ? ": ParserContext" : ""
stateType := types ? ": ParseState" : ""
nonNullAssert := types ? "!" : ""
// Every part is literal text or a `const` interpolation: emit a lazily
// initialized parser declared with `let` so it can replace itself.
if r.every((part) => part <? "string" or part.constant)
rdParserType := types ? ": Parser<RegExpMatchArray> & { source?: string }" : ""
// The emitted function builds the source string, reassigns `$RD<i>` to the
// compiled `$R(...)` parser, records `.source` on it, and delegates the call.
// NOTE(review): only readers of the `$RD<i>` binding see the replacement; a
// caller that stored the initial function value would re-run this initializer
// (and rebuild the RegExp) on every call — confirm how call sites hold it.
return [
`let $RD${i}${rdParserType} = (($$ctx${ctxType}, $$state${stateType}) => {\n`
` const $$source = ${heregexToExpression(r)};\n`
` $RD${i} = $R(new RegExp($$source, 'suy'))${rdType};\n`
` $RD${i}.source = $$source;\n`
` return $RD${i}($$ctx, $$state);\n`
`})${rdType};\n`
].join("")

// At least one part is dynamic. `const` parts that are not bare identifiers
// each get a `const<n>` cache slot on the emitted function; the order here
// matches the numbering heregexToExpression assigns when given a prefix.
constantParts := r.flatMap((part): RegExpExpression[] =>
if part <? "string" or not part.constant or isIdentifierExpression(part.expression) then [] else [part]
)
cacheProps := types ? [
"source?: string"
"parser?: Parser<RegExpMatchArray>"
...constantParts.map((_, index) => `const${index}?: string`)
].join(", ") : ""
cacheType := types ? ` as Parser<RegExpMatchArray> & { ${cacheProps} }` : ""
// `??=` fills each slot the first time the emitted function runs and leaves it
// alone afterwards.
constInits := constantParts.map((part, index) => ` $RD${i}.const${index} ??= String(${part.expression});\n`).join("")
// The emitted function recomputes the source string on every call and only
// recompiles the RegExp when it differs from the cached `.source`.
[
`const $RD${i} = (($$ctx${ctxType}, $$state${stateType}) => {\n`
constInits
` const $$source = ${heregexToExpression(r, `$RD${i}.const`)};\n`
` if ($$source !== $RD${i}.source) {\n`
` $RD${i}.source = $$source;\n`
` $RD${i}.parser = $R(new RegExp($$source, 'suy'));\n`
" }\n"
` return $RD${i}.parser${nonNullAssert}($$ctx, $$state);\n`
`})${cacheType};\n`
].join("")

genOpts: GenerateOptions := {}

Expand Down Expand Up @@ -660,6 +741,8 @@ export function compile(rules: HeraRules, maybeOptions?: CompilerOptions): strin
"\n\n"
reDefSource
"\n\n"
dynamicReDefSource
"\n\n"
body
"\n\n"
tail
Expand Down
10 changes: 6 additions & 4 deletions source/hera-types.civet
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,16 @@ import type { Loc, ParserContext, HeraGrammar, ParserOptions } from "./machine.c
export type ParserContext, HeraGrammar, ParserOptions

export type Terminal = string | RegExp
export type Handler = { $loc: Loc, f: string, t?: string, inline?: boolean }
export type Handler = { $loc: Loc, f: string, t?: string, inline?: boolean } | { t: string }
export type CodeBlockNode = { type: "CodeBlock", $loc: Loc, token: string }
export type TerminalOp = "L" | "R"
export type TerminalOp = "L" | "R" | "RD"
export type SequenceOp = "S" | "/"
export type PrefixOp = "&" | "!" | "$"
export type SuffixOp = "+" | "*" | "?"
export type Literal = [TerminalOp, string]
export type TerminalNode = [TerminalOp, string, Handler?]
export type RegExpExpression = { expression: string, constant?: boolean }
export type RegExpPart = string | RegExpExpression
export type Literal = ["L", string] | ["R", string] | ["RD", RegExpPart[]]
export type TerminalNode = [...Literal, Handler?]
export type SequenceNode = [SequenceOp, HeraAST[], Handler?]
export type PrefixNode = [PrefixOp, HeraAST, Handler?]
export type SuffixNode = [SuffixOp, HeraAST, Handler?]
Expand Down
62 changes: 60 additions & 2 deletions source/hera.hera
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,24 @@ import type {
Handler,
CodeBlockNode,
} from './hera-types'

/**
 * Replacement text for each raw whitespace character that may appear in a
 * regular expression body. Line terminators and control whitespace become
 * their escape sequences; a plain space stays a plain space.
 */
const regexEscapes: Record<string, string> = {
  "\n": "\\n",
  "\r": "\\r",
  "\u2028": "\\u2028",
  "\u2029": "\\u2029",
  "\t": "\\t",
  "\v": "\\v",
  "\f": "\\f",
  " ": " ",
}

/**
 * Rewrite raw whitespace in a regular expression source so it is valid as a
 * JavaScript regex compiled with the `u` flag.
 *
 * - `\` followed by whitespace is natural in heregexes, but it is an invalid
 *   identity escape under the `u` flag, so the pair is rewritten to the
 *   whitespace character's entry in `regexEscapes`.
 * - Raw newlines (and other raw whitespace) are not allowed in JS regex
 *   literals either, so they are escaped the same way.
 * - Every other escape pair, including an escaped backslash (`\\`), is
 *   consumed as a unit and left untouched. This keeps the second backslash of
 *   `\\` from being mistaken for the start of a `\<whitespace>` pair.
 *
 * @param source Regular expression source text, without delimiters.
 * @returns The source with whitespace normalized as described above.
 */
function normalizeRegExpSource(source: string) {
  return source.replace(
    /\\([^])|[\n\r\u2028\u2029\t\v\f ]/g,
    (match: string, escaped: string | undefined) => {
      // For an escape pair, look up the escaped character; otherwise the match
      // itself is the raw whitespace character.
      const char = escaped === undefined ? match : escaped
      const replacement = regexEscapes[char]
      // Escape pairs whose second character is not whitespace pass through.
      return replacement === undefined ? match : replacement
    }
  )
}
```

Grammar
Expand Down Expand Up @@ -162,10 +180,47 @@ StringLiteral
StringValue -> ["L", $1]

RegExpLiteral
"/" !Space $RegExpCharacter* "/" -> ["R", $3]
$CharacterClassExpression -> ["R", $1]
TripleSlash HeregexBody:body TripleSlash ->
body = body.map((part) => typeof part === "string" ? normalizeRegExpSource(part) : part)
// Static heregexes use the normal RegExp path; only interpolated ones need RD.
if (body.every((part) => typeof part === "string"))
return ["R", body.join("")]
return ["RD", body]
"/" !Space $RegExpCharacter* "/" -> ["R", normalizeRegExpSource($3)]
$CharacterClassExpression -> ["R", normalizeRegExpSource($1)]
"." -> ["R", $1]

HeregexBody
HeregexPart*

HeregexPart
# One piece of a triple-slash regex body. The choices below either yield
# regex source text (possibly the empty string, for ignored comments and
# whitespace) or an interpolation object `{ expression, constant? }`.
$CharacterClass
# `const` substitution specifies that the expression is constant,
# so only needs to be evaluated once.
# `Space` is required after `const`, so an expression that merely starts
# with those letters (e.g. `constant`) falls through to the dynamic choice.
"${" Space* "const" Space $HeregexSubstitutionContent*:expression "}" -> { expression, constant: true }
# Otherwise, substitution is treated as a dynamic expression,
# re-evaluated every match.
"${" $HeregexSubstitutionContent*:expression "}" -> { expression }
EscapeSequence
# JS-style // comments
/\/\/(?!\/)[^\n\r]*/ -> ""
# NOTE: CoffeeScript strips out all unescaped whitespace chars
# but Python doesn't strip out whitespace inside character classes
# or inside '(?' groups and assertions.
# We keep spaces only inside character classes, following Civet.
[\s]+ -> ""
# Escape forward slashes (that aren't part of a triple slash)
/\/(?!\/\/)/ -> "\\/"
# Don't swallow up $ which might be interpolations,
# but handle them as single characters if they're not
/[^[\/\s$\\]+|[$]/ -> $0

HeregexSubstitutionContent
# A chunk of the expression inside `${...}`: a run of characters other than
# braces and quotes, a double- or single-quoted string, or a nested balanced
# `{...}` group. Matching strings and nested braces as whole units keeps a
# `}` inside them from ending the interpolation early.
# NOTE(review): backtick template literals and comments are not matched as
# units here, so a brace inside one is presumably counted like any other
# brace — confirm whether that needs handling.
[^{}"']+
$( "\"" ( [^"\\]+ / EscapeSequence )* "\"" )
$( "'" ( [^'\\]+ / EscapeSequence )* "'" )
$( "{" HeregexSubstitutionContent* "}" )

CharacterClassExpression
CharacterClass+

Expand All @@ -192,6 +247,9 @@ Arrow
Backslash
"\\"

TripleSlash
"///"

OpenBrace
/\{\s*/

Expand Down
6 changes: 4 additions & 2 deletions source/machine.civet
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,8 @@ interface Fail {
(pos: number, expectation: string): void
}

type Expectation = string | (=> string)

/**
* A Parser is a function that takes a string and position to check and returns
* a result if it matches.
Expand All @@ -88,12 +90,12 @@ export interface Parser<T> {
/**
* $EXPECT sets the friendlier `expectation` name.
*/
export function $EXPECT<T>(parser: Parser<T>, expectation: string): Parser<T> {
export function $EXPECT<T>(parser: Parser<T>, expectation: Expectation): Parser<T> {
return function (ctx, state) {
// NOTE: we don't need to use a stack because we're only tracking failures on
// string and regex leaf nodes right now.
const result = parser(ctx, state);
if (!result) ctx.fail(state.pos, expectation)
if (!result) ctx.fail(state.pos, typeof expectation === "function" ? expectation() : expectation)
return result
}
}
Expand Down
Loading
Loading