diff --git a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift index 9432764e5..4abf2b90e 100644 --- a/Sources/_StringProcessing/ByteCodeGen+DSLList.swift +++ b/Sources/_StringProcessing/ByteCodeGen+DSLList.swift @@ -73,19 +73,19 @@ fileprivate extension Compiler.ByteCodeGen { return nil // In an alternation, all of its children must match only at start. - case .orderedChoice(let children): - for _ in 0.. 0 { return _guaranteesForwardProgressImpl(list, position: &position) @@ -719,7 +719,7 @@ fileprivate extension Compiler.ByteCodeGen { } else { return false } - case .nonCapturingGroup(let groupKind, _): + case .nonCapturingGroup(let groupKind): // .nonCapture nonCapturingGroups are ignored during compilation guard groupKind.ast == .nonCapture else { return false @@ -751,15 +751,13 @@ fileprivate extension Compiler.ByteCodeGen { guard let node = list.popFirst() else { return nil } switch node { - case let .orderedChoice(children): - let n = children.count + case let .orderedChoice(n): try emitAlternation(&list, alternationCount: n) - - case let .concatenation(children): - let n = children.count + + case let .concatenation(n): try emitConcatenation(&list, componentCount: n) - case let .capture(name, refId, _, transform): + case let .capture(name, refId, transform): options.beginScope() defer { options.endScope() } @@ -793,19 +791,19 @@ fileprivate extension Compiler.ByteCodeGen { builder.buildTransformCapture(cap, fn) } - case let .nonCapturingGroup(kind, _): + case let .nonCapturingGroup(kind): try emitNoncapturingGroup(kind.ast, &list) - case .ignoreCapturesInTypedOutput(_): + case .ignoreCapturesInTypedOutput: try emitNode(&list) - case .limitCaptureNesting(_): + case .limitCaptureNesting: return try emitNode(&list) case .conditional: throw Unsupported("Conditionals") - case let .quantification(amt, kind, _): + case let .quantification(amt, kind): try emitQuantification(amt.ast, kind, &list) case let .customCharacterClass(ccc): @@ -822,7 +820,7 @@ fileprivate extension Compiler.ByteCodeGen { case let .atom(a): try emitAtom(a) - case let .quotedLiteral(s): + case let .quotedLiteral(s, _): emitQuotedLiteral(s) case .absentFunction: @@ -852,19 +850,17 @@ extension Compiler.ByteCodeGen { ) throws { guard let node = list.popFirst() else { return } switch node { - case let .orderedChoice(children): - let n = children.count + case let .orderedChoice(n): for _ in 0.. MEProgram { - // If the whole regex is a matcher, then the whole-match value - // is the constructed value. Denote that the current value - // register is the processor's value output. - switch root { - case .matcher: - builder.denoteCurrentValueIsWholeMatchValue() - default: - break - } - - try emitNode(root) - - builder.canOnlyMatchAtStart = root.canOnlyMatchAtStart() - builder.buildAccept() - return try builder.assemble() - } -} - extension Compiler.ByteCodeGen { mutating func emitAtom(_ a: DSLTree.Atom) throws { defer { @@ -279,9 +259,15 @@ extension Compiler.ByteCodeGen { mutating func emitDot() throws { if options.dotMatchesNewline { if options.usesNSRECompatibleDot { - try emitAlternation([ - .atom(.characterClass(.newlineSequence)), - .atom(.anyNonNewline)]) + // Custom expansion of emitAlternation for (?:newlineSequence|anyNonNewline) + let done = builder.makeAddress() + let next = builder.makeAddress() + builder.buildSave(next) + emitCharacterClass(.newlineSequence) + builder.buildBranch(to: done) + builder.label(next) + emitAnyNonNewline() + builder.label(done) } else { emitAny() } @@ -326,126 +312,6 @@ extension Compiler.ByteCodeGen { builder.label(done) } - mutating func emitAlternation( - _ children: [DSLTree.Node] - ) throws { - try emitAlternationGen(children, withBacktracking: true) { - try $0.emitNode($1) - } - } - - mutating func emitConcatenationComponent( - _ node: DSLTree.Node - ) throws { - // TODO: Should we do anything special since we can - // be glueing sub-grapheme components together? - try emitNode(node) - } - - mutating func emitPositiveLookahead(_ child: DSLTree.Node) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Lookahead succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - - mutating func emitNegativeLookahead(_ child: DSLTree.Node) throws { - /* - save(restoringAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - clearSavePoint // remove success - fail // propagate failure - intercept: - fail // ->success - success: - ... - */ - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSave(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(intercept) - builder.buildFail() - - builder.label(success) - } - - mutating func emitLookaround( - _ kind: (forwards: Bool, positive: Bool), - _ child: DSLTree.Node - ) throws { - guard kind.forwards else { - throw Unsupported("backwards assertions") - } - if kind.positive { - try emitPositiveLookahead(child) - } else { - try emitNegativeLookahead(child) - } - } - - mutating func emitAtomicNoncapturingGroup( - _ child: DSLTree.Node - ) throws { - /* - save(continuingAt: success) - save(restoringAt: intercept) - // failure restores at intercept - clearThrough(intercept) // remove intercept and any leftovers from - fail(preservingCaptures: true) // ->success - intercept: - clearSavePoint // remove success - fail // propagate failure - success: - ... - */ - - let intercept = builder.makeAddress() - let success = builder.makeAddress() - - builder.buildSaveAddress(success) - builder.buildSave(intercept) - try emitNode(child) - builder.buildClearThrough(intercept) - builder.buildFail(preservingCaptures: true) // Atomic group succeeds here - - builder.label(intercept) - builder.buildClear() - builder.buildFail() - - builder.label(success) - } - mutating func emitMatcher( _ matcher: @escaping _MatcherInterface ) -> ValueRegister { @@ -463,358 +329,6 @@ extension Compiler.ByteCodeGen { return valReg } - mutating func emitNoncapturingGroup( - _ kind: AST.Group.Kind, - _ child: DSLTree.Node - ) throws { - assert(!kind.isCapturing) - - options.beginScope() - defer { options.endScope() } - - if let lookaround = kind.lookaroundKind { - try emitLookaround(lookaround, child) - return - } - - switch kind { - case .lookahead, .negativeLookahead, - .lookbehind, .negativeLookbehind: - throw Unreachable("TODO: reason") - - case .capture, .namedCapture, .balancedCapture: - throw Unreachable("These should produce a capture node") - - case .changeMatchingOptions(let optionSequence): - if !hasEmittedFirstMatchableAtom { - builder.initialOptions.apply(optionSequence) - } - options.apply(optionSequence) - try emitNode(child) - - case .atomicNonCapturing: - try emitAtomicNoncapturingGroup(child) - - default: - // FIXME: Other kinds... - try emitNode(child) - } - } - - mutating func emitQuantification( - _ amount: AST.Quantification.Amount, - _ kind: DSLTree.QuantificationKind, - _ child: DSLTree.Node - ) throws { - let updatedKind = kind.applying(options: options) - - let (low, high) = amount.bounds - guard let low = low else { - throw Unreachable("Must have a lower bound") - } - switch (low, high) { - case (_, 0): - // TODO: Should error out earlier, maybe DSL and parser - // has validation logic? - return - case let (n, m?) where n > m: - // TODO: Should error out earlier, maybe DSL and parser - // has validation logic? - return - - case let (n, m) where m == nil || n <= m!: - // Ok - break - default: - throw Unreachable("TODO: reason") - } - - // Compiler and/or parser should enforce these invariants - // before we are called - assert(high != 0) - assert((0...(high ?? Int.max)).contains(low)) - - let maxExtraTrips: Int? - if let h = high { - maxExtraTrips = h - low - } else { - maxExtraTrips = nil - } - let minTrips = low - assert((maxExtraTrips ?? 1) >= 0) - - if tryEmitFastQuant(child, updatedKind, minTrips, maxExtraTrips) { - return - } - - // The below is a general algorithm for bounded and unbounded - // quantification. It can be specialized when the min - // is 0 or 1, or when extra trips is 1 or unbounded. - // - // Stuff inside `<` and `>` are decided at compile time, - // while run-time values stored in registers start with a `%` - _ = """ - min-trip-count control block: - if %minTrips is zero: - goto exit-policy control block - else: - decrement %minTrips and fallthrough - - loop-body: - : - mov currentPosition %pos - evaluate the subexpression - : - if %pos is currentPosition: - goto exit - goto min-trip-count control block - - exit-policy control block: - if %maxExtraTrips is zero: - goto exit - else: - decrement %maxExtraTrips and fallthrough - - : - save exit and goto loop-body - : - ratchet and goto loop - : - save loop-body and fallthrough (i.e. goto exit) - - exit - ... the rest of the program ... - """ - - // Specialization based on `minTrips` for 0 or 1: - _ = """ - min-trip-count control block: - : - goto exit-policy - : - /* fallthrough */ - - loop-body: - evaluate the subexpression - - /* fallthrough */ - """ - - // Specialization based on `maxExtraTrips` for 0 or unbounded - _ = """ - exit-policy control block: - : - goto exit - : - /* fallthrough */ - """ - - /* - NOTE: These specializations don't emit the optimal - code layout (e.g. fallthrough vs goto), but that's better - done later (not prematurely) and certainly better - done by an optimizing compiler. - - NOTE: We're intentionally emitting essentially the same - algorithm for all quantifications for now, for better - testing and surfacing difficult bugs. We can specialize - for other things, like `.*`, later. - - When it comes time for optimizing, we can also look into - quantification instructions (e.g. reduce save-point traffic) - */ - - let minTripsControl = builder.makeAddress() - let loopBody = builder.makeAddress() - let exitPolicy = builder.makeAddress() - let exit = builder.makeAddress() - - // We'll need registers if we're (non-trivially) bounded - let minTripsReg: IntRegister? - if minTrips > 1 { - minTripsReg = builder.makeIntRegister( - initialValue: minTrips) - } else { - minTripsReg = nil - } - - let maxExtraTripsReg: IntRegister? - if (maxExtraTrips ?? 0) > 0 { - maxExtraTripsReg = builder.makeIntRegister( - initialValue: maxExtraTrips!) - } else { - maxExtraTripsReg = nil - } - - // Set up a dummy save point for possessive to update - if updatedKind == .possessive { - builder.pushEmptySavePoint() - } - - // min-trip-count: - // condBranch(to: exitPolicy, ifZeroElseDecrement: %min) - builder.label(minTripsControl) - switch minTrips { - case 0: builder.buildBranch(to: exitPolicy) - case 1: break - default: - assert(minTripsReg != nil, "logic inconsistency") - builder.buildCondBranch( - to: exitPolicy, ifZeroElseDecrement: minTripsReg!) - } - - // FIXME: Possessive needs a "dummy" save point to ratchet - - // loop: - // - // branch min-trip-count - builder.label(loopBody) - - // if we aren't sure if the child node will have forward progress and - // we have an unbounded quantification - let startPosition: PositionRegister? - let emitPositionChecking = - (!optimizationsEnabled || !child.guaranteesForwardProgress) && - maxExtraTrips == nil - - if emitPositionChecking { - startPosition = builder.makePositionRegister() - builder.buildMoveCurrentPosition(into: startPosition!) - } else { - startPosition = nil - } - try emitNode(child) - if emitPositionChecking { - // in all quantifier cases, no matter what minTrips or maxExtraTrips is, - // if we have a successful non-advancing match, branch to exit because it - // can match an arbitrary number of times - builder.buildCondBranch(to: exit, ifSamePositionAs: startPosition!) - } - - if minTrips <= 1 { - // fallthrough - } else { - builder.buildBranch(to: minTripsControl) - } - - // exit-policy: - // condBranch(to: exit, ifZeroElseDecrement: %maxExtraTrips) - // - // - // Bool { - let isScalarSemantics = options.semanticLevel == .unicodeScalar - guard optimizationsEnabled - && minTrips <= QuantifyPayload.maxStorableTrips - && maxExtraTrips ?? 0 <= QuantifyPayload.maxStorableTrips - && kind != .reluctant else { - return false - } - switch child { - case .customCharacterClass(let ccc): - // ascii only custom character class - guard let bitset = ccc.asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .atom(let atom): - switch atom { - case .char(let c): - if options.isCaseInsensitive && c.isCased { - // Cased character with case-insensitive matching; match only as an ASCII bitset - guard let bitset = DSLTree.CustomCharacterClass(members: [.atom(atom)]).asAsciiBitset(options) else { - return false - } - builder.buildQuantify(bitset: bitset, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } else { - // Uncased character OR case-sensitive matching; match as a single scalar ascii value character - guard let val = c._singleScalarAsciiValue else { - return false - } - builder.buildQuantify(asciiChar: val, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - } - - case .any: - builder.buildQuantifyAny( - matchesNewlines: true, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .anyNonNewline: - builder.buildQuantifyAny( - matchesNewlines: false, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - case .dot: - builder.buildQuantifyAny( - matchesNewlines: options.dotMatchesNewline, kind, minTrips, maxExtraTrips, isScalarSemantics: isScalarSemantics) - - case .characterClass(let cc): - // Custom character class that consumes a single grapheme - let model = cc.asRuntimeModel(options) - builder.buildQuantify( - model: model, - kind, - minTrips, - maxExtraTrips, - isScalarSemantics: isScalarSemantics) - default: - return false - } - case .limitCaptureNesting(let node): - return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips) - case .nonCapturingGroup(let groupKind, let node): - // .nonCapture nonCapturingGroups are ignored during compilation - guard groupKind.ast == .nonCapture else { - return false - } - return tryEmitFastQuant(node, kind, minTrips, maxExtraTrips) - default: - return false - } - return true - } - /// Coalesce any adjacent scalar members in a custom character class together. /// This is required in order to produce correct grapheme matching behavior. func coalescingCustomCharacterClassMembers( @@ -1185,185 +699,6 @@ extension Compiler.ByteCodeGen { try $0.emitCCCMember($1) } } - - mutating func emitConcatenation(_ children: [DSLTree.Node]) throws { - // Before emitting a concatenation, we need to flatten out any nested - // concatenations, and coalesce any adjacent characters and scalars, forming - // quoted literals of their contents, over which we can perform grapheme - // breaking. - func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { - switch node { - case .concatenation(let ch): - return ch.flatMap(flatten) - case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): - return flatten(n) - default: - return [node] - } - } - let children = children - .flatMap(flatten) - .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in - switch node { - case .atom(let a): - guard let c = a.literalCharacterValue else { return false } - str.append(c) - return true - case .quotedLiteral(let q): - str += q - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - for child in children { - try emitConcatenationComponent(child) - } - } - - @discardableResult - mutating func emitNode(_ node: DSLTree.Node) throws -> ValueRegister? { - switch node { - - case let .orderedChoice(children): - try emitAlternation(children) - - case let .concatenation(children): - try emitConcatenation(children) - - case let .capture(name, refId, child, transform): - options.beginScope() - defer { options.endScope() } - - let cap = builder.makeCapture(id: refId, name: name) - builder.buildBeginCapture(cap) - let value = try emitNode(child) - builder.buildEndCapture(cap) - // If the child node produced a custom capture value, e.g. the result of - // a matcher, this should override the captured substring. - if let value { - builder.buildMove(value, into: cap) - } - // If there's a capture transform, apply it now. - if let transform = transform { - let fn = builder.makeTransformFunction { input, cap in - // If it's a substring capture with no custom value, apply the - // transform directly to the substring to avoid existential traffic. - // - // FIXME: separate out this code path. This is fragile, - // slow, and these are clearly different constructs - if let range = cap.range, cap.value == nil { - return try transform(input[range]) - } - - let value = constructExistentialOutputComponent( - from: input, - component: cap.deconstructed, - optionalCount: 0) - return try transform(value) - } - builder.buildTransformCapture(cap, fn) - } - - case let .nonCapturingGroup(kind, child): - try emitNoncapturingGroup(kind.ast, child) - - case let .ignoreCapturesInTypedOutput(child): - try emitNode(child) - - case let .limitCaptureNesting(child): - return try emitNode(child) - - case .conditional: - throw Unsupported("Conditionals") - - case let .quantification(amt, kind, child): - try emitQuantification(amt.ast, kind, child) - - case let .customCharacterClass(ccc): - if ccc.containsDot { - if !ccc.isInverted { - try emitDot() - } else { - throw Unsupported("Inverted any") - } - } else { - try emitCustomCharacterClass(ccc) - } - - case let .atom(a): - try emitAtom(a) - - case let .quotedLiteral(s): - emitQuotedLiteral(s) - - case .absentFunction: - throw Unsupported("absent function") - case .consumer: - throw Unsupported("consumer") - - case let .matcher(_, f): - return emitMatcher(f) - - case .characterPredicate: - throw Unsupported("character predicates") - - case .trivia, .empty: - return nil - } - return nil - } -} - -extension DSLTree.Node { - /// A Boolean value indicating whether this node advances the match position - /// on a successful match. - /// - /// For example, an alternation like `(a|b|c)` always advances the position - /// by a character, but `(a|b|)` has an empty branch, which matches without - /// advancing. - var guaranteesForwardProgress: Bool { - switch self { - case .orderedChoice(let children): - return children.allSatisfy { $0.guaranteesForwardProgress } - case .concatenation(let children): - return children.contains(where: { $0.guaranteesForwardProgress }) - case .capture(_, _, let node, _): - return node.guaranteesForwardProgress - case .nonCapturingGroup(let kind, let child): - switch kind.ast { - case .lookahead, .negativeLookahead, .lookbehind, .negativeLookbehind: - return false - default: return child.guaranteesForwardProgress - } - case .atom(let atom): - switch atom { - case .changeMatchingOptions, .assertion: return false - // Captures may be nil so backreferences may be zero length matches - case .backreference: return false - default: return true - } - case .trivia, .empty: - return false - case .quotedLiteral(let string): - return !string.isEmpty - case .consumer, .matcher: - // Allow zero width consumers and matchers - return false - case .customCharacterClass(let ccc): - return ccc.guaranteesForwardProgress - case .quantification(let amount, _, let child): - let (atLeast, _) = amount.ast.bounds - return atLeast ?? 0 > 0 && child.guaranteesForwardProgress - case .limitCaptureNesting(let node), .ignoreCapturesInTypedOutput(let node): - return node.guaranteesForwardProgress - default: return false - } - } } extension DSLTree.CustomCharacterClass { diff --git a/Sources/_StringProcessing/Compiler.swift b/Sources/_StringProcessing/Compiler.swift index 89c8f5f34..09cdef90d 100644 --- a/Sources/_StringProcessing/Compiler.swift +++ b/Sources/_StringProcessing/Compiler.swift @@ -19,22 +19,13 @@ class Compiler { private var compileOptions: _CompileOptions = .default init(ast: AST) { - self.tree = DSLList(tree: ast.dslTree) - } - - init(tree: DSLTree) { - self.tree = DSLList(tree: tree) + self.tree = DSLList(ast: ast) } init(list: DSLList) { self.tree = list } - init(tree: DSLTree, compileOptions: _CompileOptions) { - self.tree = DSLList(tree: tree) - self.compileOptions = compileOptions - } - init(tree: DSLList, compileOptions: _CompileOptions) { self.tree = tree self.compileOptions = compileOptions @@ -44,20 +35,8 @@ class Compiler { try emitViaList() } - __consuming func emitViaTree() throws -> MEProgram { - // TODO: Handle global options - _ = ByteCodeGen( - options: options, - compileOptions: - compileOptions, - captureList: tree.captureList) - fatalError() -// return try codegen.emitRoot(tree.root) - } - __consuming func emitViaList() throws -> MEProgram { // TODO: Handle global options -// var dslList = DSLList(tree: tree) var codegen = ByteCodeGen( options: options, compileOptions: diff --git a/Sources/_StringProcessing/LiteralPrinter.swift b/Sources/_StringProcessing/LiteralPrinter.swift index d9cdbb04e..0e5b448bd 100644 --- a/Sources/_StringProcessing/LiteralPrinter.swift +++ b/Sources/_StringProcessing/LiteralPrinter.swift @@ -102,12 +102,12 @@ extension LiteralPrinter { } switch node { - case let .orderedChoice(children): - try outputAlternation(&list, count: children.count) - case let .concatenation(children): - try outputConcatenation(&list, count: children.count) + case let .orderedChoice(count): + try outputAlternation(&list, count: count) + case let .concatenation(count): + try outputConcatenation(&list, count: count) - case let .capture(name, nil, _, nil): + case let .capture(name, nil, nil): options.beginScope() defer { options.endScope() } try outputCapture(&list, name: name) @@ -116,7 +116,7 @@ extension LiteralPrinter { try inconvertible(node) return - case let .nonCapturingGroup(kind, _): + case let .nonCapturingGroup(kind): guard let kindPattern = kind._patternString else { try inconvertible(node) return @@ -131,16 +131,16 @@ extension LiteralPrinter { try outputList(&list) output(")") - case .ignoreCapturesInTypedOutput(_), - .limitCaptureNesting(_): + case .ignoreCapturesInTypedOutput, + .limitCaptureNesting: try outputList(&list) - case let .quantification(amount, kind, _): + case let .quantification(amount, kind): try outputQuantification(&list, amount: amount, kind: kind) case let .customCharacterClass(charClass): outputCustomCharacterClass(charClass) case let .atom(atom): outputAtom(atom) - case let .quotedLiteral(literal): + case let .quotedLiteral(literal, _): output(prepareQuotedLiteral(literal)) case .trivia(_): @@ -182,8 +182,8 @@ extension LiteralPrinter { func requiresGrouping(_ list: ArraySlice) -> Bool { guard let node = list.first else { return false } // malformed? switch node { - case .concatenation(let children): - switch children.count { + case .concatenation(let count): + switch count { case 0: return false case 1: @@ -192,7 +192,7 @@ extension LiteralPrinter { return true } - case .quotedLiteral(let literal): + case .quotedLiteral(let literal, _): return prepareQuotedLiteral(literal).count > 1 default: @@ -239,144 +239,7 @@ extension LiteralPrinter { } } -extension LiteralPrinter { - mutating func outputNode(_ node: DSLTree.Node) { - switch node { - case let .orderedChoice(children): - outputAlternation(children) - case let .concatenation(children): - outputConcatenation(children) - - case let .capture(name, nil, child, nil): - options.beginScope() - defer { options.endScope() } - outputCapture(name, child) - case .capture: - // Captures that use a reference or a transform are unsupported - saveInconvertible(node) - - case let .nonCapturingGroup(kind, child): - guard let kindPattern = kind._patternString else { - saveInconvertible(node) - return - } - options.beginScope() - defer { options.endScope() } - - output(kindPattern) - if case .changeMatchingOptions(let optionSequence) = kind.ast { - options.apply(optionSequence) - } - outputNode(child) - output(")") - - case let .ignoreCapturesInTypedOutput(child), - let .limitCaptureNesting(child): - outputNode(child) - case let .quantification(amount, kind, node): - outputQuantification(amount, kind, node) - case let .customCharacterClass(charClass): - outputCustomCharacterClass(charClass) - case let .atom(atom): - outputAtom(atom) - case let .quotedLiteral(literal): - output(prepareQuotedLiteral(literal)) - - case .trivia(_): - // TODO: Include trivia? - return - case .empty: - return - - case .conditional, .absentFunction, .consumer, .matcher, .characterPredicate: - saveInconvertible(node) - } - } - - mutating func outputAlternation(_ children: [DSLTree.Node]) { - guard let first = children.first else { return } - - outputNode(first) - for child in children.dropFirst() { - output("|") - outputNode(child) - } - } - - mutating func outputConcatenation(_ children: [DSLTree.Node]) { - for child in children { - outputNode(child) - } - } - - mutating func outputCapture(_ name: String?, _ child: DSLTree.Node) { - if let name { - output("(?<\(name)>") - } else { - output("(") - } - outputNode(child) - output(")") - } - - func requiresGrouping(_ node: DSLTree.Node) -> Bool { - switch node { - case .concatenation(let children): - switch children.count { - case 0: - return false - case 1: - return requiresGrouping(children.first!) - default: - return true - } - - case .quotedLiteral(let literal): - return prepareQuotedLiteral(literal).count > 1 - - default: - return false - } - } - - mutating func outputQuantification( - _ amount: DSLTree._AST.QuantificationAmount, - _ kind: DSLTree.QuantificationKind, - _ child: DSLTree.Node - ) { - // RegexBuilder regexes can have children that need - if requiresGrouping(child) { - output("(?:") - outputNode(child) - output(")") - } else { - outputNode(child) - } - - switch amount.ast { - case .zeroOrMore: - output("*") - case .oneOrMore: - output("+") - case .zeroOrOne: - output("?") - case let .exactly(n): - output("{\(n.value!)}") - case let .nOrMore(n): - output("{\(n.value!),}") - case let .upToN(n): - output("{,\(n.value!)}") - case let .range(low, high): - output("{\(low.value!),\(high.value!)}") - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - - outputQuantificationKind(kind) - } - +extension LiteralPrinter { mutating func outputQuantificationKind(_ kind: DSLTree.QuantificationKind) { guard let astKind = kind.quantificationKind?.ast else { // We can treat this as if the current default had been given explicity. diff --git a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift index 46ec4d460..e2f266f05 100644 --- a/Sources/_StringProcessing/Optimizations/AutoPossessification.swift +++ b/Sources/_StringProcessing/Optimizations/AutoPossessification.swift @@ -39,10 +39,10 @@ extension DSLList { // In a concatenation, the first definitive child provides the answer, // and then we need to skip past (in some cases at least) the remaining // concatenation elements. - case .concatenation(let children): + case .concatenation(let count): var result: DSLTree.Atom?? = nil var i = 0 - while i < children.count { + while i < count { i += 1 position += 1 if let r = _requiredAtomImpl(&position, options: &options, allowOptionsChanges: allowOptionsChanges) { @@ -50,8 +50,8 @@ extension DSLList { break } } - - for _ in i..( - _ ast: T - ) -> Bool { - if let max = maxTopDownLevels, depth >= max { - return true - } - if let min = minBottomUpLevels, ast.height <= min { - return true - } - return false - } - - mutating func printBackoff(_ node: DSLTree.Node) { - precondition(node.astNode != nil, "unconverted node") - printAsCanonical( - .init(node.astNode!, globalOptions: nil, diags: Diagnostics()), - delimiters: true) - } - mutating func printAsPattern(_ ast: AST) { - // TODO: Handle global options... - let node = ast.root.dslTreeNode - + let list = DSLList(ast: ast) + // If we have any named captures, create references to those above the regex. - let namedCaptures = node.getNamedCaptures() - + let namedCaptures = list.getNamedCaptures() + for namedCapture in namedCaptures { print("let \(namedCapture) = Reference(Substring.self)") } printBlock("Regex") { printer in - printer.printAsPattern(convertedFromAST: node, isTopLevel: true) + var slice = list.nodes[...] + printer.printAsPatternFromList(&slice, isTopLevel: true) } printInlineMatchingOptions() } - mutating func printInlineMatchingOptions() { - while !inlineMatchingOptions.isEmpty { - let (options, condition) = popMatchingOptions() - - printIndented { printer in - for option in options { - switch option.kind { - case .asciiOnlyDigit: - printer.print(".asciiOnlyDigits(\(condition))") - - case .asciiOnlyPOSIXProps: - printer.print(".asciiOnlyCharacterClasses(\(condition))") - - case .asciiOnlySpace: - printer.print(".asciiOnlyWhitespace(\(condition))") - - case .asciiOnlyWord: - printer.print(".asciiOnlyWordCharacters(\(condition))") - - case .caseInsensitive: - printer.print(".ignoresCase(\(condition))") - - case .multiline: - printer.print(".anchorsMatchLineEndings(\(condition))") - - case .reluctantByDefault: - // This is handled by altering every OneOrMore, etc by changing each - // individual repetition behavior instead of creating a nested regex. - continue - - case .singleLine: - printer.print(".dotMatchesNewlines(\(condition))") - - default: - break - } - } - } - - print("}") - } - } - - // FIXME: Use of back-offs like height and depth - // imply that this DSLTree node has a corresponding - // AST. That's not always true, and it would be nice - // to have a non-backing-off pretty-printer that this - // can defer to. - private mutating func printAsPattern( - convertedFromAST node: DSLTree.Node, isTopLevel: Bool = false + private mutating func printAsPatternFromList( + _ list: inout ArraySlice, + isTopLevel: Bool = false ) { - if patternBackoff(DSLTree._Tree(node)) { - printBackoff(node) - return - } + guard let node = list.popFirst() else { return } switch node { - - case let .orderedChoice(a): + case .orderedChoice(let count): printBlock("ChoiceOf") { printer in - a.forEach { - printer.printAsPattern(convertedFromAST: $0) + for _ in 0.., + count: Int, + isTopLevel: Bool ) { - // We need to coalesce any adjacent character and scalar elements into a - // string literal, preserving scalar syntax. - let nodes = nodes - .map { NodeToPrint.dslNode($0.lookingThroughConvertedLiteral) } - .coalescing( - with: StringLiteralBuilder(), into: { .stringLiteral($0.result) } - ) { literal, node in - guard case .dslNode(let node) = node else { return false } - switch node { - case let .atom(.char(c)): - literal.append(c) - return true - case let .atom(.scalar(s)): - literal.append(unescaped: s._dslBase) - return true - case .quotedLiteral(let q): - literal.append(q) - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !literal.isEmpty - default: - return false - } + if isTopLevel || count <= 1 { + for _ in 0.. [String] { - var result: [String] = [] - - switch self { - case .capture(let name?, _, _, _): - result.append(name) - - case .concatenation(let nodes): - for node in nodes { - result += node.getNamedCaptures() - } - - case .quantification(_, _, let node): - result += node.getNamedCaptures() - - default: - break - } - - return result - } -} diff --git a/Sources/_StringProcessing/Regex/ASTConversion.swift b/Sources/_StringProcessing/Regex/ASTConversion.swift index 3bad55732..9b39b9679 100644 --- a/Sources/_StringProcessing/Regex/ASTConversion.swift +++ b/Sources/_StringProcessing/Regex/ASTConversion.swift @@ -11,51 +11,50 @@ internal import _RegexParser -extension AST { - var dslTree: DSLTree { - return DSLTree(.limitCaptureNesting(root.dslTreeNode)) - } -} - extension AST.Node { func convert(into list: inout [DSLTree.Node]) throws { switch self { case .alternation(let alternation): - list.append(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: alternation.children.count))) + list.append(.orderedChoice(alternation.children.count)) for child in alternation.children { try child.convert(into: &list) } case .concatenation(_): - let coalesced = self.coalescedChildren - list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: coalesced.count))) + let coalesced = self.coalescedChildrenWithDisplay + list.append(.concatenation(coalesced.count)) for child in coalesced { - try child.convert(into: &list) + switch child { + case .literal(let value, let display): + list.append(.quotedLiteral(value, display: display)) + case .node(let astNode): + try astNode.convert(into: &list) + } } case .group(let group): let child = group.child switch group.kind.value { case .capture: - list.append(.capture(TEMP_FAKE_NODE)) + list.append(.capture()) try child.convert(into: &list) case .namedCapture(let name): - list.append(.capture(name: name.value, TEMP_FAKE_NODE)) + list.append(.capture(name: name.value)) try child.convert(into: &list) case .balancedCapture: throw Unsupported("TODO: balanced captures") default: - list.append(.nonCapturingGroup(.init(ast: group.kind.value), TEMP_FAKE_NODE)) + list.append(.nonCapturingGroup(.init(ast: group.kind.value))) try child.convert(into: &list) } case .conditional(let conditional): - list.append(.conditional(.init(ast: conditional.condition.kind), TEMP_FAKE_NODE, TEMP_FAKE_NODE)) + list.append(.conditional(.init(ast: conditional.condition.kind))) try conditional.trueBranch.convert(into: &list) try conditional.falseBranch.convert(into: &list) case .quantification(let quant): list.append( - .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)), TEMP_FAKE_NODE)) + .quantification(.init(ast: quant.amount.value), .syntax(.init(ast: quant.kind.value)))) try quant.child.convert(into: &list) case .quote(let node): - list.append(.quotedLiteral(node.literal)) + list.append(.quotedLiteral(node.literal, display: nil)) case .trivia(let node): list.append(.trivia(node.contents)) case .interpolation(_): @@ -63,10 +62,11 @@ extension AST.Node { case .atom(let atom): switch atom.kind { case .scalarSequence(let seq): - // The DSL doesn't have an equivalent node for scalar sequences. Splat - // them into a concatenation of scalars. - // list.append(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: seq.scalarValues.count))) - list.append(.quotedLiteral(String(seq.scalarValues))) + let value = String(seq.scalarValues) + let display = seq.scalarValues + .map { "\\u{\(String($0.value, radix: 16, uppercase: true))}" } + .joined() + list.append(.quotedLiteral(value, display: display)) default: list.append(.atom(atom.dslTreeAtom)) } @@ -81,7 +81,100 @@ extension AST.Node { throw Unsupported("Unknown AST node") } } - + + /// A coalesced child is either a literal (with value and display strings) + /// produced by coalescing adjacent chars/scalars, or an unconverted AST node. + private enum CoalescedChild { + case literal(value: String, display: String) + case node(AST.Node) + } + + /// Flattens nested concatenations and coalesces adjacent characters and + /// scalars into quoted literals, tracking both the actual string value and + /// a display form that preserves \u{} notation for scalars. + private var coalescedChildrenWithDisplay: [CoalescedChild] { + func flatten(_ node: AST.Node) -> [AST.Node] { + switch node { + case .concatenation(let concat): + return concat.children.flatMap(flatten) + default: + return [node] + } + } + + guard case .concatenation(let v) = self else { + return [] + } + + let flat = v.children.flatMap(flatten) + var result: [CoalescedChild] = [] + var value = "" + var display = "" + var accumulating = false + + func finishAccumulation() { + if accumulating { + result.append(.literal(value: value, display: display)) + value = "" + display = "" + accumulating = false + } + } + + func tryAccumulateAtom(_ atom: AST.Atom) -> Bool { + switch atom.kind { + case .char(let c): + value.append(c) + display += String(c)._escaped + return true + case .scalar(let s): + value.append(Character(s.value)) + display += "\\u{\(String(s.value.value, radix: 16, uppercase: true))}" + return true + case .escaped(let c): + guard let sv = c.scalarValue else { return false } + value.append(Character(sv)) + display += "\\u{\(String(sv.value, radix: 16, uppercase: true))}" + return true + case .scalarSequence(let seq): + for s in seq.scalarValues { + value.append(Character(s)) + display += "\\u{\(String(s.value, radix: 16, uppercase: true))}" + } + return true + default: + return false + } + } + + for child in flat { + var accumulated = false + switch child { + case .atom(let a): + accumulated = tryAccumulateAtom(a) + case .quote(let q): + value += q.literal + display += q.literal._escaped + accumulated = true + case .trivia: + // Trivia can be completely ignored if we've already coalesced + // something. + accumulated = accumulating + default: + break + } + + if accumulated { + accumulating = true + } else { + finishAccumulation() + result.append(.node(child)) + } + } + finishAccumulation() + return result + } + var coalescedChildren: [AST.Node] { // Before converting a concatenation in a tree to list form, we need to // flatten out any nested concatenations, and coalesce any adjacent @@ -96,7 +189,7 @@ extension AST.Node { return [node] } } - + func appendAtom(_ atom: AST.Atom, to str: inout String) -> Bool { switch atom.kind { case .char(let c): @@ -112,12 +205,12 @@ extension AST.Node { case .scalarSequence(let seq): str.append(contentsOf: seq.scalarValues.lazy.map(Character.init)) return true - + default: return false } } - + switch self { case .alternation(let v): return v.children case .concatenation(let v): @@ -154,84 +247,6 @@ extension AST.Node { } } -extension AST.Node { - /// Converts an AST node to a `convertedRegexLiteral` node. - var dslTreeNode: DSLTree.Node { - // Convert the top-level node without wrapping - func convert() throws -> DSLTree.Node { - switch self { - case let .alternation(v): - let children = v.children.map(\.dslTreeNode) - return .orderedChoice(children) - - case let .concatenation(v): - return .concatenation(v.children.map(\.dslTreeNode)) - - case let .group(v): - let child = v.child.dslTreeNode - switch v.kind.value { - case .capture: - return .capture(child) - case .namedCapture(let name): - return .capture(name: name.value, child) - case .balancedCapture: - throw Unsupported("TODO: balanced captures") - default: - return .nonCapturingGroup(.init(ast: v.kind.value), child) - } - - case let .conditional(v): - let trueBranch = v.trueBranch.dslTreeNode - let falseBranch = v.falseBranch.dslTreeNode - return .conditional( - .init(ast: v.condition.kind), trueBranch, falseBranch) - - case let .quantification(v): - let child = v.child.dslTreeNode - return .quantification( - .init(ast: v.amount.value), .syntax(.init(ast: v.kind.value)), child) - - case let .quote(v): - return .quotedLiteral(v.literal) - - case let .trivia(v): - return .trivia(v.contents) - - case .interpolation: - throw Unsupported("TODO: interpolation") - - case let .atom(v): - switch v.kind { - case .scalarSequence(let seq): - // The DSL doesn't have an equivalent node for scalar sequences. Splat - // them into a concatenation of scalars. - return .concatenation(seq.scalarValues.map { .atom(.scalar($0)) }) - default: - return .atom(v.dslTreeAtom) - } - - case let .customCharacterClass(ccc): - return .customCharacterClass(ccc.dslTreeClass) - - case .empty(_): - return .empty - - case let .absentFunction(abs): - // TODO: What should this map to? - return .absentFunction(.init(ast: abs)) - - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - } - - let converted = try! convert() - return converted - } -} - extension AST.CustomCharacterClass { var dslTreeClass: DSLTree.CustomCharacterClass { // TODO: Not quite 1-1 diff --git a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift index a4e405f8c..ba65c72aa 100644 --- a/Sources/_StringProcessing/Regex/AnyRegexOutput.swift +++ b/Sources/_StringProcessing/Regex/AnyRegexOutput.swift @@ -248,7 +248,7 @@ extension Regex { /// - Parameter verbatimString: A string to convert into a regular expression /// exactly, escaping any metacharacters. public init(verbatim verbatimString: String) { - self.init(node: .quotedLiteral(verbatimString)) + self.init(node: .quotedLiteral(verbatimString, display: nil)) } /// Returns a Boolean value indicating whether a named capture with the given diff --git a/Sources/_StringProcessing/Regex/Core.swift b/Sources/_StringProcessing/Regex/Core.swift index 425f64549..fa10ba1f4 100644 --- a/Sources/_StringProcessing/Regex/Core.swift +++ b/Sources/_StringProcessing/Regex/Core.swift @@ -11,8 +11,6 @@ internal import _RegexParser -let TEMP_FAKE_NODE = DSLTree.Node.empty - /// A type that represents a regular expression. /// /// You can use types that conform to `RegexComponent` as parameters to string @@ -131,7 +129,7 @@ public struct Regex: RegexComponent { extension Regex { @available(*, deprecated, renamed: "init(verbatim:)") public init(quoting _string: String) { - self.init(node: .quotedLiteral(_string)) + self.init(node: .quotedLiteral(_string, display: nil)) } } @@ -199,10 +197,6 @@ extension Regex { self.list = DSLList(ast: ast) } - init(tree: DSLTree) { - self.list = DSLList(tree: tree) - } - init(list: DSLList) { self.list = list } @@ -261,11 +255,11 @@ extension Regex { // Use an existing concatenation if it's already the root; // otherwise, embed self and other in a new concatenation root. switch list.nodes[0] { - case .concatenation(let children): - list.nodes[0] = .concatenation(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + case .concatenation(let count): + list.nodes[0] = .concatenation(count + 1) list.nodes.append(contentsOf: other.nodes) default: - list.nodes.insert(.concatenation(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + list.nodes.insert(.concatenation(2), at: 0) list.nodes.append(contentsOf: other.nodes) } return Regex(list: list) @@ -274,11 +268,11 @@ extension Regex { func alternating(with other: some Collection) -> Regex { var nodes = program.list.nodes switch nodes[0] { - case .orderedChoice(let children): - nodes[0] = .orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: children.count + 1)) + case .orderedChoice(let count): + nodes[0] = .orderedChoice(count + 1) nodes.append(contentsOf: other) default: - nodes.insert(.orderedChoice(Array(repeating: TEMP_FAKE_NODE, count: 2)), at: 0) + nodes.insert(.orderedChoice(2), at: 0) nodes.append(contentsOf: other) } return Regex(list: DSLList(nodes)) diff --git a/Sources/_StringProcessing/Regex/DSLList.swift b/Sources/_StringProcessing/Regex/DSLList.swift index 98e478de4..3a5164ebd 100644 --- a/Sources/_StringProcessing/Regex/DSLList.swift +++ b/Sources/_StringProcessing/Regex/DSLList.swift @@ -32,12 +32,8 @@ struct DSLList { self.nodes = nodes } - init(tree: DSLTree) { - self.nodes = Array(tree.depthFirst) - } - init(ast: AST) { - self.nodes = [.limitCaptureNesting(TEMP_FAKE_NODE)] + self.nodes = [.limitCaptureNesting] try! ast.root.convert(into: &nodes) } @@ -73,110 +69,63 @@ extension DSLTree.Node { return 0 case .orderedChoice(let c), .concatenation(let c): - return c.count + return c case .capture, .nonCapturingGroup, .quantification, .ignoreCapturesInTypedOutput, - .limitCaptureNesting, .conditional: + .limitCaptureNesting: return 1 + case .conditional: + return 2 + case .absentFunction: return 0 } } } -extension DSLTree { - struct DepthFirst: Sequence, IteratorProtocol { - typealias Element = DSLTree.Node - private var stack: [Frame] - private let getChildren: (Element) -> [Element] - - private struct Frame { - let node: Element - let children: [Element] - var nextIndex: Int = 0 - } - - fileprivate init( - root: Element, - getChildren: @escaping (Element) -> [Element] - ) { - self.getChildren = getChildren - self.stack = [Frame(node: root, children: getChildren(root))] +extension ArraySlice { + internal func skipNode(_ position: inout Int) { + guard position < endIndex else { + return } - - mutating func next() -> Element? { - guard let top = stack.popLast() else { return nil } - // Push children in reverse so leftmost comes out first. - for child in top.children.reversed() { - stack.append(Frame(node: child, children: getChildren(child))) + switch self[position] { + case let .orderedChoice(n): + for _ in 0.. { - internal func skipNode(_ position: inout Int) { - guard position < endIndex else { - return - } - switch self[position] { - case let .orderedChoice(children): - let n = children.count - for _ in 0.. Int? { switch nodes[position] { - case .concatenation(let children): + case .concatenation(let count): var position = position + 1 if findLast { - for _ in 0..<(children.count - 1) { + for _ in 0..<(count - 1) { skipNode(&position) position += 1 } @@ -204,8 +153,14 @@ extension DSLList { let postfixValue = other.nodes[postfixIndex].literalStringValue else { return } + // Merge display strings, falling back to the escaped value for either + // side if its display is not available. + let prefixDisplay = nodes[prefixIndex].literalDisplayValue ?? prefixValue._escaped + let postfixDisplay = other.nodes[postfixIndex].literalDisplayValue ?? postfixValue._escaped + let mergedDisplay = prefixDisplay + postfixDisplay + // Replace the prefix node with a coalesced version of the two - nodes[prefixIndex] = .quotedLiteral(prefixValue + postfixValue) + nodes[prefixIndex] = .quotedLiteral(prefixValue + postfixValue, display: mergedDisplay) // Remove the postfix node and fix up any parent concatenations other.nodes.remove(at: postfixIndex) @@ -213,8 +168,8 @@ extension DSLList { Loop: while i >= 0 { switch other.nodes[i] { - case .concatenation(let children): - other.nodes[i] = .concatenation(.init(repeating: .empty, count: children.count - 1)) + case .concatenation(let count): + other.nodes[i] = .concatenation(count - 1) break Loop case .limitCaptureNesting, .ignoreCapturesInTypedOutput: other.nodes.remove(at: i) @@ -225,3 +180,15 @@ extension DSLList { } } } + +extension DSLList { + internal func getNamedCaptures() -> [String] { + var result: [String] = [] + for node in nodes { + if case .capture(let name?, _, _) = node, !result.contains(name) { + result.append(name) + } + } + return result + } +} diff --git a/Sources/_StringProcessing/Regex/DSLTree.swift b/Sources/_StringProcessing/Regex/DSLTree.swift index f34d1d4d1..62100f148 100644 --- a/Sources/_StringProcessing/Regex/DSLTree.swift +++ b/Sources/_StringProcessing/Regex/DSLTree.swift @@ -25,26 +25,26 @@ extension DSLTree { /// Matches each node in order. /// /// ... | ... | ... - case orderedChoice([Node]) + case orderedChoice(Int) /// Match each node in sequence. /// /// ... ... - case concatenation([Node]) + case concatenation(Int) /// Captures the result of a subpattern. /// /// (...), (?...) case capture( - name: String? = nil, reference: ReferenceID? = nil, Node, + name: String? = nil, reference: ReferenceID? = nil, CaptureTransform? = nil) /// Matches a noncapturing subpattern. - case nonCapturingGroup(_AST.GroupKind, Node) + case nonCapturingGroup(_AST.GroupKind) /// Marks all captures in a subpattern as ignored in strongly-typed output. - case ignoreCapturesInTypedOutput(Node) - case limitCaptureNesting(Node) + case ignoreCapturesInTypedOutput + case limitCaptureNesting // TODO: Consider splitting off grouped conditions, or have // our own kind @@ -53,13 +53,11 @@ extension DSLTree { /// /// (?(cond) true-branch | false-branch) /// - case conditional( - _AST.ConditionKind, Node, Node) + case conditional(_AST.ConditionKind) case quantification( _AST.QuantificationAmount, - QuantificationKind, - Node) + QuantificationKind) case customCharacterClass(CustomCharacterClass) @@ -73,7 +71,7 @@ extension DSLTree { case empty - case quotedLiteral(String) + case quotedLiteral(String, display: String?) // TODO: What should we do here? /// @@ -384,125 +382,6 @@ typealias _CharacterPredicateInterface = ( */ -extension DSLTree.Node { - /// Indicates whether this node has at least one child node (among other - /// associated values). - var hasChildNodes: Bool { - switch self { - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return false - - case .orderedChoice(let c), .concatenation(let c): - return !c.isEmpty - - case .capture, .nonCapturingGroup, - .quantification, .ignoreCapturesInTypedOutput, .limitCaptureNesting, - .conditional: - return true - - case .absentFunction(let abs): - return !abs.ast.children.isEmpty - } - } - - @_spi(RegexBuilder) - public var children: [DSLTree.Node] { - switch self { - - case let .orderedChoice(v): return v - case let .concatenation(v): return v - - case let .capture(_, _, n, _): return [n] - case let .nonCapturingGroup(_, n): return [n] - case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] - - case let .conditional(_, t, f): return [t,f] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode) - } - } - - public var coalescedChildren: [DSLTree.Node] { - // Before converting a concatenation in a tree to list form, we need to - // flatten out any nested concatenations, and coalesce any adjacent - // characters and scalars, forming quoted literals of their contents, - // over which we can perform grapheme breaking. - - func flatten(_ node: DSLTree.Node) -> [DSLTree.Node] { - switch node { - case .concatenation(let ch): - return ch.flatMap(flatten) - case .ignoreCapturesInTypedOutput(let n), .limitCaptureNesting(let n): - return flatten(n) - default: - return [node] - } - } - - switch self { - case let .orderedChoice(v): return v - case let .concatenation(v): - let children = v - .flatMap(flatten) - .coalescing(with: "", into: DSLTree.Node.quotedLiteral) { str, node in - switch node { - case .atom(let a): - guard let c = a.literalCharacterValue else { return false } - str.append(c) - return true - case .quotedLiteral(let q): - str += q - return true - case .trivia: - // Trivia can be completely ignored if we've already coalesced - // something. - return !str.isEmpty - default: - return false - } - } - return children - - case let .capture(_, _, n, _): return [n] - case let .nonCapturingGroup(_, n): return [n] - case let .quantification(_, _, n): return [n] - case let .ignoreCapturesInTypedOutput(n): return [n] - case let .limitCaptureNesting(n): return [n] - - case let .conditional(_, t, f): return [t,f] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode) - } - } -} - -extension DSLTree.Node { - var astNode: AST.Node? { - nil - } - - /// If this node is for a converted literal, look through it. - var lookingThroughConvertedLiteral: Self { - self - } -} - extension DSLTree.Atom { // Return the Character or promote a scalar to a Character var literalCharacterValue: Character? { @@ -518,51 +397,20 @@ extension DSLTree.Node { var literalStringValue: String? { switch self { case .atom(let a): return a.literalCharacterValue.map(String.init) - case .quotedLiteral(let s): return s + case .quotedLiteral(let s, _): return s default: return nil } } -} - -extension DSLTree { - struct Options { - // TBD - } -} -extension DSLTree { - /// Indicates whether this DSLTree contains any capture groups. - var hasCapture: Bool { - root.hasCapture - } -} -extension DSLTree.Node { - /// Indicates whether this DSLTree node contains any capture groups. - var hasCapture: Bool { + var literalDisplayValue: String? { switch self { - case .capture: - return true - default: - return self.children.any(\.hasCapture) - } - } -} - -extension DSLTree.Node { - func appending(_ newNode: DSLTree.Node) -> DSLTree.Node { - if case .concatenation(let components) = self { - return .concatenation(components + [newNode]) - } - return .concatenation([self, newNode]) - } - - func appendingAlternationCase( - _ newNode: DSLTree.Node - ) -> DSLTree.Node { - if case .orderedChoice(let components) = self { - return .orderedChoice(components + [newNode]) + case .atom(let a): + guard let c = a.literalCharacterValue else { return nil } + return String(c)._escaped + case .quotedLiteral(_, display: let d): + return d + default: return nil } - return .orderedChoice([self, newNode]) } } @@ -697,124 +545,41 @@ struct CaptureTransform: Hashable, CustomStringConvertible { } extension CaptureList.Builder { - mutating func addCaptures( - of node: DSLTree.Node, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool - ) { - switch node { - case let .orderedChoice(children): - for child in children { - addCaptures(of: child, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - } - - case let .concatenation(children): - for child in children { - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - } - - case let .capture(name, _, child, transform): - captures.append(.init( - name: name, - type: transform?.resultType ?? child.wholeMatchType, - optionalDepth: nesting.depth, visibleInTypedOutput: visibleInTypedOutput, .fake)) - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .nonCapturingGroup(kind, child): - assert(!kind.ast.isCapturing) - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .ignoreCapturesInTypedOutput(child): - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: false) - - case let .limitCaptureNesting(child): - addCaptures(of: child, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .conditional(cond, trueBranch, falseBranch): - switch cond.ast { - case .group(let g): - addCaptures(of: .group(g), optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - default: - break - } - - addCaptures(of: trueBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - addCaptures(of: falseBranch, optionalNesting: nesting.addingOptional, visibleInTypedOutput: visibleInTypedOutput) - - case let .quantification(amount, _, child): - var optNesting = nesting - if amount.ast.bounds.atLeast == 0 { - optNesting = optNesting.addingOptional - } - addCaptures(of: child, optionalNesting: optNesting, visibleInTypedOutput: visibleInTypedOutput) - - case let .absentFunction(abs): - switch abs.ast.kind { - case .expression(_, _, let child): - addCaptures(of: child, optionalNesting: nesting, visibleInTypedOutput: visibleInTypedOutput) - case .clearer, .repeater, .stopper: - break - #if RESILIENT_LIBRARIES - @unknown default: - fatalError() - #endif - } - -// case let .convertedRegexLiteral(n, _): -// // We disable nesting for converted AST trees, as literals do not nest -// // captures. This includes literals nested in a DSL. -// return addCaptures(of: n, optionalNesting: nesting.disablingNesting, visibleInTypedOutput: visibleInTypedOutput) -// - case .matcher: - break - - case .customCharacterClass, .atom, .trivia, .empty, - .quotedLiteral, .consumer, .characterPredicate: - break - } - } - - static func build(_ dsl: DSLTree) -> CaptureList { - var builder = Self() - builder.captures.append( - .init(type: dsl.root.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) - builder.addCaptures(of: dsl.root, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) - return builder.captures - } - mutating func addCaptures( in list: inout ArraySlice, optionalNesting nesting: OptionalNesting, visibleInTypedOutput: Bool ) { guard let node = list.popFirst() else { return } switch node { - case let .orderedChoice(children): - for _ in 0.. CaptureList { var builder = Self() builder.captures.append( - .init(type: dsl.first.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) + .init(type: dsl.wholeMatchType, optionalDepth: 0, visibleInTypedOutput: true, .fake)) var nodes = dsl.nodes[...] builder.addCaptures(in: &nodes, optionalNesting: .init(canNest: true), visibleInTypedOutput: true) return builder.captures } } -extension DSLTree.Node { - /// Returns true if the node is output-forwarding, i.e. not defining its own - /// output but forwarding its only child's output. - var isOutputForwarding: Bool { - switch self { - case .nonCapturingGroup, .ignoreCapturesInTypedOutput: - return true - case .orderedChoice, .concatenation, .capture, - .conditional, .quantification, .customCharacterClass, .atom, - .trivia, .empty, .quotedLiteral, .limitCaptureNesting, - .consumer, .absentFunction, - .characterPredicate, .matcher: - return false - } - } - - /// Returns the output-defining node, peering through any output-forwarding - /// nodes. - var outputDefiningNode: Self { - if isOutputForwarding { - assert(children.count == 1) - return children[0].outputDefiningNode - } - return self - } - - /// Returns the type of the whole match, i.e. `.0` element type of the output. - var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { - return type - } - return Substring.self - } -} - extension DSLList { - - /// Returns the output-defining node, peering through any output-forwarding - /// nodes. - var outputDefiningNode: DSLTree.Node? { - nodes.first(where: { !$0.isOutputForwarding }) - } - /// Returns the type of the whole match, i.e. `.0` element type of the output. var wholeMatchType: Any.Type { - if case .matcher(let type, _) = outputDefiningNode { - return type - } - return Substring.self - } -} - -extension DSLTree.Node { - /// Implementation for `canOnlyMatchAtStart`, which maintains the option - /// state. - /// - /// For a given specific node, this method can return one of three values: - /// - /// - `true`: This node is guaranteed to match only at the start of a subject. - /// - `false`: This node can match anywhere in the subject. - /// - `nil`: This node is inconclusive about where it can match. - /// - /// In particular, non-required groups and option-setting groups are - /// inconclusive about where they can match. - private func _canOnlyMatchAtStartImpl(_ options: inout MatchingOptions) -> Bool? { - switch self { - // Defining cases - case .atom(.assertion(.startOfSubject)): - return true - case .atom(.assertion(.caretAnchor)): - return !options.anchorsMatchNewlines - - // Changing options doesn't determine `true`/`false`. - case .atom(.changeMatchingOptions(let sequence)): - options.apply(sequence.ast) - return nil - - // Any other atom or consuming node returns `false`. - case .atom, .customCharacterClass, .quotedLiteral: - return false - - // Trivia/empty have no effect. - case .trivia, .empty: - return nil - - // In an alternation, all of its children must match only at start. - case .orderedChoice(let children): - return children.allSatisfy { $0._canOnlyMatchAtStartImpl(&options) == true } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - for child in children { - if let result = child._canOnlyMatchAtStartImpl(&options) { - return result - } - } - return false - - // Groups (and other parent nodes) defer to the child. - case .nonCapturingGroup(let kind, let child): - // Don't let a negative lookahead affect this - need to continue to next sibling - if kind.isNegativeLookahead { - return nil - } - options.beginScope() - defer { options.endScope() } - if case .changeMatchingOptions(let sequence) = kind.ast { - options.apply(sequence) - } - return child._canOnlyMatchAtStartImpl(&options) - case .capture(_, _, let child, _): - options.beginScope() - defer { options.endScope() } - return child._canOnlyMatchAtStartImpl(&options) - case .ignoreCapturesInTypedOutput(let child), .limitCaptureNesting(let child): - return child._canOnlyMatchAtStartImpl(&options) - - // A quantification that doesn't require its child to exist can still - // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, let child): - return amount.requiresAtLeastOne - ? child._canOnlyMatchAtStartImpl(&options) - : nil - - // For conditional nodes, both sides must require matching at start. - case .conditional(_, let child1, let child2): - return child1._canOnlyMatchAtStartImpl(&options) == true - && child2._canOnlyMatchAtStartImpl(&options) == true - - // Extended behavior isn't known, so we return `false` for safety. - case .consumer, .matcher, .characterPredicate, .absentFunction: - return false - } - } - - /// Returns a Boolean value indicating whether the regex with this node as - /// the root can _only_ match at the start of a subject. - /// - /// For example, these regexes can only match at the start of a subject: - /// - /// - `/^foo/` - /// - `/(^foo|^bar)/` (both sides of the alternation start with `^`) - /// - /// These can match other places in a subject: - /// - /// - `/(^foo)?bar/` (`^` is in an optional group) - /// - `/(^foo|bar)/` (only one side of the alternation starts with `^`) - /// - `/(?m)^foo/` (`^` means "the start of a line" due to `(?m)`) - internal func canOnlyMatchAtStart() -> Bool { - var options = MatchingOptions() - return _canOnlyMatchAtStartImpl(&options) ?? false + nodes.wholeMatchType } } -// MARK: Required first and last atoms - -extension DSLTree.Node { - private func _requiredAtomImpl(forward: Bool) -> DSLTree.Atom?? { - switch self { - case .atom(let atom): - return switch atom { - case .changeMatchingOptions: - nil +extension Sequence { + var wholeMatchType: Any.Type { + Loop: + for node in self { + switch node { + case .nonCapturingGroup, .ignoreCapturesInTypedOutput: + continue Loop + case .matcher(let type, _): + return type default: - atom + break Loop } - - // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - if forward { - for child in children { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } else { - for child in children.reversed() { - if let result = child._requiredAtomImpl(forward: forward) { - return result - } - } - } - return nil - - // For a quoted literal, we can look at the first char - // TODO: matching semantics??? - case .quotedLiteral(let str): - return str.first.map(DSLTree.Atom.char) - - // TODO: custom character classes could/should participate here somehow - case .customCharacterClass: - return .some(nil) - - // Trivia/empty have no effect. - case .trivia, .empty: - return nil - - // For alternation and conditional, no required first (this could change - // if we identify the _same_ required first atom across all possibilities). - case .orderedChoice, .conditional: - return .some(nil) - - // Groups (and other parent nodes) defer to the child. - case .nonCapturingGroup(_, let child), .capture(_, _, let child, _), - .ignoreCapturesInTypedOutput(let child), - .limitCaptureNesting(let child): - return child._requiredAtomImpl(forward: forward) - - // A quantification that doesn't require its child to exist can still - // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, let child): - return amount.requiresAtLeastOne - ? child._requiredAtomImpl(forward: forward) - : .some(nil) - - // Extended behavior isn't known, so we return `false` for safety. - case .consumer, .matcher, .characterPredicate, .absentFunction: - return .some(nil) } - } - - internal func requiredFirstAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: true) ?? nil - } - - internal func requiredLastAtom() -> DSLTree.Atom? { - self._requiredAtomImpl(forward: false) ?? nil + return Substring.self } } +// MARK: Required first and last atoms private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTree.Atom?? { guard let node = list.popFirst() else { @@ -1111,8 +666,8 @@ private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTre } // In a concatenation, the first definitive child provides the answer. - case .concatenation(let children): - for _ in 0..) -> DSLTre // For a quoted literal, we can look at the first char // TODO: matching semantics??? - case .quotedLiteral(let str): + case .quotedLiteral(let str, _): return str.first.map(DSLTree.Atom.char) // TODO: custom character classes could/should participate here somehow @@ -1145,7 +700,7 @@ private func _requiredAtomImpl(_ list: inout ArraySlice) -> DSLTre // A quantification that doesn't require its child to exist can still // allow a start-only match. (e.g. `/(foo)?^bar/`) - case .quantification(let amount, _, _): + case .quantification(let amount, _): return amount.requiresAtLeastOne ? _requiredAtomImpl(&list) : .some(nil) @@ -1166,44 +721,6 @@ internal func requiredFirstAtom(_ list: inout ArraySlice) -> DSLTr // include symbols from implementation-only dependencies. extension DSLTree { - var captureList: CaptureList { .Builder.build(self) } - - /// Presents a wrapped version of `DSLTree.Node` that can provide an internal - /// `_TreeNode` conformance. - struct _Tree: _TreeNode { - var node: DSLTree.Node - - init(_ node: DSLTree.Node) { - self.node = node - } - - var children: [_Tree]? { - switch node { - - case let .orderedChoice(v): return v.map(_Tree.init) - case let .concatenation(v): return v.map(_Tree.init) - - case let .capture(_, _, n, _): return [_Tree(n)] - case let .nonCapturingGroup(_, n): return [_Tree(n)] - case let .quantification(_, _, n): return [_Tree(n)] - case let .ignoreCapturesInTypedOutput(n): return [_Tree(n)] - case let .limitCaptureNesting(n): - // This is a transparent wrapper - return _Tree(n).children - - case let .conditional(_, t, f): return [_Tree(t), _Tree(f)] - - case .trivia, .empty, .quotedLiteral, - .consumer, .matcher, .characterPredicate, - .customCharacterClass, .atom: - return [] - - case let .absentFunction(abs): - return abs.ast.children.map(\.dslTreeNode).map(_Tree.init) - } - } - } - @_spi(RegexBuilder) public enum _AST { @_spi(RegexBuilder) @@ -1342,8 +859,7 @@ extension DSLTree.Node { @available(SwiftStdlib 5.7, *) static func repeating( _ range: Range, - _ behavior: RegexRepetitionBehavior?, - _ node: DSLTree.Node + _ behavior: RegexRepetitionBehavior? ) -> DSLTree.Node { // TODO: Throw these as errors precondition(range.lowerBound >= 0, "Cannot specify a negative lower bound") @@ -1361,23 +877,23 @@ extension DSLTree.Node { if range.upperBound == Int.max { switch lower { case 0: // 0... - return .quantification(.zeroOrMore, kind, node) + return .quantification(.zeroOrMore, kind) case 1: // 1... - return .quantification(.oneOrMore, kind, node) + return .quantification(.oneOrMore, kind) default: // n... - return .quantification(.nOrMore(lower), kind, node) + return .quantification(.nOrMore(lower), kind) } } if range.count == 1 { // ..<1 or ...0 or any range with count == 1 // Note: `behavior` is ignored in this case - return .quantification(.exactly(lower), .default, node) + return .quantification(.exactly(lower), .default) } switch lower { case 0: // 0.. Regex { // Don't wrap `child` again if it's a leaf node. child.regex.list.hasChildren - ? child.regex.prepending(.ignoreCapturesInTypedOutput(TEMP_FAKE_NODE)) as Regex + ? child.regex.prepending(.ignoreCapturesInTypedOutput) as Regex : .init(list: child.regex.program.list) } @@ -107,7 +107,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrOne, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.zeroOrOne, kind)) } @available(SwiftStdlib 5.7, *) @@ -116,7 +116,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.zeroOrMore, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.zeroOrMore, kind)) } @available(SwiftStdlib 5.7, *) @@ -125,7 +125,7 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior? = nil ) -> Regex { let kind: DSLTree.QuantificationKind = behavior.map { .explicit($0.dslTreeKind) } ?? .default - return component.regex.prepending(.quantification(.oneOrMore, kind, TEMP_FAKE_NODE)) + return component.regex.prepending(.quantification(.oneOrMore, kind)) } @available(SwiftStdlib 5.7, *) @@ -133,7 +133,7 @@ public struct _RegexFactory { _ count: Int, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.quantification(.exactly(count), .default, TEMP_FAKE_NODE)) + component.regex.prepending(.quantification(.exactly(count), .default)) } @available(SwiftStdlib 5.7, *) @@ -142,14 +142,14 @@ public struct _RegexFactory { _ behavior: RegexRepetitionBehavior?, _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.repeating(range, behavior, TEMP_FAKE_NODE)) + component.regex.prepending(.repeating(range, behavior)) } @available(SwiftStdlib 5.7, *) public func atomicNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.atomicNonCapturing)) } @_spi(RegexBuilder) @@ -157,7 +157,7 @@ public struct _RegexFactory { public func lookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.lookahead, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.lookahead)) } @_spi(RegexBuilder) @@ -165,21 +165,21 @@ public struct _RegexFactory { public func negativeLookaheadNonCapturing( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.nonCapturingGroup(.negativeLookahead, TEMP_FAKE_NODE)) + component.regex.prepending(.nonCapturingGroup(.negativeLookahead)) } @available(SwiftStdlib 5.7, *) public func orderedChoice( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.orderedChoice([TEMP_FAKE_NODE])) + component.regex.prepending(.orderedChoice(1)) } @available(SwiftStdlib 5.7, *) public func capture( _ component: some RegexComponent ) -> Regex { - component.regex.prepending(.capture(TEMP_FAKE_NODE)) + component.regex.prepending(.capture()) } @available(SwiftStdlib 5.7, *) @@ -187,7 +187,7 @@ public struct _RegexFactory { _ component: some RegexComponent, _ reference: Int ) -> Regex { - component.regex.prepending(.capture(reference: ReferenceID(reference), TEMP_FAKE_NODE)) + component.regex.prepending(.capture(reference: ReferenceID(reference))) } @available(SwiftStdlib 5.7, *) @@ -199,7 +199,6 @@ public struct _RegexFactory { component.regex.prepending( .capture( reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, CaptureTransform(transform) )) } @@ -213,7 +212,6 @@ public struct _RegexFactory { component.regex.prepending( .capture( reference: reference.map { ReferenceID($0) }, - TEMP_FAKE_NODE, CaptureTransform(transform) )) } diff --git a/Tests/RegexTests/CaptureTests.swift b/Tests/RegexTests/CaptureTests.swift index 34cc20ad7..bbf41d3b3 100644 --- a/Tests/RegexTests/CaptureTests.swift +++ b/Tests/RegexTests/CaptureTests.swift @@ -129,7 +129,7 @@ extension StringCapture { // TODO: Move `flatCaptureTest`s over here too... func compile(_ ast: AST) -> MEProgram { - try! Compiler(tree: ast.dslTree).emit() + try! Compiler(ast: ast).emit() } func captureTest( diff --git a/Tests/RegexTests/OptimizationTests.swift b/Tests/RegexTests/OptimizationTests.swift index a60d9bf5f..d40c8c8ac 100644 --- a/Tests/RegexTests/OptimizationTests.swift +++ b/Tests/RegexTests/OptimizationTests.swift @@ -37,7 +37,7 @@ import Testing list.autoPossessify() for node in list.nodes { switch node { - case .quantification(_, let kind, _): + case .quantification(_, let kind): #expect( kind.isExplicit && kind.quantificationKind?.ast == .possessive, "Expected possessification in '\(pattern._literalPattern!)'") @@ -57,7 +57,7 @@ import Testing list.autoPossessify() for node in list.nodes { switch node { - case .quantification(_, let kind, _): + case .quantification(_, let kind): #expect( kind.quantificationKind?.ast != .possessive, "Unexpected possessification in '\(pattern._literalPattern!)'") diff --git a/Tests/RegexTests/RenderDSLTests.swift b/Tests/RegexTests/RenderDSLTests.swift index 19ab4c35c..b6b249242 100644 --- a/Tests/RegexTests/RenderDSLTests.swift +++ b/Tests/RegexTests/RenderDSLTests.swift @@ -284,12 +284,9 @@ extension RenderDSLTests { } """#) - // TODO: We might want to consider preserving scalar sequences in the DSL, - // and allowing them to merge with other concatenations. try testConversion(#"\u{A B C}\u{d}efg"#, #""" Regex { - "\u{A}\u{B}\u{C}" - "\u{D}efg" + "\u{A}\u{B}\u{C}\u{D}efg" } """#) @@ -303,6 +300,60 @@ extension RenderDSLTests { """#) } + func testQuantifiers() throws { + try testConversion(#"a+b*c?d{1,}e{,3}f{2,4}g{5}"#, #""" + Regex { + OneOrMore { + "a" + } + ZeroOrMore { + "b" + } + Optionally { + "c" + } + Repeat(1...) { + "d" + } + Repeat(...3) { + "e" + } + Repeat(2...4) { + "f" + } + Repeat(count: 5) { + "g" + } + } + """#) + + try testConversion(#"(?:(?:(?:(?:(?:(?:a+b)*c)?d){1,}e){,3}f){2,4}g){5}"#, #""" + Regex { + Repeat(count: 5) { + Repeat(2...4) { + Repeat(...3) { + Repeat(1...) { + Optionally { + ZeroOrMore { + OneOrMore { + "a" + } + "b" + } + "c" + } + "d" + } + "e" + } + "f" + } + "g" + } + } + """#) + } + func testCharacterClass() throws { try testConversion(#"[abc]+"#, #""" Regex { @@ -343,6 +394,30 @@ extension RenderDSLTests { ZeroOrMore(CharacterClass.anyOf("i").inverted) } """#) + + try testConversion(#"[a-z]+"#, #""" + Regex { + OneOrMore(("a"..."z")) + } + """#) + + try testConversion(#"[[a-z]&&[0-9]]+"#, #""" + Regex { + OneOrMore { + One(("a"..."z") + .intersection(("0"..."9"))) + } + } + """#) + + // Non-convertible elements in character class + try testConversion(#"[a-z\N{BEE}]+"#, #""" + Regex { + OneOrMore { + #/[a-z\N{BEE}]/# + } + } + """#) } func testChangeMatchingOptions() throws {