Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 133 additions & 5 deletions src/main/java/com/pkware/generex/Generex.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import dk.brics.automaton.RegExp;
import dk.brics.automaton.State;
import dk.brics.automaton.Transition;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayList;
import java.util.Collection;
Expand Down Expand Up @@ -118,13 +119,110 @@ public Generex(Automaton automaton, Random random) {
* @see #isValidPattern(String)
*/
private static RegExp createRegExp(String regex) {
    // Normalise Java-only syntax (anchors, non-capturing groups) before anything else.
    // There must be exactly one declaration of finalRegex: the earlier
    // "String finalRegex = regex;" line was superseded by this one.
    String finalRegex = convertToBricsRegex(regex);

    // Substitute each predefined character class with its replacement text.
    // String.replaceAll interprets the key as a regular expression and the value as a
    // replacement string. NOTE(review): the map contents are defined elsewhere in this
    // file - confirm the keys/values are already escaped accordingly.
    for (Entry<String, String> charClass : PREDEFINED_CHARACTER_CLASSES.entrySet()) {
        finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue());
    }
    return new RegExp(finalRegex);
}

/**
* Converts a regex pattern to brics-compatible syntax for use with Generex.
*
* <p>Performs the following transformations:
* <ul>
* <li>Removes an unescaped {@code ^} anchor at the start of the pattern, as brics treats
* {@code ^} as a literal character rather than a start-of-input assertion.</li>
* <li>Removes an unescaped {@code $} anchor at the end of the pattern, as brics treats
* {@code $} as a literal character rather than an end-of-input assertion.</li>
* <li>Converts non-capturing groups {@code (?:...)} to plain capturing groups {@code (...)},
* since brics does not support non-capturing group syntax. This is a lossless
* transformation because Generex only generates strings and never extracts capture
* groups.</li>
* </ul>
*
* <p>The conversion is performed in a single pass that tracks escape sequences and character
* class boundaries to avoid incorrect replacements.
*
* @param regex The Java regex pattern to convert.
* @return the brics-compatible regex.
*/
@NotNull
private static String convertToBricsRegex(@NotNull String regex) {
if (regex.isEmpty()) return regex;

StringBuilder result = new StringBuilder(regex.length());
boolean escaped = false;
boolean inCharClass = false;
int start = 0;

// Strip leading ^ anchor (not escaped since it's the first character)
if (regex.charAt(0) == '^') {
start = 1;
}

for (int i = start; i < regex.length(); i++) {
char c = regex.charAt(i);

if (escaped) {
result.append(c);
escaped = false;
continue;
}

if (c == '\\') {
result.append(c);
escaped = true;
continue;
}

if (inCharClass) {
Comment thread
Mehak22852 marked this conversation as resolved.
if (c == ']') inCharClass = false;
result.append(c);
continue;
}

if (c == '[') {
inCharClass = true;
result.append(c);
int next = i + 1;
// Per regex standard, ] right after [ or [^ is a literal ] inside the class, not the closing bracket.
if (next < regex.length() && regex.charAt(next) == '^') {
result.append('^');
next++;
}
if (next < regex.length() && regex.charAt(next) == ']') {
result.append(']');
i = next;
}
continue;
}

// Convert (?:...) to (...) — only outside character classes and not escaped
if (c == '(' && i + 2 < regex.length() && regex.charAt(i + 1) == '?' && regex.charAt(i + 2) == ':') {
result.append('(');
i += 2;
continue;
}

result.append(c);
}

// Strip trailing $ anchor if the last character is an unescaped $
if (result.length() > 0 && result.charAt(result.length() - 1) == '$') {
// Count preceding backslashes — odd means $ is escaped, even means $ is an anchor
int backslashes = 0;
for (int i = result.length() - 2; i >= 0 && result.charAt(i) == '\\'; i--) {
backslashes++;
}
if (backslashes % 2 == 0) {
result.deleteCharAt(result.length() - 1);
}
}

return result.toString();
}

/**
* initialize the random instance used with a seed value to generate a
* pseudo random suite of strings based on the passed seed and matches the used regular expression
Expand Down Expand Up @@ -365,25 +463,49 @@ public String random(int minLength, int maxLength) {
targetLength = actualMinLength + random.nextInt(actualMaxLength - actualMinLength + 1);
}

String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength);
String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength, isInfinite() ? new AttemptBudget() : null);
// Substring in case a length of 'maxLength + 1' is returned, which is possible if a smaller string can't be produced.
return result.substring(0, Math.min(maxLength, result.length()));
}

/**
 * Shared, mutable attempt counter. One instance is passed by reference through the
 * recursive calls of {@link #prepareRandom} so that every level draws from the same
 * allowance, bounding the total work done for infinite regexes.
 */
private static class AttemptBudget {
    /** Number of attempts after which the budget counts as used up. */
    private static final int MAX_ATTEMPTS = 1000;

    /** Attempts recorded so far. */
    int count = 0;

    /** Records one more attempt. */
    void increment() {
        count += 1;
    }

    /** @return {@code true} once the recorded attempts have reached the limit. */
    boolean isExhausted() {
        return MAX_ATTEMPTS <= count;
    }
}

/**
* Recursive function used to generate a regex as defined by {@link Generex#random(int, int)}.
*
* @param currentMatch A string built from the accumulation of previous transitions.
* @param state Current state of the regex.
* @param minLength Minimum wanted length of the produced string.
* @param maxLength Maximum wanted length of produced string.
* @param targetLength The desired length of the produced string, pre-selected uniformly from the valid range.
* @param budget Shared attempt counter to limit recursion for infinite regexes, or {@code null} for finite regexes.
* @return A string built from the accumulation of previous transitions.
*/
private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength) {
private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength, AttemptBudget budget) {

// Return a string of length 'maxLength + 1' to indicate a dead branch.
if (currentMatch.length() > maxLength || state.getTransitions().isEmpty()) return currentMatch;

// For infinite regexes, the automaton has cycles that can cause exponential recursion.
// This budget limit caps total recursive iterations to prevent hanging.
if (budget != null && budget.isExhausted()) return currentMatch;

String returnValue = null;

if (state.isAccept()) {
Expand All @@ -400,22 +522,28 @@ private String prepareRandom(String currentMatch, State state, int minLength, in

// Will never start as empty due to the initial if statement in the function.
while (!possibleTransitions.isEmpty()) {
if (budget != null) {
budget.increment();
if (budget.isExhausted()) break;
}

Transition randomTransition = pickRandomWeightedTransition(possibleTransitions, totalWeightedTransitions);
int subTransitions = getWeightedTransitions(randomTransition);
totalWeightedTransitions -= subTransitions;
possibleTransitions.remove(randomTransition);

char randomChar = (char) (random.nextInt(subTransitions) + randomTransition.getMin());
String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength);
String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength, budget);

// Greedily return the first valid result found that is of the wanted length..
if (result.length() == targetLength) return result;

returnValue = getBestMatch(result, returnValue, minLength, maxLength, targetLength);
}

return returnValue;
// For infinite regexes, if budget was exhausted before reaching an accept state, return currentMatch
// as a fallback instead of null.
return returnValue != null ? returnValue : currentMatch;
}

/**
Expand Down
86 changes: 86 additions & 0 deletions src/test/kotlin/com/pkware/generex/KotlinTests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,83 @@ class KotlinTests {
assertThat(generated.length).isEqualTo(targetLength)
}

@ParameterizedTest
@MethodSource("infiniteRegexArgs")
fun `infinite regex does not hang`(regex: String) {
    val generator = Generex(regex)
    // Ten independent draws; each one has to come back and match the source pattern.
    for (attempt in 1..10) {
        assertThat(generator.random()).matches(regex)
    }
}

@Test
fun `anchors are stripped from regex`() {
    val anchoredPattern = "^[A-Za-z]+$"
    val sample = Generex(anchoredPattern).random()

    assertThat(sample).matches(anchoredPattern)
    // Neither anchor character may leak into the generated text as a literal.
    assertThat(sample).doesNotContain("^")
    assertThat(sample).doesNotContain("$")
}

@Test
fun `non-capturing groups are converted to plain groups`() {
    val pattern = "(?:abc)+"
    val sample = Generex(pattern).random()
    // The generated text must still satisfy the original non-capturing form.
    assertThat(sample).matches(pattern)
}

@Test
fun `escaped dollar sign at end is not stripped`() {
    // The pattern ends in backslash + dollar, i.e. a literal dollar sign.
    val pattern = "abc\\$"
    val sample = Generex(pattern).random()
    assertThat(sample).isEqualTo("abc$")
}

@Test
fun `escaped caret at start is not stripped`() {
    // The pattern starts with backslash + caret, i.e. a literal caret.
    val pattern = "\\^abc"
    val sample = Generex(pattern).random()
    assertThat(sample).isEqualTo("^abc")
}

@Test
fun `non-capturing group conversion skipped when escaped`() {
    // The opening parenthesis is escaped, so "(?:" here is not a group opener.
    val generator = Generex("\\(?:")
    val sample = generator.random()
    assertThat(sample).contains(":")
}

@Test
fun `non-capturing group conversion skipped inside character class`() {
    // "(?:" appears only as members of a character class here.
    val pattern = "[(?:]"
    val sample = Generex(pattern).random()
    assertThat(sample).matches(pattern)
}

@Test
fun `non-capturing group conversion skipped when closing bracket is first char in character class`() {
    // The first "]" directly follows "[", so the class continues through "(?:".
    val pattern = "[](?:]"
    val sample = Generex(pattern).random()
    assertThat(sample).matches(pattern)
}

@Test
fun `non-capturing group conversion skipped when closing bracket is first char in negated character class`() {
    // The first "]" directly follows "[^", so the class continues through "(?:".
    val pattern = "[^](?:]"
    val sample = Generex(pattern).random()
    assertThat(sample).matches(pattern)
}

@Test
fun `escaped backslash before dollar sign is not stripped`() {
    // The pattern is "hello", an escaped backslash, then "$"; the expected output keeps
    // the single literal backslash and has no dollar sign.
    val generator = Generex("hello\\\\$")
    val sample = generator.random()
    assertThat(sample).isEqualTo("hello\\")
}

companion object {

@JvmStatic
Expand Down Expand Up @@ -234,6 +311,15 @@ class KotlinTests {
Arguments.of("\\d{1,10}"),
)

@JvmStatic
fun infiniteRegexArgs() = listOf(
    "^[A-Za-z]+(?:[ '-][A-Za-z]+)*$",
    "[A-Za-z]+([ '-][A-Za-z]+)*",
    "(\\d{1,3}\\.){1,}\\d{1,3}",
    "[A-Z][a-z]*( [A-Z][a-z]*)*",
    "(a|b)+(c|d)*",
).map { pattern -> Arguments.of(pattern) }.stream()

@JvmStatic
fun regexExceedsColumnValue() = Stream.of(
Arguments.of("(hi){3,5}", 7),
Expand Down
Loading