From 6fab8c4ca177362c17b195f449a3ea06edc764eb Mon Sep 17 00:00:00 2001 From: Mehak Nagpal Date: Fri, 13 Mar 2026 18:57:34 +0530 Subject: [PATCH] DSM-3194 RDBMS | Masking | When certain regex is entered in Regular Expression masking and we generate samples or save, the UI freezes. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Issue - The regex used ^[A-Za-z]+(?:[ '-][A-Za-z]+)*$ is an infinite regex and it causes prepareRandom to hang due to exhaustive backtracking in the algorithm. Cause - The prepareRandom() method uses exhaustive backtracking to find a string of an exact target length. For infinite regexes (those with cycles like +, *), when the first random path doesn't produce the exact target length, the algorithm tries all remaining transitions at every level. With states that have multiple transitions (e.g., [A-Za-z] has 2 transitions covering 52 characters, looping to the same state), this creates exponential exploration — 2^N paths for depth N. For a target length of 50, that's 2^50 paths, causing the method to hang indefinitely. Resolution: Added a global attempt counter (MAX_RANDOM_ATTEMPTS = 1000) that limits the total recursive exploration, applied only to infinite regexes. Finite regexes are unaffected since their search space is naturally bounded. When the budget is exhausted, the algorithm returns the best valid match found so far. Each call is O(1000) worst case instead of O(2^N). Additional changes: 1. Strip ^ and $ anchors: The brics automaton library (that Generex uses) does not support ^ and $ as anchors — it treats them as literal characters. This means a regex like ^[A-Za-z]+$ would generate strings like ^aBcDe$ with literal ^ and $ in the output. Added convertToBricsRegex preprocessing to strip ^ and $ anchors in createRegExp() before passing the regex to brics. 2. Convert non-capturing groups (?:...) to (...): The brics library does not support non-capturing group syntax. 
Without this conversion, (?:abc) would be interpreted as literal ?, :, a, b, c characters inside a group. Since Generex only generates strings and never extracts capture groups, converting (?: to ( is a lossless transformation — the grouping behavior is identical for generation purposes. --- src/main/java/com/pkware/generex/Generex.java | 138 +++++++++++++++++- .../kotlin/com/pkware/generex/KotlinTests.kt | 86 +++++++++++ 2 files changed, 219 insertions(+), 5 deletions(-) diff --git a/src/main/java/com/pkware/generex/Generex.java b/src/main/java/com/pkware/generex/Generex.java index 81925c3..ef1939f 100644 --- a/src/main/java/com/pkware/generex/Generex.java +++ b/src/main/java/com/pkware/generex/Generex.java @@ -22,6 +22,7 @@ import dk.brics.automaton.RegExp; import dk.brics.automaton.State; import dk.brics.automaton.Transition; +import org.jetbrains.annotations.NotNull; import java.util.ArrayList; import java.util.Collection; @@ -118,13 +119,110 @@ public Generex(Automaton automaton, Random random) { * @see #isValidPattern(String) */ private static RegExp createRegExp(String regex) { - String finalRegex = regex; + String finalRegex = convertToBricsRegex(regex); for (Entry charClass : PREDEFINED_CHARACTER_CLASSES.entrySet()) { finalRegex = finalRegex.replaceAll(charClass.getKey(), charClass.getValue()); } return new RegExp(finalRegex); } + /** + * Converts a regex pattern to brics-compatible syntax for use with Generex. + * + *

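<p>Performs the following transformations:
+     * <ul>
+     *   <li>Strips a leading {@code ^} anchor and a trailing unescaped {@code $} anchor, which brics would otherwise treat as literal characters.</li>
+     *   <li>Converts non-capturing groups {@code (?:...)} to plain groups {@code (...)}, which brics does not support; escaped parentheses and character-class contents are left unchanged.</li>
+     * </ul>
+     *
+     * <p>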
The conversion is performed in a single pass that tracks escape sequences and character
+     * class boundaries to avoid incorrect replacements.
+     *
+     * @param regex The Java regex pattern to convert.
+     * @return the brics-compatible regex.
+     */
+    @NotNull
+    private static String convertToBricsRegex(@NotNull String regex) {
+        if (regex.isEmpty()) return regex;
+
+        StringBuilder result = new StringBuilder(regex.length());
+        boolean escaped = false;
+        boolean inCharClass = false;
+        int start = 0;
+
+        // Strip leading ^ anchor (not escaped since it's the first character)
+        if (regex.charAt(0) == '^') {
+            start = 1;
+        }
+
+        for (int i = start; i < regex.length(); i++) {
+            char c = regex.charAt(i);
+
+            if (escaped) {
+                result.append(c);
+                escaped = false;
+                continue;
+            }
+
+            if (c == '\\') {
+                result.append(c);
+                escaped = true;
+                continue;
+            }
+
+            if (inCharClass) {
+                if (c == ']') inCharClass = false;
+                result.append(c);
+                continue;
+            }
+
+            if (c == '[') {
+                inCharClass = true;
+                result.append(c);
+                int next = i + 1;
+                // Per regex standard, ] right after [ or [^ is a literal ] inside the class, not the closing bracket.
+                if (next < regex.length() && regex.charAt(next) == '^') {
+                    result.append('^');
+                    i = next++; // '^' is consumed here; without advancing i the main loop would append it again ("[^a]" -> "[^^a]")
+                }
+                if (next < regex.length() && regex.charAt(next) == ']') {
+                    result.append(']');
+                    i = next;
+                }
+                continue;
+            }
+
+            // Convert (?:...) to (...) — only outside character classes and not escaped
+            if (c == '(' && i + 2 < regex.length() && regex.charAt(i + 1) == '?' && regex.charAt(i + 2) == ':') {
+                result.append('(');
+                i += 2;
+                continue;
+            }
+
+            result.append(c);
+        }
+
+        // Strip trailing $ anchor if the last character is an unescaped $
+        if (result.length() > 0 && result.charAt(result.length() - 1) == '$') {
+            // Count preceding backslashes — odd means $ is escaped, even means $ is an anchor
+            int backslashes = 0;
+            for (int i = result.length() - 2; i >= 0 && result.charAt(i) == '\\'; i--) {
+                backslashes++;
+            }
+            if (backslashes % 2 == 0) {
+                result.deleteCharAt(result.length() - 1);
+            }
+        }
+
+        return result.toString();
+    }
+
     /**
      * initialize the random instance used with a seed value to generate a
      * pseudo random suite of strings based on the passed seed and matches the used regular expression
@@ -365,11 +463,29 @@ public String random(int minLength, int maxLength) {
             targetLength = actualMinLength + random.nextInt(actualMaxLength - actualMinLength + 1);
         }
 
-        String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength);
+        String result = prepareRandom("", automaton.getInitialState(), minLength, maxLength, targetLength, isInfinite() ? new AttemptBudget() : null);
         // Substring in case a length of 'maxLength + 1' is returned, which is possible if a smaller string can't be produced.
         return result.substring(0, Math.min(maxLength, result.length()));
     }
 
+    /**
+     * Mutable counter shared by reference across recursive calls to {@link #prepareRandom},
+     * used to cap the total number of iterations and prevent exponential backtracking
+     * for infinite regexes.
+     */
+    private static class AttemptBudget {
+        private static final int MAX_ATTEMPTS = 1000;
+        int count = 0;
+
+        boolean isExhausted() {
+            return count >= MAX_ATTEMPTS;
+        }
+
+        void increment() {
+            count++;
+        }
+    }
+
     /**
      * Recursive function used to generate a regex as defined by {@link Generex#random(int, int)}.
      *
@@ -377,13 +493,19 @@ public String random(int minLength, int maxLength) {
      * @param state Current state of the regex.
* @param minLength Minimum wanted length of the produced string. * @param maxLength Maximum wanted length of produced string. + * @param targetLength The desired length of the produced string, pre-selected uniformly from the valid range. + * @param budget Shared attempt counter to limit recursion for infinite regexes, or {@code null} for finite regexes. * @return A string built from the accumulation of previous transitions. */ - private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength) { + private String prepareRandom(String currentMatch, State state, int minLength, int maxLength, int targetLength, AttemptBudget budget) { // Return a string of length 'maxLength + 1' to indicate a dead branch. if (currentMatch.length() > maxLength || state.getTransitions().isEmpty()) return currentMatch; + // For infinite regexes, the automaton has cycles that can cause exponential recursion. + // This budget limit caps total recursive iterations to prevent hanging. + if (budget != null && budget.isExhausted()) return currentMatch; + String returnValue = null; if (state.isAccept()) { @@ -400,6 +522,10 @@ private String prepareRandom(String currentMatch, State state, int minLength, in // Will never start as empty due to the initial if statement in the function. 
while (!possibleTransitions.isEmpty()) { + if (budget != null) { + budget.increment(); + if (budget.isExhausted()) break; + } Transition randomTransition = pickRandomWeightedTransition(possibleTransitions, totalWeightedTransitions); int subTransitions = getWeightedTransitions(randomTransition); @@ -407,7 +533,7 @@ private String prepareRandom(String currentMatch, State state, int minLength, in possibleTransitions.remove(randomTransition); char randomChar = (char) (random.nextInt(subTransitions) + randomTransition.getMin()); - String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength); + String result = prepareRandom(currentMatch + randomChar, randomTransition.getDest(), minLength, maxLength, targetLength, budget); // Greedily return the first valid result found that is of the wanted length.. if (result.length() == targetLength) return result; @@ -415,7 +541,9 @@ private String prepareRandom(String currentMatch, State state, int minLength, in returnValue = getBestMatch(result, returnValue, minLength, maxLength, targetLength); } - return returnValue; + // For infinite regexes, if budget was exhausted before reaching an accept state, return currentMatch + // as a fallback instead of null. + return returnValue != null ? 
returnValue : currentMatch; } /** diff --git a/src/test/kotlin/com/pkware/generex/KotlinTests.kt b/src/test/kotlin/com/pkware/generex/KotlinTests.kt index 0d6abb7..0dfaf8e 100644 --- a/src/test/kotlin/com/pkware/generex/KotlinTests.kt +++ b/src/test/kotlin/com/pkware/generex/KotlinTests.kt @@ -188,6 +188,83 @@ class KotlinTests { assertThat(generated.length).isEqualTo(targetLength) } + @ParameterizedTest + @MethodSource("infiniteRegexArgs") + fun `infinite regex does not hang`(regex: String) { + val generex = Generex(regex) + repeat(10) { + val result = generex.random() + assertThat(result).matches(regex) + } + } + + @Test + fun `anchors are stripped from regex`() { + val regex = "^[A-Za-z]+$" + val generex = Generex(regex) + val result = generex.random() + assertThat(result).matches(regex) + assertThat(result).doesNotContain("^") + assertThat(result).doesNotContain("$") + } + + @Test + fun `non-capturing groups are converted to plain groups`() { + val regex = "(?:abc)+" + val generex = Generex(regex) + val result = generex.random() + assertThat(result).matches(regex) + } + + @Test + fun `escaped dollar sign at end is not stripped`() { + val regex = "abc\\$" + val generex = Generex(regex) + val result = generex.random() + assertThat(result).isEqualTo("abc$") + } + + @Test + fun `escaped caret at start is not stripped`() { + val regex = "\\^abc" + val generex = Generex(regex) + val result = generex.random() + assertThat(result).isEqualTo("^abc") + } + + @Test + fun `non-capturing group conversion skipped when escaped`() { + val generex = Generex("\\(?:") + assertThat(generex.random()).contains(":") + } + + @Test + fun `non-capturing group conversion skipped inside character class`() { + val generex = Generex("[(?:]") + val result = generex.random() + assertThat(result).matches("[(?:]") + } + + @Test + fun `non-capturing group conversion skipped when closing bracket is first char in character class`() { + val generex = Generex("[](?:]") + val result = 
generex.random() + assertThat(result).matches("[](?:]") + } + + @Test + fun `non-capturing group conversion skipped when closing bracket is first char in negated character class`() { + val generex = Generex("[^](?:]") + val result = generex.random() + assertThat(result).matches("[^](?:]") + } + + @Test + fun `escaped backslash before dollar sign is not stripped`() { + val generex = Generex("hello\\\\$") + assertThat(generex.random()).isEqualTo("hello\\") + } + companion object { @JvmStatic @@ -234,6 +311,15 @@ class KotlinTests { Arguments.of("\\d{1,10}"), ) + @JvmStatic + fun infiniteRegexArgs() = Stream.of( + Arguments.of("^[A-Za-z]+(?:[ '-][A-Za-z]+)*$"), + Arguments.of("[A-Za-z]+([ '-][A-Za-z]+)*"), + Arguments.of("(\\d{1,3}\\.){1,}\\d{1,3}"), + Arguments.of("[A-Z][a-z]*( [A-Z][a-z]*)*"), + Arguments.of("(a|b)+(c|d)*"), + ) + @JvmStatic fun regexExceedsColumnValue() = Stream.of( Arguments.of("(hi){3,5}", 7),