diff --git a/gradle.properties b/gradle.properties
index 7277df2..f38cfba 100644
--- a/gradle.properties
+++ b/gradle.properties
@@ -1,5 +1,5 @@
 kotlin.code.style=official
-generexVersion=1.3.0
+generexVersion=1.4.0-SNAPSHOT
 
 POM_ARTIFACT_ID=generex
 POM_NAME=Generex
diff --git a/src/main/java/com/pkware/generex/Generex.java b/src/main/java/com/pkware/generex/Generex.java
index ef1939f..31de7b5 100644
--- a/src/main/java/com/pkware/generex/Generex.java
+++ b/src/main/java/com/pkware/generex/Generex.java
@@ -24,6 +24,7 @@
 import dk.brics.automaton.Transition;
 import org.jetbrains.annotations.NotNull;
 
+import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -62,15 +63,23 @@ public class Generex implements Iterable {
     private boolean isTransactionNodeBuilt;
 
     /**
-     * Determined possible minimum and maximum length of a regex by traversing the
-     * Automaton tree using depth first search.
+     * Minimum length of any string this regex accepts. Populated on first access by
+     * {@link #calculateLengthBounds()}; {@code null} until then (doubles as the cache-populated flag).
      */
     private Integer cachedMinLength;
+
+    /**
+     * The regex's own upper bound on generated string length. Populated on first access by
+     * {@link #calculateLengthBounds()}. For infinite regexes this is {@link Integer#MAX_VALUE}
+     * (no natural cap) so that {@code Math.min(userMax, cachedMaxLength)} collapses to the user's
+     * value. Callers that need a default when the user supplied no max should use
+     * {@link #DEFAULT_INFINITE_MAX_LENGTH} instead for infinite regexes.
+     */
     private Integer cachedMaxLength;
 
     /**
-     * The maximum length a produced string for an infinite regex if {@link #random(int, int)} hasn't been given a max
-     * length other than {@link Integer#MAX_VALUE}.
+     * Fallback maximum length used by {@link #random(int)} (and overloads that delegate to it)
+     * when the regex is infinite and the caller did not supply their own {@code maxLength}.
      */
     public static final int DEFAULT_INFINITE_MAX_LENGTH = 50;
 
@@ -423,8 +432,10 @@ public String random() {
      */
     public String random(int minLength) {
         calculateLengthBounds();
-        int actualMaxLength = isInfinite() ? DEFAULT_INFINITE_MAX_LENGTH : cachedMaxLength;
-        return random(minLength, actualMaxLength);
+        // cachedMaxLength is Integer.MAX_VALUE for infinite regexes; fall back to the friendlier
+        // default since the caller didn't specify their own cap.
+        int defaultMaxLength = isInfinite() ? DEFAULT_INFINITE_MAX_LENGTH : cachedMaxLength;
+        return random(minLength, defaultMaxLength);
     }
 
     /**
@@ -451,9 +462,10 @@ public String random(int minLength) {
     public String random(int minLength, int maxLength) {
         calculateLengthBounds();
 
-        // Calculate actual valid range by comparing the regex and the user defined bounds
+        // Calculate actual valid range by comparing the regex and the user defined bounds.
+        // For infinite regexes cachedMaxLength is Integer.MAX_VALUE, so the min() leaves maxLength alone.
         int actualMinLength = Math.max(minLength, cachedMinLength);
-        int actualMaxLength = Math.min(maxLength, isInfinite() ? maxLength : cachedMaxLength);
+        int actualMaxLength = Math.min(maxLength, cachedMaxLength);
 
         // Pre-select target length uniformly from valid range
         int targetLength;
@@ -578,45 +590,96 @@ private String getBestMatch(String newMatch, String currentMatch, int min, int m
     }
 
     /**
-     * Calculate the possible bounds of the generated string by traversing the regex
+     * Calculate the possible bounds of the generated string by traversing the regex.
+     * <p>
+     * For finite automatons, both {@code cachedMinLength} and {@code cachedMaxLength} are populated
+     * from the DFS. For infinite automatons, {@code cachedMinLength} is computed from a BFS to the
+     * nearest accepting state, and {@code cachedMaxLength} is set to {@link Integer#MAX_VALUE}
+     * (meaning "no natural upper bound").
      */
     private void calculateLengthBounds() {
         if (cachedMinLength != null) return;
 
-        int[] bounds = dfsLengthBounds(automaton.getInitialState(), new HashSet<>());
-        cachedMinLength = bounds[0];
-        cachedMaxLength = bounds[1];
+        if (automaton.isFinite()) {
+            int[] bounds = dfsLengthBounds(automaton.getInitialState(), new HashMap<>());
+            cachedMinLength = bounds[0];
+            cachedMaxLength = bounds[1];
+        } else {
+            cachedMinLength = bfsMinLength(automaton.getInitialState());
+            cachedMaxLength = Integer.MAX_VALUE;
+        }
     }
 
     /**
-     * Uses a depth first search to calculate the minimum and maximum length of the regex by
-     * traversing through the automaton tree.
+     * Uses a memoized depth first search to calculate the minimum and maximum length of the regex
+     * by traversing the automaton.
      * <p>
-     * We can use DFS because the automaton is finite (does not contain infinite loops) and
-     * we need to visit every state regardless to determine the longest length.
+     * Assumes the automaton is finite (acyclic). Under that assumption each state's bounds depend
+     * only on the state itself, so results can be cached in {@code memo}. Without memoization,
+     * automatons shaped like a chain of states with multiple parallel transitions (e.g.
+     * {@code [a-zA-Z0-9]{1,100}}, which determinizes to ~3 range-transitions per state) would be
+     * explored along every path — exponential in the chain length. Memoization makes this linear
+     * in the number of states.
      *
      * @param state the current state of the automaton.
-     * @param visited the set of visited states.
+     * @param memo cached bounds for states whose subtree has already been computed.
      * @return an int array containing the minimum and maximum length of the regex.
      */
-    private int[] dfsLengthBounds(State state, Set<State> visited) {
-        if (visited.contains(state)) return new int[]{Integer.MAX_VALUE, 0};
+    private int[] dfsLengthBounds(State state, Map<State, int[]> memo) {
+        int[] cached = memo.get(state);
+        if (cached != null) return cached;
 
         int minLength = state.isAccept() ? 0 : Integer.MAX_VALUE;
         int maxLength = 0;
 
-        visited.add(state);
-
         for (Transition transition : state.getTransitions()) {
-            int[] bounds = dfsLengthBounds(transition.getDest(), visited);
+            int[] bounds = dfsLengthBounds(transition.getDest(), memo);
             if (bounds[0] != Integer.MAX_VALUE) {
                 minLength = Math.min(minLength, bounds[0] + 1);
             }
             maxLength = Math.max(maxLength, bounds[1] + 1);
         }
 
-        visited.remove(state);
-        return new int[]{minLength, maxLength};
+        int[] result = {minLength, maxLength};
+        memo.put(state, result);
+        return result;
+    }
+
+    /**
+     * Computes the minimum length of any string the automaton accepts, via a breadth-first search
+     * from {@code initial} to the nearest accepting state.
+     * <p>
+     * Used for infinite (cyclic) automatons where the acyclic-memoized DFS assumption does not
+     * hold. Returns {@code 0} if {@code initial} itself is accepting. Returns
+     * {@link Integer#MAX_VALUE} if no accepting state is reachable (not expected for a valid regex).
+     *
+     * @param initial the state to search from.
+     * @return the shortest number of transitions needed to reach an accepting state.
+     */
+    private int bfsMinLength(State initial) {
+        Set<State> visited = new HashSet<>();
+        ArrayDeque<State> currentLevel = new ArrayDeque<>();
+        ArrayDeque<State> nextLevel = new ArrayDeque<>();
+
+        currentLevel.add(initial);
+        visited.add(initial);
+
+        int depth = 0;
+        while (!currentLevel.isEmpty()) {
+            for (State state : currentLevel) {
+                if (state.isAccept()) return depth;
+                for (Transition transition : state.getTransitions()) {
+                    State dest = transition.getDest();
+                    if (visited.add(dest)) nextLevel.add(dest);
+                }
+            }
+            ArrayDeque<State> tmp = currentLevel;
+            currentLevel = nextLevel;
+            nextLevel = tmp;
+            nextLevel.clear();
+            depth++;
+        }
+        return Integer.MAX_VALUE;
     }
 
     /**
diff --git a/src/test/kotlin/com/pkware/generex/KotlinTests.kt b/src/test/kotlin/com/pkware/generex/KotlinTests.kt
index 0dfaf8e..7600639 100644
--- a/src/test/kotlin/com/pkware/generex/KotlinTests.kt
+++ b/src/test/kotlin/com/pkware/generex/KotlinTests.kt
@@ -265,6 +265,24 @@ class KotlinTests {
         assertThat(generex.random()).isEqualTo("hello\\")
     }
 
+    @ParameterizedTest
+    @MethodSource("longRegexes")
+    fun `generating from long bounded regexes does not hang`(regex: String) {
+        val generex = Generex(regex)
+        val times = mutableListOf<Long>()
+
+        repeat(100) {
+            val start = System.nanoTime()
+            val result = generex.random()
+            times.add(System.nanoTime() - start)
+
+            assertThat(result).matches(regex)
+        }
+
+        val averageMs = times.average() / 1_000_000
+        assertThat(averageMs).isLessThan(100.0)
+    }
+
     companion object {
 
         @JvmStatic
@@ -326,5 +344,11 @@ class KotlinTests {
             Arguments.of("aaa", 2),
             Arguments.of("a{5,10}", 2),
         )
+
+        @JvmStatic
+        fun longRegexes() = Stream.of(
+            Arguments.of("[a-zA-Z0-9]{1,100}"),
+            Arguments.of("[a-zA-Z0-9]{1,200}"),
+        )
     }
 }