Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion gradle.properties
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
kotlin.code.style=official
generexVersion=1.3.0
generexVersion=1.4.0-SNAPSHOT

POM_ARTIFACT_ID=generex
POM_NAME=Generex
Expand Down
111 changes: 87 additions & 24 deletions src/main/java/com/pkware/generex/Generex.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import dk.brics.automaton.Transition;
import org.jetbrains.annotations.NotNull;

import java.util.ArrayDeque;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
Expand Down Expand Up @@ -62,15 +63,23 @@ public class Generex implements Iterable<String> {
private boolean isTransactionNodeBuilt;

/**
* Determined possible minimum and maximum length of a regex by traversing the
* Automaton tree using depth first search.
* Minimum length of any string this regex accepts. Populated on first access by
* {@link #calculateLengthBounds()}; {@code null} until then (doubles as the cache-populated flag).
*/
private Integer cachedMinLength;

/**
* The regex's own upper bound on generated string length. Populated on first access by
* {@link #calculateLengthBounds()}. For infinite regexes this is {@link Integer#MAX_VALUE}
* (no natural cap) so that {@code Math.min(userMax, cachedMaxLength)} collapses to the user's
* value. Callers that need a default when the user supplied no max should use
* {@link #DEFAULT_INFINITE_MAX_LENGTH} instead for infinite regexes.
*/
private Integer cachedMaxLength;

/**
* The maximum length a produced string for an infinite regex if {@link #random(int, int)} hasn't been given a max
* length other than {@link Integer#MAX_VALUE}.
* Fallback maximum length used by {@link #random(int)} (and overloads that delegate to it)
* when the regex is infinite and the caller did not supply their own {@code maxLength}.
*/
public static final int DEFAULT_INFINITE_MAX_LENGTH = 50;

Expand Down Expand Up @@ -423,8 +432,10 @@ public String random() {
*/
public String random(int minLength) {
// Make sure cachedMinLength / cachedMaxLength are populated before cachedMaxLength is read below.
calculateLengthBounds();
// NOTE(review): this text is a rendered diff without +/- markers. The next two lines look like the
// pre-change form of the final two statements of this method (only the local's name differs) —
// confirm against the repository before treating both pairs as live code.
int actualMaxLength = isInfinite() ? DEFAULT_INFINITE_MAX_LENGTH : cachedMaxLength;
return random(minLength, actualMaxLength);
// cachedMaxLength is Integer.MAX_VALUE for infinite regexes; fall back to the friendlier
// default (DEFAULT_INFINITE_MAX_LENGTH) since the caller didn't specify their own cap.
int defaultMaxLength = isInfinite() ? DEFAULT_INFINITE_MAX_LENGTH : cachedMaxLength;
// Delegate to the two-argument overload with the chosen upper bound.
return random(minLength, defaultMaxLength);
}

/**
Expand All @@ -451,9 +462,10 @@ public String random(int minLength) {
public String random(int minLength, int maxLength) {
calculateLengthBounds();

// Calculate actual valid range by comparing the regex and the user defined bounds
// Calculate actual valid range by comparing the regex and the user defined bounds.
// For infinite regexes cachedMaxLength is Integer.MAX_VALUE, so the min() leaves maxLength alone.
int actualMinLength = Math.max(minLength, cachedMinLength);
int actualMaxLength = Math.min(maxLength, isInfinite() ? maxLength : cachedMaxLength);
int actualMaxLength = Math.min(maxLength, cachedMaxLength);

// Pre-select target length uniformly from valid range
int targetLength;
Expand Down Expand Up @@ -578,45 +590,96 @@ private String getBestMatch(String newMatch, String currentMatch, int min, int m
}

/**
* Calculate the possible bounds of the generated string by traversing the regex
* Calculate the possible bounds of the generated string by traversing the regex.
* <br>
* For finite automatons, both {@code cachedMinLength} and {@code cachedMaxLength} are populated
* from the DFS. For infinite automatons, {@code cachedMinLength} is computed from a BFS to the
* nearest accepting state, and {@code cachedMaxLength} is set to {@link Integer#MAX_VALUE}
* (meaning "no natural upper bound").
*/
private void calculateLengthBounds() {
// cachedMinLength is null until the bounds have been computed, so a non-null value means there is
// nothing left to do.
if (cachedMinLength != null) return;

// NOTE(review): this text is a rendered diff without +/- markers. The next three lines (the
// HashSet-based call) appear to be the pre-change body that the if/else below replaces —
// confirm against the repository.
int[] bounds = dfsLengthBounds(automaton.getInitialState(), new HashSet<>());
cachedMinLength = bounds[0];
cachedMaxLength = bounds[1];
// Finite automaton: a single DFS from the initial state yields both the minimum and the maximum.
if (automaton.isFinite()) {
int[] bounds = dfsLengthBounds(automaton.getInitialState(), new HashMap<>());
cachedMinLength = bounds[0];
cachedMaxLength = bounds[1];
} else {
// Infinite automaton: the minimum comes from a BFS to the nearest accepting state, and
// Integer.MAX_VALUE is stored as the maximum to mean "no upper bound".
cachedMinLength = bfsMinLength(automaton.getInitialState());
cachedMaxLength = Integer.MAX_VALUE;
}
}

/**
* Uses a depth first search to calculate the minimum and maximum length of the regex by
* traversing through the automaton tree.
* Uses a memoized depth first search to calculate the minimum and maximum length of the regex
* by traversing through the automaton.
* <br>
* We can use DFS because the automaton is finite (does not contain infinite loops) and
* we need to visit every state regardless to determine the longest length.
* Assumes the automaton is finite (acyclic). Under that assumption each state's bounds depend
* only on the state itself, so results can be cached in {@code memo}. Without memoization,
* automatons shaped like a chain of states with multiple parallel transitions (e.g.
* {@code [a-zA-Z0-9]{1,100}}, which determinizes to ~3 range-transitions per state) would be
* explored along every path — exponential in the chain length. Memoization makes this linear
* in the number of states.
*
* @param state the current state of the automaton.
* @param visited the set of visited states.
* @param memo cached bounds for states whose subtree has already been computed.
* @return an int array containing the minimum and maximum length of the regex.
*/
// NOTE(review): this text is a rendered diff without +/- markers. The Set<State>-based signature and
// the statements that touch `visited` appear to be the pre-change implementation; the Map-based
// signature and the statements that touch `memo` appear to be its replacement — confirm against
// the repository.
private int[] dfsLengthBounds(State state, Set<State> visited) {
if (visited.contains(state)) return new int[]{Integer.MAX_VALUE, 0};
private int[] dfsLengthBounds(State state, Map<State, int[]> memo) {
// Reuse the bounds if this state has already been fully explored.
int[] cached = memo.get(state);
if (cached != null) return cached;

// An accepting state can end the string here (length 0); otherwise no accepting path is known yet
// and Integer.MAX_VALUE stands in for "none found".
int minLength = state.isAccept() ? 0 : Integer.MAX_VALUE;
int maxLength = 0;

visited.add(state);

for (Transition transition : state.getTransitions()) {
int[] bounds = dfsLengthBounds(transition.getDest(), visited);
int[] bounds = dfsLengthBounds(transition.getDest(), memo);
// A child minimum of Integer.MAX_VALUE means no accepting state was found below it; skipping it
// also avoids overflowing the int when adding 1.
if (bounds[0] != Integer.MAX_VALUE) {
minLength = Math.min(minLength, bounds[0] + 1);
}
// The longest path through this transition is the child's longest path plus this one step.
// NOTE(review): unlike minLength, this also counts transitions into states from which no
// accepting state was found; presumably the automaton contains no such dead states — confirm.
maxLength = Math.max(maxLength, bounds[1] + 1);
}

visited.remove(state);
return new int[]{minLength, maxLength};
int[] result = {minLength, maxLength};
// Store the result so later visits to this state return it without re-exploring.
memo.put(state, result);
return result;
}

/**
 * Finds the length of the shortest string accepted when starting from {@code initial}.
 * <br>
 * The automaton is explored breadth first, one transition-distance at a time, so the first
 * accepting state encountered is also the nearest one. Every state is enqueued at most once,
 * which keeps the search terminating on cyclic (infinite) automatons — the case this method is
 * called for by {@link #calculateLengthBounds()}.
 *
 * @param initial the state to search from.
 * @return the number of transitions to the nearest accepting state: {@code 0} when
 *         {@code initial} is itself accepting, or {@link Integer#MAX_VALUE} when no accepting
 *         state can be reached at all.
 */
private int bfsMinLength(State initial) {
    Set<State> seen = new HashSet<>();
    ArrayDeque<State> frontier = new ArrayDeque<>();

    seen.add(initial);
    frontier.add(initial);

    for (int distance = 0; !frontier.isEmpty(); distance++) {
        // Only the states already queued belong to this distance; anything added while
        // draining them is one transition further away and is handled on the next pass.
        int statesAtThisDistance = frontier.size();
        for (int i = 0; i < statesAtThisDistance; i++) {
            State current = frontier.poll();
            if (current.isAccept()) {
                return distance;
            }
            for (Transition edge : current.getTransitions()) {
                State target = edge.getDest();
                // Set.add returns false for states that were already discovered.
                if (seen.add(target)) {
                    frontier.add(target);
                }
            }
        }
    }

    // The whole reachable graph was explored without meeting an accepting state.
    return Integer.MAX_VALUE;
}

/**
Expand Down
24 changes: 24 additions & 0 deletions src/test/kotlin/com/pkware/generex/KotlinTests.kt
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,24 @@ class KotlinTests {
assertThat(generex.random()).isEqualTo("hello\\")
}

/**
 * Draws 100 random strings from [regex], checks that each one matches it, and requires the
 * mean wall-clock time per `random()` call to stay below 100 ms.
 */
@ParameterizedTest
@MethodSource("longRegexes")
fun `generating from long bounded regexes does not hang`(regex: String) {
    val generex = Generex(regex)

    // One elapsed-time sample (in nanoseconds) per generated string.
    val elapsedNanos = LongArray(100) {
        val startedAt = System.nanoTime()
        val generated = generex.random()
        val elapsed = System.nanoTime() - startedAt

        assertThat(generated).matches(regex)
        elapsed
    }

    val meanMillis = elapsedNanos.average() / 1_000_000
    assertThat(meanMillis).isLessThan(100.0)
}

companion object {

@JvmStatic
Expand Down Expand Up @@ -326,5 +344,11 @@ class KotlinTests {
Arguments.of("aaa", 2),
Arguments.of("a{5,10}", 2),
)

/** Supplies the regexes for the long-bounded-regex test: `[a-zA-Z0-9]` repeated 1 to 100 and 1 to 200 times. */
@JvmStatic
fun longRegexes(): Stream<Arguments> =
    Stream.of(
        "[a-zA-Z0-9]{1,100}",
        "[a-zA-Z0-9]{1,200}",
    ).map { Arguments.of(it) }
}
}
Loading