Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion src/regex/dfa.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,21 @@
use std::collections::{HashMap, HashSet};
use std::fmt;
use std::time::Instant;

use super::sparse::SparseSet;
use super::Error;
use super::Inst;

const STATE_LIMIT: usize = 1_000; // currently at least 2MB >_<

/// Number of inner-loop transitions between deadline checks.
///
/// `Instant::now()` on Linux costs ~20ns; checking once per state-byte would
/// be up to ~256,000 clock reads (~5ms) for a `STATE_LIMIT=1000` build
/// (1,000 states x 256 byte values). Batching by `DEADLINE_CHECK_INTERVAL`
/// cuts that to a few hundred clock reads and keeps the overhead under 1%.
///
/// The deadline can be overshot by at most the time needed to run
/// `DEADLINE_CHECK_INTERVAL` transitions.
/// NOTE(review): this was previously described as "a few hundred
/// microseconds"; that only holds if a single transition costs well under a
/// microsecond -- confirm with a measurement on a pathological pattern.
const DEADLINE_CHECK_INTERVAL: usize = 1024;

pub struct DfaBuilder {
dfa: Dfa,
cache: HashMap<Vec<usize>, usize>,
Expand Down Expand Up @@ -34,13 +43,21 @@ impl DfaBuilder {
}
}

pub fn build(mut self) -> Result<Dfa, Error> {
/// Builds the DFA without any wall-clock limit.
///
/// Delegates to `build_with_deadline`, passing `None` as the deadline.
pub fn build(self) -> Result<Dfa, Error> {
    let no_deadline = None;
    self.build_with_deadline(no_deadline)
}

pub fn build_with_deadline(
mut self,
deadline: Option<Instant>,
) -> Result<Dfa, Error> {
let mut cur = SparseSet::new(self.dfa.insts.len());
let mut next = SparseSet::new(self.dfa.insts.len());

self.dfa.add(&mut cur, 0);
let mut states = vec![self.cached_state(&cur).unwrap()];
let mut seen = HashSet::new();
let mut transitions_since_check: usize = 0;
while let Some(s) = states.pop() {
for b in 0..256 {
let ns = self.run_state(&mut cur, &mut next, s, b as u8);
Expand All @@ -53,6 +70,15 @@ impl DfaBuilder {
if self.dfa.states.len() > STATE_LIMIT {
return Err(Error::TooManyStates(STATE_LIMIT));
}
transitions_since_check += 1;
if transitions_since_check >= DEADLINE_CHECK_INTERVAL {
transitions_since_check = 0;
if let Some(d) = deadline {
if Instant::now() >= d {
return Err(Error::DeadlineExceeded);
}
}
}
}
}
Ok(self.dfa)
Expand Down
13 changes: 13 additions & 0 deletions src/regex/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,15 @@ pub enum Error {
///
/// This restriction may be lifted in the future.
NoBytes,
/// The configured wall-clock deadline was reached during DFA
/// construction.
///
/// Returned only by APIs that accept an explicit deadline (such as
/// [`Regex::new_with_deadline`](crate::Regex::new_with_deadline)).
/// The deadline is checked only while the DFA is being built; parsing the
/// pattern and compiling it to instructions are not interrupted.
/// Callers can use this to bound the latency of compiling adversarial
/// patterns whose NFA-to-DFA expansion is expensive but still finite
/// under the existing `STATE_LIMIT`/size-limit guards.
DeadlineExceeded,
}

impl From<regex_syntax::Error> for Error {
Expand Down Expand Up @@ -77,6 +86,10 @@ impl fmt::Display for Error {
(hopefully temporary)."
),
NoBytes => write!(f, "Byte literals are not allowed."),
DeadlineExceeded => write!(
f,
"Regex compilation deadline exceeded."
),
}
}
}
Expand Down
97 changes: 96 additions & 1 deletion src/regex/mod.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use crate::Automaton;
use regex_syntax;
use std::fmt;
use std::time::Instant;

mod compile;
mod dfa;
Expand Down Expand Up @@ -81,10 +82,39 @@ impl Regex {
Regex::with_size_limit(10 * (1 << 20), re)
}

/// Create a new regular expression query with a wall-clock deadline on
/// compilation.
///
/// Identical to [`Regex::new`] except that the NFA-to-DFA construction
/// will periodically check `Instant::now()` and abort with
/// [`Error::DeadlineExceeded`] if `deadline` has passed.
///
/// This is useful for hardening services that accept user-supplied regex
/// patterns: pathological inputs such as `\w{0,5}\w{0,5}\w{0,5}\w{0,5}`
/// can take multiple seconds to compile even though they ultimately fail
/// the existing `STATE_LIMIT` / size-limit guards. The existing limits
/// bound *memory* but not *wall-clock time*; a deadline closes that gap.
///
/// The check is batched (every ~1024 transitions) so the overhead is well
/// under 1% on well-behaved patterns. Worst-case overshoot of `deadline`
/// is on the order of a few hundred microseconds.
#[inline]
pub fn new_with_deadline(re: &str, deadline: Instant) -> Result<Regex, Error> {
Regex::with_size_limit_and_deadline(10 * (1 << 20), re, Some(deadline))
}

fn with_size_limit(size: usize, re: &str) -> Result<Regex, Error> {
Regex::with_size_limit_and_deadline(size, re, None)
}

fn with_size_limit_and_deadline(
size: usize,
re: &str,
deadline: Option<Instant>,
) -> Result<Regex, Error> {
let hir = regex_syntax::Parser::new().parse(re)?;
let insts = self::compile::Compiler::new(size).compile(&hir)?;
let dfa = self::dfa::DfaBuilder::new(insts).build()?;
let dfa = self::dfa::DfaBuilder::new(insts).build_with_deadline(deadline)?;
Ok(Regex {
original: re.to_owned(),
dfa,
Expand Down Expand Up @@ -134,3 +164,68 @@ impl fmt::Debug for Inst {
}
}
}

#[cfg(test)]
mod tests {
    use super::{Error, Regex};
    use std::time::{Duration, Instant};

    /// A pattern whose NFA-to-DFA expansion is known to take multiple seconds
    /// before the existing `STATE_LIMIT=1000` guard kicks in.
    ///
    /// Documented timings (release build, x86_64 Linux):
    /// `\w{0,5}\w{0,5}\w{0,5}\w{0,5}` ~ 2.3 s, terminating in
    /// `Error::TooManyStates(1000)`. Without a deadline, this is a clean DoS
    /// vector for any service that compiles user-supplied regex patterns on a
    /// request thread.
    const SLOW_PATTERN: &str = r"\w{0,5}\w{0,5}\w{0,5}\w{0,5}";

    /// The slow pattern must be cut short by a 50 ms deadline instead of
    /// running until the state limit is hit.
    #[test]
    fn deadline_aborts_pathological_pattern() {
        // Derive the deadline from the same instant the timer starts at, so
        // `elapsed` and the deadline share one time origin.
        let t0 = Instant::now();
        let deadline = t0 + Duration::from_millis(50);
        let result = Regex::new_with_deadline(SLOW_PATTERN, deadline);
        let elapsed = t0.elapsed();
        match result {
            Err(Error::DeadlineExceeded) => {}
            // Collapse the success value to a marker string so the message
            // stays short while still printing the actual error, if any.
            other => panic!(
                "expected DeadlineExceeded, got {:?} after {:?}",
                other.map(|_| "Ok"),
                elapsed
            ),
        }
        // Worst-case overshoot: one full check interval. Allow generous
        // headroom for slow CI runners; the important property is that we
        // bail in well under the unbounded ~2.3s baseline.
        assert!(
            elapsed < Duration::from_millis(500),
            "deadline overshoot: {:?}",
            elapsed
        );
    }

    /// A generous deadline must not get in the way of a simple pattern.
    #[test]
    fn deadline_does_not_affect_easy_pattern() {
        let deadline = Instant::now() + Duration::from_secs(60);
        let regex = Regex::new_with_deadline("hello.*world", deadline)
            .expect("simple pattern should compile well within deadline");
        // Sanity check: the resulting Regex behaves like one built via
        // `Regex::new` (round-trips its source via Debug).
        let dbg = format!("{:?}", regex);
        assert!(dbg.contains("hello.*world"), "unexpected Debug: {}", dbg);
    }

    /// Backwards-compatibility smoke test: the existing API is unaffected.
    #[test]
    fn new_unchanged_for_easy_pattern() {
        Regex::new("hello.*world").expect("simple pattern must compile");
    }

    /// The new variant must not collapse into TooManyStates / Syntax in its
    /// Display or Debug output.
    #[test]
    fn deadline_exceeded_error_is_distinct() {
        let err = Error::DeadlineExceeded;
        assert_eq!(err.to_string(), "Regex compilation deadline exceeded.");
        let dbg = format!("{:?}", err);
        assert_eq!(dbg, "DeadlineExceeded");
    }
}