Loading page…
Rust walkthroughs
Loading page…
regex::RegexBuilder::size_limit for complex patterns? size_limit controls the maximum memory (in bytes) that the regex engine can allocate during compilation. The regex crate compiles patterns into a finite state machine (DFA/NFA), and complex patterns — especially those with many alternations, repetitions, or character classes — can cause exponential state explosion. Without a limit, malicious or pathological patterns could consume all available memory during compilation. The default limit is conservative (~10 MB for the meta engine), balancing safety with functionality. Raising the limit allows more complex patterns to compile but exposes your application to memory exhaustion attacks; lowering it provides stronger protection but may reject legitimate patterns.
use regex::RegexBuilder;
fn basic_size_limit() {
// Default size limit (varies by regex version)
let re = RegexBuilder::new(r"\w+")
.build()
.unwrap();
// Set a custom size limit (in bytes)
let re = RegexBuilder::new(r"\w+")
.size_limit(1024 * 1024) // 1 MB limit
.build()
.unwrap();
// Very small limit - may reject complex patterns
let re = RegexBuilder::new(r"[a-zA-Z0-9_]+")
.size_limit(100) // 100 bytes - very restrictive
.build()
.unwrap();
}size_limit is specified in bytes and constrains memory during compilation.
use regex::RegexBuilder;
fn state_explosion_example() {
// Simple pattern - small DFA
let simple = RegexBuilder::new(r"hello")
.build()
.unwrap();
// Alternations multiply states
let many_alts = RegexBuilder::new(r"a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p")
.build()
.unwrap();
// Repetitions with overlapping possibilities
// This pattern can cause exponential state growth
let explosive = RegexBuilder::new(r"(a|aa|aaa|aaaa)+")
.size_limit(100_000_000) // May need large limit
.build();
match explosive {
Ok(re) => println!("Pattern compiled successfully"),
Err(e) => println!("Pattern too complex: {}", e),
}
}Patterns with many overlapping alternatives can cause exponential DFA state growth.
use regex::RegexBuilder;
use std::time::Instant;
fn compilation_vs_runtime() {
let pattern = r"(a+)+b"; // Classic "evil" regex pattern
// Try with default size limit
let start = Instant::now();
let result = RegexBuilder::new(pattern)
.build();
let compile_time = start.elapsed();
match result {
Ok(re) => {
println!("Compiled in {:?}", compile_time);
// Runtime can still be slow on matching input
let input = "aaaaaaaaaaaaaaaaaaaaaaaac";
let start = Instant::now();
let matches = re.is_match(input);
let match_time = start.elapsed();
println!("Match result: {}, time: {:?}", matches, match_time);
}
Err(e) => {
println!("Compilation failed: {}", e);
}
}
}size_limit affects compilation time and memory, not runtime execution time.
use regex::RegexBuilder;
fn memory_protection() {
// Pattern that could exhaust memory without limits
let complex_pattern = r"(a?){50}b";
// With generous limit - might succeed
match RegexBuilder::new(complex_pattern)
.size_limit(50_000_000) // 50 MB
.build()
{
Ok(_) => println!("Complex pattern compiled"),
Err(e) => println!("Compilation failed: {}", e),
}
// With strict limit - protects memory
match RegexBuilder::new(complex_pattern)
.size_limit(100_000) // 100 KB - very strict
.build()
{
Ok(_) => println!("Pattern accepted"),
Err(e) => println!("Pattern rejected (protected): {}", e),
}
}The size limit protects against runaway memory consumption during compilation.
use regex::RegexBuilder;
fn backend_differences() {
// The regex crate uses multiple internal engines:
// - PikeVM (NFA simulation) - slower but smaller
// - DFA (deterministic) - fast but potentially large
// - Hybrid/Lazy DFA - balanced approach
// size_limit affects DFA compilation most severely
let pattern = r"[a-zA-Z0-9]{1,30}";
// Different size limits for different backends
let re = RegexBuilder::new(pattern)
.size_limit(10_000_000) // Affects DFA cache
.build()
.unwrap();
// The meta engine automatically chooses backend based on
// pattern complexity and size limit
}Different regex backends respond differently to size constraints.
use regex::RegexBuilder;
fn measure_regex_size() {
// There's no direct way to query compiled size
// But you can infer it from compilation success/failure
let patterns = vec![
r"hello",
r"[a-z]+",
r"(a|b|c|d|e|f)+",
r"(a?){20}",
r"(a|aa|aaa)+",
];
let sizes = [1_000, 10_000, 100_000, 1_000_000, 10_000_000];
for pattern in &patterns {
print!("Pattern {:?}: ", pattern);
for &size in &sizes {
match RegexBuilder::new(pattern)
.size_limit(size)
.build()
{
Ok(_) => {
print!("OK at {} bytes, ", size);
break;
}
Err(_) => print!("fail at {} bytes, ", size),
}
}
println!();
}
}Testing against different limits reveals pattern complexity.
use regex::RegexBuilder;
fn cascading_alternations() {
// Each alternation roughly doubles potential states
// (a|b|c) = 3 alternatives
// ((a|b|c)|(d|e|f)) = 6 alternatives, more states
let small = r"(a|b|c)";
let medium = r"(a|b|c|d|e|f|g|h|i|j)";
let large = r"(a|b|c|d|e|f|g|h|i|j|k|l|m|n|o|p|q|r|s|t|u|v|w|x|y|z)";
// Nested alternations multiply complexity
let nested = r"((a|b|c)|(d|e|f)|(g|h|i))";
let very_nested = r"(((a|b)|(c|d))|((e|f)|(g|h)))";
for (name, pattern) in [
("small", small),
("medium", medium),
("large", large),
("nested", nested),
("very_nested", very_nested),
] {
match RegexBuilder::new(pattern)
.size_limit(1_000_000)
.build()
{
Ok(_) => println!("{}: compiled successfully", name),
Err(e) => println!("{}: {}", name, e),
}
}
}Alternations with overlapping character classes can cause state multiplication.
use regex::RegexBuilder;
fn repetition_impact() {
// Fixed repetitions - bounded growth
let fixed = r"a{10}"; // Exactly 10 'a's
// Ranges - moderate growth
let range = r"a{1,10}"; // 1 to 10 'a's
// Unbounded with simple atom - manageable
let unbounded = r"a+"; // One or more 'a's
// Unbounded with alternation - potential explosion
let dangerous = r"(a|b)+"; // Many possible paths
// Nested unbounded - worst case
let very_dangerous = r"((a|b)+)+";
let patterns = [
("fixed", fixed),
("range", range),
("unbounded", unbounded),
("dangerous", dangerous),
("very_dangerous", very_dangerous),
];
for (name, pattern) in patterns {
match RegexBuilder::new(pattern)
.size_limit(10_000_000)
.build()
{
Ok(re) => {
// Test runtime behavior
let test = "abababababababab";
let start = std::time::Instant::now();
let _ = re.is_match(test);
println!("{}: compiled, match time {:?}", name, start.elapsed());
}
Err(e) => println!("{}: failed to compile - {}", name, e),
}
}
}Nested unbounded repetitions are the primary cause of state explosion.
use regex::RegexBuilder;
fn security_implications() {
// ReDoS (Regular Expression Denial of Service) protection
// User-supplied patterns are dangerous
fn compile_user_pattern(pattern: &str) -> Result<regex::Regex, regex::Error> {
RegexBuilder::new(pattern)
.size_limit(1_000_000) // Strict limit for untrusted input
.build()
}
// Attack patterns to watch for:
let attack_patterns = vec![
r"(a+)+", // Nested quantifiers
r"(a|a?)+", // Alternation with optional
r"(a|aa|aaa)+", // Overlapping alternatives
r"(a?){50}", // Many optional repetitions
];
for pattern in attack_patterns {
match compile_user_pattern(pattern) {
Ok(_) => println!("Pattern '{}' compiled (might want to reject)", pattern),
Err(_) => println!("Pattern '{}' rejected by size limit", pattern),
}
}
// For known/safe patterns, higher limits are fine
fn compile_known_pattern(pattern: &str) -> Result<regex::Regex, regex::Error> {
RegexBuilder::new(pattern)
.size_limit(100_000_000) // Generous for trusted patterns
.build()
}
}Use strict limits for user-supplied patterns to prevent denial-of-service attacks.
use regex::RegexBuilder;
fn dfa_config() {
// The regex crate uses a hybrid approach by default
// You can configure DFA-specific behavior
let pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}";
// size_limit affects the lazy DFA cache
let re = RegexBuilder::new(pattern)
.size_limit(10_000_000) // 10 MB for DFA cache
.build()
.unwrap();
// For text-heavy workloads, larger caches can improve performance
let re_large = RegexBuilder::new(pattern)
.size_limit(100_000_000) // 100 MB
.build()
.unwrap();
// Both compile the same pattern, but runtime may differ
// for large inputs due to cache behavior
}The size limit affects the lazy DFA's cache size during compilation.
use regex::RegexBuilder;
fn practical_guidelines() {
// Rule of thumb for size limits:
// 1. User-supplied patterns: strict limits
let user_limit = 1_000_000; // 1 MB
// 2. Known application patterns: moderate limits
let app_limit = 10_000_000; // 10 MB
// 3. Complex validated patterns: generous limits
let complex_limit = 100_000_000; // 100 MB
// Example: email validation (known pattern)
let email = RegexBuilder::new(
r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
)
.size_limit(app_limit)
.build()
.unwrap();
// Example: parsing log format (known pattern)
let log = RegexBuilder::new(
r"\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2} \[[A-Z]+\] .*"
)
.size_limit(app_limit)
.build()
.unwrap();
}Choose size limits based on the trust level and complexity of patterns.
use regex::RegexBuilder;
use std::time::Instant;
/// Probes `pattern` against an increasing ladder of size limits, recording
/// the compile time of every attempt and stopping at the first limit that
/// accepts the pattern.
fn test_pattern_complexity(pattern: &str) -> ComplexityReport {
    let limits = [100_000usize, 1_000_000, 10_000_000, 100_000_000];

    let mut compiled_at_limit = None;
    let mut compile_times = Vec::new();
    for &limit in &limits {
        let timer = Instant::now();
        let built = RegexBuilder::new(pattern).size_limit(limit).build();
        // Record the attempt whether or not it succeeded.
        compile_times.push((limit, timer.elapsed()));
        if built.is_ok() {
            compiled_at_limit = Some(limit);
            break;
        }
    }

    ComplexityReport {
        pattern: pattern.to_string(),
        compiled_at_limit,
        compile_times,
    }
}
// Outcome of probing one pattern against a ladder of size limits.
struct ComplexityReport {
// The pattern under test, stored for reporting.
pattern: String,
// Smallest tested limit (in bytes) at which compilation succeeded, if any.
compiled_at_limit: Option<usize>,
// (limit, wall-clock compile time) for every attempted limit.
compile_times: Vec<(usize, std::time::Duration)>,
}
fn pattern_analysis_example() {
let patterns = vec![
r"hello",
r"[a-z]+@[a-z]+\.[a-z]{2,}",
r"(a|b|c|d|e|f)+",
r"(a?){30}b",
];
for pattern in patterns {
let report = test_pattern_complexity(pattern);
match report.compiled_at_limit {
Some(limit) => println!("{:?} compiles at {} bytes", pattern, limit),
None => println!("{:?} too complex for tested limits", pattern),
}
}
}Testing patterns against various limits helps understand their complexity.
use regex::RegexBuilder;
fn combined_settings() {
let pattern = r"(a|b)+";
// size_limit interacts with other compilation settings
let re = RegexBuilder::new(pattern)
.size_limit(10_000_000)
.bytes(true) // Match on bytes instead of Unicode
.case_insensitive(true)
.multi_line(true)
.build()
.unwrap();
// Each setting can affect compiled size:
// - case_insensitive: doubles character class sizes
// - bytes: reduces Unicode handling overhead
// - multi_line: adds anchor handling
// - ignore_whitespace: doesn't affect compiled size
// Consider combined effect when setting limits
}Other compilation settings can affect the final compiled size.
use regex::RegexBuilder;
fn log_parsing_example() {
// Complex log pattern with multiple capture groups
let log_pattern = r#"(\d{4}-\d{2}-\d{2}) (\d{2}:\d{2}:\d{2}) \[(\w+)\] \[([^\]]+)\] (.*)"#;
// Moderate limit for this complexity
let re = RegexBuilder::new(log_pattern)
.size_limit(10_000_000)
.build()
.unwrap();
let log_line = r#"2024-01-15 14:30:00 [INFO] [request-id-123] User logged in"#;
if let Some(caps) = re.captures(log_line) {
println!("Date: {}", &caps[1]);
println!("Time: {}", &caps[2]);
println!("Level: {}", &caps[3]);
println!("Request ID: {}", &caps[4]);
println!("Message: {}", &caps[5]);
}
}Real-world patterns need appropriate limits based on expected complexity.
use regex::RegexBuilder;
fn backtracking_comparison() {
// Rust's regex engine is NOT backtracking-based
// This means it doesn't have exponential runtime on "evil" patterns
// But it can still have exponential compilation size
let evil = r"(a+)+b";
// Compilation might fail due to size limit
match RegexBuilder::new(evil)
.size_limit(1_000_000)
.build()
{
Ok(re) => {
// If it compiles, runtime is guaranteed to be O(n)
let input = "aaaaaaaaaaaaaaaaaaaaaaaac";
// This won't hang like in backtracking engines
let _ = re.is_match(input);
}
Err(e) => {
// Pattern too complex for size limit
println!("Compilation failed: {}", e);
}
}
// This is different from backtracking engines (like PCRE):
// - PCRE: compiles fast, but can hang at runtime
// - Rust regex: might fail to compile, but never hangs at runtime
}Rust's regex engine trades compilation complexity for guaranteed linear runtime.
size_limit protects against memory exhaustion during regex compilation:
What it controls:
When it matters:
Patterns likely to hit the limit include `(a|b|c|d|e|f)+`, `((a+)+)`, `(a|aa|aaa)+`, and `a{1,1000}`. Performance implications:
Security considerations:
Key insight: The size_limit setting embodies a fundamental trade-off in regex engines. Rust's approach guarantees linear-time matching (O(n) where n is input length) by building a finite automaton at compile time. This means complex patterns can require substantial memory upfront — potentially exponential in the pattern length. The size_limit lets you cap this memory, but setting it too low rejects legitimate complex patterns. Unlike backtracking engines where the danger is runtime hangs, Rust's danger is compile-time memory exhaustion. The right limit depends on your threat model: strict for untrusted input, generous for trusted internal patterns.