Loading page…
Rust walkthroughs
Loading page…
How does regex::RegexSet allow matching multiple patterns simultaneously, and what are the performance benefits?

RegexSet enables matching a string against multiple regular expressions in a single pass, returning which patterns matched. This provides significant performance benefits when you need to check many patterns against the same text, as it shares the work of scanning the input across all patterns rather than processing each pattern independently.
use regex::Regex;
/// Returns the indices of the built-in patterns that match `text`.
///
/// This is the naive baseline: each pattern becomes its own `Regex` and is
/// run over `text` separately.
///
/// # Panics
///
/// Panics if one of the hard-coded patterns fails to compile.
fn match_multiple_patterns(text: &str) -> Vec<usize> {
    let sources = [
        r"\bfoo\b",
        r"\bbar\b",
        r"\bbaz\b",
        r"\d+",
        r"[A-Z][a-z]+",
    ];

    // Compile everything first so a bad pattern fails before any matching starts.
    let compiled: Vec<Regex> = sources
        .iter()
        .map(|src| Regex::new(src).unwrap())
        .collect();

    // One independent scan of `text` per compiled pattern.
    compiled
        .iter()
        .enumerate()
        .filter(|(_, re)| re.is_match(text))
        .map(|(idx, _)| idx)
        .collect()
}
/// Runs the naive matcher on a sample string and prints the matching indices.
fn main() {
    let sample = "foo 123 Bar";
    let hits = match_multiple_patterns(sample);

    println!("Matched patterns: {:?}", hits);
    // Prints: Matched patterns: [0, 3, 4]
    //   0 -> \bfoo\b
    //   3 -> \d+
    //   4 -> [A-Z][a-z]+
}
Each pattern scans the text independently, resulting in O(n * m) work where n is text length and m is pattern count.
use regex::RegexSet;
/// Demonstrates the core `RegexSet` workflow: build one set from several
/// patterns, then ask which of them match a piece of text.
fn basic_regexset() {
    let text = "foo 123 Bar";

    // All five patterns are compiled together into one set.
    let set = RegexSet::new([
        r"\bfoo\b",
        r"\bbar\b",
        r"\bbaz\b",
        r"\d+",
        r"[A-Z][a-z]+",
    ])
    .unwrap();

    // A single call reports the index of every pattern that matches.
    let hit_indices = set.matches(text).iter().collect::<Vec<usize>>();
    println!("Matched patterns: {:?}", hit_indices);
    // Prints: Matched patterns: [0, 3, 4]

    // `is_match` answers the plain "did anything match?" question.
    if set.is_match(text) {
        println!("At least one pattern matched");
    }
}
RegexSet matches all patterns in a single pass through the text.
use regex::RegexSet;
/// Walks through the ways the value returned by `RegexSet::matches` can be
/// queried: per-index lookup, borrowing iteration, and consuming iteration.
fn matches_type() {
    let set = RegexSet::new([r"foo", r"bar", r"baz"]).unwrap();
    let result = set.matches("foobar");

    // Ask about individual patterns by their index in the set.
    if result.matched(0) {
        println!("Pattern 0 (foo) matched");
    }
    if result.matched(1) {
        println!("Pattern 1 (bar) matched");
    }

    // Borrowing iteration leaves `result` usable afterwards.
    result
        .iter()
        .for_each(|idx| println!("Pattern {} matched", idx));

    // Consuming iteration gathers the matched indices into a Vec.
    let all_hits = result.into_iter().collect::<Vec<usize>>();
    println!("All matched: {:?}", all_hits);
}
The Matches type provides flexible access to match results.
use regex::RegexSet;
use std::time::Instant;
/// Times 1000 rounds of matching 100 word patterns against one sentence,
/// first with individual `Regex` values and then with a single `RegexSet`,
/// and prints both durations plus their ratio.
///
/// Only matching is timed; compilation of both variants happens outside the
/// measured sections.
///
/// # Panics
///
/// Panics if a generated pattern fails to compile, or if the two approaches
/// disagree about which patterns matched.
fn performance_comparison() {
    const ROUNDS: usize = 1000;

    // Owned pattern strings: nothing is leaked, and `RegexSet::new` accepts
    // any iterator whose items implement `AsRef<str>`.
    let patterns: Vec<String> = (0..100)
        .map(|i| format!(r"\bword{}\b", i))
        .collect();
    let text = "This text contains word42 and word77 somewhere in it.";

    // Approach 1: individual regexes, one scan of `text` per pattern.
    let regexes: Vec<regex::Regex> = patterns
        .iter()
        .map(|p| regex::Regex::new(p).unwrap())
        .collect();
    let start = Instant::now();
    let mut individual_matches: Vec<usize> = Vec::new();
    for _ in 0..ROUNDS {
        individual_matches.clear();
        individual_matches.extend(
            regexes
                .iter()
                .enumerate()
                .filter(|(_, re)| re.is_match(text))
                .map(|(i, _)| i),
        );
    }
    let individual_time = start.elapsed();

    // Approach 2: RegexSet, one call per round for all patterns.
    let set = RegexSet::new(&patterns).unwrap();
    let start = Instant::now();
    let mut set_matches: Vec<usize> = Vec::new();
    for _ in 0..ROUNDS {
        set_matches.clear();
        set_matches.extend(set.matches(text));
    }
    let set_time = start.elapsed();

    // Both approaches must agree, otherwise the timings are not comparable.
    assert_eq!(individual_matches, set_matches);

    println!("Individual: {:?}", individual_time);
    println!("RegexSet: {:?}", set_time);
    println!("Speedup: {:.1}x",
        individual_time.as_nanos() as f64 / set_time.as_nanos() as f64);
    // The ratio depends on the patterns, the text, the regex crate version and
    // the build profile; run this in release mode and trust your own numbers.
}
RegexSet shares the scanning work across all patterns.
use regex::RegexSet;
/// Illustrates the idea behind `RegexSet`: the patterns are compiled together
/// and all of them are evaluated during one scan of the input.
fn how_it_works() {
    // Mental model: the set behaves like one combined state machine.
    //   - the text is walked once
    //   - at each position the machine tracks which patterns are still viable
    //   - on reaching an accepting state it knows which patterns matched
    //
    // Compared with running three separate matchers this means:
    //   1. a single pass through the text
    //   2. each input character is processed once for all patterns
    //   3. work for common pattern prefixes can be shared
    //
    // NOTE(review): which engine the regex crate actually uses (and whether it
    // is a DFA) is an internal detail; treat the above as a model, not a guarantee.
    let set = RegexSet::new([r"foo", r"bar", r"baz"]).unwrap();

    let text = "foo and bar";
    let hits = set.matches(text).iter().collect::<Vec<usize>>();

    // The one call above reported both pattern 0 and pattern 1.
    println!("Matched: {:?}", hits);
}
The internal DFA represents all patterns as a single state machine.
use regex::RegexSet;
/// Shows what a `RegexSet` cannot do on its own (positions, captures) and how
/// to recover positions by re-running individual regexes for the hits.
fn match_limitations() {
    let text = "abc 123";
    let sources = [r"\d+", r"[a-z]+"];

    let set = RegexSet::new(sources).unwrap();
    let hits = set.matches(text);

    // The set only answers WHICH patterns matched ...
    hits.iter()
        .for_each(|idx| println!("Pattern {} matched", idx));

    // ... it carries no match offsets and no capture groups. To get
    // positions, compile the same patterns individually and run only the
    // ones the set reported.
    let regexes: Vec<regex::Regex> = sources
        .iter()
        .map(|src| regex::Regex::new(src).unwrap())
        .collect();

    for idx in hits.iter() {
        let found = regexes[idx].find(text);
        if let Some(m) = found {
            println!("Pattern {} matched at {:?}: {}",
                idx, m.range(), m.as_str());
        }
    }
}
RegexSet provides which patterns matched, not where they matched.
use regex::RegexSet;
/// Demonstrates two-phase matching: a `RegexSet` cheaply decides which rules
/// are relevant for a log line, then only those rules' capture regexes run.
///
/// Each rule keeps its filter pattern, its detailed pattern and its output
/// label together, so the filter index always refers to the right detailed
/// regex and the two patterns agree on letter case.
///
/// # Panics
///
/// Panics if one of the hard-coded patterns fails to compile.
fn two_phase_matching() {
    // (filter pattern, detailed pattern with one capture group, output label)
    let rules = [
        (r"error", r"error: (.+)", "Error"),
        (r"warning", r"warning: (.+)", "Warning"),
        // Upper case on purpose: the detailed regexes below are case-sensitive,
        // so the filter must look for the same spelling.
        (r"CRITICAL", r"CRITICAL: (.+)", "Critical"),
        (r"PANIC", r"PANIC: (.+)", "Panic"),
    ];

    // Phase 1 matcher: one set over all filter patterns.
    let filter = RegexSet::new(rules.iter().map(|(quick, _, _)| *quick)).unwrap();

    // Phase 2 matchers: full regexes, stored in the same order as the filter.
    let detailed: Vec<regex::Regex> = rules
        .iter()
        .map(|(_, full, _)| regex::Regex::new(full).unwrap())
        .collect();

    let log_lines = [
        "info: application started",
        "error: connection failed",
        "warning: low memory",
        "debug: processing request",
        "CRITICAL: disk full",
    ];

    for line in &log_lines {
        // Phase 1: fast filter; lines matching nothing yield no indices.
        for idx in filter.matches(line).iter() {
            // Phase 2: run only the detailed regex for the rule that fired.
            if let Some(caps) = detailed[idx].captures(line) {
                println!("{}: {}", rules[idx].2, &caps[1]);
            }
        }
    }
}
Use RegexSet as a fast filter, then run detailed matching only on matches.
use regex::RegexSet;
/// Builds a 1000-pattern `RegexSet` and matches it against a short text to
/// show that a large set is still queried with a single call.
///
/// The generated patterns are wrapped in `\b...\b`. Without the word
/// boundaries `pattern5`, `pattern50`, `pattern9` and `pattern99` would also
/// match inside `pattern500` / `pattern999`, and the count printed below
/// would be 6 instead of 2.
///
/// # Panics
///
/// Panics if a pattern fails to compile.
fn pattern_scaling() {
    // A tiny set, kept only for contrast with the large one below.
    let _few_patterns = RegexSet::new([
        r"foo",
        r"bar",
    ]).unwrap();

    let many_patterns: Vec<String> = (0..1000)
        .map(|i| format!(r"\bpattern{}\b", i))
        .collect();
    // `&Vec<String>` works directly: the items only need `AsRef<str>`.
    let large_set = RegexSet::new(&many_patterns).unwrap();

    // Expect the large set to cost more to compile and to hold in memory;
    // matching is still one call over the text.
    let text = "pattern500 and pattern999";

    // Exactly two patterns (indices 500 and 999) match.
    let matches: Vec<usize> = large_set.matches(text).into_iter().collect();
    println!("Matched {} patterns", matches.len());
}
RegexSet handles hundreds of patterns efficiently.
use regex::RegexSet;
/// Shows that patterns able to match the empty string are reported as
/// matched even for empty input.
fn empty_matches() {
    // `a*` and `b*` both accept zero repetitions, i.e. the empty string.
    let set = RegexSet::new([r"a*", r"b*"]).unwrap();

    let hits = set.matches("").iter().collect::<Vec<usize>>();

    // Both indices show up although the input is empty.
    println!("{:?}", hits); // [0, 1]

    // Takeaway: such patterns match any input, so they are useless as a
    // filter unless anchored or otherwise tightened.
}
Patterns matching empty strings will always report as matched.
use regex::RegexSet;
use std::time::Instant;
/// Measures how long it takes to compile 100 patterns as one `RegexSet`
/// versus as 100 individual `Regex` values, and prints both durations.
fn compile_time() {
    let patterns: Vec<String> = (0..100)
        .map(|i| format!(r"\bword{}\b", i))
        .collect();
    let patterns_ref: Vec<&str> = patterns.iter().map(String::as_str).collect();

    // Timed section 1: the whole set in one go.
    let set_timer = Instant::now();
    let _set = RegexSet::new(&patterns_ref).unwrap();
    let set_compile_time = set_timer.elapsed();

    // Timed section 2: one regex at a time.
    let individual_timer = Instant::now();
    let _regexes = patterns
        .iter()
        .map(|p| regex::Regex::new(p).unwrap())
        .collect::<Vec<regex::Regex>>();
    let individual_compile_time = individual_timer.elapsed();

    println!("RegexSet compile: {:?}", set_compile_time);
    println!("Individual compile: {:?}", individual_compile_time);
    // The set may be the slower one to build; that cost is paid once, while
    // matching happens many times.
}
RegexSet has higher compilation cost but lower matching cost.
use regex::RegexSet;
/// Builds the same 50 patterns both as individual `Regex` values and as one
/// `RegexSet`, as a starting point for comparing their memory footprint.
///
/// The pattern strings are kept in an owned `Vec<String>` rather than leaked
/// with `Box::leak`, so calling this function does not permanently consume
/// memory.
///
/// # Panics
///
/// Panics if a generated pattern fails to compile.
fn memory_comparison() {
    let patterns: Vec<String> = (0..50)
        .map(|i| format!(r"\bword{}\b", i))
        .collect();

    // Individual regexes: every pattern carries its own compiled program.
    let _regexes: Vec<regex::Regex> = patterns
        .iter()
        .map(|p| regex::Regex::new(p).unwrap())
        .collect();

    // RegexSet: one compiled object covering all patterns.
    let _set = RegexSet::new(&patterns).unwrap();

    // Rough expectation:
    //   Individual: ~N * (compiled size per pattern)
    //   RegexSet:   ~compiled size of the combined patterns
    //
    // The set can come out smaller because state shared between patterns
    // (for example common prefixes) only needs to be stored once. For very
    // different patterns the two may be similar.
    // NOTE(review): actual sizes depend on the regex crate's internals;
    // measure before relying on this.
}
RegexSet often uses less memory due to state sharing.
use regex::RegexSet;
/// Shows the different kinds of input `RegexSet::new` accepts and checks that
/// every resulting set actually matches.
///
/// # Panics
///
/// Panics if a pattern fails to compile or one of the sanity checks fails.
fn building_regexset() {
    // From an array passed by value
    let set1 = RegexSet::new([r"a", r"b", r"c"]).unwrap();

    // From a borrowed array / slice
    let patterns = [r"a", r"b", r"c"];
    let set2 = RegexSet::new(&patterns).unwrap();

    // From an iterator
    let patterns = vec![r"a", r"b", r"c"];
    let set3 = RegexSet::new(patterns.iter().copied()).unwrap();

    // From a dynamically built collection of owned strings; no intermediate
    // `Vec<&str>` is needed because the items only have to be `AsRef<str>`.
    let patterns: Vec<String> = ["error", "warning", "info"]
        .iter()
        .map(|word| format!(r"\b{}\b", word))
        .collect();
    let set4 = RegexSet::new(&patterns).unwrap();

    // All work the same way
    assert!(set1.matches("a").matched(0));
    assert!(set2.matches("b").matched(1));
    assert!(set3.matches("c").matched(2));
    assert!(set4.matches("warning: low disk").matched(1));
}
RegexSet can be built from various collection types.
use regex::RegexSet;
/// Builds a set that exercises several regex syntax features and prints which
/// of its patterns match a sample sentence.
fn syntax_support() {
    let text = "start 123 foo word end";

    // One pattern per syntax feature; a set accepts the same syntax as `Regex`.
    let set = RegexSet::new([
        r"^start",    // start anchor
        r"end$",      // end anchor
        r"\d{2,4}",   // counted repetition
        r"[a-z]+",    // character class
        r"(foo|bar)", // alternation inside a group
        r"\bword\b",  // word boundaries
    ])
    .unwrap();

    let hits = set.matches(text).iter().collect::<Vec<usize>>();
    println!("Matched: {:?}", hits);

    // Groups are accepted syntactically, e.g. RegexSet::new([r"(foo)(bar)"]),
    // but a set gives no access to what the groups captured.
}
RegexSet supports full regex syntax but doesn't return captures.
use regex::RegexSet;
/// Collects four typical `RegexSet` applications. Each set is only
/// constructed here; none of them is run against any text.
fn use_cases() {
    // 1. Keyword detection
    let keyword_patterns = [
        r"\bfn\b",
        r"\blet\b",
        r"\bif\b",
        r"\belse\b",
        r"\bmatch\b",
        r"\bstruct\b",
    ];
    let _keywords = RegexSet::new(keyword_patterns).unwrap();

    // 2. Log level filtering (case-insensitive via `(?i)`)
    let level_patterns = [
        r"(?i)\berror\b",
        r"(?i)\bwarning\b",
        r"(?i)\binfo\b",
        r"(?i)\bdebug\b",
    ];
    let _log_levels = RegexSet::new(level_patterns).unwrap();

    // 3. Content classification
    let category_patterns = [
        r"https?://",             // URL
        r"\b[\w.]+@[\w.]+\b",     // Email
        r"\b\d{3}-\d{3}-\d{4}\b", // Phone
    ];
    let _categories = RegexSet::new(category_patterns).unwrap();

    // 4. Tokenization pre-filter
    let token_patterns = [
        r"[a-zA-Z_][a-zA-Z0-9_]*", // Identifier
        r"\d+",                    // Number
        r#""[^"]*""#,              // String
        r"//.*",                   // Comment
    ];
    let _tokens = RegexSet::new(token_patterns).unwrap();
}
RegexSet excels when you need to detect presence of multiple patterns.
use regex::{Regex, RegexSet};
/// Lists three situations where a plain `Regex` is the better tool than a
/// `RegexSet`: needing positions, needing captures, or having one pattern.
fn when_not_to_use() {
    let haystack = "hello foo world";

    // Case 1: match positions are needed.
    // The set can say that "foo" matched, but not where.
    let position_set = RegexSet::new([r"foo"]).unwrap();
    let _which = position_set.matches(haystack);
    // A single Regex reports the range.
    let position_re = Regex::new(r"foo").unwrap();
    if let Some(m) = position_re.find(haystack) {
        println!("Match at {:?}", m.range());
    }

    // Case 2: capture groups are needed.
    // The set accepts the groups but gives no access to them.
    let _capture_set = RegexSet::new([r"(\d+)-(\d+)"]).unwrap();
    // A single Regex exposes them.
    let capture_re = Regex::new(r"(\d+)-(\d+)").unwrap();
    if let Some(caps) = capture_re.captures("123-456") {
        println!("First: {}, Second: {}", &caps[1], &caps[2]);
    }

    // Case 3: there is only one pattern.
    // Both values below answer the same question; the Regex is simpler.
    let _single_set = RegexSet::new([r"single"]).unwrap();
    let _single_re = Regex::new(r"single").unwrap();
}
Use individual Regex when you need positions, captures, or have few patterns.
use regex::{Regex, RegexSet};
/// Two-stage matcher: a `RegexSet` decides which patterns match at all, and
/// the corresponding individual `Regex` then supplies the match location.
struct PatternMatcher {
    /// Fast filter over all patterns.
    filter: RegexSet,
    /// Detailed patterns, in the same order as in `filter`, so an index
    /// reported by the filter selects the matching entry here.
    patterns: Vec<Regex>,
}

impl PatternMatcher {
    /// Compiles `patterns` both as one set and as individual regexes.
    ///
    /// # Panics
    ///
    /// Panics if any pattern is not a valid regular expression.
    fn new(patterns: &[&str]) -> Self {
        Self {
            filter: RegexSet::new(patterns).unwrap(),
            patterns: patterns
                .iter()
                .map(|p| Regex::new(p).unwrap())
                .collect(),
        }
    }

    /// Returns `(pattern index, first match)` for every pattern that matches
    /// `text`, in ascending index order.
    ///
    /// The returned matches borrow from `text`, not from `self`, which is why
    /// the lifetime is spelled out: with elision the result would be tied to
    /// `&self` and the body would not borrow-check.
    fn find_matches<'t>(&self, text: &'t str) -> Vec<(usize, regex::Match<'t>)> {
        self.filter
            .matches(text)
            .into_iter()
            // Detailed matching only for patterns the filter reported.
            .filter_map(|idx| self.patterns[idx].find(text).map(|m| (idx, m)))
            .collect()
    }
}
/// Runs `PatternMatcher` over a sample log line and prints every hit with
/// its pattern index, matched text and byte range.
///
/// The patterns carry the `(?i)` flag because the sample line spells the
/// level in upper case; a case-sensitive `\berror\b` would not match
/// `"ERROR"` and the loop would print nothing.
fn combined_usage() {
    let matcher = PatternMatcher::new(&[
        r"(?i)\berror\b",
        r"(?i)\bwarning\b",
        r"(?i)\bcritical\b",
    ]);
    let text = "ERROR: something went wrong";
    // Prints: Pattern 0 matched 'ERROR' at 0..5
    for (idx, m) in matcher.find_matches(text) {
        println!("Pattern {} matched '{}' at {:?}", idx, m.as_str(), m.range());
    }
}
Combine RegexSet filtering with individual Regex for full match details.
RegexSet provides efficient multi-pattern matching through a single-pass algorithm:
Performance characteristics:
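| Approach | Text passes | Complexity | Best for |
|----------|-------------|------------|----------|
| Individual Regex (N patterns) | N passes | O(n × m) | Few patterns, need positions |
| RegexSet (N patterns) | 1 pass | O(n × m) worst case | Many patterns, presence check |

Here n is the text length and m grows with the number (and size) of the patterns. The regex crate documents the same O(m × n) worst-case bound for a RegexSet as for a Regex; the practical gain comes from walking the text once instead of N times.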
What RegexSet provides:
| Feature | Supported |
|---------|-----------|
| Multiple patterns | ✅ |
| Which patterns matched | ✅ |
| Match positions | ❌ |
| Capture groups | ❌ |
| Full regex syntax | ✅ |
Performance benefits:
// Single text, many patterns: the set is compiled once and `matches`
// inspects `text` for all patterns in one call.
// (`patterns` and `text` are assumed to be defined by the surrounding code.)
let set = RegexSet::new(&patterns).unwrap();
let matches = set.matches(text); // one pass over `text` for the whole set
// vs.
// Multiple passes: every pattern is compiled and then scans `text` on its own.
for pattern in &patterns {
let re = Regex::new(pattern).unwrap();
re.is_match(text); // one pass over `text` per pattern; result is discarded here
}
Best practices:
RegexSet is the right choice when you need to efficiently check many patterns against the same text and only need to know which patterns matched, not where they matched.