What is the difference between regex::CaptureLocations and Captures for reusing capture buffers?
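CaptureLocations provides a low-overhead, caller-owned buffer for recording match boundaries, while Captures provides a full-featured interface for reading the matched text. Neither type copies captured text: both record only the byte offsets of each group. The key difference is that every call to captures() allocates a fresh Captures, which also borrows the original input so it can hand out string slices and support named lookups, whereas a CaptureLocations is allocated once with capture_locations(), refilled in place by captures_read(), and leaves it to the caller to slice the original input with the recorded offsets.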
Capturing Groups in Regex
use regex::Regex;
/// Prints the overall match and the three capture groups found in a
/// `digits-word-digits` sample string.
fn capturing_groups() {
    let pattern = Regex::new(r"(\d+)-(\w+)-(\d+)").unwrap();
    let haystack = "123-abc-456";

    // `captures` yields `None` when the pattern does not match at all.
    let groups = match pattern.captures(haystack) {
        Some(groups) => groups,
        None => return,
    };

    // Index 0 is always the overall match: "123-abc-456".
    println!("Match: {}", &groups[0]);

    // Indices 1 to 3 are the parenthesised groups: "123", "abc", "456".
    for idx in 1..=3usize {
        println!("Group {}: {}", idx, &groups[idx]);
    }
}
Capturing groups extract substrings that match portions of a pattern.
The Captures Type
use regex::Regex;
/// Demonstrates positional and named access on a `Captures` value.
///
/// # Panics
/// Panics if either pattern fails to compile or does not match the sample.
fn captures_type() {
    let re = Regex::new(r"(\w+)@(\w+)\.(\w+)").unwrap();
    let text = "test@example.com";

    // `captures` builds a fresh `Captures` for this match. It records the byte
    // offsets of the overall match and of every group, and it borrows `text`.
    // No matched text is copied.
    let caps = re.captures(text).unwrap();

    // `as_str()` returns a sub-slice of `text` itself: the overall match here
    // is the very same memory as the input.
    assert!(std::ptr::eq(caps.get(0).unwrap().as_str(), text));

    // Access by index
    assert_eq!(caps.get(0).unwrap().as_str(), "test@example.com");
    assert_eq!(caps.get(1).unwrap().as_str(), "test");
    assert_eq!(caps.get(2).unwrap().as_str(), "example");
    assert_eq!(caps.get(3).unwrap().as_str(), "com");

    // Access by name
    let re_named = Regex::new(r"(?P<user>\w+)@(?P<domain>\w+)\.(?P<tld>\w+)").unwrap();
    let named = re_named.captures(text).unwrap();
    assert_eq!(&named["user"], "test");
    assert_eq!(&named["domain"], "example");
    assert_eq!(&named["tld"], "com");
}
Captures provides full access to matched text with string slices.
The CaptureLocations Type
use regex::{Regex, CaptureLocations};
/// Demonstrates that `CaptureLocations` records byte offsets only and that
/// the caller slices the original input to recover the text.
///
/// # Panics
/// Panics if the pattern fails to compile or does not match the sample.
fn capture_locations_type() {
    let re = Regex::new(r"(\w+)@(\w+)\.(\w+)").unwrap();
    let text = "test@example.com";

    // The buffer is sized for this regex's groups; it holds no text.
    let mut locs = re.capture_locations();

    // `captures_read` fills `locs` and returns the overall match. Checking the
    // result matters: after a failed search the slots must not be trusted.
    let overall = re
        .captures_read(&mut locs, text)
        .expect("pattern matches the sample address");

    // Slot 0 mirrors the overall match: bytes 0..16 of `text`.
    assert_eq!((overall.start(), overall.end()), (0, 16));
    assert_eq!(locs.get(0), Some((0, 16)));

    // Slot 1 is the first group: "test" occupies bytes 0..4.
    let (start, end) = locs.get(1).expect("group 1 participates in every match");
    assert_eq!((start, end), (0, 4));

    // Extract text using the offsets
    assert_eq!(&text[start..end], "test");
}
CaptureLocations stores byte offsets, not the captured strings themselves.
captures_read vs captures
use regex::Regex;
/// Runs the same search through `captures` and `captures_read` and checks
/// that both report identical group offsets.
///
/// # Panics
/// Panics if the pattern fails to compile or does not match the sample.
fn captures_read_vs_captures() {
    let re = Regex::new(r"(\d+)-(\w+)").unwrap();
    let text = "123-abc";

    // captures: allocates and returns a new `Captures` on every call.
    // - Owns freshly allocated offset storage
    // - Borrows `text`, so `as_str()` can be called directly
    let caps = re.captures(text).unwrap();

    // captures_read: fills an existing `CaptureLocations`.
    // - The buffer is allocated once and can be refilled by later searches
    // - Text must be sliced out of the original string by the caller
    let mut locs = re.capture_locations();
    re.captures_read(&mut locs, text)
        .expect("pattern matches the sample");

    // Both APIs describe the same match, group by group.
    for i in 0..caps.len() {
        let via_captures = caps.get(i).map(|m| (m.start(), m.end()));
        assert_eq!(via_captures, locs.get(i));
    }

    // Same text, two ways of reaching it.
    assert_eq!(caps.get(2).unwrap().as_str(), "abc");
    let (start, end) = locs.get(2).unwrap();
    assert_eq!(&text[start..end], "abc");
}
captures_read fills a reusable CaptureLocations, avoiding allocation.
Reusing CaptureLocations
use regex::Regex;
/// Prints the number found in each sample string, refilling one shared
/// `CaptureLocations` buffer rather than creating a new one per input.
fn reusing_locations() {
    let digits = Regex::new(r"(\d+)").unwrap();

    // The single buffer that every iteration below writes into.
    let mut slots = digits.capture_locations();

    let samples = ["123", "456", "789"];
    for sample in samples.iter() {
        // Nothing to report for an input that does not match.
        if digits.captures_read(&mut slots, sample).is_none() {
            continue;
        }

        // `slots` now holds this sample's offsets, replacing the previous ones.
        let (from, to) = slots.get(1).unwrap();
        println!("Number: {}", &sample[from..to]);
    }
}
Reusing CaptureLocations eliminates allocation overhead in tight loops.
Memory Layout Comparison
use regex::{Regex, CaptureLocations};
/// Contrasts what the two capture types hold, checking the parts that are
/// observable through the public API.
///
/// # Panics
/// Panics if the pattern fails to compile or does not match the sample.
fn memory_layout() {
    let re = Regex::new(r"(\d+)-(\w+)-(\d+)").unwrap();
    let text = "123-abc-456";

    // CaptureLocations:
    // - A buffer of optional byte offsets, sized from the regex when
    //   `capture_locations()` is called (one entry per group, including the
    //   implicit group 0)
    // - Never borrows or copies the searched text
    // - Meant to be created once and refilled by `captures_read`
    // Its fields are private; the exact representation is an implementation
    // detail of the crate and has changed between versions.
    let locs = re.capture_locations();
    assert_eq!(locs.len(), 4); // groups 0, 1, 2, 3

    // Captures<'h>:
    // - Holds the same kind of offset storage, allocated anew for each match
    // - Additionally borrows the searched text (`&'h str`), which is what
    //   makes `as_str()` and `caps["name"]` possible
    // - Can resolve group names to indices
    // Conceptually (simplified, not the real definition):
    // struct Captures<'h> {
    //     haystack: &'h str,
    //     offsets: /* per-group (start, end) slots */,
    //     names: /* name -> index lookup shared with the regex */,
    // }
    let caps = re.captures(text).unwrap();
    assert_eq!(caps.len(), locs.len());
    assert_eq!(caps.get(2).unwrap().as_str(), "abc");
}
CaptureLocations is a minimal byte-offset container; Captures adds text references and named capture support.
Performance Characteristics
use regex::Regex;
/// Runs the two matching strategies side by side on a small input set and
/// checks that they extract the same text.
///
/// This is a behavioural comparison, not a benchmark: to measure the
/// difference, run each loop over a large input in a release build.
///
/// # Panics
/// Panics if the pattern fails to compile.
fn performance() {
    let re = Regex::new(r"(\d+)-(\w+)-(\d+)").unwrap();
    let texts = ["123-abc-456", "7-x-8", "no match here"];

    // Using Captures: every successful call allocates a new `Captures`.
    let mut via_captures: Vec<&str> = Vec::new();
    for text in texts.iter().copied() {
        if let Some(caps) = re.captures(text) {
            via_captures.push(caps.get(1).unwrap().as_str());
        }
    }

    // Using CaptureLocations: one buffer, allocated here, refilled per input.
    let mut locs = re.capture_locations();
    let mut via_locations: Vec<&str> = Vec::new();
    for text in texts.iter().copied() {
        if re.captures_read(&mut locs, text).is_some() {
            let (s, e) = locs.get(1).unwrap();
            via_locations.push(&text[s..e]);
        }
    }

    assert_eq!(via_captures, ["123", "7"]);
    assert_eq!(via_captures, via_locations);

    // CaptureLocations is the better fit when:
    // 1. Processing many matches
    // 2. You only need byte offsets
    // 3. You can extract text from the original string
}
CaptureLocations avoids allocation overhead when processing many matches.
When to Use Captures
use regex::Regex;
/// Walks through the situations where `Captures` is the better choice:
/// named groups, direct `as_str()` access, iteration, and match positions.
///
/// # Panics
/// Panics if the pattern fails to compile or does not match the sample.
fn when_to_use_captures() {
    let re = Regex::new(r"(?P<key>\w+)=(?P<value>\w+)").unwrap();
    let text = "name=value";

    // Use Captures when:
    // 1. You need named captures
    let caps = re.captures(text).unwrap();
    assert_eq!(&caps["key"], "name");
    assert_eq!(&caps["value"], "value");

    // 2. You need the convenience of as_str()
    let whole = caps.get(0).unwrap();
    assert_eq!(whole.as_str(), "name=value");

    // 3. You need to iterate over groups
    // `iter()` yields `Option<Match>`; `flatten` skips groups that did not
    // participate in the match.
    for m in caps.iter().flatten() {
        println!("Group: {}", m.as_str());
    }

    // 4. You need the full match information
    println!("Match at {}-{}", whole.start(), whole.end());
}
Use Captures for convenience, named captures, and iteration.
When to Use CaptureLocations
use regex::Regex;
/// Sums the numbers found in a batch of inputs while reusing a single
/// `CaptureLocations` buffer, the typical shape of a hot matching loop.
///
/// # Panics
/// Panics if the pattern fails to compile or a matched number does not
/// parse as `i32`.
fn when_to_use_locations() {
    let re = Regex::new(r"(\d+)").unwrap();

    // Use CaptureLocations when:
    // 1. Processing many matches in a tight loop
    let texts = ["123", "456", "789", "012", "345"];
    let mut locs = re.capture_locations();
    let mut sum = 0i32;
    for text in texts.iter().copied() {
        if re.captures_read(&mut locs, text).is_some() {
            let (start, end) = locs.get(1).expect("group 1 is part of every match");
            // NOTE: `\d` is Unicode-aware in the regex crate, so on arbitrary
            // input it can match digits that `parse::<i32>` rejects. The
            // samples here are plain ASCII.
            sum += text[start..end]
                .parse::<i32>()
                .expect("sample numbers are small ASCII integers");
        }
    }
    // 123 + 456 + 789 + 12 + 345
    assert_eq!(sum, 1725);

    // 2. You only need byte offsets
    // 3. Memory allocation is a concern
    // 4. Parsing high-volume streams
}
Use CaptureLocations for high-performance, high-volume matching.
Accessing Group Offsets
use regex::Regex;
/// Prints the offsets of the overall match and of group 1 twice: once read
/// from `Captures` and once read from `CaptureLocations`.
fn accessing_offsets() {
    let email = Regex::new(r"(\w+)@(\w+)\.(\w+)").unwrap();
    let input = "test@example.com";

    // Route 1: `Captures` hands out `Match` values that know their offsets.
    {
        let found = email.captures(input).unwrap();

        let whole = found.get(0).unwrap();
        let (whole_from, whole_to) = (whole.start(), whole.end());
        println!("Full match: {}..{}", whole_from, whole_to);

        let user = found.get(1).unwrap();
        println!(
            "Group 1: {}..{} = '{}'",
            user.start(),
            user.end(),
            user.as_str()
        );
    }

    // Route 2: `CaptureLocations` returns plain `(start, end)` tuples.
    let mut slots = email.capture_locations();
    email.captures_read(&mut slots, input);

    let whole_span = slots.get(0).unwrap();
    println!("Full match: {:?}", whole_span); // (0, 16)

    let user_span = slots.get(1).unwrap();
    println!("Group 1: {:?}", user_span); // (0, 4)

    // The offsets agree between the two routes; only the wrapper differs.
}
Both provide byte offsets; CaptureLocations returns them directly as tuples.
Working with Named Captures
use regex::Regex;
/// Shows named-group access through `Captures`, and how to get the same
/// result with `CaptureLocations` by resolving names to indices up front.
///
/// # Panics
/// Panics if the pattern fails to compile or does not match the sample.
fn named_captures() {
    let re = Regex::new(r"(?P<year>\d{4})-(?P<month>\d{2})-(?P<day>\d{2})").unwrap();
    let text = "2023-12-25";

    // Captures: direct named access
    let caps = re.captures(text).unwrap();
    assert_eq!(&caps["year"], "2023");
    assert_eq!(&caps["month"], "12");
    assert_eq!(&caps["day"], "25");

    // CaptureLocations: no named access, only indices.
    let mut locs = re.capture_locations();
    re.captures_read(&mut locs, text)
        .expect("pattern matches the sample date");

    // The indices need not be hard-coded: `Regex::capture_names()` yields the
    // group names in index order (`None` for unnamed groups, including the
    // implicit group 0), so a name's position is its group index. This does
    // not require a `Captures` and can be done once, outside any hot loop.
    let index_of = |name: &str| {
        re.capture_names()
            .position(|candidate| candidate == Some(name))
            .expect("group name exists in the pattern")
    };
    let year_idx = index_of("year");
    let month_idx = index_of("month");
    let day_idx = index_of("day");
    assert_eq!((year_idx, month_idx, day_idx), (1, 2, 3));

    let (ys, ye) = locs.get(year_idx).unwrap();
    let (ms, me) = locs.get(month_idx).unwrap();
    let (ds, de) = locs.get(day_idx).unwrap();
    assert_eq!(&text[ys..ye], "2023");
    assert_eq!(&text[ms..me], "12");
    assert_eq!(&text[ds..de], "25");
}
Captures supports named capture access; CaptureLocations requires indices.
len() for Group Count
use regex::Regex;
/// Prints how many slots a `CaptureLocations` has and then every slot that
/// was filled by the search.
fn group_count() {
    let pattern = Regex::new(r"(\d+)-(\w+)-(\d+)").unwrap();
    let haystack = "123-abc-456";

    let mut slots = pattern.capture_locations();
    pattern.captures_read(&mut slots, haystack);

    // One slot per explicit group plus one for the overall match (group 0),
    // so the explicit groups are numbered 1..slot_count.
    let slot_count = slots.len();
    println!("Number of slots: {}", slot_count); // 4

    // Visit every slot index, keeping only those that hold a span.
    let filled = (0..slot_count).filter_map(|index| slots.get(index).map(|span| (index, span)));
    for (index, (from, to)) in filled {
        println!(
            "Group {}: {}..{} = '{}'",
            index,
            from,
            to,
            &haystack[from..to]
        );
    }
}
len() indicates how many capture slots are available.
Empty and Non-Matching Groups
use regex::Regex;
/// Checks which groups are reported for an input that uses the optional
/// second group and for one that does not.
fn empty_groups() {
    // Group 2 sits inside an optional non-capturing wrapper.
    let pattern = Regex::new(r"(\w+)(?:-(\w+))?").unwrap();
    let mut slots = pattern.capture_locations();

    // Searches `haystack` into the shared buffer and reports whether
    // groups 1 and 2 took part in the match.
    let mut participation = |haystack: &str| {
        pattern.captures_read(&mut slots, haystack);
        (slots.get(1).is_some(), slots.get(2).is_some())
    };

    // With a hyphen: group 1 is "hello", group 2 is "world".
    let (first, second) = participation("hello-world");
    assert!(first);
    assert!(second);

    // Without a hyphen: group 1 still matches, group 2 is reported as `None`.
    let (first, second) = participation("hello");
    assert!(first);
    assert!(!second);
}
get() returns None for groups that didn't participate in the match.
Real-World Example: Log Parsing
use regex::Regex;
/// Parses `[TIMESTAMP] LEVEL: message` log lines twice, first with
/// `Captures` and then with a reused `CaptureLocations`, printing the
/// fields each time.
fn log_parsing() {
    let line_re =
        Regex::new(r"\[(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})\] (\w+): (.*)").unwrap();

    let lines = [
        "[2023-12-25 10:30:45] INFO: Server started",
        "[2023-12-25 10:30:46] WARN: High memory usage",
        "[2023-12-25 10:30:47] ERROR: Connection failed",
    ];

    // Pass 1: `Captures`, one allocation per matching line.
    for line in lines.iter() {
        let fields = match line_re.captures(line) {
            Some(fields) => fields,
            None => continue,
        };
        // Groups: 1 = timestamp, 2 = level, 3 = message.
        println!("[{}] {} - {}", &fields[1], &fields[2], &fields[3]);
    }

    // Pass 2: `CaptureLocations`, one buffer shared by all lines, which
    // suits high-volume input better.
    let mut slots = line_re.capture_locations();
    for line in lines.iter() {
        if line_re.captures_read(&mut slots, line).is_none() {
            continue;
        }
        let (ts_from, ts_to) = slots.get(1).unwrap();
        let (level_from, level_to) = slots.get(2).unwrap();
        let (msg_from, msg_to) = slots.get(3).unwrap();
        println!(
            "[{}] {} - {}",
            &line[ts_from..ts_to],
            &line[level_from..level_to],
            &line[msg_from..msg_to]
        );
    }
}
For high-volume log parsing, CaptureLocations reduces allocation overhead.
Comparison Table
use regex::{Regex, CaptureLocations};
fn comparison() {
    // Side-by-side comparison of the two capture types. This function has no
    // runtime behaviour; it exists only to carry the table below.
    //
    // ┌────────────────┬─────────────────────┬────────────────────────┐
    // │ Aspect         │ Captures            │ CaptureLocations       │
    // ├────────────────┼─────────────────────┼────────────────────────┤
    // │ Storage        │ Offsets + text refs │ Offsets only           │
    // │ Allocation     │ Per match           │ Reusable               │
    // │ Access         │ get(i).as_str()     │ get(i) -> (start, end) │
    // │ Named captures │ caps["name"]        │ Not supported          │
    // │ Iteration      │ caps.iter()         │ Manual loop            │
    // │ Convenience    │ High                │ Low                    │
    // │ Performance    │ Allocation overhead │ Minimal overhead       │
    // │ Use case       │ General matching    │ High-volume matching   │
    // └────────────────┴─────────────────────┴────────────────────────┘
}Complete Example
use regex::Regex;
/// End-to-end demo: extracts the groups of `key=number` with both APIs,
/// then counts matches over 1000 inputs with each approach.
///
/// # Panics
/// Panics if the pattern fails to compile.
fn main() {
    let re = Regex::new(r"(\w+)=(\d+)").unwrap();
    let text = "count=42";

    // Method 1: Using Captures
    println!("=== Using Captures ===");
    if let Some(caps) = re.captures(text) {
        println!("Full match: {}", caps.get(0).unwrap().as_str());
        println!("Key: {}", caps.get(1).unwrap().as_str());
        println!("Value: {}", caps.get(2).unwrap().as_str());

        // Convenient iteration; groups that did not participate are `None`.
        for (i, m) in caps.iter().enumerate() {
            if let Some(m) = m {
                println!("Group {}: '{}'", i, m.as_str());
            }
        }
    }

    // Method 2: Using CaptureLocations
    println!("\n=== Using CaptureLocations ===");
    // Allocated once here and reused for every search below.
    let mut locs = re.capture_locations();
    if re.captures_read(&mut locs, text).is_some() {
        println!("Number of groups: {}", locs.len());

        // Extract using offsets
        for i in 0..locs.len() {
            if let Some((start, end)) = locs.get(i) {
                println!("Group {}: '{}' at {}..{}", i, &text[start..end], start, end);
            }
        }
    }

    // Performance comparison: many matches
    println!("\n=== Performance Comparison ===");
    let texts = vec!["count=42"; 1000];

    // Captures: allocates per match
    let mut count1 = 0;
    for text in texts.iter().copied() {
        if let Some(caps) = re.captures(text) {
            if caps.get(2).unwrap().as_str() == "42" {
                count1 += 1;
            }
        }
    }
    println!("Captures matches: {}", count1);

    // CaptureLocations: keep using the buffer created above instead of
    // allocating another one. Each `captures_read` overwrites its slots.
    let mut count2 = 0;
    for text in texts.iter().copied() {
        if re.captures_read(&mut locs, text).is_some() {
            if let Some((s, e)) = locs.get(2) {
                if &text[s..e] == "42" {
                    count2 += 1;
                }
            }
        }
    }
    println!("CaptureLocations matches: {}", count2);
}
Summary
use regex::{Regex, CaptureLocations};
fn summary() {
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// β Feature β Captures β CaptureLocations β
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ€
// β Stores offsets β Yes β Yes β
// β Stores strings β Yes (references) β No β
// β Named captures β Yes β No β
// β Allocation β Per match β Reusable β
// β as_str() β Yes β Extract manually β
// β Iteration β iter() method β Manual loop β
// β Performance β Convenient β Optimal β
// βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
// Key points:
// 1. CaptureLocations stores byte offsets only
// 2. Captures adds text references and convenience methods
// 3. CaptureLocations can be reused across matches
// 4. Captures provides named capture access
// 5. Use CaptureLocations for high-volume matching
// 6. Use Captures for convenience and named captures
}Key insight: CaptureLocations and Captures represent different points on the convenience-performance spectrum. CaptureLocations is minimalβjust byte offsets in a reusable bufferβmaking it ideal for high-throughput matching where allocation overhead matters. Captures provides ergonomic access to matched text through as_str() and named capture indexing, but allocates for each match. When processing streams or performing many regex operations, reusing a single CaptureLocations buffer eliminates allocation churn. When code clarity and convenience matter more than raw performance, Captures provides a cleaner API with its iteration and named capture support.
