What is the difference between regex::Regex::captures_read and captures_read_at for incremental matching?
regex::Regex::captures_read performs incremental regex matching by reading from any source implementing std::io::Read, returning captures for content as it becomes available, while captures_read_at additionally accepts a starting byte offset that specifies where in the logical input stream the read begins, enabling matching to continue from a specific position in partial or chunked data. Both methods support streaming scenarios where the complete input isn't available in memory, but captures_read_at provides position-aware matching for resumable or offset-based parsing.
The Incremental Matching Problem
use regex::Regex;
use std::io::{self, Read};
fn incremental_matching_problem() {
// Standard regex matching requires the entire string in memory
let pattern = Regex::new(r"(\d{4})-(\d{2})-(\d{2})").unwrap();
let text = "Date: 2024-01-15 and 2024-02-20";
// This works fine for small strings
for caps in pattern.captures_iter(text) {
println!("Found: {}-{}-{}", &caps[1], &caps[2], &caps[3]);
}
// But what if we're reading from a 10GB log file?
// Or from a network stream?
// We can't load it all into memory.
//
// captures_read and captures_read_at solve this by
// matching incrementally as data arrives.
}Standard matching requires all content in memory; incremental matching works with streams.
captures_read: Basic Stream Matching
use regex::Regex;
use std::io::{self, Cursor};
fn basic_captures_read() {
let pattern = Regex::new(r"(\w+)=(\d+)").unwrap();
// Read from an in-memory cursor (implements Read)
let data = b"foo=123 bar=456 baz=789";
let mut cursor = Cursor::new(data);
// Allocate capture locations
let mut caps = pattern.capture_locations();
// Match incrementally - read from the cursor
match pattern.captures_read(&mut caps, &mut cursor) {
Ok(Some(_)) => {
// Found a match
println!("Match at {}-{}", caps.get(0).unwrap().start(), caps.get(0).unwrap().end());
// Group 1: key
println!("Key: {}", String::from_utf8_lossy(&caps[1]));
// Group 2: value
println!("Value: {}", String::from_utf8_lossy(&caps[2]));
}
Ok(None) => {
println!("No match found");
}
Err(e) => {
eprintln!("Error: {}", e);
}
}
}captures_read takes a Read source and matches as content becomes available.
captures_read_at: Position-Aware Matching
use regex::Regex;
use std::io::Cursor;
fn captures_read_at_example() {
let pattern = Regex::new(r"(\w+)=(\d+)").unwrap();
// Simulate reading from a stream where we've already processed
// some data and want to continue from a specific offset
let data = b"prefix_data foo=123 bar=456";
let mut cursor = Cursor::new(data);
// Skip "prefix_data " (12 bytes) in our logical stream
// Tell the regex engine we're starting at offset 12
let start_offset = 12;
let mut caps = pattern.capture_locations();
match pattern.captures_read_at(&mut caps, &mut cursor, start_offset) {
Ok(Some(_)) => {
// The match positions are relative to the entire logical stream
// So the match starts at position 12 + (position within cursor)
println!("Match at logical offset {}", caps.get(0).unwrap().start());
// This lets you maintain correct position tracking
// when processing chunked/stream data
}
Ok(None) => {
println!("No match found");
}
Err(e) => {
eprintln!("Error: {}", e);
}
}
}captures_read_at includes a starting offset for position tracking across chunks.
The Key Difference: Offset Semantics
use regex::Regex;
use std::io::Cursor;
fn offset_semantics() {
let pattern = Regex::new(r"data: (\w+)").unwrap();
// Scenario: We're processing a large file in chunks
// We've already processed bytes 0-999
// New chunk starts at byte 1000
let chunk = b"data: value1 more data: value2";
let mut cursor = Cursor::new(chunk);
// Without captures_read_at (captures_read):
// Match positions are relative to the chunk
let mut caps1 = pattern.capture_locations();
if let Ok(Some(_)) = pattern.captures_read(&mut caps1, &mut cursor) {
// Position is 0-based within the chunk
// Would report position 0, not 1000
println!("captures_read position: {}", caps1.get(0).unwrap().start());
}
// Reset cursor
cursor.set_position(0);
// With captures_read_at:
// Match positions are relative to the entire stream
let mut caps2 = pattern.capture_locations();
if let Ok(Some(_)) = pattern.captures_read_at(&mut caps2, &mut cursor, 1000) {
// Position includes the offset
// Would report position 1000 + (position in chunk)
println!("captures_read_at position: {}", caps2.get(0).unwrap().start());
}
}captures_read_at adjusts reported positions by the starting offset.
Reading from Files Incrementally
use regex::Regex;
use std::fs::File;
use std::io::{self, BufReader, Read};
fn file_incremental_matching() -> io::Result<()> {
let pattern = Regex::new(r"ERROR: (.+)").unwrap();
// Open a large log file
let file = File::open("large_log.txt")?;
let mut reader = BufReader::new(file);
let mut caps = pattern.capture_locations();
let mut total_bytes_read = 0usize;
// Process file incrementally
loop {
match pattern.captures_read_at(&mut caps, &mut reader, total_bytes_read as u64) {
Ok(Some(_)) => {
// Found an error message
let error_msg = &caps[1];
println!("Error at byte {}: {}", total_bytes_read, String::from_utf8_lossy(error_msg));
// Update position for next search
if let Some(m) = caps.get(0) {
total_bytes_read = m.end() as usize;
}
}
Ok(None) => {
// No more matches
break;
}
Err(e) => {
eprintln!("Read error: {}", e);
break;
}
}
}
Ok(())
}Process large files without loading everything into memory.
Network Stream Processing
use regex::Regex;
use std::io::{self, Read};
use std::net::TcpStream;
fn network_stream_processing() -> io::Result<()> {
let pattern = Regex::new(r"(\w+): (\d+)").unwrap();
// Connect to a server
let mut stream = TcpStream::connect("example.com:8080")?;
let mut caps = pattern.capture_locations();
let mut offset = 0u64;
// Process incoming data as it arrives
loop {
match pattern.captures_read_at(&mut caps, &mut stream, offset) {
Ok(Some(_)) => {
// Found a match
let key = String::from_utf8_lossy(&caps[1]);
let value = String::from_utf8_lossy(&caps[2]);
println!("{} = {}", key, value);
// Update offset for correct position tracking
if let Some(m) = caps.get(0) {
offset = m.end() as u64;
}
}
Ok(None) => {
// No match found, but stream might still have data
// This could mean we need more data or pattern didn't match
break;
}
Err(e) => {
if e.kind() == io::ErrorKind::WouldBlock {
// Non-blocking: need to wait for more data
continue;
}
eprintln!("Error: {}", e);
break;
}
}
}
Ok(())
}Match patterns against network data as it arrives.
Capture Locations vs Captures
use regex::Regex;
use std::io::Cursor;
fn capture_locations() {
let pattern = Regex::new(r"(\d+)-(\d+)-(\d+)").unwrap();
// Allocate capture locations once, reuse for efficiency
let mut caps = pattern.capture_locations();
let data = b"2024-01-15 and 2024-02-20";
let mut cursor = Cursor::new(data);
// captures_read fills the capture_locations struct
// It doesn't return a new Captures object
// This is more efficient for repeated matching
while let Ok(Some(_)) = pattern.captures_read(&mut caps, &mut cursor) {
// Access matches through the capture_locations reference
let full_match = caps.get(0).unwrap();
println!("Full match at {}-{}", full_match.start(), full_match.end());
// Access groups by index
if let Some(group1) = caps.get(1) {
println!("Year: {}-{}", group1.start(), group1.end());
}
// Or use indexing (requires extraction)
// Note: captures_read works with bytes, not strings
}
}capture_locations is a reusable buffer for capture positions, avoiding allocations.
Error Handling
use regex::Regex;
use std::io::{self, Cursor, Read};
fn error_handling() {
let pattern = Regex::new(r"test").unwrap();
// Reader that simulates an error
struct ErrorReader;
impl Read for ErrorReader {
fn read(&mut self, _buf: &mut [u8]) -> io::Result<usize> {
Err(io::Error::new(io::ErrorKind::Other, "read error"))
}
}
let mut reader = ErrorReader;
let mut caps = pattern.capture_locations();
match pattern.captures_read(&mut caps, &mut reader) {
Ok(Some(_)) => println!("Match found"),
Ok(None) => println!("No match"),
Err(e) => {
// The error from the reader is propagated
println!("Read error: {}", e);
// Handle I/O errors appropriately
// The match was interrupted
}
}
}I/O errors from the reader are propagated through captures_read.
Partial Matching Behavior
use regex::Regex;
use std::io::{self, Cursor};
fn partial_matching() {
let pattern = Regex::new(r"header: (\w+)").unwrap();
// Scenario: Data arrives in chunks
// "header: v" arrives, then "alue" arrives later
// captures_read requires enough data to find a complete match
// If the data ends mid-pattern, no match is found
let partial_data = b"header: v"; // Incomplete: "header: value"
let mut cursor = Cursor::new(partial_data);
let mut caps = pattern.capture_locations();
match pattern.captures_read(&mut caps, &mut cursor) {
Ok(None) => {
// No complete match found
// The partial data doesn't fully match the pattern
// You'd need to buffer and try again with more data
println!("Incomplete: no match with partial data");
}
Ok(Some(_)) => {
println!("Match found");
}
Err(e) => {
println!("Error: {}", e);
}
}
// For streaming, you'd need to:
// 1. Buffer incoming data
// 2. Try matching
// 3. If no match and buffer isn't full, wait for more data
// 4. If match found, process and advance
}Incomplete data may not match; buffering strategies are needed for true streaming.
Practical Chunked Processing Pattern
use regex::Regex;
use std::io::{self, Read, BufReader};
struct ChunkedMatcher<R: Read> {
reader: BufReader<R>,
pattern: Regex,
buffer: Vec<u8>,
offset: u64,
}
impl<R: Read> ChunkedMatcher<R> {
fn new(reader: R, pattern: Regex) -> Self {
ChunkedMatcher {
reader: BufReader::new(reader),
pattern,
buffer: Vec::with_capacity(8192),
offset: 0,
}
}
fn next_match(&mut self) -> io::Result<Option<Vec<(usize, usize)>>> {
let mut caps = self.pattern.capture_locations();
loop {
// Try to find a match in current buffer
let slice = &self.buffer[..];
let mut cursor = io::Cursor::new(slice);
match self.pattern.captures_read_at(&mut caps, &mut cursor, self.offset) {
Ok(Some(_)) => {
// Found a match
let positions: Vec<_> = (0..=caps.len())
.filter_map(|i| caps.get(i))
.map(|m| (m.start(), m.end()))
.collect();
// Update offset to after the match
if let Some(m) = caps.get(0) {
self.offset = m.end() as u64;
// Remove processed bytes from buffer
let end = m.end() as usize;
self.buffer.drain(..end);
}
return Ok(Some(positions));
}
Ok(None) => {
// No match, need more data
let mut buf = [0u8; 1024];
match self.reader.read(&mut buf) {
Ok(0) => return Ok(None), // EOF
Ok(n) => {
self.buffer.extend_from_slice(&buf[..n]);
}
Err(e) => return Err(e),
}
}
Err(e) => return Err(e),
}
}
}
}A practical pattern combines buffering with incremental matching for true streaming.
Performance Considerations
use regex::Regex;
use std::io::Cursor;
fn performance_notes() {
let pattern = Regex::new(r"(\w+)").unwrap();
// Reuse capture_locations for efficiency
let mut caps = pattern.capture_locations();
// This avoids allocating a new Captures object each match
// Good for tight loops or high-frequency matching
for _ in 0..1000 {
let data = b"test data here";
let mut cursor = Cursor::new(data);
// Reuse caps
if let Ok(Some(_)) = pattern.captures_read(&mut caps, &mut cursor) {
// Process match
}
}
// captures_read_at adds minimal overhead for offset tracking
// The main cost is still the regex matching itself
// For best performance:
// 1. Reuse capture_locations
// 2. Use BufReader for file/network I/O
// 3. Consider pattern optimization (lazy, atomic groups)
}Reuse capture_locations to avoid allocations in tight loops.
When to Use Each Method
use regex::Regex;
use std::io::{Cursor, Read};
fn when_to_use() {
// Use captures_read when:
// 1. Processing a complete stream from beginning
// 2. Position tracking within the stream isn't needed
// 3. Simple streaming scenarios
// Example: Processing a stream from start
let pattern = Regex::new(r"token: (\w+)").unwrap();
let data = b"token: abc token: def";
let mut cursor = Cursor::new(data);
let mut caps = pattern.capture_locations();
// No offset needed - just process from start
// positions reported are 0-based within stream
// Use captures_read_at when:
// 1. Processing data in chunks/segments
// 2. You need to track logical position across chunks
// 3. Resuming matching from a known position
// 4. Building position-aware parsers or indexers
// Example: Multi-chunk file processing
let mut file_offset = 0u64;
// ... process first chunk ...
// file_offset = end of first chunk
// When processing next chunk:
// pattern.captures_read_at(&mut caps, &mut reader, file_offset)
// positions now reflect actual file positions
}Use captures_read for simple streaming; captures_read_at for position-aware processing.
Summary Table
fn summary() {
// | Method | Offset Parameter | Use Case |
// |-------------------|-----------------|------------------------------|
// | captures_read | No | Simple streaming from start |
// | captures_read_at | Yes | Chunked/resumable processing |
// | Method | Position Reporting |
// |-------------------|------------------------------|
// | captures_read | Relative to read start |
// | captures_read_at | Relative to offset + read |
// | Return Type | Meaning |
// |-------------------|------------------------------|
// | Ok(Some(_)) | Match found |
// | Ok(None) | No match in available data |
// | Err(e) | I/O error from reader |
// | Capture Access | Notes |
// |-------------------|------------------------------|
// | caps.get(0) | Full match position |
// | caps.get(n) | Nth capture group position |
// | &caps[n] | Byte slice of capture |
}Synthesis
Quick reference:
use regex::Regex;
use std::io::Cursor;
let pattern = Regex::new(r"(\w+)=(\w+)").unwrap();
let mut caps = pattern.capture_locations();
// Simple streaming: positions relative to read start
let data = b"key=value";
let mut cursor = Cursor::new(data);
if let Ok(Some(_)) = pattern.captures_read(&mut caps, &mut cursor) {
let full_match = caps.get(0).unwrap();
println!("Match at {}-{}", full_match.start(), full_match.end());
}
// Position-aware: positions include offset
let mut cursor2 = Cursor::new(data);
if let Ok(Some(_)) = pattern.captures_read_at(&mut caps, &mut cursor2, 1000) {
let full_match = caps.get(0).unwrap();
// Position is now 1000 + (position within data)
println!("Match at logical offset {}", full_match.start());
}Key insight: Both captures_read and captures_read_at enable regex matching on streaming data via the std::io::Read trait, avoiding the need to load entire inputs into memory. The difference lies in position reporting: captures_read reports positions relative to where the reader started (effectively offset 0), while captures_read_at accepts a starting offset that's added to all reported positions. This offset parameter is crucial when processing data in chunksâyou might process the first 10KB of a file, then the next 10KB, and need all position reports to be relative to the original file, not each chunk. The captures_read_at(&mut caps, &mut reader, offset) call ensures that caps.get(0).unwrap().start() returns the correct logical position in the overall stream, not just the offset within the current chunk. Both methods return Result<Option<()>, io::Error>: Ok(Some(_)) means a match was found, Ok(None) means no match in the available data (but not necessarily EOF), and Err(_) propagates I/O errors from the reader. The capture_locations struct is allocated once and reused across matches, avoiding the allocation overhead of creating new Captures objects for each match. Use captures_read for simple streaming scenarios where you process from the beginning and position tracking isn't critical; use captures_read_at when building parsers, indexers, or tools that need to maintain accurate byte positions across chunked processing or resumable operations.
