What are the trade-offs between rayon::iter::ParallelIterator::map_init and map_with for per-thread initialization?
map_init calls an initialization closure to create a fresh value wherever the thread pool needs one (roughly once per worker thread/job), ideal for state that is cheap to construct but expensive or meaningless to clone, such as buffers or RNGs. map_with starts from an existing value and clones it at each split boundary — suitable when you already hold a value whose Clone is cheap, such as configuration or a lookup table. Both variants hand the closure `&mut T`; map_with is essentially `map_init(move || value.clone(), op)`. The key trade-off is between running the init closure per thread/job (map_init) versus paying Clone at each split (map_with), plus map_with's requirement that the state type implement Clone.
Basic map_init Usage
use rayon::prelude::*;
/// Demonstrates `map_init`: a per-thread reusable buffer that persists
/// across items, avoiding one allocation per item.
fn basic_map_init() {
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let result: Vec<String> = data
        .par_iter()
        .map_init(
            String::new, // Init: called once per thread/job as needed
            |buffer: &mut String, &x| {
                // Closure receives a mutable, reusable buffer
                use std::fmt::Write;
                buffer.clear();
                // write! appends in place — no intermediate String from format!
                let _ = write!(buffer, "{}^2 = {}", x, x * x);
                buffer.clone()
            },
        )
        .collect();
    println!("{:?}", result);
    // Each thread gets its own String buffer, reused across items
}map_init provides a thread-local mutable value that persists across invocations.
Basic map_with Usage
use rayon::prelude::*;
/// Demonstrates `map_with`: an existing value cloned per split and reused
/// for every item within that split. Note the closure receives `&mut Config`
/// (rayon's map_with signature is `Fn(&mut T, Item) -> R`), not `&Config`.
fn basic_map_with() {
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let config = Config { multiplier: 10 };
    let result: Vec<i32> = data
        .par_iter()
        // config is cloned at each split boundary, then reused per item
        .map_with(config, |config: &mut Config, &x| x * config.multiplier)
        .collect();
    println!("{:?}", result);
}
#[derive(Clone)]
struct Config {
    multiplier: i32,
}map_with clones the init value and reuses it across items within a split.
Per-Thread vs Per-Split Semantics
use rayon::prelude::*;
use std::cell::RefCell;
use std::sync::atomic::{AtomicUsize, Ordering};
fn semantics_comparison() {
// map_init: init called once per thread
let init_count = AtomicUsize::new(0);
(0..1000).into_par_iter()
.map_init(
|| {
init_count.fetch_add(1, Ordering::SeqCst);
Vec::new()
},
|vec: &mut Vec<i32>, x| {
vec.push(x);
*vec.last().unwrap()
}
)
.count();
println!("map_init inits: {}", init_count.load(Ordering::SeqCst));
// Approximately equals number of threads (e.g., 8 on 8-core CPU)
// map_with: init cloned once per split
let mut clone_count = 0;
let counter = CloneCounter { count: 0 };
(0..1000).into_par_iter()
.map_with(CloneCounter { count: 0 }, |counter, x| {
x + counter.count
})
.count();
// Clone happens at each split point
}
#[derive(Clone)]
struct CloneCounter {
count: i32,
}map_init initializes per thread; map_with clones per split.
When to Use map_init
use rayon::prelude::*;
/// Canonical map_init use cases: reusable buffers, per-thread RNGs, and
/// per-thread accumulators — state constructed fresh rather than cloned.
fn map_init_use_cases() {
    // Use case 1: thread-local buffers (avoid an allocation per item)
    let data: Vec<Vec<u8>> = (0..100).map(|i| vec![0; i]).collect();
    let _sums: Vec<usize> = data
        .par_iter()
        .map_init(
            || Vec::with_capacity(100), // Reusable buffer per thread
            |buffer: &mut Vec<u8>, chunk| {
                buffer.clear();
                buffer.extend(chunk.iter().map(|&b| b * 2));
                buffer.len()
            },
        )
        .collect();
    // Use case 2: thread-local RNG (RNG state mutates on every draw)
    use rand::rngs::ThreadRng;
    use rand::Rng;
    let _random_pairs: Vec<(u32, u32)> = (0..100)
        .into_par_iter()
        .map_init(
            rand::thread_rng, // one RNG per thread/job
            |rng: &mut ThreadRng, _| (rng.gen(), rng.gen()),
        )
        .collect();
    // Use case 3: thread-local accumulators (running per-thread totals)
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let _thread_sums: Vec<i32> = data
        .par_iter()
        .map_init(
            || 0i32, // Thread-local sum
            |sum: &mut i32, &x| {
                *sum += x;
                *sum
            },
        )
        .collect();
}Use map_init when thread-local state should be constructed fresh rather than cloned.
When to Use map_with
use rayon::prelude::*;
#[derive(Clone)]
struct ProcessingConfig {
    threshold: f64,
    scale: f64,
}
/// Canonical map_with use cases: cheaply cloneable values you already hold
/// (config, lookup tables). The closure receives `&mut T`; these examples
/// only read the state, which is the typical map_with pattern.
fn map_with_use_cases() {
    let config = ProcessingConfig {
        threshold: 0.5,
        scale: 2.0,
    };
    // Use case 1: configuration sharing (cheap Clone, used read-only)
    let data: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();
    let _processed: Vec<f64> = data
        .par_iter()
        .map_with(config.clone(), |cfg: &mut ProcessingConfig, &x| {
            if x > cfg.threshold {
                x * cfg.scale
            } else {
                x
            }
        })
        .collect();
    // Use case 2: read-only context (lookup table cloned per split)
    let lookup_table = vec![0, 1, 4, 9, 16, 25, 36, 49, 64, 81];
    let _indexed: Vec<i32> = (0..100)
        .into_par_iter()
        .map_with(lookup_table.clone(), |table: &mut Vec<i32>, x| {
            // items are i32 — cast before using as an index
            let idx = x as usize % table.len();
            table[idx]
        })
        .collect();
    // Use case 3: database connection config (used immutably by convention)
    let db_config = DbConfig { url: "localhost".to_string() };
    let _results: Vec<String> = (0..10)
        .into_par_iter()
        .map_with(db_config, |cfg: &mut DbConfig, id| {
            // The closure gets `&mut DbConfig`; this example only reads it
            format!("{}: {}", cfg.url, id)
        })
        .collect();
}
#[derive(Clone)]
struct DbConfig {
    url: String,
}Use map_with when you already hold a cheaply cloneable value to distribute per split.
Mutable State: map_init Only
use rayon::prelude::*;
fn mutable_state_difference() {
// map_init: Thread-local MUTABLE state
let counts: Vec<i32> = (0..100)
.into_par_iter()
.map_init(
|| std::collections::HashMap::new(),
|counts: &mut HashMap<i32, i32>, x| {
*counts.entry(x % 10).or_insert(0) += 1;
counts.len()
}
)
.collect();
// map_with: Immutable access only (shared reference)
// This would NOT compile:
// .map_with(HashMap::new(), |counts: &mut HashMap<...>, x| {
// counts.insert(x, 1); // Error: cannot mutate shared reference
// })
// map_with only gives &T, not &mut T
}map_init provides &mut T; map_with only provides &T.
Clone Cost Comparison
use rayon::prelude::*;
#[derive(Clone)]
struct ExpensiveToClone {
    data: Vec<u8>,
}
impl ExpensiveToClone {
    /// Builds a 1 MB buffer — deliberately costly to clone.
    fn new() -> Self {
        Self { data: vec![0; 1_000_000] }
    }
}
/// Shows the cost asymmetry: map_with pays Clone at every split (work
/// stealing can make that many 1 MB copies), while map_init constructs
/// once per thread/job and never clones.
fn clone_cost_comparison() {
    let data: Vec<i32> = (0..10_000).collect();
    // map_with: clones ExpensiveToClone at each split boundary
    let expensive = ExpensiveToClone::new();
    let _result: Vec<i32> = data
        .par_iter()
        .map_with(expensive, |exp: &mut ExpensiveToClone, &x| {
            x + exp.data.len() as i32 // only reads the buffer
        })
        .collect();
    // map_init: constructs once per thread/job, never clones
    let _result: Vec<i32> = data
        .par_iter()
        .map_init(
            ExpensiveToClone::new,
            |exp: &mut ExpensiveToClone, &x| x + exp.data.len() as i32,
        )
        .collect();
}For expensive-to-clone types, map_init avoids repeated Clone calls.
Initialization Frequency
use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};
// Module-scope counters: a nested `impl` cannot capture function locals,
// so Tracker::clone records into statics instead.
static INIT_CALLS: AtomicUsize = AtomicUsize::new(0);
static TRACKER_CLONES: AtomicUsize = AtomicUsize::new(0);
struct Tracker {
    id: usize,
}
impl Clone for Tracker {
    // Note: Clone::clone takes `&self` — `&mut self` would not implement the trait
    fn clone(&self) -> Self {
        TRACKER_CLONES.fetch_add(1, Ordering::SeqCst);
        Tracker { id: self.id }
    }
}
/// Measures how many times each variant creates state for the same workload.
fn init_frequency() {
    // map_init: one init per thread/job
    INIT_CALLS.store(0, Ordering::SeqCst);
    let _result: Vec<_> = (0..1000)
        .into_par_iter()
        .map_init(
            || Tracker { id: INIT_CALLS.fetch_add(1, Ordering::SeqCst) },
            |tracker, x| (tracker.id, x),
        )
        .collect();
    println!("map_init initializations: {}", INIT_CALLS.load(Ordering::SeqCst));
    // Typically close to the thread-pool size
    // map_with: one clone at each split
    TRACKER_CLONES.store(0, Ordering::SeqCst);
    let _result: Vec<_> = (0..1000)
        .into_par_iter()
        .map_with(Tracker { id: 0 }, |tracker, x| (tracker.id, x))
        .collect();
    println!("map_with clones: {}", TRACKER_CLONES.load(Ordering::SeqCst));
    // Can exceed the thread-pool size: work stealing causes extra splits
}map_init initializes per thread/job; map_with clones at each split.
Thread-Local Storage Pattern
use rayon::prelude::*;
use std::cell::RefCell;
/// Thread-local buffer reuse with map_init. The closure already receives
/// `&mut T`, so no RefCell (interior mutability) is needed — the original
/// `&RefCell<...>` annotation would not even match map_init's signature.
fn thread_local_pattern() {
    let _result: Vec<i32> = (0..1000)
        .into_par_iter()
        .map_init(
            || Vec::with_capacity(100), // runs once per thread/job
            |buffer: &mut Vec<i32>, x| {
                // Reuse the buffer for each item
                buffer.clear();
                buffer.push(x);
                buffer.push(x * 2);
                buffer.iter().sum()
            },
        )
        .collect();
    // Contrast with allocating inside map:
    let _result: Vec<i32> = (0..1000)
        .into_par_iter()
        .map(|x| {
            // Allocates a NEW Vec for every item
            let mut buffer = Vec::with_capacity(100);
            buffer.push(x);
            buffer.push(x * 2);
            buffer.iter().sum()
        })
        .collect();
}map_init avoids a per-item allocation by reusing thread-local storage.
Accumulator Pattern
use rayon::prelude::*;
fn accumulator_pattern() {
// Use map_init for thread-local accumulation
let data: Vec<i32> = (0..1000).collect();
// Each thread maintains its own sum
let partial_sums: Vec<i32> = data
.par_iter()
.fold(
|| 0i32, // Per-thread initial value
|sum: &mut i32, &x| {
*sum += x;
*sum
}
)
.collect();
// map_init can also track thread-local state
let with_state: Vec<(i32, usize)> = data
.par_iter()
.map_init(
|| (0i32, 0usize), // (sum, count) per thread
|state: &mut (i32, usize), &x| {
state.0 += x;
state.1 += 1;
(*state.0, *state.1) // Return current thread totals
}
)
.collect();
// This shows per-thread running state
}Use map_init (or fold) for thread-local accumulation.
Configuration Distribution Pattern
use rayon::prelude::*;
#[derive(Clone)]
struct AppConfig {
    output_dir: String,
    max_items: usize,
    verbose: bool,
}
/// Distributes configuration with map_with: cloned at split boundaries,
/// then reused by every item within a split.
fn config_distribution() {
    let config = AppConfig {
        output_dir: "/tmp/output".to_string(),
        max_items: 100,
        verbose: true,
    };
    let items: Vec<i32> = (0..1000).collect();
    let _processed: Vec<String> = items
        .par_iter()
        .map_with(config.clone(), |cfg: &mut AppConfig, &item| {
            if cfg.verbose {
                println!("Processing item {}", item);
            }
            if item < cfg.max_items as i32 {
                format!("{}/item_{}.txt", cfg.output_dir, item)
            } else {
                String::new()
            }
        })
        .collect();
    // config is cloned at split boundaries; within a split, every item
    // sees the same (mutable, but here only read) config value
}map_with is a natural fit for distributing configuration.
RNG State Pattern
use rayon::prelude::*;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
fn rng_pattern() {
// map_init for thread-local RNG (each thread gets independent RNG)
let random_values: Vec<u64> = (0..100)
.into_par_iter()
.map_init(
|| StdRng::from_entropy(),
|rng: &mut StdRng, _| {
rng.gen()
}
)
.collect();
// If you want reproducible randomness:
let seed = 42u64;
let reproducible: Vec<u64> = (0..100)
.into_par_iter()
.enumerate()
.map_init(
|| StdRng::seed_from_u64(seed),
|rng: &mut StdRng, (i, _)| {
rng.gen()
}
)
.collect();
// map_with would not work here - RNG needs mutable state
// This would NOT compile:
// .map_with(StdRng::from_entropy(), |rng: &StdRng, _| {
// rng.gen() // Cannot call &mut methods on shared reference
// })
}RNGs require mutable state; use map_init.
Comparison with fold
use rayon::prelude::*;
/// fold vs map_init. rayon's fold closure is `Fn(T, Item) -> T` — it takes
/// and returns the accumulator by value; the original `&mut i32` form does
/// not match that signature.
fn compare_with_fold() {
    let data: Vec<i32> = (0..100).collect();
    // fold: per-split accumulator, combined at the end
    let _sum: i32 = data
        .par_iter()
        .fold(|| 0, |acc, &x| acc + x)
        .sum::<i32>(); // final reduction across splits
    // map_init: thread-local state, one output value PER ITEM
    let _indexed: Vec<(usize, i32)> = data
        .par_iter()
        .map_init(
            || 0usize,
            |idx: &mut usize, &x| {
                *idx += 1;
                // NOTE: `*idx` is a per-thread counter, not a global index —
                // it restarts with each thread's fresh state
                (*idx, x)
            },
        )
        .collect();
    // fold combines into one result per split; map_init maps every item
}fold is for reduction; map_init is for mapping with reusable state.
Performance Characteristics
use rayon::prelude::*;
/// Prose-only comparison of per-variant costs (no executable benchmark).
fn performance_comparison() {
    let _data: Vec<i32> = (0..1_000_000).collect();
    // map_init: construct once per thread/job, mutate per item
    // - Initialization: roughly #threads calls of the init closure
    // - Per-item overhead: just state access
    // - Best for: expensive construction, or state types that are not Clone
    // map_with: clone per split, mutate per item
    // - Cloning: at split boundaries (work stealing can add more splits)
    // - Per-item overhead: just state access
    // - Best for: cheap Clone of a value you already hold
    // map (no state): fresh work per item
    // - Initialization: every item
    // - Best for: no reusable state needed
    // Scenario 1: expensive construction -> map_init (one init per thread/job)
    // Scenario 2: cheap Clone of an existing value -> map_with (simpler API)
    // Scenario 3: non-Clone state (some RNGs, handles) -> map_init (map_with
    //             requires T: Clone)
}Choose based on construction cost and whether the state implements Clone.
API Signature Comparison
use rayon::prelude::*;
// map_init signature:
// fn map_init<OP, INIT, T, R>(self, init: INIT, op: OP) -> MapInit<Self, INIT, OP>
// where
// OP: Fn(&mut T, Item) -> R + Sync,
// INIT: Fn() -> T + Sync,
//
// - init: Called once per thread
// - op: Receives &mut T, can mutate state
// - T: Can be any type, mutable within thread
// map_with signature:
// fn map_with<T, OP, R>(self, init: T, op: OP) -> MapWith<Self, T, OP>
// where
//     OP: Fn(&mut T, Self::Item) -> R + Sync + Send,
//     T: Send + Clone,
//
// - init: cloned at each split
// - op: receives &mut T — the same mutable access as map_init
// - T: must implement Clone (and Send)
fn signature_demonstration() {
let data: Vec<i32> = (0..10).collect();
// map_init: mutable state allowed
let _: Vec<i32> = data
.par_iter()
.map_init(
|| Vec::new(),
|vec: &mut Vec<i32>, &x| {
vec.push(x); // Mutable!
vec.len()
}
)
.collect();
// map_with: only shared reference
let counter = Counter { count: 0 };
let _: Vec<i32> = data
.par_iter()
.map_with(counter, |counter: &Counter, &x| {
x + counter.count // Read-only!
})
.collect();
}
#[derive(Clone)]
struct Counter {
count: i32,
}The signatures reveal the key difference: &mut T vs &T.
Summary Table
use rayon::prelude::*;
/// Summary table (comments only; corrected: both variants pass &mut T).
fn summary() {
    // | Feature               | map_init              | map_with              |
    // |-----------------------|-----------------------|-----------------------|
    // | State access in op    | &mut T                | &mut T                |
    // | State origin          | init closure          | cloned starting value |
    // | Initialization        | Once per thread/job   | Clone per split       |
    // | Clone requirement     | None                  | Must impl Clone       |
    // | Per-item overhead     | Low (state access)    | Low (state access)    |
    // | Use case              | Buffers, RNG, accum   | Config, lookup tables |
    // | Non-Clone state types | Supported             | Not supported         |
    // | Expensive-init types  | Efficient (few inits) | May clone repeatedly  |
}Choose based on how the state is created and its Clone cost.
Synthesis
Quick reference:
use rayon::prelude::*;
// Use map_init when:
// 1. You need mutable thread-local state
// 2. Initialization is expensive
// 3. State should persist across items in thread
(0..1000).into_par_iter()
.map_init(
|| MyBuffer::new(), // Once per thread
|buf: &mut MyBuffer, x| { // Mutable access
buf.process(x)
}
)
.collect::<Vec<_>>();
// Use map_with when:
// 1. You already hold the starting value
// 2. Clone is cheap
// 3. You want the simpler syntax
let config = MyConfig::default();
(0..1000).into_par_iter()
    .map_with(config, |cfg: &mut MyConfig, x| {
        cfg.apply(x) // map_with also passes &mut; here the value is only read
    })
    .collect::<Vec<_>>();Decision tree:
// Is the state not Clone, or expensive to construct/clone?
//   Yes -> map_init (builds fresh state per thread/job)
//   No  -> Do you already hold a cheaply cloneable value?
//     Yes -> map_with (simpler; clones it per split)
//     No  -> map_init (construct it in the init closure)Key insight: map_init and map_with serve overlapping purposes in Rayon's parallel iteration model, and both hand the closure `&mut T` — map_with is essentially `map_init(move || value.clone(), op)`. map_init calls an initialization closure wherever the pool needs a fresh value (roughly once per thread or rayon job), making it the right tool for state that is expensive to build, does not implement Clone, or must start fresh per thread: buffers, RNGs, accumulators. map_with takes a pre-existing value and clones it at each split boundary (when work is divided between threads or stolen), which is ideal for configuration, lookup tables, and read-only context whose Clone is cheap. The performance trade-off hinges on creation cost: map_init pays the init closure per thread/job (typically a handful of calls), while map_with pays Clone at each split (potentially many more calls under work stealing). For expensive Clone, map_init wins; for a cheap-to-clone value you already hold, map_with is the simpler API. One semantic caution: because map_with clones a single starting state, every split begins from identical state — for stateful values like seeded RNGs this yields overlapping sequences, which is why per-thread construction via map_init is usually preferred for anything whose history matters.
