What are the trade-offs between rayon::iter::ParallelIterator::map_init and map_with for per-thread initialization?

Both adaptors hand the closure a mutable reference (`&mut T`) to per-job state; `map_with(init, op)` is essentially shorthand for `map_init(|| init.clone(), op)`. The real trade-off is how the state is produced: map_init runs an init closure once per rayon job (no `Clone` bound; state is constructed lazily), while map_with clones a prototype value at each split point (requires `T: Clone`; convenient when you already hold the value). Choose by comparing the cost of constructing fresh state against the cost of cloning it — mutable access is available with both.

Basic map_init Usage

use rayon::prelude::*;
 
/// Demonstrates `map_init`: a job-local `String` buffer reused across items.
fn basic_map_init() {
    // Needed for `write!` on a String; scoped to this function.
    use std::fmt::Write;

    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];

    let result: Vec<String> = data
        .par_iter()
        .map_init(
            String::new,                 // Init: runs once per rayon job (roughly per thread)
            |buffer: &mut String, &x| {  // Closure: receives the job-local buffer by `&mut`
                buffer.clear();
                // Format directly into the reused buffer instead of allocating
                // an intermediate String with `format!`.
                write!(buffer, "{}^2 = {}", x, x * x).unwrap();
                buffer.clone() // one owned String per item is unavoidable here
            },
        )
        .collect();

    println!("{:?}", result);
    // Each job reuses its own String buffer across the items it processes.
}

map_init provides a mutable value, created once per rayon job (roughly once per thread), that persists across that job's items.

Basic map_with Usage

use rayon::prelude::*;
 
/// Demonstrates `map_with`: a cloned-per-split value handed to the closure.
fn basic_map_with() {
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    let config = Config { multiplier: 10 };

    let result: Vec<i32> = data
        .par_iter()
        // `map_with` clones `config` at split boundaries; the closure receives
        // the job-local copy by `&mut T` — the same closure shape as `map_init`.
        // (Annotating the parameter as `&Config` would not compile.)
        .map_with(config, |config: &mut Config, &x| x * config.multiplier)
        .collect();

    println!("{:?}", result);
}

/// Multiplier configuration distributed to workers; `map_with` requires `Clone`.
#[derive(Clone)]
struct Config {
    multiplier: i32,
}

map_with clones the init value and shares it across items within a split.

Per-Thread vs Per-Split Semantics

use rayon::prelude::*;
use std::cell::RefCell;
use std::sync::atomic::{AtomicUsize, Ordering};
 
/// Counts how often each adaptor produces fresh state.
fn semantics_comparison() {
    // map_init: the init closure runs once per rayon job — roughly once per
    // thread, but work stealing can make it run more often.
    let init_count = AtomicUsize::new(0);

    (0..1000)
        .into_par_iter()
        .map_init(
            || {
                init_count.fetch_add(1, Ordering::SeqCst);
                Vec::new()
            },
            |vec: &mut Vec<i32>, x| {
                vec.push(x);
                *vec.last().unwrap()
            },
        )
        .count();

    println!("map_init inits: {}", init_count.load(Ordering::SeqCst));
    // At least the number of busy threads; possibly more due to splits.

    // map_with: the init value is cloned at each split point instead.
    (0..1000)
        .into_par_iter()
        .map_with(CloneCounter { count: 0 }, |counter, x| x + counter.count)
        .count();
    // Each clone happens at a split boundary.
}

/// Small `Clone` type whose copies mark `map_with` split boundaries.
#[derive(Clone)]
struct CloneCounter {
    count: i32,
}

map_init runs its init closure once per rayon job; map_with clones its value once per split.

When to Use map_init

use rayon::prelude::*;
 
/// Three idiomatic `map_init` uses: reusable buffers, per-job RNGs,
/// and job-local accumulators.
fn map_init_use_cases() {
    // Use case 1: job-local buffer, reused across items (no per-item alloc).
    let data: Vec<Vec<u8>> = (0..100).map(|i| vec![0; i]).collect();

    let _sums: Vec<usize> = data
        .par_iter()
        .map_init(
            || Vec::with_capacity(100), // reusable buffer, built once per job
            |buffer: &mut Vec<u8>, chunk| {
                buffer.clear();
                buffer.extend(chunk.iter().map(|&b| b * 2));
                buffer.len()
            },
        )
        .collect();

    // Use case 2: per-job RNG (RNGs need `&mut` access).
    use rand::rngs::ThreadRng;
    use rand::Rng;

    let _random_pairs: Vec<(u32, u32)> = (0..100)
        .into_par_iter()
        .map_init(
            rand::thread_rng,
            |rng: &mut ThreadRng, _| (rng.gen(), rng.gen()),
        )
        .collect();

    // Use case 3: job-local running accumulator.
    // CAVEAT: the emitted running sums depend on how rayon splits the work,
    // so this output is NOT deterministic across runs.
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];

    let _thread_sums: Vec<i32> = data
        .par_iter()
        .map_init(
            || 0i32, // per-job sum
            |sum: &mut i32, &x| {
                *sum += x;
                *sum
            },
        )
        .collect();
}

Use map_init when you need thread-local mutable state.

When to Use map_with

use rayon::prelude::*;
 
/// Threshold/scale parameters distributed to workers via `map_with`,
/// which requires `Clone`.
#[derive(Clone)]
struct ProcessingConfig {
    threshold: f64,
    scale: f64,
}
 
/// Idiomatic `map_with` uses: distributing already-built, cheap-to-clone
/// values. Note that the closure still receives `&mut T`; we simply never
/// mutate it in these examples.
fn map_with_use_cases() {
    let config = ProcessingConfig {
        threshold: 0.5,
        scale: 2.0,
    };

    // Use case 1: configuration sharing.
    let data: Vec<f64> = (0..100).map(|i| i as f64 / 100.0).collect();

    let _processed: Vec<f64> = data
        .par_iter()
        .map_with(config.clone(), |cfg: &mut ProcessingConfig, &x| {
            if x > cfg.threshold {
                x * cfg.scale
            } else {
                x
            }
        })
        .collect();

    // Use case 2: lookup table as context.
    let lookup_table = vec![0, 1, 4, 9, 16, 25, 36, 49, 64, 81];

    // Iterate over usize so `x % table.len()` type-checks (len() is usize).
    let _indexed: Vec<i32> = (0..100usize)
        .into_par_iter()
        .map_with(lookup_table.clone(), |table: &mut Vec<i32>, x| {
            let idx = x % table.len();
            table[idx]
        })
        .collect();

    // Use case 3: connection configuration (treated as read-only).
    let db_config = DbConfig { url: "localhost".to_string() };

    let _results: Vec<String> = (0..10)
        .into_par_iter()
        .map_with(db_config, |cfg: &mut DbConfig, id| {
            format!("{}: {}", cfg.url, id)
        })
        .collect();
}
 
/// Connection settings cloned per split by `map_with`; cheap to clone
/// (one `String`).
#[derive(Clone)]
struct DbConfig {
    url: String,
}

Use map_with when you already hold a cheap-to-clone value to distribute; the closure still receives it by `&mut`.

Mutable State: Available in Both

use rayon::prelude::*;
 
fn mutable_state_difference() {
    // map_init: Thread-local MUTABLE state
    let counts: Vec<i32> = (0..100)
        .into_par_iter()
        .map_init(
            || std::collections::HashMap::new(),
            |counts: &mut HashMap<i32, i32>, x| {
                *counts.entry(x % 10).or_insert(0) += 1;
                counts.len()
            }
        )
        .collect();
    
    // map_with: Immutable access only (shared reference)
    // This would NOT compile:
    // .map_with(HashMap::new(), |counts: &mut HashMap<...>, x| {
    //     counts.insert(x, 1);  // Error: cannot mutate shared reference
    // })
    
    // map_with only gives &T, not &mut T
}

Both adaptors provide `&mut T`; they differ in how the state is produced — an init closure per job (map_init) versus `Clone` per split (map_with).

Clone Cost Comparison

use rayon::prelude::*;
 
/// A type whose derived `Clone` copies a 1 MB buffer — expensive at every
/// `map_with` split boundary.
#[derive(Clone)]
struct ExpensiveToClone {
    data: Vec<u8>,
}

impl ExpensiveToClone {
    /// Allocates the 1 MB backing buffer.
    fn new() -> Self {
        Self { data: vec![0; 1_000_000] }
    }
}
 
/// Contrasts per-split clone cost (`map_with`) with per-job construction
/// cost (`map_init`) for an expensive 1 MB value.
fn clone_cost_comparison() {
    let data: Vec<i32> = (0..10_000).collect();

    // map_with: clones ExpensiveToClone (a 1 MB memcpy) at every split.
    // With deep work-stealing splits this can happen far more often than
    // there are threads.
    let expensive = ExpensiveToClone::new();

    let _result: Vec<i32> = data
        .par_iter()
        .map_with(expensive, |exp: &mut ExpensiveToClone, &x| {
            x + exp.data.len() as i32 // read-only use of the state
        })
        .collect();

    // map_init: runs the init closure once per rayon job instead of cloning.
    // Here construction costs about the same as a clone; map_init wins when
    // building fresh state is cheaper than copying existing state, or when
    // the type has no Clone impl at all.
    let _result: Vec<i32> = data
        .par_iter()
        .map_init(
            ExpensiveToClone::new,
            |exp: &mut ExpensiveToClone, &x| x + exp.data.len() as i32,
        )
        .collect();
}

For expensive-to-clone types, map_init may be more efficient.

Initialization Frequency

use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};
 
fn init_frequency() {
    let init_calls = AtomicUsize::new(0);
    let clone_calls = AtomicUsize::new(0);
    
    // Track how often initialization happens
    struct Tracker {
        id: usize,
    }
    
    impl Clone for Tracker {
        fn clone(&mut self) -> Self {
            clone_calls.fetch_add(1, Ordering::SeqCst);
            Tracker { id: self.id }
        }
    }
    
    // map_init: One init per thread
    let result: Vec<_> = (0..1000)
        .into_par_iter()
        .map_init(
            || Tracker { id: init_calls.fetch_add(1, Ordering::SeqCst) },
            |tracker, x| (tracker.id, x)
        )
        .collect();
    
    let init_count = init_calls.load(Ordering::SeqCst);
    println!("map_init initializations: {}", init_count);
    // Typically equals thread pool size
    
    // map_with: Clone at each split
    init_calls.store(0, Ordering::SeqCst);
    clone_calls.store(0, Ordering::SeqCst);
    
    let result: Vec<_> = (0..1000)
        .into_par_iter()
        .map_with(Tracker { id: 0 }, |tracker, x| (tracker.id, x))
        .collect();
    
    println!("map_with clones: {}", clone_calls.load(Ordering::SeqCst));
    // Can be higher than thread pool size due to work stealing splits
}

map_init initializes fewer times; map_with clones at each split.

Thread-Local Storage Pattern

use rayon::prelude::*;
use std::cell::RefCell;
 
fn thread_local_pattern() {
    // map_init is ideal for thread-local patterns
    let result: Vec<i32> = (0..1000)
        .into_par_iter()
        .map_init(
            || {
                // This runs ONCE per thread
                RefCell::new(Vec::with_capacity(100))
            },
            |buffer: &RefCell<Vec<i32>>, x| {
                // Reuse the buffer for each item
                let mut b = buffer.borrow_mut();
                b.clear();
                b.push(x);
                b.push(x * 2);
                b.iter().sum()
            }
        )
        .collect();
    
    // Contrast with allocating inside map:
    let result: Vec<i32> = (0..1000)
        .into_par_iter()
        .map(|x| {
            // Allocates NEW Vec for each item
            let mut buffer = Vec::with_capacity(100);
            buffer.push(x);
            buffer.push(x * 2);
            buffer.iter().sum()
        })
        .collect();
}

map_init avoids allocation per item by reusing thread-local storage.

Accumulator Pattern

use rayon::prelude::*;
 
fn accumulator_pattern() {
    // Use map_init for thread-local accumulation
    let data: Vec<i32> = (0..1000).collect();
    
    // Each thread maintains its own sum
    let partial_sums: Vec<i32> = data
        .par_iter()
        .fold(
            || 0i32,  // Per-thread initial value
            |sum: &mut i32, &x| {
                *sum += x;
                *sum
            }
        )
        .collect();
    
    // map_init can also track thread-local state
    let with_state: Vec<(i32, usize)> = data
        .par_iter()
        .map_init(
            || (0i32, 0usize),  // (sum, count) per thread
            |state: &mut (i32, usize), &x| {
                state.0 += x;
                state.1 += 1;
                (*state.0, *state.1)  // Return current thread totals
            }
        )
        .collect();
    
    // This shows per-thread running state
}

Use map_init (or fold) for thread-local accumulation.

Configuration Distribution Pattern

use rayon::prelude::*;
 
/// Application settings distributed to workers via `map_with`; cloning
/// copies one `String` plus two scalars.
#[derive(Clone)]
struct AppConfig {
    output_dir: String,
    max_items: usize,
    verbose: bool,
}
 
/// Distributing a configuration value to parallel workers with `map_with`.
fn config_distribution() {
    let config = AppConfig {
        output_dir: "/tmp/output".to_string(),
        max_items: 100,
        verbose: true,
    };

    let items: Vec<i32> = (0..1000).collect();

    // map_with clones the config at split boundaries; within a split every
    // item sees the same job-local copy, handed to the closure as `&mut T`.
    let _processed: Vec<String> = items
        .par_iter()
        .map_with(config.clone(), |cfg: &mut AppConfig, &item| {
            if cfg.verbose {
                // Output from worker threads interleaves non-deterministically.
                println!("Processing item {}", item);
            }
            if item < cfg.max_items as i32 {
                format!("{}/item_{}.txt", cfg.output_dir, item)
            } else {
                String::new()
            }
        })
        .collect();
}

map_with is natural for distributing configuration.

RNG State Pattern

use rayon::prelude::*;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
 
/// Per-job RNG state with map_init, plus caveats about reproducibility
/// and about map_with.
fn rng_pattern() {
    // map_init for a per-job RNG: each rayon job builds an independent
    // generator and mutates it across its items.
    let _random_values: Vec<u64> = (0..100)
        .into_par_iter()
        .map_init(StdRng::from_entropy, |rng: &mut StdRng, _| rng.gen())
        .collect();

    // CAVEAT: seeding every job with the same seed does NOT make the output
    // reproducible — which items land in which job depends on runtime work
    // splitting. For true reproducibility, derive each value from its index
    // (e.g. seed a fresh RNG per item from the item's index).
    let seed = 42u64;
    let _not_actually_reproducible: Vec<u64> = (0..100)
        .into_par_iter()
        .map_init(
            || StdRng::seed_from_u64(seed),
            |rng: &mut StdRng, _| rng.gen(),
        )
        .collect();

    // map_with also works for RNGs: it too passes `&mut StdRng` to the
    // closure, so `rng.gen()` compiles. It requires the RNG to be Clone,
    // and each split DUPLICATES the RNG state — usually not what you want,
    // which is why map_init is preferred here.
}

RNGs need `&mut` access, which both adaptors provide; prefer map_init so RNG state is never duplicated by cloning.

Comparison with fold

use rayon::prelude::*;
 
/// Contrasts rayon's `fold` (reduction) with `map_init` (stateful mapping).
fn compare_with_fold() {
    let data: Vec<i32> = (0..100).collect();

    // fold: per-job accumulation, reduced at the end. Rayon's fold closure
    // takes the accumulator by value and returns the updated accumulator.
    let _sum: i32 = data
        .par_iter()
        .fold(|| 0, |acc, &x| acc + x)
        .sum::<i32>(); // final reduction across per-job partial sums

    // map_init: per-job mutable state, but one output value PER ITEM.
    // NOTE: `idx` counts items within a job, not a global position.
    let _indexed: Vec<(usize, i32)> = data
        .par_iter()
        .map_init(
            || 0usize,
            |idx: &mut usize, &x| {
                *idx += 1;
                (*idx, x)
            },
        )
        .collect();

    // fold combines into one result per job; map_init maps every item while
    // threading job-local state through.
}

fold is for reduction; map_init is for mapping with state.

Performance Characteristics

use rayon::prelude::*;
 
/// Cost model for choosing between map_init, map_with, and plain map.
fn performance_comparison() {
    let _data: Vec<i32> = (0..1_000_000).collect();

    // map_init: runs the init closure once per rayon job.
    // - Setup cost: one construction per job (roughly per thread)
    // - Per-item overhead: just `&mut` state access
    // - Best for: state that is expensive to clone, or has no Clone impl
    //
    // map_with: clones the prototype value at each split.
    // - Setup cost: one Clone per split (can exceed the thread count)
    // - Per-item overhead: just `&mut` state access
    // - Best for: pre-built values that are cheap to clone
    //
    // map (no state): any setup happens inside the closure, once per item.
    //
    // IMPORTANT: both map_init and map_with give the closure `&mut T`.
    // Mutability is NOT a differentiator — only how the state is produced.
    //
    // Scenario 1: expensive to construct, cheap to clone (e.g. Arc-backed)
    //   -> map_with: clone the value instead of rebuilding it per job
    // Scenario 2: expensive to clone, or no Clone impl
    //   -> map_init: rebuild (or construct lazily) instead of copying
    // Scenario 3: stateless per-item work
    //   -> plain map
}

Choose based on whether constructing fresh state or cloning existing state is cheaper; both adaptors support mutable state.

API Signature Comparison

use rayon::prelude::*;
 
// map_init signature (simplified):
// fn map_init<OP, INIT, T, R>(self, init: INIT, op: OP) -> MapInit<Self, INIT, OP>
// where
//     OP: Fn(&mut T, Self::Item) -> R + Sync + Send,
//     INIT: Fn() -> T + Sync + Send,
//
// - init: called once per rayon job (roughly once per thread)
// - op: receives &mut T
// - T: no Clone bound — anything the init closure can construct
 
// map_with signature (simplified):
// fn map_with<T, OP, R>(self, init: T, op: OP) -> MapWith<Self, T, OP>
// where
//     OP: Fn(&mut T, Self::Item) -> R + Sync + Send,
//     T: Clone + Send,
//
// - init: cloned at each split
// - op: ALSO receives &mut T — the closure shape is identical to map_init's
// - T: must implement Clone; that bound is the real difference
 
/// Shows that both adaptors hand the closure `&mut T`; the distinguishing
/// requirement is map_with's `T: Clone` bound.
fn signature_demonstration() {
    let data: Vec<i32> = (0..10).collect();

    // map_init: mutable job-local state built by a closure (no Clone bound).
    let _: Vec<usize> = data
        .par_iter()
        .map_init(
            Vec::new,
            |vec: &mut Vec<i32>, &x| {
                vec.push(x);  // mutation is fine
                vec.len()     // usize, so collect into Vec<usize>
            },
        )
        .collect();

    // map_with: mutation is equally fine — the closure also receives `&mut T`.
    let counter = Counter { count: 0 };
    let _: Vec<i32> = data
        .par_iter()
        .map_with(counter, |counter: &mut Counter, &x| {
            counter.count += 1; // compiles: map_with passes &mut Counter
            x + counter.count
        })
        .collect();
}
 
/// Minimal `Clone` type used to satisfy map_with's `T: Clone` bound.
#[derive(Clone)]
struct Counter {
    count: i32,
}

The signatures reveal the key difference: a `T: Clone` bound with per-split cloning (map_with) versus a per-job init closure with no Clone bound (map_init) — the closure receives `&mut T` in both.

Summary Table

use rayon::prelude::*;
 
fn summary() {
    // | Feature               | map_init                  | map_with                   |
    // |-----------------------|---------------------------|----------------------------|
    // | Closure state access  | &mut T                    | &mut T (identical)         |
    // | State produced by     | init closure, per job     | Clone::clone, per split    |
    // | Clone requirement     | None                      | T: Clone                   |
    // | Per-item overhead     | Low (state access)        | Low (state access)         |
    // | Use case              | Buffers, RNGs, accum.     | Pre-built cheap-clone vals |
    // | Isolation             | Per-job value             | Per-split cloned value     |
    // | Expensive init        | Paid once per job         | Paid once, then clones     |
    // | Expensive clone       | Avoided entirely          | Paid at every split        |
}

Choose based on whether cloning or reconstructing the state is cheaper.

Synthesis

Quick reference:

use rayon::prelude::*;
 
// Use map_init when:
// 1. The state has no (cheap) Clone impl
// 2. Building fresh state per job is cheaper than cloning it
// 3. You want the state constructed lazily, by a closure

(0..1000).into_par_iter()
    .map_init(
        || MyBuffer::new(),        // constructed once per rayon job
        |buf: &mut MyBuffer, x| {  // mutable access to job-local state
            buf.process(x)
        }
    )
    .collect::<Vec<_>>();

// Use map_with when:
// 1. You already hold the value
// 2. Cloning it is cheap
// 3. You prefer the slightly shorter call

let config = MyConfig::default();
(0..1000).into_par_iter()
    .map_with(config, |cfg: &mut MyConfig, x| {  // also `&mut` access
        cfg.apply(x)
    })
    .collect::<Vec<_>>();

Decision tree:

// Already have the value in hand, and Clone is cheap?
//   Yes -> map_with (shorter; clones at each split)
//   No  -> map_init (builds fresh state per job; no Clone bound)
// Both pass the closure &mut T, so mutable state works with either.

Key insight: map_init and map_with are two spellings of the same mechanism — both hand the closure a `&mut T` to per-job state, and `map_with(v, op)` behaves like `map_init(|| v.clone(), op)`. They differ only in how that state is produced. map_init runs an init closure once per rayon job (roughly once per thread, though work stealing can trigger more); there is no `Clone` bound, and the state is constructed lazily — ideal for buffers, RNGs, and accumulators. map_with starts from a value you already hold and clones it at each split boundary, which requires `T: Clone` and pays the clone cost at every split — convenient for pre-built configuration or lookup tables that are cheap to copy. The performance trade-off therefore hinges on construction versus clone cost: when cloning is cheaper than rebuilding (e.g. an `Arc`-backed table), map_with wins; when the state is expensive to clone, or has no `Clone` impl at all, map_init wins. In both cases each split's state is isolated — no synchronization is needed, because no two concurrent jobs ever share the same `T`.