Loading page…
Rust walkthroughs
Loading page…
rayon::ThreadPoolBuilder::num_threads control parallelism granularity for CPU-bound tasks?rayon::ThreadPoolBuilder::num_threads sets the number of worker threads in a Rayon thread pool, determining the maximum degree of parallelism for CPU-bound workloads — setting num_threads(4) creates exactly four threads that execute parallel iterators, regardless of available CPU cores. The default uses num_cpus::get() to match physical cores, but explicit configuration enables tuning for specific scenarios: fewer threads than cores reduces contention and improves cache locality, while more threads than cores can help when threads block on I/O or synchronization. The choice affects work-stealing efficiency, memory bandwidth saturation, and the granularity at which tasks split across threads — finer parallelism with more threads creates smaller work chunks but increases scheduling overhead, while coarser parallelism with fewer threads reduces overhead but may leave cores idle.
use rayon::prelude::*;
fn main() {
    // Default thread pool uses all available CPUs.
    let values: Vec<i32> = (0..1_000_000).collect();

    // The global pool is sized with num_cpus::get() unless configured otherwise.
    let doubled = values.par_iter().map(|&v| v * 2);
    let sum: i32 = doubled.sum();

    println!("Sum: {}", sum);
    println!("Default threads: {}", rayon::current_num_threads());
}
By default, Rayon creates a thread pool with threads equal to CPU cores.
use rayon::ThreadPoolBuilder;
fn main() {
    // Build a dedicated pool with exactly 4 worker threads.
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();

    pool.install(|| {
        let data: Vec<i32> = (0..100_000).collect();
        let sum: i32 = data.par_iter().sum();
        println!("Sum: {}", sum);
    });

    // Verify the pool really runs 4 workers.
    pool.install(|| {
        println!("Pool threads: {}", rayon::current_num_threads());
    });
}
num_threads(4) creates exactly 4 worker threads for this pool.
use rayon::ThreadPoolBuilder;
fn main() {
    // Configure the global thread pool once, at startup.
    ThreadPoolBuilder::new().num_threads(8).build_global().unwrap();

    // Every parallel operation now runs on the 8-thread global pool.
    let data: Vec<i32> = (0..1_000_000).collect();
    let sum: i32 = data.par_iter().sum();

    println!("Sum: {}", sum);
    println!("Global threads: {}", rayon::current_num_threads());
}
build_global() configures the default global thread pool used by all parallel operations.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
/// Simulated CPU-intensive computation: wrapping sum of 0..n.
fn cpu_bound_work(n: u64) -> u64 {
    (0..n).fold(0u64, |acc, i| acc.wrapping_add(i))
}
fn main() {
    let data: Vec<u64> = (0..100).collect();
    let available_cpus = num_cpus::get();
    println!("Available CPUs: {}", available_cpus);

    // Over-subscription: twice as many threads as cores.
    let over_pool = ThreadPoolBuilder::new()
        .num_threads(available_cpus * 2)
        .build()
        .unwrap();
    let over_time = std::time::Instant::now();
    over_pool.install(|| {
        data.par_iter().for_each(|_| {
            let _ = cpu_bound_work(1_000_000);
        });
    });
    println!("Over-subscribed time: {:?}", over_time.elapsed());

    // Under-subscription: half as many threads as cores.
    let under_pool = ThreadPoolBuilder::new()
        .num_threads(available_cpus / 2)
        .build()
        .unwrap();
    let under_time = std::time::Instant::now();
    under_pool.install(|| {
        data.par_iter().for_each(|_| {
            let _ = cpu_bound_work(1_000_000);
        });
    });
    println!("Under-subscribed time: {:?}", under_time.elapsed());
}
Over-subscription adds contention; under-subscription leaves cores idle.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
/// Sums every element of a row-major matrix.
///
/// Takes `&[Vec<f64>]` rather than `&Vec<Vec<f64>>` (the borrowed-Vec
/// anti-pattern): callers holding a `Vec<Vec<f64>>` still pass `&matrix`
/// unchanged via deref coercion, and slice or array inputs now work too.
fn matrix_sum(matrix: &[Vec<f64>]) -> f64 {
    matrix.iter().flat_map(|row| row.iter()).sum()
}
fn main() {
    // Large matrix that benefits from cache locality.
    let matrix: Vec<Vec<f64>> = (0..1000)
        .map(|_| (0..1000).map(|i| i as f64).collect())
        .collect();

    // Fewer threads = better L1/L2 cache locality per thread.
    let small_pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
    let start = std::time::Instant::now();
    small_pool.install(|| {
        let sum: f64 = (0..100)
            .into_par_iter()
            .map(|_| matrix_sum(&matrix))
            .sum();
        println!("Sum: {}", sum);
    });
    println!("Fewer threads: {:?}", start.elapsed());

    // More threads = more cache contention.
    let large_pool = ThreadPoolBuilder::new().num_threads(16).build().unwrap();
    let start = std::time::Instant::now();
    large_pool.install(|| {
        let sum: f64 = (0..100)
            .into_par_iter()
            .map(|_| matrix_sum(&matrix))
            .sum();
        println!("Sum: {}", sum);
    });
    println!("More threads: {:?}", start.elapsed());
}
Fewer threads can improve cache locality when working sets fit in cache.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
fn main() {
    // Work stealing: idle threads steal from busy threads.
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();

    // Uneven workload - some items take longer.
    pool.install(|| {
        let items: Vec<u64> = (0..100).collect();
        items.par_iter().for_each(|&item| {
            // Items 0-9 take much longer.
            let delay_ms = if item < 10 { 100 } else { 1 };
            std::thread::sleep(std::time::Duration::from_millis(delay_ms));
        });
        println!("Work stealing balanced the uneven load");
    });

    // With 4 threads:
    // - Thread 1 might get slow items 0-2 (300ms total)
    // - Thread 2 might get slow items 3-5 (300ms total)
    // - Threads 3-4 steal work from threads 1-2 when done
    // Without work stealing, some threads finish early
}
Work stealing balances load; thread count affects stealing efficiency.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
fn main() {
    // Outer parallelism with 4 threads.
    let outer_pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();

    outer_pool.install(|| {
        (0..10).into_par_iter().for_each(|task| {
            // Inner parallelism uses same thread pool.
            let inner_sum: i32 = (0..100).into_par_iter().sum();
            println!("Task {}: sum = {}", task, inner_sum);
        });
    });
    // Nested parallelism shares threads from same pool;
    // thread count applies to total parallelism depth.

    // Alternative: separate pools for nested work.
    let inner_pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
    outer_pool.install(|| {
        (0..10).into_par_iter().for_each(|task| {
            inner_pool.install(|| {
                let sum: i32 = (0..100).into_par_iter().sum();
                println!("Task {}: sum = {}", task, sum);
            });
        });
    });
}
Nested parallelism can share threads or use separate pools.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
/// Memory-bound operation: streams the whole slice doing minimal compute.
fn memory_intensive(data: &[f64]) -> f64 {
    data.iter().fold(0.0, |acc, &x| acc + x)
}
fn main() {
    let data: Vec<f64> = (0..10_000_000).map(|i| i as f64).collect();
    let available = num_cpus::get();

    // With many threads, memory bandwidth becomes bottleneck.
    let many_pool = ThreadPoolBuilder::new()
        .num_threads(available)
        .build()
        .unwrap();
    let start = std::time::Instant::now();
    many_pool.install(|| {
        let sum: f64 = (0..100)
            .into_par_iter()
            .map(|_| memory_intensive(&data))
            .sum();
        println!("Sum: {}", sum);
    });
    println!("Many threads: {:?}", start.elapsed());

    // Fewer threads may not help if memory-bound.
    let few_pool = ThreadPoolBuilder::new().num_threads(2).build().unwrap();
    let start = std::time::Instant::now();
    few_pool.install(|| {
        let sum: f64 = (0..100)
            .into_par_iter()
            .map(|_| memory_intensive(&data))
            .sum();
        println!("Sum: {}", sum);
    });
    println!("Few threads: {:?}", start.elapsed());
}
Memory-bound workloads saturate bandwidth; more threads may not help.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};
fn main() {
    // For mixed workloads, more threads than CPUs can help.
    let pool = ThreadPoolBuilder::new()
        .num_threads(num_cpus::get() * 2)
        .build()
        .unwrap();

    let tasks_done = AtomicUsize::new(0);
    pool.install(|| {
        (0..100).into_par_iter().for_each(|i| {
            // Mix of CPU and blocking operations.
            if i % 10 == 0 {
                // 10% of tasks block (simulated I/O).
                std::thread::sleep(std::time::Duration::from_millis(10));
            } else {
                // 90% of tasks are CPU-bound.
                let mut sum = 0u64;
                for j in 0..100_000 {
                    sum = sum.wrapping_add(j);
                }
                // `sum` was previously unused, so the optimizer could delete
                // the whole "CPU-bound" loop (and the compiler warned).
                // black_box forces the result to be materialized.
                std::hint::black_box(sum);
            }
            tasks_done.fetch_add(1, Ordering::Relaxed);
        });
    });
    println!("Tasks completed: {}", tasks_done.load(Ordering::Relaxed));
    // Extra threads help when some threads are blocked.
}
Over-subscription can help when threads block on I/O or synchronization.
use rayon::ThreadPoolBuilder;
fn main() {
    let pool = ThreadPoolBuilder::new()
        .num_threads(4)
        .thread_name(|index| format!("worker-{}", index))
        .build()
        .unwrap();

    pool.install(|| {
        // Thread names appear in debuggers and profilers.
        println!("Thread name: {:?}", std::thread::current().name());
    });
}
Named threads help identify threads in debuggers and profiling tools.
use rayon::ThreadPoolBuilder;
fn main() {
    // Threads with larger stack for deep recursion.
    let pool = ThreadPoolBuilder::new()
        .num_threads(4)
        .stack_size(8 * 1024 * 1024) // 8MB stack
        .build()
        .unwrap();

    pool.install(|| {
        fn deep_recurse(n: usize) -> usize {
            match n {
                0 => 0,
                _ => 1 + deep_recurse(n - 1),
            }
        }
        // Larger stack allows deeper recursion.
        println!("Recursion depth: {}", deep_recurse(10_000));
    });
}
stack_size configures per-thread stack size for deep recursion or large stack frames.
use rayon::ThreadPoolBuilder;
use std::sync::mpsc;
use std::sync::Arc;
fn main() {
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();

    // Spawn tasks directly on the pool.
    let (tx, rx) = mpsc::channel();
    for i in 0..10 {
        let sender = tx.clone();
        pool.spawn(move || {
            sender.send(i * 2).unwrap();
        });
    }
    drop(tx);

    // Collect results.
    let mut results: Vec<_> = rx.iter().collect();
    results.sort();
    println!("Results: {:?}", results);

    // spawn_fifo for FIFO execution order.
    let (tx2, rx2) = mpsc::channel();
    for i in 0..5 {
        let sender = tx2.clone();
        pool.spawn_fifo(move || {
            sender.send(i).unwrap();
        });
    }
    drop(tx2);
    let results2: Vec<_> = rx2.iter().collect();
    println!("FIFO order: {:?}", results2);
}
spawn and spawn_fifo submit tasks directly to the thread pool.
use rayon::ThreadPoolBuilder;
// Rayon threads are blocking, not async-friendly by default
// Use async runtimes for async work, Rayon for CPU-bound work
fn main() {
    // Function-scope import keeps the parallel-iterator traits local to
    // this example.
    use rayon::prelude::*;

    let cpu_pool = ThreadPoolBuilder::new()
        .num_threads(4)
        .build()
        .unwrap();

    // CPU-bound work on Rayon. The original called `.into_iter()`, which
    // runs sequentially even inside `install`; `into_par_iter()` actually
    // spreads the work across the pool's 4 threads. The sum is unchanged:
    // u64 addition is commutative and the total (~3.3e17) cannot overflow.
    let result = cpu_pool.install(|| {
        (0..1_000_000u64)
            .into_par_iter()
            .map(|x| x.wrapping_mul(x))
            .sum::<u64>()
    });
    println!("Compute result: {}", result);

    // For async I/O, use tokio or async-std thread pool.
    // Rayon is designed for CPU-bound parallelism.
}
Use Rayon for CPU-bound work; async runtimes are better for I/O-bound work.
use rayon::ThreadPoolBuilder;
fn main() {
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();

    pool.install(|| {
        // join: run two closures in parallel.
        let (a, b) = rayon::join(
            || (0..100).sum::<i32>(),
            || (100..200).sum::<i32>(),
        );
        println!("Join results: {} + {} = {}", a, b, a + b);

        // scope: spawn tasks that reference local data.
        let mut data = vec![1, 2, 3, 4, 5];
        rayon::scope(|s| {
            // The original spawned two closures that each captured
            // `&mut data` — two simultaneous mutable borrows (E0499), which
            // does not compile. Split the vector into disjoint mutable
            // slices first so each task owns a non-overlapping borrow.
            let (first, rest) = data.split_at_mut(1);
            s.spawn(move |_| first[0] *= 2); // data[0] *= 2
            s.spawn(move |_| rest[0] *= 3);  // data[1] *= 3
        });
        println!("After scope: {:?}", data);
    });
}
join and scope create parallel tasks within the thread pool.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
fn main() {
    let cpus = num_cpus::get();

    // Guideline 1: Match physical cores for pure CPU work.
    let _cpu_bound = ThreadPoolBuilder::new()
        .num_threads(cpus)
        .build()
        .unwrap();

    // Guideline 2: Use fewer threads for memory-bandwidth bound.
    // BUG FIX: the original wrote `cpus / 2.max(1)`, which parses as
    // `cpus / (2.max(1))` == `cpus / 2` and yields 0 on a single-core
    // machine (num_threads(0) means "use the default", not one thread).
    // `(cpus / 2).max(1)` guarantees at least one worker as intended.
    let _memory_bound = ThreadPoolBuilder::new()
        .num_threads((cpus / 2).max(1))
        .build()
        .unwrap();

    // Guideline 3: More threads for mixed CPU/IO workloads.
    let _mixed_workload = ThreadPoolBuilder::new()
        .num_threads(cpus * 2)
        .build()
        .unwrap();

    // Guideline 4: Single thread for debugging.
    let _single_thread = ThreadPoolBuilder::new()
        .num_threads(1)
        .build()
        .unwrap();

    // Guideline 5: Leave one core for system/other work.
    let _leaving_headroom = ThreadPoolBuilder::new()
        .num_threads((cpus - 1).max(1))
        .build()
        .unwrap();

    println!("CPUs: {}", cpus);
    println!("Guidelines applied based on workload characteristics");
}
Choose thread count based on workload: CPU-bound, memory-bound, mixed, or debugging.
use rayon::ThreadPoolBuilder;
use rayon::prelude::*;
fn main() {
    let pool = ThreadPoolBuilder::new().num_threads(8).build().unwrap();

    pool.install(|| {
        // current_num_threads() returns actual thread count.
        println!(
            "Current thread pool has {} threads",
            rayon::current_num_threads()
        );

        // current_thread_index() returns 0-based thread ID.
        println!(
            "Running on thread index {:?}",
            rayon::current_thread_index()
        );

        // Each parallel iteration may run on different thread.
        (0..10).into_par_iter().for_each(|item| {
            println!("Item {} on thread {:?}", item, rayon::current_thread_index());
        });
    });
}
current_num_threads() and current_thread_index() provide thread pool introspection.
Thread count effects:
| Thread Count | Effect | Use Case |
|--------------|--------|----------|
| Equal to CPUs | Optimal for CPU-bound | Pure computation |
| Less than CPUs | Better cache locality | Memory-bandwidth bound |
| More than CPUs | Parallelism when blocked | Mixed CPU/IO workloads |
| Single thread | Deterministic execution | Debugging, profiling |
Workload characteristics:
| Workload Type | Optimal Threads | Reasoning |
|---------------|-----------------|-----------|
| Pure CPU | num_cpus::get() | One thread per core |
| Memory-bandwidth bound | num_cpus / 2 | Bandwidth saturates before cores |
| Mixed CPU/IO | num_cpus * 2 | Extra threads while others block |
| Recursive parallelism | Match outer level | Nested work shares pool |
| Cache-sensitive | Fewer threads | Less cache contention |
Key insight: rayon::ThreadPoolBuilder::num_threads controls the fundamental parallelism unit in Rayon — each thread in the pool participates in work stealing, claiming tasks from a shared queue and from each other's local queues. The optimal count depends on the ratio of compute to memory access, the presence of blocking operations, and cache characteristics of the workload. For pure CPU-bound tasks, matching physical cores maximizes throughput without oversubscription overhead; for memory-intensive workloads, fewer threads reduce contention for memory bandwidth; for mixed workloads with I/O or synchronization, extra threads provide parallelism while some threads block. The work-stealing scheduler automatically balances load across threads, so uneven work distribution doesn't require manual intervention — but thread count determines how many workers participate in stealing. Use fewer threads when you want better per-thread cache locality or need to leave cores for other processes; use more threads when tasks contain blocking operations that prevent cores from being fully utilized. The default behavior (matching CPU cores) works well for most CPU-bound workloads, but explicit num_threads configuration enables tuning for specific hardware and workload characteristics.