How do I parallelize iterations across multiple threads?

Walkthrough

Rayon is a data parallelism library that makes it easy to convert sequential iterations into parallel ones. It provides a work-stealing thread pool and parallel iterators that automatically distribute work across available CPU cores. The API mirrors standard Rust iterators, making adoption straightforward.

Key benefits:

  1. Simple conversion — replace .iter() with .par_iter() to parallelize
  2. Work stealing — automatically balances load across threads
  3. Join pattern — fork-join parallelism for recursive algorithms
  4. No data races — compile-time guarantees through Rust's ownership system
  5. Scoped threads — safely access stack data from parallel tasks

Rayon excels at CPU-bound operations on collections: transformations, filters, sorting, and reductions.

Code Example

# Cargo.toml
[dependencies]
rayon = "1"
use rayon::prelude::*;
use std::time::Instant;
 
fn main() {
    // ===== Basic Parallel Iteration =====
    
    let data = (0..1_000_000).collect::<Vec<_>>();
    
    // Sequential sum
    let start = Instant::now();
    let sum: i32 = data.iter().sum();
    println!("Sequential sum: {} (took {:?})", sum, start.elapsed());
    
    // Parallel sum - just add par_ prefix!
    let start = Instant::now();
    let sum: i32 = data.par_iter().sum();
    println!("Parallel sum: {} (took {:?})", sum, start.elapsed());
    
    // ===== Parallel Map =====
    
    let numbers: Vec<i32> = (1..=100).collect();
    
    // Transform each element in parallel
    let squares: Vec<i32> = numbers
        .par_iter()
        .map(|x| x * x)
        .collect();
    
    println!("Squares: {:?}", squares.iter().take(5).collect::<Vec<_>>());
    
    // ===== Parallel Filter =====
    
    let even_squares: Vec<i32> = numbers
        .par_iter()
        .map(|x| x * x)
        .filter(|&x| x % 2 == 0)
        .collect();
    
    println!("Even squares: {:?}", even_squares.iter().take(5).collect::<Vec<_>>());
    
    // ===== Parallel Reduce =====
    
    let product: i32 = (1..=10)
        .into_par_iter()
        .reduce(|| 1, |a, b| a * b);
    
    println!("Product of 1..=10: {}", product);
    
    // ===== Parallel Fold =====
    
    let sum_of_squares: i32 = (1..=100)
        .into_par_iter()
        .fold(|| 0, |acc, x| acc + x * x)
        .sum();
    
    println!("Sum of squares 1..=100: {}", sum_of_squares);
}

Real-World Example: Image Processing

use rayon::prelude::*;
 
/// A single RGB pixel with 8-bit channels.
#[derive(Clone)]
struct Pixel {
    r: u8,
    g: u8,
    b: u8,
}

impl Pixel {
    /// Returns a grayscale copy: all three channels set to the mean
    /// of the original channels.
    fn grayscale(&self) -> Pixel {
        // Widen to u16 before adding so the channel sum (max 765)
        // cannot overflow u8.
        let total = self.r as u16 + self.g as u16 + self.b as u16;
        let luma = (total / 3) as u8;
        Pixel { r: luma, g: luma, b: luma }
    }

    /// Returns the photographic negative of this pixel.
    fn invert(&self) -> Pixel {
        Pixel {
            r: u8::MAX - self.r,
            g: u8::MAX - self.g,
            b: u8::MAX - self.b,
        }
    }
}
 
/// An image: a flat pixel buffer plus its dimensions.
struct Image {
    /// Width in pixels.
    width: usize,
    /// Height in pixels.
    height: usize,
    /// Flat buffer of `width * height` pixels (filled linearly by
    /// `Image::new`); presumably row-major, though nothing shown
    /// here indexes it in 2D — confirm before relying on layout.
    pixels: Vec<Pixel>,
}
 
impl Image {
    fn new(width: usize, height: usize) -> Self {
        let pixels = (0..width * height)
            .map(|i| Pixel {
                r: (i % 256) as u8,
                g: ((i * 7) % 256) as u8,
                b: ((i * 13) % 256) as u8,
            })
            .collect();
        
        Self { width, height, pixels }
    }
    
    // Sequential processing
    fn to_grayscale_sequential(&self) -> Image {
        let pixels = self.pixels.iter().map(|p| p.grayscale()).collect();
        Image { width: self.width, height: self.height, pixels }
    }
    
    // Parallel processing
    fn to_grayscale_parallel(&self) -> Image {
        let pixels = self.pixels.par_iter().map(|p| p.grayscale()).collect();
        Image { width: self.width, height: self.height, pixels }
    }
}
 
fn main() {
    let img = Image::new(1920, 1080);
    
    use std::time::Instant;
    
    let start = Instant::now();
    let _gray = img.to_grayscale_sequential();
    println!("Sequential: {:?}", start.elapsed());
    
    let start = Instant::now();
    let _gray = img.to_grayscale_parallel();
    println!("Parallel: {:?}", start.elapsed());
}

Join and Fork-Join Parallelism

use rayon::prelude::*;
 
// Recursive Fibonacci with parallel tasks
fn fib(n: u32) -> u32 {
    if n <= 20 {
        // Small values: sequential to avoid overhead
        fib_sequential(n)
    } else {
        // Large values: parallel recursion
        let (a, b) = rayon::join(|| fib(n - 1), || fib(n - 2));
        a + b
    }
}
 
/// Sequential Fibonacci: fib(0) = 0, fib(1) = 1.
///
/// Rewritten iteratively: the original naive double recursion is
/// O(2^n), which dominates the runtime even below the parallel
/// cutoff. The iterative form is O(n) and returns identical values.
fn fib_sequential(n: u32) -> u32 {
    let (mut prev, mut curr) = (0u32, 1u32);
    for _ in 0..n {
        let next = prev + curr;
        prev = curr;
        curr = next;
    }
    prev
}
 
// Merge sort with parallelism
fn parallel_merge_sort<T: Ord + Send + Clone>(data: &mut [T]) {
    if data.len() <= 1000 {
        data.sort();
        return;
    }
    
    let mid = data.len() / 2;
    let (left, right) = data.split_at_mut(mid);
    
    rayon::join(
        || parallel_merge_sort(left),
        || parallel_merge_sort(right),
    );
    
    // Merge (simplified)
    let mut temp = left.to_vec();
    temp.extend_from_slice(right);
    temp.sort();
    data.copy_from_slice(&temp);
}
 
fn main() {
    // rayon::join runs both closures (potentially) in parallel and
    // blocks until both complete, returning their results as a tuple.
    // The i64 annotation on sum() makes both ranges infer as i64.
    let totals = rayon::join(
        || (1..1_000_000).into_par_iter().sum::<i64>(),
        || (1..1_000_000).into_par_iter().map(|x| x * x).sum::<i64>(),
    );

    println!("Sum: {}", totals.0);
    println!("Sum of squares: {}", totals.1);
}

Parallel Sorting and Grouping

use rayon::prelude::*;
use std::collections::HashMap;
 
fn main() {
    let numbers: Vec<i32> = (1..=1_000_000).collect();

    // Parallel sorting (unstable, in-place).
    let mut nums = numbers.clone();
    nums.par_sort_unstable();

    // Parallel sorting with a comparator.
    #[derive(Clone, Debug)]
    struct Person {
        name: String,
        age: u32,
    }

    let mut people: Vec<Person> = (0..1000)
        .map(|i| Person {
            name: format!("Person {}", i),
            age: (i % 100) + 1,
        })
        .collect();

    people.par_sort_by(|a, b| a.age.cmp(&b.age));
    println!("Youngest: {:?}", people.first());

    // Parallel grouping with fold + reduce: fold builds one partial
    // map per rayon task, reduce merges the partials pairwise.
    //
    // The key type must be i32: keys are `x % 10` with `x: i32`, so
    // the original `HashMap<u32, Vec<i32>>` annotation failed to
    // type-check. `HashMap::new` is passed point-free as identity.
    let grouped: HashMap<i32, Vec<i32>> = numbers
        .par_iter()
        .fold(
            HashMap::new,
            |mut map, &x| {
                let key = x % 10;
                map.entry(key).or_default().push(x);
                map
            },
        )
        .reduce(
            HashMap::new,
            |mut a, b| {
                for (k, v) in b {
                    a.entry(k).or_default().extend(v);
                }
                a
            },
        );

    println!("Groups: {} keys", grouped.len());
}

Scoped Threads

use rayon::prelude::*;
 
fn main() {
    let mut data = vec![0; 1000];
    let factor = 10; // plain stack variable — no Arc or 'static needed

    // par_iter_mut lets the worker threads mutate the Vec in place
    // while borrowing `factor` directly from this stack frame.
    data.par_iter_mut()
        .enumerate()
        .for_each(|(slot_index, slot)| *slot = (slot_index * factor) as i32);

    println!("First 5: {:?}", &data[..5]);
}

Summary

  • Convert .iter() to .par_iter() or .into_iter() to .into_par_iter() for immediate parallelization
  • Parallel iterators mirror standard iterators: .map(), .filter(), .fold(), .reduce(), .for_each()
  • Use .collect() to gather results back into a collection (maintains order)
  • par_sort(), par_sort_by(), par_sort_unstable() provide parallel sorting
  • rayon::join(|| task1(), || task2()) runs two closures in parallel and waits for both
  • For small workloads, sequential code may be faster because parallelism adds scheduling overhead — work stealing balances load across threads but does not eliminate that per-task cost, so benchmark before parallelizing
  • Thread-safe by design: closures must implement Send and Sync
  • Use par_iter_mut() for in-place mutations without allocation
  • Control thread count with RAYON_NUM_THREADS environment variable or configure the global thread pool