How do I parallelize data processing with rayon in Rust?

Walkthrough

The rayon crate provides data parallelism for Rust using a work-stealing thread pool. It transforms sequential iterators into parallel ones with a simple method call, automatically distributing work across available CPU cores. Rayon's design guarantees data-race freedom through Rust's ownership system—parallel operations take shared references or mutate disjoint parts of data. The library handles thread pool management, work stealing, and load balancing automatically.

Key concepts:

  1. par_iter() — convert a collection to a parallel iterator
  2. par_iter_mut() — parallel iterator with mutable access
  3. into_par_iter() — parallel iterator that consumes the collection
  4. join() — run two closures in parallel
  5. scope() — create a parallel scope for spawning tasks

Code Example

# Cargo.toml
[dependencies]
rayon = "1.10"
use rayon::prelude::*;
 
fn main() {
    let data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    
    // Parallel sum
    let sum: i32 = data.par_iter().sum();
    println!("Sum: {}", sum);
    
    // Parallel map
    let doubled: Vec<i32> = data.par_iter().map(|x| x * 2).collect();
    println!("Doubled: {:?}", doubled);
}

Basic Parallel Iterators

use rayon::prelude::*;
 
fn main() {
    let data: Vec<i32> = (0..100).collect();

    // Read-only parallel traversal: every item is visited as &i32.
    let sum: i32 = data.par_iter().sum();
    println!("Sum: {}", sum);

    // enumerate() pairs each item with its index; the print order is
    // nondeterministic because worker threads interleave.
    data.par_iter().enumerate().for_each(|(i, &val)| {
        println!("Index {}: {}", i, val);
    });

    // count() consumes the parallel iterator and tallies the items.
    let count = data.par_iter().count();
    println!("Count: {}", count);
}

Parallel Map

use rayon::prelude::*;
 
/// Stand-in for CPU-bound work: the triangular sum 1 + 2 + … + n.
/// Returns 0 when n < 1 (the range is empty).
fn expensive_computation(n: i32) -> i32 {
    (1..=n).fold(0, |acc, v| acc + v)
}
 
fn main() {
    let input = vec![1000, 2000, 3000, 4000, 5000];
    
    // Sequential (slow)
    let sequential: Vec<i32> = input.iter().map(|&n| expensive_computation(n)).collect();
    
    // Parallel (fast)
    let parallel: Vec<i32> = input.par_iter().map(|&n| expensive_computation(n)).collect();
    
    println!("Results: {:?}", parallel);
}

Parallel Filter and Fold

use rayon::prelude::*;
 
fn main() {
    let data: Vec<i32> = (1..=100).collect();
    
    // Parallel filter
    let evens: Vec<i32> = data.par_iter().filter(|&&x| x % 2 == 0).cloned().collect();
    println!("Evens: {:?}", &evens[..10]);
    
    // Parallel filter + map
    let doubled_evens: Vec<i32> = data
        .par_iter()
        .filter(|&&x| x % 2 == 0)
        .map(|&x| x * 2)
        .collect();
    println!("Doubled evens: {:?}", &doubled_evens[..10]);
    
    // Parallel reduce
    let product: i32 = data.par_iter().reduce(|| 1, |a, b| a * b);
    println!("Product: {}", product);
}

Parallel Reduce Operations

use rayon::prelude::*;
 
fn main() {
    let data: Vec<i32> = (1..=100).collect();

    // sum() — built-in reduction over the thread pool.
    let sum: i32 = data.par_iter().sum();
    println!("Sum: {}", sum);

    // reduce() — custom reduction; the identity closure supplies each
    // thread's starting value (&i32::MIN is 'static via const promotion).
    let max = data
        .par_iter()
        .reduce(|| &i32::MIN, |a, b| if a > b { a } else { b });
    println!("Max: {}", max);

    // reduce_with() — no identity needed; yields Option (None when empty).
    let min = data
        .par_iter()
        .reduce_with(|a, b| if a < b { a } else { b });
    println!("Min: {:?}", min);

    // fold() produces per-thread partial sums; sum() combines them.
    let sum_of_squares: i32 = data
        .par_iter()
        .fold(|| 0, |acc, &x| acc + x * x)
        .sum();
    println!("Sum of squares: {}", sum_of_squares);
}

Mutable Parallel Iteration

use rayon::prelude::*;
 
fn main() {
    let mut data = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
    
    // Parallel mutation
    data.par_iter_mut().for_each(|x| *x *= 2);
    println!("Doubled: {:?}", data);
    
    // Chain operations
    data.par_iter_mut().enumerate().for_each(|(i, x)| {
        *x += i as i32;
    });
    println!("With index added: {:?}", data);
}

Consuming Parallel Iterator

use rayon::prelude::*;
 
fn main() {
    let data = vec!["hello".to_string(), "world".to_string()];

    // into_par_iter() takes ownership: each closure receives an owned
    // String, so no clone is needed to build the transformed value.
    let uppercased: Vec<String> = data
        .into_par_iter()
        .map(|word| word.to_uppercase())
        .collect();

    println!("Uppercased: {:?}", uppercased);
    // `data` was moved above and can no longer be used here.
}

Parallel For-Each

use rayon::prelude::*;
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::Arc;
 
fn main() {
    let data: Vec<i32> = (1..=100).collect();
    // A plain AtomicUsize suffices: the for_each closure only needs a
    // shared borrow, AtomicUsize is Sync, and the closures cannot outlive
    // this stack frame — the Arc in the original added nothing.
    let counter = AtomicUsize::new(0);

    // Parallel for_each — count multiples of 10 with a lock-free counter.
    data.par_iter().for_each(|&x| {
        if x % 10 == 0 {
            // Relaxed is fine: we only need the final tally, not ordering.
            counter.fetch_add(1, Ordering::Relaxed);
        }
    });

    println!("Multiples of 10: {}", counter.load(Ordering::Relaxed));
}

Parallel Join

use rayon::join;
 
/// Parallel naive Fibonacci: both recursive branches run via rayon::join.
///
/// Below PAR_CUTOFF the recursion runs sequentially — spawning a join for
/// tiny subproblems costs more than the work itself. Results are identical
/// to the fully-parallel version.
fn fib(n: u32) -> u32 {
    const PAR_CUTOFF: u32 = 20;
    if n <= 1 {
        n
    } else if n < PAR_CUTOFF {
        // Sequential base region: avoid per-call scheduling overhead.
        fib(n - 1) + fib(n - 2)
    } else {
        // Run both branches in parallel; join blocks until both finish.
        let (a, b) = join(|| fib(n - 1), || fib(n - 2));
        a + b
    }
}
 
fn main() {
    // Kick off the parallel recursion and report the result.
    println!("Fib(30) = {}", fib(30));
}

Parallel Scope

use rayon::scope;
use std::sync::Mutex;
 
fn main() {
    // Tasks spawned in the scope may borrow `data`; scope() guarantees
    // every task has finished before it returns.
    let data = Mutex::new(Vec::new());

    scope(|s| {
        let shared = &data;
        for value in 1..=3 {
            // `move` captures the &Mutex (Copy) and the loop value.
            s.spawn(move |_| {
                shared.lock().unwrap().push(value);
            });
        }
    });

    // All spawns complete before scope ends
    println!("Data: {:?}", data.into_inner().unwrap());
}

Nested Parallelism

use rayon::prelude::*;
 
fn main() {
    // 10×10 matrix where cell (i, j) holds i * 10 + j.
    let matrix: Vec<Vec<i32>> = (0..10)
        .map(|i| (0..10).map(|j| i * 10 + j).collect())
        .collect();

    // Parallel processing of nested structures: one task per row, with a
    // sequential sum inside each row.
    let row_sums: Vec<i32> = matrix
        .par_iter()
        .map(|row| row.iter().sum())
        .collect();

    println!("Row sums: {:?}", row_sums);

    // Flatten and process.
    // flatten() yields &i32 (each &Vec<i32> is itself parallel-iterated);
    // `.copied()` is required to collect owned i32 values into Vec<i32>.
    let all_elements: Vec<i32> = matrix
        .par_iter()
        .flatten()
        .copied()
        .collect();

    println!("Total elements: {}", all_elements.len());
}

Parallel Sort

use rayon::prelude::*;
 
fn main() {
    let mut data: Vec<i32> = (1..=1000).rev().collect();
    
    // Parallel sort (uses parallel merge sort)
    data.par_sort();
    println!("First 10: {:?}", &data[..10]);
    
    // Parallel sort unstable (faster, doesn't preserve order of equal elements)
    let mut data2: Vec<i32> = (1..=1000).rev().collect();
    data2.par_sort_unstable();
    println!("First 10: {:?}", &data2[..10]);
    
    // Parallel sort by key
    let mut words = vec!["banana", "apple", "cherry", "date", "elderberry"];
    words.par_sort_by_key(|s| s.len());
    println!("Sorted by length: {:?}", words);
}

Parallel Find and Any/All

use rayon::prelude::*;
 
fn main() {
    let data: Vec<i32> = (1..=1000).collect();

    // find_any: stops all workers once *some* match turns up; which match
    // you get is nondeterministic.
    let found = data.par_iter().find_any(|&&x| x > 500);
    println!("Found: {:?}", found);

    // find_first: also short-circuits, but returns the match that occurs
    // earliest in the original order.
    let first = data.par_iter().find_first(|&&x| x > 500);
    println!("First: {:?}", first);

    // any/all short-circuit across threads as well.
    let has_large = data.par_iter().any(|&x| x > 900);
    println!("Has > 900: {}", has_large);

    let all_positive = data.par_iter().all(|&x| x > 0);
    println!("All positive: {}", all_positive);
}

Parallel Partition and Grouping

use rayon::prelude::*;
 
fn main() {
    let data: Vec<i32> = (1..=20).collect();

    // Partition into two groups.
    // `.copied()` turns the &i32 items into owned i32 values so both
    // halves can be collected into Vec<i32> — rayon (like std) has no
    // impl that builds Vec<i32> out of &i32 items.
    let (evens, odds): (Vec<i32>, Vec<i32>) = data
        .par_iter()
        .copied()
        .partition(|&x| x % 2 == 0);

    println!("Evens: {:?}", evens);
    println!("Odds: {:?}", odds);
}

Parallel Zip

use rayon::prelude::*;
 
fn main() {
    let a = vec![1, 2, 3, 4, 5];
    let b = vec![10, 20, 30, 40, 50];
    
    // Parallel zip
    let sums: Vec<i32> = a.par_iter()
        .zip(b.par_iter())
        .map(|(&x, &y)| x + y)
        .collect();
    
    println!("Sums: {:?}", sums);
}

Thread Pool Configuration

use rayon::prelude::*; // required: into_par_iter() inside install() below
use rayon::{ThreadPool, ThreadPoolBuilder};

fn main() -> Result<(), rayon::ThreadPoolBuildError> {
    // Create a custom thread pool instead of using the global one.
    let pool: ThreadPool = ThreadPoolBuilder::new()
        .num_threads(4)                    // Use 4 threads
        .thread_name(|i| format!("rayon-{}", i))
        .build()?;

    // install() runs the closure inside this pool: any parallel iterator
    // used within executes on these 4 threads.
    let sum: i32 = pool.install(|| {
        (1..=100).into_par_iter().sum()
    });

    println!("Sum: {}", sum);

    // Thread count of the pool current at this call site (global here).
    println!("Current threads: {}", rayon::current_num_threads());

    Ok(())
}

Collecting Results

use rayon::prelude::*;
 
fn might_fail(n: i32) -> Result<i32, String> {
    if n > 0 {
        Ok(n * 2)
    } else {
        Err(format!("Invalid: {}", n))
    }
}
 
fn main() {
    let data: Vec<i32> = (1..=10).collect();

    // Keep every outcome: Vec<Result<_, _>> preserves per-item errors.
    let results: Vec<Result<i32, String>> = data
        .par_iter()
        .map(|&n| might_fail(n))
        .collect();

    println!("Results: {:?}", results);

    // Collect straight into Result<Vec<_>, _>: the first Err cancels the
    // remaining work and becomes the overall result.
    let all_ok: Result<Vec<i32>, String> = data
        .par_iter()
        .map(|&n| might_fail(n))
        .collect();

    println!("All ok: {:?}", all_ok);
}

Flat Map

use rayon::prelude::*;
 
fn main() {
    let words = vec!["hello world", "rust rayon", "parallel processing"];

    // flat_map_iter: the closure returns an ordinary *sequential* iterator.
    // (SplitWhitespace does not implement IntoParallelIterator, which plain
    // flat_map would require — this would not compile otherwise.)
    // Each input string is still handled on the thread pool; only the
    // per-string splitting runs sequentially.
    let all_words: Vec<&str> = words
        .par_iter()
        .flat_map_iter(|s| s.split_whitespace())
        .collect();

    println!("All words: {:?}", all_words);
}

Chunking for Batch Processing

use rayon::prelude::*;
 
/// Sum one batch of values — stands in for real per-batch work.
fn process_batch(batch: &[i32]) -> i32 {
    let mut total = 0;
    for &value in batch {
        total += value;
    }
    total
}
 
fn main() {
    let data: Vec<i32> = (1..=100).collect();
    
    // Process in chunks
    let results: Vec<i32> = data
        .par_chunks(10)
        .map(|chunk| process_batch(chunk))
        .collect();
    
    println!("Batch sums: {:?}", results);
    
    // Mutable chunks
    let mut data2: Vec<i32> = (1..=100).collect();
    data2.par_chunks_mut(10).for_each(|chunk| {
        for item in chunk {
            *item *= 2;
        }
    });
    
    println!("First 10 after doubling: {:?}", &data2[..10]);
}

Parallel Iteration Over Ranges

use rayon::prelude::*;
 
fn main() {
    // Integer ranges implement IntoParallelIterator directly — no Vec
    // needs to be materialized first.
    let sum: i32 = (1..=1000).into_par_iter().sum();
    println!("Sum 1 to 1000: {}", sum);

    // product() is the multiplicative counterpart of sum().
    let product: i32 = (1i32..=5).into_par_iter().product();
    println!("Product 1 to 5: {}", product);
}

Working with Strings

use rayon::prelude::*;
 
fn main() {
    let strings = vec!["hello", "world", "rust", "rayon"];

    // Measure every string in parallel.
    let lengths: Vec<usize> = strings.par_iter().map(|word| word.len()).collect();
    println!("Lengths: {:?}", lengths);

    // Keep only words longer than four characters, then uppercase them.
    let uppercased: Vec<String> = strings
        .par_iter()
        .filter(|word| word.len() > 4)
        .map(|word| word.to_uppercase())
        .collect();

    println!("Long words uppercase: {:?}", uppercased);
}

Parallel Hash Map Processing

use rayon::prelude::*;
use std::collections::HashMap;
 
fn main() {
    // Build the map in one shot rather than inserting entry by entry.
    let map = HashMap::from([("a", 1), ("b", 2), ("c", 3), ("d", 4)]);

    // HashMap supports par_iter(); items arrive as (&K, &V) pairs.
    let sum: i32 = map.par_iter().map(|(_, &v)| v).sum();
    println!("Sum of values: {}", sum);

    // Collect keys (order is unspecified, exactly as with sequential
    // HashMap iteration).
    let keys: Vec<&str> = map.par_iter().map(|(k, _)| *k).collect();
    println!("Keys: {:?}", keys);
}

Comparing Sequential vs Parallel

use rayon::prelude::*;
use std::time::Instant;
 
/// Busy work: count the integers below n divisible by 7 or by 11.
fn expensive_calculation(n: usize) -> usize {
    let mut hits = 0;
    for i in 0..n {
        if i % 7 == 0 || i % 11 == 0 {
            hits += 1;
        }
    }
    hits
}
 
fn main() {
    let data: Vec<usize> = (1000..=10000).step_by(1000).collect();

    // Small helper: run a workload and report its result plus wall time.
    let timed = |work: &dyn Fn() -> Vec<usize>| {
        let start = Instant::now();
        (work(), start.elapsed())
    };

    // Same pipeline twice — only iter() vs par_iter() differs.
    let (seq_results, seq_time) =
        timed(&|| data.iter().map(|&n| expensive_calculation(n)).collect());
    let (par_results, par_time) =
        timed(&|| data.par_iter().map(|&n| expensive_calculation(n)).collect());

    println!("Sequential: {:?} ({:?})", seq_results, seq_time);
    println!("Parallel: {:?} ({:?})", par_results, par_time);
    println!("Speedup: {:.2}x", seq_time.as_secs_f64() / par_time.as_secs_f64());
}

Error Handling in Parallel Code

use rayon::prelude::*;
 
/// Parse every string in parallel, keeping each per-item outcome
/// (successes and parse errors side by side).
fn parse_numbers(input: &[&str]) -> Vec<Result<i32, std::num::ParseIntError>> {
    input.par_iter().map(|raw| raw.parse::<i32>()).collect()
}
 
fn main() {
    let inputs = vec!["1", "2", "three", "4", "five"];

    // Per-item results: failures stay alongside successes.
    let results = parse_numbers(&inputs);
    println!("Results: {:?}", results);

    // Collecting into Result<Vec<_>, _> instead: the first parse failure
    // aborts the rest and becomes the overall value.
    let all_valid: Result<Vec<i32>, _> = inputs
        .par_iter()
        .map(|raw| raw.parse::<i32>())
        .collect();

    println!("All valid: {:?}", all_valid);
}

Real-World Example: Image Processing

use rayon::prelude::*;
 
/// One RGB pixel, 8 bits per channel.
struct Pixel {
    r: u8,
    g: u8,
    b: u8,
}

impl Pixel {
    /// Luminance as the plain (unweighted) mean of the three channels,
    /// computed in u32 so the sum cannot overflow.
    fn grayscale(&self) -> u8 {
        let total = u32::from(self.r) + u32::from(self.g) + u32::from(self.b);
        (total / 3) as u8
    }
}
 
/// Convert every pixel to grayscale in place, one parallel task per pixel.
fn process_image(pixels: &mut [Pixel]) {
    pixels.par_iter_mut().for_each(|px| {
        // Write the single luminance value back into all three channels.
        let level = px.grayscale();
        px.r = level;
        px.g = level;
        px.b = level;
    });
}
 
fn main() {
    // Build test pixels. The channel math runs in u32: with `i: u8` (as the
    // original inferred), `i * 3` overflows once i >= 86 (258 > 255) and
    // panics in debug builds. The wider type plus an explicit wrap keeps
    // every value in range.
    let mut pixels: Vec<Pixel> = (0u32..100)
        .map(|i| Pixel {
            r: i as u8,             // 0..100 always fits in u8
            g: (i * 2) as u8,       // max 198, fits in u8
            b: (i * 3 % 256) as u8, // wraps values >= 256 explicitly
        })
        .collect();

    process_image(&mut pixels);

    println!("First pixel: r={}, g={}, b={}", 
        pixels[0].r, pixels[0].g, pixels[0].b);
}

Real-World Example: File Processing

use rayon::prelude::*;
use std::path::PathBuf;
 
/// Simulate per-file work by measuring the rendered path length.
fn process_file(path: &PathBuf) -> usize {
    let rendered = path.to_string_lossy();
    rendered.len()
}
 
fn main() {
    // Fabricate 100 paths of the form /data/fileN.txt.
    let files: Vec<PathBuf> = (0..100)
        .map(|i| PathBuf::from(format!("/data/file{}.txt", i)))
        .collect();

    // One task per file; sum() combines the per-file results.
    let total_length: usize = files.par_iter().map(process_file).sum();

    println!("Total path length: {}", total_length);
}

Real-World Example: Matrix Operations

use rayon::prelude::*;
 
type Matrix = Vec<Vec<f64>>;
 
/// Multiply an (n×k) matrix by a (k×m) matrix, parallelizing over rows of
/// the output.
///
/// Handles degenerate inputs: an empty `a` yields an empty result, and
/// `b.first()` avoids the `b[0]` panic the naive version hits when `b`
/// is empty.
fn matrix_multiply(a: &Matrix, b: &Matrix) -> Matrix {
    let n = a.len();
    let m = b.first().map_or(0, |row| row.len());
    let k = b.len();

    (0..n)
        .into_par_iter()
        .map(|i| {
            // Each output row is computed independently on the pool.
            (0..m)
                .map(|j| {
                    (0..k).map(|l| a[i][l] * b[l][j]).sum()
                })
                .collect()
        })
        .collect()
}
 
fn main() {
    let a = vec![
        vec![1.0, 2.0],
        vec![3.0, 4.0],
    ];
    
    let b = vec![
        vec![5.0, 6.0],
        vec![7.0, 8.0],
    ];
    
    let result = matrix_multiply(&a, &b);
    
    for row in &result {
        println!("{:?}", row);
    }
}

When to Use Rayon

use rayon::prelude::*;
 
fn main() {
    // GOOD: expensive, independent work over a large collection.
    let large_data: Vec<i32> = (0..1_000_000).collect();
    let sum: i32 = large_data.par_iter().sum();

    // GOOD: each element is filtered/mapped independently of the rest.
    let results: Vec<_> = large_data
        .par_iter()
        .filter(|&&x| x % 1000 == 0)
        .map(|&x| x * x)
        .collect();

    // BAD: trivial work on tiny data — scheduling overhead dominates.
    // let small = vec![1, 2, 3];
    // small.par_iter().sum();

    // BAD: sequential dependencies — parallel iterators cannot help when
    // each step needs the previous step's output.

    println!("Sum: {}", sum);
    println!("Results count: {}", results.len());
}

Summary

  • par_iter() creates a read-only parallel iterator
  • par_iter_mut() creates a parallel iterator with mutable access
  • into_par_iter() consumes the collection
  • Use .par_chunks() for batch processing
  • join() runs two closures in parallel
  • scope() creates a scope for spawning tasks
  • Parallel sorts: par_sort(), par_sort_unstable(), par_sort_by_key()
  • Short-circuit operations: find_any(), any(), all()
  • Configure thread pool with ThreadPoolBuilder
  • Rayon handles work-stealing and load balancing automatically
  • Best for CPU-bound tasks on multi-core systems
  • Overhead means small data won't benefit from parallelization
  • Perfect for: data processing, simulations, image processing, matrix operations, file batch processing