How do I benchmark and optimize code performance in Rust?

Walkthrough

Criterion is a statistics-driven benchmarking library that provides accurate measurements and detects performance regressions. Unlike simple timing, Criterion runs benchmarks multiple times, performs statistical analysis, and generates detailed reports. It detects outliers, measures noise, and can compare results against a baseline.

Key features:

  1. Statistical analysis — multiple samples with confidence intervals
  2. Regression detection — compare against previous runs
  3. HTML reports — visual charts and graphs
  4. Warm-up phase — runs the code before measuring so caches, branch predictors, and CPU frequency scaling reach a steady state (Rust is compiled ahead of time, so there is no JIT to warm up)
  5. Custom measurements — wall time, throughput, custom metrics

Criterion benchmarks are placed in benches/ and run with cargo bench.

Code Example

# Cargo.toml
[dev-dependencies]
# `html_reports` enables the charts under target/criterion/.
# `async_tokio` is required for `Bencher::to_async` used in the async benchmarks.
criterion = { version = "0.5", features = ["html_reports", "async_tokio"] }
# The async bench file builds a `tokio::runtime::Runtime` itself, so tokio
# must be a dev-dependency with the multi-threaded runtime enabled.
tokio = { version = "1", features = ["rt-multi-thread"] }

# One [[bench]] entry per file in benches/, each with `harness = false` so
# that the `main` generated by `criterion_main!` replaces libtest's harness.
[[bench]]
name = "my_bench"
harness = false

[[bench]]
name = "advanced_bench"
harness = false

[[bench]]
name = "async_bench"
harness = false
// benches/my_bench.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId};
 
// ===== Functions to Benchmark =====
 
/// Naive exponential-time Fibonacci — deliberately slow so the
/// benchmark has measurable work per call.
fn fibonacci_recursive(n: u64) -> u64 {
    match n {
        0 | 1 => n,
        _ => fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2),
    }
}
 
/// Linear-time Fibonacci: folds over 2..=n carrying the running pair
/// (F(k-1), F(k)) and returns the second component.
fn fibonacci_iterative(n: u64) -> u64 {
    if n <= 1 {
        return n;
    }
    (2..=n)
        .fold((0u64, 1u64), |(a, b), _| (b, a + b))
        .1
}
 
/// Sums a slice using the iterator adapter chain; the benchmarked
/// "idiomatic" counterpart to `sum_manual`.
fn sum_slice(data: &[i32]) -> i32 {
    let total: i32 = data.iter().copied().sum();
    total
}
 
/// Hand-written accumulation loop — the explicit-loop baseline the
/// "sum_methods" benchmark compares against `sum_slice`.
fn sum_manual(data: &[i32]) -> i32 {
    let mut acc = 0;
    for value in data.iter() {
        acc += *value;
    }
    acc
}
 
// ===== Basic Benchmark =====
 
// Simplest Criterion usage: one `bench_function` per measurement.
// `black_box(20)` hides the argument from the optimizer so the call
// cannot be constant-folded into a precomputed result.
fn bench_fibonacci(c: &mut Criterion) {
    c.bench_function("fib recursive 20", |b| {
        b.iter(|| fibonacci_recursive(black_box(20)))
    });
    
    c.bench_function("fib iterative 20", |b| {
        b.iter(|| fibonacci_iterative(black_box(20)))
    });
}
 
// ===== Comparing Implementations =====
 
// Benchmarks two implementations of the same task in one group so
// Criterion reports them side by side under "sum_methods".
fn bench_sum_methods(c: &mut Criterion) {
    // Built once, outside the measured closures — only the summing is timed.
    let data: Vec<i32> = (0..10_000).collect();
    
    let mut group = c.benchmark_group("sum_methods");
    
    group.bench_function("iter sum", |b| {
        b.iter(|| sum_slice(black_box(&data)))
    });
    
    group.bench_function("manual loop", |b| {
        b.iter(|| sum_manual(black_box(&data)))
    });
    
    // Finalizes the group and emits its summary.
    group.finish();
}
 
// ===== Benchmarking with Parameters =====
 
// Runs both implementations across several input sizes. `BenchmarkId`
// couples a function label with the parameter value, so the report can
// plot each implementation as a line series over `size`.
fn bench_parameterized(c: &mut Criterion) {
    let mut group = c.benchmark_group("fibonacci_comparison");
    
    for size in [10, 15, 20, 25].iter() {
        group.bench_with_input(BenchmarkId::new("recursive", size), size, |b, &n| {
            b.iter(|| fibonacci_recursive(black_box(n)));
        });
        
        group.bench_with_input(BenchmarkId::new("iterative", size), size, |b, &n| {
            b.iter(|| fibonacci_iterative(black_box(n)));
        });
    }
    
    group.finish();
}
 
// ===== Throughput Measurement =====
 
// Reports results as bytes/second rather than raw time per iteration.
fn bench_throughput(c: &mut Criterion) {
    let data: Vec<i32> = (0..100_000).collect();
    
    let mut group = c.benchmark_group("throughput");
    // 4 bytes per i32 element; Criterion divides this byte count by the
    // measured time to display throughput.
    group.throughput(criterion::Throughput::Bytes(data.len() as u64 * 4));
    
    group.bench_function("sum 100k i32", |b| {
        b.iter(|| sum_slice(black_box(&data)))
    });
    
    group.finish();
}
 
// `criterion_group!` bundles the benchmark functions; `criterion_main!`
// generates the `main` entry point (which is why `harness = false` is
// required for this target in Cargo.toml).
criterion_group!(benches, bench_fibonacci, bench_sum_methods, bench_parameterized, bench_throughput);
criterion_main!(benches);

Advanced Benchmarking Patterns

// benches/advanced_bench.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion, BatchSize, BenchmarkId};
use std::collections::{HashMap, BTreeMap};
 
// ===== Setup and Teardown =====
 
/// Builds the vector [0, 1, ..., size-1] used as benchmark input.
fn generate_data(size: usize) -> Vec<i32> {
    let mut data = Vec::new();
    data.extend(0..size as i32);
    data
}
 
// Demonstrates `iter_batched`: the setup closure builds a fresh input
// for each measured call, and the time spent in setup is excluded from
// the measurement — only the routine is timed.
fn bench_with_setup(c: &mut Criterion) {
    let mut group = c.benchmark_group("with_setup");
    
    // Setup runs per measured call; its cost is not included in the timing.
    group.bench_function("with_setup", |b| {
        b.iter_batched(
            || generate_data(1000),      // Setup (untimed)
            |data| sum_slice(&data),      // Routine (timed)
            BatchSize::SmallInput,        // Batch size strategy
        )
    });
    
    group.finish();
}
 
// ===== Comparing Data Structures =====
 
// Compares HashMap vs BTreeMap insertion across several sizes.
// Each closure returns the map it built so the inserts remain
// observable to the optimizer and cannot be eliminated.
fn bench_maps(c: &mut Criterion) {
    let mut group = c.benchmark_group("map_insert");
    
    for size in [100, 1000, 10_000].iter() {
        group.bench_with_input(BenchmarkId::new("HashMap", size), size, |b, &size| {
            b.iter(|| {
                let mut map = HashMap::new();
                for i in 0..size {
                    map.insert(i, i * 2);
                }
                map
            });
        });
        
        group.bench_with_input(BenchmarkId::new("BTreeMap", size), size, |b, &size| {
            b.iter(|| {
                let mut map = BTreeMap::new();
                for i in 0..size {
                    map.insert(i, i * 2);
                }
                map
            });
        });
    }
    
    group.finish();
}
 
// ===== Benchmarking Algorithms =====
 
// Stable `sort` vs `sort_unstable` on reverse-ordered input.
// `iter_batched` rebuilds the unsorted Vec for every call: sorting is
// in-place, so reusing one buffer would mean timing already-sorted
// data after the first iteration.
fn bench_sorting(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");
    
    for size in [100, 1000, 10_000].iter() {
        group.bench_with_input(BenchmarkId::new("sort", size), size, |b, &size| {
            b.iter_batched(
                || (0..size).rev().collect::<Vec<_>>(),  // Reverse sorted
                |mut data| data.sort(),
                BatchSize::SmallInput,
            )
        });
        
        group.bench_with_input(BenchmarkId::new("sort_unstable", size), size, |b, &size| {
            b.iter_batched(
                || (0..size).rev().collect::<Vec<_>>(),
                |mut data| data.sort_unstable(),
                BatchSize::SmallInput,
            )
        });
    }
    
    group.finish();
}
 
// ===== String Operations =====
 
// Three ways to concatenate 100 short strings. `push_str` and `join`
// grow a single buffer; the `format!` variant rebuilds the entire
// accumulated result on every pass (quadratic in total length).
fn bench_string_concat(c: &mut Criterion) {
    let words: Vec<String> = (0..100).map(|i| format!("word_{}", i)).collect();
    
    let mut group = c.benchmark_group("string_concat");
    
    group.bench_function("push_string", |b| {
        b.iter(|| {
            let mut result = String::new();
            for word in &words {
                result.push_str(word);
            }
            result
        });
    });
    
    group.bench_function("join", |b| {
        b.iter(|| words.join(""))
    });
    
    // Anti-pattern on purpose: allocates a brand-new String each pass.
    group.bench_function("format", |b| {
        b.iter(|| {
            let mut result = String::new();
            for word in &words {
                result = format!("{}{}", result, word);
            }
            result
        });
    });
    
    group.finish();
}
 
// Registers this file's benchmarks and generates the `main` entry point.
criterion_group!(advanced_benches, bench_with_setup, bench_maps, bench_sorting, bench_string_concat);
criterion_main!(advanced_benches);

Benchmarking Async Code

// benches/async_bench.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};
use tokio::runtime::Runtime;
 
/// Async wrapper around a simple sum; contains no await points, so the
/// future completes on its first poll.
async fn async_sum(data: Vec<i32>) -> i32 {
    let mut total = 0;
    for value in data {
        total += value;
    }
    total
}
 
/// Sums the integers in 0..n — plain computation behind an async
/// signature (no await points; ready on the first poll).
async fn async_operation(n: u64) -> u64 {
    (0..n).sum()
}
 
// Async benchmarking: each measured call is driven to completion on the
// provided Tokio runtime via `to_async`.
// NOTE(review): `Bencher::to_async` exists only when Criterion is built
// with an async executor feature (e.g. `async_tokio`) — confirm the
// manifest enables it.
fn bench_async(c: &mut Criterion) {
    // One runtime shared by all benchmarks; building it per iteration
    // would dominate the measurement.
    let rt = Runtime::new().unwrap();
    
    c.bench_function("async_sum", |b| {
        b.to_async(&rt).iter(|| {
            // Built each call inside the closure, so the Vec allocation
            // is part of the measured work.
            let data = (0..1000).collect::<Vec<_>>();
            async_sum(data)
        })
    });
    
    c.bench_function("async_operation", |b| {
        b.to_async(&rt).iter(|| async_operation(black_box(1000)))
    });
}
 
// Registers the async benchmark and generates the `main` entry point.
criterion_group!(async_benches, bench_async);
criterion_main!(async_benches);

Running Benchmarks

# Run all benchmarks (always built in release mode)
cargo bench
 
# Run only benchmarks whose ID matches the filter string
cargo bench -- fibonacci_comparison
 
# Save the current results under a named baseline
cargo bench -- --save-baseline main
 
# Compare a later run against that saved baseline
cargo bench -- --baseline main
 
# HTML reports are written automatically to target/criterion/ on every
# run when the `html_reports` feature is enabled
cargo bench

Summary

  • Use c.bench_function("name", |b| b.iter(|| ...)) for simple benchmarks
  • Use black_box() to keep the optimizer from constant-folding inputs or discarding the benchmarked computation as dead code
  • Group related benchmarks with c.benchmark_group("name") for organized output
  • Parameterize benchmarks with bench_with_input(BenchmarkId::new(...), input, ...)
  • Use iter_batched(setup, routine, BatchSize) when setup is needed before each iteration
  • Set throughput with group.throughput(Throughput::Bytes(n)) for bytes/second measurements
  • Benchmark async code with b.to_async(&runtime).iter(|| async_fn()) — requires a Criterion async executor feature such as async_tokio, plus the runtime crate as a dev-dependency
  • Compare against baselines with --save-baseline and --baseline flags
  • View HTML reports in target/criterion/<benchmark_name>/report/index.html
  • Criterion runs warm-up iterations, then multiple samples for statistical significance
  • Use criterion to catch performance regressions in CI pipelines