How do I write benchmarks with criterion in Rust?

Walkthrough

The criterion crate is a statistics-driven benchmarking library for Rust that provides accurate and reliable performance measurements. Unlike basic timing approaches, criterion uses statistical analysis to detect small regressions, handle outliers, and provide confidence intervals. It automatically runs multiple iterations, warms up caches, and produces detailed reports including HTML visualizations. Criterion is essential for performance-critical code, detecting regressions in CI/CD, and optimizing hot paths.

Key concepts:

  1. Benchmark Groups — organize related benchmarks together
  2. Bencher — provides methods for benchmarking code
  3. Throughput — measure bytes/elements processed per second
  4. Comparison — compare current run against previous results
  5. HTML Reports — generate visual benchmark reports

Code Example

# Cargo.toml
# criterion is only used by benchmarks, so it belongs in dev-dependencies
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }
 
[[bench]]
name = "my_benchmark"
harness = false
// benches/my_benchmark.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
/// Naive doubly-recursive Fibonacci (1-based: fib(0) = fib(1) = 1).
/// Intentionally exponential-time so the benchmark has real work to measure.
fn fibonacci(n: u64) -> u64 {
    if n < 2 {
        1
    } else {
        fibonacci(n - 1) + fibonacci(n - 2)
    }
}
 
/// Registers the "fib 20" benchmark. `black_box` keeps the argument opaque
/// so the compiler cannot const-fold the entire call away.
fn criterion_benchmark(c: &mut Criterion) {
    c.bench_function("fib 20", |bencher| {
        bencher.iter(|| fibonacci(black_box(20)))
    });
}
 
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);

Basic Benchmark Setup

// benches/basic.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
/// Adds two i32s; the trivial function under test for the basic example.
fn add(a: i32, b: i32) -> i32 {
    a + b
}
 
/// Benchmarks `add` with opaque operands so the sum happens at runtime
/// instead of being folded to a constant.
fn bench_add(c: &mut Criterion) {
    c.bench_function("add", |bencher| {
        bencher.iter(|| add(black_box(10), black_box(20)))
    });
}
 
criterion_group!(benches, bench_add);
criterion_main!(benches);

Run with:

cargo bench

Understanding black_box

use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
// black_box prevents compiler optimizations
// Without it, the compiler might optimize away our computation
 
/// Sums 1..=n — a pure workload the optimizer would love to precompute,
/// which is exactly why the callers wrap it in `black_box`.
fn expensive_computation(n: u64) -> u64 {
    (1..=n).fold(0, |acc, x| acc + x)
}
 
/// Shows the two places `black_box` matters: on the input (so the argument
/// is not a known constant) and on the output (so the result counts as used).
fn bench_computation(c: &mut Criterion) {
    // Opaque input: the compiler cannot precompute the sum for a fixed n.
    c.bench_function("sum 1 to 1000", |bencher| {
        bencher.iter(|| expensive_computation(black_box(1000)))
    });

    // Opaque output: the result cannot be discarded as dead code.
    c.bench_function("sum with result black_box", |bencher| {
        bencher.iter(|| black_box(expensive_computation(1000)))
    });
}
 
criterion_group!(benches, bench_computation);
criterion_main!(benches);

Benchmark Groups

use criterion::{criterion_group, criterion_main, Criterion};
 
/// Sorts in place. Takes `&mut [i32]` rather than `&mut Vec<i32>`
/// (clippy::ptr_arg): a `&mut Vec<i32>` argument deref-coerces to a slice
/// at every existing call site, so callers are unaffected.
fn sort_vec(data: &mut [i32]) {
    data.sort();
}

/// Slice-based sort, kept for the tutorial's original vec-vs-slice contrast.
fn sort_slice(data: &mut [i32]) {
    data.sort();
}
 
/// Groups two sort benchmarks under the "sorting" namespace.
/// Each iteration clones a pre-built reverse-sorted vector so the routine
/// always sorts unsorted data; note the clone cost is inside the timing.
fn bench_sorting(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");

    // 100 elements, worst case (reverse order).
    group.bench_function("sort vec 100", |b| {
        // `data` is only cloned, never mutated — no `mut` needed
        // (the original bindings triggered unused-mut warnings).
        let data: Vec<i32> = (0..100).rev().collect();
        b.iter(|| {
            let mut v = data.clone();
            sort_vec(&mut v);
            v // return the vec so the sort cannot be optimized away
        })
    });

    // Same benchmark at 10x the size.
    group.bench_function("sort vec 1000", |b| {
        let data: Vec<i32> = (0..1000).rev().collect();
        b.iter(|| {
            let mut v = data.clone();
            sort_vec(&mut v);
            v
        })
    });

    group.finish();
}
 
criterion_group!(benches, bench_sorting);
criterion_main!(benches);

Throughput Measurement

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 
/// Sums every byte of `data`, widened to u64.
fn process_data(data: &[u8]) -> u64 {
    data.iter().fold(0u64, |acc, &b| acc + u64::from(b))
}
 
/// Measures bytes-per-second for summing a 1 MiB buffer.
fn bench_throughput(c: &mut Criterion) {
    const SIZE: usize = 1024 * 1024; // 1 MiB of cycling byte values
    let data: Vec<u8> = (0..=255).cycle().take(SIZE).collect();

    let mut group = c.benchmark_group("throughput");

    // Declaring the payload size makes criterion report a data rate
    // (e.g. GiB/s) alongside raw time.
    group.throughput(Throughput::Bytes(data.len() as u64));

    group.bench_function("process 1MB", |bencher| {
        bencher.iter(|| process_data(&data))
    });

    group.finish();
}
 
criterion_group!(benches, bench_throughput);
criterion_main!(benches);

Elements Throughput

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
 
/// Linear scan; returns the index of the first element equal to `needle`.
fn find_item(haystack: &[i32], needle: i32) -> Option<usize> {
    for (idx, &value) in haystack.iter().enumerate() {
        if value == needle {
            return Some(idx);
        }
    }
    None
}
 
/// Measures elements-per-second for a linear scan that always hits the
/// last slot (worst case).
fn bench_search(c: &mut Criterion) {
    let mut group = c.benchmark_group("search");

    let data: Vec<i32> = (0..10_000).collect();

    // Report throughput in elements, not bytes.
    group.throughput(Throughput::Elements(data.len() as u64));

    group.bench_function("linear search", |b| {
        // Fully qualified: `black_box` was not in this example's `use`
        // list, and `std::hint::black_box` needs no import (stable 1.66+).
        b.iter(|| find_item(&data, std::hint::black_box(9999)))
    });

    group.finish();
}
 
criterion_group!(benches, bench_search);
criterion_main!(benches);

Parameterized Benchmarks

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
 
/// Counts primes in 2..=limit by trial division.
fn count_primes(limit: usize) -> usize {
    (2..=limit).filter(|&n| is_prime(n)).count()
}

/// Trial division with an exact integer bound: test every i with i*i <= n.
/// This avoids the float-rounding risk of `(n as f64).sqrt() as usize`,
/// which can mis-round for very large n.
fn is_prime(n: usize) -> bool {
    if n < 2 {
        return false;
    }
    let mut i = 2;
    while i * i <= n {
        if n % i == 0 {
            return false;
        }
        i += 1;
    }
    true
}
 
/// Runs the prime-count benchmark across several input sizes; the size is
/// baked into the `BenchmarkId`, so reports show one series per size.
fn bench_primes(c: &mut Criterion) {
    let mut group = c.benchmark_group("primes");

    for &size in &[100, 1000, 10000, 100000] {
        let id = BenchmarkId::new("count", size);
        group.bench_with_input(id, &size, |b, &limit| {
            b.iter(|| count_primes(limit))
        });
    }

    group.finish();
}
 
criterion_group!(benches, bench_primes);
criterion_main!(benches);

Comparing Implementations

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
 
// Different implementations
// All three compute the identical sum; they exist to compare how the
// compiler treats the three styles. Do not "unify" them — the stylistic
// contrast is the whole point of the benchmark below.

/// Sum via the `Iterator::sum` adapter.
fn sum_iter(data: &[i32]) -> i32 {
    data.iter().sum()
}

/// Sum via an explicit accumulator loop.
fn sum_loop(data: &[i32]) -> i32 {
    let mut total = 0;
    for &n in data {
        total += n;
    }
    total
}

/// Sum via `Iterator::fold` with an explicit closure.
fn sum_fold(data: &[i32]) -> i32 {
    data.iter().fold(0, |acc, &n| acc + n)
}
 
/// Benchmarks the three sum implementations side by side in one group,
/// so criterion's report lines them up for direct comparison.
fn bench_implementations(c: &mut Criterion) {
    let mut group = c.benchmark_group("sum implementations");

    let data: Vec<i32> = (0..10000).collect();

    // The type annotation coerces each fn item to a fn pointer so the
    // array is homogeneous.
    let candidates: [(&str, fn(&[i32]) -> i32); 3] =
        [("iter", sum_iter), ("loop", sum_loop), ("fold", sum_fold)];

    for (name, func) in candidates {
        group.bench_function(name, |b| b.iter(|| func(&data)));
    }

    group.finish();
}
 
criterion_group!(benches, bench_implementations);
criterion_main!(benches);

Comparing Multiple Input Sizes

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
 
/// Polynomial string hash (Java style): h = h*31 + byte, wrapping on overflow.
fn hash_string(s: &str) -> u64 {
    s.bytes()
        .fold(0u64, |h, b| h.wrapping_mul(31).wrapping_add(u64::from(b)))
}
 
/// Benchmarks the hash over three input sizes, reporting bytes/second.
fn bench_hash_sizes(c: &mut Criterion) {
    let mut group = c.benchmark_group("hash_string");

    // (label, input length in bytes)
    let cases = [("small", 10usize), ("medium", 100), ("large", 1000)];

    for (name, size) in cases {
        let data = "x".repeat(size);
        // Set throughput per case so every size reports its own rate.
        group.throughput(criterion::Throughput::Bytes(size as u64));
        group.bench_with_input(BenchmarkId::new("hash", name), &data, |b, input| {
            b.iter(|| hash_string(input))
        });
    }

    group.finish();
}
 
criterion_group!(benches, bench_hash_sizes);
criterion_main!(benches);

Custom Configuration

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
use std::time::Duration;
 
/// Sums 0..n — a small, pure workload for the configuration demo.
fn work(n: usize) -> usize {
    (0..n).fold(0, |acc, i| acc + i)
}
 
/// Demonstrates per-group tuning knobs. Criterion 0.5 defaults:
/// sample_size 100, measurement_time 5 s, warm_up_time 3 s.
fn bench_custom_config(c: &mut Criterion) {
    let mut group = c.benchmark_group("custom_config");

    // Fewer samples: faster runs, wider confidence intervals (default: 100).
    group.sample_size(50);

    // Spend longer measuring for tighter statistics (default: 5 seconds).
    group.measurement_time(Duration::from_secs(10));

    // Shorter warm-up before measurement begins (default: 3 seconds).
    group.warm_up_time(Duration::from_secs(2));

    // Number of bootstrap resamples used in the statistical analysis —
    // NOT warm-up iterations, as the original comment claimed.
    group.nresamples(10_000);

    group.bench_function("work 1000", |b| {
        // Fully qualified: `black_box` was not imported in this example,
        // and `std::hint::black_box` needs no import (stable 1.66+).
        b.iter(|| work(std::hint::black_box(1000)))
    });

    group.finish();
}
 
criterion_group!(benches, bench_custom_config);
criterion_main!(benches);

Benchmarking with Setup

use criterion::{criterion_group, criterion_main, Criterion, BatchSize};
 
/// Consumes the input string and returns its uppercase form.
fn process_string(s: String) -> String {
    s.to_uppercase()
}
 
fn bench_with_setup(c: &mut Criterion) {
    c.bench_function("process_string", |b| {
        // iter_batched runs setup for each iteration
        b.iter_batched(
            || "hello world".to_string(),  // setup
            |s| process_string(s),          // routine
            BatchSize::SmallInput,          // batch size
        )
    });
}
 
criterion_group!(benches, bench_with_setup);
criterion_main!(benches);

Batch Sizes

use criterion::{criterion_group, criterion_main, Criterion, BatchSize};
 
/// Builds a Vec of 0..n, used as untimed setup for the sort benchmark.
/// The original body `(0..n).collect()` did not compile: a Range<usize>
/// cannot collect into Vec<i32>, so the items are cast explicitly.
fn create_vec(n: usize) -> Vec<i32> {
    (0..n).map(|i| i as i32).collect()
}

/// Sorts in place. Takes `&mut [i32]` rather than `&mut Vec<i32>`
/// (clippy::ptr_arg); `&mut Vec<i32>` deref-coerces to a slice at the
/// call site, so the `iter_batched_ref` routine below still works.
fn sort_vec(v: &mut [i32]) {
    v.sort();
}
 
/// Shows `iter_batched_ref`, which hands the routine a `&mut` reference to
/// the setup output — useful for in-place operations like sorting.
fn bench_batch_sizes(c: &mut Criterion) {
    let mut group = c.benchmark_group("batching");

    group.bench_function("sort 1000", |bencher| {
        bencher.iter_batched_ref(
            || create_vec(1000),   // untimed setup
            |v| sort_vec(v),       // timed routine (receives &mut Vec<i32>)
            BatchSize::SmallInput,
        )
    });

    // Picking a BatchSize:
    // - SmallInput: cheap setup output, large batches (lowest overhead)
    // - LargeInput: big setup output, smaller batches
    // - PerIteration: one setup per timed call (highest overhead)

    group.finish();
}
 
criterion_group!(benches, bench_batch_sizes);
criterion_main!(benches);

External Setup (Avoid Timing)

use criterion::{black_box, criterion_group, criterion_main, Criterion};
 
/// Builds the 10k-element input. Deliberately kept OUT of the timed loop.
fn expensive_setup() -> Vec<i32> {
    (0..10000).collect()
}

/// The hot operation under test: a plain slice sum.
fn fast_operation(data: &[i32]) -> i32 {
    let mut total = 0;
    for &x in data {
        total += x;
    }
    total
}
 
/// Builds the input once, outside the timing loop, so only the summation
/// itself is measured.
fn bench_external_setup(c: &mut Criterion) {
    let data = expensive_setup(); // runs once; never timed

    c.bench_function("fast_operation", |bencher| {
        bencher.iter(|| fast_operation(&data))
    });
}
 
criterion_group!(benches, bench_external_setup);
criterion_main!(benches);

Async Benchmarking

use criterion::{criterion_group, criterion_main, Criterion};
 
// Async benchmarking with tokio
/// Simulated async workload: sleeps briefly, then doubles the input.
async fn async_operation(n: usize) -> usize {
    let delay = std::time::Duration::from_micros(100);
    tokio::time::sleep(delay).await;
    n * 2
}
 
// Drives an async function on a manually-created Tokio runtime.
// NOTE(review): `Bencher::to_async` is gated behind criterion's
// `async_tokio` feature — the Cargo.toml shown earlier only enables
// `html_reports`, so add `"async_tokio"` there; verify against criterion's docs.
fn bench_async(c: &mut Criterion) {
    let rt = tokio::runtime::Runtime::new().unwrap();
    
    c.bench_function("async_operation", |b| {
        // Each iteration awaits the future to completion on `rt`.
        b.to_async(&rt).iter(|| async_operation(100))
    });
}
 
criterion_group!(benches, bench_async);
criterion_main!(benches);

Benchmarking File I/O

use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use std::io::{Read, Cursor};
 
/// Clears `buf`, then reads the entire stream into it.
/// Returns any I/O error from the underlying reader.
fn read_all<R: Read>(mut reader: R, buf: &mut Vec<u8>) -> std::io::Result<()> {
    buf.clear();
    let _bytes_read = reader.read_to_end(buf)?;
    Ok(())
}
 
/// Measures reading 1 MiB through the `Read` trait into a reused buffer.
fn bench_io(c: &mut Criterion) {
    let data = vec![0u8; 1024 * 1024]; // 1 MiB payload
    let mut buf = Vec::with_capacity(data.len()); // reused across iterations

    let mut group = c.benchmark_group("file_io");
    group.throughput(Throughput::Bytes(data.len() as u64));

    group.bench_function("read_1mb", |b| {
        b.iter(|| {
            let cursor = Cursor::new(&data);
            read_all(cursor, &mut buf).unwrap();
            // Fully qualified: `black_box` was not imported in this
            // example; `std::hint::black_box` needs no import (1.66+).
            std::hint::black_box(&buf);
        })
    });

    group.finish();
}
 
criterion_group!(benches, bench_io);
criterion_main!(benches);

Memory Usage Profiling

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
 
/// Allocates a zero-filled buffer of `size` bytes.
/// Kept as `vec![0u8; size]` because the allocation path IS the benchmark.
fn allocate_vec(size: usize) -> Vec<u8> {
    vec![0u8; size]
}
 
/// Benchmarks zero-filled allocations at several sizes.
fn bench_memory(c: &mut Criterion) {
    let mut group = c.benchmark_group("allocation");

    for &size in &[100usize, 1000, 10000, 100000] {
        group.bench_with_input(BenchmarkId::new("allocate", size), &size, |b, &n| {
            b.iter(|| allocate_vec(n))
        });
    }

    group.finish();
}
 
criterion_group!(benches, bench_memory);
criterion_main!(benches);

Regression Detection

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
 
/// Intentionally O(n^2) workload: sum of i*j over the full n x n grid.
fn slow_function(n: usize) -> usize {
    (0..n)
        .map(|i| (0..n).map(|j| i * j).sum::<usize>())
        .sum()
}
 
/// Benchmark intended for baseline comparison across code changes:
///   save a baseline:   cargo bench -- --save-baseline main
///   compare against:   cargo bench -- --baseline main
fn bench_regression(c: &mut Criterion) {
    let mut group = c.benchmark_group("regression");

    group.bench_function("slow 100", |b| {
        // Fully qualified: `black_box` was not imported in this example,
        // and `std::hint::black_box` needs no import (stable 1.66+).
        b.iter(|| slow_function(std::hint::black_box(100)))
    });

    group.finish();
}
 
criterion_group!(benches, bench_regression);
criterion_main!(benches);

Multiple Benchmark Groups

use criterion::{criterion_group, criterion_main, Criterion};
 
// Group 1: String operations
/// Case-conversion micro-benchmarks; each iteration allocates a fresh
/// String by design (the allocation is part of what's measured).
fn bench_strings(c: &mut Criterion) {
    let mut group = c.benchmark_group("strings");

    group.bench_function("to_uppercase", |bencher| {
        bencher.iter(|| "hello".to_uppercase())
    });

    group.bench_function("to_lowercase", |bencher| {
        bencher.iter(|| "HELLO".to_lowercase())
    });

    group.finish();
}
 
// Group 2: Number operations
/// Floating-point micro-benchmarks; sums keep left-to-right order,
/// matching the original iterator versions bit-for-bit.
fn bench_numbers(c: &mut Criterion) {
    let mut group = c.benchmark_group("numbers");

    group.bench_function("sqrt", |bencher| {
        bencher.iter(|| {
            let mut total = 0.0f64;
            for n in 1..1000 {
                total += (n as f64).sqrt();
            }
            total
        })
    });

    group.bench_function("pow", |bencher| {
        bencher.iter(|| {
            let mut total = 0.0f64;
            for n in 1..100 {
                total += (n as f64).powi(2);
            }
            total
        })
    });

    group.finish();
}
 
criterion_group!(string_benches, bench_strings);
criterion_group!(number_benches, bench_numbers);
criterion_main!(string_benches, number_benches);

Filtering Benchmarks

// Run specific benchmarks:
// cargo bench -- string_benches
// cargo bench -- to_uppercase
// cargo bench -- "strings/"    (filters are substring/regex matches, not shell globs)
 
use criterion::{criterion_group, criterion_main, Criterion};
 
/// Trivial benchmark used to demonstrate name-based filtering.
fn bench_a(c: &mut Criterion) {
    c.bench_function("function_a", |bencher| bencher.iter(|| 1 + 1));
}

/// Second trivial benchmark, registered under its own group.
fn bench_b(c: &mut Criterion) {
    c.bench_function("function_b", |bencher| bencher.iter(|| 2 + 2));
}
 
criterion_group!(group_a, bench_a);
criterion_group!(group_b, bench_b);
criterion_main!(group_a, group_b);

Profiling Integration

use criterion::{criterion_group, criterion_main, Criterion, Profiler, ProfileData};
use std::process::Command;
 
// Custom profiler (e.g., for perf or valgrind)
struct PerfProfiler;
 
impl Profiler for PerfProfiler {
    fn start(&self, _profile_data: &mut ProfileData) {
        // Start profiling
    }
    
    fn stop(&self, _profile_data: &mut ProfileData) {
        // Stop profiling
    }
}
 
/// Benchmark target for the custom-profiler example.
fn bench_with_profiler(c: &mut Criterion) {
    // The profiler is attached via the group config, not on `c`.
    // (The original hint `c.profile BenchProfiler::new();` was not valid
    // Rust and named a nonexistent type.) To enable it:
    //
    //   criterion_group! {
    //       name = benches;
    //       config = Criterion::default().with_profiler(PerfProfiler);
    //       targets = bench_with_profiler
    //   }
    //
    // then run: cargo bench --bench <name> -- --profile-time 10
    
    c.bench_function("profiled", |b| {
        b.iter(|| (0..1000).sum::<u64>())
    });
}
 
criterion_group!(benches, bench_with_profiler);
criterion_main!(benches);

Real-World Example: Parser Benchmark

use criterion::{black_box, criterion_group, criterion_main, Criterion, BenchmarkId, Throughput};
 
#[derive(Debug)]
enum Token {
    Number(i32),
    Plus,
    Minus,
}
 
/// Tokenizes a whitespace-separated arithmetic expression.
/// Words that are neither operators nor parseable integers are
/// silently dropped (same contract as the filter_map original).
fn parse_expression(input: &str) -> Vec<Token> {
    let mut tokens = Vec::new();
    for word in input.split_whitespace() {
        let token = match word {
            "+" => Some(Token::Plus),
            "-" => Some(Token::Minus),
            other => other.parse().ok().map(Token::Number),
        };
        if let Some(t) = token {
            tokens.push(t);
        }
    }
    tokens
}
 
/// Benchmarks the tokenizer on inputs of increasing length, reporting
/// throughput in input bytes per second.
fn bench_parser(c: &mut Criterion) {
    let mut group = c.benchmark_group("parser");

    let cases = [
        ("simple", "1 + 2"),
        ("medium", "1 + 2 - 3 + 4 - 5"),
        ("complex", "100 + 200 - 300 + 400 - 500 + 600 - 700 + 800"),
    ];

    for (label, text) in cases {
        // Throughput is set per case so every input reports its own rate.
        group.throughput(Throughput::Bytes(text.len() as u64));
        group.bench_with_input(BenchmarkId::new("parse", label), text, |b, src| {
            b.iter(|| parse_expression(black_box(src)))
        });
    }

    group.finish();
}
 
criterion_group!(benches, bench_parser);
criterion_main!(benches);

Real-World Example: Collection Benchmarks

use criterion::{criterion_group, criterion_main, Criterion, BenchmarkId};
use std::collections::{HashMap, BTreeMap};
 
/// Compares insert performance of HashMap vs BTreeMap at several sizes.
/// Map construction happens inside the timed closure, so allocation and
/// growth are intentionally part of the measurement.
fn bench_collections(c: &mut Criterion) {
    let mut group = c.benchmark_group("map_insert");

    for &size in &[100, 1000, 10000] {
        group.bench_with_input(BenchmarkId::new("HashMap", size), &size, |b, &n| {
            b.iter(|| {
                let mut map = HashMap::new();
                for i in 0..n {
                    map.insert(i, i * 2);
                }
                map // returned so the inserts are observably used
            })
        });

        group.bench_with_input(BenchmarkId::new("BTreeMap", size), &size, |b, &n| {
            b.iter(|| {
                let mut map = BTreeMap::new();
                for i in 0..n {
                    map.insert(i, i * 2);
                }
                map
            })
        });
    }

    group.finish();
}
 
criterion_group!(benches, bench_collections);
criterion_main!(benches);

Summary

  • c.bench_function() benchmarks a single function
  • c.benchmark_group() organizes related benchmarks
  • Use black_box() to prevent compiler optimizations
  • Throughput::Bytes() and Throughput::Elements() measure throughput
  • BenchmarkId for parameterized benchmarks
  • iter_batched() for setup routines that shouldn't be timed
  • to_async() for async benchmarks with a runtime
  • Enable html_reports feature for visual reports
  • Run with cargo bench
  • Filter with cargo bench -- "pattern"
  • Compare baselines with --save-baseline and --baseline
  • Criterion provides statistical confidence intervals, not just averages
  • Perfect for: detecting performance regressions, optimizing hot paths, comparing implementations, CI/CD benchmarking