Files
Sprimo/skills/m10-performance/patterns/optimization-guide.md
2026-02-12 22:58:33 +08:00

6.9 KiB

Rust Performance Optimization Guide

Profiling First

Tools

# CPU profiling
cargo install flamegraph
cargo flamegraph --bin myapp

# Memory profiling
cargo install cargo-instruments  # macOS
heaptrack ./target/release/myapp  # Linux

# Benchmarking
cargo bench  # with criterion

# Cache analysis
valgrind --tool=cachegrind ./target/release/myapp

Criterion Benchmarks

use criterion::{criterion_group, criterion_main, Criterion};

fn benchmark_parse(c: &mut Criterion) {
    let input = "test data".repeat(1000);

    c.bench_function("parse_v1", |b| {
        b.iter(|| parse_v1(&input))
    });

    c.bench_function("parse_v2", |b| {
        b.iter(|| parse_v2(&input))
    });
}

criterion_group!(benches, benchmark_parse);
criterion_main!(benches);

Common Optimizations

1. Avoid Unnecessary Allocations

// BAD: allocates on every call
fn to_uppercase(s: &str) -> String {
    s.to_uppercase()
}

// GOOD: return Cow, allocate only if needed
use std::borrow::Cow;

fn to_uppercase(s: &str) -> Cow<'_, str> {
    // Note: testing all(|c| c.is_uppercase()) would be wrong here —
    // digits, spaces, and punctuation are not "uppercase", so inputs
    // like "HELLO WORLD" would allocate needlessly. Instead, borrow
    // whenever there is nothing lowercase to convert.
    if s.chars().all(|c| !c.is_lowercase()) {
        Cow::Borrowed(s)
    } else {
        Cow::Owned(s.to_uppercase())
    }
}

2. Reuse Allocations

// BAD: creates new Vec each iteration
for item in items {
    let mut buffer = Vec::new();
    process(&mut buffer, item);
}

// GOOD: reuse buffer
let mut buffer = Vec::new();
for item in items {
    buffer.clear();
    process(&mut buffer, item);
}

3. Use Appropriate Collections

| Need                 | Collection               | Notes                     |
|----------------------|--------------------------|---------------------------|
| Sequential access    | `Vec<T>`                 | Best cache locality       |
| Random access by key | `HashMap<K, V>`          | O(1) lookup               |
| Ordered keys         | `BTreeMap<K, V>`         | O(log n) lookup           |
| Small sets (<20)     | `Vec<T>` + linear search | Lower overhead            |
| FIFO queue           | `VecDeque<T>`            | O(1) push/pop both ends   |

4. Pre-allocate Capacity

// BAD: many reallocations
let mut v = Vec::new();
for i in 0..10000 {
    v.push(i);
}

// GOOD: single allocation
let mut v = Vec::with_capacity(10000);
for i in 0..10000 {
    v.push(i);
}

String Optimization

Avoid String Concatenation in Loops

// BAD: O(n²) — format! copies the entire accumulated string into a
// brand-new allocation on every iteration.
// (Plain `result = result + &s` is actually NOT quadratic: `+`
// consumes the String and appends in place, reusing its buffer.)
let mut result = String::new();
for s in strings {
    result = format!("{}{}", result, s);
}

// GOOD: O(n) with push_str
let mut result = String::new();
for s in strings {
    result.push_str(&s);
}

// BETTER: pre-calculate capacity
let total_len: usize = strings.iter().map(|s| s.len()).sum();
let mut result = String::with_capacity(total_len);
for s in strings {
    result.push_str(&s);
}

// BEST: use join for simple cases
let result = strings.join("");

Use &str When Possible

// BAD: requires allocation
fn greet(name: String) {
    println!("Hello, {}", name);
}

// GOOD: borrows, no allocation
fn greet(name: &str) {
    println!("Hello, {}", name);
}

// Works with both:
greet("world");                    // &str
greet(&String::from("world"));     // &String coerces to &str

Iterator Optimization

Use Iterators Over Indexing

// BAD: bounds checking on each access
let mut sum = 0;
for i in 0..vec.len() {
    sum += vec[i];
}

// GOOD: no bounds checking
let sum: i32 = vec.iter().sum();

// GOOD: when index needed
for (i, item) in vec.iter().enumerate() {
    // ...
}

Lazy Evaluation

// Iterators are lazy - computation happens at collect
let result: Vec<_> = data
    .iter()
    .filter(|x| x.is_valid())
    .map(|x| x.process())
    .take(10)  // stop after 10 items
    .collect();

Avoid Collecting When Not Needed

// BAD: unnecessary intermediate allocation
let filtered: Vec<_> = items.iter().filter(|x| x.valid).collect();
let count = filtered.len();

// GOOD: no allocation
let count = items.iter().filter(|x| x.valid).count();

Parallelism with Rayon

use rayon::prelude::*;

// Sequential
let sum: i32 = (0..1_000_000).map(|x| x * x).sum();

// Parallel (automatic work stealing)
let sum: i32 = (0..1_000_000).into_par_iter().map(|x| x * x).sum();

// Parallel with custom chunk size
let results: Vec<_> = data
    .par_chunks(1000)
    .map(|chunk| process_chunk(chunk))
    .collect();

Memory Layout

Use Appropriate Integer Sizes

// If values are small, use smaller types
struct Item {
    count: u8,      // 0-255, not u64
    flags: u8,      // small enum
    id: u32,        // if 4 billion is enough
}

Pack Structs Efficiently

// BAD: 24 bytes due to padding — under #[repr(C)], which preserves
// declaration order. (The default repr(Rust) lets the compiler reorder
// fields, which usually fixes this automatically; the explicit layout
// matters for FFI or when you opt into repr(C).)
#[repr(C)]
struct Bad {
    a: u8,   // 1 byte + 7 padding
    b: u64,  // 8 bytes
    c: u8,   // 1 byte + 7 padding
}

// GOOD: 16 bytes — order fields largest-first.
// (#[repr(packed)] removes padding entirely, but taking a reference
// to an unaligned field is undefined behavior — avoid it unless a
// wire/FFI format requires it.)
#[repr(C)]
struct Good {
    b: u64,  // 8 bytes
    a: u8,   // 1 byte
    c: u8,   // 1 byte + 6 padding
}

Box Large Values

// Large enum variants waste space
enum Message {
    Quit,
    Data([u8; 10000]),  // all variants are 10000+ bytes
}

// Better: box the large variant
enum Message {
    Quit,
    Data(Box<[u8; 10000]>),  // variants are pointer-sized
}

Async Performance

Avoid Blocking in Async

// BAD: blocks the executor
async fn bad() {
    std::thread::sleep(Duration::from_secs(1));  // blocking!
    std::fs::read_to_string("file.txt").unwrap();  // blocking!
}

// GOOD: use async versions
async fn good() {
    tokio::time::sleep(Duration::from_secs(1)).await;
    tokio::fs::read_to_string("file.txt").await.unwrap();
}

// For CPU work: spawn_blocking
async fn compute() -> i32 {
    tokio::task::spawn_blocking(|| {
        heavy_computation()
    }).await.unwrap()
}

Buffer Async I/O

use tokio::io::{AsyncBufReadExt, BufReader};

// BAD: many small reads — one await (and potential syscall) per byte.
// (Needs `mut file` and `tokio::io::AsyncReadExt` in scope.)
async fn bad(mut file: File) {
    let mut byte = [0u8];
    while file.read(&mut byte).await.unwrap() > 0 {
        process(byte[0]);
    }
}

// GOOD: buffered reading
async fn good(file: File) {
    let reader = BufReader::new(file);
    let mut lines = reader.lines();
    while let Some(line) = lines.next_line().await.unwrap() {
        process(&line);
    }
}

Release Build Optimization

Cargo.toml Settings

[profile.release]
lto = true           # Link-time optimization
codegen-units = 1    # Single codegen unit (slower compile, faster code)
panic = "abort"      # Smaller binary, no unwinding
strip = true         # Strip symbols

[profile.release-fast]
inherits = "release"
opt-level = 3        # Maximum optimization (note: 3 is already the release default)

[profile.release-small]
inherits = "release"
opt-level = "s"      # Optimize for size

Compile-Time Assertions

// Zero runtime cost
const _: () = assert!(std::mem::size_of::<MyStruct>() <= 64);

Checklist

Before optimizing:

  • Profile to find actual bottlenecks
  • Have benchmarks to measure improvement
  • Consider if optimization is worth complexity

Common wins:

  • Reduce allocations (Cow, reuse buffers)
  • Use appropriate collections
  • Pre-allocate with_capacity
  • Use iterators instead of indexing
  • Enable LTO for release builds
  • Use rayon for parallel workloads