Files
Sprimo/skills/m10-performance/patterns/optimization-guide.md
2026-02-12 22:58:33 +08:00

6.9 KiB

Rust Performance Optimization Guide

Profiling First

Tools

# CPU profiling
cargo install flamegraph
cargo flamegraph --bin myapp

# Memory profiling
cargo install cargo-instruments  # macOS
heaptrack ./target/release/myapp  # Linux

# Benchmarking
cargo bench  # with criterion

# Cache analysis
valgrind --tool=cachegrind ./target/release/myapp

Criterion Benchmarks

use criterion::{criterion_group, criterion_main, Criterion};

fn benchmark_parse(c: &mut Criterion) {
    let input = "test data".repeat(1000);

    c.bench_function("parse_v1", |b| {
        b.iter(|| parse_v1(&input))
    });

    c.bench_function("parse_v2", |b| {
        b.iter(|| parse_v2(&input))
    });
}

criterion_group!(benches, benchmark_parse);
criterion_main!(benches);

Common Optimizations

1. Avoid Unnecessary Allocations

// BAD: allocates on every call
fn to_uppercase(s: &str) -> String {
    s.to_uppercase()
}

// GOOD: return Cow, allocate only if needed
use std::borrow::Cow;

fn to_uppercase(s: &str) -> Cow<'_, str> {
    // Note: testing all(|c| c.is_uppercase()) would be wrong here —
    // digits, spaces, and punctuation are not "uppercase", so inputs
    // like "HELLO WORLD" would allocate needlessly. Instead, borrow
    // whenever there is nothing lowercase to convert.
    if s.chars().all(|c| !c.is_lowercase()) {
        Cow::Borrowed(s)
    } else {
        Cow::Owned(s.to_uppercase())
    }
}

2. Reuse Allocations

// BAD: creates new Vec each iteration
for item in items {
    let mut buffer = Vec::new();
    process(&mut buffer, item);
}

// GOOD: reuse buffer
let mut buffer = Vec::new();
for item in items {
    buffer.clear();
    process(&mut buffer, item);
}

3. Use Appropriate Collections

| Need                 | Collection               | Notes                     |
|----------------------|--------------------------|---------------------------|
| Sequential access    | `Vec<T>`                 | Best cache locality       |
| Random access by key | `HashMap<K, V>`          | O(1) lookup               |
| Ordered keys         | `BTreeMap<K, V>`         | O(log n) lookup           |
| Small sets (<20)     | `Vec<T>` + linear search | Lower overhead            |
| FIFO queue           | `VecDeque<T>`            | O(1) push/pop both ends   |

4. Pre-allocate Capacity

// BAD: many reallocations
let mut v = Vec::new();
for i in 0..10000 {
    v.push(i);
}

// GOOD: single allocation
let mut v = Vec::with_capacity(10000);
for i in 0..10000 {
    v.push(i);
}

String Optimization

Avoid String Concatenation in Loops

// BAD: O(n²) — format! copies the entire accumulated string into a
// brand-new allocation on every iteration.
// (Plain `result = result + &s` is actually NOT quadratic: `+`
// consumes the String and appends in place, reusing its buffer.)
let mut result = String::new();
for s in strings {
    result = format!("{}{}", result, s);
}

// GOOD: O(n) with push_str
let mut result = String::new();
for s in strings {
    result.push_str(&s);
}

// BETTER: pre-calculate capacity
let total_len: usize = strings.iter().map(|s| s.len()).sum();
let mut result = String::with_capacity(total_len);
for s in strings {
    result.push_str(&s);
}

// BEST: use join for simple cases
let result = strings.join("");

Use &str When Possible

// BAD: requires allocation
fn greet(name: String) {
    println!("Hello, {}", name);
}

// GOOD: borrows, no allocation
fn greet(name: &str) {
    println!("Hello, {}", name);
}

// Works with both:
greet("world");                    // &str
greet(&String::from("world"));     // &String coerces to &str

Iterator Optimization

Use Iterators Over Indexing

// BAD: bounds checking on each access
let mut sum = 0;
for i in 0..vec.len() {
    sum += vec[i];
}

// GOOD: no bounds checking
let sum: i32 = vec.iter().sum();

// GOOD: when index needed
for (i, item) in vec.iter().enumerate() {
    // ...
}

Lazy Evaluation

// Iterators are lazy - computation happens at collect
let result: Vec<_> = data
    .iter()
    .filter(|x| x.is_valid())
    .map(|x| x.process())
    .take(10)  // stop after 10 items
    .collect();

Avoid Collecting When Not Needed

// BAD: unnecessary intermediate allocation
let filtered: Vec<_> = items.iter().filter(|x| x.valid).collect();
let count = filtered.len();

// GOOD: no allocation
let count = items.iter().filter(|x| x.valid).count();

Parallelism with Rayon

use rayon::prelude::*;

// Sequential
let sum: i32 = (0..1_000_000).map(|x| x * x).sum();

// Parallel (automatic work stealing)
let sum: i32 = (0..1_000_000).into_par_iter().map(|x| x * x).sum();

// Parallel with custom chunk size
let results: Vec<_> = data
    .par_chunks(1000)
    .map(|chunk| process_chunk(chunk))
    .collect();

Memory Layout

Use Appropriate Integer Sizes

// If values are small, use smaller types
struct Item {
    count: u8,      // 0-255, not u64
    flags: u8,      // small enum
    id: u32,        // if 4 billion is enough
}

Pack Structs Efficiently

// BAD: 24 bytes due to padding — under #[repr(C)], which preserves
// declaration order. (The default repr(Rust) lets the compiler reorder
// fields, which usually fixes this automatically; the explicit layout
// matters for FFI or when you opt into repr(C).)
#[repr(C)]
struct Bad {
    a: u8,   // 1 byte + 7 padding
    b: u64,  // 8 bytes
    c: u8,   // 1 byte + 7 padding
}

// GOOD: 16 bytes — order fields largest-first.
// (#[repr(packed)] removes padding entirely, but taking a reference
// to an unaligned field is undefined behavior — avoid it unless a
// wire/FFI format requires it.)
#[repr(C)]
struct Good {
    b: u64,  // 8 bytes
    a: u8,   // 1 byte
    c: u8,   // 1 byte + 6 padding
}

Box Large Values

// Large enum variants waste space
enum Message {
    Quit,
    Data([u8; 10000]),  // all variants are 10000+ bytes
}

// Better: box the large variant
enum Message {
    Quit,
    Data(Box<[u8; 10000]>),  // variants are pointer-sized
}

Async Performance

Avoid Blocking in Async

// BAD: blocks the executor
async fn bad() {
    std::thread::sleep(Duration::from_secs(1));  // blocking!
    std::fs::read_to_string("file.txt").unwrap();  // blocking!
}

// GOOD: use async versions
async fn good() {
    tokio::time::sleep(Duration::from_secs(1)).await;
    tokio::fs::read_to_string("file.txt").await.unwrap();
}

// For CPU work: spawn_blocking
async fn compute() -> i32 {
    tokio::task::spawn_blocking(|| {
        heavy_computation()
    }).await.unwrap()
}

Buffer Async I/O

use tokio::io::{AsyncBufReadExt, BufReader};

// BAD: many small reads — one await (and potential syscall) per byte.
// (Needs `mut file` and `tokio::io::AsyncReadExt` in scope.)
async fn bad(mut file: File) {
    let mut byte = [0u8];
    while file.read(&mut byte).await.unwrap() > 0 {
        process(byte[0]);
    }
}

// GOOD: buffered reading
async fn good(file: File) {
    let reader = BufReader::new(file);
    let mut lines = reader.lines();
    while let Some(line) = lines.next_line().await.unwrap() {
        process(&line);
    }
}

Release Build Optimization

Cargo.toml Settings

[profile.release]
lto = true           # Link-time optimization
codegen-units = 1    # Single codegen unit (slower compile, faster code)
panic = "abort"      # Smaller binary, no unwinding
strip = true         # Strip symbols

[profile.release-fast]
inherits = "release"
opt-level = 3        # Maximum optimization (note: 3 is already the release default)

[profile.release-small]
inherits = "release"
opt-level = "s"      # Optimize for size

Compile-Time Assertions

// Zero runtime cost
const _: () = assert!(std::mem::size_of::<MyStruct>() <= 64);

Checklist

Before optimizing:

  • Profile to find actual bottlenecks
  • Have benchmarks to measure improvement
  • Consider if optimization is worth complexity

Common wins:

  • Reduce allocations (Cow, reuse buffers)
  • Use appropriate collections
  • Pre-allocate with_capacity
  • Use iterators instead of indexing
  • Enable LTO for release builds
  • Use rayon for parallel workloads