Update drafts/2026-01-24-profiling-rust-written-network-program.md

2026-01-24 15:01:56 +08:00
6 changed files with 2 additions and 2003 deletions
--- a/card_game_dsl_example.rs
+++ b/card_game_dsl_example.rs
@@ -1,506 +0,0 @@
 // stack data structure
 /// Pending triggers together with the priority bookkeeping used while they resolve.
 #[derive(Component, Clone, Debug)]
 pub struct TriggerStack {
    /// Pending triggers; the last element is the top of the stack and is popped first (LIFO)
    pub stack: Vec<TriggerObject>,
    /// Current player with priority
    pub priority_player: PlayerId,
    /// Players who have passed priority since this set was last cleared
    pub passed_priority: HashSet<PlayerId>,
    /// State snapshot for rollback (AI simulation, undo)
    pub snapshot_history: Vec<GameStateSnapshot>,
 }
 /// One trigger instance as stored on a `TriggerStack`.
 #[derive(Clone, Debug)]
 pub struct TriggerObject {
    /// Identifier of this trigger instance
    pub id: TriggerId,
    /// Entity recorded as the source of the trigger
    pub source_entity: Entity,
    /// Event kind this trigger reacts to
    pub trigger_type: TriggerType,
    /// Player controlling the trigger; used to group simultaneous triggers
    pub controller: PlayerId,
    /// Category of the trigger (state-based, replacement, triggered, ...)
    pub priority: TriggerPriority,
    /// Payload for the trigger (type declared elsewhere)
    pub parameters: TriggerParams,
    /// Lifecycle state of the trigger
    pub state: TriggerState, // Pending, Resolving, Resolved, Counteracted
 }
 /// Game events a trigger can listen for.
 ///
 /// Each variant's comment quotes the card text it models and the game that
 /// inspired it.
 #[derive(Clone, Debug, PartialEq)]
 pub enum TriggerType {
    /// "When you play a card" - Slay the Spire
    OnCardPlayed(CardId),
    /// "When a creature enters battlefield" - MTG
    OnEntersBattlefield(EntityType),
    /// "When damage is dealt" - Hearthstone
    OnDamageDealt { source: Entity, amount: u32 },
    /// "At start of turn" - All games
    OnTurnStart(PlayerId),
    /// "When health drops below X" - Conditional
    OnHealthThreshold { entity: Entity, threshold: u32 },
    /// Custom game-specific triggers, identified by a free-form string
    Custom(String),
 }
 /// Category of a trigger, ordered from `StateBased` (lowest) to `Spell` (highest).
 ///
 /// The explicit discriminants define the ordering. `Eq`, `Ord` and `Hash` are
 /// derived alongside `PartialOrd` so values can be sorted, compared with
 /// `max`/`min`, and used as map or set keys.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub enum TriggerPriority {
    /// State-based actions (checked before priority)
    StateBased = 0,
    /// Replacement effects (modify what happens)
    Replacement = 1,
    /// Triggered abilities (go on stack)
    Triggered = 2,
    /// Activated abilities (player chooses)
    Activated = 3,
    /// Spells (highest level)
    Spell = 4,
 }
 // Stack Resolution System, APNAP Order
 /// Drives the trigger stack until it needs player input or the phase ends.
 ///
 /// Each iteration:
 /// 1. runs state-based actions (these never use the stack);
 /// 2. if every player has passed priority, either resolves the top trigger
 ///    (non-empty stack) or advances the phase and returns (empty stack);
 /// 3. otherwise lets the AI act, or returns to wait for player input.
 ///
 /// The number of players is taken from `game_state.player_order`, the only
 /// player list `GameState` declares.
 pub fn resolve_trigger_stack(
    stack: &mut TriggerStack,
    game_state: &mut GameState,
 ) {
    loop {
        // Check state-based actions first (don't use stack)
        check_state_based_actions(game_state);
        let everyone_passed = stack.passed_priority.len() == game_state.player_order.len();
        if everyone_passed {
            // Start the next priority round from a clean slate. Without this,
            // a later call after the phase change would still see everyone as
            // passed and advance the phase again immediately.
            stack.passed_priority.clear();
            match stack.stack.pop() {
                Some(trigger) => {
                    stack.priority_player = trigger.controller;
                    // Execute trigger, may add new triggers to stack
                    execute_trigger(trigger, stack, game_state);
                    continue;
                }
                None => {
                    // Nothing left to resolve and nobody wants to act → next phase
                    advance_phase(game_state);
                    return;
                }
            }
        }
        // Current player with priority can:
        // 1. Add spell/ability to stack
        // 2. Pass priority
        // For AI/simulation, we auto-resolve
        if game_state.is_ai_game() {
            auto_resolve_priority(stack, game_state);
        } else {
            // Wait for player input (network message in multiplayer)
            return;
        }
    }
 }
 // APNAP Order for simultaneous triggers
 pub fn add_simultaneous_triggers(
    stack: &mut TriggerStack,
    triggers: Vec<TriggerObject>,
    active_player: PlayerId,
    player_order: &[PlayerId],
 ) {
    // Group by controller
    let mut by_controller: HashMap<PlayerId, Vec<TriggerObject>> = HashMap::new();
    for trigger in triggers {
        by_controller.entry(trigger.controller).or_default().push(trigger);
    }
    // APNAP: Active Player first (lowest on stack), then Non-Active in turn order
    // Last added resolves first (LIFO)
    let mut ordered = Vec::new();
    // Active player's triggers go on stack first (bottom)
    if let Some(ap_triggers) = by_controller.remove(&active_player) {
        ordered.extend(ap_triggers);
    }
    // Non-active players in turn order
    for player in player_order {
        if *player != active_player {
            if let Some(nap_triggers) = by_controller.remove(player) {
                ordered.extend(nap_triggers);
            }
        }
    }
    // Add to stack (last in = first to resolve)
    stack.stack.extend(ordered);
 }
 // mbf
 /// Modifiers attached to one entity, with a per-`ModifierType` value cache.
 #[derive(Component, Clone, Debug)]
 pub struct ModifierStack {
    /// All active modifiers on this entity
    pub modifiers: Vec<Modifier>,
    /// Cached computed values (for performance), keyed by modifier type
    pub cached_values: HashMap<ModifierType, i32>,
    /// Dirty flag for recalculation; set to `true` when modifiers are added
    pub dirty: bool,
 }
 /// A single stat modification applied to an entity.
 #[derive(Clone, Debug)]
 pub struct Modifier {
    /// Identifier of this modifier instance
    pub id: ModifierId,
    /// Entity recorded as the source of the modifier
    pub source_entity: Entity,
    /// Which stat this modifier affects
    pub modifier_type: ModifierType,
    /// Operand; how it is combined depends on `layer` (set / multiply / add)
    pub value: i32,
    pub priority: u32,      // Higher priority applies first
    pub layer: ModifierLayer, // For same-type modifiers
    pub timestamp: u64,     // For timestamp order within layer
    pub duration: Option<u32>, // Turns/rounds remaining
    pub is_cumulative: bool, // Stack with same modifiers?
 }
 /// Stat or quantity a `Modifier` can change.
 ///
 /// Derives `Eq` and `Hash` so it can key `ModifierStack::cached_values`.
 #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
 pub enum ModifierType {
    Attack,
    Health,
    MaxHealth,
    Cost,
    Damage,
    Healing,
    DrawCount,
    CardTargetCount,
    // Game-specific
    SpellPower,        // Hearthstone
    Power,             // MTG
    Strength,          // Yu-Gi-Oh
    EnergyCost,        // Slay the Spire
 }
 /// Layer a modifier belongs to.
 ///
 /// `Ord` is derived, so layers compare by their explicit discriminants
 /// (`Base` lowest, `Additive` highest).
 #[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
 pub enum ModifierLayer {
    /// Base value (card's printed stats)
    Base = 0,
    /// Characteristic-defining abilities (MTG)
    CharacteristicDefining = 1,
    /// Control-changing effects
    Control = 2,
    /// All other effects (timestamp order)
    General = 3,
    /// Setting to specific value (overwrites all)
    Set = 4,
    /// Multiplying/dividing
    Multiply = 5,
    /// Additive (most common)
    Additive = 6,
 }
 // mbf resolution system
 /// Computes the effective value of `modifier_type` after applying the
 /// matching modifiers in `modifiers` to `base_value`.
 ///
 /// * A cached value is returned only when the stack is clean **and** an entry
 ///   for `modifier_type` exists; a missing entry is computed rather than
 ///   treated as "no modifiers".
 /// * Modifiers are applied by priority (higher first, as documented on
 ///   `Modifier::priority`), then layer, then timestamp.
 /// * `Set` replaces the running value, `Multiply` multiplies it, every other
 ///   layer adds. Arithmetic saturates instead of overflowing.
 ///
 /// The stack is borrowed immutably, so this function does not write
 /// `cached_values` or clear `dirty`; whoever holds `&mut ModifierStack` is
 /// responsible for refreshing the cache. `entity` and `game_state` are
 /// accepted for interface stability and are not read.
 pub fn compute_modified_value(
    entity: Entity,
    modifier_type: ModifierType,
    base_value: i32,
    modifiers: &ModifierStack,
    game_state: &GameState,
 ) -> i32 {
    if !modifiers.dirty {
        if let Some(cached) = modifiers.cached_values.get(&modifier_type) {
            return *cached;
        }
    }
    // Get all modifiers of this type, sorted by priority → layer → timestamp
    let mut applicable: Vec<&Modifier> = modifiers
        .modifiers
        .iter()
        .filter(|m| m.modifier_type == modifier_type)
        .collect();
    applicable.sort_by(|a, b| {
        // Reversed operands: higher priority sorts (and therefore applies) first.
        b.priority
            .cmp(&a.priority)
            .then(a.layer.cmp(&b.layer))
            .then(a.timestamp.cmp(&b.timestamp))
    });
    // Apply in order
    applicable
        .iter()
        .fold(base_value, |value, modifier| match modifier.layer {
            ModifierLayer::Set => modifier.value,
            ModifierLayer::Multiply => value.saturating_mul(modifier.value),
            // Every remaining layer is treated as additive.
            ModifierLayer::Additive
            | ModifierLayer::Base
            | ModifierLayer::CharacteristicDefining
            | ModifierLayer::Control
            | ModifierLayer::General => value.saturating_add(modifier.value),
        })
 }
 // Example: "Give all minions +2/+2" (Hearthstone)
 /// Adds an attack modifier and a health modifier to every entity matched by
 /// `target_query`, then marks that entity's modifier stack dirty.
 ///
 /// All modifiers created by one call share a timestamp, use priority `100`,
 /// the `Additive` layer, the given `duration`, and are cumulative. For each
 /// entity the attack modifier is pushed before the health modifier.
 pub fn apply_buff(
    commands: &mut Commands,
    target_query: &Query<Entity, With<MinionComponent>>,
    source: Entity,
    attack_buff: i32,
    health_buff: i32,
    duration: Option<u32>,
 ) {
    let timestamp = get_game_timestamp();
    // Attack first, health second.
    let buffs = [
        (ModifierType::Attack, attack_buff),
        (ModifierType::Health, health_buff),
    ];
    for entity in target_query.iter() {
        commands.entity(entity).with_modifiers(|mods: &mut ModifierStack| {
            for &(modifier_type, value) in &buffs {
                mods.modifiers.push(Modifier {
                    id: generate_modifier_id(),
                    source_entity: source,
                    modifier_type,
                    value,
                    priority: 100,
                    layer: ModifierLayer::Additive,
                    timestamp,
                    duration,
                    is_cumulative: true,
                });
            }
            mods.dirty = true;
        });
    }
 }
 // ecs core components
 /// Identity, cost and location data for a card entity.
 #[derive(Component, Clone, Debug)]
 pub struct CardComponent {
    /// Identifier of the card definition
    pub card_id: CardId,
    /// Kind of card (type declared elsewhere)
    pub card_type: CardType,
    /// Cost of the card
    pub cost: u32,
    /// Player recorded as owner; tracked separately from `controller`
    pub owner: PlayerId,
    /// Player recorded as current controller
    pub controller: PlayerId,
    pub zone: CardZone, // Hand, Deck, Battlefield, Graveyard, Exile
 }
 /// Combat statistics for an entity: printed base values, current values and
 /// accumulated damage.
 #[derive(Component, Clone, Debug)]
 pub struct StatsComponent {
    /// Attack before modifiers
    pub base_attack: i32,
    /// Health before modifiers
    pub base_health: i32,
    pub current_attack: i32,  // Computed from MBF
    pub current_health: i32,  // Computed from MBF
    /// Damage accumulated on this entity
    pub damage_taken: i32,
 }
 /// Trigger definitions attached to an entity.
 #[derive(Component, Clone, Debug)]
 pub struct TriggerComponent {
    /// All trigger definitions carried by this entity
    pub triggers: Vec<TriggerDefinition>,
 }
 /// Declarative description of a trigger: the event it listens for, an
 /// optional condition, and the effects it produces.
 #[derive(Clone, Debug)]
 pub struct TriggerDefinition {
    /// Event that fires this trigger
    pub event_type: TriggerType,
    /// Extra condition to satisfy; `None` means unconditional
    pub condition: Option<TriggerCondition>,
    /// Effects listed for this trigger
    pub effects: Vec<EffectDefinition>,
    /// `false` marks an optional ("you can") trigger
    pub is_mandatory: bool,
    /// Whether the trigger is limited to once per turn
    pub once_per_turn: bool,
 }
 // ============ PLAYER COMPONENTS ============
 /// Per-player state: card zones, health and resource pool.
 #[derive(Component, Clone, Debug)]
 pub struct PlayerComponent {
    /// Identifier of this player
    pub player_id: PlayerId,
    /// Card entities in hand
    pub hand: Vec<Entity>,
    /// Card entities in the deck
    pub deck: Vec<Entity>,
    /// Card entities in the graveyard
    pub graveyard: Vec<Entity>,
    /// Card entities in exile
    pub exile: Vec<Entity>,
    /// Current health
    pub health: i32,
    /// Maximum health
    pub max_health: i32,
    pub resource: u32,      // Mana/Energy
    /// Upper bound for `resource`
    pub max_resource: u32,
 }
 // ============ GAME STATE ============
 /// Global game state stored as an ECS resource.
 #[derive(Resource, Clone, Debug)]
 pub struct GameState {
    /// Player whose turn it currently is
    pub current_player: PlayerId,
    /// Current phase of the turn
    pub phase: GamePhase,
    /// Turn counter
    pub turn_number: u32,
    /// Players in turn order
    pub player_order: Vec<PlayerId>,
    /// Trigger stack owned by the game state
    pub stack: TriggerStack,
    /// Whether this is a multiplayer game
    pub is_multiplayer: bool,
    pub authority: AuthorityType, // Server/Client
 }
 // dsl atomic commands
 /// Atomic, serializable commands that card effects are composed from.
 ///
 /// `IfCondition` and `ForEach` nest further commands, so an effect forms a tree.
 // NOTE(review): this enum derives `Serialize`/`Deserialize`, but `Modifier` and
 // `TriggerDefinition` (used in `ApplyModifier` / `AddTrigger`) only derive
 // `Clone, Debug` in the visible part of this file. Confirm they implement the
 // serde traits elsewhere.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum EffectCommand {
    // Damage/Healing
    DealDamage { amount: u32, target: TargetSelector },
    Heal { amount: u32, target: TargetSelector },
    // Card Operations
    DrawCards { count: u32, player: PlayerId },
    DiscardCards { count: u32, player: PlayerId },
    AddCardToHand { card_id: CardId, player: PlayerId },
    // Resource
    GainResource { amount: u32, player: PlayerId },
    SpendResource { amount: u32, player: PlayerId },
    // Modifiers
    ApplyModifier { modifier: Modifier, target: TargetSelector },
    RemoveModifier { modifier_id: ModifierId, target: TargetSelector },
    // Movement
    MoveCard { entity: Entity, from: CardZone, to: CardZone },
    SummonMinion { card_id: CardId, player: PlayerId, position: BoardPosition },
    // Trigger Management
    AddTrigger { trigger: TriggerDefinition, target: Entity },
    RemoveTrigger { trigger_id: TriggerId, target: Entity },
    // Game State
    SetPhase { phase: GamePhase },
    EndTurn,
    WinGame { player: PlayerId },
    LoseGame { player: PlayerId },
    // Conditional: run `then_effects` or, if present, `else_effects`
    IfCondition {
        condition: EffectCondition,
        then_effects: Vec<EffectCommand>,
        else_effects: Option<Vec<EffectCommand>>,
    },
    // Repeat: `effects` paired with a `targets` selector
    ForEach {
        targets: TargetSelector,
        effects: Vec<EffectCommand>,
    },
 }
 /// Describes which entity or entities an effect applies to.
 // NOTE(review): `Self` is a reserved keyword in Rust and cannot be used as a
 // variant name (raw identifiers do not help: `r#Self` is rejected too). The
 // variant needs a different name, changed together with every use site.
 #[derive(Clone, Debug, Serialize, Deserialize)]
 pub enum TargetSelector {
    Self,
    RandomEnemy,
    AllEnemies,
    AllAllies,
    SpecificEntity(Entity),
    LowestHealthEnemy,
    HighestAttackEnemy,
    // MTG-style
    TargetCreature,
    TargetPlayer,
    TargetArtifact,
    // Yu-Gi-Oh-style
    TargetMonster,
    TargetSpellTrap,
 }
 // NOTE(review): the items below are bare `TriggerDefinition { .. }` expressions
 // that are not bound to a `let`, `const` or function body, so they read as
 // illustrative DSL samples rather than compilable items. Several names they
 // use (for example `TriggerSource`, `Owner`, `Zone`, `Phase`) are not declared
 // in the visible part of this file. TODO confirm before reusing them verbatim.
 /// Examples
 /// Slay the Spire
 // "When you play a card, draw 1 card"
 TriggerDefinition {
    event_type: TriggerType::OnCardPlayed(CardId::Any),
    condition: Some(TriggerCondition::PlayerControlled),
    effects: vec![EffectCommand::DrawCards { count: 1, player: TriggerSource }],
    is_mandatory: true,
    once_per_turn: false,
 }
 // "When you take damage, gain 1 block"
 TriggerDefinition {
    event_type: TriggerType::OnDamageTaken { min_amount: 1 },
    condition: None,
    effects: vec![EffectCommand::ApplyModifier {
        modifier: Modifier::new_block(1),
        target: TargetSelector::Self,
    }],
    is_mandatory: true,
    once_per_turn: false,
 }
 /// Magic: The Gathering
 // "Whenever a creature enters the battlefield, draw a card"
 TriggerDefinition {
    event_type: TriggerType::OnEntersBattlefield(EntityType::Creature),
    condition: Some(TriggerCondition::ControllerIsSelf),
    effects: vec![EffectCommand::DrawCards { count: 1, player: TriggerSource }],
    is_mandatory: true,
    once_per_turn: false,
 }
 // "At the beginning of your upkeep, sacrifice a creature"
 TriggerDefinition {
    event_type: TriggerType::OnTurnStart(TriggerSource),
    condition: Some(TriggerCondition::Phase(Phase::Upkeep)),
    effects: vec![EffectCommand::SelectAndSacrifice {
        selector: TargetSelector::TargetCreature,
        count: 1,
    }],
    is_mandatory: true,
    once_per_turn: true,
 }
 /// Hearthstone
 // "Whenever you summon a minion, deal 1 damage to all enemies"
 TriggerDefinition {
    event_type: TriggerType::OnSummon(EntityType::Minion),
    condition: Some(TriggerCondition::ControllerIsSelf),
    effects: vec![EffectCommand::DealDamage {
        amount: 1,
        target: TargetSelector::AllEnemies,
    }],
    is_mandatory: true,
    once_per_turn: false,
 }
 // "Deathrattle: Draw a card"
 TriggerDefinition {
    event_type: TriggerType::OnDeath,
    condition: None,
    effects: vec![EffectCommand::DrawCards { count: 1, player: Owner }],
    is_mandatory: true,
    once_per_turn: false,
 }
 /// Yu-Gi-Oh!
 // "When this card is Normal Summoned: You can add 1 Spell from deck to hand"
 TriggerDefinition {
    event_type: TriggerType::OnNormalSummon,
    condition: Some(TriggerCondition::ThisCard),
    effects: vec![EffectCommand::SearchDeck {
        card_type: CardType::Spell,
        count: 1,
        to: Zone::Hand,
    }],
    is_mandatory: false, // Optional trigger
    once_per_turn: true,
 }
 // "When a monster declares an attack: Negate the attack"
 TriggerDefinition {
    event_type: TriggerType::OnAttackDeclared,
    condition: Some(TriggerCondition::FacingThisCard),
    effects: vec![EffectCommand::NegateAttack],
    is_mandatory: true,
    once_per_turn: false,
 }
--- a/designs/grel-rs/TECHNICAL_DESIGN.md
+++ b/designs/grel-rs/TECHNICAL_DESIGN.md
@@ -1,339 +0,0 @@
 # 📘 `grel-rs` Technical Design Document
 **Binary:** `grel` | **Repository:** `grel-rs`  
 **Target Platforms:** Linux, Windows (macOS optional)  
 **Core Philosophy:** Pure CLI, deterministic asset resolution, explicit over implicit, robust upgrade handling, transparent user control.
 ---
 ## 1. Overview & Goals
 `grel` is a terminal-native, high-performance release downloader and package manager for Git forges. It abstracts provider APIs into a unified pipeline, supports proxies, caches DNS/IPs for CDN routing, downloads in parallel, and delivers a transparent, scriptable, pacman-compatible UX.
 **Key Requirements Met:**
 - ✅ Pure CLI (no TUI/GUI), standard `std::io` prompts & warnings
 - ✅ Deterministic resolution: **strict filters → priority sorting → explicit policy fallback** (zero scoring)
 - ✅ `exclude_keywords` config to block installer/setup/bundle artifacts
 - ✅ Warning system for unmanaged or extra-step packages (applies to `ignore_formats` overrides & keyword matches)
 - ✅ Configurable `download_dir` for unmanaged packages (defaults to OS `Downloads/`)
 - ✅ `default_selection_policy` (`first` | `largest`) as explicit fallback, overridden by detailed flags
 - ✅ `-Syu` resilience: filename rename detection, orphan tracking, explicit migration
 - ✅ Cross-platform PATH integration, XDG-compliant, no `sudo`
 ---
 ## 2. Workspace Architecture
 ```
 grel-rs/
 ├── Cargo.toml                  # Workspace root (resolver = "2")
 ├── crates/
 │   ├── grel-cli/               # CLI parsing, pure-text prompts, progress routing
 │   ├── grel-core/              # Resolution, tokenization, filter/sort pipeline, upgrade state
 │   ├── grel-providers/         # Forge trait, registry, API implementations (modules)
 │   ├── grel-network/           # HTTP client, proxy routing, IP cache resolver, parallel downloader
 │   ├── grel-cache/             # SQLite state, IP cache, artifact storage, TTL eviction
 │   └── grel-config/            # Layered config, migrations, asset priority matrices
 ├── tests/                      # Integration, mock servers, e2e fixtures
 └── scripts/                    # CI, release, benchmark helpers
 ```
 | Crate | Responsibility |
 |-------|----------------|
 | `grel-cli` | Subcommand routing, pure CLI prompt loops, `indicatif` progress, `tabled` output |
 | `grel-core` | `PackageRef` parsing, `AssetTokens` extraction, deterministic filter/sort, upgrade planning |
 | `grel-providers` | `ReleaseProvider` trait, GitHub/GitLab/Gitea/Codeberg modules, self-hosted auto-detection |
 | `grel-network` | Proxy chaining, DNS/IP cache resolver, `JoinSet` parallel downloads, streaming extraction |
 | `grel-cache` | `state.sqlite` management, ETag/IP cache, LRU artifact eviction, atomic install temp dirs |
 | `grel-config` | TOML loading, env/CLI overrides, schema validation, config migrations |
 ---
 ## 3. CLI Specification & Warning Flow
 All interaction uses standard terminal I/O. Unmanaged packages trigger explicit warnings & confirmation.
 ### Warning & Confirmation Flow
 When an asset matches `exclude_keywords` or falls under an overridden `ignore_formats`:
 ```
 ⚠️  Asset "foo-setup-1.0.0.exe" matches excluded keyword "setup".
 ℹ️  grel cannot manage installers directly. Package will download to ~/Downloads/.
 :: Proceed with download? [y/N]: _
 ```
 - `y` → Downloads to `download_dir`, marks `is_managed = false` in DB
 - `N`/`n`/`Enter`/`Esc` → Aborts sync for this package, continues with others (`N` is the default, as the capital letter in `[y/N]` indicates)
 - `--noconfirm` / `-y` → Auto-accepts, prints `ℹ️ Non-interactive: accepted unmanaged asset` to `stderr`
 - **Never blocks CI/pipes:** `!std::io::stdin().is_terminal()` → auto-accepts, logs to `stderr`
 ---
 ## 4. Asset Resolution Pipeline (Deterministic)
 **No scoring. No fuzzy logic.** Strict filtering → transparent priority → policy fallback.
 ### Precedence Rules (Strict → Override → Fallback)
 | Priority | Mechanism | Override Capability |
 |----------|-----------|---------------------|
 | 1️⃣ | OS/Arch exact match or `Unknown` | None |
 | 2️⃣ | `exclude_keywords` filter | None (hard block unless CLI `--allow-keyword`) |
 | 3️⃣ | `ignore_formats` filter | Overridable via CLI/config |
 | 4️⃣ | `arch_priority` index | Overrides `default_selection_policy` |
 | 5️⃣ | `prefer_formats` index | Overrides `default_selection_policy` |
 | 6️⃣ | `default_selection_policy` | **Only applies to remaining ties** |
 ### Step 1: Strict Filtering
 ```rust
 // Strict filter chain: every predicate must hold for an asset to survive.
 assets.iter()
    .map(|a| AssetTokens::from_filename(&a.filename))
    // `Os::Unknown` carries a `String` payload, so match on the variant
    // instead of comparing against a payload-less `Os::Unknown`.
    .filter(|t| t.os == target.os || matches!(t.os, Os::Unknown(_)))
    .filter(|t| !keyword_excluded(t, &config.exclude_keywords))
    .filter(|t| arch_matches_priority(t, &config.arch_priority, config.fallback_to_32bit))
    .filter(|t| !format_ignored(t, &config.ignore_formats))
    .collect()
 ```
 ### Step 2: Deterministic Sorting
 Sorted by explicit cascade:
 1. `arch_priority` index
 2. `prefer_formats` index
 3. Lexicographic filename
 4. Size descending
 ### Step 3: Policy Fallback (`default_selection_policy`)
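 Only applied if `>1` asset survives filtering and the candidates are still tied on `arch_priority` and `prefer_formats`; the filename and size keys in Step 2 only keep the displayed order deterministic and do not count as tie-breakers here.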
 - `first` → picks top of sorted list
 - `largest` → picks by `size_bytes` descending
 - **Never overrides** arch/format priority or keyword filters.
 ### Step 4: Selection & Warning
 - `0` → `❌ No compatible assets`
 - `1` → Check if unmanaged → warn/confirm → auto-select or prompt
 - `>1` → Pure CLI numbered prompt → warn/confirm if unmanaged
 ---
 ## 5. Platform Enums & Alias Mapping
 Strongly-typed, exhaustive, infallible parsing.
 ```rust
 /// Operating system token recognised in an asset name.
 /// `Unknown` carries a `String`, presumably the unrecognised token.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 #[non_exhaustive]
 pub enum Os { Linux, Windows, MacOS, FreeBSD, Android, iOS, Unknown(String) }
 /// CPU architecture token recognised in an asset name.
 /// `Unknown` carries a `String`, presumably the unrecognised token.
 #[derive(Debug, Clone, PartialEq, Eq, Hash)]
 #[non_exhaustive]
 pub enum Arch { X86_64, Aarch64, I686, ArmV7, ArmV6, Riscv64, S390x, PowerPC64, Unknown(String) }
 ```
 - `FromStr` implemented with `to_lowercase()` + alias table
 - `serde` ready for TOML config
 - `Display` outputs canonical names for DB/CLI consistency
 ---
 ## 6. Configuration Structure (Updated)
 Strict separation of human-editable TOML and machine-managed SQLite.
 ```toml
 # ~/.config/grel/config.toml
 [general]
 version = 1
 max_concurrent = 4
 proxy = "http://127.0.0.1:7890"
 [assets]
 default_selection_policy = "largest"  # first | largest (tiebreaker only)
 exclude_keywords = ["setup", "installer", "portable", "bundle", "nupkg"]
 ignore_formats = ["*.deb", "*.rpm", "*.msi", "*.dmg", "*.pkg", "*.AppImage"]
 prefer_formats = ["*.tar.gz", "*.tar.xz", "*.zip", "*.exe"]
 arch_priority = ["x86_64", "aarch64", "x86", "armv7"]
 fallback_to_32bit = true
 prefer_musl = false
 [paths]
 install_root = "~/.local/share/grel"
 bin_dir = "~/.local/share/grel/bin"
 download_dir = "~/Downloads"  # Fallback for unmanaged/extra-op packages
 [upgrade]
 check_interval_hours = 6
 max_parallel_checks = 10
 [migrations]
 "legacy/old-tool" = "new-org/old-tool"
 ```
 ---
 ## 7. State Database Schema (`state.sqlite`)
 ```sql
 CREATE TABLE installed (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    forge TEXT NOT NULL,
    owner TEXT NOT NULL,
    repo TEXT NOT NULL,
    version TEXT NOT NULL,
    asset_filename TEXT NOT NULL,
    checksum TEXT,
    install_path TEXT NOT NULL,       -- Actual path (bin/ or download_dir)
    is_managed BOOLEAN NOT NULL DEFAULT 1,
    status TEXT NOT NULL DEFAULT 'active' CHECK (status IN ('active', 'orphaned', 'migrated')),
    orphaned_at INTEGER,
    last_checked INTEGER,
    installed_at INTEGER DEFAULT (strftime('%s', 'now'))
 );
 CREATE UNIQUE INDEX idx_pkg_unique ON installed(forge, owner, repo);
 CREATE INDEX idx_status ON installed(status);
 ```
 - `is_managed`: `true` = auto-extracted/linked to `bin/`; `false` = left in `download_dir`
 - `install_path`: Stores actual destination for accurate cleanup/migration
 ---
 ## 8. Download Path & Installation Behavior
 | Asset Type | Destination | Management |
 |------------|-------------|------------|
 | Standard binary/archive (`.tar.gz`, `.zip`, `.exe`) | `bin_dir/` (Linux) / `bin\` (Win) | ✅ Managed (extracted, checksummed, linked) |
 | Unmanaged/Extra-op (`.msi`, `.deb`, matched keywords) | `download_dir` (default: `~/Downloads`) | ⚠️ Download-only, no extraction/execution |
 | CLI Override (`--output-dir ~/tmp`) | User-specified path | ✅ Respected regardless of type |
 **Cross-Platform Default Resolution:**
 ```rust
 /// Returns the user's download directory, falling back to
 /// `<temp dir>/grel-downloads` when the home directory or the download
 /// directory cannot be determined.
 fn default_download_dir() -> PathBuf {
    directories::UserDirs::new()
        // `download_dir()` yields `Option<&Path>`: flatten it and convert the
        // borrowed path into an owned `PathBuf`.
        .and_then(|dirs| dirs.download_dir().map(|dir| dir.to_path_buf()))
        .unwrap_or_else(|| std::env::temp_dir().join("grel-downloads"))
 }
 ```
 ---
 ## 9. `-Syu` Upgrade Logic & Edge Cases
 ### 9.1 Filename Change Detection
 Authors often rename assets (`x86_64` → `amd64`, `.tar.gz` → `.tar.xz`).
 1. Load stored `asset_filename` from DB
 2. Resolve new release → filter → sort → get `new_best`
 3. If `new_best.filename != stored.filename`:
   ```
   ⚠️ Remote asset renamed: old-name.tar.gz → new-name.tar.xz
   ℹ️ Proceeding with update...
   ```
 4. Download → verify → extract → **update DB record** with new filename. Proceeds safely.
 ### 9.2 Repository Rename / 404 Handling
 1. Provider fetch → `404` or unreachable
 2. Mark `status = 'orphaned'`, set `orphaned_at = now()`
 3. Skip in future `-Syu` runs
 4. Warn user:
   ```
   ⚠️ Package unreachable: foo/bar (HTTP 404)
   ℹ️ Skipped. Run `grel migrate foo/bar new/path` or `grel remove foo/bar`
   ```
 5. **No auto-migration.** Explicit user action required.
 ### 9.3 Orphaned Visibility
 ```bash
 $ grel list
 :: Installed packages (3 active, 1 orphaned)
  🟢 bar/fuzz          1.2.3  linux/x86_64  tar.gz
  🟢 foo/tool          0.9.1  windows/amd64 exe
  🟡 legacy/old-proj   2.0.0  linux/x86_64  tar.gz  [orphaned since 2024-05-01]
 ```
 ---
 ## 10. Networking, Proxy & DNS/IP Caching
 - **Proxy Priority:** `--proxy` > `GREL_PROXY` env > `http_proxy`/`all_proxy` > `config.toml`
 - **DNS/IP Cache:** 
  1. Resolve `A/AAAA` via `hickory-resolver`
  2. Parallel `TcpStream::connect` probes on `443`
  3. Store fastest IP + RTT in SQLite with `300s` TTL
  4. `reqwest::ClientBuilder::resolve(host, ip)` forces routing
 - **Background Refresh:** Idle task pre-warms hot domains, evicts stale entries
 ---
 ## 11. Rate Limiting & ETag Optimization
 GitHub: 60/hr unauth, 5000/hr auth. `grel` handles gracefully:
 1. **ETag Caching:** `If-None-Match` → `304 Not Modified` **free**. Cached in DB.
 2. **Smart Polling:** Only checks packages where `last_checked < now() - check_interval_hours`
 3. **Rate Limit Parsing:** Reads `X-RateLimit-Remaining`/`Reset`. If `< 50` → sleep/queue.
 4. **Auth Prompt:** First run suggests `GREL_GITHUB_TOKEN` for 5000/hr.
 5. **`--no-api` Fallback:** Skips remote checks, uses local timestamps only.
 ---
 ## 12. PATH Integration & Post-Install
 - Installs to `~/.local/share/grel/bin/` (Linux) / `%LOCALAPPDATA%\grel\bin\` (Windows)
 - `grel path add` prints exact shell/registry snippets:
  ```bash
  export PATH="$HOME/.local/share/grel/bin:$PATH"
  ```
  ```powershell
  # Prepend to the *user-scope* Path only; $env:Path is the merged machine+user value and would be copied into the user variable.
  [Environment]::SetEnvironmentVariable("Path", "$env:LOCALAPPDATA\grel\bin;" + [Environment]::GetEnvironmentVariable("Path", "User"), "User")
  ```
 - First run: `💡 Run: grel path add to enable global command access`
 - XDG-compliant, no `sudo`, no system conflicts.
 ---
 ## 13. Security & Reliability
 - ✅ TLS: `rustls` only
 - ✅ Checksums: `sha256` before extraction (managed packages only)
 - ✅ Archive safety: Reject `..`, absolute paths, external symlinks
 - ✅ Atomic installs: Temp dir → verify → `rename` → DB update
 - ✅ Unmanaged warnings: Explicit user consent required (unless `--noconfirm`)
 - ✅ Graceful degradation: Network errors → retry/backoff, rate limits → sleep/queue, missing assets → clear error + suggestions
 ---
 ## 14. Dependency Matrix
 ```toml
 # Core & Async
 tokio = { version = "1", features = ["full"] }
 futures = "0.3"
 async-trait = "0.1"
 # CLI & UX (Pure CLI, no TUI)
 clap = { version = "4", features = ["derive", "wrap_help", "string"] }
 indicatif = { version = "0.17", features = ["tokio"] }
 tracing-indicatif = "0.3"
 tabled = "0.16"
 owo-colors = "4"
 # Network & Proxy
 reqwest = { version = "0.12", features = ["rustls-tls", "json", "socks", "stream"] }
 hickory-resolver = "0.24"
 dashmap = "6"
 # Cache & Storage
 sqlx = { version = "0.8", features = ["runtime-tokio-rustls", "sqlite"] }
 sha2 = "0.10"
 directories = "5"
 tar = "0.4"
 zip = "2"
 zstd = "0.13"
 xz2 = "0.1"
 bzip2 = "0.5"
 sanitize-filename = "0.5"
 # Config & Utils
 figment = { version = "0.10", features = ["toml", "env"] }
 semver = "1"
 url = "2"
 chrono = { version = "0.4", features = ["serde"] }
 regex = "1"
 serde = { version = "1", features = ["derive"] }
 anyhow = "1"
 thiserror = "2"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter", "fmt"] }
 ```
 ---
 ## 15. Testing & CI Strategy
 - **Unit:** Keyword exclusion, policy fallback precedence, filter/sort determinism, config parsing
 - **Mocked Providers:** `wiremock` for 200/304/403/429/500, rate limit headers
 - **Integration:** Unmanaged package download → `download_dir` verification, `grel migrate` state changes
 - **Non-Interactive:** `echo "" | grel sync foo/bar` must auto-accept with stderr warning
 - **Cross-Platform CI:** `ubuntu-latest`, `windows-latest`
 - **Performance:** `criterion` for DNS/IP cache + parallel download throughput
 - **Fuzzing:** `cargo-fuzz` on `AssetTokens::from_filename()` + archive path validation
 ---
--- a/drafts/2026-01-24-profiling-rust-written-network-program.md
+++ b/drafts/2026-01-24-profiling-rust-written-network-program.md
@@ -192,7 +192,7 @@ move to smol::Executor + N threads (usually num_cpus)
 or run multiple block_on() workers (careful: avoid accept() duplication)
-## outcome oi
+## outcome
 ### CPU hotspot
 testing commands: 
@@ -254,280 +254,8 @@ oi_dwarf.svg
 ### syscall-cost check
 ```bash
-sudo strace -ff -C -p $(pidof oi) -o /tmp/oi.strace
+sudo strace -ff -c -p $(pidof oi) -o /tmp/oi.strace
 # run 15–30s under load, then Ctrl+C
 tail -n +1 /tmp/oi.strace.*
 ```
 ## A more realistic setup
 Traffic goes through real kernel routing and two TCP legs (client → forwarder, forwarder → backend).
 Create namespaces + veth links:
 ```bash
 sudo ip netns add ns_client
 sudo ip netns add ns_server
 sudo ip link add veth_c type veth peer name veth_c_ns
 sudo ip link set veth_c_ns netns ns_client
 sudo ip link add veth_s type veth peer name veth_s_ns
 sudo ip link set veth_s_ns netns ns_server
 sudo ip addr add 10.0.1.1/24 dev veth_c
 sudo ip link set veth_c up
 sudo ip addr add 10.0.0.1/24 dev veth_s
 sudo ip link set veth_s up
 sudo ip netns exec ns_client ip addr add 10.0.1.2/24 dev veth_c_ns
 sudo ip netns exec ns_client ip link set veth_c_ns up
 sudo ip netns exec ns_client ip link set lo up
 sudo ip netns exec ns_server ip addr add 10.0.0.2/24 dev veth_s_ns
 sudo ip netns exec ns_server ip link set veth_s_ns up
 sudo ip netns exec ns_server ip link set lo up
 sudo sysctl -w net.ipv4.ip_forward=1
 ```
 Config to force redirect path:
 ```text
 10.0.1.1 9000 10.0.0.2 9001
 ```
 Start backend server in ns_server:
 ```bash
 sudo ip netns exec ns_server iperf3 -s -p 9001
 ```
 Run client in ns_client → forwarder → backend:
 ```bash
 sudo ip netns exec ns_client iperf3 -c 10.0.1.1 -p 9000 -t 30 -P 8
 ```
 perf stat:
 ```text
 sudo perf stat -p $(pidof oi) -e   cycles,instructions,cache-misses,branches,branch-misses,context-switches,cpu-migrations   -- sleep 33
 Performance counter stats for process id '209785':
   113,810,599,893      cpu_atom/cycles/                                                        (0.11%)
   164,681,878,450      cpu_core/cycles/                                                        (99.89%)
   102,575,167,734      cpu_atom/instructions/           #    0.90  insn per cycle              (0.11%)
   237,094,207,911      cpu_core/instructions/           #    1.44  insn per cycle              (99.89%)
        33,093,338      cpu_atom/cache-misses/                                                  (0.11%)
         5,381,441      cpu_core/cache-misses/                                                  (99.89%)
    20,012,975,873      cpu_atom/branches/                                                      (0.11%)
    46,120,077,111      cpu_core/branches/                                                      (99.89%)
       211,767,555      cpu_atom/branch-misses/          #    1.06% of all branches             (0.11%)
       245,969,685      cpu_core/branch-misses/          #    0.53% of all branches             (99.89%)
             1,686      context-switches
               150      cpu-migrations
      33.004363800 seconds time elapsed
 ```
 flamegraph
 ### Add latency + small-packet tests
 netperf (request/response)
 Start netserver in backend namespace:
 ```bash
 sudo ip netns exec ns_server netserver -p 9001
 ```
 Run TCP_RR against forwarded port:
 ```bash
 sudo ip netns exec ns_client netperf -H 10.0.1.1 -p 9000 -t TCP_RR -l 30 -- -r 32,32
 ```
 ## After optimization
 Here, I replaced the 8KiB buffer used by `futures_lite::io::copy` with a custom 16KiB buffer. (To avoid a name conflict, I renamed the binary to `oiopt`.)
 ```rust
 /// Relays bytes from `r` to `w` until `r` reports EOF and returns the number of bytes forwarded.
 ///
 /// Uses a 16KiB relay buffer instead of the 8KiB one in `futures_lite::io`,
 /// so the effect of the larger buffer can be profiled.
 async fn pump(mut r: TcpStream, mut w: TcpStream) -> io::Result<u64> {
     let mut chunk = vec![0u8; 16 * 1024];
     let mut forwarded = 0u64;
     loop {
         match r.read(&mut chunk).await? {
             0 => {
                 // EOF: send FIN to the peer; a failed shutdown is deliberately ignored.
                 let _ = w.shutdown(Shutdown::Write);
                 return Ok(forwarded);
             }
             len => {
                 w.write_all(&chunk[..len]).await?;
                 forwarded += len as u64;
             }
         }
     }
 }
 // And change the function call in handle_tcp_connection 
 let client_to_server = pump(client_stream.clone(), server_stream.clone());
 let server_to_client = pump(server_stream, client_stream);
 ```
 ### outcomes
 Still with `sudo ip netns exec ns_client iperf3 -c 10.0.1.1 -p 9000 -t 30 -P 8`
 perf stat: 
 ```text
 sudo perf stat -p $(pidof oiopt) -e   cycles,instructions,cache-misses,branches,branch-misses,context-switches,cpu-migrations   -- sleep 33
 Performance counter stats for process id '883435':
   118,960,667,431      cpu_atom/cycles/                                                        (0.05%)
   131,934,369,110      cpu_core/cycles/                                                        (99.95%)
   100,530,466,140      cpu_atom/instructions/           #    0.85  insn per cycle              (0.05%)
   185,203,788,299      cpu_core/instructions/           #    1.40  insn per cycle              (99.95%)
        11,027,490      cpu_atom/cache-misses/                                                  (0.05%)
         2,123,369      cpu_core/cache-misses/                                                  (99.95%)
    19,641,945,774      cpu_atom/branches/                                                      (0.05%)
    36,245,438,057      cpu_core/branches/                                                      (99.95%)
       214,098,497      cpu_atom/branch-misses/          #    1.09% of all branches             (0.05%)
       179,848,095      cpu_core/branch-misses/          #    0.50% of all branches             (99.95%)
             2,308      context-switches
                31      cpu-migrations
      33.004555878 seconds time elapsed
 ```
 system call check:
 ```bash
 sudo timeout 30s strace -c -f -p $(pidof oiopt)
 ```
 output:
 ```text
 strace: Process 883435 attached with 4 threads
 strace: Process 883438 detached
 strace: Process 883437 detached
 strace: Process 883436 detached
 strace: Process 883435 detached
 % time     seconds  usecs/call     calls    errors syscall
 ------ ----------- ----------- --------- --------- ----------------
 57.80   14.590016      442121        33           epoll_wait
 28.84    7.279883           4   1771146           sendto
 13.33    3.363882           1   1771212        48 recvfrom
  0.02    0.003843          61        62        44 futex
  0.01    0.001947          12       159           epoll_ctl
  0.00    0.000894          99         9         9 connect
  0.00    0.000620          34        18         9 accept4
  0.00    0.000503          14        34           timerfd_settime
  0.00    0.000446          13        33        33 read
  0.00    0.000271          15        18           ioctl
  0.00    0.000189          21         9           write
  0.00    0.000176          19         9           socket
  0.00    0.000099          11         9           getsockopt
  0.00    0.000079           4        18           shutdown
  0.00    0.000049           2        18           close
 ------ ----------- ----------- --------- --------- ----------------
 100.00   25.242897           7   3542787       143 total
 ```
 ## Further tests to explain why the gain is this large
 I changed the 16KiB buffer to 64KiB and named the binary `oiopt64`.
 iperf3 throughput under `-P 8`: the highest was 54.1 Gbits/sec, and the other threads were also much higher than before (16KiB buffer).
 perf stat: 
 ```text
 sudo perf stat -p $(pidof oiopt64) -e   cycles,instructions,cache-misses,branches,branch-misses,context-switches,cpu-migrations   -- sleep 33
 Performance counter stats for process id '893123':
   120,859,810,675      cpu_atom/cycles/                                                        (0.15%)
   134,735,934,329      cpu_core/cycles/                                                        (99.85%)
    79,946,979,880      cpu_atom/instructions/           #    0.66  insn per cycle              (0.15%)
   127,036,644,759      cpu_core/instructions/           #    0.94  insn per cycle              (99.85%)
        24,713,474      cpu_atom/cache-misses/                                                  (0.15%)
         9,604,449      cpu_core/cache-misses/                                                  (99.85%)
    15,584,074,530      cpu_atom/branches/                                                      (0.15%)
    24,796,180,117      cpu_core/branches/                                                      (99.85%)
       175,778,825      cpu_atom/branch-misses/          #    1.13% of all branches             (0.15%)
       135,067,353      cpu_core/branch-misses/          #    0.54% of all branches             (99.85%)
             1,519      context-switches
                50      cpu-migrations
      33.006529572 seconds time elapsed
 ```
 system call check:
 ```bash
 sudo timeout 30s strace -c -f -p $(pidof oiopt64)
 ```
 output:
 ```text
 strace: Process 893123 attached with 4 threads
 strace: Process 893126 detached
 strace: Process 893125 detached
 strace: Process 893124 detached
 strace: Process 893123 detached
 % time     seconds  usecs/call     calls    errors syscall
 ------ ----------- ----------- --------- --------- ----------------
 54.56   18.079500      463576        39           epoll_wait
 27.91    9.249443           7   1294854         2 sendto
 17.49    5.796927           4   1294919        51 recvfrom
  0.01    0.003778          50        75        49 futex
  0.01    0.002188          12       175           epoll_ctl
  0.00    0.000747          83         9         9 connect
  0.00    0.000714          17        40           timerfd_settime
  0.00    0.000510          13        39        38 read
  0.00    0.000452          25        18         9 accept4
  0.00    0.000310          17        18           ioctl
  0.00    0.000232          23        10           write
  0.00    0.000200          22         9           socket
  0.00    0.000183          20         9           getsockopt
  0.00    0.000100           5        18           shutdown
  0.00    0.000053           2        18           close
  0.00    0.000020          20         1           mprotect
  0.00    0.000015          15         1           sched_yield
  0.00    0.000005           5         1           madvise
 ------ ----------- ----------- --------- --------- ----------------
 100.00   33.135377          12   2590253       158 total
 ```
 ### Cleanup:
 ```bash
 sudo ip netns del ns_client
 sudo ip netns del ns_server
 sudo ip link del veth_c
 sudo ip link del veth_s
 ```
--- a/drafts/2026-01-25-ace-profiling-attorney-the-case-of-the-missing-gbits.md
+++ b/drafts/2026-01-25-ace-profiling-attorney-the-case-of-the-missing-gbits.md
@@ -1,475 +0,0 @@
 ---
 title: Ace Profiling Attorney - The Case of the Missing Gbits
 categories: [Programming, Profiling]
 tags: [Rust, kernel, networking]
 ---
 > **Disclaimer:** This is not a language-war post. No “X vs Y”.  
 > This is a profiling detective story about my Rust TCP forwarder [`oi`](https://github.com/DaZuo0122/oxidinetd).
 ---
 ## 0) Prologue — The Courthouse Lobby
 > **Me:** I wrote a Rust TCP port forwarder. It works. It forwards.  
 > 
 > **Inner Prosecutor (Phoenix voice):** *Hold it!* “Works” is not a metric. How fast?  
 > 
 > **Me:** Not fast enough under load.  
 > 
 > **Inner Prosecutor:** *Objection!* “Not fast enough” is an emotion. Bring evidence.  
 > 
 > **Me:** Fine. I’ll bring **perf**, **strace**, and a **flamegraph**.  
 > 
 > **Inner Prosecutor:** Good. This court accepts only facts.
 ## 1) The Crime Scene — Setup & Reproduction
 **Me:** Single machine, Debian 13. No WAN noise, no tunnel bottlenecks.  
 **Inner Prosecutor:** *Hold it!* If it’s “single machine”, how do you avoid loopback cheating?  
 **Me:** Network namespaces + veth. Local, repeatable, closer to real networking.
 ### Environment
 - Debian 13
 - Kernel: `6.12.48+deb13-amd64`
 - Runtime: `smol`
 - Test topology: `ns_client → oi (root ns) → ns_server` via veth
 ### Reproduction commands
 **Exhibit A: Start backend server in `ns_server`**
 ```bash
 sudo ip netns exec ns_server iperf3 -s -p 9001
 ```
 **Exhibit B: Run client in `ns_client` through forwarder**
 ```bash
 sudo ip netns exec ns_client iperf3 -c 10.0.1.1 -p 9000 -t 30 -P 8
 ```
 **Inner Prosecutor:** *Hold it!* Why `-P 8`?  
 **Me:** Because a forwarder can look fine in `-P 1` and fall apart when syscall pressure scales.  
 **Inner Prosecutor:** …Acceptable.  
 ---
 ## 2) The Suspects — What Could Be Limiting Throughput?
 **Me:** Four suspects.
 1. **CPU bound** (pure compute wall)
 2. **Kernel TCP stack bound** (send/recv path, skb, softirq, netfilter/conntrack)
 3. **Syscall-rate wall** (too many `sendto/recvfrom` per byte)
 4. **Runtime scheduling / contention** (wake storms, locks, futex)
 **Inner Prosecutor:** *Objection!* That’s too broad. Narrow it down.  
 **Me:** That’s what the tools are for.
 ---
 ## 3) Evidence #1 — `perf stat` (The Macro View)
 **Me:** First I ask: are we burning CPU, thrashing schedulers, or stalling on memory?
 **Command:**
 ```bash
 sudo perf stat -p $(pidof oi) -e \
  cycles,instructions,cache-misses,branches,branch-misses,context-switches,cpu-migrations \
  -- sleep 33
 ```
 **What I’m looking for:**
 * Huge `context-switches` → runtime thrash / lock contention
 * Huge `cpu-migrations` → unstable scheduling
 * Very low IPC + huge cache misses → memory stalls
 * Otherwise: likely syscall/kernel path
 Output: 
 ```text
 Performance counter stats for process id '209785':
   113,810,599,893      cpu_atom/cycles/                                                        (0.11%)
   164,681,878,450      cpu_core/cycles/                                                        (99.89%)
   102,575,167,734      cpu_atom/instructions/           #    0.90  insn per cycle              (0.11%)
   237,094,207,911      cpu_core/instructions/           #    1.44  insn per cycle              (99.89%)
        33,093,338      cpu_atom/cache-misses/                                                  (0.11%)
         5,381,441      cpu_core/cache-misses/                                                  (99.89%)
    20,012,975,873      cpu_atom/branches/                                                      (0.11%)
    46,120,077,111      cpu_core/branches/                                                      (99.89%)
       211,767,555      cpu_atom/branch-misses/          #    1.06% of all branches             (0.11%)
       245,969,685      cpu_core/branch-misses/          #    0.53% of all branches             (99.89%)
             1,686      context-switches
               150      cpu-migrations
      33.004363800 seconds time elapsed
 ```
 **Low context switching**:
   - context-switches: 1,686 over ~33s → ~51 switches/sec
   - cpu-migrations: 150 over ~33s → ~4.5/s → very stable CPU placement
 **CPU is working hard**:
   - 237,094,207,911 cpu_core instructions
   - IPC: 1.44 (instructions per cycle) → not lock-bound or stalling badly
 **Clean cache, branch metrics**:
   - cache-misses: ~5.4M on cpu_core (tiny compared to the instruction count)
   - branch-misses: 0.53% on cpu_core
 **Inner Prosecutor:** *Hold it!* You didn’t show the numbers.  
 **Me:** Patience. The next exhibit makes the culprit confess.  
 ---
 ## 4) Evidence #2 — `strace -c` (The Confession: Syscall Composition)
 **Me:** Next: “What syscalls are we paying for?”
 **Command:**
 ```bash
 sudo timeout 30s strace -c -f -p $(pidof oi)
 ```
 **What I expect if this is a forwarding wall:**
 * `sendto` and `recvfrom` dominate calls
 * call counts in the millions
 Output (simplified):
 ```text
 sendto   2,190,751 calls   4.146799s  (57.6%)
 recvfrom 2,190,763 calls   3.052340s  (42.4%)
 total syscall time:        7.200789s
 ```
 (A) **100% syscall/copy dominated:**
 - Almost all traced time is inside:
 	- sendto() (TCP send)
 	- recvfrom() (TCP recv)
 (B) **syscall rate is massive**
 - Total send+recv calls:
 	- ~4,381,500 syscalls in ~32s
 	- → ~68k `sendto` per sec + ~68k `recvfrom` per sec
 	- → ~137k syscalls/sec total
 **Inner Prosecutor:** *Objection!* Syscalls alone don’t prove the bottleneck.  
 **Me:** True. So I brought a witness.
 ---
 ## 5) Evidence #3 — FlameGraph (The Witness)
 **Me:** The flamegraph doesn’t lie. It testifies where cycles go.
 **Commands:**
 ```bash
 sudo perf record -F 199 --call-graph dwarf,16384 -p $(pidof oi) -- sleep 30
 sudo perf script | stackcollapse-perf.pl | flamegraph.pl > oi.svg
 ```
 **What the flamegraph showed (described, not embedded):**
 * The widest towers were kernel TCP send/recv paths:
   * `__x64_sys_sendto` → `tcp_sendmsg_locked` → `tcp_write_xmit` → ...
   * `__x64_sys_recvfrom` → `tcp_recvmsg` → ...
 * My userspace frames existed, but were comparatively thin.
 * The call chain still pointed into my forwarding implementation.
 **Inner Prosecutor:** *Hold it!* So you’re saying… the kernel is doing the heavy lifting?  
 **Me:** Exactly. Which means my job is to **stop annoying the kernel** with too many tiny operations.  
 ---
 ## 6) The Real Culprit — A “Perfectly Reasonable” Copy Loop
 **Me:** Here’s the original relay code. Looks clean, right?
 ```rust
 let client_to_server = io::copy(client_stream.clone(), server_stream.clone());
 let server_to_client = io::copy(server_stream, client_stream);
 futures_lite::future::try_zip(client_to_server, server_to_client).await?;
 ```
 **Inner Prosecutor:** *Objection!* This is idiomatic and correct.  
 **Me:** Yes. That’s why it’s dangerous.  
 **Key detail:** `futures_lite::io::copy` uses a small internal buffer (~8KiB in practice).
 Small buffer → more iterations → more syscalls → more overhead.
 If a forwarder is syscall-rate bound, this becomes a ceiling.
 ---
 ## 7) The First Breakthrough — Replace `io::copy` with `pump()`
 **Me:** I wrote a manual pump loop:
 * allocate a buffer once
 * `read()` into it
 * `write_all()` out
 * on EOF: `shutdown(Write)` to propagate half-close
 ```rust
 /// Forwards everything read from `r` into `w` through one reusable buffer of `buf_sz` bytes.
 ///
 /// On EOF the write half of `w` is shut down to propagate the half-close.
 /// Returns the total number of bytes relayed.
 async fn pump(mut r: TcpStream, mut w: TcpStream, buf_sz: usize) -> io::Result<u64> {
     let mut chunk = vec![0u8; buf_sz];
     let mut relayed = 0u64;
     loop {
         let len = r.read(&mut chunk).await?;
         if len == 0 {
             break;
         }
         w.write_all(&chunk[..len]).await?;
         relayed += len as u64;
     }
     // EOF reached: best-effort half-close; a shutdown error is deliberately ignored.
     let _ = w.shutdown(std::net::Shutdown::Write);
     Ok(relayed)
 }
 ```
 Run both directions:
 ```rust
 let c2s = pump(client_stream.clone(), server_stream.clone(), BUF);
 let s2c = pump(server_stream, client_stream, BUF);
 try_zip(c2s, s2c).await?;
 ```
 **Inner Prosecutor:** *Hold it!* That’s just a loop. How does that win?  
 **Me:** Not the loop. The **bytes per syscall**.  
 ---
 ## 8) Exhibit C — The Numbers (8KiB → 16KiB → 64KiB)
 ### Baseline: ~8KiB (generic copy helper)
 Throughput:
 ```text
 17.8 Gbit/s
 ```
 **Inner Prosecutor:** *Objection!* That’s your “crime scene” number?  
 **Me:** Yes. Now watch what happens when the kernel stops getting spammed.  
 ### Pump + 16KiB buffer
 Throughput:
 ```text
 28.6 Gbit/s
 ```
 `strace -c` showed `sendto/recvfrom` call count dropped:
 ```text
 % time     seconds  usecs/call     calls    errors syscall
 ------ ----------- ----------- --------- --------- ----------------
 57.80   14.590016      442121        33           epoll_wait
 28.84    7.279883           4   1771146           sendto
 13.33    3.363882           1   1771212        48 recvfrom
  0.02    0.003843          61        62        44 futex
  0.01    0.001947          12       159           epoll_ctl
 ...
 ------ ----------- ----------- --------- --------- ----------------
 100.00   25.242897           7   3542787       143 total
 ```
 **Inner Prosecutor:** *Hold it!* That’s already big. But you claim there’s more?  
 **Me:** Oh, there’s more.  
 ### Pump + 64KiB buffer
 Throughput:
 ```text
 54.1 Gbit/s (best observed)
 ```
 `perf stat` output:
 ```text
 Performance counter stats for process id '893123':
   120,859,810,675      cpu_atom/cycles/                                                        (0.15%)
   134,735,934,329      cpu_core/cycles/                                                        (99.85%)
    79,946,979,880      cpu_atom/instructions/           #    0.66  insn per cycle              (0.15%)
   127,036,644,759      cpu_core/instructions/           #    0.94  insn per cycle              (99.85%)
        24,713,474      cpu_atom/cache-misses/                                                  (0.15%)
         9,604,449      cpu_core/cache-misses/                                                  (99.85%)
    15,584,074,530      cpu_atom/branches/                                                      (0.15%)
    24,796,180,117      cpu_core/branches/                                                      (99.85%)
       175,778,825      cpu_atom/branch-misses/          #    1.13% of all branches             (0.15%)
       135,067,353      cpu_core/branch-misses/          #    0.54% of all branches             (99.85%)
             1,519      context-switches
                50      cpu-migrations
      33.006529572 seconds time elapsed
 ```
 `strace -c` output:
 ```text
 % time     seconds  usecs/call     calls    errors syscall
 ------ ----------- ----------- --------- --------- ----------------
 54.56   18.079500      463576        39           epoll_wait
 27.91    9.249443           7   1294854         2 sendto
 17.49    5.796927           4   1294919        51 recvfrom
 ...
 ------ ----------- ----------- --------- --------- ----------------
 100.00   33.135377          12   2590253       158 total
 ```
 **Inner Prosecutor:** *OBJECTION!* `epoll_wait` is eating the time. That’s the bottleneck!  
 **Me:** Nice try. That’s a classic trap.  
 ---
 ## 9) Cross-Examination — The `epoll_wait` Trap
 **Me:** `strace -c` measures time spent *inside syscalls*, including time spent **blocked**.
 In async runtimes:
 * One thread can sit in `epoll_wait(timeout=...)`
 * Other threads do `sendto/recvfrom`
 * `strace` charges the blocking time to `epoll_wait`
 So `epoll_wait` dominating **does not** mean “epoll is slow”.
 It often means “one thread is waiting while others work”.
 **What matters here:**
 * `sendto` / `recvfrom` call counts
 * and how they change with buffer size
 ---
 ## 10) Final Explanation — Why 64KiB Causes a “Nonlinear” Jump
 **Inner Prosecutor:** *Hold it!* You only reduced the number of syscalls by ~40% (≈2.19M → ≈1.29M `sendto` calls). How do you roughly triple throughput?  
 **Me:** Because syscall walls are **nonlinear**.  
 A forwarder’s throughput is approximately:
 > **Throughput ≈ bytes_per_syscall_pair × syscall_pairs_per_second**
 If you’re syscall-rate limited, increasing `bytes_per_syscall_pair` pushes you past a threshold where:
 * socket buffers stay fuller
 * the TCP window is better utilized
 * each stream spends less time in per-chunk bookkeeping
 * concurrency (`-P 8`) stops fighting overhead and starts helping
 Once you cross that threshold, throughput can jump until the next ceiling (kernel TCP, memory bandwidth, iperf itself).
 That’s why a “small” change can create a big effect.
 ---
 ## 11) Trade-offs — Buffer Size Is Not Free
 **Inner Prosecutor:** *Objection!* Bigger buffers waste memory!  
 **Me:** Sustained.  
 A forwarder allocates **two buffers per connection** (one per direction).
 So for 64KiB:
 * ~128KiB per connection (just for relay buffers)
 * plus runtime + socket buffers
 That’s fine for “few heavy streams”, but it matters if you handle thousands of concurrent connections.
 In practice, the right move is:
 * choose a good default (64KiB is common)
 * make it configurable
 * consider buffer pooling if connection churn is heavy
 ---
 ## Epilogue — Case Closed (for now)
 **Inner Prosecutor:** So the culprit was…  
 **Me:** A perfectly reasonable helper with a default buffer size I didn’t question.  
 **Inner Prosecutor:** And the lesson?  
 **Me:** Don’t guess. Ask sharp questions. Use the tools. Let the system testify.  
 > **Verdict:** Guilty of “too many syscalls per byte.”  
 > 
 > **Sentence:** 64KiB buffers and a better relay loop.  
 --- 
 ## Ending
 This was a good reminder that performance work is not guessing — it’s a dialogue with the system:
 1. Describe the situation
 2. Ask sharp questions
 3. Use tools to confirm
 4. Explain the results using low-level knowledge
 5. Make one change
 6. Re-measure
 And the funniest part: the “clean” one-liner `io::copy` was correct, but its defaults were hiding a performance policy I didn’t want.
 > **Inner Prosecutor:** “Case closed?”
 >
 > **Me:** “For now. Next case: buffer pooling, socket buffer tuning, and maybe a Linux-only `splice(2)` fast path — carefully, behind a safe wrapper.”
 ---
--- a/drafts/2026-03-03-building-babel-a-fuzzy-llm-vs-the-os.md
+++ b/drafts/2026-03-03-building-babel-a-fuzzy-llm-vs-the-os.md
@@ -1,350 +0,0 @@
 ---
 title: Building Babel - a fuzzy LLM vs the OS
 categories: [Thoughts]
 tags: [os]
 math: true
 ---
 *A post about reliability, memory, and the compiler we didn’t mean to write.*
 When people talk about “prompt engineering,” it often sounds like a bag of tricks: write clearer instructions, add examples, constrain the format, keep history short, and pray. But if you zoom out, the pattern looks less like copywriting and more like systems engineering. We’re trying to run workloads on a machine whose “CPU” is probabilistic, whose “RAM” is fixed-size, and whose caching behavior depends on keeping the same prefix intact.
 That framing is useful, because it pushes us toward familiar tools: define an ISA, build a runtime, control errors with redundancy and verification, and manage memory with explicit lifetimes. 
 ---
 ## 0 — The foundation: why the OS analogy is structurally correct
 ### 0.1 A minimal machine model
 If you strip away the marketing, an LLM session is a constrained compute device with:
 * a **finite internal working set** (context window),
 * a **state-dependent transition function** (next-token generation),
 * and an **acceleration cache** tied to previously seen prefixes (KV cache).
 That is already enough to justify an OS framing, because OS thinking starts from exactly these questions:
 * What is my *working set limit*?
 * What is *resident* vs *non-resident* state?
 * Where do failures originate: computation, I/O, or state corruption?
 * How do I bound resource usage and isolate side effects?
 So we don’t start with “LLM = OS.”
 We start with “LLM session = state machine with bounded memory + caching,” and OS is the natural language we use to design such machines.
 ### 0.2 Formal mapping: LLM as a bounded-state transducer
 A practical theoretical model looks like this:
 * Let $X$ be the set of possible contexts (token sequences) with max length $N$.
 * Let $Y$ be token outputs.
 * The model implements a stochastic policy:
  $$
  \pi(y \mid x)
  $$
  where $x \in X$.
 In each interaction, you append some new tokens to $x$, then the model emits tokens $y$, producing a new context $x' = \text{append}(x, y)$, then truncation/packing happens due to the context limit $N$.
 From an OS perspective, the key point is not stochasticity. The key point is **boundedness**:
 * The model is not a Turing machine with unlimited tape.
 * It is a finite-memory device (finite context window) with expensive state transitions.
 That’s why memory management dominates in practice.
 ### 0.3 “Main context ⇒ RAM”: the working-set equivalence
 In OS terms, RAM is defined by three properties:
 1. **bounded capacity**
 2. **fast access for computation**
 3. **content determines behavior** (program correctness depends on which pages are resident and what they contain)
 An LLM context window has exactly those properties:
 * bounded capacity: fixed token limit $N$
 * fast access: everything in-context is “directly addressable” by attention
 * content determines behavior: the probability distribution $\pi(\cdot\mid x)$ changes when $x$ changes
 That’s enough to justify the equivalence “context behaves like RAM,” even though the representation isn’t bytes.
 If you want a stronger OS-flavored claim: the LLM’s output is effectively a function of its **resident working set**. If important facts fall out of the window, the model behaves as if the memory page was evicted.
 ### 0.4 “Immutable prefix ⇒ kernel text” is about invariants and reentrancy
 Kernel code in an OS is special for reasons that map cleanly:
 * it establishes invariants (security model, syscalls, scheduling policy)
 * it is expected to be stable/reentrant across workloads
 * it is always “mapped” (logically present) and influences everything
 Similarly, the immutable prefix (role, policies, high-level goals, tool contracts) is the part of context we want:
 * stable across turns,
 * consistently applied to all tasks,
 * and reusable for performance.
 This isn’t metaphorical; it’s a design constraint. If you mutate “kernel invariants” mid-execution, you get undefined behavior in both worlds.
 ### 0.5 “KV cache ⇒ TLB/cache” is justified by prefix-dependent reuse
 KV cache reuse is not magic; it’s a computational caching mechanism:
 * If the prefix is identical, the model can reuse previously computed key/value tensors for those tokens.
 * If the prefix changes, cache reuse degrades.
 That is the same structural property as a TLB/cache: stable mappings/prefixes produce high hit rates; churn kills locality.
 From an OS dev perspective, this creates an optimization target: keep the “kernel prefix” stable to maximize cache locality across turns.
 ### 0.6 “Skills compiler” is justified by separation of concerns
 OS devs separate *policy* from *mechanism*:
 * policy decides what to do (scheduler policy, VM policy)
 * mechanism executes it deterministically (context switch, page fault handler)
 This is exactly what the compiler/runtime split is doing:
 * **LLM is policy**: produce a plan from a messy spec
 * **Sandbox runtime is mechanism**: execute primitives deterministically
 So “NL → instructions compiler” isn’t just a nice idea; it’s literally applying a core OS design principle: isolate fuzzy/high-level decisions from low-level mechanisms, so failures can be contained and reasoned about.
 ### 0.7 Why binary outcomes (success/failure) matter: observability and debugging
 OS engineering relies on observability:
 * syscalls return error codes
 * page faults are explicit
 * sched events are traceable
 * invariants can be asserted
 When you compile skills into basic instructions with binary status, you create **syscall-like boundaries**. That gives you:
 * localized failure (which instruction failed)
 * structured error types (timeout vs empty input vs permission)
 * the ability to retry/repair minimally
 This is the real foundation of the “ECC/control loop” later: error correction requires a detectable syndrome. Binary instruction results are your syndrome.
 ---
 ## Part 1 — Reliability: the fuzzy ALU problem, and an ECC-shaped solution
 ### The reliability issue: deterministic execution vs probabilistic compilation
 In a classic machine, the critical property is that execution is deterministic: given the same instruction stream and machine state, you get the same result. That’s what makes debugging possible, and it’s why “bit flips” are an exceptional event handled by ECC, parity checks, and redundancy.
 LLMs invert that. The core model is best understood as a conditional distribution $\pi(y \mid x)$: the next token depends on the prompt/context. Even if you force deterministic decoding, the *system-level behavior* remains fragile because the mapping from a messy human request to an internal strategy is not explicit and not stable. Small context changes, minor phrasing differences, or irrelevant baggage in the prompt can flip the “mode” the model enters. In practice, this looks like the ALU occasionally returning the wrong result, except the “wrongness” is semantic, not bit-level.
 A direct way to improve reliability is to reduce the amount of “semantic work” the model must do while it is producing final outputs. Instead of asking the LLM to execute tasks in free-form language, we ask it to **compile** the request into a small set of **deterministic primitives**. Then we run those primitives in a runtime we control.
 ### The solution: compile to a tiny ISA (`read/write/grep/exec`) and execute in a strict sandbox
 Assume we define a minimal instruction set:
 * `read`  — load data from a declared source
 * `write` — store a value into a declared state slot
 * `grep`  — deterministically match/filter/extract
 * `exec`  — call a deterministic tool (curl, a parser, a local command) inside a sandbox
 Now, a “skill” is no longer a blob of natural language. It becomes a program: a sequence of these instructions. Web search becomes “write the query → exec curl → read the response → grep for indicators → write the extracted results.” Local doc search becomes “read the corpus → grep for pattern → write the match list.”
 You can visualize the pipeline like a compiler+runtime split:
 ```
 User request (natural language)
          |
          v
 +-------------------+
 |  Skills Compiler  |   (LLM: NL -> ISA plan)
 +-------------------+
          |
          v
 ISA Program: [write, exec, read, grep, ...]
          |
          v
 +-------------------+
 | Sandbox Runtime   |   (deterministic executor + logs)
 +-------------------+
          |
          v
 Structured result + trace (success/failure per instruction)
 ```
 This architecture deliberately moves uncertainty into one place: compilation. Execution becomes observable and mostly deterministic.
 ### Why this increases reliability
 Let $U$ be the user request (the “spec”), $C$ be the compiler (LLM), $P$ the produced plan (ISA program), $R$ the runtime, and $O$ the observed output. Let $V(U,O)\in\lbrace 0,1\rbrace$ be a checker that says whether the output satisfies the request (even a weak checker helps).
 Because the runtime is deterministic and instrumented, the overall success probability decomposes conceptually into:
 * how often the compiler emits a correct plan, and
 * how often correct plans actually pass in the runtime environment.
 Informally:
 $$
 \Pr[V=1] \approx \Pr[P\ \text{correct}] \cdot \Pr[V=1\mid P\ \text{correct}]
 $$
 If your runtime is strict and your primitives are deterministic, $\Pr[V=1\mid P\ \text{correct}]$ is high. That’s the central win: you turn “LLM unpredictability everywhere” into “LLM uncertainty mainly at compilation time.” Once the failure surface is concentrated, you can apply ECC-like techniques there.
 ### ECC for compilation: redundancy + decoding
 ECC works by adding redundancy and then decoding based on constraints that detect and correct errors. You can do the same for plans:
 1. generate multiple candidate plans $P_1, \dots, P_k$
 2. statically validate them (types, allowed effects, resource access)
 3. partially execute cheap prefixes if needed
 4. select the plan that passes checks / yields valid outputs
 Even without strong independence assumptions, plan diversity (different “compilation prompts,” constraints, or decomposition templates) tends to decorrelate failure modes. With a checker, the selection step becomes the decoder.
 Here’s an OS-style view of this redundancy:
 ```
             +------------------+
 U ----------> |  Compiler (LLM)  |
             +------------------+
                |    |    |
               P1   P2   P3    ... (redundant plans)
                \    |    /
                 \   |   /
                  v  v  v
            +-------------------+
            |  Verifier/Decoder |
            +-------------------+
                     |
                     v
                 choose P*
                     |
                     v
              +----------------+
              |  Runtime Exec  |
              +----------------+
 ```
 ### Control theory as practical “ECC glue”: localize failures, repair minimally
 Once every step is an instruction with a success/failure status, you gain something that free-form prompting almost never gives you: precise failure localization.
 * `exec curl` failed → network/tool failure; retry/backoff or switch endpoint
 * `grep` failed → query too narrow or wrong target; rewrite only the grep or the query
 * `read` failed → missing resource; fix source selection
 * `write` failed → schema/invariant violation; repair the state shape
 This is where a control loop becomes natural. The “plant” is your plan+runtime; the “sensor” is the instruction trace; the “controller” decides whether to retry, patch the plan, or escalate. You don’t need to force PID math onto it, but the conceptual mapping is clean: proportional fixes handle immediate errors, integral memory of repeated errors adjusts defaults, and derivative-like logic prevents oscillations (e.g., endless retries).
 ---
 ## Part 2 — Memory: context is RAM, and we need a real allocator
 ### The memory issue: fixed context window behaves like physical memory
 If the LLM is single-threaded, the context window is its working memory. It’s fixed size. It is expensive. It gets polluted. And when it overflows, you start doing swap-like hacks: aggressive summarization, pruning, or retrieval-augmented generation (RAG).
 The OS analogy isn’t decorative here. It’s operationally accurate:
 * **context window** is physical RAM (bounded)
 * **immutable prefix** is kernel code mapped and hot
 * **KV cache** behaves like a cache/TLB where prefix stability matters
 * “adding more prompt” is “allocating memory,” and you can run out
 So memory management isn’t just “write less.” It’s deciding what stays resident, what gets offloaded, and how to keep the working set small.
 ### The solution: arena allocator + staged lifetimes + lazy loading + offload
 The arena (bump) allocator pattern, familiar in Rust from crates such as `bumpalo` and `typed-arena`, is a good mental model because it matches a core workflow truth: most intermediate data is useful only during a phase. You don’t need fine-grained frees; you need cheap allocation and bulk reclamation at phase boundaries.
 We treat the context as an append-only arena with a bump pointer. We also define “stages” in a skill:
 1. compile plan
 2. execute primitives
 3. validate
 4. commit minimal results
 5. free intermediates
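 At each stage start, we take a checkpoint. During the stage, we append logs and intermediate outputs. When the stage completes, we write a compact “commit record” (summary + hashes + pointers), then reset the arena to the checkpoint. The commit record itself is stored outside the stage arena (or the checkpoint is advanced past it), so it survives the reset while everything else from the stage is reclaimed. That gives you explicit lifetimes and prevents context leaks.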
 Here is the memory layout, as an OS-style diagram:
 ```
 +----------------------------------------------------------------------------------+
 |                           LLM Context Window (fixed)                             |
 +----------------------------------------------------------------------------------+
 |  Immutable Prefix  |  Disk Index (pointers)  |  Working Set (stage arena) | Free |
 |  (kernel text)     |  (skills/tools/docs)    |  (plan + outputs + trace)  |      |
 +----------------------------------------------------------------------------------+
 ```
 And here is the arena checkpoint mechanism:
 ```
 Stage S begins:
    bump_ptr = B
    checkpoint[S] = B
 During stage:
    append(plan fragments)
    append(exec outputs)
    append(grep matches)
    append(validation notes)
 Stage S commits:
    write(commit_record[S])   <-- compact, structured
    bump_ptr = checkpoint[S]  <-- bulk free (arena reset)
 ```
 ### Why this works
 This approach gives two strong properties that are “theoretical” in the systems sense.
 **(1) Bounded working set by construction.**
 If you cap the amount of intermediate data a stage is allowed to keep (or the number/size of instruction traces), then your memory usage becomes bounded not by “how long the conversation is,” but by “how big the current stage is.” That’s exactly how a well-structured kernel avoids unbounded growth: scope lifetimes, control allocations, and reclaim aggressively when a phase ends.
 **(2) Reduced need for lossy compaction.**
 Most prompt-based “memory management” relies on summarization, which is effectively lossy compression. An arena+stages approach keeps the majority of the “bulk data” as structured runtime artifacts (commit records, hashes, pointers), not as raw narrative. That reduces how often you must compress meaning into fewer tokens. Prune and RAG become swap mechanisms rather than the default mode.
 ### Lazy loading: the disk index is your page table
 The “disk index” part is important. It’s the page table of your LLM OS: it stores *addresses* (skill names, tool contracts, doc IDs), not the content itself. You keep this index resident because it’s small and highly reusable. When a plan needs a resource, it pages it in via deterministic steps:
 * `exec(fetch_doc, id)`
 * `read(output)`
 * `grep(...)`
 This is consistent with the “treat pruning and RAG as a last resort” stance. The system shouldn’t drag content into RAM unless it’s demanded by the current working set.
 ### Prune & RAG: swap/compaction when you must, not when you feel anxious
 Eventually you will hit memory pressure. When you do, you have two OS-like fallback moves:
 * **pruning**: compact resident memory by removing older low-value regions
 * **RAG/swap**: offload bulk to external storage and page it back when needed
 The key is policy: these are last-resort mechanisms, triggered by pressure and guided by working-set logic, not something you do on every turn.
 ---
 ## Are we building the Tower of Babel?
 In mood terms, yes: we are building a translation layer from human intent to machine action, and history says that “universal translators” are where complexity goes to multiply.
 But there’s a difference between *Babel as confusion* and *Babel as a compiler toolchain*. The dangerous version is when natural language directly triggers actions without strict interfaces. That system becomes un-debuggable and hard to secure because the space of meanings is unbounded and the boundary between interpretation and execution is blurry.
 The OS-shaped version is constrained:
 * the target language is not “whatever the model feels like,” it’s a tiny ISA
 * the runtime is deterministic and instrumented
 * the boundary between compilation (fuzzy) and execution (controlled) is explicit
 * failures are localized to instruction traces
 * memory is managed with lifetimes and checkpoints, not hope
 So yes, we’re building Babel—but the grown-up, systems version. Not a tower of endless languages, but a tower with a syscall table, an allocator, and something that looks suspiciously like ECC wrapped around the only fuzzy component: compilation.
--- a/zed-config/settings.json
+++ b/zed-config/settings.json
@@ -1,59 +0,0 @@
 {
  "telemetry": {
    "metrics": false,
    "diagnostics": false
  },
  "auto_update": false,
  "ui_font_size": 16,
  "buffer_font_size": 15,
  "base_keymap": "JetBrains",
  "buffer_font_family": "JetBrains Mono",
  "theme": {
    "mode": "dark",
    "light": "One Light",
    "dark": "One Dark"
  },
  "languages": {
    "Python": {
      "tab_size": 4
    }
  },
  "inlay_hints": {
    "enabled": true
  },
  "lsp": {
    "rust-analyzer": {
      "binary": {
        "ignore_system_version": true
      },
      "initialization_options": {
        "inlayHints": {
          "maxLength": null,
          "lifetimeElisionHints": {
            "enable": "skip_trivial",
            "useParameterNames": true
          },
          "closureReturnTypeHints": {
            "enable": "always"
          },
          "closureCaptureHints": {
            "enable": true
          }
        }
      }
    },
    "hls": {
      "initialization_options": {
        "haskell": {
          "formattingProvider": "fourmolu"
        }
      }
    }
  },
  "disable_ai": true,
  "icon_theme": {
    "mode": "dark",
    "light": "Zed (Default)",
    "dark": "Zed (Default)"
  }
 }