forked from manbo/internal-docs
Compare commits
21 Commits
dc5db4f3ec
...
esorics
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
566e251743 | ||
|
|
0a74d67f19 | ||
|
|
096af1bcb6 | ||
|
|
c146138443 | ||
|
|
0b3e79841b | ||
|
|
67466f8e3d | ||
|
|
250428b176 | ||
|
|
6f1e7a9994 | ||
|
|
b3280dcc19 | ||
|
|
de445963b5 | ||
|
|
5ede1a11f1 | ||
|
|
21053b4f13 | ||
|
|
a7c8250d1a | ||
|
|
03640302db | ||
|
|
3a9836d15d | ||
|
|
1e3eb39dea | ||
|
|
9f8af2c67c | ||
| 272e159df1 | |||
| 81625b5c4e | |||
|
|
4815d05127 | ||
|
|
5fbfd1068f |
2
.gitignore
vendored
2
.gitignore
vendored
@@ -5,3 +5,5 @@ arxiv-style/*.log
|
||||
arxiv-style/*.blg
|
||||
arxiv-style/*.bbl
|
||||
arxiv-style/*.out
|
||||
fig/
|
||||
.DS_Store
|
||||
|
||||
6347
arxiv-style/IEEEtran.cls
Normal file
6347
arxiv-style/IEEEtran.cls
Normal file
File diff suppressed because it is too large
Load Diff
27
arxiv-style/README.md
Normal file
27
arxiv-style/README.md
Normal file
@@ -0,0 +1,27 @@
|
||||
## Files layout
|
||||
- `arxiv.sty` and `template.tex`: The arxiv template we are using.
|
||||
|
||||
- `equations.tex`: **Duplicated**, contains equations in methodology section
|
||||
|
||||
- `main.tex` and `references.bib`: These produce our paper, currently using the arxiv template. Note that the references are template-independent.
|
||||
|
||||
## How to compile
|
||||
It's recommended to use `MiKTeX` as the compiler on Windows.
|
||||
|
||||
To compile latex into pdf, follow these steps:
|
||||
|
||||
```bash
|
||||
pdflatex ./main.tex
|
||||
|
||||
# Build reference DB, run once unless references.bib updated
|
||||
bibtex main
|
||||
|
||||
# Always run the compile command twice
|
||||
pdflatex ./main.tex
|
||||
pdflatex ./main.tex
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
If you encounter warnings during the compiling process, simply press `Enter`.
|
||||
|
||||
If you find the reference in pdf is like `[??]`, compile twice.
|
||||
BIN
arxiv-style/fig-overall-benchmark-v1.png
Normal file
BIN
arxiv-style/fig-overall-benchmark-v1.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 123 KiB |
1
arxiv-style/fig-scripts/.python-version
Normal file
1
arxiv-style/fig-scripts/.python-version
Normal file
@@ -0,0 +1 @@
|
||||
3.12
|
||||
237
arxiv-style/fig-scripts/draw_channels.py
Normal file
237
arxiv-style/fig-scripts/draw_channels.py
Normal file
@@ -0,0 +1,237 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Draw *separate* SVG figures for:
|
||||
1) Continuous channels (multiple smooth curves per figure)
|
||||
2) Discrete channels (multiple step-like/token curves per figure)
|
||||
|
||||
Outputs (default):
|
||||
out/continuous_channels.svg
|
||||
out/discrete_channels.svg
|
||||
|
||||
Notes:
|
||||
- Transparent background (good for draw.io / LaTeX / diagrams).
|
||||
- No axes/frames by default (diagram-friendly).
|
||||
- Curves are synthetic placeholders; replace `make_*_channels()` with your real data.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Data generators (placeholders)
|
||||
# ----------------------------
|
||||
|
||||
@dataclass
class GenParams:
    """Configuration for the synthetic channel generators (placeholder data)."""

    seconds: float = 10.0  # total duration of the generated timeline
    fs: int = 200          # samples per second
    seed: int = 7          # RNG seed for reproducibility
    n_cont: int = 6        # number of continuous channels (curves)
    n_disc: int = 5        # number of discrete channels (curves)
    disc_vocab: int = 8    # token/vocab size for discrete channels
    disc_change_rate_hz: float = 1.2  # how often discrete tokens change
|
||||
|
||||
|
||||
def make_continuous_channels(p: GenParams) -> tuple[np.ndarray, np.ndarray]:
    """Synthesize smooth placeholder curves, one row per continuous channel.

    Returns:
        t: time axis, shape (T,)
        Y: curves, shape (n_cont, T)
    """
    rng = np.random.default_rng(p.seed)
    n_samples = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n_samples, endpoint=False)

    box = np.ones(9) / 9.0  # small box filter used to color the noise
    rows = []
    for idx in range(p.n_cont):
        # Two sinusoids at different time scales, frequencies vary per channel.
        slow_hz = 0.15 + 0.06 * idx
        fast_hz = 0.8 + 0.15 * (idx % 3)
        phase = rng.uniform(0, 2 * np.pi)
        curve = 0.9 * np.sin(2 * np.pi * slow_hz * t + phase)
        curve = curve + 0.35 * np.sin(2 * np.pi * fast_hz * t + 1.3 * phase)

        # Smoothed white noise approximates mildly colored noise.
        curve = curve + 0.15 * np.convolve(
            rng.normal(0, 1, size=n_samples), box, mode="same"
        )

        # Standardize, then shrink and offset so curves stay visually separated.
        curve = (curve - np.mean(curve)) / (np.std(curve) + 1e-9)
        rows.append(0.8 * curve + 0.15 * idx)

    return t, np.vstack(rows)
|
||||
|
||||
|
||||
def make_discrete_channels(p: GenParams) -> tuple[np.ndarray, np.ndarray]:
    """Synthesize piecewise-constant integer token tracks.

    Returns:
        t: time axis, shape (T,)
        X: token ids in [0, disc_vocab), shape (n_disc, T)
    """
    rng = np.random.default_rng(p.seed + 100)
    n_samples = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n_samples, endpoint=False)

    # Expected number of token changes over the whole timeline, at least 1.
    expected_changes = int(max(1, p.seconds * p.disc_change_rate_hz))

    X = np.zeros((p.n_disc, n_samples), dtype=int)
    for row in range(p.n_disc):
        # Poisson-many random change points, always bracketed by 0 and T.
        n_pts = rng.poisson(expected_changes) + 1
        cuts = np.unique(rng.integers(0, n_samples, size=n_pts))
        cuts = np.sort(np.concatenate([[0], cuts, [n_samples]]))

        token = rng.integers(0, p.disc_vocab)
        for start, stop in zip(cuts[:-1], cuts[1:]):
            # After the first segment, usually jump to a fresh token.
            if start != 0 and rng.random() < 0.85:
                token = rng.integers(0, p.disc_vocab)
            X[row, start:stop] = token

    return t, X
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Plotting helpers
|
||||
# ----------------------------
|
||||
|
||||
def _make_transparent_figure(width_in: float, height_in: float) -> tuple[plt.Figure, plt.Axes]:
    """Create a transparent figure with one near-full-bleed axes (3% margins)."""
    fig = plt.figure(figsize=(width_in, height_in), dpi=200)
    ax = fig.add_axes([0.03, 0.03, 0.94, 0.94])
    # Transparent patches so the exported SVG composes over any background.
    fig.patch.set_alpha(0.0)
    ax.patch.set_alpha(0.0)
    return fig, ax
|
||||
|
||||
|
||||
def save_continuous_channels_svg(
    t: np.ndarray,
    Y: np.ndarray,
    out_path: Path,
    *,
    lw: float = 2.0,
    clean: bool = True,
) -> None:
    """Render the continuous curves (rows of Y) into a transparent SVG.

    When *clean* is true the axes are hidden entirely (diagram-friendly).
    """
    fig, ax = _make_transparent_figure(width_in=6.0, height_in=2.2)

    # Default color cycle distinguishes the channels.
    for row in Y:
        ax.plot(t, row, linewidth=lw)

    if clean:
        ax.set_axis_off()
    else:
        ax.set_xlabel("t")
        ax.set_ylabel("value")

    # Pad the y-range slightly so line caps are not clipped.
    flat = Y.reshape(-1)
    lo, hi = float(np.min(flat)), float(np.max(flat))
    pad = 0.08 * (hi - lo + 1e-9)
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(lo - pad, hi + pad)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", bbox_inches="tight", pad_inches=0.0, transparent=True)
    plt.close(fig)
|
||||
|
||||
|
||||
def save_discrete_channels_svg(
    t: np.ndarray,
    X: np.ndarray,
    out_path: Path,
    *,
    lw: float = 2.0,
    clean: bool = True,
    vertical_spacing: float = 1.25,
) -> None:
    """Render discrete token tracks as vertically offset step plots (SVG).

    Rows of *X* are integer token ids; row i is offset by
    ``i * vertical_spacing`` so the step curves do not overlap.
    """
    fig, ax = _make_transparent_figure(width_in=6.0, height_in=2.2)

    for idx, row in enumerate(X):
        ax.step(t, row.astype(float) + idx * vertical_spacing, where="post", linewidth=lw)

    if clean:
        ax.set_axis_off()
    else:
        ax.set_xlabel("t")
        ax.set_ylabel("token id (offset)")

    # Recompute the offset values to bound the y-range, then pad slightly.
    offsets = np.arange(X.shape[0])[:, None] * vertical_spacing
    flat = (X.astype(float) + offsets).reshape(-1)
    lo, hi = float(np.min(flat)), float(np.max(flat))
    pad = 0.10 * (hi - lo + 1e-9)
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(lo - pad, hi + pad)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", bbox_inches="tight", pad_inches=0.0, transparent=True)
    plt.close(fig)
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# CLI
|
||||
# ----------------------------
|
||||
|
||||
def main() -> None:
    """CLI entry point: generate both channel figures under --outdir."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=Path, default=Path("out"))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=10.0)
    parser.add_argument("--fs", type=int, default=200)

    parser.add_argument("--n-cont", type=int, default=6)
    parser.add_argument("--n-disc", type=int, default=5)
    parser.add_argument("--disc-vocab", type=int, default=8)
    parser.add_argument("--disc-change-rate", type=float, default=1.2)

    parser.add_argument("--keep-axes", action="store_true", help="Show axes/labels (default: off)")
    args = parser.parse_args()

    params = GenParams(
        seconds=args.seconds,
        fs=args.fs,
        seed=args.seed,
        n_cont=args.n_cont,
        n_disc=args.n_disc,
        disc_vocab=args.disc_vocab,
        disc_change_rate_hz=args.disc_change_rate,
    )

    t_cont, Y = make_continuous_channels(params)
    t_disc, X = make_discrete_channels(params)

    cont_path = args.outdir / "continuous_channels.svg"
    disc_path = args.outdir / "discrete_channels.svg"

    hide_axes = not args.keep_axes
    save_continuous_channels_svg(t_cont, Y, cont_path, clean=hide_axes)
    save_discrete_channels_svg(t_disc, X, disc_path, clean=hide_axes)

    print("Wrote:")
    print(f"  {cont_path}")
    print(f"  {disc_path}")


if __name__ == "__main__":
    main()
|
||||
272
arxiv-style/fig-scripts/draw_synthetic_ics_optionA.py
Normal file
272
arxiv-style/fig-scripts/draw_synthetic_ics_optionA.py
Normal file
@@ -0,0 +1,272 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Option A: "Synthetic ICS Data" mini-panel (high-level features, not packets)
|
||||
|
||||
What it draws (one SVG, transparent background):
|
||||
- Top: 2–3 continuous feature curves (smooth, time-aligned)
|
||||
- Bottom: discrete/categorical feature strip (colored blocks)
|
||||
- One vertical dashed alignment line crossing both
|
||||
- Optional shaded regime window
|
||||
- Optional "real vs synthetic" ghost overlay (faint gray behind one curve)
|
||||
|
||||
Usage:
|
||||
uv run python draw_synthetic_ics_optionA.py --out ./assets/synth_ics_optionA.svg
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Rectangle
|
||||
|
||||
|
||||
@dataclass
class Params:
    """Tunables for the Option A synthetic-ICS mini-panel."""

    seed: int = 7
    seconds: float = 10.0
    fs: int = 300

    n_curves: int = 3    # continuous channels shown
    n_bins: int = 40     # discrete blocks across x
    disc_vocab: int = 8  # number of discrete categories

    # Layout / style
    width_in: float = 6.0
    height_in: float = 2.2
    curve_lw: float = 2.3
    ghost_lw: float = 2.0         # "real" overlay line width
    strip_height: float = 0.65    # bar height in [0,1] strip axis
    strip_gap_frac: float = 0.10  # gap between blocks (fraction of block width)

    # Visual cues
    show_alignment_line: bool = True
    align_x_frac: float = 0.58    # dashed-line position, fraction of timeline
    show_regime_window: bool = True
    regime_start_frac: float = 0.30
    regime_end_frac: float = 0.45
    show_real_ghost: bool = True  # faint gray "real" behind first synthetic curve
|
||||
|
||||
|
||||
def _smooth(x: np.ndarray, win: int) -> np.ndarray:
|
||||
win = max(3, int(win) | 1) # odd
|
||||
k = np.ones(win, dtype=float)
|
||||
k /= k.sum()
|
||||
return np.convolve(x, k, mode="same")
|
||||
|
||||
|
||||
def make_continuous_curves(p: Params) -> tuple[np.ndarray, np.ndarray, np.ndarray | None]:
    """Build the synthetic curves for the top panel.

    Returns:
        t: (T,) time axis
        Y_syn: (n_curves, T) synthetic curves
        y_real: (T,) faint "real" ghost for the first channel, or None
    """
    rng = np.random.default_rng(p.seed)
    n_samples = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n_samples, endpoint=False)

    rows = []
    for idx in range(p.n_curves):
        # Two sinusoids at different time scales, per-channel frequencies.
        f_slow = 0.09 + 0.03 * (idx % 3)
        f_mid = 0.65 + 0.18 * (idx % 4)
        phase = rng.uniform(0, 2 * np.pi)

        curve = 0.95 * np.sin(2 * np.pi * f_slow * t + phase)
        curve = curve + 0.30 * np.sin(2 * np.pi * f_mid * t + 0.7 * phase)

        # Two Gaussian bumps mimic regime-like excursions.
        bumps = np.zeros_like(t)
        for _ in range(2):
            center = rng.uniform(0.8, p.seconds - 0.8)
            width = rng.uniform(0.35, 0.85)
            bumps += np.exp(-0.5 * ((t - center) / (width + 1e-9)) ** 2)
        curve += 0.55 * bumps

        # Mild smoothed noise, then standardize and compress amplitude.
        curve += 0.10 * _smooth(rng.normal(0, 1, size=n_samples), win=int(p.fs * 0.04))
        curve = (curve - curve.mean()) / (curve.std() + 1e-9)
        rows.append(0.42 * curve)

    Y_syn = np.vstack(rows)

    # Optional "real" ghost: first curve plus a slow random drift.
    y_real = None
    if p.show_real_ghost:
        drift = _smooth(rng.normal(0, 1, size=n_samples), win=int(p.fs * 0.18))
        drift = drift / (np.std(drift) + 1e-9)
        y_real = Y_syn[0] * 0.95 + 0.07 * drift

    return t, Y_syn, y_real
|
||||
|
||||
|
||||
def make_discrete_strip(p: Params) -> np.ndarray:
    """Random piecewise-constant category ids for the bottom strip.

    Returns:
        ids: (n_bins,) integers in [0, disc_vocab)
    """
    rng = np.random.default_rng(p.seed + 123)
    ids = np.zeros(p.n_bins, dtype=int)

    token = rng.integers(0, p.disc_vocab)
    for i in range(p.n_bins):
        # Bin 0 always redraws; later bins change with probability 0.28.
        if i == 0 or rng.random() < 0.28:
            token = rng.integers(0, p.disc_vocab)
        ids[i] = token

    return ids
|
||||
|
||||
|
||||
def _axes_clean(ax: plt.Axes) -> None:
    """Strip labels, title, and ticks so the axes read as a bare diagram panel."""
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title("")
    ax.set_xticks([])
    ax.set_yticks([])
    # Also suppress tick marks and tick labels on every side.
    ax.tick_params(
        axis="both", which="both",
        bottom=False, left=False, top=False, right=False,
        labelbottom=False, labelleft=False,
    )
|
||||
|
||||
|
||||
def draw_optionA(out_path: Path, p: Params) -> None:
    """Compose and save the Option A panel as a transparent SVG.

    Layout: curves on a top axes, a categorical strip on a bottom axes
    sharing the same x range, plus optional regime window / alignment line.
    """
    fig = plt.figure(figsize=(p.width_in, p.height_in), dpi=200)
    fig.patch.set_alpha(0.0)

    # Two stacked axes sharing x: curves on top, category strip below.
    ax_top = fig.add_axes([0.08, 0.32, 0.90, 0.62])
    ax_bot = fig.add_axes([0.08, 0.12, 0.90, 0.16], sharex=ax_top)
    for ax in (ax_top, ax_bot):
        ax.patch.set_alpha(0.0)

    t, Y_syn, y_real = make_continuous_curves(p)
    ids = make_discrete_strip(p)

    x0, x1 = float(t[0]), float(t[-1])
    span = x1 - x0

    # Shaded regime window across both panels (default color, semi-transparent).
    if p.show_regime_window:
        win_lo = x0 + p.regime_start_frac * span
        win_hi = x0 + p.regime_end_frac * span
        for ax in (ax_top, ax_bot):
            ax.axvspan(win_lo, win_hi, alpha=0.12)

    # Dashed vertical alignment line across both panels.
    if p.show_alignment_line:
        line_x = x0 + p.align_x_frac * span
        for ax in (ax_top, ax_bot):
            ax.axvline(line_x, linestyle="--", linewidth=1.2, alpha=0.7)

    # Ghost "real" trace sits behind (zorder 1) the synthetic curves (zorder 2).
    if y_real is not None:
        ax_top.plot(t, y_real, linewidth=p.ghost_lw, color="0.65", alpha=0.55, zorder=1)

    curve_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#9467bd"]  # blue, orange, green, purple
    for idx, row in enumerate(Y_syn):
        ax_top.plot(
            t, row,
            linewidth=p.curve_lw,
            color=curve_colors[idx % len(curve_colors)],
            zorder=2,
        )

    lo, hi = float(Y_syn.min()), float(Y_syn.max())
    pad = 0.10 * (hi - lo + 1e-9)
    ax_top.set_xlim(x0, x1)
    ax_top.set_ylim(lo - pad, hi + pad)

    # Bottom strip: one colored block per bin, with a small gap between blocks.
    palette = [
        "#e41a1c", "#377eb8", "#4daf4a", "#984ea3",
        "#ff7f00", "#ffff33", "#a65628", "#f781bf",
    ]
    bin_w = span / len(ids)
    gap = p.strip_gap_frac * bin_w
    ax_bot.set_ylim(0, 1)
    block_y = (1 - p.strip_height) / 2
    for idx, cat in enumerate(ids):
        ax_bot.add_patch(
            Rectangle(
                (x0 + idx * bin_w + gap / 2, block_y),
                bin_w - gap,
                p.strip_height,
                facecolor=palette[int(cat) % len(palette)],
                edgecolor="none",
            )
        )

    # No ticks/labels, but keep the frame (spines) visible.
    for ax in (ax_top, ax_bot):
        _axes_clean(ax)
        for side in ("left", "bottom", "top", "right"):
            ax.spines[side].set_visible(True)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", transparent=True, bbox_inches="tight", pad_inches=0.0)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point for the Option A figure."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=Path("synth_ics_optionA.svg"))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=10.0)
    parser.add_argument("--fs", type=int, default=300)
    parser.add_argument("--curves", type=int, default=3)
    parser.add_argument("--bins", type=int, default=40)
    parser.add_argument("--vocab", type=int, default=8)

    parser.add_argument("--no-align", action="store_true")
    parser.add_argument("--no-regime", action="store_true")
    parser.add_argument("--no-ghost", action="store_true")
    args = parser.parse_args()

    params = Params(
        seed=args.seed,
        seconds=args.seconds,
        fs=args.fs,
        n_curves=args.curves,
        n_bins=args.bins,
        disc_vocab=args.vocab,
        show_alignment_line=not args.no_align,
        show_regime_window=not args.no_regime,
        show_real_ghost=not args.no_ghost,
    )

    draw_optionA(args.out, params)
    print(f"Wrote: {args.out}")


if __name__ == "__main__":
    main()
|
||||
318
arxiv-style/fig-scripts/draw_synthetic_ics_optionB.py
Normal file
318
arxiv-style/fig-scripts/draw_synthetic_ics_optionB.py
Normal file
@@ -0,0 +1,318 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Option B: "Synthetic ICS Data" as a mini process-story strip (high-level features)
|
||||
- ONE SVG, transparent background
|
||||
- Two frames by default: "steady/normal" -> "disturbance/recovery"
|
||||
- Each frame contains:
|
||||
- Top: multiple continuous feature curves
|
||||
- Bottom: discrete/categorical strip (colored blocks)
|
||||
- A vertical dashed alignment line crossing both
|
||||
- Optional shaded regime window
|
||||
- A right-pointing arrow between frames
|
||||
|
||||
No text, no numbers (axes lines only). Good for draw.io embedding.
|
||||
|
||||
Run:
|
||||
uv run python draw_synthetic_ics_optionB.py --out ./assets/synth_ics_optionB.svg
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Rectangle, FancyArrowPatch
|
||||
|
||||
|
||||
@dataclass
class Params:
    """Tunables for the Option B two/three-frame story strip."""

    seed: int = 7
    seconds: float = 8.0
    fs: int = 250

    # Two-frame story by default
    n_frames: int = 2

    # Per-frame visuals
    n_curves: int = 3
    n_bins: int = 32
    disc_vocab: int = 8

    # Layout (relative figure fractions)
    width_in: float = 8.2
    height_in: float = 2.4
    margin_left: float = 0.05
    margin_right: float = 0.05
    margin_bottom: float = 0.12
    margin_top: float = 0.10
    frame_gap: float = 0.08  # gap between frames (space for the arrow)

    # Styling
    curve_lw: float = 2.1
    ghost_lw: float = 1.8
    strip_height: float = 0.65
    strip_gap_frac: float = 0.12

    # Cues
    show_alignment_line: bool = True
    align_x_frac: float = 0.60
    show_regime_window: bool = True
    regime_start_frac: float = 0.30
    regime_end_frac: float = 0.46
    show_real_ghost: bool = False  # default off for a cleaner story
    show_axes_spines: bool = True  # axes lines only (no ticks/labels)
|
||||
|
||||
|
||||
# ---------- helpers ----------
|
||||
|
||||
def _smooth(x: np.ndarray, win: int) -> np.ndarray:
|
||||
win = max(3, int(win) | 1)
|
||||
k = np.ones(win, dtype=float)
|
||||
k /= k.sum()
|
||||
return np.convolve(x, k, mode="same")
|
||||
|
||||
|
||||
def _axes_only(ax: plt.Axes, *, keep_spines: bool) -> None:
    """Remove all text/ticks/grid; show or hide the four spines per *keep_spines*."""
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title("")
    ax.set_xticks([])
    ax.set_yticks([])
    ax.tick_params(
        axis="both", which="both",
        bottom=False, left=False, top=False, right=False,
        labelbottom=False, labelleft=False,
    )
    ax.grid(False)
    # One loop covers both branches of the original show/hide decision.
    for side in ("left", "right", "top", "bottom"):
        ax.spines[side].set_visible(keep_spines)
|
||||
|
||||
|
||||
def make_frame_continuous(seed: int, seconds: float, fs: int, n_curves: int, style: str) -> tuple[np.ndarray, np.ndarray]:
    """Generate one frame's continuous curves.

    *style* selects the mood: "steady" (smoother, smaller bumps) or
    anything else, treated as "disturb" (larger bumps, more variance).
    """
    rng = np.random.default_rng(seed)
    n_samples = int(seconds * fs)
    t = np.linspace(0, seconds, n_samples, endpoint=False)

    steady = style == "steady"
    amp_bump = 0.40 if steady else 0.85
    amp_noise = 0.09 if steady else 0.14
    amp_scale = 0.38 if steady else 0.46

    base_freqs = [0.10, 0.08, 0.12, 0.09]
    mid_freqs = [0.65, 0.78, 0.90, 0.72]

    rows = []
    for idx in range(n_curves):
        f_slow = base_freqs[idx % len(base_freqs)]
        f_mid = mid_freqs[idx % len(mid_freqs)]
        phase = rng.uniform(0, 2 * np.pi)

        curve = 0.95 * np.sin(2 * np.pi * f_slow * t + phase)
        curve = curve + 0.28 * np.sin(2 * np.pi * f_mid * t + 0.65 * phase)

        # Gaussian bumps: fewer/wider when steady, more/narrower when disturbed.
        bumps = np.zeros_like(t)
        for _ in range(2 if steady else 3):
            center = rng.uniform(0.9, seconds - 0.9)
            width = rng.uniform(0.35, 0.75) if steady else rng.uniform(0.20, 0.55)
            bumps += np.exp(-0.5 * ((t - center) / (width + 1e-9)) ** 2)
        curve += amp_bump * bumps

        # Mild smoothed noise, then standardize and compress amplitude.
        curve += amp_noise * _smooth(rng.normal(0, 1, size=n_samples), win=int(fs * 0.04))
        curve = (curve - curve.mean()) / (curve.std() + 1e-9)
        rows.append(amp_scale * curve)

    return t, np.vstack(rows)
|
||||
|
||||
|
||||
def make_frame_discrete(seed: int, n_bins: int, vocab: int, style: str) -> np.ndarray:
    """Generate one frame's categorical strip; "disturb" transitions more often.

    Returns:
        ids: (n_bins,) integers in [0, vocab)
    """
    rng = np.random.default_rng(seed + 111)
    ids = np.zeros(n_bins, dtype=int)

    p_change = 0.20 if style == "steady" else 0.38
    token = rng.integers(0, vocab)
    for i in range(n_bins):
        # Bin 0 always redraws; later bins change with probability p_change.
        if i == 0 or rng.random() < p_change:
            token = rng.integers(0, vocab)
        ids[i] = token
    return ids
|
||||
|
||||
|
||||
def draw_frame(ax_top: plt.Axes, ax_bot: plt.Axes, t: np.ndarray, Y: np.ndarray, ids: np.ndarray, p: Params) -> None:
    """Render one story frame: curves on *ax_top*, category strip on *ax_bot*."""
    x0, x1 = float(t[0]), float(t[-1])
    span = x1 - x0

    # Shaded regime window across both panels (default color).
    if p.show_regime_window:
        win_lo = x0 + p.regime_start_frac * span
        win_hi = x0 + p.regime_end_frac * span
        for ax in (ax_top, ax_bot):
            ax.axvspan(win_lo, win_hi, alpha=0.12)

    # Dashed vertical alignment line across both panels.
    if p.show_alignment_line:
        line_x = x0 + p.align_x_frac * span
        for ax in (ax_top, ax_bot):
            ax.axvline(line_x, linestyle="--", linewidth=1.15, alpha=0.7)

    # Fixed colors keep channel identity consistent across frames.
    curve_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#9467bd"]
    for idx, row in enumerate(Y):
        ax_top.plot(t, row, linewidth=p.curve_lw, color=curve_colors[idx % len(curve_colors)])

    lo, hi = float(Y.min()), float(Y.max())
    pad = 0.10 * (hi - lo + 1e-9)
    ax_top.set_xlim(x0, x1)
    ax_top.set_ylim(lo - pad, hi + pad)

    # Discrete strip: one colored block per bin, small gap between blocks.
    palette = [
        "#e41a1c", "#377eb8", "#4daf4a", "#984ea3",
        "#ff7f00", "#ffff33", "#a65628", "#f781bf",
    ]
    ax_bot.set_xlim(x0, x1)
    ax_bot.set_ylim(0, 1)

    bin_w = span / len(ids)
    gap = p.strip_gap_frac * bin_w
    block_y = (1 - p.strip_height) / 2
    for idx, cat in enumerate(ids):
        ax_bot.add_patch(
            Rectangle(
                (x0 + idx * bin_w + gap / 2, block_y),
                bin_w - gap,
                p.strip_height,
                facecolor=palette[int(cat) % len(palette)],
                edgecolor="none",
            )
        )

    # Axes-lines-only style (no ticks/labels).
    _axes_only(ax_top, keep_spines=p.show_axes_spines)
    _axes_only(ax_bot, keep_spines=p.show_axes_spines)
|
||||
|
||||
|
||||
# ---------- main drawing ----------
|
||||
|
||||
def draw_optionB(out_path: Path, p: Params) -> None:
    """Compose the multi-frame story strip and save it as a transparent SVG.

    Each frame is a (curves, strip) pair of axes; consecutive frames are
    separated by a gap that holds a right-pointing arrow.
    """
    fig = plt.figure(figsize=(p.width_in, p.height_in), dpi=200)
    fig.patch.set_alpha(0.0)

    # Frame layout in figure coordinates.
    usable_w = 1.0 - p.margin_left - p.margin_right
    usable_h = 1.0 - p.margin_bottom - p.margin_top

    # Leave gap between frames for the arrow(s).
    total_gap = p.frame_gap * (p.n_frames - 1)
    frame_w = (usable_w - total_gap) / p.n_frames

    # Within each frame: top curves, bottom strip, small vertical gap.
    top_h = usable_h * 0.70
    bot_h = usable_h * 0.18
    v_gap = usable_h * 0.06
    bot_y = p.margin_bottom
    top_y = bot_y + bot_h + v_gap

    axes_pairs = []
    for f in range(p.n_frames):
        left = p.margin_left + f * (frame_w + p.frame_gap)
        ax_top = fig.add_axes([left, top_y, frame_w, top_h])
        ax_bot = fig.add_axes([left, bot_y, frame_w, bot_h], sharex=ax_top)
        ax_top.patch.set_alpha(0.0)
        ax_bot.patch.set_alpha(0.0)
        axes_pairs.append((ax_top, ax_bot))

    # Data per frame; the last frame is always the "disturb" one.
    styles = ["steady", "disturb"] if p.n_frames == 2 else ["steady"] * (p.n_frames - 1) + ["disturb"]
    for idx, ((ax_top, ax_bot), style) in enumerate(zip(axes_pairs, styles)):
        t, Y = make_frame_continuous(p.seed + 10 * idx, p.seconds, p.fs, p.n_curves, style=style)
        ids = make_frame_discrete(p.seed + 10 * idx, p.n_bins, p.disc_vocab, style=style)
        draw_frame(ax_top, ax_bot, t, Y, ids, p)

    # Arrow(s) centered in the gap(s) between consecutive frames (figure coords).
    if p.n_frames >= 2:
        for f in range(p.n_frames - 1):
            x_left = p.margin_left + f * (frame_w + p.frame_gap) + frame_w
            x_right = p.margin_left + (f + 1) * (frame_w + p.frame_gap)
            x_mid = (x_left + x_right) / 2
            # Arrow y sits at the vertical middle of the frame stack.
            y_mid = bot_y + (bot_h + v_gap + top_h) / 2

            arr = FancyArrowPatch(
                (x_mid - 0.015, y_mid),
                (x_mid + 0.015, y_mid),
                transform=fig.transFigure,
                arrowstyle="-|>",
                mutation_scale=18,
                linewidth=1.6,
                color="black",
            )
            # BUGFIX: mutating fig.patches directly (fig.patches.append(arr)) is
            # deprecated and not reliably honored by modern matplotlib's managed
            # Artist lists; Figure.add_artist() registers the arrow properly.
            fig.add_artist(arr)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", transparent=True, bbox_inches="tight", pad_inches=0.0)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point for the Option B story-strip figure."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=Path("synth_ics_optionB.svg"))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=8.0)
    parser.add_argument("--fs", type=int, default=250)
    parser.add_argument("--frames", type=int, default=2, choices=[2, 3], help="2 or 3 frames (story strip)")
    parser.add_argument("--curves", type=int, default=3)
    parser.add_argument("--bins", type=int, default=32)
    parser.add_argument("--vocab", type=int, default=8)
    parser.add_argument("--no-align", action="store_true")
    parser.add_argument("--no-regime", action="store_true")
    parser.add_argument("--no-spines", action="store_true")
    args = parser.parse_args()

    params = Params(
        seed=args.seed,
        seconds=args.seconds,
        fs=args.fs,
        n_frames=args.frames,
        n_curves=args.curves,
        n_bins=args.bins,
        disc_vocab=args.vocab,
        show_alignment_line=not args.no_align,
        show_regime_window=not args.no_regime,
        show_axes_spines=not args.no_spines,
    )

    draw_optionB(args.out, params)
    print(f"Wrote: {args.out}")


if __name__ == "__main__":
    main()
|
||||
201
arxiv-style/fig-scripts/draw_transformer_lower_half.py
Normal file
201
arxiv-style/fig-scripts/draw_transformer_lower_half.py
Normal file
@@ -0,0 +1,201 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Draw the *Transformer section* lower-half visuals:
|
||||
- Continuous channels: multiple smooth curves (like the colored trend lines)
|
||||
- Discrete channels: small colored bars/ticks along the bottom
|
||||
|
||||
Output: ONE SVG with transparent background, axes hidden.
|
||||
|
||||
Run:
|
||||
uv run python draw_transformer_lower_half.py --out ./assets/transformer_lower_half.svg
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Rectangle
|
||||
|
||||
|
||||
@dataclass
class Params:
    """Tunables for the Transformer-section lower-half figure."""

    seed: int = 7
    seconds: float = 10.0
    fs: int = 300

    # Continuous channels
    n_curves: int = 3
    curve_lw: float = 2.4

    # Discrete bars
    n_bins: int = 40          # number of discrete bars/ticks across time
    bar_height: float = 0.11  # relative height inside bar strip axis
    bar_gap: float = 0.08     # gap between bars (fraction of bar width)

    # Canvas sizing
    width_in: float = 5.8
    height_in: float = 1.9
|
||||
|
||||
|
||||
def _smooth(x: np.ndarray, win: int) -> np.ndarray:
|
||||
win = max(3, int(win) | 1) # odd
|
||||
k = np.ones(win, dtype=float)
|
||||
k /= k.sum()
|
||||
return np.convolve(x, k, mode="same")
|
||||
|
||||
|
||||
def make_continuous_curves(p: Params) -> tuple[np.ndarray, np.ndarray]:
    """
    Produce p.n_curves smooth curves with gentle long-term temporal patterning.

    Each curve is a slow sinusoid plus a weaker mid-frequency sinusoid, two
    random Gaussian bumps, and lightly smoothed noise; the result is z-scored
    and rescaled so its standard deviation is about 0.42.

    Returns:
        t: (T,) time axis in seconds
        Y: (n_curves, T)
    """
    rng = np.random.default_rng(p.seed)
    T = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, T, endpoint=False)

    Y = []
    base_freqs = [0.12, 0.09, 0.15]  # slow-trend frequencies (Hz)
    mid_freqs = [0.65, 0.85, 0.75]   # mid-wiggle frequencies (Hz)

    for i in range(p.n_curves):
        f1 = base_freqs[i % len(base_freqs)]
        f2 = mid_freqs[i % len(mid_freqs)]
        ph = rng.uniform(0, 2 * np.pi)  # random phase per curve

        # Smooth trend + mid wiggle
        y = (
            1.00 * np.sin(2 * np.pi * f1 * t + ph)
            + 0.35 * np.sin(2 * np.pi * f2 * t + 0.7 * ph)
        )

        # Add a couple of smooth bumps (like slow pattern changes)
        bumps = np.zeros_like(t)
        for _ in range(2):
            mu = rng.uniform(0.8, p.seconds - 0.8)  # bump center, away from edges
            sig = rng.uniform(0.35, 0.75)           # bump width (s)
            bumps += np.exp(-0.5 * ((t - mu) / sig) ** 2)
        y += 0.55 * bumps

        # Mild smooth noise (~40 ms box filter)
        noise = _smooth(rng.normal(0, 1, size=T), win=int(p.fs * 0.04))
        y += 0.12 * noise

        # Normalize and compress amplitude to fit nicely
        y = (y - y.mean()) / (y.std() + 1e-9)
        y *= 0.42

        Y.append(y)

    return t, np.vstack(Y)
|
||||
|
||||
|
||||
def make_discrete_bars(p: Params) -> np.ndarray:
    """
    Generate discrete "token-like" category ids across p.n_bins time bins.

    The sequence is piecewise-constant: each bin keeps the previous symbol
    unless a 25% coin flip (or the very first bin) starts a new run.

    Returns:
        ids: (n_bins,) integer category ids in [0, 8)
    """
    rng = np.random.default_rng(p.seed + 123)
    n_bins = p.n_bins

    ids = np.zeros(n_bins, dtype=int)
    # Initial draw kept to preserve the RNG stream; it is always replaced at bin 0.
    current = rng.integers(0, 8)
    for bin_idx in range(n_bins):
        # Short-circuit: no coin flip is consumed on the first bin.
        if bin_idx == 0 or rng.random() < 0.25:
            current = rng.integers(0, 8)
        ids[bin_idx] = current
    return ids
|
||||
|
||||
|
||||
def draw_transformer_lower_half_svg(out_path: Path, p: Params) -> None:
    """Render the continuous curves plus the discrete bar strip as ONE
    transparent, axis-free SVG at *out_path* (parent dirs are created)."""
    # --- Figure + transparent background ---
    fig = plt.figure(figsize=(p.width_in, p.height_in), dpi=200)
    fig.patch.set_alpha(0.0)

    # Two stacked axes: curves (top), bars (bottom)
    # Tight, diagram-style layout
    ax_curves = fig.add_axes([0.06, 0.28, 0.90, 0.68])  # [left, bottom, width, height]
    ax_bars = fig.add_axes([0.06, 0.10, 0.90, 0.14])

    ax_curves.patch.set_alpha(0.0)
    ax_bars.patch.set_alpha(0.0)

    # Hide all axis furniture: this output is meant to be embedded in a diagram.
    for ax in (ax_curves, ax_bars):
        ax.set_axis_off()

    # --- Data ---
    t, Y = make_continuous_curves(p)
    ids = make_discrete_bars(p)

    # --- Continuous curves (explicit colors to match the “multi-colored” look) ---
    # Feel free to swap these hex colors to match your figure theme.
    curve_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # blue / orange / green

    for i in range(Y.shape[0]):
        ax_curves.plot(t, Y[i], linewidth=p.curve_lw, color=curve_colors[i % len(curve_colors)])

    # Set curve bounds with padding (keeps it clean)
    ymin, ymax = float(Y.min()), float(Y.max())
    pad = 0.10 * (ymax - ymin + 1e-9)  # +1e-9 guards a flat signal
    ax_curves.set_xlim(t[0], t[-1])
    ax_curves.set_ylim(ymin - pad, ymax + pad)

    # --- Discrete bars: small colored rectangles along the timeline ---
    # A small palette for categories (repeats if more categories appear)
    bar_palette = [
        "#e41a1c", "#377eb8", "#4daf4a", "#984ea3",
        "#ff7f00", "#ffff33", "#a65628", "#f781bf",
    ]

    # Convert bins into time spans
    n = len(ids)
    x0, x1 = t[0], t[-1]
    total = x1 - x0
    bin_w = total / n
    gap = p.bar_gap * bin_w

    # Draw bars in [0,1] y-space inside ax_bars
    ax_bars.set_xlim(x0, x1)
    ax_bars.set_ylim(0, 1)

    for i, cat in enumerate(ids):
        left = x0 + i * bin_w + gap / 2
        width = bin_w - gap
        color = bar_palette[int(cat) % len(bar_palette)]
        rect = Rectangle(
            (left, (1 - p.bar_height) / 2),  # vertically centered in the strip
            width,
            p.bar_height,
            facecolor=color,
            edgecolor="none",
        )
        ax_bars.add_patch(rect)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", transparent=True, bbox_inches="tight", pad_inches=0.0)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse options, build Params, and write the SVG."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=Path("transformer_lower_half.svg"))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=10.0)
    parser.add_argument("--fs", type=int, default=300)
    parser.add_argument("--bins", type=int, default=40)
    args = parser.parse_args()

    cfg = Params(seed=args.seed, seconds=args.seconds, fs=args.fs, n_bins=args.bins)
    draw_transformer_lower_half_svg(args.out, cfg)
    print(f"Wrote: {args.out}")


if __name__ == "__main__":
    main()
|
||||
202
arxiv-style/fig-scripts/draw_transformer_lower_half_axes.py
Normal file
202
arxiv-style/fig-scripts/draw_transformer_lower_half_axes.py
Normal file
@@ -0,0 +1,202 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transformer section lower-half visuals WITH AXES ONLY:
|
||||
- Axes spines visible
|
||||
- NO numbers (tick labels hidden)
|
||||
- NO words (axis labels removed)
|
||||
- Transparent background
|
||||
- One SVG output
|
||||
|
||||
Run:
|
||||
uv run python draw_transformer_lower_half_axes_only.py --out ./assets/transformer_lower_half_axes_only.svg
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
from matplotlib.patches import Rectangle
|
||||
|
||||
|
||||
@dataclass
class Params:
    """Configuration for the axes-only variant of the lower-half figure."""
    seed: int = 7           # RNG seed for reproducible synthetic data
    seconds: float = 10.0   # signal duration in seconds
    fs: int = 300           # samples per second

    # Continuous channels
    n_curves: int = 3       # number of smooth trend curves
    curve_lw: float = 2.4   # line width for the curves

    # Discrete bars
    n_bins: int = 40
    bar_height: float = 0.55  # fraction of the discrete-axis y-range
    bar_gap: float = 0.08  # fraction of bar width

    # Figure size
    width_in: float = 6.6   # inches
    height_in: float = 2.6  # inches
|
||||
|
||||
|
||||
def _smooth(x: np.ndarray, win: int) -> np.ndarray:
|
||||
win = max(3, int(win) | 1) # odd
|
||||
k = np.ones(win, dtype=float)
|
||||
k /= k.sum()
|
||||
return np.convolve(x, k, mode="same")
|
||||
|
||||
|
||||
def make_continuous_curves(p: Params) -> tuple[np.ndarray, np.ndarray]:
    """Synthesize p.n_curves smooth trend curves.

    Each curve: slow sinusoid + weaker mid-frequency sinusoid + two random
    Gaussian bumps + lightly smoothed noise, z-scored then scaled to std 0.42.

    Returns:
        t: (T,) time axis in seconds
        Y: (n_curves, T)
    """
    rng = np.random.default_rng(p.seed)
    T = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, T, endpoint=False)

    Y = []
    base_freqs = [0.12, 0.09, 0.15]  # slow-trend frequencies (Hz)
    mid_freqs = [0.65, 0.85, 0.75]   # mid-wiggle frequencies (Hz)

    for i in range(p.n_curves):
        f1 = base_freqs[i % len(base_freqs)]
        f2 = mid_freqs[i % len(mid_freqs)]
        ph = rng.uniform(0, 2 * np.pi)  # random phase per curve

        y = (
            1.00 * np.sin(2 * np.pi * f1 * t + ph)
            + 0.35 * np.sin(2 * np.pi * f2 * t + 0.7 * ph)
        )

        # Two random Gaussian bumps mimic slow pattern changes.
        bumps = np.zeros_like(t)
        for _ in range(2):
            mu = rng.uniform(0.8, p.seconds - 0.8)
            sig = rng.uniform(0.35, 0.75)
            bumps += np.exp(-0.5 * ((t - mu) / sig) ** 2)
        y += 0.55 * bumps

        # Lightly smoothed noise (~40 ms window) for texture.
        noise = _smooth(rng.normal(0, 1, size=T), win=int(p.fs * 0.04))
        y += 0.12 * noise

        # z-score, then compress the amplitude to fit the strip nicely.
        y = (y - y.mean()) / (y.std() + 1e-9)
        y *= 0.42
        Y.append(y)

    return t, np.vstack(Y)
|
||||
|
||||
|
||||
def make_discrete_bars(p: Params) -> np.ndarray:
    """Piecewise-constant category ids over p.n_bins bins (vocabulary of 8)."""
    rng = np.random.default_rng(p.seed + 123)
    total_bins = p.n_bins

    ids = np.zeros(total_bins, dtype=int)
    # Drawn once up front to preserve the RNG stream; replaced at bin 0.
    symbol = rng.integers(0, 8)
    for pos in range(total_bins):
        # First bin always starts a run; later bins switch with p=0.25.
        if pos == 0 or rng.random() < 0.25:
            symbol = rng.integers(0, 8)
        ids[pos] = symbol
    return ids
|
||||
|
||||
|
||||
def _axes_only(ax: plt.Axes) -> None:
    """Strip an axes down to its spines: no labels, no ticks, no grid."""
    # Remove every piece of text.
    ax.set_xlabel("")
    ax.set_ylabel("")
    ax.set_title("")

    # All four spines stay visible — they are the only remaining element.
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(True)

    # Clear tick locations, then disable marks and labels for good measure.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.tick_params(
        axis="both",
        which="both",
        bottom=False,
        left=False,
        top=False,
        right=False,
        labelbottom=False,
        labelleft=False,
    )

    ax.grid(False)
|
||||
|
||||
|
||||
def draw_transformer_lower_half_svg(out_path: Path, p: Params) -> None:
    """Render curves (top) and the discrete bar strip (bottom) into one
    transparent SVG at *out_path*, keeping only bare axes spines."""
    fig = plt.figure(figsize=(p.width_in, p.height_in), dpi=200)
    fig.patch.set_alpha(0.0)

    # Two axes sharing x (top curves, bottom bars)
    ax_curves = fig.add_axes([0.10, 0.38, 0.86, 0.56])
    ax_bars = fig.add_axes([0.10, 0.14, 0.86, 0.18], sharex=ax_curves)

    # Transparent axes backgrounds so only the drawn content shows.
    ax_curves.patch.set_alpha(0.0)
    ax_bars.patch.set_alpha(0.0)

    # Data
    t, Y = make_continuous_curves(p)
    ids = make_discrete_bars(p)

    # Top: continuous curves
    curve_colors = ["#1f77b4", "#ff7f0e", "#2ca02c"]  # blue / orange / green
    for i in range(Y.shape[0]):
        ax_curves.plot(t, Y[i], linewidth=p.curve_lw, color=curve_colors[i % len(curve_colors)])

    ymin, ymax = float(Y.min()), float(Y.max())
    ypad = 0.10 * (ymax - ymin + 1e-9)  # +1e-9 guards a flat signal
    ax_curves.set_xlim(t[0], t[-1])
    ax_curves.set_ylim(ymin - ypad, ymax + ypad)

    # Bottom: discrete bars (colored strip)
    bar_palette = [
        "#e41a1c", "#377eb8", "#4daf4a", "#984ea3",
        "#ff7f00", "#ffff33", "#a65628", "#f781bf",
    ]

    # Map bins onto the shared time axis.
    x0, x1 = t[0], t[-1]
    total = x1 - x0
    n = len(ids)
    bin_w = total / n
    gap = p.bar_gap * bin_w

    ax_bars.set_xlim(x0, x1)
    ax_bars.set_ylim(0, 1)

    bar_y = (1 - p.bar_height) / 2  # vertically center bars in the strip
    for i, cat in enumerate(ids):
        left = x0 + i * bin_w + gap / 2
        width = bin_w - gap
        color = bar_palette[int(cat) % len(bar_palette)]
        ax_bars.add_patch(Rectangle((left, bar_y), width, p.bar_height, facecolor=color, edgecolor="none"))

    # Apply "axes only" styling (no numbers/words)
    _axes_only(ax_curves)
    _axes_only(ax_bars)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", transparent=True, bbox_inches="tight", pad_inches=0.0)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: parse options and render the axes-only SVG."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--out", type=Path, default=Path("transformer_lower_half_axes_only.svg"))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=10.0)
    parser.add_argument("--fs", type=int, default=300)
    parser.add_argument("--bins", type=int, default=40)
    parser.add_argument("--curves", type=int, default=3)
    args = parser.parse_args()

    cfg = Params(
        seed=args.seed,
        seconds=args.seconds,
        fs=args.fs,
        n_bins=args.bins,
        n_curves=args.curves,
    )
    draw_transformer_lower_half_svg(args.out, cfg)
    print(f"Wrote: {args.out}")


if __name__ == "__main__":
    main()
|
||||
161
arxiv-style/fig-scripts/gen_noise_ddmp.py
Normal file
161
arxiv-style/fig-scripts/gen_noise_ddmp.py
Normal file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate "Noisy Residual" and "Denoised Residual" curves as SVGs.
|
||||
|
||||
- Produces TWO separate SVG files:
|
||||
noisy_residual.svg
|
||||
denoised_residual.svg
|
||||
|
||||
- Curves are synthetic but shaped like residual noise + denoised residual.
|
||||
- Uses only matplotlib + numpy.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
@dataclass
class CurveParams:
    """Knobs for the synthetic noisy/denoised residual pair."""
    seconds: float = 12.0  # length of the signal
    fs: int = 250  # samples per second
    seed: int = 7  # RNG seed for reproducibility
    base_amp: float = 0.12  # smooth baseline amplitude
    noise_amp: float = 0.55  # high-frequency noise amplitude
    burst_amp: float = 1.2  # occasional spike amplitude
    burst_rate_hz: float = 0.35  # average spike frequency
    denoise_smooth_ms: float = 120  # smoothing window for "denoised" (ms)
|
||||
|
||||
|
||||
def gaussian_smooth(x: np.ndarray, sigma_samples: float) -> np.ndarray:
    """Gaussian smoothing using explicit kernel convolution (no SciPy dependency)."""
    if sigma_samples <= 0:
        # Degenerate width: smoothing is the identity; return a defensive copy.
        return x.copy()

    radius = int(np.ceil(4 * sigma_samples))  # truncate the kernel at 4 sigma
    offsets = np.arange(-radius, radius + 1, dtype=float)
    weights = np.exp(-(offsets**2) / (2 * sigma_samples**2))
    weights /= weights.sum()  # normalize so the filter preserves the mean
    return np.convolve(x, weights, mode="same")
|
||||
|
||||
|
||||
def make_residual(params: CurveParams) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """
    Create synthetic residual:
      - baseline: smooth wavy trend + slight drift
      - noise: band-limited-ish high-frequency noise
      - bursts: sparse spikes / impulse-like events
    Returns: (t, noisy, denoised)
    """
    rng = np.random.default_rng(params.seed)
    n = int(params.seconds * params.fs)
    t = np.linspace(0, params.seconds, n, endpoint=False)

    # Smooth baseline (small): combination of sinusoids + small random drift
    baseline = (
        0.7 * np.sin(2 * np.pi * 0.35 * t + 0.2)
        + 0.35 * np.sin(2 * np.pi * 0.9 * t + 1.2)
        + 0.25 * np.sin(2 * np.pi * 0.15 * t + 2.0)
    )
    baseline *= params.base_amp
    # Random walk rescaled to a quarter of the baseline amplitude.
    drift = np.cumsum(rng.normal(0, 1, size=n))
    drift = drift / (np.max(np.abs(drift)) + 1e-9) * (params.base_amp * 0.25)
    baseline = baseline + drift

    # High-frequency noise: whitened then lightly smoothed to look "oscillatory"
    raw = rng.normal(0, 1, size=n)
    hf = raw - gaussian_smooth(raw, sigma_samples=params.fs * 0.03)  # remove slow part
    hf = hf / (np.std(hf) + 1e-9)  # unit variance before scaling
    hf *= params.noise_amp

    # Bursts/spikes: Poisson process impulses convolved with short kernel
    expected_bursts = params.burst_rate_hz * params.seconds
    k_bursts = rng.poisson(expected_bursts)
    impulses = np.zeros(n)
    if k_bursts > 0:
        idx = rng.integers(0, n, size=k_bursts)
        impulses[idx] = rng.normal(loc=1.0, scale=0.4, size=k_bursts)

    # Shape impulses into spikes (asymmetric bump)
    spike_kernel_len = int(params.fs * 0.06)  # ~60ms
    spike_kernel_len = max(spike_kernel_len, 7)  # guard against very low fs
    spike_t = np.arange(spike_kernel_len)
    spike_kernel = np.exp(-spike_t / (params.fs * 0.012))  # fast decay
    spike_kernel *= np.hanning(spike_kernel_len)  # taper
    spike_kernel /= (spike_kernel.max() + 1e-9)  # peak-normalize

    bursts = np.convolve(impulses, spike_kernel, mode="same")
    bursts *= params.burst_amp

    noisy = baseline + hf + bursts

    # "Denoised": remove high-frequency using Gaussian smoothing,
    # but keep spike structures partially.
    smooth_sigma = (params.denoise_smooth_ms / 1000.0) * params.fs / 3.0
    denoised = gaussian_smooth(noisy, sigma_samples=smooth_sigma)

    return t, noisy, denoised
|
||||
|
||||
|
||||
def save_curve_svg(
    t: np.ndarray,
    y: np.ndarray,
    out_path: Path,
    *,
    width_in: float = 5.4,
    height_in: float = 1.6,
    lw: float = 2.2,
    pad: float = 0.03,
) -> None:
    """
    Save a clean, figure-only SVG suitable for embedding in diagrams.
    - No axes, ticks, labels.
    - Tight bounding box.

    Args:
        t, y: curve data.
        out_path: destination SVG path (parent dirs are created).
        width_in, height_in: figure size in inches.
        lw: line width.
        pad: inner margin as a figure fraction.
    """
    fig = plt.figure(figsize=(width_in, height_in), dpi=200)
    ax = fig.add_axes([pad, pad, 1 - 2 * pad, 1 - 2 * pad])

    ax.plot(t, y, linewidth=lw)

    # Make it "icon-like" for diagrams: no axes or frames
    ax.set_axis_off()

    # Ensure bounds include a little padding
    ymin, ymax = np.min(y), np.max(y)
    ypad = 0.08 * (ymax - ymin + 1e-9)  # +1e-9 guards a flat signal
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(ymin - ypad, ymax + ypad)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out_path, format="svg", bbox_inches="tight", pad_inches=0.0)
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: synthesize the residual pair and write both SVGs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=Path, default=Path("."), help="Output directory")
    parser.add_argument("--seed", type=int, default=7, help="RNG seed")
    parser.add_argument("--seconds", type=float, default=12.0, help="Signal length (s)")
    parser.add_argument("--fs", type=int, default=250, help="Sampling rate (Hz)")
    parser.add_argument("--prefix", type=str, default="", help="Filename prefix (optional)")
    args = parser.parse_args()

    curve_params = CurveParams(seconds=args.seconds, fs=args.fs, seed=args.seed)
    t, noisy, denoised = make_residual(curve_params)

    noisy_path = args.outdir / f"{args.prefix}noisy_residual.svg"
    den_path = args.outdir / f"{args.prefix}denoised_residual.svg"

    save_curve_svg(t, noisy, noisy_path)
    save_curve_svg(t, denoised, den_path)

    print(f"Wrote:\n  {noisy_path}\n  {den_path}")


if __name__ == "__main__":
    main()
|
||||
188
arxiv-style/fig-scripts/make_ddpm_like_svg.py
Normal file
188
arxiv-style/fig-scripts/make_ddpm_like_svg.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
DDPM-like residual curve SVGs (separate files, fixed colors):
|
||||
- noisy_residual.svg (blue)
|
||||
- denoised_residual.svg (purple)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
@dataclass
class DDPMStyleParams:
    """Shape and noise knobs for the DDPM-like residual pair."""
    seconds: float = 12.0  # signal length (s)
    fs: int = 250          # samples per second
    seed: int = 7          # RNG seed

    baseline_amp: float = 0.10       # slow baseline amplitude
    mid_wiggle_amp: float = 0.18     # mid-frequency wiggle amplitude
    colored_noise_amp: float = 0.65  # colored-noise amplitude
    colored_alpha: float = 1.0       # 1/f^alpha spectral exponent

    burst_rate_hz: float = 0.30  # average spikes per second
    burst_amp: float = 0.9       # spike amplitude
    burst_width_ms: float = 55   # spike kernel width (ms)

    # Denoising: blend of Gaussian smoothings at these sigmas/weights,
    # plus a small fraction of the high-frequency texture added back.
    denoise_sigmas_ms: tuple[float, ...] = (25, 60, 140)
    denoise_weights: tuple[float, ...] = (0.25, 0.35, 0.40)
    denoise_texture_keep: float = 0.10
|
||||
|
||||
|
||||
def gaussian_smooth(x: np.ndarray, sigma_samples: float) -> np.ndarray:
    """Convolve *x* with a normalized Gaussian; identity copy when sigma <= 0."""
    if sigma_samples <= 0:
        return x.copy()
    half = int(np.ceil(4 * sigma_samples))  # 4-sigma truncation radius
    taps = np.arange(-half, half + 1, dtype=float)
    bell = np.exp(-(taps**2) / (2 * sigma_samples**2))
    return np.convolve(x, bell / bell.sum(), mode="same")
|
||||
|
||||
|
||||
def colored_noise_1_f(n: int, rng: np.random.Generator, alpha: float) -> np.ndarray:
    """Zero-mean, unit-variance 1/f**alpha colored noise of length *n* from *rng*."""
    # Start from white noise and reshape its amplitude spectrum.
    white = rng.normal(0, 1, size=n)
    spec = np.fft.rfft(white)

    freqs = np.fft.rfftfreq(n, d=1.0)
    scale = np.ones_like(freqs)
    positive = freqs > 0
    # Leave DC untouched; damp each positive frequency by f^(alpha/2).
    scale[positive] = 1.0 / (freqs[positive] ** (alpha / 2.0))

    shaped = np.fft.irfft(spec * scale, n=n)

    # Normalize to zero mean and (approximately) unit variance.
    shaped = shaped - np.mean(shaped)
    return shaped / (np.std(shaped) + 1e-9)
|
||||
|
||||
|
||||
def make_ddpm_like_residual(p: DDPMStyleParams) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Build (t, noisy, denoised) residual curves.

    noisy = slow baseline + mid-frequency wiggle + 1/f colored noise + sparse
    bursts. denoised = weighted blend of Gaussian smoothings of `noisy`, plus
    a small amount of the original high-frequency texture added back.
    """
    rng = np.random.default_rng(p.seed)
    n = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n, endpoint=False)

    # Slow baseline: two very low-frequency sinusoids.
    baseline = (
        0.8 * np.sin(2 * np.pi * 0.18 * t + 0.4)
        + 0.35 * np.sin(2 * np.pi * 0.06 * t + 2.2)
    ) * p.baseline_amp

    # Mid-frequency wiggle.
    mid = (
        0.9 * np.sin(2 * np.pi * 0.9 * t + 1.1)
        + 0.5 * np.sin(2 * np.pi * 1.6 * t + 0.2)
        + 0.3 * np.sin(2 * np.pi * 2.4 * t + 2.6)
    ) * p.mid_wiggle_amp

    col = colored_noise_1_f(n, rng, alpha=p.colored_alpha) * p.colored_noise_amp

    # Sparse bursts: Poisson-count impulses shaped by a decaying, tapered kernel.
    expected = p.burst_rate_hz * p.seconds
    k = rng.poisson(expected)
    impulses = np.zeros(n)
    if k > 0:
        idx = rng.integers(0, n, size=k)
        impulses[idx] = rng.normal(loc=1.0, scale=0.35, size=k)

    width = max(int(p.fs * (p.burst_width_ms / 1000.0)), 7)  # at least 7 taps
    u = np.arange(width)
    kernel = np.exp(-u / (p.fs * 0.012)) * np.hanning(width)
    kernel /= (kernel.max() + 1e-9)  # peak-normalize
    bursts = np.convolve(impulses, kernel, mode="same") * p.burst_amp

    noisy = baseline + mid + col + bursts

    # Multi-scale Gaussian smoothing blended by the configured weights.
    sigmas_samples = [(ms / 1000.0) * p.fs / 3.0 for ms in p.denoise_sigmas_ms]
    smooths = [gaussian_smooth(noisy, s) for s in sigmas_samples]

    den_base = np.zeros_like(noisy)
    for w, sm in zip(p.denoise_weights, smooths):
        den_base += w * sm

    # Re-inject a hint of high-frequency texture so the result is not over-smooth.
    hf = noisy - gaussian_smooth(noisy, sigma_samples=p.fs * 0.03)
    denoised = den_base + p.denoise_texture_keep * (hf / (np.std(hf) + 1e-9)) * (0.10 * np.std(den_base))

    return t, noisy, denoised
|
||||
|
||||
|
||||
def save_single_curve_svg(
    t: np.ndarray,
    y: np.ndarray,
    out_path: Path,
    *,
    color: str,
    lw: float = 2.2,
) -> None:
    """Save one curve as a transparent, axis-free SVG at *out_path*.

    Args:
        t, y: curve data.
        out_path: destination path (parent dirs are created).
        color: matplotlib color for the line.
        lw: line width.
    """
    fig = plt.figure(figsize=(5.4, 1.6), dpi=200)

    # Make figure background transparent
    fig.patch.set_alpha(0.0)

    ax = fig.add_axes([0.03, 0.03, 0.94, 0.94])

    # Make axes background transparent
    ax.patch.set_alpha(0.0)

    ax.plot(t, y, linewidth=lw, color=color)

    # clean, diagram-friendly
    ax.set_axis_off()
    ymin, ymax = np.min(y), np.max(y)
    ypad = 0.08 * (ymax - ymin + 1e-9)  # +1e-9 guards a flat signal
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(ymin - ypad, ymax + ypad)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(
        out_path,
        format="svg",
        bbox_inches="tight",
        pad_inches=0.0,
        transparent=True,  # <-- key for transparent output
    )
    plt.close(fig)
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: build params, synthesize the residual pair, save SVGs."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=Path, default=Path("."))
    parser.add_argument("--seed", type=int, default=7)
    parser.add_argument("--seconds", type=float, default=12.0)
    parser.add_argument("--fs", type=int, default=250)

    parser.add_argument("--alpha", type=float, default=1.0)
    parser.add_argument("--noise-amp", type=float, default=0.65)
    parser.add_argument("--texture-keep", type=float, default=0.10)

    parser.add_argument("--prefix", type=str, default="")
    args = parser.parse_args()

    style = DDPMStyleParams(
        seconds=args.seconds,
        fs=args.fs,
        seed=args.seed,
        colored_alpha=args.alpha,
        colored_noise_amp=args.noise_amp,
        denoise_texture_keep=args.texture_keep,
    )

    t, noisy, den = make_ddpm_like_residual(style)

    noisy_path = args.outdir / f"{args.prefix}noisy_residual.svg"
    den_path = args.outdir / f"{args.prefix}denoised_residual.svg"

    # Fixed colors as you requested
    save_single_curve_svg(t, noisy, noisy_path, color="blue")
    save_single_curve_svg(t, den, den_path, color="purple")

    print("Wrote:")
    print(f"  {noisy_path}")
    print(f"  {den_path}")


if __name__ == "__main__":
    main()
|
||||
10
arxiv-style/fig-scripts/pyproject.toml
Normal file
10
arxiv-style/fig-scripts/pyproject.toml
Normal file
@@ -0,0 +1,10 @@
|
||||
[project]
|
||||
name = "fig-gen-ddpm"
|
||||
version = "0.1.0"
|
||||
description = "NumPy/matplotlib scripts that generate the paper's figure assets (residual curves, transformer visuals, 3D waterfall)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.12"
|
||||
dependencies = [
|
||||
"numpy>=1.26",
|
||||
"matplotlib>=3.8",
|
||||
]
|
||||
240
arxiv-style/fig-scripts/synth_ics_3d_waterfall.py
Normal file
240
arxiv-style/fig-scripts/synth_ics_3d_waterfall.py
Normal file
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
3D "final combined outcome" (time × channel × value) with:
|
||||
- NO numbers on axes (tick labels removed)
|
||||
- Axis *titles* kept (texts are okay)
|
||||
- Reduced whitespace: tight bbox + minimal margins
|
||||
- White background (non-transparent) suitable for embedding into another SVG
|
||||
|
||||
Output:
|
||||
default PNG, optional SVG (2D projected vectors)
|
||||
|
||||
Run:
|
||||
uv run python synth_ics_3d_waterfall_tight.py --out ./assets/synth_ics_3d.png
|
||||
uv run python synth_ics_3d_waterfall_tight.py --out ./assets/synth_ics_3d.svg --format svg
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
@dataclass
class Params:
    """Configuration for the 3D time x channel x value waterfall figure."""
    seed: int = 7          # RNG seed for reproducible synthetic data
    seconds: float = 10.0  # signal duration in seconds
    fs: int = 220          # samples per second

    n_cont: int = 5      # number of continuous channels
    n_disc: int = 2      # number of discrete channels
    disc_vocab: int = 8  # categorical vocabulary size
    disc_change_rate_hz: float = 1.1  # expected symbol changes per second

    # view (camera angles in degrees)
    elev: float = 25.0
    azim: float = -58.0

    # figure size (smaller, more "cube-like")
    fig_w: float = 5.4
    fig_h: float = 5.0

    # discrete rendering: symbol id is mapped to z = offset + scale * id
    disc_z_scale: float = 0.45
    disc_z_offset: float = -1.4

    # margins (figure fraction)
    left: float = 0.03
    right: float = 0.99
    bottom: float = 0.03
    top: float = 0.99
|
||||
|
||||
|
||||
def _smooth(x: np.ndarray, win: int) -> np.ndarray:
|
||||
win = max(3, int(win) | 1)
|
||||
k = np.ones(win, dtype=float)
|
||||
k /= k.sum()
|
||||
return np.convolve(x, k, mode="same")
|
||||
|
||||
|
||||
def make_continuous(p: Params) -> tuple[np.ndarray, np.ndarray]:
    """Synthesize p.n_cont smooth, z-scored continuous channels.

    Returns:
        t: (T,) time axis in seconds
        Y: (n_cont, T) channel values
    """
    rng = np.random.default_rng(p.seed)
    T = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, T, endpoint=False)

    Y = []
    base_freqs = [0.08, 0.10, 0.12, 0.09, 0.11]  # slow-trend frequencies (Hz)
    mid_freqs = [0.55, 0.70, 0.85, 0.62, 0.78]   # mid-wiggle frequencies (Hz)

    for i in range(p.n_cont):
        f1 = base_freqs[i % len(base_freqs)]
        f2 = mid_freqs[i % len(mid_freqs)]
        ph = rng.uniform(0, 2 * np.pi)  # random phase per channel

        # Slow trend plus a weaker mid-frequency wiggle.
        y = (
            0.95 * np.sin(2 * np.pi * f1 * t + ph)
            + 0.28 * np.sin(2 * np.pi * f2 * t + 0.65 * ph)
        )

        # 2-3 random Gaussian bumps mimic slow pattern changes.
        bumps = np.zeros_like(t)
        for _ in range(rng.integers(2, 4)):
            mu = rng.uniform(0.8, p.seconds - 0.8)
            sig = rng.uniform(0.25, 0.80)
            bumps += np.exp(-0.5 * ((t - mu) / (sig + 1e-9)) ** 2)
        y += 0.55 * bumps

        # Lightly smoothed noise (~50 ms window) for texture.
        noise = _smooth(rng.normal(0, 1, size=T), win=int(p.fs * 0.05))
        y += 0.10 * noise

        # z-score so all channels share a comparable scale.
        y = (y - y.mean()) / (y.std() + 1e-9)
        Y.append(y)

    return t, np.vstack(Y)  # (n_cont, T)
|
||||
|
||||
|
||||
def make_discrete(p: Params, t: np.ndarray) -> np.ndarray:
    """Piecewise-constant categorical channels on the same time axis as *t*.

    Returns:
        X: (n_disc, T) integer symbol ids in [0, disc_vocab)
    """
    rng = np.random.default_rng(p.seed + 123)
    T = len(t)

    expected_changes = max(1, int(p.seconds * p.disc_change_rate_hz))
    X = np.zeros((p.n_disc, T), dtype=int)

    for c in range(p.n_disc):
        # Random change points partition [0, T) into constant segments.
        k = rng.poisson(expected_changes) + 1
        pts = np.unique(rng.integers(0, T, size=k))
        pts = np.sort(np.concatenate([[0], pts, [T]]))

        cur = rng.integers(0, p.disc_vocab)
        for a, b in zip(pts[:-1], pts[1:]):
            # Usually (85%) pick a fresh symbol at each interior change point.
            if a != 0 and rng.random() < 0.85:
                cur = rng.integers(0, p.disc_vocab)
            X[c, a:b] = cur

    return X
|
||||
|
||||
|
||||
def style_3d_axes(ax):
    """Soften the 3D panes and grid: light pane edges, thin translucent grid."""
    try:
        # Keep pane fill ON (white background) but reduce edge prominence.
        # Pane attribute layout varies across matplotlib versions; best-effort.
        for axis3d in (ax.xaxis, ax.yaxis, ax.zaxis):
            axis3d.pane.set_edgecolor("0.7")
    except Exception:
        pass

    ax.grid(True, linewidth=0.4, alpha=0.30)
|
||||
|
||||
|
||||
def remove_tick_numbers_keep_axis_titles(ax):
    """Hide tick labels and marks on all three axes while keeping axis titles."""
    # Blank out the numeric labels first.
    ax.set_xticklabels([])
    ax.set_yticklabels([])
    ax.set_zticklabels([])

    # length=0 suppresses the marks themselves; pad=0 tightens spacing.
    ax.tick_params(axis="both", which="both", length=0, pad=0)

    # Some matplotlib versions handle the z axis separately in 3D; best-effort.
    try:
        ax.zaxis.set_tick_params(length=0, pad=0)
    except Exception:
        pass
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI options, synthesize channels, and render the 3D waterfall
    (continuous lines plus stepped discrete channels) to PNG or SVG."""
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", type=Path, default=Path("synth_ics_3d.png"))
    ap.add_argument("--format", choices=["png", "svg"], default="png")

    ap.add_argument("--seed", type=int, default=7)
    ap.add_argument("--seconds", type=float, default=10.0)
    ap.add_argument("--fs", type=int, default=220)

    ap.add_argument("--n-cont", type=int, default=5)
    ap.add_argument("--n-disc", type=int, default=2)
    ap.add_argument("--disc-vocab", type=int, default=8)
    ap.add_argument("--disc-rate", type=float, default=1.1)

    ap.add_argument("--elev", type=float, default=25.0)
    ap.add_argument("--azim", type=float, default=-58.0)

    ap.add_argument("--fig-w", type=float, default=5.4)
    ap.add_argument("--fig-h", type=float, default=5.0)

    ap.add_argument("--disc-z-scale", type=float, default=0.45)
    ap.add_argument("--disc-z-offset", type=float, default=-1.4)

    args = ap.parse_args()

    p = Params(
        seed=args.seed,
        seconds=args.seconds,
        fs=args.fs,
        n_cont=args.n_cont,
        n_disc=args.n_disc,
        disc_vocab=args.disc_vocab,
        disc_change_rate_hz=args.disc_rate,
        elev=args.elev,
        azim=args.azim,
        fig_w=args.fig_w,
        fig_h=args.fig_h,
        disc_z_scale=args.disc_z_scale,
        disc_z_offset=args.disc_z_offset,
    )

    t, Yc = make_continuous(p)
    Xd = make_discrete(p, t)

    fig = plt.figure(figsize=(p.fig_w, p.fig_h), dpi=220, facecolor="white")
    ax = fig.add_subplot(111, projection="3d")
    style_3d_axes(ax)

    # Reduce whitespace around axes (tight placement)
    fig.subplots_adjust(left=p.left, right=p.right, bottom=p.bottom, top=p.top)

    # Draw continuous channels: each channel occupies one y "lane".
    for i in range(p.n_cont):
        y = np.full_like(t, fill_value=i, dtype=float)
        z = Yc[i]
        ax.plot(t, y, z, linewidth=2.0)

    # Draw discrete channels as steps in the lanes after the continuous ones.
    for j in range(p.n_disc):
        ch = p.n_cont + j
        y = np.full_like(t, fill_value=ch, dtype=float)
        # Map integer symbol ids onto the z axis so they sit below the curves.
        z = p.disc_z_offset + p.disc_z_scale * Xd[j].astype(float)
        ax.step(t, y, z, where="post", linewidth=2.2)

    # Axis titles kept
    ax.set_xlabel("time")
    ax.set_ylabel("channel")
    ax.set_zlabel("value")

    # Remove numeric tick labels + tick marks
    remove_tick_numbers_keep_axis_titles(ax)

    # Camera
    ax.view_init(elev=p.elev, azim=p.azim)

    # Save tightly (minimize white border)
    args.out.parent.mkdir(parents=True, exist_ok=True)
    save_kwargs = dict(bbox_inches="tight", pad_inches=0.03, facecolor="white")
    # Either the explicit flag or an .svg suffix selects vector output.
    if args.format == "svg" or args.out.suffix.lower() == ".svg":
        fig.savefig(args.out, format="svg", **save_kwargs)
    else:
        fig.savefig(args.out, format="png", **save_kwargs)

    plt.close(fig)
    print(f"Wrote: {args.out}")


if __name__ == "__main__":
    main()
|
||||
262
arxiv-style/fig-scripts/transformer_math_figure.py
Normal file
262
arxiv-style/fig-scripts/transformer_math_figure.py
Normal file
@@ -0,0 +1,262 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Transformer-ish "trend" visuals with NO equations:
|
||||
- attention_weights.svg : heatmap-like attention map (looks like "Transformer attends to positions")
|
||||
- token_activation_trends.svg: multiple token-channel curves (continuous trends)
|
||||
- discrete_tokens.svg : step-like discrete channel trends (optional)
|
||||
|
||||
All SVGs have transparent background and no axes (diagram-friendly).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
# ----------------------------
|
||||
# Synthetic data generators
|
||||
# ----------------------------
|
||||
|
||||
@dataclass
class Params:
    """Configuration shared by the synthetic attention/trend generators.

    Only ``seed``, ``n_curves``, ``seconds`` and ``fs`` are read by the
    generator functions in this script; ``T`` is stored here but the CLI
    passes ``args.T`` to the attention map directly, and ``n_heads`` is
    not referenced by any function in this file.
    """
    seed: int = 7          # base RNG seed for reproducible generation
    T: int = 24            # sequence length (positions)
    n_heads: int = 4       # attention heads to blend/choose
    n_curves: int = 7      # curves in token_activation_trends
    seconds: float = 10.0  # duration of the simulated window (s)
    fs: int = 200          # sampling rate (samples per second)
def _gaussian(x: np.ndarray, mu: float, sig: float) -> np.ndarray:
    """Return an unnormalized Gaussian bump centered at *mu* with width *sig*.

    The small epsilon keeps the division finite when ``sig == 0``.
    """
    z = (x - mu) / (sig + 1e-9)
    return np.exp(-0.5 * np.square(z))
def make_attention_map(T: int, rng: np.random.Generator, mode: str) -> np.ndarray:
    """Build a transformer-like attention weight matrix of shape (T, T).

    Supported styles:
        - "local":  mostly near-diagonal attention
        - "global": local base plus a few broadly-attended key positions
        - "causal": lower-triangular (decoder-like) with local preference

    Rows are normalized with a softmax, so each query's weights sum to ~1.

    Raises:
        ValueError: if *mode* is not one of the styles above.
    """
    q = np.arange(T)[:, None]  # query positions (rows)
    k = np.arange(T)[None, :]  # key positions (columns)

    if mode == "local":
        scores = -((q - k) ** 2) / (2 * (2.2 ** 2))
        scores = scores + 0.15 * rng.normal(size=(T, T))
    elif mode == "global":
        scores = -((q - k) ** 2) / (2 * (3.0 ** 2))
        # A few "global" key positions that many queries attend to.
        anchors = rng.choice(T, size=max(2, T // 10), replace=False)
        for pos in anchors:
            scores = scores + 1.2 * _gaussian(k, mu=pos, sig=1.0)
        scores = scores + 0.12 * rng.normal(size=(T, T))
    elif mode == "causal":
        scores = -((q - k) ** 2) / (2 * (2.0 ** 2))
        scores = scores + 0.12 * rng.normal(size=(T, T))
        # Mask out future keys (strictly above the diagonal).
        scores = np.where(k <= q, scores, -1e9)
    else:
        raise ValueError(f"Unknown attention mode: {mode}")

    # Numerically stable row-wise softmax.
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    weights = np.exp(shifted)
    weights /= (np.sum(weights, axis=1, keepdims=True) + 1e-9)
    return weights
def make_token_activation_trends(p: Params) -> tuple[np.ndarray, np.ndarray]:
    """Generate smooth curves that feel like representations evolving over time.

    Each curve combines two sinusoids at curve-dependent frequencies, a
    few Gaussian bumps ("attention-like gating"), and box-filtered noise,
    then is standardized and vertically offset so the curves stack.

    Returns:
        t: (N,) time axis in seconds.
        Y: (p.n_curves, N) stacked curves.
    """
    rng = np.random.default_rng(p.seed)
    n_samples = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n_samples, endpoint=False)

    curves = []
    for idx in range(p.n_curves):
        # Multi-scale smooth components.
        slow_hz = 0.10 + 0.04 * idx
        fast_hz = 0.60 + 0.18 * (idx % 3)
        phase = rng.uniform(0, 2 * np.pi)
        base = (
            0.9 * np.sin(2 * np.pi * slow_hz * t + phase)
            + 0.35 * np.sin(2 * np.pi * fast_hz * t + 0.7 * phase)
        )

        # A handful of smooth bumps where the curve spikes.
        gates = np.zeros_like(t)
        n_bumps = rng.integers(2, 5)
        for _ in range(n_bumps):
            center = rng.uniform(0.5, p.seconds - 0.5)
            width = rng.uniform(0.15, 0.55)
            gates += 0.9 * _gaussian(t, mu=center, sig=width)

        # Gaussian noise smoothed with an 11-sample moving average.
        raw = rng.normal(0, 1, size=n_samples)
        smooth_noise = np.convolve(raw, np.ones(11) / 11.0, mode="same")

        y = base + 0.85 * gates + 0.12 * smooth_noise

        # Standardize, then shrink and offset for visual stacking.
        y = (y - y.mean()) / (y.std() + 1e-9)
        curves.append(0.75 * y + 0.18 * idx)

    return t, np.vstack(curves)
def make_discrete_trends(p: Params, vocab: int = 9, change_rate_hz: float = 1.3) -> tuple[np.ndarray, np.ndarray]:
    """Generate step-like integer channels (a "token-id / discrete feature" feel).

    Each channel holds a value from ``range(vocab)`` over random segments
    whose boundaries follow a Poisson-distributed count scaled by
    *change_rate_hz*; at each internal boundary the token resamples with
    probability 0.9.

    Returns:
        t: (N,) time axis in seconds.
        X: (p.n_curves, N) integer-valued channels.
    """
    rng = np.random.default_rng(p.seed + 123)
    n_samples = int(p.seconds * p.fs)
    t = np.linspace(0, p.seconds, n_samples, endpoint=False)

    mean_changes = max(1, int(p.seconds * change_rate_hz))
    X = np.zeros((p.n_curves, n_samples), dtype=int)
    for row in range(p.n_curves):
        n_pts = rng.poisson(mean_changes) + 1
        cuts = np.unique(rng.integers(0, n_samples, size=n_pts))
        cuts = np.sort(np.concatenate([[0], cuts, [n_samples]]))

        token = rng.integers(0, vocab)
        for start, stop in zip(cuts[:-1], cuts[1:]):
            # Keep the initial token for the first segment; afterwards
            # resample with probability 0.9 at each boundary.
            if start != 0 and rng.random() < 0.9:
                token = rng.integers(0, vocab)
            X[row, start:stop] = token

    return t, X
# ----------------------------
|
||||
# Plot helpers (SVG, transparent, axes-free)
|
||||
# ----------------------------
|
||||
|
||||
def _transparent_fig_ax(width_in: float, height_in: float):
    """Create a fully transparent, axis-free figure/axes pair for diagram SVGs.

    Args:
        width_in: figure width in inches.
        height_in: figure height in inches.

    Returns:
        (figure, axes) with alpha-0 patches and the axis frame hidden.
    """
    figure = plt.figure(figsize=(width_in, height_in), dpi=200)
    figure.patch.set_alpha(0.0)
    axes = figure.add_axes([0.03, 0.03, 0.94, 0.94])
    axes.patch.set_alpha(0.0)
    axes.set_axis_off()
    return figure, axes
def save_attention_svg(A: np.ndarray, out: Path, *, show_colorbar: bool = False) -> None:
    """Render attention matrix *A* as a transparent heatmap SVG at *out*.

    Uses the default colormap (no explicit colors). When *show_colorbar*
    is set, a slim colorbar is added at the right edge — useful for
    inspection but cluttered in diagrams.
    """
    fig, ax = _transparent_fig_ax(4.2, 4.2)

    image = ax.imshow(A, aspect="equal", interpolation="nearest")

    if show_colorbar:
        bar_ax = fig.add_axes([0.92, 0.10, 0.03, 0.80])
        bar = fig.colorbar(image, cax=bar_ax)
        bar.outline.set_linewidth(1.0)

    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, format="svg", bbox_inches="tight", pad_inches=0.0, transparent=True)
    plt.close(fig)
def save_multi_curve_svg(t: np.ndarray, Y: np.ndarray, out: Path, *, lw: float = 2.0) -> None:
    """Plot each row of *Y* against *t* and save a transparent, axis-free SVG.

    Args:
        t: (N,) shared x-axis values.
        Y: (n_curves, N) stacked curves.
        out: destination path; parent directories are created as needed.
        lw: line width applied to every curve.
    """
    fig, ax = _transparent_fig_ax(6.0, 2.2)

    for curve in Y:
        ax.plot(t, curve, linewidth=lw)

    # Fit the view to the data with a small symmetric vertical margin.
    lo = float(np.min(Y))
    hi = float(np.max(Y))
    margin = 0.08 * (hi - lo + 1e-9)
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(lo - margin, hi + margin)

    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, format="svg", bbox_inches="tight", pad_inches=0.0, transparent=True)
    plt.close(fig)
def save_discrete_svg(t: np.ndarray, X: np.ndarray, out: Path, *, lw: float = 2.0, spacing: float = 1.25) -> None:
    """Draw each integer channel of *X* as a vertically offset step trace (SVG).

    Channel ``i`` is drawn at ``X[i] + i * spacing`` so the traces do not
    overlap when stacked in one panel.
    """
    fig, ax = _transparent_fig_ax(6.0, 2.2)

    # One vertical offset per row, broadcast across time.
    stacked = X.astype(float) + np.arange(X.shape[0])[:, None] * spacing
    for trace in stacked:
        ax.step(t, trace, where="post", linewidth=lw)

    lo = float(np.min(stacked))
    hi = float(np.max(stacked))
    margin = 0.10 * (hi - lo + 1e-9)
    ax.set_xlim(t[0], t[-1])
    ax.set_ylim(lo - margin, hi + margin)

    out.parent.mkdir(parents=True, exist_ok=True)
    fig.savefig(out, format="svg", bbox_inches="tight", pad_inches=0.0, transparent=True)
    plt.close(fig)
# ----------------------------
|
||||
# CLI
|
||||
# ----------------------------
|
||||
|
||||
def _build_cli() -> argparse.ArgumentParser:
    """Assemble the command-line interface used by main()."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--outdir", type=Path, default=Path("out"))
    parser.add_argument("--seed", type=int, default=7)

    # attention-map options
    parser.add_argument("--T", type=int, default=24)
    parser.add_argument("--attn-mode", type=str, default="local", choices=["local", "global", "causal"])
    parser.add_argument("--colorbar", action="store_true")

    # continuous-curve options
    parser.add_argument("--seconds", type=float, default=10.0)
    parser.add_argument("--fs", type=int, default=200)
    parser.add_argument("--n-curves", type=int, default=7)

    # optional discrete channels
    parser.add_argument("--with-discrete", action="store_true")
    parser.add_argument("--disc-vocab", type=int, default=9)
    parser.add_argument("--disc-rate", type=float, default=1.3)
    return parser


def main() -> None:
    """Generate the attention / trend / (optional) discrete SVGs under --outdir."""
    args = _build_cli().parse_args()

    params = Params(
        seed=args.seed,
        T=args.T,
        n_curves=args.n_curves,
        seconds=args.seconds,
        fs=args.fs,
    )
    rng = np.random.default_rng(args.seed)

    # 1) attention map
    attn = make_attention_map(args.T, rng, mode=args.attn_mode)
    save_attention_svg(attn, args.outdir / "attention_weights.svg", show_colorbar=args.colorbar)

    # 2) continuous trends
    t, curves = make_token_activation_trends(params)
    save_multi_curve_svg(t, curves, args.outdir / "token_activation_trends.svg")

    # 3) discrete trends (optional)
    if args.with_discrete:
        t_disc, tokens = make_discrete_trends(params, vocab=args.disc_vocab, change_rate_hz=args.disc_rate)
        save_discrete_svg(t_disc, tokens, args.outdir / "discrete_tokens.svg")

    print("Wrote:")
    print(f" {args.outdir / 'attention_weights.svg'}")
    print(f" {args.outdir / 'token_activation_trends.svg'}")
    if args.with_discrete:
        print(f" {args.outdir / 'discrete_tokens.svg'}")
# Script entry point: only generate figures when run directly, not on import.
if __name__ == "__main__":
    main()
302
arxiv-style/main-ieee.tex
Normal file
302
arxiv-style/main-ieee.tex
Normal file
@@ -0,0 +1,302 @@
|
||||
\documentclass[conference]{IEEEtran}
|
||||
\IEEEoverridecommandlockouts
|
||||
|
||||
\usepackage{cite}
|
||||
\usepackage{amsmath,amssymb,amsfonts}
|
||||
\usepackage{graphicx}
|
||||
\usepackage{url}
|
||||
\usepackage{textcomp}
|
||||
\usepackage{xcolor}
|
||||
\usepackage{booktabs}
|
||||
\usepackage{bm}
|
||||
|
||||
\title{Mask-DDPM: Transformer-Conditioned Mixed-Type Diffusion for Semantically Valid ICS Telemetry Synthesis}
|
||||
|
||||
% 若不需要日期,取消下面一行的注释
|
||||
|
||||
\author{
|
||||
\IEEEauthorblockN{Zhenglan Chen}
|
||||
\IEEEauthorblockA{\textit{Aberdeen Institute of Data Science and Artificial Intelligence} \\
|
||||
\textit{South China Normal University}\\
|
||||
Guangzhou, China \\
|
||||
email address or ORCID}
|
||||
\and
|
||||
\IEEEauthorblockN{Mingzhe Yang}
|
||||
\IEEEauthorblockA{\textit{Aberdeen Institute of Data Science and Artificial Intelligence} \\
|
||||
\textit{South China Normal University}\\
|
||||
Guangzhou, China \\
|
||||
email address or ORCID}
|
||||
\and
|
||||
\IEEEauthorblockN{Hongyu Yan}
|
||||
\IEEEauthorblockA{\textit{Aberdeen Institute of Data Science and Artificial Intelligence} \\
|
||||
\textit{South China Normal University}\\
|
||||
Guangzhou, China \\
|
||||
email address or ORCID}
|
||||
\and
|
||||
\IEEEauthorblockN{Huan Yang}
|
||||
\IEEEauthorblockA{\textit{dept. name of organization (of Aff.)} \\
|
||||
\textit{South China Normal University}\\
|
||||
Guangzhou, China \\
|
||||
email address or ORCID}
|
||||
}
|
||||
|
||||
% 页眉设置
|
||||
|
||||
%%% PDF metadata
|
||||
|
||||
\begin{document}
|
||||
\maketitle
|
||||
|
||||
\begin{abstract}
|
||||
Industrial control systems (ICS) security research is increasingly constrained by the scarcity and non-shareability of realistic traffic and telemetry, especially for attack scenarios. To mitigate this bottleneck, we study synthetic generation at the protocol feature/telemetry level, where samples must simultaneously preserve temporal coherence, match continuous marginal distributions, and keep discrete supervisory variables strictly within valid vocabularies. We propose Mask-DDPM, a hybrid framework tailored to mixed-type, multi-scale ICS sequences. Mask-DDPM factorizes generation into (i) a causal Transformer trend module that rolls out a stable long-horizon temporal scaffold for continuous channels, (ii) a trend-conditioned residual DDPM that refines local stochastic structure and heavy-tailed fluctuations without degrading global dynamics, (iii) a masked (absorbing) diffusion branch for discrete variables that guarantees categorical legality by construction, and (iv) a type-aware decomposition/routing layer that aligns modeling mechanisms with heterogeneous ICS variable origins and enforces deterministic reconstruction where appropriate. Evaluated on fixed-length windows (L=96) derived from the HAI Security Dataset, Mask-DDPM achieves stable fidelity across seeds with mean KS = 0.3311 ± 0.0079 (continuous), mean JSD = 0.0284 ± 0.0073 (discrete), and mean absolute lag-1 autocorrelation difference = 0.2684 ± 0.0027, indicating faithful marginals, preserved short-horizon dynamics, and valid discrete semantics. The resulting generator provides a reproducible basis for data augmentation, benchmarking, and downstream ICS protocol reconstruction workflows.
|
||||
\end{abstract}
|
||||
|
||||
% Keywords
|
||||
\begin{IEEEkeywords}Machine Learning, Cyber Defense, ICS\end{IEEEkeywords}
|
||||
|
||||
% 1. Introduction
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
Industrial control systems (ICS) form the backbone of modern critical infrastructure, which includes power grids, water treatment, manufacturing, and transportation, among others. These systems monitor, regulate, and automate the physical processes through sensors, actuators, programmable logic controllers (PLCs), and monitoring software. Unlike conventional IT systems, ICS operate in real time, closely coupled with physical processes and safety-critical constraints, using heterogeneous and legacy communication protocols such as Modbus/TCP and DNP3 that were not originally designed with robust security in mind. This architectural complexity and operational criticality make ICS high-impact targets for cyber attacks, where disruptions can result in physical damage, environmental harm, and even loss of life. Recent reviews of ICS security highlight the expanding attack surface due to increased connectivity, legacy systems' vulnerabilities, and the inadequacy of traditional security controls in capturing the nuances of ICS networks and protocols \cite{10.1007/s10844-022-00753-1,Nankya2023-gp}
|
||||
|
||||
While machine learning (ML) techniques have shown promise for anomaly detection and automated cybersecurity within ICS, they rely heavily on labeled datasets that capture both benign operations and diverse attack patterns. In practice, real ICS traffic data, especially attack-triggered captures, are scarce due to confidentiality, safety, and legal restrictions, and available public ICS datasets are few, limited in scope, or fail to reflect current threat modalities. For instance, the HAI Security Dataset provides operational telemetry and anomaly flags from a realistic control system setup for research purposes, but must be carefully preprocessed to derive protocol-relevant features for ML tasks \cite{shin}. Data scarcity directly undermines model generalization, evaluation reproducibility, and the robustness of intrusion detection research, especially when training or testing ML models on realistic ICS behavior remains confined to small or outdated collections of examples \cite{info16100910}.
|
||||
|
||||
Synthetic data generation offers a practical pathway to mitigate these challenges. By programmatically generating feature-level sequences that mimic the statistical and temporal structure of real ICS telemetry, researchers can augment scarce training sets, standardize benchmarking, and preserve operational confidentiality. Relative to raw packet captures, feature-level synthesis abstracts critical protocol semantics and statistical patterns without exposing sensitive fields, making it more compatible with safety constraints and compliance requirements in ICS environments. Modern generative modeling, including diffusion models, has advanced significantly in producing high-fidelity synthetic data across domains. Diffusion approaches, such as denoising diffusion probabilistic models, learn to transform noise into coherent structured samples and have been successfully applied to tabular or time series data synthesis with better stability and data coverage compared to adversarial methods \cite{pmlr-v202-kotelnikov23a,rasul2021autoregressivedenoisingdiffusionmodels}
|
||||
|
||||
Despite these advances, most existing work either focuses on packet-level generation \cite{jiang2023netdiffusionnetworkdataaugmentation} or is limited to generic tabular data \cite{pmlr-v202-kotelnikov23a}, rather than domain-specific control sequence synthesis tailored for ICS protocols where temporal coherence, multi-channel dependencies, and discrete protocol legality are jointly required. This gap motivates our focus on protocol feature-level generation for ICS, which involves synthesizing sequences of protocol-relevant fields conditioned on their temporal and cross-channel structure. In this work, we formulate a hybrid modeling pipeline that decouples long-horizon trends and local statistical detail while preserving discrete semantics of protocol tokens. By combining causal Transformers with diffusion-based refiners, and enforcing deterministic validity constraints during sampling, our framework generates semantically coherent, temporally consistent, and distributionally faithful ICS feature sequences. We evaluate features derived from the HAI Security Dataset and demonstrate that our approach produces high-quality synthetic sequences suitable for downstream augmentation, benchmarking, and integration into packet-construction workflows that respect realistic ICS constraints.
|
||||
|
||||
% 2. Related Work
|
||||
\section{Related Work}
|
||||
\label{sec:related}
|
||||
Early generation of network data oriented towards ``realism'' mostly remained at the packet/flow header level, either through replay or statistical synthesis based on single-point observations. Swing, in a closed-loop, network-responsive manner, extracts user/application/network distributions from single-point observations to reproduce burstiness and correlation across multiple time scales \cite{10.1145/1159913.1159928}. Subsequently, a series of works advanced header synthesis to learning-based generation: the WGAN-based method added explicit verification of protocol field consistency to NetFlow/IPFIX \cite{Ring_2019}, NetShare reconstructed header modeling as flow-level time series and improved fidelity and scalability through domain encoding and parallel fine-tuning \cite{10.1145/3544216.3544251}, and DoppelGANger preserved the long-range structure and downstream sorting consistency of networked time series by decoupling attributes from sequences \cite{Lin_2020}. However, in industrial control system (ICS) scenarios, the original PCAP is usually not shareable, and public testbeds (such as SWaT, WADI) mostly provide process/monitoring telemetry and protocol interactions for security assessment, but public datasets emphasize operational variables rather than packet-level traces \cite{7469060,10.1145/3055366.3055375}. This makes ``synthesis at the feature/telemetry level, aware of protocol and semantics'' more feasible and necessary in practice: we are more concerned with reproducing high-level distributions and multi-scale temporal patterns according to operational semantics and physical constraints without relying on the original packets. From this perspective, the generation paradigm naturally shifts from ``packet syntax reproduction'' to ``modeling of high-level spatio-temporal distributions and uncertainties'', requiring stable training, strong distribution fitting, and interpretable uncertainty characterization.
|
||||
|
||||
Diffusion models exhibit good fit along this path: DDPM achieves high-quality sampling and stable optimization through efficient $\epsilon$ parameterization and weighted variational objectives \cite{ho2020denoising}, the SDE perspective unifies score-based and diffusion, providing likelihood evaluation and prediction-correction sampling strategies based on probability flow ODEs \cite{song2021score}. For time series, TimeGrad replaces the constrained output distribution with conditional denoising, capturing high-dimensional correlations at each step \cite{rasul2021autoregressivedenoisingdiffusionmodels}; CSDI explicitly performs conditional diffusion and uses two-dimensional attention to simultaneously leverage temporal and cross-feature dependencies, suitable for conditioning and filling in missing values \cite{tashiro2021csdiconditionalscorebaseddiffusion}; in a more general spatio-temporal structure, DiffSTG generalizes diffusion to spatio-temporal graphs, combining TCN/GCN with denoising U-Net to improve CRPS and inference efficiency in a non-autoregressive manner \cite{wen2024diffstgprobabilisticspatiotemporalgraph}, and PriSTI further enhances conditional features and geographical relationships, maintaining robustness under high missing rates and sensor failures \cite{liu2023pristiconditionaldiffusionframework}; in long sequences and continuous domains, DiffWave verifies that diffusion can also match the quality of strong vocoders under non-autoregressive fast synthesis \cite{kong2021diffwaveversatilediffusionmodel}; studies on cellular communication traffic show that diffusion can recover spatio-temporal patterns and provide uncertainty characterization at the urban scale \cite{11087622}. 
These results overall point to a conclusion: when the research focus is on ``telemetry/high-level features'' rather than raw messages, diffusion models provide stable and fine-grained distribution fitting and uncertainty quantification, which is exactly in line with the requirements of ICS telemetry synthesis. Meanwhile, directly entrusting all structures to a ``monolithic diffusion'' is not advisable: long-range temporal skeletons and fine-grained marginal distributions often have optimization tensions, requiring explicit decoupling in modeling.
|
||||
|
||||
Looking further into the mechanism complexity of ICS: its channel types are inherently mixed, containing both continuous process trajectories and discrete supervision/status variables, and discrete channels must be ``legal'' under operational constraints. The aforementioned progress in time series diffusion has mainly occurred in continuous spaces, but discrete diffusion has also developed systematic methods: D3PM improves sampling quality and likelihood through absorption/masking and structured transitions in discrete state spaces \cite{austin2021structured}, subsequent masked diffusion provides stable reconstruction on categorical data in a more simplified form \cite{Lin_2020}, multinomial diffusion directly defines diffusion on a finite vocabulary through mechanisms such as argmax flows \cite{hoogeboom2021argmaxflowsmultinomialdiffusion}, and Diffusion-LM demonstrates an effective path for controllable text generation by imposing gradient constraints in continuous latent spaces \cite{li2022diffusionlmimprovescontrollabletext}. From the perspectives of protocols and finite-state machines, coverage-guided fuzz testing emphasizes the criticality of ``sequence legality and state coverage'' \cite{meng2025aflnetyearslatercoverageguided,godefroid2017learnfuzzmachinelearninginput,she2019neuzzefficientfuzzingneural}, echoing the concept of ``legality by construction'' in discrete diffusion: preferentially adopting absorption/masking diffusion on discrete channels, supplemented by type-aware conditioning and sampling constraints, to avoid semantic invalidity and marginal distortion caused by post hoc thresholding.
|
||||
|
||||
From the perspective of high-level synthesis, the temporal structure is equally indispensable: ICS control often involves delay effects, phased operating conditions, and cross-channel coupling, requiring models to be able to characterize low-frequency, long-range dependencies while also overlaying multi-modal fine-grained fluctuations on them. The Transformer series has provided sufficient evidence in long-sequence time series tasks: Transformer-XL breaks through the fixed-length context limitation through a reusable memory mechanism and significantly enhances long-range dependency expression \cite{dai2019transformerxlattentivelanguagemodels}; Informer uses ProbSparse attention and efficient decoding to balance span and efficiency in long-sequence prediction \cite{zhou2021informerefficienttransformerlong}; Autoformer robustly models long-term seasonality and trends through autocorrelation and decomposition mechanisms \cite{wu2022autoformerdecompositiontransformersautocorrelation}; FEDformer further improves long-period prediction performance in frequency domain enhancement and decomposition \cite{zhou2022fedformerfrequencyenhanceddecomposed}; PatchTST enhances the stability and generalization of long-sequence multivariate prediction through local patch-based representation and channel-independent modeling \cite{2023}. Combining our previous positioning of diffusion, this chain of evidence points to a natural division of labor: using attention-based sequence models to first extract stable low-frequency trends/conditions (long-range skeletons), and then allowing diffusion to focus on margins and details in the residual space; meanwhile, discrete masking/absorbing diffusion is applied to supervised/pattern variables to ensure vocabulary legality by construction. 
This design not only inherits the advantages of time series diffusion in distribution fitting and uncertainty characterization \cite{rasul2021autoregressivedenoisingdiffusionmodels,tashiro2021csdiconditionalscorebaseddiffusion,wen2024diffstgprobabilisticspatiotemporalgraph,liu2023pristiconditionaldiffusionframework,kong2021diffwaveversatilediffusionmodel,11087622}, but also stabilizes the macroscopic temporal support through the long-range attention of Transformer, enabling the formation of an operational integrated generation pipeline under the mixed types and multi-scale dynamics of ICS.
|
||||
|
||||
% 3. Methodology
|
||||
\section{Methodology}
|
||||
\label{sec:method}
|
||||
Industrial control system (ICS) telemetry is intrinsically mixed-type and mechanistically heterogeneous: continuous process trajectories (e.g., sensor and actuator signals) coexist with discrete supervisory states (e.g., modes, alarms, interlocks), and the underlying generating mechanisms range from physical inertia to program-driven step logic. This heterogeneity is not cosmetic--it directly affects what ``realistic'' synthesis means, because a generator must jointly satisfy (i) temporal coherence, (ii) distributional fidelity, and (iii) discrete semantic validity (i.e., every discrete output must belong to its legal vocabulary by construction). These properties are emphasized broadly in operational-technology security guidance and ICS engineering practice, where state logic and physical dynamics are tightly coupled \cite{nist2023sp80082}.
|
||||
|
||||
We model each training instance as a fixed-length window of length $L$, comprising continuous channels $\bm{X} \in \mathbb{R}^{L \times d_c}$ and discrete channels $\bm{Y} = \{y^{(j)}_{1:L}\}_{j=1}^{d_d}$, where each discrete variable satisfies $y^{(j)}_t \in \mathcal{V}_j$ for a finite vocabulary $\mathcal{V}_j$. Our objective is to learn a generator that produces synthetic $(\hat{\bm{X}}, \hat{\bm{Y}})$ that are simultaneously coherent and distributionally faithful, while also ensuring $\hat{y}^{(j)}_t\in\mathcal{V}_j$ for all $j$, $t$ by construction (rather than via post-hoc rounding or thresholding).
|
||||
|
||||
A key empirical and methodological tension in ICS synthesis is that temporal realism and marginal/distributional realism can compete when optimized monolithically: sequence models trained primarily for regression often over-smooth heavy tails and intermittent bursts, while purely distribution-matching objectives can erode long-range structure. Diffusion models provide a principled route to rich distribution modeling through iterative denoising, but they do not, by themselves, resolve (i) the need for a stable low-frequency temporal scaffold, nor (ii) the discrete legality constraints for supervisory variables \cite{ho2020denoising,song2021score}. Recent time-series diffusion work further suggests that separating coarse structure from stochastic refinement can be an effective inductive bias for long-horizon realism \cite{kollovieh2023tsdiff,sikder2023transfusion}.
|
||||
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{fig-design-v2.png}
|
||||
% \caption{Description of the figure.}
|
||||
\label{fig:design}
|
||||
\end{figure*}
|
||||
|
||||
Motivated by these considerations, we propose Mask-DDPM, organized in the following order:
|
||||
\begin{enumerate}
|
||||
\item Transformer trend module: learns the dominant temporal backbone of continuous dynamics via attention-based sequence modeling \cite{vaswani2017attention}.
|
||||
|
||||
\item Residual DDPM for continuous variables: models distributional detail as stochastic residual structure conditioned on the learned trend \cite{ho2020denoising,kollovieh2023tsdiff}.
|
||||
|
||||
\item Masked diffusion for discrete variables: generates discrete ICS states with an absorbing/masking corruption process and categorical reconstruction \cite{austin2021structured,shi2024simplified}.
|
||||
|
||||
\item Type-aware decomposition: a type-aware factorization and routing layer that assigns variables to the most appropriate modeling mechanism and enforces deterministic constraints where warranted.
|
||||
\end{enumerate}
|
||||
|
||||
This ordering is intentional. The trend module establishes a macro-temporal scaffold; residual diffusion then concentrates capacity on micro-structure and marginal fidelity; masked diffusion provides a native mechanism for discrete legality; and the type-aware layer operationalizes the observation that not all ICS variables should be modeled with the same stochastic mechanism. Importantly, while diffusion-based generation for ICS telemetry has begun to emerge, existing approaches remain limited and typically emphasize continuous synthesis or augmentation; in contrast, our pipeline integrates (i) a Transformer-conditioned residual diffusion backbone, (ii) a discrete masked-diffusion branch, and (iii) explicit type-aware routing for heterogeneous variable mechanisms within a single coherent generator \cite{yuan2025ctu,sha2026ddpm}.
|
||||
|
||||
\subsection{Transformer trend module for continuous dynamics}
|
||||
\label{sec:method-trans}
|
||||
We instantiate the temporal backbone as a causal Transformer trend extractor, leveraging self-attention's ability to represent long-range dependencies and cross-channel interactions without recurrence \cite{vaswani2017attention}. Compared with recurrent trend extractors (e.g., GRU-style backbones), a Transformer trend module offers a direct mechanism to model delayed effects and multivariate coupling--common in ICS, where control actions may influence downstream sensors with nontrivial lags and regime-dependent propagation \cite{vaswani2017attention,nist2023sp80082}. Crucially, in our design the Transformer is not asked to be the entire generator; instead, it serves a deliberately restricted role: providing a stable, temporally coherent conditioning signal that later stochastic components refine.
|
||||
|
||||
For continuous channels $\bm{X}$, we posit an additive decomposition:
|
||||
\begin{equation}
|
||||
\bm{X} = \bm{S} + \bm{R},
|
||||
\label{eq:additive_decomp}
|
||||
\end{equation}
|
||||
where $\bm{S} \in \mathbb{R}^{L \times d_c}$ is a smooth trend capturing predictable temporal evolution, and $\bm{R} \in \mathbb{R}^{L \times d_c}$ is a residual capturing distributional detail (e.g., bursts, heavy tails, local fluctuations) that is difficult to represent robustly with a purely regression-based temporal objective. This separation reflects an explicit division of labor: the trend module prioritizes temporal coherence, while diffusion (introduced next) targets distributional realism at the residual level--a strategy aligned with ``predict-then-refine'' perspectives in time-series diffusion modeling \cite{kollovieh2023tsdiff,sikder2023transfusion}.
|
||||
|
||||
We parameterize the trend $\bm{S}$ using a causal Transformer $f_\phi$. With teacher forcing, we train $f_\phi$ to predict the next-step trend from past observations:
|
||||
\begin{equation}
|
||||
\hat{\bm{S}}_{t+1} = f_{\phi}(\bm{X}_{1:t}), \quad t = 1, \dots, L-1.
|
||||
\label{eq:trend_prediction}
|
||||
\end{equation}
|
||||
using the mean-squared error objective:
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{trend}}(\phi) = \frac{1}{(L-1)d_c} \sum_{t=1}^{L-1} \bigl\| \hat{\bm{S}}_{t+1} - \bm{X}_{t+1} \bigr\|_2^2.
|
||||
\label{eq:trend_loss}
|
||||
\end{equation}
|
||||
At inference, we roll out the Transformer autoregressively to obtain $\hat{\bm{S}}$, and then define the residual target for diffusion as $\bm{R} = \bm{X} - \hat{\bm{S}}$. This setup intentionally ``locks in'' a coherent low-frequency scaffold before any stochastic refinement is applied, thereby reducing the burden on downstream diffusion modules to simultaneously learn both long-range structure and marginal detail. In this sense, our use of Transformers is distinctive: it is a conditioning-first temporal backbone designed to stabilize mixed-type diffusion synthesis in ICS, rather than an end-to-end monolithic generator \cite{vaswani2017attention,kollovieh2023tsdiff,yuan2025ctu}.
|
||||
|
||||
\subsection{DDPM for continuous residual generation}
|
||||
\label{sec:method-ddpm}
|
||||
We model the residual $\bm{R}$ with a denoising diffusion probabilistic model (DDPM) conditioned on the trend $\hat{\bm{S}}$ \cite{ho2020denoising}. Diffusion models learn complex data distributions by inverting a tractable noising process through iterative denoising, and have proven effective at capturing multimodality and heavy-tailed structure that is often attenuated by purely regression-based sequence models \cite{ho2020denoising,song2021score}. Conditioning the diffusion model on $\hat{\bm{S}}$ is central: it prevents the denoiser from re-learning the low-frequency scaffold and focuses capacity on residual micro-structure, mirroring the broader principle that diffusion excels as a distributional corrector when a reasonable coarse structure is available \cite{kollovieh2023tsdiff,sikder2023transfusion}.
|
||||
|
||||
Let $K$ denote the number of diffusion steps, with a noise schedule $\{\beta_k\}_{k=1}^K$, $\alpha_k = 1 - \beta_k$, and $\bar{\alpha}_k = \prod_{i=1}^k \alpha_i$. The forward corruption process is:
|
||||
\begin{equation}
|
||||
q(\bm{r}_k \mid \bm{r}_0) = \mathcal{N}\bigl( \sqrt{\bar{\alpha}_k}\,\bm{r}_0,\; (1 - \bar{\alpha}_k)\mathbf{I} \bigr)
|
||||
\label{eq:forward_corruption}
|
||||
\end{equation}
|
||||
equivalently,
|
||||
\begin{equation}
|
||||
\bm{r}_k = \sqrt{\bar{\alpha}_k}\,\bm{r}_0 + \sqrt{1 - \bar{\alpha}_k}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I})
|
||||
\label{eq:forward_corruption_eq}
|
||||
\end{equation}
|
||||
The learned reverse process is parameterized as:
|
||||
\begin{equation}
|
||||
p_{\theta}(\bm{r}_{k-1} \mid \bm{r}_k, \hat{\bm{S}}) = \mathcal{N}\bigl( \boldsymbol{\mu}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}),\; \boldsymbol{\Sigma}(k) \bigr).
|
||||
\label{eq:reverse_process}
|
||||
\end{equation}
|
||||
where $\boldsymbol{\mu}_{\theta}$ is implemented by a Transformer denoiser that consumes (i) the noised residual $\bm{r}_k$, (ii) a timestep embedding for $k$, and (iii) conditioning features derived from $\hat{\bm{S}}$. This denoiser architecture is consistent with the growing use of attention-based denoisers for long-context time-series diffusion, while our key methodological emphasis is the trend-conditioned residual factorization as the object of diffusion learning \cite{ho2020denoising,sikder2023transfusion}.
|
||||
|
||||
We train the denoiser using the standard DDPM $\epsilon$-prediction objective:
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right].
|
||||
\label{eq:ddpm_loss}
|
||||
\end{equation}
|
||||
Because diffusion optimization can exhibit timestep imbalance (i.e., some timesteps dominate gradients), we optionally apply an SNR-based reweighting consistent with Min-SNR training:
|
||||
\begin{equation}
|
||||
\mathcal{L}^{\text{snr}}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ w_k \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right],
|
||||
\label{eq:snr_loss}
|
||||
\end{equation}
|
||||
where $w_k = \min\{\mathrm{SNR}_k, \gamma\}/\mathrm{SNR}_k$, with $\mathrm{SNR}_k=\bar{\alpha}_k/(1-\bar{\alpha}_k)$ and $\gamma>0$ a cap parameter \cite{hang2023efficient}.
|
||||
|
||||
After sampling $\hat{\bm{R}}$ by reverse diffusion, we reconstruct the continuous output as $\hat{\bm{X}} = \hat{\bm{S}} + \hat{\bm{R}}$. Overall, the DDPM component serves as a distributional corrector on top of a temporally coherent backbone, which is particularly suited to ICS where low-frequency dynamics are strong and persistent but fine-scale variability (including bursts and regime-conditioned noise) remains important for realism. Relative to prior ICS diffusion efforts that primarily focus on continuous augmentation, our formulation elevates trend-conditioned residual diffusion as a modular mechanism for disentangling temporal structure from distributional refinement \cite{yuan2025ctu,sha2026ddpm}.
|
||||
|
||||
\subsection{Masked diffusion for discrete ICS variables}
|
||||
\label{sec:method-discrete}
|
||||
Discrete ICS variables must remain categorical, making Gaussian diffusion inappropriate for supervisory states and mode-like channels. While one can attempt continuous relaxations or post-hoc discretization, such strategies risk producing semantically invalid intermediate states (e.g., ``in-between'' modes) and can distort the discrete marginal distribution. Discrete-state diffusion provides a principled alternative by defining a valid corruption process directly on categorical variables \cite{austin2021structured,shi2024simplified}. In the ICS setting, this is not a secondary detail: supervisory tags often encode control logic boundaries (modes, alarms, interlocks) that must remain within a finite vocabulary to preserve semantic correctness \cite{nist2023sp80082}.
|
||||
|
||||
We therefore adopt masked (absorbing) diffusion for discrete channels, where corruption replaces tokens with a special $\texttt{[MASK]}$ symbol according to a schedule \cite{shi2024simplified}. For each variable $j$, define a masking schedule $\{m_k\}_{k=1}^K$ (with $m_k\in[0,1]$) increasing in $k$. The forward corruption process is:
|
||||
\begin{equation}
|
||||
q(y^{(j)}_k \mid y^{(j)}_0) =
|
||||
\begin{cases}
|
||||
y^{(j)}_0, & \text{with probability } 1 - m_k, \\
|
||||
\texttt{[MASK]}, & \text{with probability } m_k,
|
||||
\end{cases}
|
||||
\label{eq:masking_process}
|
||||
\end{equation}
|
||||
applied independently across $j$ and $t$. Let $\mathcal{M}$ denote the set of masked positions at step $k$. The denoiser $h_{\psi}$ predicts a categorical distribution over $\mathcal{V}_j$ for each masked token, conditioned on (i) the corrupted discrete sequence, (ii) the diffusion step $k$, and (iii) continuous context. Concretely, we condition on $\hat{\bm{S}}$ and $\hat{\bm{X}}$ to couple supervisory reconstruction to the underlying continuous dynamics:
|
||||
\begin{equation}
|
||||
p_{\psi}\bigl( y^{(j)}_0 \mid y_k, k, \hat{\bm{S}}, \hat{\bm{X}} \bigr) = h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}}).
|
||||
\label{eq:discrete_denoising}
|
||||
\end{equation}
|
||||
This conditioning choice is motivated by the fact that many discrete ICS states are not standalone; they are functions of regimes, thresholds, and procedural phases that manifest in continuous channels \cite{nist2023sp80082}. Training uses a categorical denoising objective:
|
||||
\begin{equation}
|
||||
\mathcal{L}_{\text{disc}}(\psi) = \mathbb{E}_{k} \left[ \frac{1}{|\mathcal{M}|} \sum_{(j,t) \in \mathcal{M}} \mathrm{CE}\bigl( h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}})_{j,t},\; y^{(j)}_{0,t} \bigr) \right],
|
||||
\label{eq:discrete_loss}
|
||||
\end{equation}
|
||||
where $\mathrm{CE}(\cdot,\cdot)$ is cross-entropy. At sampling time, we initialize all discrete tokens as $\texttt{[MASK]}$ and iteratively unmask them using the learned conditionals, ensuring that every output token lies in its legal vocabulary by construction. This discrete branch is a key differentiator of our pipeline: unlike typical continuous-only diffusion augmentation in ICS, we integrate masked diffusion as a first-class mechanism for supervisory-variable legality within the same end-to-end synthesis workflow \cite{shi2024simplified,yuan2025ctu}.
|
||||
|
||||
\subsection{Type-aware decomposition as factorization and routing layer}
|
||||
\label{sec:method-types}
|
||||
Even with a trend-conditioned residual DDPM and a discrete masked-diffusion branch, a single uniform modeling treatment can remain suboptimal because ICS variables are generated by qualitatively different mechanisms. For example, program-driven setpoints exhibit step-and-dwell dynamics; controller outputs follow control laws conditioned on process feedback; actuator positions may show saturation and dwell; and some ``derived tags'' are deterministic functions of other channels. Treating all channels as if they were exchangeable stochastic processes can misallocate model capacity and induce systematic error concentration on a small subset of mechanistically distinct variables \cite{nist2023sp80082}.
|
||||
|
||||
We therefore introduce a type-aware decomposition that formalizes this heterogeneity as a routing and constraint layer. Let $\tau(i)\in\{1,\dots,6\}$ assign each variable $i$ to a type class. The type assignment can be initialized from domain semantics (tag metadata, value domains, and engineering meaning), and subsequently refined via an error-attribution workflow described in the Benchmark section. Importantly, this refinement does not change the core diffusion backbone; it changes which mechanism is responsible for which variable, thereby aligning inductive bias with variable-generating mechanism while preserving overall coherence.
|
||||
|
||||
We use the following taxonomy:
|
||||
\begin{enumerate}
|
||||
\item Type 1 (program-driven / setpoint-like): externally commanded, step-and-dwell variables. These variables can be treated as exogenous drivers (conditioning signals) or routed to specialized change-point / dwell-time models, rather than being forced into a smooth denoiser that may over-regularize step structure.
|
||||
|
||||
\item Type 2 (controller outputs): continuous variables tightly coupled to feedback loops; these benefit from conditional modeling where the conditioning includes relevant process variables and commanded setpoints.
|
||||
|
||||
\item Type 3 (actuator states/positions): often exhibit saturation, dwell, and rate limits; these may require stateful dynamics beyond generic residual diffusion, motivating either specialized conditional modules or additional inductive constraints.
|
||||
|
||||
\item Type 4 (process variables): inertia-dominated continuous dynamics; these are the primary beneficiaries of the Transformer trend + residual DDPM pipeline.
|
||||
|
||||
\item Type 5 (derived/deterministic variables): algebraic or rule-based functions of other variables; we enforce deterministic reconstruction $\hat{x}^{(i)} = g_i(\hat{X},\hat{Y})$ rather than learning a stochastic generator, improving logical consistency and sample efficiency.
|
||||
|
||||
\item Type 6 (auxiliary/low-impact variables): weakly coupled or sparse signals; we allow simplified modeling (e.g., calibrated marginals or lightweight temporal models) to avoid allocating diffusion capacity where it is not warranted.
|
||||
\end{enumerate}
|
||||
|
||||
Type-aware decomposition improves synthesis quality through three mechanisms. First, it improves capacity allocation by preventing a small set of mechanistically atypical variables from dominating gradients and distorting the learned distribution for the majority class (typically Type 4). Second, it enables constraint enforcement by deterministically reconstructing Type 5 variables, preventing logically inconsistent samples that purely learned generators can produce. Third, it improves mechanism alignment by attaching inductive biases consistent with step/dwell or saturation behaviors where generic denoisers may implicitly favor smoothness.
|
||||
|
||||
From a novelty standpoint, this layer is not merely an engineering ``patch''; it is an explicit methodological statement that ICS synthesis benefits from typed factorization--a principle that has analogues in mixed-type generative modeling more broadly, but that remains underexplored in diffusion-based ICS telemetry synthesis \cite{shi2025tabdiff,yuan2025ctu,nist2023sp80082}.
|
||||
|
||||
\subsection{Joint optimization and end-to-end sampling}
|
||||
\label{sec:method-joint}
|
||||
We train the model in a staged manner consistent with the above factorization, which improves optimization stability and encourages each component to specialize in its intended role. Specifically: (i) we train the trend Transformer $f_{\phi}$ to obtain $\hat{\bm{S}}$; (ii) we compute residual targets $\hat{\bm{R}} = \bm{X} - \hat{\bm{S}}$ for the continuous variables routed to residual diffusion; (iii) we train the residual DDPM $p_{\theta}(\bm{R}\mid \hat{\bm{S}})$ and masked diffusion model $p_{\psi}(\bm{Y}\mid \text{masked}(\bm{Y}), \hat{\bm{S}}, \hat{\bm{X}})$; and (iv) we apply type-aware routing and deterministic reconstruction during sampling. This staged strategy is aligned with the design goal of separating temporal scaffolding from distributional refinement, and it mirrors the broader intuition in time-series diffusion that decoupling coarse structure and stochastic detail can mitigate ``structure vs.\ realism'' conflicts \cite{kollovieh2023tsdiff,sikder2023transfusion}.
|
||||
|
||||
A simple combined objective is $\mathcal{L} = \lambda\mathcal{L}_{\text{cont}} + (1-\lambda)\mathcal{L}_{\text{disc}}$ with $\lambda\in[0,1]$ controlling the balance between continuous and discrete learning. Type-aware routing determines which channels contribute to which loss and which are excluded in favor of deterministic reconstruction. In practice, this routing acts as a principled guardrail against negative transfer across variable mechanisms: channels that are best handled deterministically (Type 5) or by specialized drivers (Type 1/3, depending on configuration) are prevented from forcing the diffusion models into statistically incoherent compromises.
|
||||
|
||||
At inference time, generation follows the same structured order: (i) trend $\hat{\bm{S}}$ via the Transformer, (ii) residual $\hat{\bm{R}}$ via DDPM, (iii) discrete $\hat{\bm{Y}}$ via masked diffusion, and (iv) type-aware assembly with deterministic reconstruction for routed variables. This pipeline produces $(\hat{\bm{X}},\hat{\bm{Y}})$ that are temporally coherent by construction (through $\hat{\bm{S}}$), distributionally expressive (through $\hat{\bm{R}}$ denoising), and discretely valid (through masked diffusion), while explicitly accounting for heterogeneous variable-generating mechanisms through type-aware routing. In combination, these choices constitute our central methodological contribution: a unified Transformer + mixed diffusion generator for ICS telemetry, augmented by typed factorization to align model capacity with domain mechanism \cite{ho2020denoising,shi2024simplified,yuan2025ctu,nist2023sp80082}.
|
||||
|
||||
% 4. Benchmark
|
||||
\section{Benchmark}
|
||||
\label{sec:benchmark}
|
||||
We evaluate the proposed pipeline on feature sequences derived from the HAI Security Dataset, using fixed-length windows ($L=96$) that preserve the mixed-type structure of ICS telemetry. The goal of this benchmark is not only to report ``overall similarity,'' but to justify why the proposed factorization is a better fit for protocol feature synthesis: continuous channels must match physical marginals \cite{coletta2023constrained}, discrete channels must remain semantically legal, and both must retain short-horizon dynamics that underpin state transitions and interlocks \cite{yang2001interlock}.
|
||||
|
||||
This emphasis reflects evaluation practice in time-series generation, where strong results are typically supported by multiple complementary views (marginal fidelity, dependency/temporal structure, and downstream plausibility), rather than a single aggregate score \cite{stenger2024survey}. In the ICS setting, this multi-view requirement is sharper: a generator that matches continuous marginals while emitting out-of-vocabulary supervisory tokens is unusable for protocol reconstruction, and a generator that matches marginals but breaks lag structure can produce temporally implausible command/response sequences.
|
||||
|
||||
Recent ICS time-series generators often emphasize aggregate similarity scores and utility-driven evaluations (e.g., anomaly-detection performance) to demonstrate realism, which is valuable but can under-specify mixed-type protocol constraints. Our benchmark complements these practices by making mixed-type legality and per-feature distributional alignment explicit: discrete outputs are evaluated as categorical distributions (JSD) and are constrained to remain within the legal vocabulary by construction, while continuous channels are evaluated with nonparametric distribution tests (KS) \cite{yoon2019timegan}. This combination provides a direct, protocol-relevant justification for the hybrid design, rather than relying on a single composite score that may mask discrete failures.
|
||||
|
||||
For continuous channels, we measure distributional alignment using the Kolmogorov--Smirnov (KS) statistic computed per feature between the empirical distributions of real and synthetic samples, and then averaged across features. For discrete channels, we quantify marginal fidelity with Jensen--Shannon divergence (JSD) \cite{lin1991divergence,yoon2019timegan} between categorical distributions per feature, averaged across discrete variables. To assess temporal realism, we compare lag-1 autocorrelation at the feature level and report the mean absolute difference between real and synthetic lag-1 autocorrelation, averaged across features. In addition, to avoid degenerate comparisons driven by near-constant tags, features whose empirical standard deviation falls below a small threshold are excluded from continuous KS aggregation; such channels carry limited distributional information and can distort summary statistics.
|
||||
|
||||
\subsection{Quantitative results}
|
||||
\label{sec:benchmark-quant}
|
||||
Across all runs, the mean continuous KS is 0.3311 (std 0.0079) and the mean discrete JSD is 0.0284 (std 0.0073), indicating that the generator preserves both continuous marginals and discrete semantic distributions at the feature level. Temporal consistency is similarly stable across runs, with a mean lag-1 autocorrelation difference of 0.2684 (std 0.0027), suggesting that the synthesized windows retain short-horizon dynamical structure \cite{ni2021sigwasserstein} instead of collapsing to marginal matching alone. The best-performing instance (by mean KS) attains 0.3224, and the small inter-seed variance shows that the reported fidelity is reproducible rather than driven by a single favorable initialization.
|
||||
\begin{figure*}[t]
|
||||
\centering
|
||||
\includegraphics[width=\textwidth]{fig-overall-benchmark-v1.png}
|
||||
% \caption{Description of the figure.}
|
||||
\label{fig:benchmark}
|
||||
\end{figure*}
|
||||
|
||||
\begin{table}[htbp]
|
||||
\centering
|
||||
\caption{Summary of benchmark metrics. Lower values indicate better performance.}
|
||||
\label{tab:metrics}
|
||||
\begin{tabular}{@{}l l c@{}}
|
||||
\toprule
|
||||
\textbf{Metric} & \textbf{Aggregation} & \textbf{Mean $\pm$ Std} \\
|
||||
\midrule
|
||||
KS (continuous) & mean over continuous features & 0.3311 $\pm$ 0.0079 \\
|
||||
JSD (discrete) & mean over discrete features & 0.0284 $\pm$ 0.0073 \\
|
||||
Abs $\Delta$ lag-1 autocorr & mean over features & 0.2684 $\pm$ 0.0027 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
To make the benchmark actionable (and comparable to prior work), we report type-appropriate, interpretable statistics instead of collapsing everything into a single similarity score. This matters in mixed-type ICS telemetry: continuous fidelity can be high while discrete semantics fail, and vice versa. By separating continuous (KS), discrete (JSD), and temporal (lag-1) views, the evaluation directly matches the design goals of the hybrid generator: distributional refinement for continuous residuals, vocabulary-valid reconstruction for discrete supervision, and trend-induced short-horizon coherence.
|
||||
|
||||
In addition, the seed-averaged reporting mirrors evaluation conventions in recent diffusion-based time-series generation studies, where robustness across runs is increasingly treated as a first-class signal rather than an afterthought. In this sense, the small inter-seed variance is itself evidence that the factorized training and typed routing reduce instability and localized error concentration, which is frequently observed when heterogeneous channels compete for the same modeling capacity.
|
||||
|
||||
% 5. Future Work
|
||||
\section{Future Work}
|
||||
\label{sec:future}
|
||||
Future work will further expand from "generating legal ICS feature sequences" to "data construction and adversarial evaluation for security tasks". The core contribution of this paper focuses on generating feature sequences that are temporally consistent, have credible distributions, and have legal discrete values under mixed types and multi-scale dynamics. However, in the actual research of intrusion detection and anomaly detection, the more critical bottleneck is often the lack of "illegal data/anomaly data" with clear attack semantics and sufficient coverage. Therefore, a direct and important extension direction is to use the legal sequences generated in this paper as a controllable and reproducible "base line operation flow", and then, on the premise of maintaining sequence-level legality and engineering constraints, inject or mix illegal behaviors according to specified attack patterns, thereby systematically constructing a dataset for training and evaluating the recognition of illegal data packets.
|
||||
|
||||
Specifically, attack injection can be upgraded from "simple perturbation" to "semantically consistent patterned rewriting": on continuous channels, implement bias injection, covert manipulation near thresholds, instantaneous mutations, and intermittent bursts, etc., so that it can both mimic the temporal characteristics pursued by attackers for concealment and not violate the basic boundary conditions of process dynamics; on discrete channels, implement illegal state transitions, alarm suppression/delayed triggering, pattern camouflage, etc., so that it reflects the trajectory morphology of "unreachable but forcibly created" under real control logic. Furthermore, the attack injection process itself can be coordinated with the type routing and constraint layer in this paper: for deterministically derived variables, illegal behaviors should be transmitted through the modification of upstream variables to maintain consistency; for supervised variables constrained by finite-state machines, interpretable illegal transitions should be generated through the "minimum violation path" or "controlled violation intensity", and violation points and violation types should be explicitly marked to facilitate downstream detection tasks to learn more fine-grained discrimination criteria.
|
||||
|
||||
In terms of method morphology, this direction also naturally supports stronger controllability and measurability: attack patterns can be regarded as conditional variables to uniformly conditionally orchestrate legitimate generation and illegal injection, generating control samples of "different attack strategies under the same legitimate framework", thereby transforming dataset construction into a repeatable scenario generation process; meanwhile, by controlling the injection location, duration, amplitude, and coupling range, the performance degradation curves of detectors under different threat intensities and different operating condition stages can be systematically scanned, forming a more stable benchmark than "single acquisition/single script". Ultimately, this approach will transform the legitimate data generation capabilities presented in this paper into the infrastructure for security research: first providing a shareable and reproducible legitimate operation distribution, then injecting illegal patterns with clear semantics in a controllable manner, producing a dataset with sufficient coverage and consistent annotation for training and evaluating models that identify illegal packets/abnormal sequences, and promoting the improvement of reproducibility and engineering credibility in this direction.
|
||||
|
||||
% 6. Conclusion
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
This paper addresses the data scarcity and shareability barriers that limit machine-learning research for industrial control system (ICS) security by proposing a practical synthetic telemetry generation framework at the protocol feature level. We introduced Mask-DDPM, a hybrid generator designed explicitly for the mixed-type and multi-scale nature of ICS data, where continuous process dynamics must remain temporally coherent while discrete supervisory variables must remain categorically legal by construction.
|
||||
|
||||
Our main contributions are: (i) a causal Transformer trend module that provides a stable long-horizon temporal scaffold for continuous channels; (ii) a trend-conditioned residual DDPM that focuses modeling capacity on local stochastic detail and marginal fidelity without destabilizing global structure; (iii) a masked (absorbing) diffusion branch for discrete variables that guarantees in-vocabulary outputs and supports semantics-aware conditioning on continuous context; and (iv) a type-aware decomposition/routing layer that aligns model mechanisms with heterogeneous ICS variable origins (e.g., process inertia, step-and-dwell setpoints, deterministic derived tags), enabling deterministic enforcement where appropriate and improving capacity allocation.
|
||||
|
||||
We evaluated the approach on windows derived from the HAI Security Dataset and reported mixed-type, protocol-relevant metrics rather than a single aggregate score. Across seeds, the model achieves stable fidelity with mean KS $= 0.3311 \pm 0.0079$ on continuous features, mean JSD $= 0.0284 \pm 0.0073$ on discrete features, and mean absolute lag-1 autocorrelation difference $0.2684 \pm 0.0027$, indicating that Mask-DDPM preserves both marginal distributions and short-horizon dynamics while maintaining discrete legality.
|
||||
|
||||
Overall, Mask-DDPM provides a reproducible foundation for generating shareable, semantically valid ICS feature sequences suitable for data augmentation, benchmarking, and downstream packet/trace reconstruction workflows. Building on this capability, a natural next step is to move from purely legal synthesis toward controllable scenario construction, including structured attack/violation injection under engineering constraints to support adversarial evaluation and more comprehensive security benchmarks.
|
||||
% References
|
||||
\bibliographystyle{IEEEtran}
|
||||
\bibliography{references}
|
||||
|
||||
\end{document}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -14,41 +14,49 @@
|
||||
\usepackage{cleveref} % smart cross-referencing
|
||||
\usepackage{lipsum} % Can be removed after putting your text content
|
||||
\usepackage{graphicx}
|
||||
\usepackage{natbib}
|
||||
\usepackage[numbers]{natbib}
|
||||
\usepackage{doi}
|
||||
|
||||
% Packages for equations
|
||||
\usepackage{amssymb}
|
||||
\usepackage{bm}
|
||||
\usepackage{array} % For column formatting
|
||||
\usepackage{caption} % Better caption spacing
|
||||
|
||||
% 标题
|
||||
\title{Your Paper Title: A Deep Learning Approach for Something}
|
||||
\title{Mask-DDPM: Transformer-Conditioned Mixed-Type Diffusion for Semantically Valid ICS Telemetry Synthesis}
|
||||
|
||||
% 若不需要日期,取消下面一行的注释
|
||||
%\date{}
|
||||
\date{}
|
||||
|
||||
\newif\ifuniqueAffiliation
|
||||
\uniqueAffiliationtrue
|
||||
|
||||
\ifuniqueAffiliation % 标准作者块
|
||||
\author{
|
||||
David S.~Hippocampus \\
|
||||
Department of Computer Science\\
|
||||
Cranberry-Lemon University\\
|
||||
Pittsburgh, PA 15213 \\
|
||||
\texttt{hippo@cs.cranberry-lemon.edu} \\
|
||||
Zhenglan Chen \\
|
||||
Aberdeen Institute of Data Science and Artificial Intelligence\\
|
||||
South China Normal University\\
|
||||
Guangzhou, Guangdong 510631, China \\
|
||||
\texttt{20223803054@m.scnu.edu.cn} \\
|
||||
\And
|
||||
Elias D.~Striatum \\
|
||||
Department of Electrical Engineering\\
|
||||
Mount-Sheikh University\\
|
||||
Santa Narimana, Levand \\
|
||||
\texttt{stariate@ee.mount-sheikh.edu} \\
|
||||
Mingzhe Yang \\
|
||||
Aberdeen Institute of Data Science and Artificial Intelligence\\
|
||||
South China Normal University\\
|
||||
Guangzhou, Guangdong 510631, China \\
|
||||
\texttt{20223803063@m.scnu.edu.cn} \\
|
||||
\And
|
||||
John Q.~Doe \\
|
||||
Department of Mathematics\\
|
||||
University of California, Berkeley\\
|
||||
Berkeley, CA 94720 \\
|
||||
\texttt{johndoe@math.berkeley.edu}
|
||||
Hongyu Yan \\
|
||||
Aberdeen Institute of Data Science and Artificial Intelligence\\
|
||||
South China Normal University\\
|
||||
Guangzhou, Guangdong 510631, China \\
|
||||
\texttt{20223803065@m.scnu.edu.cn}
|
||||
\And
|
||||
Huan Yang \\
|
||||
foo\\
|
||||
South China Normal University\\
|
||||
Guangzhou, Guangdong 510631, China \\
|
||||
\texttt{foo@bar.com} \\
|
||||
}
|
||||
\fi
|
||||
|
||||
@@ -67,16 +75,22 @@ pdfkeywords={Keyword1, Keyword2, Keyword3},
|
||||
\maketitle
|
||||
|
||||
\begin{abstract}
|
||||
Here is the abstract of your paper.
|
||||
Industrial control systems (ICS) security research is increasingly constrained by the scarcity and non-shareability of realistic traffic and telemetry, especially for attack scenarios. To mitigate this bottleneck, we study synthetic generation at the protocol feature/telemetry level, where samples must simultaneously preserve temporal coherence, match continuous marginal distributions, and keep discrete supervisory variables strictly within valid vocabularies. We propose Mask-DDPM, a hybrid framework tailored to mixed-type, multi-scale ICS sequences. Mask-DDPM factorizes generation into (i) a causal Transformer trend module that rolls out a stable long-horizon temporal scaffold for continuous channels, (ii) a trend-conditioned residual DDPM that refines local stochastic structure and heavy-tailed fluctuations without degrading global dynamics, (iii) a masked (absorbing) diffusion branch for discrete variables that guarantees categorical legality by construction, and (iv) a type-aware decomposition/routing layer that aligns modeling mechanisms with heterogeneous ICS variable origins and enforces deterministic reconstruction where appropriate. Evaluated on fixed-length windows ($L=96$) derived from the HAI Security Dataset, Mask-DDPM achieves stable fidelity across seeds with mean KS $= 0.3311 \pm 0.0079$ (continuous), mean JSD $= 0.0284 \pm 0.0073$ (discrete), and mean absolute lag-1 autocorrelation difference $= 0.2684 \pm 0.0027$, indicating faithful marginals, preserved short-horizon dynamics, and valid discrete semantics. The resulting generator provides a reproducible basis for data augmentation, benchmarking, and downstream ICS protocol reconstruction workflows.
|
||||
\end{abstract}
|
||||
|
||||
% 关键词
|
||||
\keywords{Machine Learning \and Cyber Defense \and Benchmark \and Methodology}
|
||||
\keywords{Machine Learning \and Cyber Defense \and ICS}
|
||||
|
||||
% 1. Introduction
|
||||
\section{Introduction}
|
||||
\label{sec:intro}
|
||||
Here introduces the background, problem statement, and contribution.
|
||||
Industrial control systems (ICS) form the backbone of modern critical infrastructure, which includes power grids, water treatment, manufacturing, and transportation, among others. These systems monitor, regulate, and automate physical processes through sensors, actuators, programmable logic controllers (PLCs), and monitoring software. Unlike conventional IT systems, ICS operate in real time, closely coupled with physical processes and safety‑critical constraints, using heterogeneous and legacy communication protocols such as Modbus/TCP and DNP3 that were not originally designed with robust security in mind. This architectural complexity and operational criticality make ICS high‑impact targets for cyber attacks, where disruptions can result in physical damage, environmental harm, and even loss of life. Recent reviews of ICS security highlight the expanding attack surface due to increased connectivity, legacy systems’ vulnerabilities, and the inadequacy of traditional security controls in capturing the nuances of ICS networks and protocols \citep{10.1007/s10844-022-00753-1, Nankya2023-gp}.
|
||||
|
||||
While machine learning (ML) techniques have shown promise for anomaly detection and automated cybersecurity within ICS, they rely heavily on labeled datasets that capture both benign operations and diverse attack patterns. In practice, real ICS traffic data, especially attack‑triggered captures, are scarce due to confidentiality, safety, and legal restrictions, and available public ICS datasets are few, limited in scope, or fail to reflect current threat modalities. For instance, the HAI Security Dataset provides operational telemetry and anomaly flags from a realistic control system setup for research purposes, but must be carefully preprocessed to derive protocol‑relevant features for ML tasks \citep{shin}. Data scarcity directly undermines model generalization, evaluation reproducibility, and the robustness of intrusion detection research, especially when training or testing ML models on realistic ICS behavior remains confined to small or outdated collections of examples \citep{info16100910}.
|
||||
|
||||
Synthetic data generation offers a practical pathway to mitigate these challenges. By programmatically generating feature‑level sequences that mimic the statistical and temporal structure of real ICS telemetry, researchers can augment scarce training sets, standardize benchmarking, and preserve operational confidentiality. Relative to raw packet captures, feature‑level synthesis abstracts critical protocol semantics and statistical patterns without exposing sensitive fields, making it more compatible with safety constraints and compliance requirements in ICS environments. Modern generative modeling, including diffusion models, has advanced significantly in producing high‑fidelity synthetic data across domains. Diffusion approaches, such as denoising diffusion probabilistic models, learn to transform noise into coherent structured samples and have been successfully applied to tabular or time series data synthesis with better stability and data coverage compared to adversarial methods \citep{pmlr-v202-kotelnikov23a, rasul2021autoregressivedenoisingdiffusionmodels}.
|
||||
|
||||
Despite these advances, most existing work either focuses on packet‑level generation \citep{jiang2023netdiffusionnetworkdataaugmentation} or is limited to generic tabular data \citep{pmlr-v202-kotelnikov23a}, rather than domain‑specific control sequence synthesis tailored for ICS protocols where temporal coherence, multi‑channel dependencies, and discrete protocol legality are jointly required. This gap motivates our focus on protocol feature-level generation for ICS, which involves synthesizing sequences of protocol-relevant fields conditioned on their temporal and cross-channel structure. In this work, we formulate a hybrid modeling pipeline that decouples long‑horizon trends and local statistical detail while preserving discrete semantics of protocol tokens. By combining causal Transformers with diffusion‑based refiners, and enforcing deterministic validity constraints during sampling, our framework generates semantically coherent, temporally consistent, and distributionally faithful ICS feature sequences. We evaluate features derived from the HAI Security Dataset and demonstrate that our approach produces high‑quality synthetic sequences suitable for downstream augmentation, benchmarking, and integration into packet‑construction workflows that respect realistic ICS constraints.
|
||||
|
||||
% 2. Related Work
|
||||
\section{Related Work}
|
||||
@@ -237,18 +251,62 @@ At inference time, generation follows the same structured order: (i) trend $\hat
|
||||
% 4. Benchmark
|
||||
\section{Benchmark}
|
||||
\label{sec:benchmark}
|
||||
In this section, we present the experimental setup and results.
|
||||
We evaluate the proposed pipeline on feature sequences derived from the HAI Security Dataset, using fixed-length windows ($L=96$) that preserve the mixed-type structure of ICS telemetry. The goal of this benchmark is not only to report “overall similarity”, but to justify why the proposed factorization is a better fit for protocol feature synthesis: continuous channels must match physical marginals \citep{coletta2023constrained}, discrete channels must remain semantically legal, and both must retain short-horizon dynamics that underpin state transitions and interlocks \citep{yang2001interlock}.
|
||||
|
||||
This emphasis reflects evaluation practice in time-series generation, where strong results are typically supported by multiple complementary views (marginal fidelity, dependency/temporal structure, and downstream plausibility), rather than a single aggregate score \citep{stenger2024survey}. In the ICS setting, this multi-view requirement is sharper: a generator that matches continuous marginals while emitting out-of-vocabulary supervisory tokens is unusable for protocol reconstruction, and a generator that matches marginals but breaks lag structure can produce temporally implausible command/response sequences.
|
||||
|
||||
Recent ICS time-series generators often emphasize aggregate similarity scores and utility-driven evaluations (e.g., anomaly-detection performance) to demonstrate realism, which is valuable but can under-specify mixed-type protocol constraints. Our benchmark complements these practices by making mixed-type legality and per-feature distributional alignment explicit: discrete outputs are evaluated as categorical distributions (JSD) and are constrained to remain within the legal vocabulary by construction, while continuous channels are evaluated with nonparametric distribution tests (KS) \citep{yoon2019timegan}. This combination provides a direct, protocol-relevant justification for the hybrid design, rather than relying on a single composite score that may mask discrete failures.
|
||||
|
||||
For continuous channels, we measure distributional alignment using the Kolmogorov–Smirnov (KS) statistic computed per feature between the empirical distributions of real and synthetic samples, and then averaged across features. For discrete channels, we quantify marginal fidelity with Jensen–Shannon divergence (JSD) \citep{lin1991divergence,yoon2019timegan} between categorical distributions per feature, averaged across discrete variables. To assess temporal realism, we compare lag-1 autocorrelation at the feature level and report the mean absolute difference between real and synthetic lag-1 autocorrelation, averaged across features. In addition, to avoid degenerate comparisons driven by near-constant tags, features whose empirical standard deviation falls below a small threshold are excluded from continuous KS aggregation; such channels carry limited distributional information and can distort summary statistics.
|
||||
|
||||
\subsection{Quantitative results}
|
||||
\label{sec:benchmark-quant}
|
||||
Across all runs, the mean continuous KS is 0.3311 (std 0.0079) and the mean discrete JSD is 0.0284 (std 0.0073), indicating that the generator preserves both continuous marginals and discrete semantic distributions at the feature level. Temporal consistency is similarly stable across runs, with a mean lag-1 autocorrelation difference of 0.2684 (std 0.0027), suggesting that the synthesized windows retain short-horizon dynamical structure \citep{ni2021sigwasserstein} instead of collapsing to marginal matching alone. The best-performing instance (by mean KS) attains 0.3224, and the small inter-seed variance shows that the reported fidelity is reproducible rather than driven by a single favorable initialization.
|
||||
\begin{figure}[htbp]
|
||||
\centering
|
||||
\includegraphics[width=0.8\textwidth]{fig-overall-benchmark-v1.png}
|
||||
\caption{Overall benchmark results: continuous KS, discrete JSD, and lag-1 autocorrelation difference, aggregated across seeds.}
|
||||
\label{fig:benchmark}
|
||||
\end{figure}
|
||||
|
||||
\begin{table}[htbp]
|
||||
\centering
|
||||
\caption{Summary of benchmark metrics. Lower values indicate better performance.}
|
||||
\label{tab:metrics}
|
||||
\begin{tabular}{@{}l l c c@{}}
|
||||
\toprule
|
||||
\textbf{Metric} & \textbf{Aggregation} & \textbf{Lower is better} & \textbf{Mean $\pm$ Std} \\
|
||||
\midrule
|
||||
KS (continuous) & mean over continuous features & \checkmark & 0.3311 $\pm$ 0.0079 \\
|
||||
JSD (discrete) & mean over discrete features & \checkmark & 0.0284 $\pm$ 0.0073 \\
|
||||
Abs $\Delta$ lag-1 autocorr & mean over features & \checkmark & 0.2684 $\pm$ 0.0027 \\
|
||||
\bottomrule
|
||||
\end{tabular}
|
||||
\end{table}
|
||||
|
||||
To make the benchmark actionable (and comparable to prior work), we report type-appropriate, interpretable statistics instead of collapsing everything into a single similarity score. This matters in mixed-type ICS telemetry: continuous fidelity can be high while discrete semantics fail, and vice versa. By separating continuous (KS), discrete (JSD), and temporal (lag-1) views, the evaluation directly matches the design goals of the hybrid generator: distributional refinement for continuous residuals, vocabulary-valid reconstruction for discrete supervision, and trend-induced short-horizon coherence.
|
||||
|
||||
In addition, the seed-averaged reporting mirrors evaluation conventions in recent diffusion-based time-series generation studies, where robustness across runs is increasingly treated as a first-class signal rather than an afterthought. In this sense, the small inter-seed variance is itself evidence that the factorized training and typed routing reduce instability and localized error concentration, which is frequently observed when heterogeneous channels compete for the same modeling capacity.
|
||||
|
||||
% 5. Future Work
|
||||
\section{Future Work}
|
||||
\label{sec:future}
|
||||
In this section, we outline directions for future work.
|
||||
Future work will expand from ``generating legal ICS feature sequences'' toward ``data construction and adversarial evaluation for security tasks''. The core contribution of this paper focuses on generating feature sequences that are temporally consistent, distributionally credible, and discretely legal under mixed types and multi-scale dynamics. However, in practical intrusion-detection and anomaly-detection research, the more critical bottleneck is often the lack of ``illegal/anomalous data'' with clear attack semantics and sufficient coverage. Therefore, a direct and important extension is to use the legal sequences generated in this paper as a controllable and reproducible ``baseline operation flow'', and then, while maintaining sequence-level legality and engineering constraints, inject or mix illegal behaviors according to specified attack patterns, thereby systematically constructing datasets for training and evaluating the recognition of illegal data packets.
|
||||
|
||||
Specifically, attack injection can be upgraded from ``simple perturbation'' to ``semantically consistent patterned rewriting'': on continuous channels, implement bias injection, covert manipulation near thresholds, instantaneous mutations, and intermittent bursts, so as to mimic the temporal characteristics attackers pursue for concealment without violating the basic boundary conditions of process dynamics; on discrete channels, implement illegal state transitions, alarm suppression/delayed triggering, and pattern camouflage, reflecting trajectory shapes that are ``unreachable but forcibly created'' under real control logic. Furthermore, the attack-injection process itself can be coordinated with the type routing and constraint layer in this paper: for deterministically derived variables, illegal behaviors should propagate through modifications of upstream variables to maintain consistency; for supervisory variables constrained by finite-state machines, interpretable illegal transitions should be generated via a ``minimum violation path'' or ``controlled violation intensity'', with violation points and violation types explicitly marked so that downstream detection tasks can learn finer-grained discrimination criteria.
|
||||
|
||||
In terms of method morphology, this direction also naturally supports stronger controllability and measurability: attack patterns can be treated as conditional variables that jointly orchestrate legitimate generation and illegal injection, producing control samples of ``different attack strategies under the same legitimate framework'' and thereby turning dataset construction into a repeatable scenario-generation process; meanwhile, by controlling the injection location, duration, amplitude, and coupling range, the performance-degradation curves of detectors under different threat intensities and operating-condition stages can be systematically scanned, forming a more stable benchmark than a ``single acquisition/single script'' setup. Ultimately, this approach will turn the legitimate data-generation capability presented in this paper into infrastructure for security research: first providing a shareable and reproducible legitimate operating distribution, then injecting semantically explicit illegal patterns in a controllable manner, producing datasets with sufficient coverage and consistent annotation for training and evaluating models that identify illegal packets/abnormal sequences, and improving the reproducibility and engineering credibility of this direction.
|
||||
|
||||
% 6. Conclusion
|
||||
\section{Conclusion}
|
||||
\label{sec:conclusion}
|
||||
In this section, we summarize our contributions and future directions.
|
||||
This paper addresses the data scarcity and shareability barriers that limit machine-learning research for industrial control system (ICS) security by proposing a practical synthetic telemetry generation framework at the protocol feature level. We introduced Mask-DDPM, a hybrid generator designed explicitly for the mixed-type and multi-scale nature of ICS data, where continuous process dynamics must remain temporally coherent while discrete supervisory variables must remain categorically legal by construction.
|
||||
|
||||
Our main contributions are: (i) a causal Transformer trend module that provides a stable long-horizon temporal scaffold for continuous channels; (ii) a trend-conditioned residual DDPM that focuses modeling capacity on local stochastic detail and marginal fidelity without destabilizing global structure; (iii) a masked (absorbing) diffusion branch for discrete variables that guarantees in-vocabulary outputs and supports semantics-aware conditioning on continuous context; and (iv) a type-aware decomposition/routing layer that aligns model mechanisms with heterogeneous ICS variable origins (e.g., process inertia, step-and-dwell setpoints, deterministic derived tags), enabling deterministic enforcement where appropriate and improving capacity allocation.
|
||||
|
||||
We evaluated the approach on windows derived from the HAI Security Dataset and reported mixed-type, protocol-relevant metrics rather than a single aggregate score. Across seeds, the model achieves stable fidelity with mean KS = 0.3311 ± 0.0079 on continuous features, mean JSD = 0.0284 ± 0.0073 on discrete features, and mean absolute lag-1 autocorrelation difference 0.2684 ± 0.0027, indicating that Mask-DDPM preserves both marginal distributions and short-horizon dynamics while maintaining discrete legality.
|
||||
|
||||
Overall, Mask-DDPM provides a reproducible foundation for generating shareable, semantically valid ICS feature sequences suitable for data augmentation, benchmarking, and downstream packet/trace reconstruction workflows. Building on this capability, a natural next step is to move from purely legal synthesis toward controllable scenario construction, including structured attack/violation injection under engineering constraints to support adversarial evaluation and more comprehensive security benchmarks.
|
||||
% 参考文献
|
||||
\bibliographystyle{unsrtnat}
|
||||
\bibliography{references}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
@inproceedings{vaswani2017attention,
|
||||
title={Attention Is All You Need},
|
||||
author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
|
||||
@@ -116,26 +117,136 @@
|
||||
url={https://csrc.nist.gov/pubs/sp/800/82/r3/final}
|
||||
}
|
||||
|
||||
@article{10.1145/1151659.1159928,
|
||||
author = {Vishwanath, Kashi Venkatesh and Vahdat, Amin},
|
||||
title = {Realistic and responsive network traffic generation},
|
||||
year = {2006},
|
||||
issue_date = {October 2006},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
volume = {36},
|
||||
number = {4},
|
||||
issn = {0146-4833},
|
||||
url = {https://doi.org/10.1145/1151659.1159928},
|
||||
doi = {10.1145/1151659.1159928},
|
||||
abstract = {This paper presents Swing, a closed-loop, network-responsive traffic generator that accurately captures the packet interactions of a range of applications using a simple structural model. Starting from observed traffic at a single point in the network, Swing automatically extracts distributions for user, application, and network behavior. It then generates live traffic corresponding to the underlying models in a network emulation environment running commodity network protocol stacks. We find that the generated traces are statistically similar to the original traces. Further, to the best of our knowledge, we are the first to reproduce burstiness in traffic across a range of timescales using a model applicable to a variety of network settings. An initial sensitivity analysis reveals the importance of capturing and recreating user, application, and network characteristics to accurately reproduce such burstiness. Finally, we explore Swing's ability to vary user characteristics, application properties, and wide-area network conditions to project traffic characteristics into alternate scenarios.},
|
||||
journal = {SIGCOMM Comput. Commun. Rev.},
|
||||
month = aug,
|
||||
pages = {111–122},
|
||||
numpages = {12},
|
||||
keywords = {burstiness, energy plot, generator, internet, modeling, structural model, traffic, wavelets}
}
|
||||
Reference for Introduction Part
|
||||
|
||||
@article{10.1007/s10844-022-00753-1,
|
||||
author = {Koay, Abigail M. Y. and Ko, Ryan K. L and Hettema, Hinne and Radke, Kenneth},
|
||||
title = {Machine learning in industrial control system (ICS) security: current landscape, opportunities and challenges},
|
||||
year = {2022},
|
||||
issue_date = {Apr 2023},
|
||||
publisher = {Kluwer Academic Publishers},
|
||||
address = {USA},
|
||||
volume = {60},
|
||||
number = {2},
|
||||
issn = {0925-9902},
|
||||
url = {https://doi.org/10.1007/s10844-022-00753-1},
|
||||
doi = {10.1007/s10844-022-00753-1},
|
||||
abstract = {The advent of Industry 4.0 has led to a rapid increase in cyber attacks on industrial systems and processes, particularly on Industrial Control Systems (ICS). These systems are increasingly becoming prime targets for cyber criminals and nation-states looking to extort large ransoms or cause disruptions due to their ability to cause devastating impact whenever they cease working or malfunction. Although myriads of cyber attack detection systems have been proposed and developed, these detection systems still face many challenges that are typically not found in traditional detection systems. Motivated by the need to better understand these challenges to improve current approaches, this paper aims to (1) understand the current vulnerability landscape in ICS, (2) survey current advancements of Machine Learning (ML) based methods with respect to the usage of ML base classifiers (3) provide insights to benefits and limitations of recent advancement with respect to two performance vectors; detection accuracy and attack variety. Based on our findings, we present key open challenges which will represent exciting research opportunities for the research community.},
|
||||
journal = {J. Intell. Inf. Syst.},
|
||||
month = oct,
|
||||
pages = {377–405},
|
||||
numpages = {29},
|
||||
keywords = {Operational technology, Cyber security, Dataset, Industrial control systems, Machine learning, Critical infrastructure}
|
||||
}
|
||||
|
||||
@ARTICLE{Nankya2023-gp,
|
||||
title = "Securing industrial Control Systems: Components, cyber threats,
|
||||
and machine learning-driven defense strategies",
|
||||
author = "Nankya, Mary and Chataut, Robin and Akl, Robert",
|
||||
abstract = "Industrial Control Systems (ICS), which include Supervisory
|
||||
Control and Data Acquisition (SCADA) systems, Distributed
|
||||
Control Systems (DCS), and Programmable Logic Controllers (PLC),
|
||||
play a crucial role in managing and regulating industrial
|
||||
processes. However, ensuring the security of these systems is of
|
||||
utmost importance due to the potentially severe consequences of
|
||||
cyber attacks. This article presents an overview of ICS
|
||||
security, covering its components, protocols, industrial
|
||||
applications, and performance aspects. It also highlights the
|
||||
typical threats and vulnerabilities faced by these systems.
|
||||
Moreover, the article identifies key factors that influence the
|
||||
design decisions concerning control, communication, reliability,
|
||||
and redundancy properties of ICS, as these are critical in
|
||||
determining the security needs of the system. The article
|
||||
outlines existing security countermeasures, including network
|
||||
segmentation, access control, patch management, and security
|
||||
monitoring. Furthermore, the article explores the integration of
|
||||
machine learning techniques to enhance the cybersecurity of ICS.
|
||||
Machine learning offers several advantages, such as anomaly
|
||||
detection, threat intelligence analysis, and predictive
|
||||
maintenance. However, combining machine learning with other
|
||||
security measures is essential to establish a comprehensive
|
||||
defense strategy for ICS. The article also addresses the
|
||||
challenges associated with existing measures and provides
|
||||
recommendations for improving ICS security. This paper becomes a
|
||||
valuable reference for researchers aiming to make meaningful
|
||||
contributions within the constantly evolving ICS domain by
|
||||
providing an in-depth examination of the present state,
|
||||
challenges, and potential future advancements.",
|
||||
journal = "Sensors (Basel)",
|
||||
publisher = "MDPI AG",
|
||||
volume = 23,
|
||||
number = 21,
|
||||
pages = "8840",
|
||||
month = oct,
|
||||
year = 2023,
|
||||
keywords = "SCADA; anomaly detection; artificial intelligence; attacks;
|
||||
cyber defense; cyber threats; industrial control systems;
|
||||
security; vulnerabilities",
|
||||
copyright = "https://creativecommons.org/licenses/by/4.0/",
|
||||
language = "en"
|
||||
}
|
||||
|
||||
@misc{shin,
|
||||
title = {HAI Security Dataset},
|
||||
url = {https://www.kaggle.com/dsv/5821622},
|
||||
doi = {10.34740/kaggle/dsv/5821622},
|
||||
publisher = {Kaggle},
|
||||
author = {Shin, Hyeok-Ki and Lee, Woomyo and Choi, Seungoh and Yun, Jeong-Han and Min, Byung Gil and Kim, HyoungChun},
|
||||
year = {2023}
|
||||
}
|
||||
|
||||
@Article{info16100910,
|
||||
AUTHOR = {Ali, Jokha and Ali, Saqib and Al Balushi, Taiseera and Nadir, Zia},
|
||||
TITLE = {Intrusion Detection in Industrial Control Systems Using Transfer Learning Guided by Reinforcement Learning},
|
||||
JOURNAL = {Information},
|
||||
VOLUME = {16},
|
||||
YEAR = {2025},
|
||||
NUMBER = {10},
|
||||
ARTICLE-NUMBER = {910},
|
||||
URL = {https://www.mdpi.com/2078-2489/16/10/910},
|
||||
ISSN = {2078-2489},
|
||||
ABSTRACT = {Securing Industrial Control Systems (ICSs) is critical, but it is made challenging by the constant evolution of cyber threats and the scarcity of labeled attack data in these specialized environments. Standard intrusion detection systems (IDSs) often fail to adapt when transferred to new networks with limited data. To address this, this paper introduces an adaptive intrusion detection framework that combines a hybrid Convolutional Neural Network and Long Short-Term Memory (CNN-LSTM) model with a novel transfer learning strategy. We employ a Reinforcement Learning (RL) agent to intelligently guide the fine-tuning process, which allows the IDS to dynamically adjust its parameters such as layer freezing and learning rates in real-time based on performance feedback. We evaluated our system in a realistic data-scarce scenario using only 50 labeled training samples. Our RL-Guided model achieved a final F1-score of 0.9825, significantly outperforming a standard neural fine-tuning model (0.861) and a target baseline model (0.759). Analysis of the RL agent’s behavior confirmed that it learned a balanced and effective policy for adapting the model to the target domain. We conclude that the proposed RL-guided approach creates a highly accurate and adaptive IDS that overcomes the limitations of static transfer learning methods. This dynamic fine-tuning strategy is a powerful and promising direction for building resilient cybersecurity defenses for critical infrastructure.},
|
||||
DOI = {10.3390/info16100910}
|
||||
}
|
||||
|
||||
@InProceedings{pmlr-v202-kotelnikov23a,
|
||||
title = {{T}ab{DDPM}: Modelling Tabular Data with Diffusion Models},
|
||||
author = {Kotelnikov, Akim and Baranchuk, Dmitry and Rubachev, Ivan and Babenko, Artem},
|
||||
booktitle = {Proceedings of the 40th International Conference on Machine Learning},
|
||||
pages = {17564--17579},
|
||||
year = {2023},
|
||||
editor = {Krause, Andreas and Brunskill, Emma and Cho, Kyunghyun and Engelhardt, Barbara and Sabato, Sivan and Scarlett, Jonathan},
|
||||
volume = {202},
|
||||
series = {Proceedings of Machine Learning Research},
|
||||
month = {23--29 Jul},
|
||||
publisher = {PMLR},
|
||||
pdf = {https://proceedings.mlr.press/v202/kotelnikov23a/kotelnikov23a.pdf},
|
||||
url = {https://proceedings.mlr.press/v202/kotelnikov23a.html},
|
||||
abstract = {Denoising diffusion probabilistic models are becoming the leading generative modeling paradigm for many important data modalities. Being the most prevalent in the computer vision community, diffusion models have recently gained some attention in other domains, including speech, NLP, and graph-like data. In this work, we investigate if the framework of diffusion models can be advantageous for general tabular problems, where data points are typically represented by vectors of heterogeneous features. The inherent heterogeneity of tabular data makes it quite challenging for accurate modeling since the individual features can be of a completely different nature, i.e., some of them can be continuous and some can be discrete. To address such data types, we introduce TabDDPM — a diffusion model that can be universally applied to any tabular dataset and handles any feature types. We extensively evaluate TabDDPM on a wide set of benchmarks and demonstrate its superiority over existing GAN/VAE alternatives, which is consistent with the advantage of diffusion models in other fields.}
|
||||
}
|
||||
|
||||
@misc{rasul2021autoregressivedenoisingdiffusionmodels,
|
||||
title={Autoregressive Denoising Diffusion Models for Multivariate Probabilistic Time Series Forecasting},
|
||||
author={Kashif Rasul and Calvin Seward and Ingmar Schuster and Roland Vollgraf},
|
||||
year={2021},
|
||||
eprint={2101.12072},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.LG},
|
||||
url={https://arxiv.org/abs/2101.12072},
|
||||
}
|
||||
|
||||
@misc{jiang2023netdiffusionnetworkdataaugmentation,
|
||||
title={NetDiffusion: Network Data Augmentation Through Protocol-Constrained Traffic Generation},
|
||||
author={Xi Jiang and Shinan Liu and Aaron Gember-Jacobson and Arjun Nitin Bhagoji and Paul Schmitt and Francesco Bronzino and Nick Feamster},
|
||||
year={2023},
|
||||
eprint={2310.08543},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.NI},
|
||||
url={https://arxiv.org/abs/2310.08543},
|
||||
}
|
||||
|
||||
Reference for Related Work
|
||||
|
||||
@inproceedings{10.1145/1159913.1159928,
|
||||
author = {Vishwanath, Kashi Venkatesh and Vahdat, Amin},
|
||||
title = {Realistic and responsive network traffic generation},
|
||||
@@ -160,51 +271,54 @@ series = {SIGCOMM '06}
|
||||
ISSN={0167-4048},
|
||||
url={http://dx.doi.org/10.1016/j.cose.2018.12.012},
|
||||
DOI={10.1016/j.cose.2018.12.012},
|
||||
journal={Computers \& Security},
|
||||
journal={Computers \& Security},
|
||||
publisher={Elsevier BV},
|
||||
author={Ring, Markus and Schlör, Daniel and Landes, Dieter and Hotho, Andreas},
|
||||
year={2019},
|
||||
month=may, pages={156–172} }
|
||||
month=may, pages={156–172}
|
||||
}
|
||||
|
||||
@inproceedings{10.1145/3544216.3544251,
|
||||
author = {Yin, Yucheng and Lin, Zinan and Jin, Minhao and Fanti, Giulia and Sekar, Vyas},
|
||||
title = {Practical GAN-based synthetic IP header trace generation using NetShare},
|
||||
year = {2022},
|
||||
isbn = {9781450394208},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
url = {https://doi.org/10.1145/3544216.3544251},
|
||||
doi = {10.1145/3544216.3544251},
|
||||
abstract = {We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare. We evaluate NetShare on six diverse packet header traces and find that: (1) across all distributional metrics and traces, it achieves 46\% more accuracy than baselines and (2) it meets users' requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.},
|
||||
booktitle = {Proceedings of the ACM SIGCOMM 2022 Conference},
|
||||
pages = {458–472},
|
||||
numpages = {15},
|
||||
keywords = {synthetic data generation, privacy, network packets, network flows, generative adversarial networks},
|
||||
location = {Amsterdam, Netherlands},
|
||||
series = {SIGCOMM '22}
|
||||
}
|
||||
@comment{duplicate-of-10.1145/3544216.3544251,
|
||||
author = {Yin, Yucheng and Lin, Zinan and Jin, Minhao and Fanti, Giulia and Sekar, Vyas},
|
||||
title = {Practical GAN-based synthetic IP header trace generation using NetShare},
|
||||
year = {2022},
|
||||
isbn = {9781450394208},
|
||||
publisher = {Association for Computing Machinery},
|
||||
address = {New York, NY, USA},
|
||||
url = {https://doi.org/10.1145/3544216.3544251},
|
||||
doi = {10.1145/3544216.3544251},
|
||||
abstract = {We explore the feasibility of using Generative Adversarial Networks (GANs) to automatically learn generative models to generate synthetic packet- and flow header traces for networking tasks (e.g., telemetry, anomaly detection, provisioning). We identify key fidelity, scalability, and privacy challenges and tradeoffs in existing GAN-based approaches. By synthesizing domain-specific insights with recent advances in machine learning and privacy, we identify design choices to tackle these challenges. Building on these insights, we develop an end-to-end framework, NetShare. We evaluate NetShare on six diverse packet header traces and find that: (1) across all distributional metrics and traces, it achieves 46\% more accuracy than baselines and (2) it meets users' requirements of downstream tasks in evaluating accuracy and rank ordering of candidate approaches.},
|
||||
booktitle = {Proceedings of the ACM SIGCOMM 2022 Conference},
|
||||
pages = {458--472},
|
||||
numpages = {15},
|
||||
keywords = {synthetic data generation, privacy, network packets, network flows, generative adversarial networks},
|
||||
location = {Amsterdam, Netherlands},
|
||||
series = {SIGCOMM '22}
|
||||
}
|
||||
|
||||
@inproceedings{Lin_2020, series={IMC '20},
|
||||
title={Using GANs for Sharing Networked Time Series Data: Challenges, Initial Promise, and Open Questions},
|
||||
url={http://dx.doi.org/10.1145/3419394.3423643},
|
||||
DOI={10.1145/3419394.3423643},
|
||||
booktitle={Proceedings of the ACM Internet Measurement Conference},
|
||||
publisher={ACM},
|
||||
author={Lin, Zinan and Jain, Alankar and Wang, Chen and Fanti, Giulia and Sekar, Vyas},
|
||||
year={2020},
|
||||
month=oct, pages={464--483},
|
||||
collection={IMC '20} }
|
||||
title={Using GANs for Sharing Networked Time Series Data: Challenges, Initial Promise, and Open Questions},
|
||||
url={http://dx.doi.org/10.1145/3419394.3423643},
|
||||
DOI={10.1145/3419394.3423643},
|
||||
booktitle={Proceedings of the ACM Internet Measurement Conference},
|
||||
publisher={ACM},
|
||||
author={Lin, Zinan and Jain, Alankar and Wang, Chen and Fanti, Giulia and Sekar, Vyas},
|
||||
year={2020},
|
||||
month=oct, pages={464--483},
|
||||
collection={IMC '20}
|
||||
}
|
||||
|
||||
@INPROCEEDINGS{7469060,
|
||||
author={Mathur, Aditya P. and Tippenhauer, Nils Ole},
|
||||
booktitle={2016 International Workshop on Cyber-physical Systems for Smart Water Networks (CySWater)},
|
||||
title={SWaT: a water treatment testbed for research and training on ICS security},
|
||||
year={2016},
|
||||
volume={},
|
||||
number={},
|
||||
pages={31-36},
|
||||
keywords={Sensors;Actuators;Feeds;Process control;Chemicals;Chemical sensors;Security;Cyber Physical Systems;Industrial Control Systems;Cyber Attacks;Cyber Defense;Water Testbed},
|
||||
doi={10.1109/CySWater.2016.7469060}}
|
||||
author={Mathur, Aditya P. and Tippenhauer, Nils Ole},
|
||||
booktitle={2016 International Workshop on Cyber-physical Systems for Smart Water Networks (CySWater)},
|
||||
title={SWaT: a water treatment testbed for research and training on ICS security},
|
||||
year={2016},
|
||||
volume={},
|
||||
number={},
|
||||
pages={31-36},
|
||||
keywords={Sensors;Actuators;Feeds;Process control;Chemicals;Chemical sensors;Security;Cyber Physical Systems;Industrial Control Systems;Cyber Attacks;Cyber Defense;Water Testbed},
|
||||
doi={10.1109/CySWater.2016.7469060}
|
||||
}
|
||||
|
||||
@inproceedings{10.1145/3055366.3055375,
|
||||
author = {Ahmed, Chuadhry Mujeeb and Palleti, Venkata Reddy and Mathur, Aditya P.},
|
||||
@@ -224,38 +338,6 @@ location = {Pittsburgh, Pennsylvania},
|
||||
series = {CySWATER '17}
|
||||
}
|
||||
|
||||
@inproceedings{NEURIPS2020_4c5bcfec,
|
||||
author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
|
||||
booktitle = {Advances in Neural Information Processing Systems},
|
||||
editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
|
||||
pages = {6840--6851},
|
||||
publisher = {Curran Associates, Inc.},
|
||||
title = {Denoising Diffusion Probabilistic Models},
|
||||
url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/4c5bcfec8584af0d967f1ab10179ca4b-Paper.pdf},
|
||||
volume = {33},
|
||||
year = {2020}
|
||||
}
|
||||
|
||||
@misc{song2021scorebasedgenerativemodelingstochastic,
|
||||
title={Score-Based Generative Modeling through Stochastic Differential Equations},
|
||||
author={Yang Song and Jascha Sohl-Dickstein and Diederik P. Kingma and Abhishek Kumar and Stefano Ermon and Ben Poole},
|
||||
year={2021},
|
||||
eprint={2011.13456},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.LG},
|
||||
url={https://arxiv.org/abs/2011.13456},
|
||||
}
|
||||
|
||||
@misc{rasul2021autoregressivedenoisingdiffusionmodels,
|
||||
title={Autoregressive Denoising Diffusion Models for Multivariate Probabilistic Time Series Forecasting},
|
||||
author={Kashif Rasul and Calvin Seward and Ingmar Schuster and Roland Vollgraf},
|
||||
year={2021},
|
||||
eprint={2101.12072},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.LG},
|
||||
url={https://arxiv.org/abs/2101.12072},
|
||||
}
|
||||
|
||||
@misc{tashiro2021csdiconditionalscorebaseddiffusion,
|
||||
title={CSDI: Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation},
|
||||
author={Yusuke Tashiro and Jiaming Song and Yang Song and Stefano Ermon},
|
||||
@@ -305,16 +387,17 @@ year = {2020}
|
||||
number={1},
|
||||
pages={257-271},
|
||||
keywords={Base stations;Diffusion models;Data models;Uncertainty;Predictive models;Generative adversarial networks;Knowledge graphs;Mobile computing;Telecommunication traffic;Semantics;Cellular traffic;data generation;diffusion model;spatio-temporal graph},
|
||||
doi={10.1109/TMC.2025.3591183}}
|
||||
doi={10.1109/TMC.2025.3591183}
|
||||
}
|
||||
|
||||
@misc{austin2023structureddenoisingdiffusionmodels,
|
||||
title={Structured Denoising Diffusion Models in Discrete State-Spaces},
|
||||
author={Jacob Austin and Daniel D. Johnson and Jonathan Ho and Daniel Tarlow and Rianne van den Berg},
|
||||
year={2023},
|
||||
eprint={2107.03006},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={cs.LG},
|
||||
url={https://arxiv.org/abs/2107.03006},
|
||||
}

@misc{hoogeboom2021argmaxflowsmultinomialdiffusion,
|
||||
title={Argmax Flows and Multinomial Diffusion: Learning Categorical Distributions},
|
||||
author={Emiel Hoogeboom and Didrik Nielsen and Priyank Jaini and Patrick Forré and Max Welling},
|
||||
year={2021},
|
||||
eprint={2102.05379},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={stat.ML},
|
||||
url={https://arxiv.org/abs/2102.05379},
|
||||
}
|
||||
|
||||
@misc{li2022diffusionlmimprovescontrollabletext,
|
||||
@@ -338,7 +421,7 @@ year = {2020}
|
||||
}
|
||||
|
||||
@misc{godefroid2017learnfuzzmachinelearninginput,
|
||||
title={Learn\&Fuzz: Machine Learning for Input Fuzzing},
|
||||
title={Learn\&Fuzz: Machine Learning for Input Fuzzing},
|
||||
author={Patrice Godefroid and Hila Peleg and Rishabh Singh},
|
||||
year={2017},
|
||||
eprint={1701.07232},
|
||||
@@ -357,16 +440,6 @@ year = {2020}
|
||||
url={https://arxiv.org/abs/1807.05620},
|
||||
}
|
||||
|
||||
@misc{hoogeboom2021argmaxflowsmultinomialdiffusion,
|
||||
title={Argmax Flows and Multinomial Diffusion: Learning Categorical Distributions},
|
||||
author={Emiel Hoogeboom and Didrik Nielsen and Priyank Jaini and Patrick Forré and Max Welling},
|
||||
year={2021},
|
||||
eprint={2102.05379},
|
||||
archivePrefix={arXiv},
|
||||
primaryClass={stat.ML},
|
||||
url={https://arxiv.org/abs/2102.05379},
|
||||
}
|
||||
|
||||
@misc{dai2019transformerxlattentivelanguagemodels,
|
||||
title={Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context},
|
||||
author={Zihang Dai and Zhilin Yang and Yiming Yang and Jaime Carbonell and Quoc V. Le and Ruslan Salakhutdinov},
|
||||
@@ -418,4 +491,65 @@ year = {2020}
|
||||
publisher={University Library in Kragujevac},
|
||||
author={Damjanović, Ivan and Milošević, Marko and Stevanović, Dragan},
|
||||
year={2023},
|
||||
pages={197--202} }
|
||||
pages={197--202}
|
||||
}
|
||||
|
||||
Reference for Benchmark
|
||||
|
||||
@article{stenger2024survey,
|
||||
title={Evaluation is key: a survey on evaluation measures for synthetic time series},
|
||||
author={Stenger, Michael and Leppich, Robert and Foster, Ian T and Kounev, Samuel and Bauer, Andre},
|
||||
journal={Journal of Big Data},
|
||||
volume={11},
|
||||
number={1},
|
||||
pages={66},
|
||||
year={2024},
|
||||
publisher={Springer}
|
||||
}
|
||||
|
||||
@article{lin1991divergence,
|
||||
title={Divergence measures based on the Shannon entropy},
|
||||
author={Lin, Jianhua},
|
||||
journal={IEEE Transactions on Information Theory},
|
||||
volume={37},
|
||||
number={1},
|
||||
pages={145--151},
|
||||
year={1991}
|
||||
}
|
||||
|
||||
@inproceedings{yoon2019timegan,
|
||||
title={Time-series generative adversarial networks},
|
||||
author={Yoon, Jinsung and Jarrett, Daniel and van der Schaar, Mihaela},
|
||||
booktitle={Advances in Neural Information Processing Systems},
|
||||
volume={32},
|
||||
year={2019}
|
||||
}
|
||||
|
||||
@article{ni2021sigwasserstein,
|
||||
title={Sig-Wasserstein GANs for time series generation},
|
||||
author={Ni, Hao and Szpruch, Lukasz and Wiese, Magnus and Liao, Shujian and Xiao, Baoren},
|
||||
journal={Proceedings of the ACM on Measurement and Analysis of Computing Systems},
|
||||
volume={5},
|
||||
number={3},
|
||||
pages={1--25},
|
||||
year={2021}
|
||||
}
|
||||
|
||||
@inproceedings{coletta2023constrained,
|
||||
title={On the constrained time-series generation problem},
|
||||
author={Coletta, Alessandro and Rossi, Roberto and others},
|
||||
booktitle={Advances in Neural Information Processing Systems},
|
||||
volume={36},
|
||||
year={2023}
|
||||
}
|
||||
|
||||
@article{yang2001interlock,
|
||||
title={Automatic verification of safety interlock systems for industrial processes},
|
||||
author={Yang, Sheng-Hong and Hsieh, Min-Chi},
|
||||
journal={Journal of Loss Prevention in the Process Industries},
|
||||
volume={14},
|
||||
number={6},
|
||||
pages={473--483},
|
||||
year={2001},
|
||||
publisher={Elsevier}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user