\documentclass[runningheads]{llncs}
\usepackage[T1]{fontenc}
\usepackage{graphicx}
\usepackage{amsmath}
\usepackage{amssymb}
\usepackage{amsfonts}
\usepackage{bm}
\usepackage{array}
\usepackage{booktabs}
\usepackage{microtype}
\usepackage{float}
\usepackage{url}

% Compatibility shim: the source manuscript uses natbib-style \citep commands.
\newcommand{\citep}[1]{\cite{#1}}
\title{Mask-DDPM: Transformer-Conditioned Mixed-Type Diffusion for Semantically Valid ICS Telemetry Synthesis}
\titlerunning{Mask-DDPM for ICS Telemetry Synthesis}

\author{Zhenglan Chen\inst{1} \and Mingzhe Yang\inst{1} \and Hongyu Yan\inst{1} \and Huan Yang\inst{2}}
\authorrunning{Z. Chen et al.}

\institute{Aberdeen Institute of Data Science and Artificial Intelligence, South China Normal University, Guangzhou, Guangdong 510631, China \\
\email{\{20223803054,20223803063,20223803065\}@m.scnu.edu.cn}
\and
School of Artificial Intelligence, South China Normal University, Guangzhou, Guangdong 510631, China \\
\email{huan.yang@m.scnu.edu.cn}}

\begin{document}
\maketitle
\begin{abstract}
Industrial control systems (ICS) security research is increasingly constrained by the scarcity and limited shareability of realistic communication traces and process measurements, especially for attack scenarios. To mitigate this bottleneck, we study synthetic generation at the protocol-feature and process-signal level, where samples must simultaneously preserve temporal coherence, match continuous marginal distributions, and keep discrete supervisory variables strictly within valid vocabularies. We propose Mask-DDPM, a hybrid framework tailored to mixed-type, multi-scale ICS sequences. Mask-DDPM factorizes generation into (i) a causal Transformer trend module that rolls out a stable long-range temporal scaffold for continuous channels, (ii) a trend-conditioned residual DDPM that refines local stochastic structure and heavy-tailed fluctuations without degrading global dynamics, (iii) a masked (absorbing) diffusion branch for discrete variables that guarantees valid symbol generation by construction, and (iv) a type-aware decomposition/routing layer that aligns modeling mechanisms with heterogeneous ICS variable origins and enforces deterministic reconstruction where appropriate. Evaluated on fixed-length windows ($L=96$) derived from the HAI Security Dataset, Mask-DDPM achieves stable fidelity across seeds with mean KS = 0.3311 $\pm$ 0.0079 (continuous), mean JSD = 0.0284 $\pm$ 0.0073 (discrete), and mean absolute lag-1 autocorrelation difference = 0.2684 $\pm$ 0.0027, indicating faithful marginals, preserved short-horizon dynamics, and valid discrete semantics. The resulting generator provides a reproducible basis for data augmentation, benchmarking, and downstream ICS protocol reconstruction workflows.
\keywords{Machine Learning \and Cyber Defense \and ICS}
\end{abstract}
\section{Introduction}
\label{sec:intro}
Industrial control systems (ICS) form the backbone of modern critical infrastructure, including power grids, water treatment, manufacturing, and transportation. These systems monitor, regulate, and automate physical processes through sensors, actuators, programmable logic controllers (PLCs), and monitoring software. Unlike conventional IT systems, ICS operate in real time, tightly coupled with physical processes and safety-critical constraints, and rely on heterogeneous legacy communication protocols such as Modbus/TCP and DNP3 that were not designed with robust security in mind. This architectural complexity and operational criticality make ICS high-impact targets for cyber attacks, where disruptions can result in physical damage, environmental harm, and even loss of life. Recent reviews of ICS security highlight the expanding attack surface due to increased connectivity, vulnerabilities in legacy systems, and the inadequacy of traditional security controls in capturing the nuances of ICS networks and protocols \citep{10.1007/s10844-022-00753-1,Nankya2023-gp}.
While machine learning (ML) techniques have shown promise for anomaly detection and automated cybersecurity within ICS, they rely heavily on labeled datasets that capture both benign operations and diverse attack patterns. In practice, real ICS traffic data, especially attack-triggered captures, are scarce due to confidentiality, safety, and legal restrictions, and available public ICS datasets are few, limited in scope, or fail to reflect current threat modalities. For instance, the HAI Security Dataset provides operational telemetry and anomaly flags from a realistic control system setup for research purposes, but must be carefully preprocessed to derive protocol-relevant features for ML tasks \citep{shin}. Data scarcity directly undermines model generalization, evaluation reproducibility, and the robustness of intrusion detection research, especially when training or testing ML models on realistic ICS behavior remains confined to small or outdated collections of examples \citep{info16100910}.
Synthetic data generation offers a practical pathway to mitigate these challenges. By programmatically generating feature-level sequences that mimic the statistical and temporal structure of real ICS telemetry, researchers can augment scarce training sets, standardize benchmarking, and preserve operational confidentiality. Relative to raw packet captures, feature-level synthesis abstracts critical protocol semantics and statistical patterns without exposing sensitive fields, making it more compatible with safety constraints and compliance requirements in ICS environments. Modern generative modeling, including diffusion models, has advanced significantly in producing high-fidelity synthetic data across domains. Diffusion approaches, such as denoising diffusion probabilistic models, learn to transform noise into coherent structured samples and have been successfully applied to tabular or time-series data synthesis with better stability and data coverage than adversarial methods \citep{pmlr-v202-kotelnikov23a,rasul2021autoregressivedenoisingdiffusionmodels}.
Despite these advances, most existing work either focuses on packet-level generation \citep{jiang2023netdiffusionnetworkdataaugmentation} or is limited to generic tabular data \citep{pmlr-v202-kotelnikov23a}, rather than domain-specific control sequence synthesis tailored for ICS protocols where temporal coherence, multi-channel dependencies, and discrete protocol legality are jointly required. This gap motivates our focus on protocol feature-level generation for ICS, which involves synthesizing sequences of protocol-relevant fields conditioned on their temporal and cross-channel structure. In this work, we formulate a hybrid modeling pipeline that decouples long-horizon trends and local statistical detail while preserving discrete semantics of protocol tokens. By combining causal Transformers with diffusion-based refiners, and enforcing deterministic validity constraints during sampling, our framework generates semantically coherent, temporally consistent, and distributionally faithful ICS feature sequences. We evaluate features derived from the HAI Security Dataset and demonstrate that our approach produces high-quality synthetic sequences suitable for downstream augmentation, benchmarking, and integration into packet-reconstruction workflows that respect realistic ICS constraints.
% 2. Related Work
\section{Related Work}
\label{sec:related}
Early work on ``realistic'' network data generation mostly remained at the packet/flow-header level, relying on replay or statistical synthesis from single-point observations. Swing extracts user/application/network distributions from single-point observations in a closed-loop, network-responsive manner to reproduce burstiness and correlation across multiple time scales \citep{10.1145/1151659.1159928,10.1145/1159913.1159928}. Subsequent work advanced header synthesis to learning-based generation: a WGAN-based method added explicit verification of protocol field consistency to NetFlow/IPFIX \citep{Ring_2019}, NetShare recast header modeling as flow-level time series and improved fidelity and scalability through domain encoding and parallel fine-tuning \citep{10.1145/3544216.3544251}, and DoppelGANger preserved the long-range structure and downstream-task consistency of networked time series by decoupling attributes from sequences \citep{Lin_2020}. In industrial control system (ICS) scenarios, however, raw PCAPs are usually not shareable, and public testbeds (such as SWaT and WADI) mostly provide process/monitoring telemetry and protocol interactions for security assessment, emphasizing operational variables rather than packet-level traces \citep{7469060,10.1145/3055366.3055375}. This makes protocol- and semantics-aware synthesis at the feature/telemetry level both more feasible and more necessary in practice: the goal is to reproduce high-level distributions and multi-scale temporal patterns according to operational semantics and physical constraints, without relying on the original packets. From this perspective, the generation paradigm naturally shifts from packet-syntax reproduction to modeling high-level spatio-temporal distributions and uncertainties, which requires stable training, strong distribution fitting, and interpretable uncertainty characterization.
Diffusion models fit this path well: DDPM achieves high-quality sampling and stable optimization through an efficient $\epsilon$ parameterization and a weighted variational objective \citep{NEURIPS2020_4c5bcfec}, and the SDE perspective unifies score-based and diffusion models, providing likelihood evaluation and predictor-corrector sampling strategies based on probability-flow ODEs \citep{song2021scorebasedgenerativemodelingstochastic}. For time series, TimeGrad replaces constrained output distributions with conditional denoising, capturing high-dimensional correlations at each step \citep{rasul2021autoregressivedenoisingdiffusionmodels}; CSDI performs explicitly conditional diffusion and uses two-dimensional attention to exploit temporal and cross-feature dependencies simultaneously, making it well suited to conditioning and imputation \citep{tashiro2021csdiconditionalscorebaseddiffusion}; for more general spatio-temporal structure, DiffSTG generalizes diffusion to spatio-temporal graphs, combining TCN/GCN with a denoising U-Net to improve CRPS and inference efficiency non-autoregressively \citep{wen2024diffstgprobabilisticspatiotemporalgraph}, and PriSTI further enhances conditional features and geographic relationships, remaining robust under high missing rates and sensor failures \citep{liu2023pristiconditionaldiffusionframework}; for long sequences in continuous domains, DiffWave shows that diffusion can match the quality of strong vocoders under fast non-autoregressive synthesis \citep{kong2021diffwaveversatilediffusionmodel}; and studies on cellular communication traffic show that diffusion can recover spatio-temporal patterns and provide uncertainty characterization at urban scale \citep{11087622}.
Together, these results point to one conclusion: when the research focus is on telemetry and high-level features rather than raw messages, diffusion models provide stable, fine-grained distribution fitting and uncertainty quantification, which is exactly what ICS telemetry synthesis requires. At the same time, entrusting all structure to a monolithic diffusion model is inadvisable: long-range temporal skeletons and fine-grained marginal distributions are often in optimization tension and should be explicitly decoupled in the model.
Looking further into the mechanistic complexity of ICS: its channels are inherently mixed, containing both continuous process trajectories and discrete supervisory/status variables, and discrete channels must remain legal under operational constraints. The progress in time-series diffusion above has occurred mainly in continuous spaces, but discrete diffusion has also developed systematic methods: D3PM improves sampling quality and likelihood through absorbing/masking and structured transitions in discrete state spaces \citep{austin2023structureddenoisingdiffusionmodels}, subsequent masked diffusion provides stable reconstruction on categorical data in a simplified form \citep{shi2024simplified}, multinomial diffusion defines diffusion directly on a finite vocabulary through mechanisms such as argmax flows \citep{hoogeboom2021argmaxflowsmultinomialdiffusion}, and Diffusion-LM demonstrates an effective path to controllable text generation by imposing gradient constraints in continuous latent spaces \citep{li2022diffusionlmimprovescontrollabletext}. From the perspective of protocols and finite-state machines, coverage-guided fuzzing emphasizes the criticality of sequence legality and state coverage \citep{meng2025aflnetyearslatercoverageguided,godefroid2017learnfuzzmachinelearninginput,she2019neuzzefficientfuzzingneural}, echoing the ``legality by construction'' principle of discrete diffusion: preferentially adopt absorbing/masking diffusion on discrete channels, supplemented by type-aware conditioning and sampling constraints, to avoid the semantic invalidity and marginal distortion caused by post hoc thresholding.
From the perspective of high-level synthesis, temporal structure is equally indispensable: ICS control often involves delay effects, phased operating conditions, and cross-channel coupling, requiring models to characterize low-frequency, long-range dependencies while overlaying multi-faceted fine-grained fluctuations on them. The Transformer family has provided ample evidence in long-sequence time-series tasks: Transformer-XL breaks the fixed-length context limitation through a reusable memory mechanism and significantly enhances long-range dependency modeling \citep{dai2019transformerxlattentivelanguagemodels}; Informer uses ProbSparse attention and efficient decoding to balance span and efficiency in long-sequence forecasting \citep{zhou2021informerefficienttransformerlong}; Autoformer robustly models long-term seasonality and trends through autocorrelation and decomposition mechanisms \citep{wu2022autoformerdecompositiontransformersautocorrelation}; FEDformer further improves long-horizon forecasting via frequency-domain enhancement and decomposition \citep{zhou2022fedformerfrequencyenhanceddecomposed}; and PatchTST improves the stability and generalization of long-sequence multivariate forecasting through local patch representations and channel-independent modeling \citep{nie2023patchtst}. Combined with our positioning of diffusion above, this chain of evidence points to a natural division of labor: use attention-based sequence models to first extract stable low-frequency trends and conditions (long-range skeletons), then let diffusion focus on marginals and details in the residual space; meanwhile, apply masked/absorbing diffusion to supervisory/mode variables to ensure vocabulary legality by construction.
This design inherits the advantages of time-series diffusion in distribution fitting and uncertainty characterization \citep{rasul2021autoregressivedenoisingdiffusionmodels,tashiro2021csdiconditionalscorebaseddiffusion,wen2024diffstgprobabilisticspatiotemporalgraph,liu2023pristiconditionaldiffusionframework,kong2021diffwaveversatilediffusionmodel,11087622} while stabilizing the macroscopic temporal support through the Transformer's long-range attention, yielding an operational, integrated generation pipeline under the mixed types and multi-scale dynamics of ICS.
% 3. Methodology
\section{Methodology}
\label{sec:method}
Industrial control system (ICS) telemetry is intrinsically mixed-type and mechanistically heterogeneous: continuous process trajectories (e.g., sensor and actuator signals) coexist with discrete supervisory states (e.g., modes, alarms, interlocks), and the underlying generating mechanisms range from physical inertia to program-driven step logic. This heterogeneity is not cosmetic: it directly affects what realistic synthesis means, because a generator must jointly satisfy (i) temporal coherence, (ii) distributional fidelity, and (iii) discrete semantic validity (i.e., every discrete output must belong to its legal vocabulary by construction). These properties are emphasized broadly in operational-technology security guidance and ICS engineering practice, where state logic and physical dynamics are tightly coupled \citep{nist2023sp80082}.
We model each training instance as a fixed-length window of length $L$, comprising continuous channels $\bm{X} \in \mathbb{R}^{L \times d_c}$ and discrete channels $\bm{Y} = \{y^{(j)}_{1:L}\}_{j=1}^{d_d}$, where each discrete variable satisfies $y^{(j)}_t \in \mathcal{V}_j$ for a finite vocabulary $\mathcal{V}_j$. Our objective is to learn a generator that produces synthetic $(\hat{\bm{X}}, \hat{\bm{Y}})$ that are simultaneously coherent and distributionally faithful, while also ensuring $\hat{y}^{(j)}_t\in\mathcal{V}_j$ for all $j$, $t$ by construction (rather than via post-hoc rounding or thresholding).
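As a concrete illustration of this data layout, the following Python sketch slices a long mixed-type recording into fixed-length windows and checks discrete legality up front; the helper name `make_windows` and the window/stride choices are illustrative, not part of the paper's pipeline:

```python
import numpy as np

def make_windows(x_cont, y_disc, vocabs, L=96, stride=96):
    """Slice a long mixed-type recording into fixed-length windows.

    x_cont: (T, d_c) float array of continuous channels X.
    y_disc: (T, d_d) int array of discrete channels Y.
    vocabs: list of d_d sets, the legal vocabulary V_j per discrete channel.
    Returns lists of (L, d_c) and (L, d_d) windows.
    """
    T = x_cont.shape[0]
    X_wins, Y_wins = [], []
    for start in range(0, T - L + 1, stride):
        xw = x_cont[start:start + L]
        yw = y_disc[start:start + L]
        # Enforce the validity requirement y_t^(j) in V_j up front, so the
        # generator is trained only on legal symbols.
        for j, vocab in enumerate(vocabs):
            assert set(np.unique(yw[:, j])).issubset(vocab), f"illegal token in channel {j}"
        X_wins.append(xw)
        Y_wins.append(yw)
    return X_wins, Y_wins
```

With $L=96$ and a non-overlapping stride, a 300-step recording yields three windows; overlapping strides simply trade window count against redundancy.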
A key empirical and methodological tension in ICS synthesis is that temporal realism and marginal/distributional realism can compete when optimized monolithically: sequence models trained primarily for regression often over-smooth heavy tails and intermittent bursts, while purely distribution-matching objectives can erode long-range structure. Diffusion models provide a principled route to rich distribution modeling through iterative denoising, but they do not, by themselves, resolve (i) the need for a stable low-frequency temporal scaffold, nor (ii) the discrete legality constraints for supervisory variables \citep{ho2020denoising,song2021score}. Recent time-series diffusion work further suggests that separating coarse structure from stochastic refinement can be an effective inductive bias for long-horizon realism \citep{kollovieh2023tsdiff,sikder2023transfusion}. Figure~\ref{fig:design} summarizes how our framework maps these requirements into a staged generator for mixed-type ICS telemetry.
\begin{figure}[htbp]
\centering
\includegraphics[width=0.8\textwidth]{fig-design-v4-from-user-svg-cropped.pdf}
\caption{Mask-DDPM: unified synthesis pipeline for ICS telemetry}
\label{fig:design}
\end{figure}
Motivated by these considerations, we propose Mask-DDPM, organized in the following order:
\begin{enumerate}
\item Transformer trend module: learns the dominant temporal backbone of continuous dynamics via attention-based sequence modeling \citep{vaswani2017attention}.

\item Residual DDPM for continuous variables: models distributional detail as stochastic residual structure conditioned on the learned trend \citep{ho2020denoising,kollovieh2023tsdiff}.

\item Masked diffusion for discrete variables: generates discrete ICS states with an absorbing/masking corruption process and categorical reconstruction \citep{austin2021structured,shi2024simplified}.

\item Type-aware decomposition: a type-aware factorization and routing layer that assigns variables to the most appropriate modeling mechanism and enforces deterministic constraints where warranted.
\end{enumerate}
This ordering is intentional. The trend module establishes a macro-temporal scaffold; residual diffusion then concentrates capacity on micro-structure and marginal fidelity; masked diffusion provides a native mechanism for discrete legality; and the type-aware layer operationalizes the observation that not all ICS variables should be modeled with the same stochastic mechanism. As shown in Figure~\ref{fig:design}, these components are arranged sequentially so that temporal scaffolding, residual refinement, and discrete legality are enforced in complementary rather than competing stages. Importantly, while diffusion-based generation for ICS telemetry has begun to emerge, existing approaches remain limited and typically emphasize continuous synthesis or augmentation; in contrast, our pipeline integrates (i) a Transformer-conditioned residual diffusion backbone, (ii) a discrete masked-diffusion branch, and (iii) explicit type-aware routing for heterogeneous variable mechanisms within a single coherent generator \citep{yuan2025ctu,sha2026ddpm}.
\subsection{Transformer trend module for continuous dynamics}
\label{sec:method-trans}
We instantiate the temporal backbone as a causal Transformer trend extractor, leveraging self-attention's ability to represent long-range dependencies and cross-channel interactions without recurrence \citep{vaswani2017attention}. Compared with recurrent trend extractors (e.g., GRU-style backbones), a Transformer trend module offers a direct mechanism to model delayed effects and multivariate coupling common in ICS, where control actions may influence downstream sensors with nontrivial lags and regime-dependent propagation \citep{vaswani2017attention,nist2023sp80082}. Crucially, in our design the Transformer is not asked to be the entire generator; instead, it serves a deliberately restricted role: providing a stable, temporally coherent conditioning signal that later stochastic components refine.
For continuous channels $\bm{X}$, we posit an additive decomposition:
\begin{equation}
\bm{X} = \bm{S} + \bm{R},
\label{eq:additive_decomp}
\end{equation}
where $\bm{S} \in \mathbb{R}^{L \times d_c}$ is a smooth trend capturing predictable temporal evolution, and $\bm{R} \in \mathbb{R}^{L \times d_c}$ is a residual capturing distributional detail (e.g., bursts, heavy tails, local fluctuations) that is difficult to represent robustly with a purely regression-based temporal objective. This separation reflects an explicit division of labor: the trend module prioritizes temporal coherence, while diffusion (introduced next) targets distributional realism at the residual level, a strategy aligned with predict-then-refine perspectives in time-series diffusion modeling \citep{kollovieh2023tsdiff,sikder2023transfusion}.
We parameterize the trend $\bm{S}$ with a causal Transformer $f_\phi$. With teacher forcing, we train $f_{\phi}$ to predict the next-step trend from past observations,
\begin{equation}
\hat{\bm{S}}_{t+1} = f_{\phi}(\bm{X}_{1:t}), \quad t = 1, \dots, L-1,
\label{eq:trend_prediction}
\end{equation}
using the mean-squared error objective
\begin{equation}
\mathcal{L}_{\text{trend}}(\phi) = \frac{1}{(L-1)d_c} \sum_{t=1}^{L-1} \bigl\| \hat{\bm{S}}_{t+1} - \bm{X}_{t+1} \bigr\|_2^2.
\label{eq:trend_loss}
\end{equation}
At inference, we roll out the Transformer autoregressively to obtain $\hat{\bm{S}}$, and then define the residual target for diffusion as $\bm{R} = \bm{X} - \hat{\bm{S}}$. This setup intentionally locks in a coherent low-frequency scaffold before any stochastic refinement is applied, thereby reducing the burden on downstream diffusion modules to simultaneously learn both long-range structure and marginal detail. In this sense, our use of Transformers is distinctive: it is a conditioning-first temporal backbone designed to stabilize mixed-type diffusion synthesis in ICS, rather than an end-to-end monolithic generator \citep{vaswani2017attention,kollovieh2023tsdiff,yuan2025ctu}.
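To make this division of labor concrete, the following sketch substitutes a one-lag linear least-squares predictor for the causal Transformer $f_\phi$ (a deliberate simplification; `fit_trend` and `rollout` are illustrative names) and shows the teacher-forcing fit, the autoregressive rollout, and the residual handed to the diffusion stage:

```python
import numpy as np

def fit_trend(X):
    """Fit a next-step predictor S_hat_{t+1} = A^T [X_t; 1] on one window X of shape (L, d).

    Stands in for training f_phi with the teacher-forcing MSE objective.
    """
    past, future = X[:-1], X[1:]
    design = np.hstack([past, np.ones((len(past), 1))])  # append bias column
    A = np.linalg.lstsq(design, future, rcond=None)[0]
    return A

def rollout(A, x0, L):
    """Autoregressive rollout of the trend scaffold S_hat from an initial state."""
    S = [x0]
    for _ in range(L - 1):
        S.append(np.hstack([S[-1], 1.0]) @ A)
    return np.stack(S)

# Toy multichannel signal: phase-shifted sinusoids plus small noise.
L, d = 96, 3
t = np.arange(L)[:, None]
X = np.sin(0.1 * t + np.arange(d)) + 0.05 * np.random.default_rng(0).normal(size=(L, d))

A = fit_trend(X)
S_hat = rollout(A, X[0], L)
R = X - S_hat                                   # residual target for the conditional DDPM
trend_mse = np.mean((S_hat[1:] - X[1:]) ** 2)   # analogue of the trend loss
```

The point of the sketch is structural: the scaffold is produced first and frozen, and only the residual `R` is exposed to stochastic refinement.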
\subsection{DDPM for continuous residual generation}
\label{sec:method-ddpm}
We model the residual $\bm{R}$ with a denoising diffusion probabilistic model (DDPM) conditioned on the trend $\hat{\bm{S}}$ \citep{ho2020denoising}. Diffusion models learn complex data distributions by inverting a tractable noising process through iterative denoising, and have proven effective at capturing multimodality and heavy-tailed structure that is often attenuated by purely regression-based sequence models \citep{ho2020denoising,song2021score}. Conditioning the diffusion model on $\hat{\bm{S}}$ is central: it prevents the denoiser from re-learning the low-frequency scaffold and focuses capacity on residual micro-structure, mirroring the broader principle that diffusion excels as a distributional corrector when a reasonable coarse structure is available \citep{kollovieh2023tsdiff,sikder2023transfusion}.
Let $K$ denote the number of diffusion steps, with a noise schedule $\{\beta_k\}_{k=1}^K$, $\alpha_k = 1 - \beta_k$, and $\bar{\alpha}_k = \prod_{i=1}^k \alpha_i$. The forward corruption process is
\begin{equation}
q(\bm{r}_k \mid \bm{r}_0) = \mathcal{N}\bigl( \bm{r}_k;\; \sqrt{\bar{\alpha}_k}\,\bm{r}_0,\; (1 - \bar{\alpha}_k)\mathbf{I} \bigr),
\label{eq:forward_corruption}
\end{equation}
or, equivalently,
\begin{equation}
\bm{r}_k = \sqrt{\bar{\alpha}_k}\,\bm{r}_0 + \sqrt{1 - \bar{\alpha}_k}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}).
\label{eq:forward_corruption_eq}
\end{equation}
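For concreteness, the closed-form corruption can be sampled in one shot as below; the linear $\beta$ schedule and the value of $K$ are illustrative assumptions, not the paper's tuned settings:

```python
import numpy as np

# Illustrative noise schedule: K steps, linearly spaced betas (an assumption).
K = 100
betas = np.linspace(1e-4, 0.02, K)
alphas_bar = np.cumprod(1.0 - betas)   # bar-alpha_k = prod_i (1 - beta_i)

def corrupt(r0, k, rng):
    """Sample r_k ~ q(r_k | r_0) in closed form at step k (0-indexed)."""
    eps = rng.normal(size=r0.shape)
    r_k = np.sqrt(alphas_bar[k]) * r0 + np.sqrt(1.0 - alphas_bar[k]) * eps
    return r_k, eps   # eps is the target of the epsilon-prediction objective
```

Note that `alphas_bar` decays monotonically, so late steps are dominated by noise, which is what makes starting the reverse process from pure Gaussian noise valid.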
The learned reverse process is parameterized as
\begin{equation}
p_{\theta}(\bm{r}_{k-1} \mid \bm{r}_k, \hat{\bm{S}}) = \mathcal{N}\bigl( \boldsymbol{\mu}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}),\; \boldsymbol{\Sigma}(k) \bigr),
\label{eq:reverse_process}
\end{equation}
where $\boldsymbol{\mu}_{\theta}$ is implemented by a Transformer denoiser that consumes (i) the noised residual $\bm{r}_k$, (ii) a timestep embedding for $k$, and (iii) conditioning features derived from $\hat{\bm{S}}$. This denoiser architecture is consistent with the growing use of attention-based denoisers for long-context time-series diffusion; our key methodological emphasis is the trend-conditioned residual factorization as the object of diffusion learning \citep{ho2020denoising,sikder2023transfusion}.
We train the denoiser using the standard DDPM $\epsilon$-prediction objective:
\begin{equation}
\mathcal{L}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right].
\label{eq:ddpm_loss}
\end{equation}
Because diffusion optimization can exhibit timestep imbalance (i.e., some timesteps dominate gradients), we optionally apply an SNR-based reweighting consistent with Min-SNR training:
\begin{equation}
\mathcal{L}^{\text{snr}}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ w_k \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right],
\label{eq:snr_loss}
\end{equation}
where $w_k=\min(\mathrm{SNR}_k,\gamma)/\mathrm{SNR}_k$ for $\epsilon$-prediction, $\mathrm{SNR}_k=\bar{\alpha}_k/(1-\bar{\alpha}_k)$, and $\gamma>0$ is a cap parameter \citep{hang2023efficient}.
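The capped weights are a one-liner to compute; the schedule below and $\gamma=5$ (a common choice in the Min-SNR literature) are illustrative assumptions:

```python
import numpy as np

# Min-SNR-gamma weights for the epsilon-prediction loss:
#   w_k = min(SNR_k, gamma) / SNR_k,  SNR_k = bar-alpha_k / (1 - bar-alpha_k).
betas = np.linspace(1e-4, 0.02, 100)   # illustrative linear schedule
alphas_bar = np.cumprod(1.0 - betas)
snr = alphas_bar / (1.0 - alphas_bar)
gamma = 5.0
w = np.minimum(snr, gamma) / snr
```

Early (high-SNR) timesteps, which would otherwise dominate the gradient, are strongly downweighted, while late (low-SNR) timesteps keep weight one.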
After sampling $\hat{\bm{R}}$ by reverse diffusion, we reconstruct the continuous output as $\hat{\bm{X}} = \hat{\bm{S}} + \hat{\bm{R}}$. Overall, the DDPM component serves as a distributional corrector on top of a temporally coherent backbone, which is particularly suited to ICS where low-frequency dynamics are strong and persistent but fine-scale variability (including bursts and regime-conditioned noise) remains important for realism. Relative to prior ICS diffusion efforts that primarily focus on continuous augmentation, our formulation elevates trend-conditioned residual diffusion as a modular mechanism for disentangling temporal structure from distributional refinement \citep{yuan2025ctu,sha2026ddpm}.
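A minimal ancestral-sampling sketch of this reverse process and reconstruction is given below; the zero-output `eps_theta` is a placeholder for the trained trend-conditioned denoiser, and fixing $\boldsymbol{\Sigma}(k)=\beta_k\mathbf{I}$ is one standard (assumed) choice:

```python
import numpy as np

# Illustrative schedule (an assumption, not the paper's tuned values).
K = 100
betas = np.linspace(1e-4, 0.02, K)
alphas = 1.0 - betas
alphas_bar = np.cumprod(alphas)

def sample_residual(eps_theta, S_hat, shape, rng):
    """Ancestral sampling of R_hat: start from Gaussian noise and denoise step by step."""
    r = rng.normal(size=shape)                      # r_K ~ N(0, I)
    for k in range(K - 1, -1, -1):
        eps_hat = eps_theta(r, k, S_hat)            # predicted noise at step k
        # Posterior mean of the standard DDPM reverse step.
        mean = (r - betas[k] / np.sqrt(1.0 - alphas_bar[k]) * eps_hat) / np.sqrt(alphas[k])
        noise = rng.normal(size=shape) if k > 0 else 0.0
        r = mean + np.sqrt(betas[k]) * noise        # fixed Sigma(k) = beta_k * I
    return r

rng = np.random.default_rng(0)
S_hat = np.zeros((96, 3))                           # trend scaffold from the Transformer
R_hat = sample_residual(lambda r, k, S: np.zeros_like(r), S_hat, (96, 3), rng)
X_hat = S_hat + R_hat                               # final continuous output
```

In the full pipeline, `S_hat` comes from the autoregressive trend rollout, so the reverse chain only has to supply residual micro-structure.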
\subsection{Masked diffusion for discrete ICS variables}
\label{sec:method-discrete}
Discrete ICS variables must remain categorical, making Gaussian diffusion inappropriate for supervisory states and mode-like channels. While one can attempt continuous relaxations or post-hoc discretization, such strategies risk producing semantically invalid intermediate states (e.g., in-between modes) and can distort the discrete marginal distribution. Discrete-state diffusion provides a principled alternative by defining a valid corruption process directly on categorical variables \citep{austin2021structured,shi2024simplified}. In the ICS setting, this is not a secondary detail: supervisory tags often encode control logic boundaries (modes, alarms, interlocks) that must remain within a finite vocabulary to preserve semantic correctness \citep{nist2023sp80082}.
We therefore adopt masked (absorbing) diffusion for discrete channels, where corruption replaces tokens with a special $\texttt{[MASK]}$ symbol according to a schedule \citep{shi2024simplified}. For each variable $j$, define a masking schedule $\{m_k\}_{k=1}^K$ (with $m_k\in[0,1]$) increasing in $k$. The forward corruption process is:
\begin{equation}
q(y^{(j)}_k \mid y^{(j)}_0) =
\begin{cases}
y^{(j)}_0, & \text{with probability } 1 - m_k, \\
\texttt{[MASK]}, & \text{with probability } m_k,
\end{cases}
\label{eq:masking_process}
\end{equation}
applied independently across $j$ and $t$. Let $\mathcal{M}$ denote the set of masked positions at step $k$. The denoiser $h_{\psi}$ predicts a categorical distribution over $\mathcal{V}_j$ for each masked token, conditioned on (i) the corrupted discrete sequence, (ii) the diffusion step $k$, and (iii) continuous context. Concretely, we condition on $\hat{\bm{S}}$ and $\hat{\bm{X}}$ to couple supervisory reconstruction to the underlying continuous dynamics:
\begin{equation}
p_{\psi}\bigl( y^{(j)}_0 \mid y_k, k, \hat{\bm{S}}, \hat{\bm{X}} \bigr) = h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}}).
\label{eq:discrete_denoising}
\end{equation}
This conditioning choice is motivated by the fact that many discrete ICS states are not standalone; they are functions of regimes, thresholds, and procedural phases that manifest in continuous channels \citep{nist2023sp80082}. Training uses a categorical denoising objective:
\begin{equation}
\mathcal{L}_{\text{disc}}(\psi) = \mathbb{E}_{k} \left[ \frac{1}{|\mathcal{M}|} \sum_{(j,t) \in \mathcal{M}} \mathrm{CE}\bigl( h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}})_{j,t},\; y^{(j)}_{0,t} \bigr) \right],
\label{eq:discrete_loss}
\end{equation}
where $\mathrm{CE}(\cdot,\cdot)$ is cross-entropy. At sampling time, we initialize all discrete tokens as $\texttt{[MASK]}$ and iteratively unmask them using the learned conditionals, ensuring that every output token lies in its legal vocabulary by construction. This discrete branch is a key differentiator of our pipeline: unlike typical continuous-only diffusion augmentation in ICS, we integrate masked diffusion as a first-class mechanism for supervisory-variable legality within the same end-to-end synthesis workflow \citep{shi2024simplified,yuan2025ctu}.
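The masking corruption and the all-masked-then-iteratively-unmask sampler can be sketched as follows; the uniform `probs_fn` stands in for the trained conditional denoiser $h_\psi$, and the unmasking fraction per step is an illustrative choice:

```python
import numpy as np

MASK = -1  # absorbing [MASK] symbol, outside every legal vocabulary

def mask_forward(y0, m_k, rng):
    """Forward corruption: replace each token with [MASK] with probability m_k."""
    drop = rng.random(y0.shape) < m_k
    return np.where(drop, MASK, y0)

def unmask_sample(y, probs_fn, steps, rng):
    """Iteratively unmask a (d_d, L) grid. Every emitted token is drawn from a
    categorical over the legal vocabulary, so legality holds by construction."""
    for k in range(steps, 0, -1):
        masked = np.argwhere(y == MASK)             # remaining masked (j, t) positions
        if len(masked) == 0:
            break
        n_reveal = max(1, len(masked) // k)          # reveal a fraction per step
        for j, t in masked[rng.permutation(len(masked))[:n_reveal]]:
            p = probs_fn(y, k, j, t)                 # categorical over V_j
            y[j, t] = rng.choice(len(p), p=p)
    return y
```

Starting from a fully masked grid, the sampler terminates with no `[MASK]` tokens and all outputs inside the vocabulary, without any post hoc rounding.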
\subsection{Type-aware decomposition as factorization and routing layer}
\label{sec:method-types}
Even with a trend-conditioned residual DDPM and a discrete masked-diffusion branch, a single uniform modeling treatment can remain suboptimal because ICS variables are generated by qualitatively different mechanisms. For example, program-driven setpoints exhibit step-and-dwell dynamics; controller outputs follow control laws conditioned on process feedback; actuator positions may show saturation and dwell; and some derived tags are deterministic functions of other channels. Treating all channels as if they were exchangeable stochastic processes can misallocate model capacity and induce systematic error concentration on a small subset of mechanistically distinct variables \citep{nist2023sp80082}.
|
|
|
|
We therefore introduce a type-aware decomposition that formalizes this heterogeneity as a routing and constraint layer. Let $\tau(i)\in{1,\dots,6}$ assign each variable $i$ to a type class. For expository convenience, the assignment can be viewed as a mapping $\tau(i)=\mathrm{TypeAssign}(m_i, s_i, d_i)$, where $m_i$, $s_i$, and $d_i$ denote metadata/engineering role, temporal signature, and dependency pattern, respectively. The type assignment can be initialized from domain semantics (tag metadata, value domains, and engineering meaning), and subsequently refined via an error-attribution workflow described in the Benchmark section. Importantly, this refinement does not change the core diffusion backbone; it changes which mechanism is responsible for which variable, thereby aligning inductive bias with variable-generating mechanism while preserving overall coherence.

We use the following taxonomy:

\begin{enumerate}
\item Type 1 (program-driven / setpoint-like): externally commanded, step-and-dwell variables. These variables can be treated as exogenous drivers (conditioning signals) or routed to specialized change-point / dwell-time models, rather than being forced into a smooth denoiser that may over-regularize step structure.

\item Type 2 (controller outputs): continuous variables tightly coupled to feedback loops; these benefit from conditional modeling where the conditioning includes relevant process variables and commanded setpoints.

\item Type 3 (actuator states/positions): often exhibit saturation, dwell, and rate limits; these may require stateful dynamics beyond generic residual diffusion, motivating either specialized conditional modules or additional inductive constraints.

\item Type 4 (process variables): inertia-dominated continuous dynamics; these are the primary beneficiaries of the Transformer trend + residual DDPM pipeline.

\item Type 5 (derived/deterministic variables): algebraic or rule-based functions of other variables; we enforce deterministic reconstruction $\hat{x}^{(i)} = g_i(\hat{X},\hat{Y})$ rather than learning a stochastic generator, improving logical consistency and sample efficiency.

\item Type 6 (auxiliary/low-impact variables): weakly coupled or sparse signals; we allow simplified modeling (e.g., calibrated marginals or lightweight temporal models) to avoid allocating diffusion capacity where it is not warranted.
\end{enumerate}
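Operationally, the taxonomy amounts to a dispatch table from type class to generation mechanism. The sketch below is illustrative only: the handler names and the example tag-to-type map are hypothetical placeholders, not identifiers from our implementation or from the HAI tag list.

```python
# Hypothetical routing table for the six-type taxonomy (handler names are placeholders).
ROUTES = {
    1: "exogenous_driver",       # program-driven setpoints: conditioning signal, not denoised
    2: "conditional_residual",   # controller outputs: residual DDPM + feedback conditioning
    3: "stateful_conditional",   # actuator states: dwell/saturation-aware module
    4: "trend_plus_residual",    # process variables: Transformer trend + residual DDPM
    5: "deterministic",          # derived tags: reconstructed as g_i(X, Y), never sampled
    6: "lightweight",            # auxiliary tags: calibrated marginals only
}

def route(variables, type_of):
    """Group variable names by the mechanism responsible for generating them."""
    groups = {mechanism: [] for mechanism in ROUTES.values()}
    for v in variables:
        groups[ROUTES[type_of(v)]].append(v)
    return groups

# illustrative tag names, not the real HAI metadata
type_of = {"TAG_SP": 1, "TAG_CO": 2, "TAG_PV": 4, "TAG_SUM": 5}.get
groups = route(["TAG_SP", "TAG_CO", "TAG_PV", "TAG_SUM"], type_of)
assert groups["deterministic"] == ["TAG_SUM"]
```

Refining the taxonomy (as described above) then only edits `type_of`, leaving every generator untouched, which is exactly the decoupling the routing layer is meant to provide.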

\begin{figure}[H]
\centering
\includegraphics[width=0.98\textwidth]{typeclass-cropped.pdf}
\caption{Type assignment and the six-type taxonomy.}
\label{fig:type_taxonomy}
\end{figure}

Figure~\ref{fig:type_taxonomy} visualizes the six-type taxonomy and the routing logic behind it. Type-aware decomposition improves synthesis quality through three mechanisms. First, it improves capacity allocation by preventing a small set of mechanistically atypical variables from dominating gradients and distorting the learned distribution for the majority class (typically Type 4). Second, it enables constraint enforcement by deterministically reconstructing Type 5 variables, preventing logically inconsistent samples that purely learned generators can produce. Third, it improves mechanism alignment by attaching inductive biases consistent with step/dwell or saturation behaviors where generic denoisers may implicitly favor smoothness.

From a novelty standpoint, this layer is not merely an engineering patch; it is an explicit methodological statement that ICS synthesis benefits from typed factorization, a principle that has analogues in mixed-type generative modeling more broadly, but that remains underexplored in diffusion-based ICS telemetry synthesis \citep{shi2025tabdiff,yuan2025ctu,nist2023sp80082}.

\subsection{Joint optimization and end-to-end sampling}
\label{sec:method-joint}

We train the model in a staged manner consistent with the above factorization, which improves optimization stability and encourages each component to specialize in its intended role. Specifically: (i) we train the trend Transformer $f_{\phi}$ to obtain $\hat{\bm{S}}$; (ii) we compute residual targets $\hat{\bm{R}} = \bm{X} - \hat{\bm{S}}$ for the continuous variables routed to residual diffusion; (iii) we train the residual DDPM $p_{\theta}(\bm{R}\mid \hat{\bm{S}})$ and masked diffusion model $p_{\psi}(\bm{Y}\mid \text{masked}(\bm{Y}), \hat{\bm{S}}, \hat{\bm{X}})$; and (iv) we apply type-aware routing and deterministic reconstruction during sampling. This staged strategy is aligned with the design goal of separating temporal scaffolding from distributional refinement, and it mirrors the broader intuition in time-series diffusion that decoupling coarse structure and stochastic detail can mitigate structure-vs.-realism conflicts \citep{kollovieh2023tsdiff,sikder2023transfusion}.

A simple combined objective is $\mathcal{L} = \lambda\mathcal{L}_{\text{cont}} + (1-\lambda)\mathcal{L}_{\text{disc}}$ with $\lambda\in[0,1]$ controlling the balance between continuous and discrete learning. Type-aware routing determines which channels contribute to which loss and which are excluded in favor of deterministic reconstruction. In practice, this routing acts as a principled guardrail against negative transfer across variable mechanisms: channels that are best handled deterministically (Type 5) or as exogenous / specialized state channels (e.g., driver-like or actuator-state variables) are prevented from forcing the diffusion models into statistically incoherent compromises.

At inference time, generation follows the same structured order: (i) trend $\hat{\bm{S}}$ via the Transformer, (ii) residual $\hat{\bm{R}}$ via DDPM, (iii) discrete $\hat{\bm{Y}}$ via masked diffusion, and (iv) type-aware assembly with deterministic reconstruction for routed variables. This pipeline produces $(\hat{\bm{X}},\hat{\bm{Y}})$ that are temporally coherent by construction (through $\hat{\bm{S}}$), distributionally expressive (through $\hat{\bm{R}}$ denoising), and discretely valid (through masked diffusion), while explicitly accounting for heterogeneous variable-generating mechanisms through type-aware routing. In combination, these choices constitute our central methodological contribution: a unified Transformer + mixed diffusion generator for ICS telemetry, augmented by typed factorization to align model capacity with domain mechanism \citep{ho2020denoising,shi2024simplified,yuan2025ctu,nist2023sp80082}.

% 4. Benchmark
\section{Benchmark}
\label{sec:benchmark}

A credible ICS generator must clear three hurdles. It must first be \emph{semantically legal}: any out-of-vocabulary supervisory token renders a sample unusable, regardless of marginal fidelity. It must then match the heterogeneous statistics of mixed-type telemetry, including continuous process channels and discrete supervisory states. Finally, it must preserve \emph{mechanism-level realism}: switch-and-dwell behavior, bounded control motion, cross-tag coordination, and short-horizon persistence. We therefore organize the benchmark as a funnel from legality and reproducibility to structural diagnosis and ablation \citep{coletta2023constrained,yang2001interlock,stenger2024survey}.

For continuous channels, we use the Kolmogorov--Smirnov (KS) statistic because ICS process signals are often bounded, saturated, heavy-tailed, and plateau-dominated, so moment matching alone is too weak. KS directly compares empirical cumulative distributions, makes no parametric assumption, and is sensitive to support or shape mismatches that are operationally meaningful in telemetry. For discrete channels, realism is primarily about how probability mass is distributed over a finite vocabulary, so we use the Jensen--Shannon divergence (JSD) between per-feature categorical marginals and average across discrete variables \citep{lin1991divergence,yoon2019timegan}. To assess short-horizon dynamics, we compare lag-1 autocorrelation feature-wise and report the mean absolute difference between real and synthetic lag-1 coefficients. We also track semantic legality by counting out-of-vocabulary discrete outputs and report a filtered KS that excludes near-constant channels so that trivially flat tags do not dominate the aggregate.
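For reference, the three headline statistics can be computed as below. This is a generic numpy sketch of two-sample KS, JSD, and the lag-1 comparison, not our exact evaluation harness (which additionally filters near-constant channels and counts out-of-vocabulary tokens).

```python
import numpy as np

def ks_stat(a, b):
    """Two-sample Kolmogorov--Smirnov statistic: max gap between empirical CDFs."""
    grid = np.sort(np.concatenate([a, b]))
    cdf_a = np.searchsorted(np.sort(a), grid, side="right") / len(a)
    cdf_b = np.searchsorted(np.sort(b), grid, side="right") / len(b)
    return np.max(np.abs(cdf_a - cdf_b))

def jsd(p, q, eps=1e-12):
    """Jensen--Shannon divergence (base 2) between categorical marginals."""
    p = np.asarray(p, float) + eps; p = p / p.sum()
    q = np.asarray(q, float) + eps; q = q / q.sum()
    m = 0.5 * (p + q)
    kl = lambda u, v: np.sum(u * np.log2(u / v))
    return 0.5 * kl(p, m) + 0.5 * kl(q, m)

def lag1_diff(real, synth):
    """Mean absolute difference of per-feature lag-1 autocorrelation."""
    def lag1(x):  # x: (T, d)
        xc = x - x.mean(axis=0)
        num = (xc[:-1] * xc[1:]).sum(axis=0)
        den = (xc * xc).sum(axis=0)
        return num / np.where(den == 0, 1.0, den)
    return np.mean(np.abs(lag1(real) - lag1(synth)))

assert ks_stat(np.zeros(10), np.ones(10)) == 1.0   # disjoint supports: maximal KS
assert abs(jsd([1, 0], [0, 1]) - 1.0) < 1e-6       # disjoint categoricals: JSD -> 1 bit
```

With base-2 logarithms, JSD is bounded in $[0, 1]$, which is what makes the small per-feature averages reported below directly interpretable.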

\subsection{Core fidelity, legality, and reproducibility}
\label{sec:benchmark-quant}

Across three independent runs, Mask-DDPM achieves mean KS $=0.3311 \pm 0.0079$, mean JSD $=0.0284 \pm 0.0073$, and mean absolute lag-1 difference $=0.2684 \pm 0.0027$, while maintaining a validity rate of \textbf{100\%} across the modeled discrete channels. The small dispersion across runs shows that mixed-type fidelity is reproducible rather than dependent on a single favorable seed. On a representative diagnostic slice, the model attains mean KS $=0.4025$, filtered mean KS $=0.3191$, mean JSD $=0.0166$, and mean absolute lag-1 difference $=0.2859$, again with zero invalid discrete tokens. The main pattern is that discrete legality is already solved, while continuous mismatch is concentrated in a limited subset of difficult channels rather than spread uniformly across the telemetry space.

\begin{figure}[htbp]
\centering
\includegraphics[width=\textwidth]{fig-benchmark-story-v2.png}
\caption{Benchmark evidence chain: seed-level stability (left), concentration of continuous error (middle), and mechanism-specific realism gaps (right).}
\label{fig:benchmark_story}
\end{figure}

\begin{table}[htbp]
\centering
\caption{Core benchmark summary. Lower is better except for validity rate.}
\label{tab:core_metrics}
\begin{tabular}{@{}lcc@{}}
\toprule
\textbf{Metric} & \textbf{3-run mean $\pm$ std} & \textbf{Diagnostic slice} \\
\midrule
Mean KS (continuous) & $0.3311 \pm 0.0079$ & $0.4025$ \\
Filtered mean KS & -- & $0.3191$ \\
Mean JSD (discrete) & $0.0284 \pm 0.0073$ & $0.0166$ \\
Mean abs. $\Delta$ lag-1 autocorr & $0.2684 \pm 0.0027$ & $0.2859$ \\
Validity rate (26 discrete tags) $\uparrow$ & $100.0 \pm 0.0\%$ & $100.0\%$ \\
\bottomrule
\end{tabular}
\end{table}
Figure~\ref{fig:benchmark_story} turns the table into a structural diagnosis. The left panel shows seed-level stability across the three benchmark runs. The middle panel shows that the dominant continuous mismatch is concentrated in a relatively small subset of control-sensitive variables rather than indicating a global collapse of the generator. The right panel shows that the remaining realism gap is mechanism-specific, with program-like long-dwell behavior and actuator-state occupancy contributing more strongly than PV-like channels on this slice.

\subsection{Type-aware diagnostics}
\label{sec:benchmark-typed}

Type-aware diagnostics make that mechanism gap explicit. Table~\ref{tab:typed_diagnostics} reports one representative statistic per variable family on the same diagnostic slice. Because each family is evaluated with a different proxy, the absolute-error column should be interpreted within type, while the relative-error column is the more comparable cross-type indicator.

\begin{table}[htbp]
\centering
\caption{Type-aware diagnostic summary. Lower values indicate better alignment.}
\label{tab:typed_diagnostics}
\begin{tabular}{@{}llcc@{}}
\toprule
\textbf{Type} & \textbf{Proxy statistic} & \textbf{Mean abs. error} & \textbf{Mean rel. error} \\
\midrule
Program & mean dwell & $318.70$ & $2.19$ \\
Controller & change rate & $0.104$ & $0.25$ \\
Actuator & top-3 mass & $0.0615$ & $0.69$ \\
PV & tail ratio & $1.614$ & $0.20$ \\
Auxiliary & lag-1 autocorr & $0.125$ & $0.37$ \\
\bottomrule
\end{tabular}
\end{table}

Program-like channels remain the hardest family by a clear margin: mean-dwell mismatch is still large, indicating that the generator does not yet sustain the long plateaus characteristic of schedule-driven behavior. Actuator channels form the next clear difficulty, while PV channels are the most stable family under this diagnostic. In short, legality is solved, but the remaining realism gap is not uniform across types; it is dominated primarily by long-dwell program behavior and actuator-state occupancy.
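Two of the proxies in Table~\ref{tab:typed_diagnostics} are easy to make concrete. The definitions below are plausible implementations of ``mean dwell'' and ``top-3 mass,'' offered for illustration; the exact proxy definitions used in our diagnostics may differ in detail.

```python
import numpy as np

def mean_dwell(x):
    """Average run length between value changes (program-like dwell proxy)."""
    change = np.flatnonzero(np.diff(x) != 0)          # indices where a new run starts
    bounds = np.concatenate(([-1], change, [len(x) - 1]))
    return float(np.mean(np.diff(bounds)))            # mean length of constant runs

def top3_mass(x):
    """Fraction of samples spent in the three most-visited states (occupancy proxy)."""
    _, counts = np.unique(x, return_counts=True)
    return float(np.sort(counts)[::-1][:3].sum() / len(x))

x = np.array([0, 0, 0, 1, 1, 2])
assert mean_dwell(x) == 2.0    # runs of length 3, 2, 1 average to 2
assert top3_mass(x) == 1.0     # only three states, so all mass is covered
```

A family-level error as reported in the table would then be the mean absolute (or relative) gap between such a statistic computed on real versus synthetic windows.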

\subsection{Ablation study}
\label{sec:benchmark-ablation}

We evaluate ten controlled configurations (the full model and nine ablations) under a shared pipeline and summarize six representative metrics: continuous fidelity (KS), discrete fidelity (JSD), short-horizon dynamics (lag-1), cross-variable coupling, predictive transfer, and downstream anomaly utility. Figure~\ref{fig:ablation_impact} visualizes signed changes relative to the full model, and Table~\ref{tab:ablation} gives the underlying values.
\begin{figure}[htbp]
|
|
\centering
|
|
\includegraphics[width=\textwidth]{fig-benchmark-ablations-v1.png}
|
|
\caption{Ablation impact.}
|
|
\label{fig:ablation_impact}
|
|
\end{figure}

\begin{table}[htbp]
\centering
\small
\caption{Ablation study. Lower is better except for anomaly AUPRC.}
\label{tab:ablation}
\begin{tabular}{@{}lcccccc@{}}
\toprule
\textbf{Variant} & \textbf{KS$\downarrow$} & \textbf{JSD$\downarrow$} & \textbf{Lag-1$\downarrow$} & \textbf{Coupling$\downarrow$} & \textbf{Pred. RMSE$\downarrow$} & \textbf{AUPRC$\uparrow$} \\
\midrule
\multicolumn{7}{@{}l}{\textit{Full model}} \\
Full model & $0.402$ & $0.028$ & $0.291$ & $0.215$ & $0.972$ & $0.644$ \\
\midrule
\multicolumn{7}{@{}l}{\textit{Structure and conditioning}} \\
No temporal scaffold & $0.408$ & $0.031$ & $0.664$ & $0.306$ & $0.977$ & $0.645$ \\
No file-level context & $0.405$ & $0.033$ & $0.237$ & $0.262$ & $0.986$ & $0.640$ \\
No type routing & $0.356$ & $0.022$ & $0.138$ & $0.324$ & $1.017$ & $0.647$ \\
\midrule
\multicolumn{7}{@{}l}{\textit{Distribution shaping}} \\
No quantile transform & $0.599$ & $0.010$ & $0.156$ & $0.300$ & $1.653$ & $0.417$ \\
No post-calibration & $0.543$ & $0.024$ & $0.253$ & $0.249$ & $1.086$ & $0.647$ \\
\midrule
\multicolumn{7}{@{}l}{\textit{Loss and target design}} \\
No SNR weighting & $0.400$ & $0.022$ & $0.299$ & $0.214$ & $0.961$ & $0.637$ \\
No quantile loss & $0.413$ & $0.018$ & $0.311$ & $0.213$ & $0.965$ & $0.645$ \\
No residual-stat loss & $0.404$ & $0.029$ & $0.285$ & $0.210$ & $0.970$ & $0.647$ \\
Epsilon target & $0.482$ & $0.102$ & $0.728$ & $0.195$ & $0.968$ & $0.647$ \\
\bottomrule
\end{tabular}
\end{table}

The ablation results reveal three distinct roles. First, temporal staging is what makes the sequence look dynamical rather than merely plausible frame by frame: removing the temporal scaffold leaves KS nearly unchanged but more than doubles lag-1 error ($0.291 \rightarrow 0.664$) and substantially worsens coupling ($0.215 \rightarrow 0.306$). Second, quantile-based distribution shaping is one of the main contributors to usable continuous realism: without the quantile transform, KS degrades sharply ($0.402 \rightarrow 0.599$), synthetic-only predictive RMSE deteriorates ($0.972 \rightarrow 1.653$), and anomaly utility collapses ($0.644 \rightarrow 0.417$). Third, routing is the key counterexample to one-dimensional evaluation: disabling type routing can improve KS or lag-1 in isolation, yet it worsens coupling ($0.215 \rightarrow 0.324$) and predictive transfer ($0.972 \rightarrow 1.017$), showing that typed decomposition helps preserve coordinated mechanism-level behavior.

Taken together, the benchmark supports a focused claim. Mask-DDPM already provides stable mixed-type fidelity and perfect discrete legality, while the remaining error is concentrated in ICS-specific channels whose realism depends on rare switching, long dwell intervals, constrained occupancy, and persistent local dynamics.

% 5. Conclusion and Future Work
\section{Conclusion and Future Work}
\label{sec:conclusion}

This paper addresses the data scarcity and shareability barriers that limit machine-learning research for industrial control systems (ICS) security by proposing Mask-DDPM, a hybrid synthetic telemetry generator at the protocol-feature level. By combining a causal Transformer trend module, a trend-conditioned residual DDPM, a masked diffusion branch for discrete variables, and a type-aware routing layer, the framework preserves long-horizon temporal structure, improves local distributional fidelity, and guarantees discrete semantic legality. On windows derived from the HAI Security Dataset, the model achieves stable mixed-type fidelity across seeds, with mean KS = 0.3311 $\pm$ 0.0079 on continuous features, mean JSD = 0.0284 $\pm$ 0.0073 on discrete features, and mean absolute lag-1 autocorrelation difference = 0.2684 $\pm$ 0.0027.

Overall, Mask-DDPM provides a reproducible foundation for generating shareable and semantically valid ICS feature sequences for data augmentation, benchmarking, and downstream packet/trace reconstruction workflows. Future work will proceed in two complementary directions. Vertically, we will strengthen the theoretical foundation of the framework by introducing more explicit control-theoretic constraints, structured state-space or causal priors, and formal transition models for supervisory logic, so that legality, stability, and cross-channel coupling can be characterized more rigorously. Horizontally, we will extend the framework beyond the current setting to additional industrial control protocols such as Modbus/TCP, DNP3, IEC 104, and OPC UA, and investigate analogous adaptations to automotive communication protocols such as CAN/CAN FD and automotive Ethernet. A related extension is controllable attack or violation injection on top of legal base traces, enabling reproducible adversarial benchmarks for anomaly detection and intrusion-detection studies.
\bibliographystyle{splncs04}
\bibliography{references}

\end{document}