diff --git a/arxiv-style/main-ieee.tex b/arxiv-style/main-ieee.tex
index 9d32365..2c702ea 100644
--- a/arxiv-style/main-ieee.tex
+++ b/arxiv-style/main-ieee.tex
@@ -81,11 +81,11 @@ Despite these advances, most existing work either focuses on packet-level genera
 % 2. Related Work
 \section{Related Work}
 \label{sec:related}
-Early generation of network data oriented towards ``realism'' mostly remained at the packet/flow header level, either through replay or statistical synthesis based on single-point observations. Swing, in a closed-loop, network-responsive manner, extracts user/application/network distributions from single-point observations to reproduce burstiness and correlation across multiple time scales \cite{10.1145/1151659.1159928,10.1145/1159913.1159928}. Subsequently, a series of works advanced header synthesis to learning-based generation: the WGAN-based method added explicit verification of protocol field consistency to NetFlow/IPFIX \cite{Ring_2019}, NetShare reconstructed header modeling as flow-level time series and improved fidelity and scalability through domain encoding and parallel fine-tuning \cite{10.1145/3544216.3544251}, and DoppelGANger preserved the long-range structure and downstream sorting consistency of networked time series by decoupling attributes from sequences \cite{Lin_2020}. However, in industrial control system (ICS) scenarios, the original PCAP is usually not shareable, and public testbeds (such as SWaT, WADI) mostly provide process/monitoring telemetry and protocol interactions for security assessment, but public datasets emphasize operational variables rather than packet-level traces \cite{7469060,10.1145/3055366.3055375}. This makes ``synthesis at the feature/telemetry level, aware of protocol and semantics'' more feasible and necessary in practice: we are more concerned with reproducing high-level distributions and multi-scale temporal patterns according to operational semantics and physical constraints without relying on the original packets. From this perspective, the generation paradigm naturally shifts from ``packet syntax reproduction'' to ``modeling of high-level spatio-temporal distributions and uncertainties'', requiring stable training, strong distribution fitting, and interpretable uncertainty characterization.
+Early ``realism''-oriented generation of network data mostly remained at the packet/flow header level, either through replay or through statistical synthesis from single-point observations. Swing extracts user, application, and network distributions from single-point observations and replays them in a closed-loop, network-responsive manner to reproduce burstiness and correlations across multiple time scales \cite{10.1145/1159913.1159928}. Subsequent work moved header synthesis towards learning-based generation: a WGAN-based method added explicit checks of protocol-field consistency for NetFlow/IPFIX records \cite{Ring_2019}, NetShare recast header modeling as flow-level time series and improved fidelity and scalability through domain encoding and parallel fine-tuning \cite{10.1145/3544216.3544251}, and DoppelGANger preserved the long-range structure and downstream ranking consistency of networked time series by decoupling attributes from sequences \cite{Lin_2020}. However, in industrial control system (ICS) scenarios the original PCAPs are usually not shareable, and public testbeds such as SWaT and WADI mainly release process/monitoring telemetry and protocol interactions for security assessment, emphasizing operational variables rather than packet-level traces \cite{7469060,10.1145/3055366.3055375}. This makes protocol- and semantics-aware synthesis at the feature/telemetry level both more feasible and more necessary in practice: the goal is to reproduce high-level distributions and multi-scale temporal patterns under operational semantics and physical constraints, without relying on the original packets. From this perspective, the generation paradigm naturally shifts from ``packet syntax reproduction'' to ``modeling of high-level spatio-temporal distributions and uncertainties'', which requires stable training, strong distribution fitting, and interpretable uncertainty characterization.
-Diffusion models exhibit good fit along this path: DDPM achieves high-quality sampling and stable optimization through efficient $\epsilon$ parameterization and weighted variational objectives \cite{NEURIPS2020_4c5bcfec}, the SDE perspective unifies score-based and diffusion, providing likelihood evaluation and prediction-correction sampling strategies based on probability flow ODEs \cite{song2021scorebasedgenerativemodelingstochastic}. For time series, TimeGrad replaces the constrained output distribution with conditional denoising, capturing high-dimensional correlations at each step \cite{rasul2021autoregressivedenoisingdiffusionmodels}; CSDI explicitly performs conditional diffusion and uses two-dimensional attention to simultaneously leverage temporal and cross-feature dependencies, suitable for conditioning and filling in missing values \cite{tashiro2021csdiconditionalscorebaseddiffusion}; in a more general spatio-temporal structure, DiffSTG generalizes diffusion to spatio-temporal graphs, combining TCN/GCN with denoising U-Net to improve CRPS and inference efficiency in a non-autoregressive manner \cite{wen2024diffstgprobabilisticspatiotemporalgraph}, and PriSTI further enhances conditional features and geographical relationships, maintaining robustness under high missing rates and sensor failures \cite{liu2023pristiconditionaldiffusionframework}; in long sequences and continuous domains, DiffWave verifies that diffusion can also match the quality of strong vocoders under non-autoregressive fast synthesis \cite{kong2021diffwaveversatilediffusionmodel}; studies on cellular communication traffic show that diffusion can recover spatio-temporal patterns and provide uncertainty characterization at the urban scale \cite{11087622}. These results overall point to a conclusion: when the research focus is on ``telemetry/high-level features'' rather than raw messages, diffusion models provide stable and fine-grained distribution fitting and uncertainty quantification, which is exactly in line with the requirements of ICS telemetry synthesis. Meanwhile, directly entrusting all structures to a ``monolithic diffusion'' is not advisable: long-range temporal skeletons and fine-grained marginal distributions often have optimization tensions, requiring explicit decoupling in modeling.
+Diffusion models fit this path well: DDPM achieves high-quality sampling and stable optimization through an efficient $\epsilon$-parameterization and a weighted variational objective \cite{ho2020denoising}, and the SDE perspective unifies score-based and diffusion models, providing likelihood evaluation via probability-flow ODEs as well as predictor-corrector sampling \cite{song2021score}. For time series, TimeGrad replaces constrained output distributions with conditional denoising, capturing high-dimensional correlations at each step \cite{rasul2021autoregressivedenoisingdiffusionmodels}; CSDI performs explicitly conditional diffusion and uses two-dimensional attention to exploit temporal and cross-feature dependencies simultaneously, which suits conditional imputation of missing values \cite{tashiro2021csdiconditionalscorebaseddiffusion}; for more general spatio-temporal structure, DiffSTG generalizes diffusion to spatio-temporal graphs, combining TCN/GCN backbones with a denoising U-Net to improve CRPS and inference efficiency in a non-autoregressive manner \cite{wen2024diffstgprobabilisticspatiotemporalgraph}, and PriSTI further strengthens conditional features and geographic relationships, remaining robust under high missing rates and sensor failures \cite{liu2023pristiconditionaldiffusionframework}; for long sequences in continuous domains, DiffWave shows that diffusion can match the quality of strong vocoders under fast, non-autoregressive synthesis \cite{kong2021diffwaveversatilediffusionmodel}; and studies on cellular traffic show that diffusion can recover spatio-temporal patterns and provide uncertainty characterization at urban scale \cite{11087622}. Taken together, these results point to one conclusion: when the focus is on telemetry/high-level features rather than raw messages, diffusion models provide stable, fine-grained distribution fitting and uncertainty quantification, which is exactly what ICS telemetry synthesis requires. At the same time, entrusting all structure to a single ``monolithic diffusion'' model is inadvisable: long-range temporal skeletons and fine-grained marginal distributions often compete during optimization and should be explicitly decoupled in the model.
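+As a concrete reference point for the stability argument above, the $\epsilon$-parameterized objective of DDPM can be stated in its standard form (notation follows \cite{ho2020denoising}; $x_0$ denotes a clean telemetry window, $\bar{\alpha}_t$ the cumulative noise schedule, and $\epsilon_\theta$ the denoising network, used here purely for illustration):
+\begin{equation}
+\mathcal{L}_{\mathrm{simple}} = \mathbb{E}_{t,\,x_0,\,\epsilon \sim \mathcal{N}(0,I)}\Big[\big\lVert \epsilon - \epsilon_\theta\big(\sqrt{\bar{\alpha}_t}\,x_0 + \sqrt{1-\bar{\alpha}_t}\,\epsilon,\; t\big)\big\rVert^{2}\Big],
+\end{equation}
+whose reweighting of the variational bound is what underlies the stable training and fine-grained distribution fitting discussed here.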
-Looking further into the mechanism complexity of ICS: its channel types are inherently mixed, containing both continuous process trajectories and discrete supervision/status variables, and discrete channels must be ``legal'' under operational constraints. The aforementioned progress in time series diffusion has mainly occurred in continuous spaces, but discrete diffusion has also developed systematic methods: D3PM improves sampling quality and likelihood through absorption/masking and structured transitions in discrete state spaces \cite{austin2023structureddenoisingdiffusionmodels}, subsequent masked diffusion provides stable reconstruction on categorical data in a more simplified form \cite{Lin_2020}, multinomial diffusion directly defines diffusion on a finite vocabulary through mechanisms such as argmax flows \cite{hoogeboom2021argmaxflowsmultinomialdiffusion}, and Diffusion-LM demonstrates an effective path for controllable text generation by imposing gradient constraints in continuous latent spaces \cite{li2022diffusionlmimprovescontrollabletext}. From the perspectives of protocols and finite-state machines, coverage-guided fuzz testing emphasizes the criticality of ``sequence legality and state coverage'' \cite{meng2025aflnetyearslatercoverageguided,godefroid2017learnfuzzmachinelearninginput,she2019neuzzefficientfuzzingneural}, echoing the concept of ``legality by construction'' in discrete diffusion: preferentially adopting absorption/masking diffusion on discrete channels, supplemented by type-aware conditioning and sampling constraints, to avoid semantic invalidity and marginal distortion caused by post hoc thresholding.
+Looking further at the mechanisms of ICS: its channels are inherently mixed, containing both continuous process trajectories and discrete supervision/status variables, and the discrete channels must remain ``legal'' under operational constraints. The progress in time-series diffusion above has mainly occurred in continuous spaces, but discrete diffusion has also developed systematic methods: D3PM improves sampling quality and likelihood through absorbing/masking and structured transitions in discrete state spaces \cite{austin2021structured}, subsequent masked diffusion provides stable reconstruction of categorical data in a simpler form \cite{Lin_2020}, multinomial diffusion defines diffusion directly on a finite vocabulary through mechanisms such as argmax flows \cite{hoogeboom2021argmaxflowsmultinomialdiffusion}, and Diffusion-LM demonstrates an effective path to controllable text generation by imposing gradient constraints in a continuous latent space \cite{li2022diffusionlmimprovescontrollabletext}. From the perspective of protocols and finite-state machines, coverage-guided fuzz testing underscores how critical sequence legality and state coverage are \cite{meng2025aflnetyearslatercoverageguided,godefroid2017learnfuzzmachinelearninginput,she2019neuzzefficientfuzzingneural}, echoing the idea of ``legality by construction'' in discrete diffusion: prefer absorbing/masking diffusion on discrete channels, supplemented by type-aware conditioning and sampling constraints, to avoid the semantic invalidity and marginal distortion caused by post hoc thresholding. From the perspective of high-level synthesis, temporal structure is equally indispensable: ICS control often involves delay effects, phased operating conditions, and cross-channel coupling, so models must capture low-frequency, long-range dependencies while overlaying multi-modal fine-grained fluctuations on them.
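+To make ``legality by construction'' concrete, the absorbing (masking) forward transition used by D3PM-style models \cite{austin2021structured} can be written, for a discrete channel over a vocabulary $\{1,\dots,K\}$ with an additional mask symbol $m$ (the notation is illustrative), as
+\begin{equation}
+q\!\left(x_t = k \mid x_{t-1} = j\right) = (1-\beta_t)\,\mathbf{1}[k = j] + \beta_t\,\mathbf{1}[k = m],
+\end{equation}
+so corrupted tokens only ever move into the mask state and the reverse process emits symbols from the finite vocabulary itself, rather than thresholding continuous outputs after the fact.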
 The Transformer series has provided sufficient evidence in long-sequence time series tasks: Transformer-XL breaks through the fixed-length context limitation through a reusable memory mechanism and significantly enhances long-range dependency expression \cite{dai2019transformerxlattentivelanguagemodels}; Informer uses ProbSparse attention and efficient decoding to balance span and efficiency in long-sequence prediction \cite{zhou2021informerefficienttransformerlong}; Autoformer robustly models long-term seasonality and trends through autocorrelation and decomposition mechanisms \cite{wu2022autoformerdecompositiontransformersautocorrelation}; FEDformer further improves long-period prediction performance in frequency domain enhancement and decomposition \cite{zhou2022fedformerfrequencyenhanceddecomposed}; PatchTST enhances the stability and generalization of long-sequence multivariate prediction through local patch-based representation and channel-independent modeling \cite{2023}. Combining our previous positioning of diffusion, this chain of evidence points to a natural division of labor: using attention-based sequence models to first extract stable low-frequency trends/conditions (long-range skeletons), and then allowing diffusion to focus on margins and details in the residual space; meanwhile, discrete masking/absorbing diffusion is applied to supervised/pattern variables to ensure vocabulary legality by construction. This design not only inherits the advantages of time series diffusion in distribution fitting and uncertainty characterization \cite{rasul2021autoregressivedenoisingdiffusionmodels,tashiro2021csdiconditionalscorebaseddiffusion,wen2024diffstgprobabilisticspatiotemporalgraph,liu2023pristiconditionaldiffusionframework,kong2021diffwaveversatilediffusionmodel,11087622}, but also stabilizes the macroscopic temporal support through the long-range attention of Transformer, enabling the formation of an operational integrated generation pipeline under the mixed types and multi-scale dynamics of ICS.
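+Schematically, and only as one way of writing down the division of labor described above (the symbols $s$, $r$, $c$, $d$ and the factorization are illustrative, not a committed parameterization), a telemetry window with continuous channels $c_{1:T}$ and discrete channels $d_{1:T}$ can be decomposed as
+\begin{equation}
+c_{1:T} = s_{1:T} + r_{1:T}, \qquad r_{1:T} \sim p_{\theta}\!\left(r \mid s_{1:T}\right), \qquad d_{1:T} \sim p_{\phi}\!\left(d \mid s_{1:T}\right),
+\end{equation}
+where $s_{1:T}$ is the low-frequency skeleton extracted by an attention-based conditioner, $p_{\theta}$ is a continuous diffusion model acting in the residual space, and $p_{\phi}$ is an absorbing/masking discrete diffusion model, so that long-range structure, fine-grained margins, and vocabulary legality are each handled by the component best suited to them.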
@@ -312,3 +312,4 @@ Overall, Mask-DDPM provides a reproducible foundation for generating shareable,
+
diff --git a/arxiv-style/references.bib b/arxiv-style/references.bib
index 94bcbf4..ceb3fe0 100644
--- a/arxiv-style/references.bib
+++ b/arxiv-style/references.bib
@@ -1,4 +1,4 @@
-Reference for Methodology Part
+
 @inproceedings{vaswani2017attention,
 title={Attention Is All You Need},
 author={Vaswani, Ashish and Shazeer, Noam and Parmar, Niki and Uszkoreit, Jakob and Jones, Llion and Gomez, Aidan N and Kaiser, {\L}ukasz and Polosukhin, Illia},
@@ -118,6 +118,7 @@
 }
 Reference for Introduction Part
+
 @article{10.1007/s10844-022-00753-1,
 author = {Koay, Abigail M. Y. and Ko, Ryan K. L and Hettema, Hinne and Radke, Kenneth},
 title = {Machine learning in industrial control system (ICS) security: current landscape, opportunities and challenges},
@@ -194,7 +195,6 @@ keywords = {Operational technology, Cyber security, Dataset, Industrial control
 year = {2023}
 }
-
 @Article{info16100910,
 AUTHOR = {Ali, Jokha and Ali, Saqib and Al Balushi, Taiseera and Nadir, Zia},
 TITLE = {Intrusion Detection in Industrial Control Systems Using Transfer Learning Guided by Reinforcement Learning},
@@ -246,25 +246,6 @@ DOI = {10.3390/info16100910}
 }
 Reference for Related Work
-@article{10.1145/1151659.1159928,
-author = {Vishwanath, Kashi Venkatesh and Vahdat, Amin},
-title = {Realistic and responsive network traffic generation},
-year = {2006},
-issue_date = {October 2006},
-publisher = {Association for Computing Machinery},
-address = {New York, NY, USA},
-volume = {36},
-number = {4},
-issn = {0146-4833},
-url = {https://doi.org/10.1145/1151659.1159928},
-doi = {10.1145/1151659.1159928},
-abstract = {This paper presents Swing, a closed-loop, network-responsive traffic generator that accurately captures the packet interactions of a range of applications using a simple structural model. Starting from observed traffic at a single point in the network, Swing automatically extracts distributions for user, application, and network behavior. It then generates live traffic corresponding to the underlying models in a network emulation environment running commodity network protocol stacks. We find that the generated traces are statistically similar to the original traces. Further, to the best of our knowledge, we are the first to reproduce burstiness in traffic across a range of timescales using a model applicable to a variety of network settings. An initial sensitivity analysis reveals the importance of capturing and recreating user, application, and network characteristics to accurately reproduce such burstiness. Finally, we explore Swing's ability to vary user characteristics, application properties, and wide-area network conditions to project traffic characteristics into alternate scenarios.},
-journal = {SIGCOMM Comput. Commun. Rev.},
-month = aug,
-pages = {111–122},
-numpages = {12},
-keywords = {burstiness, energy plot, generator, internet, modeling, structural model, traffic, wavelets}
-}
 @inproceedings{10.1145/1159913.1159928,
 author = {Vishwanath, Kashi Venkatesh and Vahdat, Amin},
@@ -357,28 +338,6 @@
 location = {Pittsburgh, Pennsylvania},
 series = {CySWATER '17}
 }
-@inproceedings{NEURIPS2020_4c5bcfec,
- author = {Ho, Jonathan and Jain, Ajay and Abbeel, Pieter},
- booktitle = {Advances in Neural Information Processing Systems},
- editor = {H. Larochelle and M. Ranzato and R. Hadsell and M.F. Balcan and H. Lin},
- pages = {6840--6851},
- publisher = {Curran Associates, Inc.},
- title = {Denoising Diffusion Probabilistic Models},
- url = {https://proceedings.neurips.cc/paper_files/paper/2020/file/4c5bcfec8584af0d967f1ab10179ca4b-Paper.pdf},
- volume = {33},
- year = {2020}
-}
-
-@misc{song2021scorebasedgenerativemodelingstochastic,
- title={Score-Based Generative Modeling through Stochastic Differential Equations},
- author={Yang Song and Jascha Sohl-Dickstein and Diederik P. Kingma and Abhishek Kumar and Stefano Ermon and Ben Poole},
- year={2021},
- eprint={2011.13456},
- archivePrefix={arXiv},
- primaryClass={cs.LG},
- url={https://arxiv.org/abs/2011.13456},
-}
-
 @misc{tashiro2021csdiconditionalscorebaseddiffusion,
 title={CSDI Conditional Score-based Diffusion Models for Probabilistic Time Series Imputation},
 author={Yusuke Tashiro and Jiaming Song and Yang Song and Stefano Ermon},
@@ -431,16 +390,6 @@ series = {CySWATER '17}
 doi={10.1109/TMC.2025.3591183}
 }
-@misc{austin2023structureddenoisingdiffusionmodels,
- title={Structured Denoising Diffusion Models in Discrete State-Spaces},
- author={Jacob Austin and Daniel D. Johnson and Jonathan Ho and Daniel Tarlow and Rianne van den Berg},
- year={2023},
- eprint={2107.03006},
- archivePrefix={arXiv},
- primaryClass={cs.LG},
- url={https://arxiv.org/abs/2107.03006},
-}
-
 @misc{hoogeboom2021argmaxflowsmultinomialdiffusion,
 title={Argmax Flows and Multinomial Diffusion: Learning Categorical Distributions},
 author={Emiel Hoogeboom and Didrik Nielsen and Priyank Jaini and Patrick Forré and Max Welling},
@@ -546,6 +495,7 @@ series = {CySWATER '17}
 }
 Reference for Benchmark
+
 @article{stenger2024survey,
 title={Evaluation is key: a survey on evaluation measures for synthetic time series},
 author={Stenger, Michael and Leppich, Robert and Foster, Ian T and Kounev, Samuel and Bauer, Andre},
@@ -603,7 +553,3 @@ Reference for Benchmark
 year={2001},
 publisher={Elsevier}
 }
-
-
-
-