From 89997f1125d33146e65533796fee03cd5f09b293 Mon Sep 17 00:00:00 2001
From: DaZuo0122 <1085701449@qq.com>
Date: Wed, 4 Feb 2026 10:48:24 +0800
Subject: [PATCH] Add: equations of methodology in a separate file

---
 arxiv-style/equations.tex | 109 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 arxiv-style/equations.tex

diff --git a/arxiv-style/equations.tex b/arxiv-style/equations.tex
new file mode 100644
index 0000000..7b64203
--- /dev/null
+++ b/arxiv-style/equations.tex
@@ -0,0 +1,109 @@
+\documentclass[10pt, twocolumn]{article}
+\usepackage{amsmath, amssymb}
+\usepackage{bm}
+\usepackage{booktabs}
+\usepackage[margin=1in]{geometry}
+\usepackage{microtype}
+
+% Custom operators
+\DeclareMathOperator*{\argmin}{arg\,min}
+\DeclareMathOperator*{\argmax}{arg\,max}
+\DeclareMathOperator{\CE}{CE}
+\DeclareMathOperator{\SNR}{SNR}
+
+% Bold math symbols
+\newcommand{\bX}{\bm{X}}
+\newcommand{\bS}{\bm{S}}
+\newcommand{\bR}{\bm{R}}
+\newcommand{\br}{\bm{r}}
+\newcommand{\bY}{\bm{Y}}
+
+\title{Equations: Mask-DDPM Methodology}
+\author{}
+\date{}
+
+\begin{document}
+\maketitle
+
+\section{Problem Formulation}
+Each training instance is a window of fixed length $L$, comprising continuous channels $\bX \in \mathbb{R}^{L \times d_c}$ and discrete channels $\bY = \{y^{(j)}_{1:L}\}_{j=1}^{d_d}$, where each discrete variable satisfies $y^{(j)}_t \in \mathcal{V}_j$ for a finite vocabulary $\mathcal{V}_j$.
+
+\section{Transformer Trend Module for Continuous Dynamics}
+We posit an additive decomposition of the continuous signal:
+\begin{equation}
+\bX = \bS + \bR,
+\label{eq:additive_decomp}
+\end{equation}
+where $\bS \in \mathbb{R}^{L \times d_c}$ captures the smooth temporal trend and $\bR \in \mathbb{R}^{L \times d_c}$ represents the stochastic residual component modeled by the diffusion stage.
+
+The causal Transformer trend extractor $f_{\phi}$ predicts the next-step trend via:
+\begin{equation}
+\hat{\bS}_{t+1} = f_{\phi}(\bX_{1:t}), \quad t = 1, \dots, L-1.
+\label{eq:trend_prediction}
+\end{equation}
+Training minimizes the mean-squared error:
+\begin{equation}
+\mathcal{L}_{\text{trend}}(\phi) = \frac{1}{(L-1)d_c} \sum_{t=1}^{L-1} \bigl\| \hat{\bS}_{t+1} - \bX_{t+1} \bigr\|_2^2.
+\label{eq:trend_loss}
+\end{equation}
+Given the fitted trend extractor, the residual target for the diffusion stage is defined as $\bR = \bX - \hat{\bS}$.
+
+\section{DDPM for Continuous Residual Generation}
+Let $K$ denote the number of diffusion steps, with noise schedule $\{\beta_k\}_{k=1}^K$, $\alpha_k = 1 - \beta_k$, and $\bar{\alpha}_k = \prod_{i=1}^k \alpha_i$. The forward corruption process is:
+\begin{align}
+q(\br_k \mid \br_0) &= \mathcal{N}\bigl( \sqrt{\bar{\alpha}_k}\,\br_0,\; (1 - \bar{\alpha}_k)\mathbf{I} \bigr), \\
+\br_k &= \sqrt{\bar{\alpha}_k}\,\br_0 + \sqrt{1 - \bar{\alpha}_k}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}),
+\label{eq:forward_process}
+\end{align}
+where $\br_0 \equiv \bR$.
+
+The reverse process is parameterized as:
+\begin{equation}
+p_{\theta}(\br_{k-1} \mid \br_k, \hat{\bS}) = \mathcal{N}\bigl( \boldsymbol{\mu}_{\theta}(\br_k, k, \hat{\bS}),\; \boldsymbol{\Sigma}(k) \bigr).
+\label{eq:reverse_process}
+\end{equation}
+Training employs the $\epsilon$-prediction objective:
+\begin{equation}
+\mathcal{L}_{\text{cont}}(\theta) = \mathbb{E}_{k,\br_0,\boldsymbol{\epsilon}} \left[ \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\br_k, k, \hat{\bS}) \bigr\|_2^2 \right].
+\label{eq:ddpm_loss}
+\end{equation}
+Optionally, SNR-based reweighting yields:
+\begin{equation}
+\mathcal{L}^{\text{snr}}_{\text{cont}}(\theta) = \mathbb{E}_{k,\br_0,\boldsymbol{\epsilon}} \left[ w_k \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\br_k, k, \hat{\bS}) \bigr\|_2^2 \right],
+\label{eq:snr_loss}
+\end{equation}
+where $w_k = \min(\SNR_k, \gamma) / \SNR_k$ and $\SNR_k = \bar{\alpha}_k / (1 - \bar{\alpha}_k)$. The final continuous output is reconstructed as $\hat{\bX} = \hat{\bS} + \hat{\bR}$.
+
+\section{Masked Diffusion for Discrete Variables}
+For discrete channel $j$, the forward masking process follows the masking schedule $\{m_k\}_{k=1}^K$:
+\begin{equation}
+y^{(j)}_k =
+\begin{cases}
+y^{(j)}_0, & \text{with probability } 1 - m_k, \\
+\texttt{[MASK]}, & \text{with probability } m_k,
+\end{cases}
+\label{eq:masking_process}
+\end{equation}
+applied independently across variables and timesteps.
+
+The denoiser $h_{\psi}$ predicts per-position categorical distributions conditioned on the continuous context:
+\begin{equation}
+p_{\psi}\bigl( y^{(j)}_{0,t} \mid y_k, k, \hat{\bS}, \hat{\bX} \bigr) = h_{\psi}(y_k, k, \hat{\bS}, \hat{\bX})_{j,t}.
+\label{eq:discrete_denoising}
+\end{equation}
+Training minimizes the categorical cross-entropy:
+\begin{equation}
+\mathcal{L}_{\text{disc}}(\psi) = \mathbb{E}_{k} \left[ \frac{1}{|\mathcal{M}|} \sum_{(j,t) \in \mathcal{M}} \CE\bigl( h_{\psi}(y_k, k, \hat{\bS}, \hat{\bX})_{j,t},\; y^{(j)}_{0,t} \bigr) \right],
+\label{eq:discrete_loss}
+\end{equation}
+where $\mathcal{M}$ denotes the set of masked positions at step $k$.
+
+\section{Joint Optimization}
+The combined objective balances continuous and discrete learning:
+\begin{equation}
+\mathcal{L} = \lambda \, \mathcal{L}_{\text{cont}} + (1 - \lambda) \, \mathcal{L}_{\text{disc}}, \quad \lambda \in [0,1].
+\label{eq:joint_objective}
+\end{equation}
+Type-aware routing enforces deterministic reconstruction $\hat{x}^{(i)} = g_i(\hat{\bX}, \hat{\bY})$ for derived variables.
+
+\end{document}
\ No newline at end of file
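Editorial note (not part of the diff above): the reverse process labeled eq:reverse_process introduces $\boldsymbol{\mu}_{\theta}$ and $\boldsymbol{\Sigma}(k)$ without spelling them out. If the standard DDPM parameterization of Ho et al. (2020) is the intended one -- an assumption, since the patch does not say -- the mean is recovered from the $\epsilon$-predictor, and lines along the following sketch could be added right after that equation:

% Assumed standard DDPM choice; adjust if the model uses a learned or posterior
% variance instead of \Sigma(k) = \beta_k I.
With the conventional parameterization, the reverse mean and variance are
\begin{equation}
\boldsymbol{\mu}_{\theta}(\br_k, k, \hat{\bS}) = \frac{1}{\sqrt{\alpha_k}} \Bigl( \br_k - \frac{\beta_k}{\sqrt{1 - \bar{\alpha}_k}}\, \boldsymbol{\epsilon}_{\theta}(\br_k, k, \hat{\bS}) \Bigr), \qquad \boldsymbol{\Sigma}(k) = \beta_k \mathbf{I},
\end{equation}
and residual sampling proceeds ancestrally via
\begin{equation}
\br_{k-1} = \boldsymbol{\mu}_{\theta}(\br_k, k, \hat{\bS}) + \sqrt{\beta_k}\,\bm{z}, \qquad \bm{z} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}), \quad \bm{z} = \mathbf{0} \text{ for } k = 1.
\end{equation}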
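A second editorial note, on eq:snr_loss: the clip value $\gamma$ is a hyperparameter the patch leaves unspecified; $\gamma = 5$ is a commonly used default for min-SNR weighting. Under that assumption the weight behaves as
\[
w_k = \frac{\min(\SNR_k, 5)}{\SNR_k}, \qquad \SNR_k = 50 \;\Rightarrow\; w_k = 0.1, \qquad \SNR_k \le 5 \;\Rightarrow\; w_k = 1,
\]
so the reweighting only down-weights the low-noise (high-SNR) steps and leaves the heavily corrupted steps at full weight.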