forked from manbo/internal-docs
Remove: all custom \DeclareMathOperator and \newcommand
@@ -1,23 +1,9 @@
 \documentclass[10pt, twocolumn]{article}
 \usepackage{amsmath, amssymb}
 \usepackage{bm}
-\usepackage{booktabs}
 \usepackage[margin=1in]{geometry}
 \usepackage{microtype}
 
-% Custom operators
-\DeclareMathOperator*{\argmin}{arg\,min}
-\DeclareMathOperator*{\argmax}{arg\,max}
-\DeclareMathOperator{\CE}{CE}
-\DeclareMathOperator{\SNR}{SNR}
-
-% Bold math symbols
-\newcommand{\bX}{\bm{X}}
-\newcommand{\bS}{\bm{S}}
-\newcommand{\bR}{\bm{R}}
-\newcommand{\br}{\bm{r}}
-\newcommand{\by}{\bm{y}}
-
 \title{Equations: Mask-DDPM Methodology}
 \author{}
 \date{}
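With the operator and bold-letter macros gone from the preamble, the body spells the same notation out inline. A minimal example of the convention this commit adopts (illustrative only, not a line of the committed file; both forms appear in the hunks below):

% Notation after the change: \mathrm{...} for named operators, \bm{...} for bold symbols.
% Before: $w_k = \min(\SNR_k, \gamma) / \SNR_k$ and $\hat{\bX} = \hat{\bS} + \hat{\bR}$
% After:  $w_k = \min(\mathrm{SNR}_k, \gamma) / \mathrm{SNR}_k$ and $\hat{\bm{X}} = \hat{\bm{S}} + \hat{\bm{R}}$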
@@ -26,53 +12,53 @@
 \maketitle
 
 \section{Problem Formulation}
-Each training instance is a fixed-length window of length $L$, comprising continuous channels $\bX \in \mathbb{R}^{L \times d_c}$ and discrete channels $\bY = \{y^{(j)}_{1:L}\}_{j=1}^{d_d}$, where each discrete variable satisfies $y^{(j)}_t \in \mathcal{V}_j$ for a finite vocabulary $\mathcal{V}_j$.
+Each training instance is a fixed-length window of length $L$, comprising continuous channels $\bm{X} \in \mathbb{R}^{L \times d_c}$ and discrete channels $\bm{Y} = \{y^{(j)}_{1:L}\}_{j=1}^{d_d}$, where each discrete variable satisfies $y^{(j)}_t \in \mathcal{V}_j$ for a finite vocabulary $\mathcal{V}_j$.
 
 \section{Transformer Trend Module for Continuous Dynamics}
 We posit an additive decomposition of the continuous signal:
 \begin{equation}
-\bX = \bS + \bR,
+\bm{X} = \bm{S} + \bm{R},
 \label{eq:additive_decomp}
 \end{equation}
-where $\bS \in \mathbb{R}^{L \times d_c}$ captures the smooth temporal trend and $\bR \in \mathbb{R}^{L \times d_c}$ represents distributional residuals.
+where $\bm{S} \in \mathbb{R}^{L \times d_c}$ captures the smooth temporal trend and $\bm{R} \in \mathbb{R}^{L \times d_c}$ represents distributional residuals.
 
 The causal Transformer trend extractor $f_{\phi}$ predicts the next-step trend via:
 \begin{equation}
-\hat{\bS}_{t+1} = f_{\phi}(\bX_{1:t}), \quad t = 1, \dots, L-1.
+\hat{\bm{S}}_{t+1} = f_{\phi}(\bm{X}_{1:t}), \quad t = 1, \dots, L-1.
 \label{eq:trend_prediction}
 \end{equation}
 Training minimizes the mean-squared error:
 \begin{equation}
-\mathcal{L}_{\text{trend}}(\phi) = \frac{1}{(L-1)d_c} \sum_{t=1}^{L-1} \bigl\| \hat{\bS}_{t+1} - \bX_{t+1} \bigr\|_2^2.
+\mathcal{L}_{\text{trend}}(\phi) = \frac{1}{(L-1)d_c} \sum_{t=1}^{L-1} \bigl\| \hat{\bm{S}}_{t+1} - \bm{X}_{t+1} \bigr\|_2^2.
 \label{eq:trend_loss}
 \end{equation}
-At inference, the residual target is defined as $\bR = \bX - \hat{\bS}$.
+At inference, the residual target is defined as $\bm{R} = \bm{X} - \hat{\bm{S}}$.
 
 \section{DDPM for Continuous Residual Generation}
 Let $K$ denote diffusion steps with noise schedule $\{\beta_k\}_{k=1}^K$, $\alpha_k = 1 - \beta_k$, and $\bar{\alpha}_k = \prod_{i=1}^k \alpha_i$. The forward corruption process is:
 \begin{align}
-q(\br_k \mid \br_0) &= \mathcal{N}\bigl( \sqrt{\bar{\alpha}_k}\,\br_0,\; (1 - \bar{\alpha}_k)\mathbf{I} \bigr), \\
-\br_k &= \sqrt{\bar{\alpha}_k}\,\br_0 + \sqrt{1 - \bar{\alpha}_k}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}),
+q(\bm{r}_k \mid \bm{r}_0) &= \mathcal{N}\bigl( \sqrt{\bar{\alpha}_k}\,\bm{r}_0,\; (1 - \bar{\alpha}_k)\mathbf{I} \bigr), \\
+\bm{r}_k &= \sqrt{\bar{\alpha}_k}\,\bm{r}_0 + \sqrt{1 - \bar{\alpha}_k}\,\boldsymbol{\epsilon}, \quad \boldsymbol{\epsilon} \sim \mathcal{N}(\mathbf{0}, \mathbf{I}),
 \label{eq:forward_process}
 \end{align}
-where $\br_0 \equiv \bR$.
+where $\bm{r}_0 \equiv \bm{R}$.
 
 The reverse process is parameterized as:
 \begin{equation}
-p_{\theta}(\br_{k-1} \mid \br_k, \hat{\bS}) = \mathcal{N}\bigl( \boldsymbol{\mu}_{\theta}(\br_k, k, \hat{\bS}),\; \boldsymbol{\Sigma}(k) \bigr).
+p_{\theta}(\bm{r}_{k-1} \mid \bm{r}_k, \hat{\bm{S}}) = \mathcal{N}\bigl( \boldsymbol{\mu}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}),\; \boldsymbol{\Sigma}(k) \bigr).
 \label{eq:reverse_process}
 \end{equation}
 Training employs the $\epsilon$-prediction objective:
 \begin{equation}
-\mathcal{L}_{\text{cont}}(\theta) = \mathbb{E}_{k,\br_0,\boldsymbol{\epsilon}} \left[ \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\br_k, k, \hat{\bS}) \bigr\|_2^2 \right].
+\mathcal{L}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right].
 \label{eq:ddpm_loss}
 \end{equation}
 Optionally, SNR-based reweighting yields:
 \begin{equation}
-\mathcal{L}^{\text{snr}}_{\text{cont}}(\theta) = \mathbb{E}_{k,\br_0,\boldsymbol{\epsilon}} \left[ w_k \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\br_k, k, \hat{\bS}) \bigr\|_2^2 \right],
+\mathcal{L}^{\text{snr}}_{\text{cont}}(\theta) = \mathbb{E}_{k,\bm{r}_0,\boldsymbol{\epsilon}} \left[ w_k \bigl\| \boldsymbol{\epsilon} - \boldsymbol{\epsilon}_{\theta}(\bm{r}_k, k, \hat{\bm{S}}) \bigr\|_2^2 \right],
 \label{eq:snr_loss}
 \end{equation}
-where $w_k = \min(\SNR_k, \gamma) / \SNR_k$ and $\SNR_k = \bar{\alpha}_k / (1 - \bar{\alpha}_k)$. The final continuous output is reconstructed as $\hat{\bX} = \hat{\bS} + \hat{\bR}$.
+where $w_k = \min(\mathrm{SNR}_k, \gamma) / \mathrm{SNR}_k$ and $\mathrm{SNR}_k = \bar{\alpha}_k / (1 - \bar{\alpha}_k)$. The final continuous output is reconstructed as $\hat{\bm{X}} = \hat{\bm{S}} + \hat{\bm{R}}$.
 
 \section{Masked Diffusion for Discrete Variables}
 For discrete channel $j$, the forward masking process follows schedule $\{m_k\}_{k=1}^K$:
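The continuous branch touched in this hunk (trend MSE, forward corruption, the epsilon-prediction objective, and the optional SNR clipping) can be condensed into a short training sketch. This is a minimal PyTorch illustration under assumed shapes and a linear beta schedule; f_phi, eps_model, and every other name here are placeholders, not the repository's API.

# Minimal sketch of the continuous branch (illustrative only, not the repo's code).
import torch

def make_schedule(K, beta_start=1e-4, beta_end=2e-2):
    beta = torch.linspace(beta_start, beta_end, K)        # {beta_k}, assumed linear
    alpha_bar = torch.cumprod(1.0 - beta, dim=0)          # bar{alpha}_k = prod_i (1 - beta_i)
    return beta, alpha_bar

def trend_loss(f_phi, X):
    """Next-step MSE: the causal extractor predicts X[:, t+1] from X[:, :t+1]."""
    S_hat_next = f_phi(X[:, :-1])                         # assumed to emit hat{S}_{2:L}
    return (S_hat_next - X[:, 1:]).pow(2).mean()

def ddpm_loss(eps_model, X, S_hat, alpha_bar, gamma=None):
    """Epsilon-prediction loss on the residual r_0 = X - S_hat; shapes (B, L, d_c)."""
    R0 = X - S_hat
    B = R0.shape[0]
    k = torch.randint(0, len(alpha_bar), (B,))            # uniform diffusion step
    ab = alpha_bar[k].view(B, 1, 1)
    eps = torch.randn_like(R0)
    Rk = ab.sqrt() * R0 + (1.0 - ab).sqrt() * eps         # forward corruption q(r_k | r_0)
    err = (eps - eps_model(Rk, k, S_hat)).pow(2).mean(dim=(1, 2))
    if gamma is not None:                                 # optional SNR-based reweighting
        snr = (ab / (1.0 - ab)).view(B)                   # SNR_k = ab_k / (1 - ab_k)
        err = torch.clamp(snr, max=gamma) / snr * err     # w_k = min(SNR_k, gamma) / SNR_k
    return err.mean()

At sampling time the generated residual is added back to the predicted trend, matching the reconstruction line at the end of this hunk.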
@@ -88,12 +74,12 @@ applied independently across variables and timesteps.
 
 The denoiser $h_{\psi}$ predicts categorical distributions conditioned on continuous context:
 \begin{equation}
-p_{\psi}\bigl( y^{(j)}_0 \mid y_k, k, \hat{\bS}, \hat{\bX} \bigr) = h_{\psi}(y_k, k, \hat{\bS}, \hat{\bX}).
+p_{\psi}\bigl( y^{(j)}_0 \mid y_k, k, \hat{\bm{S}}, \hat{\bm{X}} \bigr) = h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}}).
 \label{eq:discrete_denoising}
 \end{equation}
 Training minimizes the categorical cross-entropy:
 \begin{equation}
-\mathcal{L}_{\text{disc}}(\psi) = \mathbb{E}_{k} \left[ \frac{1}{|\mathcal{M}|} \sum_{(j,t) \in \mathcal{M}} \CE\bigl( h_{\psi}(y_k, k, \hat{\bS}, \hat{\bX})_{j,t},\; y^{(j)}_{0,t} \bigr) \right],
+\mathcal{L}_{\text{disc}}(\psi) = \mathbb{E}_{k} \left[ \frac{1}{|\mathcal{M}|} \sum_{(j,t) \in \mathcal{M}} \mathrm{CE}\bigl( h_{\psi}(y_k, k, \hat{\bm{S}}, \hat{\bm{X}})_{j,t},\; y^{(j)}_{0,t} \bigr) \right],
 \label{eq:discrete_loss}
 \end{equation}
 where $\mathcal{M}$ denotes masked positions at step $k$.
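For the discrete side changed here, a hedged sketch of the mask-then-cross-entropy step for a single channel, assuming a dedicated mask token id and a placeholder denoiser h_psi (all names are illustrative, not taken from the source):

# Single discrete channel, illustrative only: mask positions independently with
# probability m_k, then average cross-entropy over the masked set M.
import torch
import torch.nn.functional as F

def masked_ce_loss(h_psi, y0, S_hat, X_hat, m_k, k, mask_id):
    """y0: (B, L) token ids; h_psi returns logits of shape (B, L, vocab_size)."""
    mask = torch.rand(y0.shape, device=y0.device) < m_k       # independent per variable/timestep
    yk = torch.where(mask, torch.full_like(y0, mask_id), y0)  # forward masking at step k
    logits = h_psi(yk, k, S_hat, X_hat)
    if not mask.any():                                         # nothing masked at this step
        return logits.sum() * 0.0
    return F.cross_entropy(logits[mask], y0[mask])             # mean over masked positions only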
@@ -104,6 +90,6 @@ The combined objective balances continuous and discrete learning:
 \mathcal{L} = \lambda \, \mathcal{L}_{\text{cont}} + (1 - \lambda) \, \mathcal{L}_{\text{disc}}, \quad \lambda \in [0,1].
 \label{eq:joint_objective}
 \end{equation}
-Type-aware routing enforces deterministic reconstruction $\hat{x}^{(i)} = g_i(\hat{\bX}, \hat{\bY})$ for derived variables.
+Type-aware routing enforces deterministic reconstruction $\hat{x}^{(i)} = g_i(\hat{\bm{X}}, \hat{\bm{Y}})$ for derived variables.
 
 \end{document}
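The joint objective in this last hunk is a convex combination of the two losses above; a one-line sketch for completeness, with lam as an assumed hyperparameter name:

def joint_loss(l_cont, l_disc, lam=0.5):
    """L = lambda * L_cont + (1 - lambda) * L_disc, with lambda in [0, 1]."""
    assert 0.0 <= lam <= 1.0
    return lam * l_cont + (1.0 - lam) * l_disc

The type-aware routing mentioned in the same hunk is, as written, a deterministic reconstruction g_i applied to the sampled outputs for derived variables, so it would sit outside this loss.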