Clean artifacts and update example pipeline

2026-01-22 16:32:51 +08:00
parent c0639386be
commit c3f750cd9d
20 changed files with 651 additions and 30826 deletions

View File

@@ -12,33 +12,54 @@ CSV (train1) and produces a continuous/discrete split using a simple heuristic.
- train_stub.py: end-to-end scaffold for loss computation.
- train.py: minimal training loop with checkpoints.
- sample.py: minimal sampling loop.
- export_samples.py: sample + export to CSV with original column names.
- evaluate_generated.py: basic eval of generated CSV vs training stats.
- config.json: training defaults for train.py.
- model_design.md: step-by-step design notes.
- results/feature_split.txt: comma-separated feature lists.
- results/summary.txt: basic stats (rows sampled, column counts).
## Run
```
python /home/anay/Dev/diffusion/mask-ddpm/example/analyze_hai21_03.py
python example/analyze_hai21_03.py
```
Prepare vocab + stats (writes to `example/results`):
```
python /home/anay/Dev/diffusion/mask-ddpm/example/prepare_data.py
python example/prepare_data.py
```
Train a small run:
```
python /home/anay/Dev/diffusion/mask-ddpm/example/train.py
python example/train.py --config example/config.json
```
Sample from the trained model:
```
python /home/anay/Dev/diffusion/mask-ddpm/example/sample.py
python example/sample.py
```
Sample and export CSV:
```
python example/export_samples.py --include-time --device cpu
```
Evaluate generated CSV (writes eval.json):
```
python example/evaluate_generated.py
```
One-click pipeline (prepare -> train -> export -> eval -> plot):
```
python example/run_pipeline.py --device auto
```
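`run_pipeline.py` also accepts `--config`, so a run against the same config on a GPU box might look like:
```
python example/run_pipeline.py --config example/config.json --device cuda
```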
## Notes
- Heuristic: integer-like numeric values with low cardinality (<= 10) are treated as discrete; all other numeric columns are continuous (a minimal sketch appears after this list).
- Set `device` in `example/config.json` to `auto` or `cuda` when moving to a GPU machine.
- Attack label columns (`attack*`) are excluded from training and generation.
- The `time` column is always excluded from training and generation; it can optionally be re-added at export time via `--include-time`.
- `analyze_hai21_03.py` only samples the first 5000 rows to stay fast.
- `prepare_data.py` runs without PyTorch, but `train.py` and `sample.py` require it.
- `train.py` and `sample.py` auto-select GPU if available; otherwise they fall back to CPU.
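For reference, a minimal sketch of the split heuristic described above; the actual implementation in `analyze_hai21_03.py` may differ in details:
```
def classify_column(values):
    """Sketch: non-numeric -> discrete; integer-like, low-cardinality -> discrete; else continuous."""
    numeric = []
    for v in values:
        try:
            numeric.append(float(v))
        except ValueError:
            return "discrete"  # non-numeric columns are always discrete
    if all(x.is_integer() for x in numeric) and len(set(numeric)) <= 10:
        return "discrete"
    return "continuous"
```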

View File

@@ -8,9 +8,12 @@ Everything else numeric -> continuous. Non-numeric -> discrete.
import csv
import gzip
import os
from pathlib import Path
DATA_PATH = "/home/anay/Dev/diffusion/dataset/hai/hai-21.03/train1.csv.gz"
OUT_DIR = "/home/anay/Dev/diffusion/mask-ddpm/example/results"
BASE_DIR = Path(__file__).resolve().parent
REPO_DIR = BASE_DIR.parent.parent
DATA_PATH = str(REPO_DIR / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz")
OUT_DIR = str(BASE_DIR / "results")
MAX_ROWS = 5000

example/config.json (new file, 18 lines)
View File

@@ -0,0 +1,18 @@
{
"data_path": "../../dataset/hai/hai-21.03/train1.csv.gz",
"split_path": "./feature_split.json",
"stats_path": "./results/cont_stats.json",
"vocab_path": "./results/disc_vocab.json",
"out_dir": "./results",
"device": "auto",
"timesteps": 400,
"batch_size": 64,
"seq_len": 128,
"epochs": 5,
"max_batches": 2000,
"lambda": 0.5,
"lr": 0.0005,
"seed": 1337,
"log_every": 10,
"ckpt_every": 50
}
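A rough sketch of how `train.py` consumes this file (assumption: mirrors the merge logic shown in the `train.py` diff further down): values are overlaid on built-in defaults, and relative paths are resolved against the config file's directory, so a partial config is enough.
```
# Sketch only, assuming the behaviour of resolve_config_paths in train.py below.
import json
from pathlib import Path

def load_config(path, defaults):
    cfg = dict(defaults)                      # start from built-in defaults
    cfg_path = Path(path).resolve()
    with open(cfg_path, "r", encoding="ascii") as f:
        cfg.update(json.load(f))              # config values win over defaults
    for key in ("data_path", "split_path", "stats_path", "vocab_path", "out_dir"):
        if key in cfg and not Path(str(cfg[key])).is_absolute():
            cfg[key] = str((cfg_path.parent / Path(str(cfg[key]))).resolve())
    return cfg
```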

View File

@@ -66,6 +66,8 @@ def build_vocab(
vocab = {}
for c in disc_cols:
tokens = sorted(values[c])
if "<UNK>" not in tokens:
tokens.append("<UNK>")
vocab[c] = {tok: idx for idx, tok in enumerate(tokens)}
return vocab
@@ -105,7 +107,7 @@ def windowed_batches(
batches_yielded = 0
for row in iter_rows(path):
cont_row = [float(row[c]) for c in cont_cols]
disc_row = [vocab[c][row[c]] for c in disc_cols]
disc_row = [vocab[c].get(row[c], vocab[c]["<UNK>"]) for c in disc_cols]
seq_cont.append(cont_row)
seq_disc.append(disc_row)
if len(seq_cont) == seq_len:
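A quick illustration of the `<UNK>` fallback added above, with a hypothetical vocab just to show the lookup behaviour:
```
# Hypothetical vocab for one discrete column after build_vocab appends "<UNK>".
vocab_c = {"0": 0, "1": 1, "<UNK>": 2}
print(vocab_c.get("1", vocab_c["<UNK>"]))  # 1 (seen token)
print(vocab_c.get("7", vocab_c["<UNK>"]))  # 2 (unseen token maps to <UNK>)
```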

View File

@@ -0,0 +1,116 @@
#!/usr/bin/env python3
"""Evaluate generated samples against simple stats and vocab validity."""
import argparse
import csv
import gzip
import json
from pathlib import Path
from typing import Dict, Tuple
def load_json(path: str) -> Dict:
with open(path, "r", encoding="ascii") as f:
return json.load(f)
def open_csv(path: str):
if path.endswith(".gz"):
return gzip.open(path, "rt", newline="")
return open(path, "r", newline="")
def parse_args():
parser = argparse.ArgumentParser(description="Evaluate generated CSV samples.")
base_dir = Path(__file__).resolve().parent
parser.add_argument("--generated", default=str(base_dir / "results" / "generated.csv"))
parser.add_argument("--split", default=str(base_dir / "feature_split.json"))
parser.add_argument("--stats", default=str(base_dir / "results" / "cont_stats.json"))
parser.add_argument("--vocab", default=str(base_dir / "results" / "disc_vocab.json"))
parser.add_argument("--out", default=str(base_dir / "results" / "eval.json"))
return parser.parse_args()
def init_stats(cols):
return {c: {"count": 0, "mean": 0.0, "m2": 0.0} for c in cols}
def update_stats(stats, col, value):
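# Welford-style online update: keep a running mean plus the sum of squared
# deviations (m2), so variance can be finalized later in a single pass.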
st = stats[col]
st["count"] += 1
delta = value - st["mean"]
st["mean"] += delta / st["count"]
delta2 = value - st["mean"]
st["m2"] += delta * delta2
def finalize_stats(stats):
out = {}
for c, st in stats.items():
if st["count"] > 1:
var = st["m2"] / (st["count"] - 1)
else:
var = 0.0
out[c] = {"mean": st["mean"], "std": var ** 0.5}
return out
def main():
args = parse_args()
split = load_json(args.split)
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
stats = load_json(args.stats)
stats_ref = stats["mean"]
std_ref = stats["std"]
vocab = load_json(args.vocab)["vocab"]
vocab_sets = {c: set(vocab[c].keys()) for c in disc_cols}
cont_stats = init_stats(cont_cols)
disc_invalid = {c: 0 for c in disc_cols}
rows = 0
with open_csv(args.generated) as f:
reader = csv.DictReader(f)
for row in reader:
rows += 1
if time_col in row:
row.pop(time_col, None)
for c in cont_cols:
try:
v = float(row[c])
except Exception:
v = 0.0
update_stats(cont_stats, c, v)
for c in disc_cols:
if row[c] not in vocab_sets[c]:
disc_invalid[c] += 1
cont_summary = finalize_stats(cont_stats)
cont_err = {}
for c in cont_cols:
ref_mean = float(stats_ref[c])
ref_std = float(std_ref[c]) if float(std_ref[c]) != 0 else 1.0
gen_mean = cont_summary[c]["mean"]
gen_std = cont_summary[c]["std"]
cont_err[c] = {
"mean_abs_err": abs(gen_mean - ref_mean),
"std_abs_err": abs(gen_std - ref_std),
}
report = {
"rows": rows,
"continuous_summary": cont_summary,
"continuous_error": cont_err,
"discrete_invalid_counts": disc_invalid,
}
with open(args.out, "w", encoding="ascii") as f:
json.dump(report, f, indent=2)
print("eval_report", args.out)
if __name__ == "__main__":
main()

example/export_samples.py (new file, 183 lines)
View File

@@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""Sample from a trained hybrid diffusion model and export to CSV."""
import argparse
import csv
import gzip
import json
import os
from pathlib import Path
from typing import Dict, List
import torch
import torch.nn.functional as F
from data_utils import load_split
from hybrid_diffusion import HybridDiffusionModel, cosine_beta_schedule
def load_vocab(path: str) -> Dict[str, Dict[str, int]]:
with open(path, "r", encoding="ascii") as f:
return json.load(f)["vocab"]
def load_stats(path: str):
with open(path, "r", encoding="ascii") as f:
return json.load(f)
def read_header(path: str) -> List[str]:
if path.endswith(".gz"):
opener = gzip.open
mode = "rt"
else:
opener = open
mode = "r"
with opener(path, mode, newline="") as f:
reader = csv.reader(f)
return next(reader)
def build_inverse_vocab(vocab: Dict[str, Dict[str, int]]) -> Dict[str, List[str]]:
inv = {}
for col, mapping in vocab.items():
inverse = [""] * len(mapping)
for tok, idx in mapping.items():
inverse[idx] = tok
inv[col] = inverse
return inv
def parse_args():
parser = argparse.ArgumentParser(description="Sample and export HAI feature sequences.")
base_dir = Path(__file__).resolve().parent
repo_dir = base_dir.parent.parent
parser.add_argument("--data-path", default=str(repo_dir / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz"))
parser.add_argument("--split-path", default=str(base_dir / "feature_split.json"))
parser.add_argument("--stats-path", default=str(base_dir / "results" / "cont_stats.json"))
parser.add_argument("--vocab-path", default=str(base_dir / "results" / "disc_vocab.json"))
parser.add_argument("--model-path", default=str(base_dir / "results" / "model.pt"))
parser.add_argument("--out", default=str(base_dir / "results" / "generated.csv"))
parser.add_argument("--timesteps", type=int, default=200)
parser.add_argument("--seq-len", type=int, default=64)
parser.add_argument("--batch-size", type=int, default=2)
parser.add_argument("--device", default="auto", help="cpu, cuda, or auto")
parser.add_argument("--include-time", action="store_true", help="Include time column as a simple index")
return parser.parse_args()
def resolve_device(mode: str) -> str:
mode = mode.lower()
if mode == "cpu":
return "cpu"
if mode == "cuda":
if not torch.cuda.is_available():
raise SystemExit("device set to cuda but CUDA is not available")
return "cuda"
if torch.cuda.is_available():
return "cuda"
return "cpu"
def main():
args = parse_args()
if not os.path.exists(args.model_path):
raise SystemExit("missing model file: %s" % args.model_path)
split = load_split(args.split_path)
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
stats = load_stats(args.stats_path)
mean = stats["mean"]
std = stats["std"]
vocab = load_vocab(args.vocab_path)
inv_vocab = build_inverse_vocab(vocab)
vocab_sizes = [len(vocab[c]) for c in disc_cols]
device = resolve_device(args.device)
model = HybridDiffusionModel(cont_dim=len(cont_cols), disc_vocab_sizes=vocab_sizes).to(device)
model.load_state_dict(torch.load(args.model_path, map_location=device, weights_only=True))
model.eval()
betas = cosine_beta_schedule(args.timesteps).to(device)
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)
x_cont = torch.randn(args.batch_size, args.seq_len, len(cont_cols), device=device)
x_disc = torch.full(
(args.batch_size, args.seq_len, len(disc_cols)),
0,
device=device,
dtype=torch.long,
)
mask_tokens = torch.tensor(vocab_sizes, device=device)
for i in range(len(disc_cols)):
x_disc[:, :, i] = mask_tokens[i]
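# Every discrete position starts fully masked; for column i the mask token id is
# vocab_sizes[i], i.e. one index past the last real token in that column's vocab.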
for t in reversed(range(args.timesteps)):
t_batch = torch.full((args.batch_size,), t, device=device, dtype=torch.long)
eps_pred, logits = model(x_cont, x_disc, t_batch)
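# Continuous part: standard DDPM ancestral step,
#   mean = 1/sqrt(alpha_t) * (x_t - (1 - alpha_t)/sqrt(1 - alpha_bar_t) * eps_pred),
# with fresh Gaussian noise scaled by sqrt(beta_t) added for every t > 0.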
a_t = alphas[t]
a_bar_t = alphas_cumprod[t]
coef1 = 1.0 / torch.sqrt(a_t)
coef2 = (1 - a_t) / torch.sqrt(1 - a_bar_t)
mean_x = coef1 * (x_cont - coef2 * eps_pred)
if t > 0:
noise = torch.randn_like(x_cont)
x_cont = mean_x + torch.sqrt(betas[t]) * noise
else:
x_cont = mean_x
for i, logit in enumerate(logits):
if t == 0:
probs = F.softmax(logit, dim=-1)
x_disc[:, :, i] = torch.argmax(probs, dim=-1)
else:
mask = x_disc[:, :, i] == mask_tokens[i]
if mask.any():
probs = F.softmax(logit, dim=-1)
sampled = torch.multinomial(probs.view(-1, probs.size(-1)), 1).view(
args.batch_size, args.seq_len
)
x_disc[:, :, i][mask] = sampled[mask]
x_cont = x_cont.cpu()
x_disc = x_disc.cpu()
mean_vec = torch.tensor([mean[c] for c in cont_cols], dtype=x_cont.dtype)
std_vec = torch.tensor([std[c] for c in cont_cols], dtype=x_cont.dtype)
x_cont = x_cont * std_vec + mean_vec
header = read_header(args.data_path)
out_cols = [c for c in header if c != time_col or args.include_time]
os.makedirs(os.path.dirname(args.out), exist_ok=True)
with open(args.out, "w", newline="", encoding="ascii") as f:
writer = csv.DictWriter(f, fieldnames=out_cols)
writer.writeheader()
row_index = 0
for b in range(args.batch_size):
for t in range(args.seq_len):
row = {}
if args.include_time and time_col in header:
row[time_col] = str(row_index)
for i, c in enumerate(cont_cols):
row[c] = ("%.6f" % float(x_cont[b, t, i]))
for i, c in enumerate(disc_cols):
tok_idx = int(x_disc[b, t, i])
tok = inv_vocab[c][tok_idx] if tok_idx < len(inv_vocab[c]) else "0"
row[c] = tok
writer.writerow(row)
row_index += 1
print("exported_csv", args.out)
print("rows", args.batch_size * args.seq_len)
if __name__ == "__main__":
main()

example/plot_loss.py (new file, 63 lines)
View File

@@ -0,0 +1,63 @@
#!/usr/bin/env python3
"""Plot training loss curves from train_log.csv."""
import argparse
import csv
from pathlib import Path
import matplotlib.pyplot as plt
def parse_args():
parser = argparse.ArgumentParser(description="Plot loss curves from train_log.csv")
base_dir = Path(__file__).resolve().parent
parser.add_argument(
"--log",
default=str(base_dir / "results" / "train_log.csv"),
help="Path to train_log.csv",
)
parser.add_argument(
"--out",
default=str(base_dir / "results" / "train_loss.png"),
help="Output PNG path",
)
return parser.parse_args()
def main():
args = parse_args()
log_path = Path(args.log)
if not log_path.exists():
raise SystemExit("missing log file: %s" % log_path)
steps = []
loss = []
loss_cont = []
loss_disc = []
with log_path.open("r", encoding="ascii", newline="") as f:
reader = csv.DictReader(f)
for row in reader:
steps.append(int(row["step"]))
loss.append(float(row["loss"]))
loss_cont.append(float(row["loss_cont"]))
loss_disc.append(float(row["loss_disc"]))
if not steps:
raise SystemExit("no rows in log file: %s" % log_path)
plt.figure(figsize=(8, 5))
plt.plot(steps, loss, label="total")
plt.plot(steps, loss_cont, label="continuous")
plt.plot(steps, loss_disc, label="discrete")
plt.xlabel("step")
plt.ylabel("loss")
plt.title("Training Loss")
plt.legend()
plt.tight_layout()
plt.savefig(args.out, dpi=150)
print("saved", args.out)
if __name__ == "__main__":
main()

View File

@@ -2,20 +2,24 @@
"""Prepare vocab and normalization stats for HAI 21.03."""
import json
from pathlib import Path
from typing import Optional
from data_utils import compute_cont_stats, build_vocab, load_split
DATA_PATH = "/home/anay/Dev/diffusion/dataset/hai/hai-21.03/train1.csv.gz"
SPLIT_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/feature_split.json"
OUT_STATS = "/home/anay/Dev/diffusion/mask-ddpm/example/results/cont_stats.json"
OUT_VOCAB = "/home/anay/Dev/diffusion/mask-ddpm/example/results/disc_vocab.json"
BASE_DIR = Path(__file__).resolve().parent
REPO_DIR = BASE_DIR.parent.parent
DATA_PATH = str(REPO_DIR / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz")
SPLIT_PATH = str(BASE_DIR / "feature_split.json")
OUT_STATS = str(BASE_DIR / "results" / "cont_stats.json")
OUT_VOCAB = str(BASE_DIR / "results" / "disc_vocab.json")
def main(max_rows: Optional[int] = None):
split = load_split(SPLIT_PATH)
cont_cols = split["continuous"]
disc_cols = split["discrete"]
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
mean, std = compute_cont_stats(DATA_PATH, cont_cols, max_rows=max_rows)
vocab = build_vocab(DATA_PATH, disc_cols, max_rows=max_rows)

View File

@@ -1,113 +0,0 @@
{
"mean": {
"P1_B2004": 0.08649086820000026,
"P1_B2016": 1.376161456000001,
"P1_B3004": 396.1861596906018,
"P1_B3005": 1037.372384413793,
"P1_B4002": 32.564872940799994,
"P1_B4005": 65.98190757240047,
"P1_B400B": 1925.0391570245934,
"P1_B4022": 36.28908066800001,
"P1_FCV02Z": 21.744261118400036,
"P1_FCV03D": 57.36123274140044,
"P1_FCV03Z": 58.05084519640002,
"P1_FT01": 184.18615112319728,
"P1_FT01Z": 851.8781750705965,
"P1_FT02": 1255.8572173544069,
"P1_FT02Z": 1925.0210755194114,
"P1_FT03": 269.37285885780574,
"P1_FT03Z": 1037.366172230601,
"P1_LCV01D": 11.228849048599963,
"P1_LCV01Z": 10.991610181600016,
"P1_LIT01": 396.8845311109994,
"P1_PCV01D": 53.80101618419986,
"P1_PCV01Z": 54.646640287199595,
"P1_PCV02Z": 12.017773542800072,
"P1_PIT01": 1.3692859488000075,
"P1_PIT02": 0.44459071260000227,
"P1_TIT01": 35.64255813999988,
"P1_TIT02": 36.44807823060023,
"P2_24Vdc": 28.0280019013999,
"P2_CO_rpm": 54105.64434999997,
"P2_HILout": 712.0588667425922,
"P2_MSD": 763.19324,
"P2_SIT01": 778.7769850000013,
"P2_SIT02": 778.7778935471981,
"P2_VT01": 11.914949448200044,
"P2_VXT02": -3.5267871940000175,
"P2_VXT03": -1.5520904921999914,
"P2_VYT02": 3.796112737600002,
"P2_VYT03": 6.121691697000018,
"P3_FIT01": 1168.2528800000014,
"P3_LCP01D": 4675.465239999989,
"P3_LCV01D": 7445.208720000017,
"P3_LIT01": 13728.982314999852,
"P3_PIT01": 668.9722350000003,
"P4_HT_FD": -0.00010012580000000082,
"P4_HT_LD": 35.41945000099953,
"P4_HT_PO": 35.4085699912002,
"P4_LD": 365.3833745803986,
"P4_ST_FD": -6.5205999999999635e-06,
"P4_ST_GOV": 17801.81294499996,
"P4_ST_LD": 329.83259218199964,
"P4_ST_PO": 330.1079461497967,
"P4_ST_PT01": 10047.679605000127,
"P4_ST_TT01": 27606.860070000155
},
"std": {
"P1_B2004": 0.024492489898690458,
"P1_B2016": 0.12949272564759745,
"P1_B3004": 10.16264800653289,
"P1_B3005": 70.85697659109,
"P1_B4002": 0.7578213113008356,
"P1_B4005": 41.80065314991797,
"P1_B400B": 1176.6445547448632,
"P1_B4022": 0.8221115066487089,
"P1_FCV02Z": 39.11843197764176,
"P1_FCV03D": 7.889507447726624,
"P1_FCV03Z": 8.046068905945717,
"P1_FT01": 30.80117031882856,
"P1_FT01Z": 91.2786865433318,
"P1_FT02": 879.7163277334494,
"P1_FT02Z": 1176.6699531305114,
"P1_FT03": 38.18015841964941,
"P1_FT03Z": 70.73100774436428,
"P1_LCV01D": 3.3355655415557597,
"P1_LCV01Z": 3.386332233773545,
"P1_LIT01": 10.57871476010412,
"P1_PCV01D": 19.61567943613885,
"P1_PCV01Z": 19.778754467302086,
"P1_PCV02Z": 0.004804797893159998,
"P1_PIT01": 0.0776614954053113,
"P1_PIT02": 0.44823231815652304,
"P1_TIT01": 0.5986678527528815,
"P1_TIT02": 1.1892341204521049,
"P2_24Vdc": 0.00320884250409781,
"P2_CO_rpm": 20.57547782150726,
"P2_HILout": 8.17885337990861,
"P2_MSD": 1.0,
"P2_SIT01": 3.894535775667256,
"P2_SIT02": 3.882477078857941,
"P2_VT01": 0.06812990916670243,
"P2_VXT02": 0.43104157117568803,
"P2_VXT03": 0.26894251958139775,
"P2_VYT02": 0.46109078832075856,
"P2_VYT03": 0.3059642938507547,
"P3_FIT01": 1787.2987693141868,
"P3_LCP01D": 5145.4094261812725,
"P3_LCV01D": 6785.602781765096,
"P3_LIT01": 4060.915441872745,
"P3_PIT01": 1168.1071264424027,
"P4_HT_FD": 0.002032582380617592,
"P4_HT_LD": 33.212361169253235,
"P4_HT_PO": 31.187825914515162,
"P4_LD": 59.736616589045646,
"P4_ST_FD": 0.0016428787127432496,
"P4_ST_GOV": 1740.5997458128215,
"P4_ST_LD": 35.86633288900077,
"P4_ST_PO": 32.375012735256696,
"P4_ST_PT01": 22.459962818146252,
"P4_ST_TT01": 24.745939350221477
},
"max_rows": 50000
}

File diff suppressed because it is too large

View File

@@ -1,4 +0,0 @@
discrete
P1_FCV01D,P1_FCV01Z,P1_FCV02D,P1_PCV02D,P1_PP01AD,P1_PP01AR,P1_PP01BD,P1_PP01BR,P1_PP02D,P1_PP02R,P1_STSP,P2_ASD,P2_AutoGO,P2_Emerg,P2_ManualGO,P2_OnOff,P2_RTR,P2_TripEx,P2_VTR01,P2_VTR02,P2_VTR03,P2_VTR04,P3_LH,P3_LL,P4_HT_PS,P4_ST_PS,attack,attack_P1,attack_P2,attack_P3
continuous
P1_B2004,P1_B2016,P1_B3004,P1_B3005,P1_B4002,P1_B4005,P1_B400B,P1_B4022,P1_FCV02Z,P1_FCV03D,P1_FCV03Z,P1_FT01,P1_FT01Z,P1_FT02,P1_FT02Z,P1_FT03,P1_FT03Z,P1_LCV01D,P1_LCV01Z,P1_LIT01,P1_PCV01D,P1_PCV01Z,P1_PCV02Z,P1_PIT01,P1_PIT02,P1_TIT01,P1_TIT02,P2_24Vdc,P2_CO_rpm,P2_HILout,P2_MSD,P2_SIT01,P2_SIT02,P2_VT01,P2_VXT02,P2_VXT03,P2_VYT02,P2_VYT03,P3_FIT01,P3_LCP01D,P3_LCV01D,P3_LIT01,P3_PIT01,P4_HT_FD,P4_HT_LD,P4_HT_PO,P4_LD,P4_ST_FD,P4_ST_GOV,P4_ST_LD,P4_ST_PO,P4_ST_PT01,P4_ST_TT01

Binary file not shown.

View File

@@ -1,6 +0,0 @@
rows_sampled: 5000
columns_total: 84
continuous: 53
discrete: 30
unknown: 0
data_path: /home/anay/Dev/diffusion/dataset/hai/hai-21.03/train1.csv.gz

example/run_pipeline.py (new file, 50 lines)
View File

@@ -0,0 +1,50 @@
#!/usr/bin/env python3
"""One-click pipeline: prepare -> train -> export -> evaluate -> plot loss."""
import argparse
import subprocess
import sys
from pathlib import Path
def run(cmd):
print("running:", " ".join(cmd))
subprocess.run(cmd, check=True)
def parse_args():
parser = argparse.ArgumentParser(description="Run full HAI pipeline.")
base_dir = Path(__file__).resolve().parent
parser.add_argument(
"--config",
default=str(base_dir / "config.json"),
help="Path to training config JSON",
)
parser.add_argument(
"--device",
default="auto",
help="cpu, cuda, or auto (used for export_samples.py)",
)
return parser.parse_args()
def main():
args = parse_args()
base_dir = Path(__file__).resolve().parent
run([sys.executable, str(base_dir / "prepare_data.py")])
run([sys.executable, str(base_dir / "train.py"), "--config", args.config])
run(
[
sys.executable,
str(base_dir / "export_samples.py"),
"--include-time",
"--device",
args.device,
]
)
run([sys.executable, str(base_dir / "evaluate_generated.py")])
run([sys.executable, str(base_dir / "plot_loss.py")])
if __name__ == "__main__":
main()

View File

@@ -4,6 +4,7 @@
import json
import math
import os
from pathlib import Path
import torch
import torch.nn.functional as F
@@ -11,11 +12,25 @@ import torch.nn.functional as F
from data_utils import load_split
from hybrid_diffusion import HybridDiffusionModel, cosine_beta_schedule
SPLIT_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/feature_split.json"
VOCAB_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/results/disc_vocab.json"
MODEL_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/results/model.pt"
BASE_DIR = Path(__file__).resolve().parent
SPLIT_PATH = str(BASE_DIR / "feature_split.json")
VOCAB_PATH = str(BASE_DIR / "results" / "disc_vocab.json")
MODEL_PATH = str(BASE_DIR / "results" / "model.pt")
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
def resolve_device(mode: str) -> str:
mode = mode.lower()
if mode == "cpu":
return "cpu"
if mode == "cuda":
if not torch.cuda.is_available():
raise SystemExit("device set to cuda but CUDA is not available")
return "cuda"
if torch.cuda.is_available():
return "cuda"
return "cpu"
DEVICE = resolve_device("auto")
TIMESTEPS = 200
SEQ_LEN = 64
BATCH_SIZE = 2
@@ -28,8 +43,9 @@ def load_vocab():
def main():
split = load_split(SPLIT_PATH)
cont_cols = split["continuous"]
disc_cols = split["discrete"]
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
vocab = load_vocab()
vocab_sizes = [len(vocab[c]) for c in disc_cols]

View File

@@ -1,8 +1,12 @@
#!/usr/bin/env python3
"""Train hybrid diffusion on HAI 21.03 (minimal runnable example)."""
"""Train hybrid diffusion on HAI (configurable runnable example)."""
import argparse
import json
import os
import random
from pathlib import Path
from typing import Dict
import torch
import torch.nn.functional as F
@@ -15,83 +19,140 @@ from hybrid_diffusion import (
q_sample_discrete,
)
DATA_PATH = "/home/anay/Dev/diffusion/dataset/hai/hai-21.03/train1.csv.gz"
SPLIT_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/feature_split.json"
STATS_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/results/cont_stats.json"
VOCAB_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/results/disc_vocab.json"
OUT_DIR = "/home/anay/Dev/diffusion/mask-ddpm/example/results"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
TIMESTEPS = 1000
BATCH_SIZE = 8
SEQ_LEN = 64
EPOCHS = 1
MAX_BATCHES = 50
LAMBDA = 0.5
LR = 1e-3
BASE_DIR = Path(__file__).resolve().parent
REPO_DIR = BASE_DIR.parent.parent
DEFAULTS = {
"data_path": str(REPO_DIR / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz"),
"split_path": str(BASE_DIR / "feature_split.json"),
"stats_path": str(BASE_DIR / "results" / "cont_stats.json"),
"vocab_path": str(BASE_DIR / "results" / "disc_vocab.json"),
"out_dir": str(BASE_DIR / "results"),
"device": "auto",
"timesteps": 1000,
"batch_size": 8,
"seq_len": 64,
"epochs": 1,
"max_batches": 50,
"lambda": 0.5,
"lr": 1e-3,
"seed": 1337,
"log_every": 10,
"ckpt_every": 50,
}
def load_stats():
with open(STATS_PATH, "r", encoding="ascii") as f:
def load_json(path: str) -> Dict:
with open(path, "r", encoding="ascii") as f:
return json.load(f)
def load_vocab():
with open(VOCAB_PATH, "r", encoding="ascii") as f:
return json.load(f)["vocab"]
def set_seed(seed: int):
random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
def resolve_device(mode: str) -> str:
mode = mode.lower()
if mode == "cpu":
return "cpu"
if mode == "cuda":
if not torch.cuda.is_available():
raise SystemExit("device set to cuda but CUDA is not available")
return "cuda"
if torch.cuda.is_available():
return "cuda"
return "cpu"
def parse_args():
parser = argparse.ArgumentParser(description="Train hybrid diffusion on HAI.")
parser.add_argument("--config", default=None, help="Path to JSON config.")
return parser.parse_args()
def resolve_config_paths(config, base_dir: Path):
keys = ["data_path", "split_path", "stats_path", "vocab_path", "out_dir"]
for key in keys:
if key in config:
path = Path(str(config[key]))
if not path.is_absolute():
config[key] = str((base_dir / path).resolve())
return config
def main():
split = load_split(SPLIT_PATH)
cont_cols = split["continuous"]
disc_cols = split["discrete"]
args = parse_args()
config = dict(DEFAULTS)
if args.config:
cfg_path = Path(args.config).resolve()
config.update(load_json(str(cfg_path)))
config = resolve_config_paths(config, cfg_path.parent)
else:
config = resolve_config_paths(config, BASE_DIR)
stats = load_stats()
set_seed(int(config["seed"]))
split = load_split(config["split_path"])
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
stats = load_json(config["stats_path"])
mean = stats["mean"]
std = stats["std"]
vocab = load_vocab()
vocab = load_json(config["vocab_path"])["vocab"]
vocab_sizes = [len(vocab[c]) for c in disc_cols]
print("device", DEVICE)
model = HybridDiffusionModel(cont_dim=len(cont_cols), disc_vocab_sizes=vocab_sizes).to(DEVICE)
opt = torch.optim.Adam(model.parameters(), lr=LR)
device = resolve_device(str(config["device"]))
print("device", device)
model = HybridDiffusionModel(cont_dim=len(cont_cols), disc_vocab_sizes=vocab_sizes).to(device)
opt = torch.optim.Adam(model.parameters(), lr=float(config["lr"]))
betas = cosine_beta_schedule(TIMESTEPS).to(DEVICE)
betas = cosine_beta_schedule(int(config["timesteps"])).to(device)
alphas = 1.0 - betas
alphas_cumprod = torch.cumprod(alphas, dim=0)
os.makedirs(OUT_DIR, exist_ok=True)
os.makedirs(config["out_dir"], exist_ok=True)
log_path = os.path.join(config["out_dir"], "train_log.csv")
with open(log_path, "w", encoding="ascii") as f:
f.write("epoch,step,loss,loss_cont,loss_disc\n")
for epoch in range(EPOCHS):
total_step = 0
for epoch in range(int(config["epochs"])):
for step, (x_cont, x_disc) in enumerate(
windowed_batches(
DATA_PATH,
config["data_path"],
cont_cols,
disc_cols,
vocab,
mean,
std,
batch_size=BATCH_SIZE,
seq_len=SEQ_LEN,
max_batches=MAX_BATCHES,
batch_size=int(config["batch_size"]),
seq_len=int(config["seq_len"]),
max_batches=int(config["max_batches"]),
)
):
x_cont = x_cont.to(DEVICE)
x_disc = x_disc.to(DEVICE)
x_cont = x_cont.to(device)
x_disc = x_disc.to(device)
bsz = x_cont.size(0)
t = torch.randint(0, TIMESTEPS, (bsz,), device=DEVICE)
t = torch.randint(0, int(config["timesteps"]), (bsz,), device=device)
x_cont_t, noise = q_sample_continuous(x_cont, t, alphas_cumprod)
mask_tokens = torch.tensor(vocab_sizes, device=DEVICE)
x_disc_t, mask = q_sample_discrete(x_disc, t, mask_tokens, TIMESTEPS)
mask_tokens = torch.tensor(vocab_sizes, device=device)
x_disc_t, mask = q_sample_discrete(x_disc, t, mask_tokens, int(config["timesteps"]))
eps_pred, logits = model(x_cont_t, x_disc_t, t)
loss_cont = F.mse_loss(eps_pred, noise)
loss_disc = 0.0
for i, logit in enumerate(logits):
if mask[:, :, i].any():
@@ -99,15 +160,31 @@ def main():
logit[mask[:, :, i]], x_disc[:, :, i][mask[:, :, i]]
)
loss = LAMBDA * loss_cont + (1 - LAMBDA) * loss_disc
lam = float(config["lambda"])
loss = lam * loss_cont + (1 - lam) * loss_disc
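# lambda trades off the continuous MSE against the masked-token cross-entropy.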
opt.zero_grad()
loss.backward()
opt.step()
if step % 10 == 0:
if step % int(config["log_every"]) == 0:
print("epoch", epoch, "step", step, "loss", float(loss))
with open(log_path, "a", encoding="ascii") as f:
f.write(
"%d,%d,%.6f,%.6f,%.6f\n"
% (epoch, step, float(loss), float(loss_cont), float(loss_disc))
)
torch.save(model.state_dict(), os.path.join(OUT_DIR, "model.pt"))
total_step += 1
if total_step % int(config["ckpt_every"]) == 0:
ckpt = {
"model": model.state_dict(),
"optim": opt.state_dict(),
"config": config,
"step": total_step,
}
torch.save(ckpt, os.path.join(config["out_dir"], "model_ckpt.pt"))
torch.save(model.state_dict(), os.path.join(config["out_dir"], "model.pt"))
if __name__ == "__main__":
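The checkpoint written every `ckpt_every` steps bundles the model weights, optimizer state, merged config, and global step. Resuming is not implemented in this commit; a minimal, hypothetical sketch of what it could look like, reusing the helpers above:
```
# Hypothetical resume sketch (not part of this commit): rebuild the model from
# the split/vocab files referenced in the saved config, then restore states.
import json
import torch
from data_utils import load_split
from hybrid_diffusion import HybridDiffusionModel

ckpt = torch.load("example/results/model_ckpt.pt", map_location="cpu")
config = ckpt["config"]

split = load_split(config["split_path"])
time_col = split.get("time_column", "time")
cont_cols = [c for c in split["continuous"] if c != time_col]
disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]
with open(config["vocab_path"], "r", encoding="ascii") as f:
    vocab = json.load(f)["vocab"]

model = HybridDiffusionModel(
    cont_dim=len(cont_cols),
    disc_vocab_sizes=[len(vocab[c]) for c in disc_cols],
)
model.load_state_dict(ckpt["model"])
opt = torch.optim.Adam(model.parameters(), lr=float(config["lr"]))
opt.load_state_dict(ckpt["optim"])
start_step = int(ckpt["step"])  # continue logging/checkpointing from here
```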

View File

@@ -8,6 +8,7 @@ import csv
import gzip
import json
import math
from pathlib import Path
from typing import Dict, List, Tuple
import torch
@@ -20,8 +21,10 @@ from hybrid_diffusion import (
q_sample_discrete,
)
DATA_PATH = "/home/anay/Dev/diffusion/dataset/hai/hai-21.03/train1.csv.gz"
SPLIT_PATH = "/home/anay/Dev/diffusion/mask-ddpm/example/feature_split.json"
BASE_DIR = Path(__file__).resolve().parent
REPO_DIR = BASE_DIR.parent.parent
DATA_PATH = str(REPO_DIR / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz")
SPLIT_PATH = str(BASE_DIR / "feature_split.json")
DEVICE = "cpu"
TIMESTEPS = 1000