101 lines
3.4 KiB
Python
101 lines
3.4 KiB
Python
#!/usr/bin/env python3
|
|
"""One-click ablation runner for split variants."""
|
|
|
|
import argparse
|
|
import json
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from platform_utils import safe_path, is_windows
|
|
|
|
|
|
def run(cmd):
    """Run a command to completion and fail loudly if it exits non-zero.

    Every element of ``cmd`` is passed through ``safe_path`` before the
    process is launched.

    Args:
        cmd: Sequence of argument strings, program first.

    Raises:
        subprocess.CalledProcessError: If the command returns a non-zero
            exit status (``check=True``).
    """
    argv = [safe_path(part) for part in cmd]
    # The former Windows / non-Windows branches issued the same call
    # (``shell=False`` is already subprocess.run's default), so a single
    # invocation covers every platform.
    subprocess.run(argv, check=True, shell=False)
|
|
|
|
|
|
def parse_args():
    """Parse the command-line options of the ablation runner.

    Path defaults are derived from this script's location: ``config.json``
    next to the script, and a ``train*.csv.gz`` glob under
    ``dataset/hai/hai-21.03`` two directories above it.

    Returns:
        argparse.Namespace with ``device``, ``config``, ``data_glob`` and
        ``max_rows`` attributes.
    """
    script_dir = Path(__file__).resolve().parent
    default_config = script_dir / "config.json"
    default_glob = (
        script_dir.parent.parent / "dataset" / "hai" / "hai-21.03" / "train*.csv.gz"
    )

    cli = argparse.ArgumentParser(description="Run ablations over split variants.")
    cli.add_argument("--device", default="auto")
    cli.add_argument("--config", default=str(default_config))
    cli.add_argument("--data-glob", default=str(default_glob))
    cli.add_argument("--max-rows", type=int, default=50000)
    return cli.parse_args()
|
|
|
|
|
|
def main():
    """Run the whole ablation pipeline, one pass per split variant.

    The split files are generated first. Then, for each of the baseline,
    strict and loose splits, the function prepares the stats/vocab files,
    writes a per-split config derived from the base config, trains, exports
    samples and evaluates. Every step is a separate script launched through
    ``run`` with the current Python interpreter.
    """
    opts = parse_args()
    script_dir = Path(__file__).resolve().parent
    repo_root = script_dir.parent.parent
    out_root = script_dir / "results"
    split_root = out_root / "ablation_splits"
    split_root.mkdir(parents=True, exist_ok=True)

    python = sys.executable

    def script(name):
        # Path of a sibling pipeline script, as a string for the command line.
        return str(script_dir / name)

    # Step 1: generate the split definition files.
    run([
        python,
        script("ablation_splits.py"),
        "--data-glob", opts.data_glob,
        "--max-rows", str(opts.max_rows),
    ])

    for variant in ("split_baseline", "split_strict", "split_loose"):
        split_file = split_root / f"{variant}.json"
        tag = split_file.stem
        stats_file = str(out_root / f"cont_stats_{tag}.json")
        vocab_file = str(out_root / f"disc_vocab_{tag}.json")
        variant_out = out_root / f"ablation_{tag}"

        # Step 2: compute continuous stats and discrete vocab for this split.
        run([
            python,
            script("prepare_data.py"),
            "--data-glob", opts.data_glob,
            "--split-path", str(split_file),
            "--out-stats", stats_file,
            "--out-vocab", vocab_file,
        ])

        # Step 3: start from the base config and override the per-split
        # entries (split/stats/vocab/out_dir). Data locations are resolved
        # to absolute paths so they also work on Windows.
        cfg = json.loads(Path(opts.config).read_text(encoding="utf-8"))
        overrides = {
            "split_path": str(split_file),
            "stats_path": stats_file,
            "vocab_path": vocab_file,
            "out_dir": str(variant_out),
            "data_glob": str(Path(opts.data_glob).resolve()),
            "data_path": str(
                (repo_root / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz").resolve()
            ),
        }
        for key, value in overrides.items():
            cfg[key] = value

        variant_cfg = out_root / f"config_{tag}.json"
        variant_cfg.write_text(json.dumps(cfg, indent=2), encoding="utf-8")

        # Step 4: train with the per-split config.
        run([
            python,
            script("train.py"),
            "--config", str(variant_cfg),
            "--device", opts.device,
        ])

        # Step 5: export samples; sampling options come from the config,
        # with fallbacks when a key is absent.
        seq_len = cfg.get("sample_seq_len", cfg.get("seq_len", 128))
        run([
            python,
            script("export_samples.py"),
            "--include-time",
            "--device", opts.device,
            "--config", str(variant_cfg),
            "--timesteps", str(cfg.get("timesteps", 400)),
            "--seq-len", str(seq_len),
            "--batch-size", str(cfg.get("sample_batch_size", 8)),
            "--clip-k", str(cfg.get("clip_k", 3.0)),
            "--use-ema",
        ])

        # Step 6: evaluate, writing the report into this split's output dir.
        run([
            python,
            script("evaluate_generated.py"),
            "--out", str(variant_out / "eval.json"),
        ])
|
|
|
|
|
|
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not when imported.
    main()
|