#!/usr/bin/env python3
"""One-click ablation runner for split variants."""

import argparse
import json
import subprocess
import sys
from pathlib import Path

from platform_utils import safe_path, resolve_path


def run(cmd):
    """Run a subprocess, raising CalledProcessError on non-zero exit."""
    cmd = [safe_path(c) for c in cmd]
    # shell=False is the default on every platform, so one call covers
    # both Windows and POSIX; keep it explicit to avoid accidental shell=True.
    subprocess.run(cmd, check=True, shell=False)


def parse_args():
    parser = argparse.ArgumentParser(description="Run ablations over split variants.")
    base_dir = Path(__file__).resolve().parent
    parser.add_argument("--device", default="auto")
    parser.add_argument("--config", default=str(base_dir / "config.json"))
    parser.add_argument(
        "--data-glob",
        default=str(base_dir.parent.parent / "dataset" / "hai" / "hai-21.03" / "train*.csv.gz"),
    )
    parser.add_argument("--max-rows", type=int, default=50000)
    return parser.parse_args()


def main():
    args = parse_args()
    base_dir = Path(__file__).resolve().parent
    results_dir = base_dir / "results"
    splits_dir = results_dir / "ablation_splits"
    splits_dir.mkdir(parents=True, exist_ok=True)

    # Generate the split variants (baseline / strict / loose).
    run([
        sys.executable, str(base_dir / "ablation_splits.py"),
        "--data-glob", args.data_glob,
        "--max-rows", str(args.max_rows),
    ])

    split_files = [
        splits_dir / "split_baseline.json",
        splits_dir / "split_strict.json",
        splits_dir / "split_loose.json",
    ]

    for split_path in split_files:
        tag = split_path.stem

        # Compute per-split continuous stats and discrete vocab.
        run([
            sys.executable, str(base_dir / "prepare_data.py"),
            "--data-glob", args.data_glob,
            "--split-path", str(split_path),
            "--out-stats", str(results_dir / f"cont_stats_{tag}.json"),
            "--out-vocab", str(results_dir / f"disc_vocab_{tag}.json"),
        ])

        # Load base config, override split/stats/vocab/out_dir per variant.
        cfg_path = Path(args.config)
        cfg = json.loads(cfg_path.read_text(encoding="utf-8"))
        repo_dir = base_dir.parent.parent
        cfg["split_path"] = str(split_path)
        cfg["stats_path"] = str(results_dir / f"cont_stats_{tag}.json")
        cfg["vocab_path"] = str(results_dir / f"disc_vocab_{tag}.json")
        cfg["out_dir"] = str(results_dir / f"ablation_{tag}")
        # Ensure data paths are absolute so they resolve on Windows.
        cfg["data_glob"] = str(resolve_path(base_dir, args.data_glob))
        cfg["data_path"] = str(
            (repo_dir / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz").resolve()
        )

        temp_cfg = results_dir / f"config_{tag}.json"
        temp_cfg.write_text(json.dumps(cfg, indent=2), encoding="utf-8")

        # Train, sample, and evaluate this variant.
        run([sys.executable, str(base_dir / "train.py"), "--config", str(temp_cfg), "--device", args.device])
        run([
            sys.executable, str(base_dir / "export_samples.py"),
            "--include-time",
            "--device", args.device,
            "--config", str(temp_cfg),
            "--timesteps", str(cfg.get("timesteps", 400)),
            "--seq-len", str(cfg.get("sample_seq_len", cfg.get("seq_len", 128))),
            "--batch-size", str(cfg.get("sample_batch_size", 8)),
            "--clip-k", str(cfg.get("clip_k", 3.0)),
            "--use-ema",
        ])
        run([
            sys.executable, str(base_dir / "evaluate_generated.py"),
            "--out", str(results_dir / f"ablation_{tag}" / "eval.json"),
        ])


if __name__ == "__main__":
    main()
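
# Example invocation (a sketch; the script filename "run_ablations.py" is
# hypothetical, and the HAI 21.03 CSVs are assumed to live under
# dataset/hai/hai-21.03 at the repo root, matching the --data-glob default):
#
#   python run_ablations.py --device cuda --max-rows 20000
#
# Each variant then writes its outputs under results/ablation_<tag>/,
# including the eval.json produced by evaluate_generated.py.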