#!/usr/bin/env python3
"""Prepare vocab and normalization stats for HAI 21.03."""
import json
from pathlib import Path
from typing import Optional

from data_utils import compute_cont_stats, build_vocab, load_split

BASE_DIR = Path(__file__).resolve().parent
REPO_DIR = BASE_DIR.parent.parent
DATA_PATH = str(REPO_DIR / "dataset" / "hai" / "hai-21.03" / "train1.csv.gz")
SPLIT_PATH = str(BASE_DIR / "feature_split.json")
OUT_STATS = str(BASE_DIR / "results" / "cont_stats.json")
OUT_VOCAB = str(BASE_DIR / "results" / "disc_vocab.json")


def main(max_rows: Optional[int] = None):
    split = load_split(SPLIT_PATH)
    time_col = split.get("time_column", "time")
    # Continuous features: everything in the split except the timestamp column.
    cont_cols = [c for c in split["continuous"] if c != time_col]
    # Discrete features: drop attack-label columns and the timestamp column.
    disc_cols = [c for c in split["discrete"] if not c.startswith("attack") and c != time_col]

    mean, std = compute_cont_stats(DATA_PATH, cont_cols, max_rows=max_rows)
    vocab = build_vocab(DATA_PATH, disc_cols, max_rows=max_rows)

    # Ensure the results directory exists before writing.
    Path(OUT_STATS).parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_STATS, "w", encoding="ascii") as f:
        json.dump({"mean": mean, "std": std, "max_rows": max_rows}, f, indent=2)
    with open(OUT_VOCAB, "w", encoding="ascii") as f:
        json.dump({"vocab": vocab, "max_rows": max_rows}, f, indent=2)


if __name__ == "__main__":
    # Default: sample 50000 rows for speed. Set to None for a full scan.
    main(max_rows=50000)