From a1ff64aa40903f374efea23544605bd51d3d5770 Mon Sep 17 00:00:00 2001 From: MingzheYang Date: Wed, 28 Jan 2026 13:46:36 +0800 Subject: [PATCH] Add filtered KS diagnostics and feature-type plan --- docs/README.md | 1 + docs/decisions.md | 7 ++++ docs/ideas.md | 3 ++ example/filtered_metrics.py | 65 +++++++++++++++++++++++++++++++++++++ example/run_all_full.py | 1 + report.md | 32 ++++++++++++++++++ 6 files changed, 109 insertions(+) create mode 100644 example/filtered_metrics.py diff --git a/docs/README.md b/docs/README.md index bb5aedd..a396fae 100644 --- a/docs/README.md +++ b/docs/README.md @@ -14,6 +14,7 @@ Conventions: Tools: - `example/diagnose_ks.py` for per-feature KS + CDF plots. - `example/run_all_full.py` for one-command full pipeline + diagnostics. + - `example/filtered_metrics.py` for filtered KS after removing collapsed/outlier features. Notes: - If `use_quantile_transform` is enabled, run `prepare_data.py` with `full_stats: true` to build quantile tables. diff --git a/docs/decisions.md b/docs/decisions.md index f644804..b51a1c5 100644 --- a/docs/decisions.md +++ b/docs/decisions.md @@ -78,3 +78,10 @@ - **Files**: - `example/prepare_data.py` - `example/config.json` + +## 2026-01-27 — Filtered KS for diagnostics +- **Decision**: Add a filtered KS metric that excludes collapsed/outlier features. +- **Why**: Avoid a handful of features dominating the aggregate KS while still reporting full KS. +- **Files**: + - `example/filtered_metrics.py` + - `example/run_all_full.py` diff --git a/docs/ideas.md b/docs/ideas.md index 193c3d7..03476c4 100644 --- a/docs/ideas.md +++ b/docs/ideas.md @@ -14,3 +14,6 @@ ## Discrete calibration - Hypothesis: post-hoc calibration on discrete marginals can reduce JSD without harming KS. + +## Feature-type split modeling +- Hypothesis: separate generation per feature type (setpoints, controllers, actuators, quantized, derived, aux) yields better overall fidelity. 
diff --git a/example/filtered_metrics.py b/example/filtered_metrics.py
new file mode 100644
index 0000000..7591c72
--- /dev/null
+++ b/example/filtered_metrics.py
@@ -0,0 +1,91 @@
+#!/usr/bin/env python3
+"""Compute a filtered average KS by excluding hard-to-learn features.
+
+Diagnostic helper: reads the per-feature KS values from eval.json, drops
+features whose std has collapsed or whose KS is at/above a threshold, and
+writes the average KS of the remaining features next to the original one.
+"""
+
+import argparse
+import json
+import math
+from pathlib import Path
+
+
+def parse_args():
+    """Parse CLI options; both paths default to results/ beside this script."""
+    parser = argparse.ArgumentParser(description="Filtered metrics from eval.json.")
+    base_dir = Path(__file__).resolve().parent
+    parser.add_argument("--eval", default=str(base_dir / "results" / "eval.json"))
+    parser.add_argument("--min-std", type=float, default=1e-3, help="threshold for std collapse")
+    parser.add_argument("--ks-threshold", type=float, default=0.95, help="auto-exclude if KS >= threshold")
+    parser.add_argument("--out", default=str(base_dir / "results" / "filtered_metrics.json"))
+    return parser.parse_args()
+
+
+def _is_number(value):
+    """Return True for int/float values; None, bool and strings are rejected."""
+    return isinstance(value, (int, float)) and not isinstance(value, bool)
+
+
+def _filter_features(cont_ks, cont_stats, min_std, ks_threshold):
+    """Split features into kept names, dropped records and averageable KS values.
+
+    A feature is dropped when its std is <= min_std, when its KS is
+    >= ks_threshold, or when its KS is missing/non-finite and therefore
+    cannot take part in the average.
+    """
+    kept = []
+    dropped = []
+    ks_vals = []
+    for feat, ks in cont_ks.items():
+        # A feature may have no summary entry (or a null one); std stays None.
+        std = (cont_stats.get(feat) or {}).get("std")
+        collapsed = _is_number(std) and std <= min_std
+        # A None or NaN KS cannot be averaged; keep it out of ks_vals.
+        ks_usable = _is_number(ks) and math.isfinite(ks)
+        if collapsed or not ks_usable or ks >= ks_threshold:
+            dropped.append({"feature": feat, "ks": ks, "std": std})
+        else:
+            kept.append(feat)
+            ks_vals.append(ks)
+    return kept, dropped, ks_vals
+
+
+def main():
+    """Read eval.json, write the filtered-KS report and print a short summary."""
+    args = parse_args()
+    eval_path = Path(args.eval)
+    if not eval_path.exists():
+        raise SystemExit(f"missing eval.json: {eval_path}")
+    data = json.loads(eval_path.read_text(encoding="utf-8"))
+
+    # "or {}" also covers keys that are present but null.
+    cont_ks = data.get("continuous_ks") or {}
+    cont_stats = data.get("continuous_summary") or {}
+    kept, dropped, ks_vals = _filter_features(
+        cont_ks, cont_stats, args.min_std, args.ks_threshold
+    )
+
+    filtered_avg_ks = sum(ks_vals) / len(ks_vals) if ks_vals else None
+    out = {
+        "filtered_avg_ks": filtered_avg_ks,
+        "kept_features": kept,
+        "dropped_features": dropped,
+        "rules": {
+            "min_std": args.min_std,
+            "ks_threshold": args.ks_threshold,
+        },
+        "original_avg_ks": data.get("avg_ks"),
+    }
+    out_path = Path(args.out)
+    # A custom --out may point into a directory that does not exist yet.
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+    out_path.write_text(json.dumps(out, indent=2), encoding="utf-8")
+    print("filtered_avg_ks", filtered_avg_ks)
+    print("dropped", len(dropped))
+    print("wrote", args.out)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/example/run_all_full.py b/example/run_all_full.py index c04920c..5ede78e 100644 --- a/example/run_all_full.py +++ b/example/run_all_full.py @@ -88,6 +88,7 @@ def main(): else: run([sys.executable, str(base_dir / "evaluate_generated.py")]) run([sys.executable, str(base_dir / "summary_metrics.py")]) + run([sys.executable, str(base_dir / "filtered_metrics.py")]) if not args.skip_diagnose: run( [ diff --git a/report.md b/report.md index d42c10f..5e7b5a7 100644 --- a/report.md +++ b/report.md @@ -94,6 +94,33 @@ residual = x - trend **Two-stage training:** temporal GRU first, diffusion on residuals. +### 4.3 Feature-Type Aware Strategy / 特征类型分治方案 +Based on HAI feature semantics and observed KS outliers, we classify problematic features into six types and plan separate modeling paths: + +1) **Type 1: Exogenous setpoints / demands** (schedule-driven, piecewise-constant) + Examples: P1_B4002, P2_MSD, P4_HT_LD + Strategy: program generator (HSMM / change-point), or sample from program library; condition diffusion on these. + +2) **Type 2: Controller outputs** (policy-like, saturation / rate limits) + Example: P1_B4005 + Strategy: small controller emulator (PID/NARX) with clamp + rate-limit. + +3) **Type 3: Spiky actuators** (few operating points + long dwell) + Examples: P1_PCV02Z, P1_FCV02Z + Strategy: spike-and-slab + dwell-time modeling or command‑driven actuator dynamics. + +4) **Type 4: Quantized / digital-as-continuous** + Examples: P4_ST_PT01, P4_ST_TT01 + Strategy: generate latent continuous then quantize or treat as ordinal discrete diffusion. + +5) **Type 5: Derived conversions** + Examples: *FT* → *FTZ* + Strategy: generate base variable and derive conversions deterministically. 
+ +6) **Type 6: Aux / vibration / narrow-band** + Examples: P2_24Vdc, P2_HILout + Strategy: AR/ARMA or regime‑conditioned narrow-band models. + --- ## 5. Diffusion Formulations / 扩散形式 @@ -201,6 +228,11 @@ Metrics (with reference): - 输出 `example/results/cdf_.svg`(真实 vs 生成 CDF) - 统计生成数据是否堆积在边界(gen_frac_at_min / gen_frac_at_max) +**Filtered KS(剔除难以学习特征,仅用于诊断):** `example/filtered_metrics.py` +- 规则:std 过小或 KS 过高自动剔除 +- 输出 `example/results/filtered_metrics.json` +- 只用于诊断,不作为最终指标 + Recent runs (Windows): - 2026-01-27 21:22:34 — avg_ks 0.4046 / avg_jsd 0.0376 / avg_lag1_diff 0.1449