diff --git a/docs/范文.pdf b/docs/范文.pdf new file mode 100644 index 0000000..7bafb1c Binary files /dev/null and b/docs/范文.pdf differ diff --git a/example/config.json b/example/config.json index 255524b..691ae9d 100644 --- a/example/config.json +++ b/example/config.json @@ -51,9 +51,9 @@ "full_stats": true, "type1_features": ["P1_B4002","P2_MSD","P4_HT_LD","P1_B2004","P1_B3004","P1_B4022","P1_B3005"], "type2_features": ["P1_B4005"], - "type3_features": ["P1_PCV02Z","P1_PCV01Z","P1_PCV01D","P1_FCV02Z"], + "type3_features": ["P1_PCV02Z","P1_PCV01Z","P1_PCV01D","P1_FCV02Z","P1_FCV03D","P1_FCV03Z","P1_LCV01D","P1_LCV01Z"], "type4_features": ["P1_PIT02","P2_SIT02","P1_FT03"], - "type5_features": ["P1_FT03Z"], + "type5_features": ["P1_FT03Z","P1_FT02Z"], "type6_features": ["P4_HT_PO","P2_24Vdc","P2_HILout"], "shuffle_buffer": 256, "use_temporal_stage1": true, diff --git a/example/plot_benchmark.py b/example/plot_benchmark.py index a32dfb1..4baa3b1 100644 --- a/example/plot_benchmark.py +++ b/example/plot_benchmark.py @@ -12,15 +12,37 @@ def parse_args(): base_dir = Path(__file__).resolve().parent parser.add_argument( "--figure", - choices=["panel", "summary"], + choices=["panel", "summary", "ranked_ks", "lines", "cdf_grid", "disc_grid", "disc_points"], default="panel", - help="Figure type: panel (paper-style multi-panel) or summary (seed robustness only).", + help="Figure type: panel (paper-style multi-panel), summary (seed robustness only), ranked_ks (outlier attribution), lines (feature series), or cdf_grid (distributions).", ) parser.add_argument( "--history", default=str(base_dir / "results" / "benchmark_history.csv"), help="Path to benchmark_history.csv", ) + parser.add_argument( + "--generated", + default=str(base_dir / "results" / "generated.csv"), + help="Path to generated.csv (used for per-feature profile in panel A).", + ) + parser.add_argument( + "--cont-stats", + default=str(base_dir / "results" / "cont_stats.json"), + help="Path to cont_stats.json (used for per-feature profile in panel A).", + ) + parser.add_argument( + "--profile-order", + choices=["ks_desc", "name"], + default="ks_desc", + help="Feature ordering for panel A profile: ks_desc or name.", + ) + parser.add_argument( + "--profile-max-features", + type=int, + default=64, + help="Max features in panel A profile (0 means all).", + ) parser.add_argument( "--ks-per-feature", default=str(base_dir / "results" / "ks_per_feature.csv"), @@ -41,6 +63,17 @@ def parse_args(): default=str(base_dir / "results" / "filtered_metrics.json"), help="Path to filtered_metrics.json (optional).", ) + parser.add_argument( + "--ranked-ks", + default=str(base_dir / "results" / "ranked_ks.csv"), + help="Path to ranked_ks.csv (used for --figure ranked_ks).", + ) + parser.add_argument( + "--ranked-ks-top-n", + type=int, + default=30, + help="Number of top KS features to show in ranked_ks figure.", + ) parser.add_argument( "--out", default="", @@ -52,6 +85,73 @@ def parse_args(): default="auto", help="Plotting engine: auto prefers matplotlib if available; otherwise uses pure-SVG.", ) + parser.add_argument( + "--lines-features", + default="", + help="Comma-separated feature names for --figure lines (default: top-4 from ranked_ks.csv or fallback set).", + ) + parser.add_argument( + "--lines-top-k", + type=int, + default=8, + help="When features not specified, take top-K features from ranked_ks.csv.", + ) + parser.add_argument( + "--lines-max-rows", + type=int, + default=1000, + help="Max rows to read from generated.csv for --figure lines.", + ) + 
parser.add_argument( + "--lines-normalize", + choices=["none", "real_range"], + default="none", + help="Normalization for --figure lines: none or real_range (use cont_stats min/max).", + ) + parser.add_argument( + "--reference", + default=str(base_dir / "config.json"), + help="Reference source for real data: config.json with data_glob or direct CSV/GZ path.", + ) + parser.add_argument( + "--lines-ref-index", + type=int, + default=0, + help="Index of matched reference file to plot (0-based) when using a glob.", + ) + parser.add_argument( + "--cdf-features", + default="", + help="Comma-separated features for cdf_grid; empty = all continuous features from cont_stats.", + ) + parser.add_argument( + "--cdf-max-features", + type=int, + default=64, + help="Max features to include in cdf_grid.", + ) + parser.add_argument( + "--cdf-bins", + type=int, + default=80, + help="Number of bins for empirical CDF.", + ) + parser.add_argument( + "--disc-features", + default="", + help="Comma-separated features for disc_grid; empty = use discrete list from feature_split.json.", + ) + parser.add_argument( + "--disc-max-features", + type=int, + default=64, + help="Max features to include in disc_grid.", + ) + parser.add_argument( + "--feature-split", + default=str(base_dir / "feature_split.json"), + help="Path to feature_split.json with 'continuous' and 'discrete' lists.", + ) return parser.parse_args() @@ -181,6 +281,109 @@ def plot_matplotlib(rows, seeds, metrics, out_path): fig.savefig(out_path, format="svg") plt.close(fig) +def discrete_points_matplotlib(generated_csv_path, reference_arg, features, out_path, max_rows=5000): + import matplotlib.pyplot as plt + import numpy as np + import gzip + try: + plt.style.use("seaborn-v0_8-whitegrid") + except Exception: + pass + def resolve_reference_glob(ref_arg: str) -> str: + ref_path = Path(ref_arg) + if ref_path.suffix == ".json": + cfg = json.loads(ref_path.read_text(encoding="utf-8")) + data_glob = cfg.get("data_glob") or cfg.get("data_path") or "" + if not data_glob: + raise SystemExit("reference config has no data_glob/data_path") + combined = ref_path.parent / data_glob + if "*" in str(combined) or "?" 
in str(combined): + return str(combined) + return str(combined.resolve()) + return str(ref_path) + def read_rows(path, limit): + rows = [] + opener = gzip.open if str(path).endswith(".gz") else open + with opener(path, "rt", newline="") as fh: + reader = csv.DictReader(fh) + for i, r in enumerate(reader): + rows.append(r) + if limit > 0 and i + 1 >= limit: + break + return rows + def cats(rows, feat): + vs = [] + for r in rows: + v = r.get(feat) + if v is None: + continue + s = str(v).strip() + if s == "" or s.lower() == "nan": + continue + vs.append(s) + return vs + ref_glob = resolve_reference_glob(reference_arg) + ref_paths = sorted(Path(ref_glob).parent.glob(Path(ref_glob).name)) + gen_rows = read_rows(generated_csv_path, max_rows) + ref_rows = read_rows(ref_paths[0] if ref_paths else generated_csv_path, max_rows) + points = [] + group_spans = [] + x = 0 + for feat in features: + gvs = cats(gen_rows, feat) + rvs = cats(ref_rows, feat) + cats_all = sorted(set(gvs) | set(rvs)) + start_x = x + for c in cats_all: + g_count = sum(1 for v in gvs if v == c) + r_count = sum(1 for v in rvs if v == c) + g_total = len(gvs) or 1 + r_total = len(rvs) or 1 + g_p = g_count / g_total + r_p = r_count / r_total + points.append({"x": x, "feat": feat, "cat": c, "g": g_p, "r": r_p}) + x += 1 + end_x = x - 1 + if end_x >= start_x: + group_spans.append({"feat": feat, "x0": start_x, "x1": end_x}) + x += 1 + if not points: + fig, ax = plt.subplots(figsize=(9, 3)) + ax.axis("off") + ax.text(0.5, 0.5, "no discrete data", ha="center", va="center") + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, format="svg") + plt.close(fig) + return + width = max(9.0, min(18.0, 0.08 * len(points))) + fig, ax = plt.subplots(figsize=(width, 6.2)) + xs = np.array([p["x"] for p in points], dtype=float) + jitter = 0.18 + ax.scatter(xs - jitter, [p["g"] for p in points], s=26, color="#2563eb", alpha=0.85, edgecolors="white", linewidths=0.6, label="generated") + ax.scatter(xs + jitter, [p["r"] for p in points], s=26, color="#ef4444", alpha=0.75, edgecolors="white", linewidths=0.6, label="real") + for span in group_spans: + xc = (span["x0"] + span["x1"]) / 2.0 + ax.axvline(span["x1"] + 0.5, color="#e5e7eb", lw=1.0) + ax.text(xc, -0.06, span["feat"], ha="center", va="top", rotation=25, fontsize=8, color="#374151") + xg_all = xs - jitter + yg_all = [p["g"] for p in points] + xr_all = xs + jitter + yr_all = [p["r"] for p in points] + ax.plot(xg_all, yg_all, color="#2563eb", linewidth=1.2, alpha=0.85) + ax.plot(xr_all, yr_all, color="#ef4444", linewidth=1.2, alpha=0.75) + ax.fill_between(xg_all, yg_all, 0.0, color="#2563eb", alpha=0.14, step=None) + ax.fill_between(xr_all, yr_all, 0.0, color="#ef4444", alpha=0.14, step=None) + ax.set_ylim(-0.08, 1.02) + ax.set_xlim(-0.5, max(xs) + 0.5) + ax.set_ylabel("probability", fontsize=10) + ax.set_xticks([]) + ax.grid(True, axis="y", color="#e5e7eb") + ax.legend(loc="upper right", fontsize=9) + fig.suptitle("Discrete Marginals (dot plot): generated vs real", fontsize=12, color="#111827", y=0.98) + fig.tight_layout(rect=(0, 0, 1, 0.96)) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, format="svg") + plt.close(fig) def plot_svg(rows, seeds, metrics, out_path): W, H = 980, 440 @@ -305,6 +508,489 @@ def plot_svg(rows, seeds, metrics, out_path): out_path.write_text("\n".join(parts), encoding="utf-8") +def ranked_ks_svg(ranked_rows, out_path, top_n=30): + parsed = [] + for r in ranked_rows or []: + feat = (r.get("feature") or "").strip() + if not 
feat: + continue + ks = parse_float(r.get("ks")) + if ks is None: + continue + rank = parse_float(r.get("rank")) + contrib = parse_float(r.get("contribution_to_avg")) + avg_if = parse_float(r.get("avg_ks_if_remove_top_n")) + parsed.append( + { + "rank": int(rank) if isinstance(rank, (int, float)) else None, + "feature": feat, + "ks": float(ks), + "contrib": float(contrib) if isinstance(contrib, (int, float)) else None, + "avg_if": float(avg_if) if isinstance(avg_if, (int, float)) else None, + } + ) + + if not parsed: + raise SystemExit("no valid rows in ranked_ks.csv") + + parsed_by_ks = sorted(parsed, key=lambda x: x["ks"], reverse=True) + top_n = max(1, int(top_n)) + top = parsed_by_ks[: min(top_n, len(parsed_by_ks))] + parsed_by_rank = sorted( + [p for p in parsed if isinstance(p.get("rank"), int) and p.get("avg_if") is not None], + key=lambda x: x["rank"], + ) + + baseline = None + if parsed_by_rank: + first = parsed_by_rank[0] + if first.get("avg_if") is not None and first.get("contrib") is not None: + baseline = first["avg_if"] + first["contrib"] + elif first.get("avg_if") is not None: + baseline = first["avg_if"] + if baseline is None: + baseline = sum(p["ks"] for p in parsed_by_ks) / len(parsed_by_ks) + + xs = [0] + ys = [baseline] + for p in parsed_by_rank: + xs.append(p["rank"]) + ys.append(p["avg_if"]) + + W, H = 980, 560 + bg = "#ffffff" + ink = "#0f172a" + subtle = "#64748b" + grid = "#e2e8f0" + border = "#cbd5e1" + blue = "#2563eb" + bar = "#0ea5e9" + + ff = "system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif" + + def text(x, y, s, size=12, anchor="start", color=ink, weight="normal"): + return "{t}".format( + x=x, + y=y, + a=anchor, + ff=ff, + fs=size, + c=color, + w=weight, + t=svg_escape(s), + ) + + def line(x1, y1, x2, y2, color=grid, width=1.0, dash=None, opacity=1.0, cap="round"): + extra = "" + if dash: + extra += " stroke-dasharray='{d}'".format(d=dash) + if opacity != 1.0: + extra += " stroke-opacity='{o}'".format(o=opacity) + return "".format( + x1=x1, + y1=y1, + x2=x2, + y2=y2, + c=color, + w=width, + cap=cap, + extra=extra, + ) + + pad = 26 + title_h = 62 + gap = 18 + card_w = (W - 2 * pad - gap) / 2 + card_h = H - pad - title_h + xA = pad + xB = pad + card_w + gap + y0 = title_h + + parts = [] + parts.append("".format(w=W, h=H)) + parts.append("".format(w=W, h=H, bg=bg)) + parts.append(text(W / 2, 28, "KS Outlier Attribution", size=16, anchor="middle", color=ink, weight="bold")) + parts.append( + text( + W / 2, + 48, + "Left: top-K features by KS · Right: avg KS after removing top-n outliers", + size=11, + anchor="middle", + color=subtle, + weight="normal", + ) + ) + + parts.append( + "".format( + x=xA, y=y0, w=card_w, h=card_h, b=border + ) + ) + parts.append( + "".format( + x=xB, y=y0, w=card_w, h=card_h, b=border + ) + ) + + parts.append(text(xA + 18, y0 + 28, "A Top-{k} KS features".format(k=len(top)), size=12, color=ink, weight="bold")) + parts.append(text(xB + 18, y0 + 28, "B Removing worst features", size=12, color=ink, weight="bold")) + + ax_y0 = y0 + 52 + ax_h = card_h - 76 + + label_w = 165 + ax_x0 = xA + 18 + label_w + ax_x1 = xA + card_w - 18 + row_h = ax_h / max(1, len(top)) + + for t in range(6): + xx = ax_x0 + (ax_x1 - ax_x0) * (t / 5) + parts.append(line(xx, ax_y0, xx, ax_y0 + ax_h, color=grid, width=1.0)) + parts.append(text(xx, ax_y0 + ax_h + 20, "{:.1f}".format(t / 5), size=9, anchor="middle", color=subtle)) + + parts.append(line(ax_x0, ax_y0 + ax_h, ax_x1, ax_y0 + ax_h, color=border, width=1.2, cap="butt")) + 
parts.append(text(ax_x0, ax_y0 + ax_h + 38, "KS", size=10, anchor="start", color=subtle, weight="bold")) + + for i, p in enumerate(top): + cy = ax_y0 + i * row_h + row_h / 2 + parts.append(text(ax_x0 - 10, cy + 4, p["feature"], size=9, anchor="end", color=ink)) + w = (ax_x1 - ax_x0) * clamp(p["ks"], 0.0, 1.0) + parts.append( + "".format( + x=ax_x0, y=cy - row_h * 0.34, w=w, h=row_h * 0.68, c=bar + ) + ) + parts.append(text(ax_x1, cy + 4, "{:.3f}".format(p["ks"]), size=9, anchor="end", color=subtle)) + if p.get("contrib") is not None: + parts.append(text(ax_x1 + 10, cy + 4, "{:.2f}%".format(100.0 * p["contrib"]), size=9, anchor="start", color=subtle)) + + px0 = xB + 54 + py0 = y0 + 70 + pw = card_w - 78 + ph = card_h - 108 + parts.append(text(px0, py0 - 20, "avg KS (lower is better)", size=10, anchor="start", color=subtle, weight="bold")) + + xmax = max(xs) if xs else 1 + ymin = min(ys) if ys else 0.0 + ymax = max(ys) if ys else 1.0 + if ymax == ymin: + ymax = ymin + 1.0 + yr = ymax - ymin + ymin -= 0.12 * yr + ymax += 0.12 * yr + + def X(v): + return px0 + (v / max(1e-9, xmax)) * pw + + def Y(v): + return py0 + ph - ((v - ymin) / (ymax - ymin)) * ph + + for k in range(6): + yy = py0 + (ph * k / 5) + parts.append(line(px0, yy, px0 + pw, yy, color=grid, width=1.0)) + val = ymax - (ymax - ymin) * (k / 5) + parts.append(text(px0 - 8, yy + 4, "{:.3f}".format(val), size=9, anchor="end", color=subtle)) + + for k in range(6): + xx = px0 + (pw * k / 5) + parts.append(line(xx, py0, xx, py0 + ph, color=grid, width=1.0)) + val = int(round(xmax * (k / 5))) + parts.append(text(xx, py0 + ph + 22, str(val), size=9, anchor="middle", color=subtle)) + parts.append(text(px0 + pw / 2, py0 + ph + 42, "remove top-n features", size=10, anchor="middle", color=subtle, weight="bold")) + + d = [] + for x, y in zip(xs, ys, strict=True): + d.append(("M" if not d else "L") + " {x:.1f} {y:.1f}".format(x=X(x), y=Y(y))) + parts.append("".format(d=" ".join(d), c=blue)) + + for x, y in zip(xs, ys, strict=True): + parts.append("".format(x=X(x), y=Y(y), c=blue)) + + parts.append("") + out_path.parent.mkdir(parents=True, exist_ok=True) + out_path.write_text("\n".join(parts), encoding="utf-8") + + +def lines_matplotlib(generated_csv_path, cont_stats, features, out_path, max_rows=1000, normalize="none", reference_arg="", ref_index=0): + import matplotlib.pyplot as plt + import gzip + try: + plt.style.use("seaborn-v0_8-whitegrid") + except Exception: + pass + + rows_gen = [] + with Path(generated_csv_path).open("r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + for i, r in enumerate(reader): + rows_gen.append(r) + if len(rows_gen) >= max_rows: + break + + xs_gen = [parse_float(r.get("time")) or (i if (r.get("time") is None) else 0.0) for i, r in enumerate(rows_gen)] + + mins = {} + maxs = {} + if isinstance(cont_stats, dict): + mins = cont_stats.get("min", {}) or {} + maxs = cont_stats.get("max", {}) or {} + + def norm_val(feat, v): + if normalize != "real_range": + return v + mn = parse_float(mins.get(feat)) + mx = parse_float(maxs.get(feat)) + if mn is None or mx is None: + return v + denom = mx - mn + if denom == 0: + return v + return (v - mn) / denom + + def resolve_reference_glob(ref_arg: str) -> str: + ref_path = Path(ref_arg) + if ref_path.suffix == ".json": + cfg = json.loads(ref_path.read_text(encoding="utf-8")) + data_glob = cfg.get("data_glob") or cfg.get("data_path") or "" + if not data_glob: + raise SystemExit("reference config has no data_glob/data_path") + combined = ref_path.parent / data_glob 
+ if "*" in str(combined) or "?" in str(combined): + return str(combined) + return str(combined.resolve()) + return str(ref_path) + + def read_series(path: Path, cols, max_rows: int): + vals = {c: [] for c in cols} + opener = gzip.open if str(path).endswith(".gz") else open + with opener(path, "rt", newline="") as fh: + reader = csv.DictReader(fh) + for i, row in enumerate(reader): + for c in cols: + try: + vals[c].append(float(row[c])) + except Exception: + pass + if max_rows > 0 and i + 1 >= max_rows: + break + return vals + + ref_glob = resolve_reference_glob(reference_arg or str(Path(__file__).resolve().parent / "config.json")) + ref_paths = sorted(Path(ref_glob).parent.glob(Path(ref_glob).name)) + ref_rows = [] + if ref_paths: + idx = max(0, min(ref_index, len(ref_paths) - 1)) + first = ref_paths[idx] + opener = gzip.open if str(first).endswith(".gz") else open + with opener(first, "rt", newline="") as fh: + reader = csv.DictReader(fh) + for i, r in enumerate(reader): + ref_rows.append(r) + if len(ref_rows) >= max_rows: + break + xs_ref = [i for i, _ in enumerate(ref_rows)] + + fig, axes = plt.subplots(nrows=len(features), ncols=1, figsize=(9.2, 6.6), sharex=True) + if len(features) == 1: + axes = [axes] + + for ax, feat in zip(axes, features, strict=True): + ys_gen = [norm_val(feat, parse_float(r.get(feat)) or 0.0) for r in rows_gen] + ax.plot(xs_gen, ys_gen, color="#2563eb", linewidth=1.6, label="generated") + if ref_rows: + ys_ref = [norm_val(feat, parse_float(r.get(feat)) or 0.0) for r in ref_rows] + ax.plot(xs_ref, ys_ref, color="#ef4444", linewidth=1.2, alpha=0.75, label="real") + ax.set_ylabel(feat, fontsize=10) + ax.grid(True, color="#e5e7eb") + ax.legend(loc="upper right", fontsize=8) + + axes[-1].set_xlabel("time", fontsize=10) + fig.suptitle("Feature Series: generated vs real", fontsize=12, color="#111827", y=0.98) + fig.tight_layout(rect=(0, 0, 1, 0.96)) + + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, format="svg") + plt.close(fig) + +def cdf_grid_matplotlib(generated_csv_path, reference_arg, cont_stats, features, out_path, max_rows=5000, bins=80): + import matplotlib.pyplot as plt + import gzip + import numpy as np + try: + plt.style.use("seaborn-v0_8-whitegrid") + except Exception: + pass + mins = cont_stats.get("min", {}) if isinstance(cont_stats, dict) else {} + maxs = cont_stats.get("max", {}) if isinstance(cont_stats, dict) else {} + def resolve_reference_glob(ref_arg: str) -> str: + ref_path = Path(ref_arg) + if ref_path.suffix == ".json": + cfg = json.loads(ref_path.read_text(encoding="utf-8")) + data_glob = cfg.get("data_glob") or cfg.get("data_path") or "" + if not data_glob: + raise SystemExit("reference config has no data_glob/data_path") + combined = ref_path.parent / data_glob + if "*" in str(combined) or "?" 
in str(combined): + return str(combined) + return str(combined.resolve()) + return str(ref_path) + def read_rows(path, limit): + rows = [] + opener = gzip.open if str(path).endswith(".gz") else open + with opener(path, "rt", newline="") as fh: + reader = csv.DictReader(fh) + for i, r in enumerate(reader): + rows.append(r) + if limit > 0 and i + 1 >= limit: + break + return rows + gen_rows = read_rows(generated_csv_path, max_rows) + ref_glob = resolve_reference_glob(reference_arg) + ref_paths = sorted(Path(ref_glob).parent.glob(Path(ref_glob).name)) + ref_rows = read_rows(ref_paths[0] if ref_paths else generated_csv_path, max_rows) + def values(rows, feat): + vs = [] + for r in rows: + x = parse_float(r.get(feat)) + if x is not None: + vs.append(x) + return vs + cols = 4 + rows_n = int(math.ceil(len(features) / cols)) if features else 1 + fig, axes = plt.subplots(nrows=rows_n, ncols=cols, figsize=(cols * 3.2, rows_n * 2.6)) + axes = np.array(axes).reshape(rows_n, cols) + for i, feat in enumerate(features): + rr = i // cols + cc = i % cols + ax = axes[rr][cc] + gvs = values(gen_rows, feat) + rvs = values(ref_rows, feat) + mn = parse_float(mins.get(feat)) + mx = parse_float(maxs.get(feat)) + if mn is None or mx is None or mx <= mn: + lo = min(gvs + rvs) if (gvs or rvs) else 0.0 + hi = max(gvs + rvs) if (gvs or rvs) else (lo + 1.0) + else: + lo = mn + hi = mx + edges = np.linspace(lo, hi, bins + 1) + def ecdf(vs): + if not vs: + return edges[1:], np.zeros_like(edges[1:]) + hist, _ = np.histogram(vs, bins=edges) + cdf = np.cumsum(hist).astype(float) + cdf /= cdf[-1] if cdf[-1] > 0 else 1.0 + xs = edges[1:] + return xs, cdf + xg, yg = ecdf(gvs) + xr, yr = ecdf(rvs) + ax.plot(xg, yg, color="#2563eb", linewidth=1.6, label="generated") + ax.plot(xr, yr, color="#ef4444", linewidth=1.2, alpha=0.85, label="real") + ax.set_title(feat, fontsize=9, loc="left") + ax.set_ylim(0, 1) + ax.grid(True, color="#e5e7eb") + for j in range(i + 1, rows_n * cols): + rr = j // cols + cc = j % cols + axes[rr][cc].axis("off") + handles, labels = axes[0][0].get_legend_handles_labels() + fig.legend(handles, labels, loc="upper center", ncol=2, fontsize=9) + fig.suptitle("Empirical CDF: generated vs real", fontsize=12, color="#111827", y=0.98) + fig.tight_layout(rect=(0, 0, 1, 0.96)) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, format="svg") + plt.close(fig) + +def discrete_grid_matplotlib(generated_csv_path, reference_arg, features, out_path, max_rows=5000): + import matplotlib.pyplot as plt + import numpy as np + import gzip + try: + plt.style.use("seaborn-v0_8-whitegrid") + except Exception: + pass + def resolve_reference_glob(ref_arg: str) -> str: + ref_path = Path(ref_arg) + if ref_path.suffix == ".json": + cfg = json.loads(ref_path.read_text(encoding="utf-8")) + data_glob = cfg.get("data_glob") or cfg.get("data_path") or "" + if not data_glob: + raise SystemExit("reference config has no data_glob/data_path") + combined = ref_path.parent / data_glob + if "*" in str(combined) or "?" 
in str(combined): + return str(combined) + return str(combined.resolve()) + return str(ref_path) + def read_rows(path, limit): + rows = [] + opener = gzip.open if str(path).endswith(".gz") else open + with opener(path, "rt", newline="") as fh: + reader = csv.DictReader(fh) + for i, r in enumerate(reader): + rows.append(r) + if limit > 0 and i + 1 >= limit: + break + return rows + ref_glob = resolve_reference_glob(reference_arg) + ref_paths = sorted(Path(ref_glob).parent.glob(Path(ref_glob).name)) + gen_rows = read_rows(generated_csv_path, max_rows) + ref_rows = read_rows(ref_paths[0] if ref_paths else generated_csv_path, max_rows) + def cats(rows, feat): + vs = [] + for r in rows: + v = r.get(feat) + if v is None: + continue + s = str(v).strip() + if s == "" or s.lower() == "nan": + continue + vs.append(s) + return vs + cols = 4 + rows_n = int(math.ceil(len(features) / cols)) if features else 1 + fig, axes = plt.subplots(nrows=rows_n, ncols=cols, figsize=(cols * 3.0, rows_n * 2.6), sharey=False) + axes = np.array(axes).reshape(rows_n, cols) + for i, feat in enumerate(features): + rr = i // cols + cc = i % cols + ax = axes[rr][cc] + gvs = cats(gen_rows, feat) + rvs = cats(ref_rows, feat) + all_vals = sorted(set(gvs) | set(rvs)) + if not all_vals: + ax.axis("off") + continue + g_counts = {v: 0 for v in all_vals} + r_counts = {v: 0 for v in all_vals} + for v in gvs: + g_counts[v] += 1 + for v in rvs: + r_counts[v] += 1 + g_total = sum(g_counts.values()) or 1 + r_total = sum(r_counts.values()) or 1 + g_p = [g_counts[v] / g_total for v in all_vals] + r_p = [r_counts[v] / r_total for v in all_vals] + x = np.arange(len(all_vals)) + w = 0.42 + ax.bar(x - w / 2, g_p, width=w, color="#2563eb", alpha=0.85, label="generated") + ax.bar(x + w / 2, r_p, width=w, color="#ef4444", alpha=0.75, label="real") + ax.set_title(feat, fontsize=9, loc="left") + ax.set_xticks(x) + ax.set_xticklabels(all_vals, rotation=25, ha="right", fontsize=8) + ax.set_ylim(0, 1) + ax.grid(True, axis="y", color="#e5e7eb") + for j in range(i + 1, rows_n * cols): + rr = j // cols + cc = j % cols + axes[rr][cc].axis("off") + handles, labels = axes[0][0].get_legend_handles_labels() + fig.legend(handles, labels, loc="upper center", ncol=2, fontsize=9) + fig.suptitle("Discrete Marginals: generated vs real", fontsize=12, color="#111827", y=0.98) + fig.tight_layout(rect=(0, 0, 1, 0.96)) + out_path.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(out_path, format="svg") + plt.close(fig) def read_csv_rows(path): p = Path(path) if not p.exists(): @@ -330,6 +1016,126 @@ def parse_float(s): return float(ss) +def compute_csv_means(path): + p = Path(path) + if not p.exists(): + return {} + with p.open("r", encoding="utf-8", newline="") as f: + reader = csv.DictReader(f) + sums = {} + counts = {} + for r in reader: + for k, v in r.items(): + x = parse_float(v) + if x is None: + continue + sums[k] = sums.get(k, 0.0) + x + counts[k] = counts.get(k, 0) + 1 + means = {} + for k, s in sums.items(): + c = counts.get(k, 0) + if c > 0: + means[k] = s / c + return means + + +def build_mean_profile(ks_rows, cont_stats, generated_csv_path, order="ks_desc", max_features=64): + if not isinstance(cont_stats, dict): + return None + means_real = cont_stats.get("mean") + if not isinstance(means_real, dict): + return None + + means_gen = compute_csv_means(generated_csv_path) + if not means_gen: + return None + + ks_by_feat = {} + mins = {} + maxs = {} + for r in ks_rows or []: + feat = (r.get("feature") or "").strip() + if not feat: + continue + ks = 
parse_float(r.get("ks")) + mn = parse_float(r.get("real_min")) + mx = parse_float(r.get("real_max")) + if ks is not None: + ks_by_feat[feat] = ks + if mn is not None: + mins[feat] = mn + if mx is not None: + maxs[feat] = mx + + feats = [] + real_vals = [] + gen_vals = [] + ks_vals = [] + for feat, mu_real in means_real.items(): + if feat not in means_gen: + continue + if feat not in mins or feat not in maxs: + continue + mn = mins[feat] + mx = maxs[feat] + denom = mx - mn + if denom == 0: + continue + mu_gen = means_gen[feat] + feats.append(feat) + real_vals.append(clamp((mu_real - mn) / denom, 0.0, 1.0)) + gen_vals.append(clamp((mu_gen - mn) / denom, 0.0, 1.0)) + ks_vals.append(ks_by_feat.get(feat)) + + if not feats: + return None + + idx = list(range(len(feats))) + if order == "name": + idx.sort(key=lambda i: feats[i]) + else: + idx.sort(key=lambda i: ks_by_feat.get(feats[i], -1.0), reverse=True) + + if isinstance(max_features, int) and max_features > 0 and len(idx) > max_features: + idx = idx[:max_features] + + sel_feats = [feats[i] for i in idx] + sel_real = [real_vals[i] for i in idx] + sel_gen = [gen_vals[i] for i in idx] + sel_ks = [ks_vals[i] for i in idx] + sel_diff = [abs(a - b) for a, b in zip(sel_real, sel_gen, strict=True)] + + def pearsonr(x, y): + if not x or len(x) != len(y): + return None + n = len(x) + mx = sum(x) / n + my = sum(y) / n + vx = sum((xi - mx) ** 2 for xi in x) + vy = sum((yi - my) ** 2 for yi in y) + if vx <= 0 or vy <= 0: + return None + cov = sum((xi - mx) * (yi - my) for xi, yi in zip(x, y, strict=True)) + return cov / math.sqrt(vx * vy) + + r = pearsonr(sel_real, sel_gen) + mae = (sum(sel_diff) / len(sel_diff)) if sel_diff else None + + points = [] + for f, xr, yg, ks, d in zip(sel_feats, sel_real, sel_gen, sel_ks, sel_diff, strict=True): + points.append({"feature": f, "x": xr, "y": yg, "ks": ks, "diff": d}) + + return { + "features": sel_feats, + "real": sel_real, + "gen": sel_gen, + "ks": sel_ks, + "diff": sel_diff, + "points": points, + "stats": {"n": len(sel_feats), "r": r, "mae": mae}, + } + + def zscores(vals): if not vals: return [] @@ -341,7 +1147,7 @@ def zscores(vals): return [(x - m) / s for x in vals] -def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_path): +def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, profile, out_path): W, H = 1400, 900 margin = 42 gap = 26 @@ -357,15 +1163,36 @@ def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_pat blue = "#3b82f6" red = "#ef4444" green = "#10b981" + card_bg = "#ffffff" + card_shadow = "#0f172a" + plot_bg = "#f8fafc" def panel_rect(x, y): - return "".format( - x=x, y=y, w=panel_w, h=panel_h, b=border + return ( + "" + "" + ).format( + x=x + 2.0, + y=y + 3.0, + x0=x, + y0=y, + w=panel_w, + h=panel_h, + f=card_bg, + b=border, + s=card_shadow, ) def text(x, y, s, size=12, anchor="start", color=ink, weight="normal"): - return "{t}".format( - x=x, y=y, a=anchor, fs=size, c=color, w=weight, t=svg_escape(s) + return "{t}".format( + x=x, + y=y, + a=anchor, + ff="system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif", + fs=size, + c=color, + w=weight, + t=svg_escape(s), ) def line(x1, y1, x2, y2, color=border, width=1.0, dash=None, opacity=1.0, cap="round"): @@ -403,13 +1230,20 @@ def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_pat "".format(w=W, h=H) ) parts.append("".format(w=W, h=H, bg=bg)) + parts.append( + "" + "" + "" + "" + "" + ) parts.append(text(W / 2, 32, "Benchmark Overview 
(HAI Security Dataset)", size=18, anchor="middle", weight="bold")) parts.append( text( W / 2, 54, - "A: workflow · B: per-feature KS · C: train-file mean shift · D: seed robustness and metric history", + "A: per-feature mean profile · B: per-feature KS · C: train-file mean shift · D: seed robustness and metric history", size=11, anchor="middle", color=subtle, @@ -430,7 +1264,7 @@ def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_pat parts.append(text(x + 18, y + 28, letter, size=16, weight="bold")) parts.append(text(x + 44, y + 28, title, size=14, weight="bold")) - panel_label(xA, yA, "A", "Typed Hybrid Generation") + panel_label(xA, yA, "A", "Feature-wise Similarity Profile") panel_label(xB, yB, "B", "Feature-Level Distribution Fidelity") panel_label(xC, yC, "C", "Dataset Shift Across Training Files") panel_label(xD, yD, "D", "Robustness Across Seeds") @@ -440,47 +1274,249 @@ def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_pat aw0 = panel_w - 44 ah0 = panel_h - 78 - box_h = 56 - box_w = (aw0 - 3 * 26) / 4 - by = ay0 + (ah0 - box_h) / 2 - 14 - boxes = [ - ("HAI windows\n(L=96)", "#f8fafc"), - ("Typed\ndecomposition", "#f8fafc"), - ("Hybrid\ngenerator", "#f8fafc"), - ("Synthetic\nwindows", "#f8fafc"), - ] - for i, (lbl, fill) in enumerate(boxes): - bx = ax0 + i * (box_w + 26) - parts.append(round_box(bx, by, box_w, box_h, fill=fill)) - for j, line_txt in enumerate(lbl.split("\n")): - parts.append(text(bx + box_w / 2, by + 22 + j * 16, line_txt, size=11, anchor="middle", weight="bold" if j == 0 else "normal")) - if i < len(boxes) - 1: - parts.append(arrow(bx + box_w, by + box_h / 2, bx + box_w + 26, by + box_h / 2, color=subtle, width=1.6)) + chart_pad_l = 54 + chart_pad_r = 14 + chart_pad_t = 24 + chart_pad_b = 52 + cx0 = ax0 + chart_pad_l + cx1 = ax0 + aw0 - chart_pad_r + cy0 = ay0 + chart_pad_t + cy1 = ay0 + ah0 - chart_pad_b + plot_side = min(cx1 - cx0, cy1 - cy0) + cx1p = cx0 + plot_side - hx = ax0 + 2 * (box_w + 26) - hy = by + box_h + 18 - parts.append(text(hx + 6, hy - 6, "Type-aware routes", size=10, color=subtle, weight="bold")) + parts.append(text(ax0, ay0 + 6, "Mean agreement (continuous features)", size=11, color=subtle, weight="bold")) + parts.append(text(ax0 + aw0, ay0 + 6, "range-normalized by real min/max", size=10, anchor="end", color=subtle)) - inner_w = box_w - inner_gap = 10 - inner_h = 32 - inner_y = hy - inner_colors = [("#e0f2fe", blue, "Trend (det.)"), ("#fee2e2", red, "Residual (DDPM)"), ("#dcfce7", green, "Discrete head")] - for k, (fill, stroke, name) in enumerate(inner_colors): - iy = inner_y + k * (inner_h + inner_gap) - parts.append(round_box(hx, iy, inner_w, inner_h, fill=fill, stroke=stroke, sw=1.4, rx=10)) - parts.append(text(hx + 10, iy + 20, name, size=10, color=ink, weight="bold")) - parts.append(arrow(hx + inner_w / 2, by + box_h, hx + inner_w / 2, inner_y, color=subtle, width=1.4)) + parts.append(round_box(cx0 - 10, cy0 - 10, (cx1p - cx0) + 20, (cy1 - cy0) + 20, fill=plot_bg, stroke=border, sw=1.0, rx=14)) + parts.append(line(cx0, cy0, cx0, cy1, color=border, width=1.0, cap="butt")) + parts.append(line(cx0, cy1, cx1p, cy1, color=border, width=1.0, cap="butt")) + for t, lbl in [(0.0, "0.0"), (0.5, "0.5"), (1.0, "1.0")]: + xx = cx0 + t * (cx1p - cx0) + yy = cy1 - t * (cy1 - cy0) + parts.append(line(xx, cy0, xx, cy1, color=grid, width=1.0, dash="4,6")) + parts.append(line(cx0, yy, cx1p, yy, color=grid, width=1.0, dash="4,6")) + parts.append(text(cx0 - 8, yy + 4, lbl, size=9, anchor="end", 
color=subtle)) + parts.append(text(xx, cy1 + 22, lbl, size=9, anchor="middle", color=subtle)) parts.append( - text( - xA + 22, - yA + panel_h - 18, - "Separation aligns metrics with data types: KS (continuous), JSD (discrete), lag-1 (temporal).", - size=10, - color=subtle, + "" + "Generated mean" + "".format( + x=ax0 + 10, + y=(cy0 + cy1) / 2, + ff="system-ui, -apple-system, Segoe UI, Roboto, Helvetica, Arial, sans-serif", + c=subtle, ) ) + parts.append(text((cx0 + cx1p) / 2, cy1 + 48, "Real mean", size=10, anchor="middle", color=subtle, weight="bold")) + + legend_x = cx1p + 22 + legend_y = cy0 + 6 + legend_w = 226 + legend_h = 74 + parts.append(round_box(legend_x - 10, legend_y - 14, legend_w, legend_h, fill="#ffffff", stroke=border, sw=1.0, rx=12)) + parts.append( + "".format( + x=legend_x + 4, y=legend_y + 4, c=blue + ) + ) + parts.append(text(legend_x + 16, legend_y + 8, "KDE density", size=10, color=ink, weight="bold")) + parts.append( + "".format( + x1=legend_x + 2, x2=legend_x + 18, y=legend_y + 26, c="#1d4ed8" + ) + ) + parts.append(text(legend_x + 24, legend_y + 30, "Contours", size=10, color=ink, weight="bold")) + parts.append( + "".format( + x1=legend_x + 2, x2=legend_x + 18, y=legend_y + 48, c="#94a3b8" + ) + ) + parts.append(text(legend_x + 24, legend_y + 52, "y=x", size=10, color=ink, weight="bold")) + + if isinstance(profile, dict) and profile.get("points"): + pts = profile["points"] + stats = profile.get("stats") if isinstance(profile.get("stats"), dict) else {} + n = stats.get("n") + r = stats.get("r") + mae = stats.get("mae") + + pad_px = 2.5 + eps = pad_px / max(1e-9, (cx1p - cx0)) + + def X(v): + return cx0 + clamp(v, eps, 1.0 - eps) * (cx1p - cx0) + + def Y(v): + return cy1 - clamp(v, eps, 1.0 - eps) * (cy1 - cy0) + + parts.append( + "".format( + x=cx0, + y=cy0, + w=(cx1p - cx0), + h=(cy1 - cy0), + ) + ) + + parts.append("") + + parts.append("") + for p in pts: + x = X(p["x"]) + y = Y(p["y"]) + parts.append("".format(x=x, y=y, c=blue)) + parts.append("") + + parts.append(line(cx0, cy1, cx1p, cy0, color="#94a3b8", width=2.0, dash="6,6", opacity=0.9)) + + out_idx = sorted(range(len(pts)), key=lambda i: (pts[i].get("diff") or 0.0), reverse=True)[:5] + + grid_n = 64 + bw = 0.085 + bw2 = bw * bw + kde = [[0.0 for _ in range(grid_n + 1)] for _ in range(grid_n + 1)] + for iy in range(grid_n + 1): + yy = iy / grid_n + for ix in range(grid_n + 1): + xx = ix / grid_n + s = 0.0 + for p in pts: + dx = xx - clamp(p["x"], 0.0, 1.0) + dy = yy - clamp(p["y"], 0.0, 1.0) + s += math.exp(-(dx * dx + dy * dy) / (2.0 * bw2)) + kde[iy][ix] = s + vmax = max(max(row) for row in kde) if kde else 1.0 + if vmax <= 0: + vmax = 1.0 + for iy in range(grid_n + 1): + for ix in range(grid_n + 1): + kde[iy][ix] /= vmax + + segments_by_level = { + 1: [(3, 0)], + 2: [(0, 1)], + 3: [(3, 1)], + 4: [(1, 2)], + 5: [(3, 2), (0, 1)], + 6: [(0, 2)], + 7: [(3, 2)], + 8: [(2, 3)], + 9: [(0, 2)], + 10: [(0, 3), (1, 2)], + 11: [(1, 2)], + 12: [(1, 3)], + 13: [(0, 1)], + 14: [(3, 0)], + } + + def edge_pt(edge, x0, y0, x1, y1, v00, v10, v11, v01, level): + if edge == 0: + a, b = v00, v10 + t = 0.5 if b == a else (level - a) / (b - a) + return x0 + clamp(t, 0.0, 1.0) * (x1 - x0), y0 + if edge == 1: + a, b = v10, v11 + t = 0.5 if b == a else (level - a) / (b - a) + return x1, y0 + clamp(t, 0.0, 1.0) * (y1 - y0) + if edge == 2: + a, b = v01, v11 + t = 0.5 if b == a else (level - a) / (b - a) + return x0 + clamp(t, 0.0, 1.0) * (x1 - x0), y1 + a, b = v00, v01 + t = 0.5 if b == a else (level - a) / (b - a) + return x0, y0 
+ clamp(t, 0.0, 1.0) * (y1 - y0) + + contour_levels = [0.18, 0.30, 0.42, 0.54, 0.66, 0.78] + for li, lev in enumerate(contour_levels): + segs = [] + for iy in range(grid_n): + y0n = iy / grid_n + y1n = (iy + 1) / grid_n + for ix in range(grid_n): + x0n = ix / grid_n + x1n = (ix + 1) / grid_n + v00 = kde[iy][ix] + v10 = kde[iy][ix + 1] + v11 = kde[iy + 1][ix + 1] + v01 = kde[iy + 1][ix] + c0 = 1 if v00 >= lev else 0 + c1 = 2 if v10 >= lev else 0 + c2 = 4 if v11 >= lev else 0 + c3 = 8 if v01 >= lev else 0 + idx = c0 | c1 | c2 | c3 + pairs = segments_by_level.get(idx) + if not pairs: + continue + for e0, e1 in pairs: + ax, ay = edge_pt(e0, x0n, y0n, x1n, y1n, v00, v10, v11, v01, lev) + bx, by = edge_pt(e1, x0n, y0n, x1n, y1n, v00, v10, v11, v01, lev) + segs.append((X(ax), Y(ay), X(bx), Y(by))) + + if segs: + d = " ".join("M {x1:.1f} {y1:.1f} L {x2:.1f} {y2:.1f}".format(x1=a, y1=b, x2=c, y2=d) for a, b, c, d in segs) + op = 0.14 + 0.09 * li + sw = 0.9 + 0.18 * li + parts.append( + "".format( + d=d, c="#1d4ed8", w=sw, o=op + ) + ) + + for i in out_idx: + p = pts[i] + x = X(p["x"]) + y = Y(p["y"]) + parts.append("".format(x=x, y=y, c=red)) + parts.append("".format(x=x, y=y, c=red)) + + parts.append("") + + for i in out_idx: + p = pts[i] + x = X(p["x"]) + y = Y(p["y"]) + + dx = 18 if p["y"] >= p["x"] else -18 + if x <= cx0 + 16 and dx < 0: + dx = 18 + if x >= cx1p - 16 and dx > 0: + dx = -18 + + dy = -12 + if y <= cy0 + 12: + dy = 14 + + anchor = "start" if dx > 0 else "end" + parts.append(line(x, y, x + dx, y + dy + 2, color="#94a3b8", width=1.2, dash="3,5", opacity=0.9)) + parts.append(text(x + dx, y + dy, p["feature"], size=9, anchor=anchor, color=ink, weight="bold")) + + s1 = "n={n}".format(n=n if isinstance(n, int) else len(pts)) + s2 = "Pearson r={r:.3f}".format(r=r) if isinstance(r, float) else "Pearson r=NA" + s3 = "MAE={m:.3f}".format(m=mae) if isinstance(mae, float) else "MAE=NA" + parts.append( + text( + legend_x - 10, + legend_y + legend_h + 18, + s1 + " · " + s2 + " · " + s3, + size=10, + color=subtle, + weight="bold", + ) + ) + else: + parts.append( + text( + (cx0 + cx1p) / 2, + (cy0 + cy1) / 2, + "missing cont_stats.json or generated.csv", + size=12, + anchor="middle", + color=subtle, + weight="bold", + ) + ) bx0 = xB + 22 by0 = yB + 62 @@ -739,7 +1775,7 @@ def panel_svg(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_pat out_path.write_text("\n".join(parts), encoding="utf-8") -def panel_matplotlib(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, out_path): +def panel_matplotlib(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, profile, out_path): import matplotlib.pyplot as plt import matplotlib.patches as patches @@ -758,33 +1794,73 @@ def panel_matplotlib(bh_rows, ks_rows, shift_rows, hist_rows, filtered_metrics, fig.suptitle("Benchmark Overview (HAI Security Dataset)", fontsize=16, y=0.98) - axA.set_title("A Typed Hybrid Generation", loc="left", fontsize=12, fontweight="bold") - axA.axis("off") - axA.set_xlim(0, 1) - axA.set_ylim(0, 1) - box_y = 0.55 - box_w = 0.18 - box_h = 0.16 - x_positions = [0.06, 0.30, 0.54, 0.78] - labels = ["HAI windows\n(L=96)", "Typed\ndecomposition", "Hybrid\ngenerator", "Synthetic\nwindows"] - for x, lbl in zip(x_positions, labels, strict=True): - axA.add_patch(patches.FancyBboxPatch((x, box_y), box_w, box_h, boxstyle="round,pad=0.02,rounding_size=0.02", facecolor="#f8fafc", edgecolor="#e5e7eb")) - axA.text(x + box_w / 2, box_y + box_h / 2, lbl, ha="center", va="center", fontsize=10, fontweight="bold") - for i in 
range(3): - x1 = x_positions[i] + box_w - x2 = x_positions[i + 1] - axA.annotate("", xy=(x2, box_y + box_h / 2), xytext=(x1, box_y + box_h / 2), arrowprops=dict(arrowstyle="-|>", lw=1.4, color="#6b7280")) + axA.set_title("A Feature-wise Similarity Profile", loc="left", fontsize=12, fontweight="bold") + if isinstance(profile, dict) and profile.get("points"): + pts = profile["points"] + xr = [p["x"] for p in pts] + yg = [p["y"] for p in pts] + diffs = [p.get("diff") or abs(p["x"] - p["y"]) for p in pts] + out_idx = sorted(range(len(pts)), key=lambda i: diffs[i], reverse=True)[:5] - hx = x_positions[2] - hy = 0.20 - axA.text(hx, hy + 0.27, "Type-aware routes", fontsize=9, color="#6b7280", fontweight="bold") - inner = [("Trend (det.)", "#e0f2fe", "#3b82f6"), ("Residual (DDPM)", "#fee2e2", "#ef4444"), ("Discrete head", "#dcfce7", "#10b981")] - for k, (name, fc, ec) in enumerate(inner): - y = hy + 0.18 - k * 0.11 - axA.add_patch(patches.FancyBboxPatch((hx, y), box_w, 0.08, boxstyle="round,pad=0.02,rounding_size=0.02", facecolor=fc, edgecolor=ec, lw=1.2)) - axA.text(hx + 0.01, y + 0.04, name, ha="left", va="center", fontsize=9, fontweight="bold") - axA.annotate("", xy=(hx + box_w / 2, hy + 0.20), xytext=(hx + box_w / 2, box_y), arrowprops=dict(arrowstyle="-|>", lw=1.2, color="#6b7280")) - axA.text(0.06, 0.06, "Metrics align with types: KS (continuous), JSD (discrete), lag-1 (temporal).", fontsize=9, color="#6b7280") + axA.plot([0, 1], [0, 1], linestyle="--", color="#94a3b8", lw=1.8, dashes=(6, 6), label="y=x") + try: + hb = axA.hexbin( + xr, + yg, + gridsize=26, + extent=(0, 1, 0, 1), + cmap="Blues", + mincnt=1, + linewidths=0.0, + alpha=0.95, + ) + hb.set_edgecolor("face") + except Exception: + axA.scatter(xr, yg, s=22, color="#3b82f6", alpha=0.35, edgecolors="none") + + if out_idx: + axA.scatter( + [xr[i] for i in out_idx], + [yg[i] for i in out_idx], + s=90, + facecolors="none", + edgecolors="#ef4444", + linewidths=2.0, + alpha=0.95, + ) + axA.scatter([xr[i] for i in out_idx], [yg[i] for i in out_idx], s=22, color="#ef4444", alpha=0.95, edgecolors="white", linewidths=0.8) + for i in out_idx: + axA.annotate( + pts[i]["feature"], + (xr[i], yg[i]), + textcoords="offset points", + xytext=(12, -10), + ha="left", + va="top", + fontsize=8, + color="#111827", + fontweight="bold", + arrowprops=dict(arrowstyle="-", color="#94a3b8", lw=1.0, linestyle=(0, (3, 4))), + ) + + axA.set_xlim(0.0, 1.0) + axA.set_ylim(0.0, 1.0) + axA.set_xlabel("Real mean (range-normalized)") + axA.set_ylabel("Generated mean (range-normalized)") + axA.set_aspect("equal", adjustable="box") + axA.grid(True, color="#eef2f7") + axA.legend(loc="lower right", frameon=False, fontsize=9) + + stats = profile.get("stats") if isinstance(profile.get("stats"), dict) else {} + n = stats.get("n") if isinstance(stats.get("n"), int) else len(pts) + r = stats.get("r") + mae = stats.get("mae") + s2 = "Pearson r={r:.3f}".format(r=r) if isinstance(r, float) else "Pearson r=NA" + s3 = "MAE={m:.3f}".format(m=mae) if isinstance(mae, float) else "MAE=NA" + axA.text(0.02, 0.98, "n={n} · {s2} · {s3}".format(n=n, s2=s2, s3=s3), transform=axA.transAxes, ha="left", va="top", fontsize=9, color="#6b7280", fontweight="bold") + else: + axA.axis("off") + axA.text(0.5, 0.5, "missing cont_stats.json or generated.csv", ha="center", va="center", fontsize=12, color="#6b7280") axB.set_title("B Feature-Level Distribution Fidelity", loc="left", fontsize=12, fontweight="bold") ks_sorted = sorted( @@ -916,8 +1992,10 @@ def main(): else: if args.figure == "panel": out_path = 
Path(__file__).resolve().parent.parent / "figures" / "benchmark_panel.svg" - else: + elif args.figure == "summary": out_path = Path(__file__).resolve().parent.parent / "figures" / "benchmark_metrics.svg" + else: + out_path = Path(__file__).resolve().parent.parent / "figures" / "ranked_ks.svg" if args.figure == "summary": if args.engine in {"auto", "matplotlib"}: @@ -932,23 +2010,99 @@ def main(): print("saved", out_path) return + if args.figure == "ranked_ks": + ranked_rows = read_csv_rows(args.ranked_ks) + ranked_ks_svg(ranked_rows, out_path, top_n=args.ranked_ks_top_n) + print("saved", out_path) + return + + if args.figure == "lines": + feats_arg = [f.strip() for f in (args.lines_features or "").split(",") if f.strip()] + features = feats_arg + if not features: + rk_rows = read_csv_rows(args.ranked_ks) + if rk_rows: + sorted_rows = sorted( + [{"feature": (r.get("feature") or "").strip(), "ks": parse_float(r.get("ks"))} for r in rk_rows if (r.get("feature") or "").strip()], + key=lambda x: (x["ks"] if x["ks"] is not None else -1.0), + reverse=True, + ) + features = [r["feature"] for r in sorted_rows[:max(1, int(args.lines_top_k))]] + if not features: + features = ["P1_B4002", "P1_PIT02", "P1_FCV02Z", "P1_B3004"] + if not args.out: + out_path = Path(__file__).resolve().parent.parent / "figures" / "lines.svg" + cont_stats = read_json(args.cont_stats) + lines_matplotlib(args.generated, cont_stats, features, out_path, max_rows=args.lines_max_rows, normalize=args.lines_normalize, reference_arg=args.reference, ref_index=args.lines_ref_index) + print("saved", out_path) + return + + if args.figure == "cdf_grid": + cont_stats = read_json(args.cont_stats) + feats_arg = [f.strip() for f in (args.cdf_features or "").split(",") if f.strip()] + if feats_arg: + features = feats_arg + else: + features = sorted(list((cont_stats.get("mean") or {}).keys())) + if args.cdf_max_features > 0: + features = features[: args.cdf_max_features] + if not args.out: + out_path = Path(__file__).resolve().parent.parent / "figures" / "cdf_grid.svg" + cdf_grid_matplotlib(args.generated, args.reference, cont_stats, features, out_path, max_rows=max(1000, args.lines_max_rows), bins=args.cdf_bins) + print("saved", out_path) + return + + if args.figure == "disc_grid": + split = read_json(args.feature_split) + disc_list = list((split.get("discrete") or [])) if isinstance(split, dict) else [] + feats_arg = [f.strip() for f in (args.disc_features or "").split(",") if f.strip()] + if feats_arg: + features = feats_arg + else: + features = sorted(disc_list) + if args.disc_max_features > 0: + features = features[: args.disc_max_features] + if not args.out: + out_path = Path(__file__).resolve().parent.parent / "figures" / "disc_grid.svg" + discrete_grid_matplotlib(args.generated, args.reference, features, out_path, max_rows=max(1000, args.lines_max_rows)) + print("saved", out_path) + return + + if args.figure == "disc_points": + split = read_json(args.feature_split) + disc_list = list((split.get("discrete") or [])) if isinstance(split, dict) else [] + feats_arg = [f.strip() for f in (args.disc_features or "").split(",") if f.strip()] + if feats_arg: + features = feats_arg + else: + features = sorted(disc_list) + if args.disc_max_features > 0: + features = features[: args.disc_max_features] + if not args.out: + out_path = Path(__file__).resolve().parent.parent / "figures" / "disc_points.svg" + discrete_points_matplotlib(args.generated, args.reference, features, out_path, max_rows=max(1000, args.lines_max_rows)) + print("saved", out_path) 
+        return
+
     ks_rows = read_csv_rows(args.ks_per_feature)
     shift_rows = read_csv_rows(args.data_shift)
     mh_rows = read_csv_rows(args.metrics_history)
     fm = read_json(args.filtered_metrics)
+    cont_stats = read_json(args.cont_stats)
+    profile = build_mean_profile(ks_rows, cont_stats, args.generated, order=args.profile_order, max_features=args.profile_max_features)
     bh_rows = [{"seed": r["seed"], "avg_ks": r["avg_ks"], "avg_jsd": r["avg_jsd"], "avg_lag1_diff": r["avg_lag1_diff"]} for r in rows]
     if args.engine in {"auto", "matplotlib"}:
         try:
-            panel_matplotlib(bh_rows, ks_rows, shift_rows, mh_rows, fm, out_path)
+            panel_matplotlib(bh_rows, ks_rows, shift_rows, mh_rows, fm, profile, out_path)
             print("saved", out_path)
             return
         except Exception:
             if args.engine == "matplotlib":
                 raise
-    panel_svg(bh_rows, ks_rows, shift_rows, mh_rows, fm, out_path)
+    panel_svg(bh_rows, ks_rows, shift_rows, mh_rows, fm, profile, out_path)
     print("saved", out_path)
diff --git a/figures/benchmark_panel.svg b/figures/benchmark_panel.svg
index 50c4d85..1882114 100644
--- a/figures/benchmark_panel.svg
+++ b/figures/benchmark_panel.svg
@@ -1,112 +1,200 @@
[regenerated figure, SVG markup omitted: subtitle now reads "A: per-feature mean profile · B: per-feature KS · C: train-file mean shift · D: seed robustness and metric history"; panel A is retitled "Feature-wise Similarity Profile" (generated vs real means of continuous features, range-normalized by real min/max; n=52, Pearson r=0.637, MAE=0.066; labeled outliers P1_FT03Z, P1_B4002, P4_HT_LD, P4_HT_PO, P1_PCV02Z); panels B-D keep the Top-14 KS outliers, the train-file mean-shift heatmap, and the seed-robustness/metric-history content (seeds 7, 1337, 2025)]
diff --git a/figures/cdf_grid.svg b/figures/cdf_grid.svg
new file mode 100644
index 0000000..ab2319e
--- /dev/null
+++ b/figures/cdf_grid.svg
@@ -0,0 +1,3139 @@
[new Matplotlib-generated figure, SVG markup omitted: "Empirical CDF: generated vs real" grid]
diff --git a/figures/disc_grid.svg b/figures/disc_grid.svg
new file mode 100644
index 0000000..ecebe46
--- /dev/null
+++ b/figures/disc_grid.svg
@@ -0,0 +1,6849 @@
[new Matplotlib-generated figure, SVG markup omitted: "Discrete Marginals: generated vs real" grid]
diff --git a/figures/disc_points.svg b/figures/disc_points.svg
new file mode 100644
index 0000000..1fd2e2a
--- /dev/null
+++ b/figures/disc_points.svg
@@ -0,0 +1,2318 @@
+[Matplotlib v3.8.3 SVG, 2026-02-06: dot plot comparing generated vs. real categorical distributions for discrete features; path data omitted]
diff --git a/figures/lines.svg b/figures/lines.svg
new file mode 100644
index 0000000..7b4bfce
--- /dev/null
+++ b/figures/lines.svg
@@ -0,0 +1,1830 @@
+[Matplotlib v3.8.3 SVG, 2026-02-06: line series of generated values for representative features; path data omitted]
diff --git a/figures/ranked_ks.svg b/figures/ranked_ks.svg
new file mode 100644
index 0000000..642ac2a
--- /dev/null
+++ b/figures/ranked_ks.svg
@@ -0,0 +1,223 @@
+[SVG figure "KS Outlier Attribution". Left: top-K features by KS · Right: avg KS after removing top-n outliers. Panel A "Top-30 KS features" (highest entries: P1_B4002 0.792, P1_PIT02 0.792, P1_FCV02Z 0.775, P1_B3004 0.672, ...); Panel B "Removing worst features", y-axis "avg KS (lower is better)", x-axis "remove top-n features". Remaining text and tick data omitted]
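The right panel of ranked_ks.svg tracks how the average KS improves as the worst-fitting features are excluded. The sketch below shows one way such a curve could be computed from ranked_ks.csv; the column names `feature` and `ks`, the file path, and the helper `avg_ks_after_removals` are illustrative assumptions, not the repository's actual implementation in plot_benchmark.py.

```python
# Minimal sketch of the attribution curve behind figures/ranked_ks.svg.
# Assumes results/ranked_ks.csv has columns "feature" and "ks";
# the real column names and paths in this repo may differ.
import csv
from pathlib import Path

def avg_ks_after_removals(ranked_ks_csv: str, max_remove: int = 52):
    with Path(ranked_ks_csv).open(newline="", encoding="utf-8") as fh:
        rows = [(r["feature"], float(r["ks"])) for r in csv.DictReader(fh)]
    # Worst (highest-KS) features first, mirroring panel A's ordering.
    rows.sort(key=lambda kv: kv[1], reverse=True)
    ks_values = [ks for _, ks in rows]
    curve = []
    for n in range(min(max_remove, len(ks_values)) + 1):
        remaining = ks_values[n:]
        curve.append((n, sum(remaining) / len(remaining) if remaining else 0.0))
    return curve  # list of (features removed, average KS over the rest)

# Example: print the first few points of the "remove top-n features" curve.
for n, avg in avg_ks_after_removals("example/results/ranked_ks.csv")[:5]:
    print(f"remove top-{n:>2}: avg KS = {avg:.3f}")
```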
diff --git a/paper.md b/paper.md
index 1b98c32..876895c 100644
--- a/paper.md
+++ b/paper.md
@@ -671,6 +671,33 @@
 Table: Summary of benchmark metrics (three independent seeds).
 ![Benchmark overview figure (workflow, feature fidelity, dataset shift, and robustness).](figures/benchmark_panel.svg)
 Benchmark overview figure (workflow, per-feature distributional fidelity, training-set distribution shift, and cross-seed robustness).
+
+![Seed robustness summary across three independent runs.](figures/benchmark_metrics.svg)
+Robustness summary across three independent runs.
+
+![KS outlier attribution (top-K features and average KS after removing worst features).](figures/ranked_ks.svg)
+KS outlier attribution (top-K error features and the change in average KS after removing the worst features).
+
+![CDF alignment for a representative set of high-KS continuous features: P1_B4002, P1_PIT02, P1_FCV02Z, P1_B3004.](example/results/cdf_P1_B4002.svg)
+CDF alignment for a representative high-KS continuous feature: P1_B4002.
+
+![CDF alignment for P1_PIT02.](example/results/cdf_P1_PIT02.svg)
+CDF alignment for P1_PIT02.
+
+![CDF alignment for P1_FCV02Z.](example/results/cdf_P1_FCV02Z.svg)
+CDF alignment for P1_FCV02Z.
+
+![CDF alignment for P1_B3004.](example/results/cdf_P1_B3004.svg)
+CDF alignment for P1_B3004.
+
+![All continuous features’ distribution comparison (empirical CDF grid: generated vs real).](figures/cdf_grid.svg)
+Distribution comparison for all continuous features (empirical CDF grid: generated vs. real).
+
+![Discrete features’ categorical distribution comparison (dot plot: generated vs real).](figures/disc_points.svg)
+Categorical distribution comparison for discrete features (dot plot: the two colors denote generated and real).
+
+![Generated line series (normalized by real min/max) for four representative features: P1_B4002, P1_PIT02, P1_FCV02Z, P1_B3004.](figures/lines.svg)
+Generated line series for four representative features (normalized by the real min/max).
+
 Why this benchmark highlights where the method helps
 To make the benchmark actionable (and comparable to prior work), we report type-appropriate, interpretable statistics instead of collapsing everything into a single similarity score. This matters in mixed-type ICS telemetry: continuous fidelity can be high while discrete semantics fail, and vice versa. By separating continuous (KS), discrete (JSD), and temporal (lag-1) views, the evaluation directly matches the design goals of the hybrid generator: distributional refinement for continuous residuals, vocabulary-valid reconstruction for discrete supervision, and trend-induced short-horizon coherence.
 Why this benchmark highlights where the method helps
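For readers reproducing the evaluation, the three metric families named above (KS for continuous features, JSD for discrete features, lag-1 coherence for temporal structure) can be computed roughly as in the sketch below. The function names and the use of scipy here are illustrative assumptions, not the benchmark's actual code.

```python
# Illustrative sketch of the three metric families; not the benchmark's implementation.
import numpy as np
from scipy.stats import ks_2samp
from scipy.spatial.distance import jensenshannon

def continuous_ks(real: np.ndarray, gen: np.ndarray) -> float:
    # Two-sample Kolmogorov-Smirnov statistic for a continuous feature.
    return float(ks_2samp(real, gen).statistic)

def discrete_jsd(real: np.ndarray, gen: np.ndarray) -> float:
    # Jensen-Shannon divergence between categorical frequency tables.
    cats = sorted(set(real.tolist()) | set(gen.tolist()))
    p = np.array([(real == c).mean() for c in cats])
    q = np.array([(gen == c).mean() for c in cats])
    return float(jensenshannon(p, q, base=2) ** 2)  # distance squared -> divergence

def lag1_gap(real: np.ndarray, gen: np.ndarray) -> float:
    # Absolute difference in lag-1 autocorrelation (short-horizon coherence).
    def lag1(x: np.ndarray) -> float:
        x = x - x.mean()
        return float((x[:-1] * x[1:]).sum() / ((x ** 2).sum() + 1e-12))
    return abs(lag1(real) - lag1(gen))
```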