This commit is contained in:
MZ YANG
2026-02-12 01:46:47 +08:00
parent 26b9b8a447
commit f1afd4bf38
11 changed files with 77395 additions and 13341 deletions

View File

@@ -107,6 +107,14 @@ def lag1_corr(values: List[float]) -> float:
def resolve_reference_paths(path: str) -> List[str]:
if not path:
return []
if path.endswith(".json") and Path(path).exists():
try:
cfg = load_json(path)
ref = cfg.get("data_glob") or cfg.get("data_path") or ""
if ref:
return resolve_reference_paths(str(ref))
except Exception:
return []
if any(ch in path for ch in ["*", "?", "["]):
base = Path(path).parent.resolve()
pat = Path(path).name
@@ -151,10 +159,11 @@ def main():
std_ref = stats_json.get("raw_std", stats_json.get("std"))
transforms = stats_json.get("transform", {})
vocab = load_json(args.vocab)["vocab"]
vocab_sets = {c: set(vocab[c].keys()) for c in disc_cols}
vocab_sets = {c: set(vocab.get(c, {}).keys()) for c in disc_cols}
cont_stats = init_stats(cont_cols)
disc_invalid = {c: 0 for c in disc_cols}
missing_generated = {c: 0 for c in disc_cols}
rows = 0
with open_csv(args.generated) as f:
@@ -172,7 +181,11 @@ def main():
if ref_paths:
pass
for c in disc_cols:
if row[c] not in vocab_sets[c]:
tok = row.get(c, None)
if tok is None:
missing_generated[c] += 1
continue
if tok not in vocab_sets[c]:
disc_invalid[c] += 1
cont_summary = finalize_stats(cont_stats)
@@ -192,6 +205,7 @@ def main():
"continuous_summary": cont_summary,
"continuous_error": cont_err,
"discrete_invalid_counts": disc_invalid,
"missing_generated_columns": {k: v for k, v in missing_generated.items() if v > 0},
}
# Optional richer metrics using reference data
@@ -212,7 +226,7 @@ def main():
except Exception:
gen_cont[c].append(0.0)
for c in disc_cols:
tok = row[c]
tok = row.get(c, "")
gen_disc[c][tok] = gen_disc[c].get(tok, 0) + 1
loaded = 0
@@ -228,7 +242,7 @@ def main():
except Exception:
ref_cont[c].append(0.0)
for c in disc_cols:
tok = row[c]
tok = row.get(c, "")
ref_disc[c][tok] = ref_disc[c].get(tok, 0) + 1
loaded += 1
if args.max_rows and loaded >= args.max_rows: