"""Identify paragraph IDs where v3.5 6-model majority regressed vs v3.0. A "regression" = v3.0 majority matched human majority but v3.5 majority does not. We compute category majority from 6 models (excluding minimax): opus, gpt-5.4, gemini-3.1-pro-preview, glm-5:exacto, kimi-k2.5, mimo-v2-pro:exacto v3.0 annotations are filtered to the 359 PIDs present in holdout-rerun-v35.jsonl. """ from __future__ import annotations import json from collections import Counter from pathlib import Path ROOT = Path(__file__).resolve().parent.parent DATA = ROOT / "data" # ── Model files (excluding minimax) ────────────────────────────────────────── V30_FILES = [ DATA / "annotations" / "golden" / "opus.jsonl", DATA / "annotations" / "bench-holdout" / "gpt-5.4.jsonl", DATA / "annotations" / "bench-holdout" / "gemini-3.1-pro-preview.jsonl", DATA / "annotations" / "bench-holdout" / "glm-5:exacto.jsonl", DATA / "annotations" / "bench-holdout" / "kimi-k2.5.jsonl", DATA / "annotations" / "bench-holdout" / "mimo-v2-pro:exacto.jsonl", ] V35_FILES = [ DATA / "annotations" / "golden-v35" / "opus.jsonl", DATA / "annotations" / "bench-holdout-v35" / "gpt-5.4.jsonl", DATA / "annotations" / "bench-holdout-v35" / "gemini-3.1-pro-preview.jsonl", DATA / "annotations" / "bench-holdout-v35" / "glm-5:exacto.jsonl", DATA / "annotations" / "bench-holdout-v35" / "kimi-k2.5.jsonl", DATA / "annotations" / "bench-holdout-v35" / "mimo-v2-pro:exacto.jsonl", ] def load_annotations(files: list[Path]) -> dict[str, list[str]]: """Load annotations, returning {pid: [category, ...]} across models.""" result: dict[str, list[str]] = {} for f in files: with open(f) as fh: for line in fh: rec = json.loads(line) pid = rec["paragraphId"] cat = rec["label"]["content_category"] result.setdefault(pid, []).append(cat) return result def majority_vote(labels: list[str]) -> str | None: """Return the most common label, or None if tied.""" counts = Counter(labels) top = counts.most_common(2) if len(top) == 1: return top[0][0] if top[0][1] > top[1][1]: return top[0][0] return None # tie def load_human_majority() -> dict[str, str]: """Compute human majority label per PID from 3-annotator raw labels.""" pid_labels: dict[str, list[str]] = {} with open(DATA / "gold" / "human-labels-raw.jsonl") as f: for line in f: rec = json.loads(line) pid = rec["paragraphId"] pid_labels.setdefault(pid, []).append(rec["contentCategory"]) return { pid: maj for pid, labels in pid_labels.items() if (maj := majority_vote(labels)) is not None } def load_holdout_pids() -> dict[str, list[str]]: """Load the 359 confusion-axis PIDs and their axes.""" result: dict[str, list[str]] = {} with open(DATA / "gold" / "holdout-rerun-v35.jsonl") as f: for line in f: rec = json.loads(line) result[rec["paragraphId"]] = rec["axes"] return result # Axis name → output key mapping AXIS_TO_KEY = { "BG_MR": "bg_mr_regressions", "BG_RMP": "bg_mr_regressions", # BG confusion axes both go to bg_mr bucket "MR_RMP": "mr_rmp_regressions", "SI_NO": "mr_rmp_regressions", # SI/NO doesn't fit neatly; group with mr_rmp } def main() -> None: holdout = load_holdout_pids() holdout_pids = set(holdout.keys()) human_maj = load_human_majority() v30_ann = load_annotations(V30_FILES) v35_ann = load_annotations(V35_FILES) # Compute model majorities filtered to holdout PIDs v30_maj: dict[str, str | None] = {} for pid in holdout_pids: labels = v30_ann.get(pid, []) v30_maj[pid] = majority_vote(labels) if len(labels) == 6 else None v35_maj: dict[str, str | None] = {} for pid in holdout_pids: labels = v35_ann.get(pid, []) v35_maj[pid] = majority_vote(labels) if len(labels) == 6 else None # Find regressions bg_mr_regressions: list[str] = [] mr_rmp_regressions: list[str] = [] for pid in sorted(holdout_pids): h = human_maj.get(pid) v30 = v30_maj.get(pid) v35 = v35_maj.get(pid) if h is None or v30 is None or v35 is None: continue # Regression: v3.0 matched human, v3.5 does not if v30 == h and v35 != h: axes = holdout[pid] # Assign to bucket based on axes is_bg_mr = any(a in ("BG_MR", "BG_RMP") for a in axes) is_mr_rmp = any(a in ("MR_RMP", "SI_NO") for a in axes) if is_bg_mr: bg_mr_regressions.append(pid) if is_mr_rmp: mr_rmp_regressions.append(pid) # If somehow neither axis matched, still include in all if not is_bg_mr and not is_mr_rmp: # Fallback: put in mr_rmp mr_rmp_regressions.append(pid) all_regressions = sorted(set(bg_mr_regressions + mr_rmp_regressions)) output = { "bg_mr_regressions": sorted(bg_mr_regressions), "mr_rmp_regressions": sorted(mr_rmp_regressions), "all_regressions": all_regressions, } out_path = DATA / "gold" / "regression-pids.json" with open(out_path, "w") as f: json.dump(output, f, indent=2) f.write("\n") print(f"BG/MR regressions: {len(bg_mr_regressions)}") print(f"MR/RMP regressions: {len(mr_rmp_regressions)}") print(f"Total unique: {len(all_regressions)}") print(f"Written to {out_path}") if __name__ == "__main__": main()