"""Identify paragraph IDs where v3.5 6-model majority regressed vs v3.0.

A "regression" = v3.0 majority matched human majority but v3.5 majority does not.

We compute category majority from 6 models (excluding minimax):
opus, gpt-5.4, gemini-3.1-pro-preview, glm-5:exacto, kimi-k2.5, mimo-v2-pro:exacto

v3.0 annotations are filtered to the 359 PIDs present in holdout-rerun-v35.jsonl.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
# Repository root: two directory levels up from this script file.
ROOT = Path(__file__).resolve().parents[1]
DATA = ROOT / "data"
|
|
|
|
# ── Model files (excluding minimax) ──────────────────────────────────────────
|
|
|
|
# The five non-opus voting models; opus lives under golden/, the rest under
# bench-holdout/.
_V30_BENCH_MODELS = (
    "gpt-5.4",
    "gemini-3.1-pro-preview",
    "glm-5:exacto",
    "kimi-k2.5",
    "mimo-v2-pro:exacto",
)

V30_FILES = [DATA / "annotations" / "golden" / "opus.jsonl"] + [
    DATA / "annotations" / "bench-holdout" / f"{model}.jsonl"
    for model in _V30_BENCH_MODELS
]
|
|
|
|
# Same six models re-run on the v3.5 prompt; opus under golden-v35/, the rest
# under bench-holdout-v35/.
_V35_BENCH_MODELS = (
    "gpt-5.4",
    "gemini-3.1-pro-preview",
    "glm-5:exacto",
    "kimi-k2.5",
    "mimo-v2-pro:exacto",
)

V35_FILES = [DATA / "annotations" / "golden-v35" / "opus.jsonl"] + [
    DATA / "annotations" / "bench-holdout-v35" / f"{model}.jsonl"
    for model in _V35_BENCH_MODELS
]
|
|
|
|
|
|
def load_annotations(files: list[Path]) -> dict[str, list[str]]:
    """Collect content-category labels per paragraph across model files.

    Each file is JSONL with one record per line; records contribute
    ``label.content_category`` keyed by ``paragraphId``. Returns
    ``{pid: [category, ...]}`` with one entry per model that labeled the PID.
    """
    by_pid: dict[str, list[str]] = {}
    for path in files:
        with open(path) as handle:
            for rec in map(json.loads, handle):
                category = rec["label"]["content_category"]
                by_pid.setdefault(rec["paragraphId"], []).append(category)
    return by_pid
|
|
|
|
|
|
def majority_vote(labels: list[str]) -> str | None:
|
|
"""Return the most common label, or None if tied."""
|
|
counts = Counter(labels)
|
|
top = counts.most_common(2)
|
|
if len(top) == 1:
|
|
return top[0][0]
|
|
if top[0][1] > top[1][1]:
|
|
return top[0][0]
|
|
return None # tie
|
|
|
|
|
|
def load_human_majority() -> dict[str, str]:
    """Compute the human majority label per PID from 3-annotator raw labels.

    PIDs whose annotator votes tie (majority_vote returns None) are omitted.
    """
    votes: dict[str, list[str]] = {}
    with open(DATA / "gold" / "human-labels-raw.jsonl") as handle:
        for rec in map(json.loads, handle):
            votes.setdefault(rec["paragraphId"], []).append(rec["contentCategory"])

    majorities: dict[str, str] = {}
    for pid, labels in votes.items():
        winner = majority_vote(labels)
        if winner is not None:
            majorities[pid] = winner
    return majorities
|
|
|
|
|
|
def load_holdout_pids() -> dict[str, list[str]]:
    """Load the 359 confusion-axis PIDs and their axes.

    Reads data/gold/holdout-rerun-v35.jsonl; returns {paragraphId: axes}.
    """
    with open(DATA / "gold" / "holdout-rerun-v35.jsonl") as handle:
        return {
            rec["paragraphId"]: rec["axes"]
            for rec in map(json.loads, handle)
        }
|
|
|
|
|
|
# Axis name → output key mapping
# Maps each confusion-axis tag (the "axes" values from holdout-rerun-v35.jsonl)
# to the bucket key used in the output JSON.
AXIS_TO_KEY = {
    "BG_MR": "bg_mr_regressions",
    "BG_RMP": "bg_mr_regressions",  # BG confusion axes both go to bg_mr bucket
    "MR_RMP": "mr_rmp_regressions",
    "SI_NO": "mr_rmp_regressions",  # SI/NO doesn't fit neatly; group with mr_rmp
}
|
|
|
|
|
|
# Votes required per PID for a valid model majority (one per file in
# V30_FILES / V35_FILES).
_N_MODELS = 6


def _model_majorities(
    annotations: dict[str, list[str]], pids: set[str]
) -> dict[str, str | None]:
    """Majority label per PID; None if any model vote is missing or tied."""
    majorities: dict[str, str | None] = {}
    for pid in pids:
        labels = annotations.get(pid, [])
        majorities[pid] = majority_vote(labels) if len(labels) == _N_MODELS else None
    return majorities


def main() -> None:
    """Find regressions (v3.0 matched human, v3.5 does not) on holdout PIDs.

    Writes data/gold/regression-pids.json with PIDs bucketed by confusion
    axis (via AXIS_TO_KEY) and prints a summary.
    """
    holdout = load_holdout_pids()
    holdout_pids = set(holdout)

    human_maj = load_human_majority()

    # Model majorities, filtered to holdout PIDs. The duplicated v3.0/v3.5
    # loops are factored into _model_majorities.
    v30_maj = _model_majorities(load_annotations(V30_FILES), holdout_pids)
    v35_maj = _model_majorities(load_annotations(V35_FILES), holdout_pids)

    # Find regressions, bucketed by confusion axis.
    buckets: dict[str, list[str]] = {
        "bg_mr_regressions": [],
        "mr_rmp_regressions": [],
    }

    for pid in sorted(holdout_pids):
        h = human_maj.get(pid)
        v30 = v30_maj.get(pid)
        v35 = v35_maj.get(pid)

        # Skip PIDs lacking a full/untied vote on any side.
        if h is None or v30 is None or v35 is None:
            continue

        # Regression: v3.0 matched human, v3.5 does not
        if v30 == h and v35 != h:
            # Assign to bucket(s) via AXIS_TO_KEY (a PID with axes in both
            # groups lands in both buckets, matching the original behavior).
            keys = {AXIS_TO_KEY[a] for a in holdout[pid] if a in AXIS_TO_KEY}
            if not keys:
                # If somehow no axis matched, fall back to the mr_rmp bucket
                # so the PID still appears in the output.
                keys = {"mr_rmp_regressions"}
            for key in keys:
                buckets[key].append(pid)

    bg_mr_regressions = buckets["bg_mr_regressions"]
    mr_rmp_regressions = buckets["mr_rmp_regressions"]
    all_regressions = sorted(set(bg_mr_regressions + mr_rmp_regressions))

    output = {
        "bg_mr_regressions": sorted(bg_mr_regressions),
        "mr_rmp_regressions": sorted(mr_rmp_regressions),
        "all_regressions": all_regressions,
    }

    out_path = DATA / "gold" / "regression-pids.json"
    with open(out_path, "w") as f:
        json.dump(output, f, indent=2)
        f.write("\n")

    print(f"BG/MR regressions: {len(bg_mr_regressions)}")
    print(f"MR/RMP regressions: {len(mr_rmp_regressions)}")
    print(f"Total unique: {len(all_regressions)}")
    print(f"Written to {out_path}")


if __name__ == "__main__":
    main()
|