"""Identify paragraph IDs where v3.5 6-model majority regressed vs v3.0.

A "regression" = v3.0 majority matched human majority but v3.5 majority does not.

We compute category majority from 6 models (excluding minimax):
opus, gpt-5.4, gemini-3.1-pro-preview, glm-5:exacto, kimi-k2.5, mimo-v2-pro:exacto

v3.0 annotations are filtered to the 359 PIDs present in holdout-rerun-v35.jsonl.
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
# Repository root: two directory levels up from this script file.
ROOT = Path(__file__).resolve().parents[1]
DATA = ROOT / "data"
|
|
|
|
# ── Model files (excluding minimax) ──────────────────────────────────────────
|
|
|
|
# The five non-opus voting models; opus lives under golden/, the rest under
# bench-holdout/.
_V30_BENCH_MODELS = (
    "gpt-5.4",
    "gemini-3.1-pro-preview",
    "glm-5:exacto",
    "kimi-k2.5",
    "mimo-v2-pro:exacto",
)

V30_FILES = [DATA / "annotations" / "golden" / "opus.jsonl"] + [
    DATA / "annotations" / "bench-holdout" / f"{model}.jsonl"
    for model in _V30_BENCH_MODELS
]
|
|
|
|
# Same six models re-run on the v3.5 prompt; opus under golden-v35/, the rest
# under bench-holdout-v35/.
_V35_BENCH_MODELS = (
    "gpt-5.4",
    "gemini-3.1-pro-preview",
    "glm-5:exacto",
    "kimi-k2.5",
    "mimo-v2-pro:exacto",
)

V35_FILES = [DATA / "annotations" / "golden-v35" / "opus.jsonl"] + [
    DATA / "annotations" / "bench-holdout-v35" / f"{model}.jsonl"
    for model in _V35_BENCH_MODELS
]
|
|
|
|
|
|
def load_annotations(files: list[Path]) -> dict[str, list[str]]:
    """Collect content-category labels per paragraph across model files.

    Each file is JSONL with one record per line; records contribute
    ``label.content_category`` keyed by ``paragraphId``. Returns
    ``{pid: [category, ...]}`` with one entry per model that labeled the PID.
    """
    by_pid: dict[str, list[str]] = {}
    for path in files:
        with open(path) as handle:
            for rec in map(json.loads, handle):
                category = rec["label"]["content_category"]
                by_pid.setdefault(rec["paragraphId"], []).append(category)
    return by_pid
|
|
|
|
|
|
def majority_vote(labels: list[str]) -> str | None:
|
|
"""Return the most common label, or None if tied."""
|
|
counts = Counter(labels)
|
|
top = counts.most_common(2)
|
|
if len(top) == 1:
|
|
return top[0][0]
|
|
if top[0][1] > top[1][1]:
|
|
return top[0][0]
|
|
return None # tie
|
|
|
|
|
|
def load_human_majority() -> dict[str, str]:
    """Compute the human majority label per PID from 3-annotator raw labels.

    PIDs whose annotator votes tie (majority_vote returns None) are omitted.
    """
    votes: dict[str, list[str]] = {}
    with open(DATA / "gold" / "human-labels-raw.jsonl") as handle:
        for rec in map(json.loads, handle):
            votes.setdefault(rec["paragraphId"], []).append(rec["contentCategory"])

    majorities: dict[str, str] = {}
    for pid, labels in votes.items():
        winner = majority_vote(labels)
        if winner is not None:
            majorities[pid] = winner
    return majorities
|
|
|
|
|
|
def load_holdout_pids() -> dict[str, list[str]]:
    """Load the 359 confusion-axis PIDs and their axes.

    Reads data/gold/holdout-rerun-v35.jsonl; returns {paragraphId: axes}.
    """
    with open(DATA / "gold" / "holdout-rerun-v35.jsonl") as handle:
        return {
            rec["paragraphId"]: rec["axes"]
            for rec in map(json.loads, handle)
        }
|
|
|
|
|
|
# Axis name → output key mapping
# Maps each confusion-axis tag (the "axes" values from holdout-rerun-v35.jsonl)
# to the bucket key used in the output JSON.
AXIS_TO_KEY = {
    "BG_MR": "bg_mr_regressions",
    "BG_RMP": "bg_mr_regressions",  # BG confusion axes both go to bg_mr bucket
    "MR_RMP": "mr_rmp_regressions",
    "SI_NO": "mr_rmp_regressions",  # SI/NO doesn't fit neatly; group with mr_rmp
}
|
|
|
|
|
|
# Votes required per PID for a valid model majority (one per file in
# V30_FILES / V35_FILES).
_N_MODELS = 6


def _model_majorities(
    annotations: dict[str, list[str]], pids: set[str]
) -> dict[str, str | None]:
    """Majority label per PID; None if any model vote is missing or tied."""
    majorities: dict[str, str | None] = {}
    for pid in pids:
        labels = annotations.get(pid, [])
        majorities[pid] = majority_vote(labels) if len(labels) == _N_MODELS else None
    return majorities


def main() -> None:
    """Find regressions (v3.0 matched human, v3.5 does not) on holdout PIDs.

    Writes data/gold/regression-pids.json with PIDs bucketed by confusion
    axis (via AXIS_TO_KEY) and prints a summary.
    """
    holdout = load_holdout_pids()
    holdout_pids = set(holdout)

    human_maj = load_human_majority()

    # Model majorities, filtered to holdout PIDs. The duplicated v3.0/v3.5
    # loops are factored into _model_majorities.
    v30_maj = _model_majorities(load_annotations(V30_FILES), holdout_pids)
    v35_maj = _model_majorities(load_annotations(V35_FILES), holdout_pids)

    # Find regressions, bucketed by confusion axis.
    buckets: dict[str, list[str]] = {
        "bg_mr_regressions": [],
        "mr_rmp_regressions": [],
    }

    for pid in sorted(holdout_pids):
        h = human_maj.get(pid)
        v30 = v30_maj.get(pid)
        v35 = v35_maj.get(pid)

        # Skip PIDs lacking a full/untied vote on any side.
        if h is None or v30 is None or v35 is None:
            continue

        # Regression: v3.0 matched human, v3.5 does not
        if v30 == h and v35 != h:
            # Assign to bucket(s) via AXIS_TO_KEY (a PID with axes in both
            # groups lands in both buckets, matching the original behavior).
            keys = {AXIS_TO_KEY[a] for a in holdout[pid] if a in AXIS_TO_KEY}
            if not keys:
                # If somehow no axis matched, fall back to the mr_rmp bucket
                # so the PID still appears in the output.
                keys = {"mr_rmp_regressions"}
            for key in keys:
                buckets[key].append(pid)

    bg_mr_regressions = buckets["bg_mr_regressions"]
    mr_rmp_regressions = buckets["mr_rmp_regressions"]
    all_regressions = sorted(set(bg_mr_regressions + mr_rmp_regressions))

    output = {
        "bg_mr_regressions": sorted(bg_mr_regressions),
        "mr_rmp_regressions": sorted(mr_rmp_regressions),
        "all_regressions": all_regressions,
    }

    out_path = DATA / "gold" / "regression-pids.json"
    with open(out_path, "w") as f:
        json.dump(output, f, indent=2)
        f.write("\n")

    print(f"BG/MR regressions: {len(bg_mr_regressions)}")
    print(f"MR/RMP regressions: {len(mr_rmp_regressions)}")
    print(f"Total unique: {len(all_regressions)}")
    print(f"Written to {out_path}")


if __name__ == "__main__":
    main()
|