715 lines
30 KiB
Python
715 lines
30 KiB
Python
"""
|
|
Cross-analysis: Human annotators vs GenAI models on 1,200-paragraph holdout set.
|
|
|
|
Categories: BG, ID, MR, N/O, RMP, SI, TPR
|
|
Specificity: 1-4
|
|
13 signals per paragraph: 3 human (BIBD), 3 Stage 1, 1 Opus 4.6, 6 benchmark
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── Category abbreviation mapping ────────────────────────────────────────────
# Single source of truth: (full name, abbreviation) pairs for the seven
# content categories; the lookup dicts and CATS are all derived from it.
_CATEGORY_PAIRS = [
    ("Board Governance", "BG"),
    ("Incident Disclosure", "ID"),
    ("Management Role", "MR"),
    ("None/Other", "N/O"),
    ("Risk Management Process", "RMP"),
    ("Strategy Integration", "SI"),
    ("Third-Party Risk", "TPR"),
]
FULL_TO_ABBR = dict(_CATEGORY_PAIRS)
ABBR_TO_FULL = {short: full for full, short in _CATEGORY_PAIRS}
CATS = [short for _, short in _CATEGORY_PAIRS]

DATA = Path("data")
|
|
|
|
|
|
def abbr(cat: str) -> str:
    """Return the short code for *cat*; unknown names pass through unchanged."""
    try:
        return FULL_TO_ABBR[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def majority_vote(labels: list[str]) -> str:
    """Return the consensus label for *labels*.

    Returns the strict-majority label when one exists (count > half).
    Otherwise falls back to the plurality label, unless the two most
    common labels are tied — or *labels* is empty — in which case
    "split" is returned.
    """
    # Guard: Counter.most_common(1)[0] would raise IndexError on [].
    if not labels:
        return "split"
    c = Counter(labels)
    top = c.most_common(1)[0]
    if top[1] > len(labels) / 2:
        return top[0]
    # No strict majority: a tie between the two most common labels means
    # there is no usable consensus; otherwise accept the plurality winner.
    if len(c) >= 2:
        top2 = c.most_common(2)
        if top2[0][1] == top2[1][1]:
            return "split"
    return top[0]
|
|
|
|
|
|
def median_spec(specs: list[int]) -> float:
    """Return the median of *specs* as a float.

    Returns 0.0 for an empty list (consistent with mean_spec), instead of
    raising IndexError — callers pass filtered lists that can be empty.
    """
    if not specs:
        return 0.0
    s = sorted(specs)
    n = len(s)
    if n % 2 == 1:
        return float(s[n // 2])
    # Even length: average the two middle elements.
    return (s[n // 2 - 1] + s[n // 2]) / 2.0
|
|
|
|
|
|
def mean_spec(specs: list[int]) -> float:
    """Arithmetic mean of *specs*; 0.0 when the list is empty."""
    if not specs:
        return 0.0
    return sum(specs) / len(specs)
|
|
|
|
|
|
# ── Load data ────────────────────────────────────────────────────────────────

print("Loading data...\n")

# Human labels: paragraphId → list of (annotatorName, category, specificity)
# One JSONL record per (annotator, paragraph) label; the file stores full
# category names, normalized here to abbreviations via abbr().
human_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)
with open(DATA / "gold" / "human-labels-raw.jsonl") as f:
    for line in f:
        d = json.loads(line)
        human_labels[d["paragraphId"]].append(
            (d["annotatorName"], abbr(d["contentCategory"]), d["specificityLevel"])
        )

# Sorted so every downstream pass sees paragraphs in a deterministic order.
holdout_pids = sorted(human_labels.keys())
# Sanity check: the holdout set must contain exactly 1,200 paragraphs.
assert len(holdout_pids) == 1200, f"Expected 1200 holdout paragraphs, got {len(holdout_pids)}"
|
|
|
|
# GenAI labels: paragraphId → list of (modelName, category, specificity)
genai_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)

# Stage 1 (filter to holdout only)
# The Stage 1 file covers more paragraphs than the holdout, so restrict to
# holdout_pids; set membership keeps the per-line check O(1).
holdout_set = set(holdout_pids)
with open(DATA / "annotations" / "stage1.patched.jsonl") as f:
    for line in f:
        d = json.loads(line)
        pid = d["paragraphId"]
        if pid in holdout_set:
            # modelId looks like "provider/model"; keep only the last path
            # segment as the model name.
            model = d["provenance"]["modelId"].split("/")[-1]
            genai_labels[pid].append(
                (model, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
|
|
|
|
# Opus
# NOTE(review): unlike Stage 1, no holdout filter is applied here —
# presumably this file covers only holdout paragraphs; verify against data.
with open(DATA / "annotations" / "golden" / "opus.jsonl") as f:
    for line in f:
        d = json.loads(line)
        genai_labels[d["paragraphId"]].append(
            ("opus-4.6", abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
        )
|
|
|
|
# Bench-holdout models
# Six benchmark model files; each file name (minus the .jsonl suffix) is
# used verbatim as the model name in genai_labels.
bench_files = [
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
for fname in bench_files:
    fpath = DATA / "annotations" / "bench-holdout" / fname
    model_name = fname.replace(".jsonl", "")
    with open(fpath) as f:
        for line in f:
            d = json.loads(line)
            genai_labels[d["paragraphId"]].append(
                (model_name, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
|
|
|
|
# Paragraph metadata
# id → full JSONL record (used later for wordCount, text, stage1Method),
# restricted to the holdout set.
para_meta: dict[str, dict] = {}
with open(DATA / "gold" / "paragraphs-holdout.jsonl") as f:
    for line in f:
        d = json.loads(line)
        if d["id"] in holdout_set:
            para_meta[d["id"]] = d
|
|
|
|
# ── Compute per-paragraph aggregates ─────────────────────────────────────────
# For each holdout paragraph, collapse the 3 human and (nominally) 10 GenAI
# signals into majority votes, mean specificities, and an agreement count.
# Every later section reads from this `results` list of dicts.

results = []
for pid in holdout_pids:
    h = human_labels[pid]
    g = genai_labels[pid]

    # Split (name, category, specificity) tuples into parallel lists.
    h_cats = [x[1] for x in h]
    h_specs = [x[2] for x in h]
    g_cats = [x[1] for x in g]
    g_specs = [x[2] for x in g]

    # Pooled 13-signal views (humans first, then models).
    all_cats = h_cats + g_cats
    all_specs = h_specs + g_specs

    h_maj = majority_vote(h_cats)
    g_maj = majority_vote(g_cats)
    all_maj = majority_vote(all_cats)

    h_mean_spec = mean_spec(h_specs)
    g_mean_spec = mean_spec(g_specs)
    all_mean_spec = mean_spec(all_specs)

    # Agreement count: how many of 13 agree with overall majority
    # (defined as 0 when the pooled vote is "split").
    agree_count = sum(1 for c in all_cats if c == all_maj) if all_maj != "split" else 0

    # Missing metadata degrades gracefully to word_count=0 / text="".
    meta = para_meta.get(pid, {})

    results.append({
        "pid": pid,
        "h_maj": h_maj,            # human majority category (or "split")
        "g_maj": g_maj,            # GenAI majority category (or "split")
        "all_maj": all_maj,        # pooled 13-signal majority (or "split")
        "h_cats": h_cats,
        "g_cats": g_cats,
        "h_specs": h_specs,
        "g_specs": g_specs,
        "h_mean_spec": h_mean_spec,
        "g_mean_spec": g_mean_spec,
        "all_mean_spec": all_mean_spec,
        "agree_count": agree_count,
        "word_count": meta.get("wordCount", 0),
        "text": meta.get("text", ""),
        "human_annotators": [x[0] for x in h],
        "genai_models": [x[0] for x in g],
        "human_labels": h,         # raw (annotator, cat, spec) tuples
        "genai_labels": g,         # raw (model, cat, spec) tuples
    })
|
|
|
|
|
|
def fmt_table(headers: list[str], rows: list[list], align: list[str] | None = None):
|
|
"""Format a simple text table."""
|
|
col_widths = [len(h) for h in headers]
|
|
str_rows = []
|
|
for row in rows:
|
|
sr = [str(x) for x in row]
|
|
str_rows.append(sr)
|
|
for i, s in enumerate(sr):
|
|
col_widths[i] = max(col_widths[i], len(s))
|
|
|
|
if align is None:
|
|
align = ["r"] * len(headers)
|
|
|
|
def fmt_cell(s, w, a):
|
|
return s.rjust(w) if a == "r" else s.ljust(w)
|
|
|
|
sep = "+-" + "-+-".join("-" * w for w in col_widths) + "-+"
|
|
hdr = "| " + " | ".join(fmt_cell(h, col_widths[i], "l") for i, h in enumerate(headers)) + " |"
|
|
lines = [sep, hdr, sep]
|
|
for sr in str_rows:
|
|
line = "| " + " | ".join(fmt_cell(sr[i], col_widths[i], align[i]) for i in range(len(headers))) + " |"
|
|
lines.append(line)
|
|
lines.append(sep)
|
|
return "\n".join(lines)
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 1. PER-CATEGORY CONFUSION MATRIX: HUMAN MAJORITY vs GENAI MAJORITY
# ══════════════════════════════════════════════════════════════════════════════

print("=" * 80)
print("1. CONFUSION MATRIX: Human Majority (rows) vs GenAI Majority (cols)")
print("=" * 80)

# "split" is treated as an extra label so no paragraph is dropped.
cats_plus = CATS + ["split"]
cm = defaultdict(lambda: defaultdict(int))
for r in results:
    cm[r["h_maj"]][r["g_maj"]] += 1

headers = ["H\\G"] + cats_plus + ["Total"]
rows = []
for hc in cats_plus:
    row = [hc]
    total = 0
    for gc in cats_plus:
        # Reading cm[hc][gc] on a defaultdict inserts missing keys; harmless
        # here since we only ever iterate over the fixed cats_plus axes.
        v = cm[hc][gc]
        row.append(v if v else ".")  # "." keeps zero cells visually quiet
        total += v
    row.append(total)
    rows.append(row)

# Column totals
col_totals = ["Total"]
for gc in cats_plus:
    col_totals.append(sum(cm[hc][gc] for hc in cats_plus))
col_totals.append(sum(sum(cm[hc][gc] for gc in cats_plus) for hc in cats_plus))
rows.append(col_totals)

align = ["l"] + ["r"] * (len(headers) - 1)
print(fmt_table(headers, rows, align))

# Diagonal agreement
# NOTE(review): the diagonal includes the split/split cell, i.e. "both
# sides undecided" counts as agreement — confirm that is intended.
diag = sum(cm[c][c] for c in cats_plus)
total_paras = len(results)
print(f"\nDiagonal agreement: {diag}/{total_paras} = {diag/total_paras:.1%}")
print(f"Disagreement: {total_paras - diag}/{total_paras} = {(total_paras - diag)/total_paras:.1%}")

# Over/under prediction
# Row sums = how often humans chose c; column sums = how often GenAI did.
print("\nGenAI over/under-prediction relative to human majority:")
headers2 = ["Category", "Human N", "GenAI N", "Diff", "Direction"]
rows2 = []
for c in CATS:
    h_n = sum(cm[c][gc] for gc in cats_plus)
    g_n = sum(cm[hc][c] for hc in cats_plus)
    diff = g_n - h_n
    direction = "OVER" if diff > 0 else ("UNDER" if diff < 0 else "MATCH")
    rows2.append([c, h_n, g_n, f"{diff:+d}", direction])
align2 = ["l", "r", "r", "r", "l"]
print(fmt_table(headers2, rows2, align2))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 2. DIRECTIONAL DISAGREEMENT ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("2. DIRECTIONAL DISAGREEMENT: Human Majority -> GenAI Majority transitions")
print("=" * 80)

disagree = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != r["g_maj"]]
print(f"\nTotal disagreements: {len(disagree)}/{total_paras}")

trans = Counter(disagree)
print("\nTop transitions (H_maj -> G_maj):")
headers3 = ["From (Human)", "To (GenAI)", "Count", "Reverse", "Net", "Symmetric?"]
rows3 = []
# Each unordered pair {a, b} is reported once, from the direction seen
# first in descending-count order; the opposite direction is folded into
# the "Reverse" column.
seen = set()
for (a, b), cnt in sorted(trans.items(), key=lambda x: -x[1]):
    pair = tuple(sorted([a, b]))
    if pair in seen:
        continue
    seen.add(pair)
    rev = trans.get((b, a), 0)
    net = cnt - rev
    # Heuristic symmetry flag: |net| within 30% of the smaller direction
    # (with a floor of 1) counts as a symmetric confusion.
    sym = "Yes" if abs(net) <= max(1, min(cnt, rev) * 0.3) else "No"
    rows3.append([a, b, cnt, rev, f"{net:+d}", sym])
align3 = ["l", "l", "r", "r", "r", "l"]
print(fmt_table(headers3, rows3, align3))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 3. PER-CATEGORY PRECISION/RECALL (Human majority as truth)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("3. PER-CATEGORY PRECISION/RECALL (Human majority as ground truth)")
print("=" * 80)

# Filter out splits for clean P/R
valid = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != "split" and r["g_maj"] != "split"]

headers4 = ["Category", "TP", "FP", "FN", "Precision", "Recall", "F1"]
rows4 = []
for c in CATS:
    # One-vs-rest counts with humans as the reference labels.
    tp = sum(1 for h, g in valid if h == c and g == c)
    fp = sum(1 for h, g in valid if h != c and g == c)
    fn = sum(1 for h, g in valid if h == c and g != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
align4 = ["l", "r", "r", "r", "r", "r", "r"]
print("\nGenAI predictions evaluated against human majority:")
print(fmt_table(headers4, rows4, align4))

# Macro averages
# NOTE(review): these re-parse the 3-decimal display strings, so the macro
# figures carry rounding from the formatting step (≤5e-4 per category).
macro_p = sum(float(r[4]) for r in rows4) / len(CATS)
macro_r = sum(float(r[5]) for r in rows4) / len(CATS)
macro_f1 = sum(float(r[6]) for r in rows4) / len(CATS)
print(f"\nMacro-avg: P={macro_p:.3f} R={macro_r:.3f} F1={macro_f1:.3f}")

# Vice versa: GenAI as truth
# Same computation with the roles swapped (GenAI majority as reference).
print("\n--- Vice versa: Human predictions evaluated against GenAI majority ---")
rows4b = []
for c in CATS:
    tp = sum(1 for h, g in valid if g == c and h == c)
    fp = sum(1 for h, g in valid if g != c and h == c)
    fn = sum(1 for h, g in valid if g == c and h != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4b.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
print(fmt_table(headers4, rows4b, align4))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 4. SPECIFICITY SYSTEMATIC BIAS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("4. SPECIFICITY SYSTEMATIC BIAS: Human vs GenAI")
print("=" * 80)

# Overall
# Flatten every individual specificity rating (not per-paragraph means).
all_h_specs = [s for r in results for s in r["h_specs"]]
all_g_specs = [s for r in results for s in r["g_specs"]]
h_avg = mean_spec(all_h_specs)
g_avg = mean_spec(all_g_specs)
print(f"\nOverall mean specificity: Human={h_avg:.3f} GenAI={g_avg:.3f} Diff={g_avg - h_avg:+.3f}")
print(f"Overall median: Human={median_spec(all_h_specs):.1f} GenAI={median_spec(all_g_specs):.1f}")

# Distribution
print("\nSpecificity distribution:")
h_dist = Counter(all_h_specs)
g_dist = Counter(all_g_specs)
headers5 = ["Spec", "Human N", "Human %", "GenAI N", "GenAI %", "Diff %"]
rows5 = []
for s in [1, 2, 3, 4]:
    hn = h_dist.get(s, 0)
    gn = g_dist.get(s, 0)
    hp = hn / len(all_h_specs) * 100
    gp = gn / len(all_g_specs) * 100
    rows5.append([s, hn, f"{hp:.1f}%", gn, f"{gp:.1f}%", f"{gp - hp:+.1f}%"])
print(fmt_table(headers5, rows5, ["r", "r", "r", "r", "r", "r"]))

# By category
print("\nMean specificity by category:")
headers6 = ["Category", "Human", "GenAI", "Diff", "H count", "G count"]
rows6 = []
for c in CATS:
    # Collect the specificity (ann[2]) of every label whose category
    # (ann[1]) is c, across all paragraphs.
    h_s = [s for r in results for ann in r["human_labels"] if ann[1] == c for s in [ann[2]]]
    g_s = [s for r in results for ann in r["genai_labels"] if ann[1] == c for s in [ann[2]]]
    if h_s and g_s:
        hm = mean_spec(h_s)
        gm = mean_spec(g_s)
        rows6.append([c, f"{hm:.3f}", f"{gm:.3f}", f"{gm - hm:+.3f}", len(h_s), len(g_s)])
    else:
        # One side never used this category — no meaningful diff.
        rows6.append([c, "N/A", "N/A", "N/A", len(h_s), len(g_s)])
print(fmt_table(headers6, rows6, ["l", "r", "r", "r", "r", "r"]))

# Per-paragraph directional bias
# "Same" uses a 0.01 tolerance, so the three buckets can overlap slightly
# with the strict > comparisons and need not sum to total_paras.
h_higher = sum(1 for r in results if r["h_mean_spec"] > r["g_mean_spec"])
g_higher = sum(1 for r in results if r["g_mean_spec"] > r["h_mean_spec"])
same = sum(1 for r in results if abs(r["h_mean_spec"] - r["g_mean_spec"]) < 0.01)
print(f"\nPer-paragraph: Human higher spec={h_higher} GenAI higher={g_higher} Same={same}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 5. DIFFICULTY-STRATIFIED ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("5. DIFFICULTY-STRATIFIED ANALYSIS")
print("=" * 80)

# Tiers based on 13-signal agreement
# Tier 1: 10+ agree, Tier 2: 7-9 agree, Tier 3: 5-6 agree, Tier 4: <5 agree
|
|
def get_tier(agree_count: int) -> str:
    """Bucket a 13-signal agreement count into a difficulty-tier label.

    10+ → T1-Easy, 7-9 → T2-Medium, 5-6 → T3-Hard, below 5 → T4-VHard.
    """
    # Thresholds checked highest-first; first floor reached wins.
    for floor, label in ((10, "T1-Easy"), (7, "T2-Medium"), (5, "T3-Hard")):
        if agree_count >= floor:
            return label
    return "T4-VHard"
|
|
|
|
# Tag every paragraph with its difficulty tier (mutates the result dicts).
for r in results:
    r["tier"] = get_tier(r["agree_count"])

tier_counts = Counter(r["tier"] for r in results)
print(f"\nTier distribution:")
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    print(f"  {t}: {tier_counts.get(t, 0)} paragraphs")

print("\nHuman-GenAI category agreement rate by difficulty tier:")
headers7 = ["Tier", "N", "Agree", "Agree%", "H=consensus%", "G=consensus%"]
rows7 = []
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    tier_r = [r for r in results if r["tier"] == t]
    n = len(tier_r)
    if n == 0:
        continue  # skip empty tiers rather than divide by zero
    agree = sum(1 for r in tier_r if r["h_maj"] == r["g_maj"])
    # How often each side's majority matches the pooled 13-signal majority.
    h_match_cons = sum(1 for r in tier_r if r["h_maj"] == r["all_maj"])
    g_match_cons = sum(1 for r in tier_r if r["g_maj"] == r["all_maj"])
    rows7.append([
        t, n, agree, f"{agree/n:.1%}",
        f"{h_match_cons/n:.1%}", f"{g_match_cons/n:.1%}"
    ])
print(fmt_table(headers7, rows7, ["l", "r", "r", "r", "r", "r"]))

# On hard paragraphs, who is the odd one out?
# The four counters partition the hard set by which side matches consensus.
print("\nOn hard paragraphs (T3+T4), disagreement breakdown:")
hard = [r for r in results if r["tier"] in ("T3-Hard", "T4-VHard")]
h_odd = sum(1 for r in hard if r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"])
g_odd = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"])
both_off = sum(1 for r in hard if r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"])
both_on = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"])
print(f"  Human is odd-one-out (GenAI=consensus, Human!=consensus): {h_odd}")
print(f"  GenAI is odd-one-out (Human=consensus, GenAI!=consensus): {g_odd}")
print(f"  Both match consensus: {both_on}")
print(f"  Both differ from consensus: {both_off}")
print(f"  Total hard: {len(hard)}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 6. ANNOTATOR-LEVEL PATTERNS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("6. ANNOTATOR-LEVEL PATTERNS")
print("=" * 80)

annotators = ["Anuj", "Elisabeth", "Joey", "Meghan", "Xander", "Aaryan"]

# For each annotator, compute agreement with GenAI majority
print("\nPer-annotator agreement with GenAI majority (category):")
headers8 = ["Annotator", "N labels", "Agree w/G_maj", "Agree%", "Agree w/13_maj", "13_maj%", "Avg Spec", "Note"]
rows8 = []
for ann in annotators:
    agree_g = 0
    agree_all = 0
    total = 0
    specs = []
    # Scan every paragraph for this annotator's labels (an annotator labels
    # only a subset of paragraphs under the BIBD design mentioned in the
    # module docstring).
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                total += 1
                specs.append(spec)
                if cat == r["g_maj"]:
                    agree_g += 1
                if cat == r["all_maj"]:
                    agree_all += 1
    if total == 0:
        continue  # annotator not present in the holdout labels
    note = "(excluded from aggregates)" if ann == "Aaryan" else ""
    rows8.append([
        ann, total,
        agree_g, f"{agree_g/total:.1%}",
        agree_all, f"{agree_all/total:.1%}",
        f"{mean_spec(specs):.2f}",
        note,
    ])
align8 = ["l", "r", "r", "r", "r", "r", "r", "l"]
print(fmt_table(headers8, rows8, align8))

# Annotator category distributions
print("\nPer-annotator category distribution:")
for ann in annotators:
    cat_counts = Counter()
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                cat_counts[cat] += 1
    if not cat_counts:
        continue
    total = sum(cat_counts.values())
    dist = "  ".join(f"{c}:{cat_counts.get(c, 0):3d}({cat_counts.get(c, 0)/total:.0%})" for c in CATS)
    flag = "  ** OUTLIER" if ann == "Aaryan" else ""
    print(f"  {ann:10s} (n={total:3d}): {dist}{flag}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 7. TEXT-FEATURE CORRELATIONS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("7. TEXT-FEATURE CORRELATIONS WITH DISAGREEMENT")
print("=" * 80)

agree_r = [r for r in results if r["h_maj"] == r["g_maj"]]
disagree_r = [r for r in results if r["h_maj"] != r["g_maj"]]

# Word count
# mean_spec/median_spec are reused here as generic mean/median helpers on
# word counts, despite the "spec" naming.
# NOTE(review): median_spec raises IndexError on an empty list — this
# assumes both the agree and disagree groups are non-empty.
agree_wc = [r["word_count"] for r in agree_r if r["word_count"] > 0]
disagree_wc = [r["word_count"] for r in disagree_r if r["word_count"] > 0]
print(f"\nWord count (agree vs disagree):")
print(f"  Agreement paragraphs: mean={mean_spec(agree_wc):.1f} median={median_spec(agree_wc):.0f} n={len(agree_wc)}")
print(f"  Disagreement paragraphs: mean={mean_spec(disagree_wc):.1f} median={median_spec(disagree_wc):.0f} n={len(disagree_wc)}")

# Word count buckets
print("\nDisagreement rate by word count bucket:")
buckets = [(0, 30, "0-30"), (31, 60, "31-60"), (61, 100, "61-100"), (101, 150, "101-150"), (151, 250, "151-250"), (251, 9999, "251+")]
headers9 = ["WC Bucket", "N", "Disagree", "Disagree%"]
rows9 = []
for lo, hi, label in buckets:
    in_bucket = [r for r in results if lo <= r["word_count"] <= hi]
    dis = sum(1 for r in in_bucket if r["h_maj"] != r["g_maj"])
    if in_bucket:
        rows9.append([label, len(in_bucket), dis, f"{dis/len(in_bucket):.1%}"])
print(fmt_table(headers9, rows9, ["l", "r", "r", "r"]))

# Stage1 method (unanimous vs majority) as proxy for quality tier
print("\nDisagreement rate by Stage 1 confidence method:")
for method in ["unanimous", "majority"]:
    in_method = [r for r in results if para_meta.get(r["pid"], {}).get("stage1Method") == method]
    dis = sum(1 for r in in_method if r["h_maj"] != r["g_maj"])
    if in_method:
        print(f"  {method:10s}: {dis}/{len(in_method)} = {dis/len(in_method):.1%} disagree")

# Keyword analysis
# Case-insensitive substring match against the paragraph text; a paragraph
# can count toward several keywords.
print("\nDisagreement rate for paragraphs containing key terms:")
keywords = ["material", "NIST", "CISO", "board", "third party", "third-party", "incident",
            "insurance", "audit", "framework", "breach", "ransomware"]
headers10 = ["Keyword", "N", "Disagree", "Disagree%"]
rows10 = []
for kw in keywords:
    matching = [r for r in results if kw.lower() in r["text"].lower()]
    if not matching:
        continue
    dis = sum(1 for r in matching if r["h_maj"] != r["g_maj"])
    rows10.append([kw, len(matching), dis, f"{dis/len(matching):.1%}"])
# Sort by raw disagreement count, descending (x[2] is already an int).
rows10.sort(key=lambda x: -int(x[2]))
print(fmt_table(headers10, rows10, ["l", "r", "r", "r"]))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 8. "HUMAN RIGHT, GenAI WRONG" vs "GenAI RIGHT, HUMAN WRONG"
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("8. HUMAN RIGHT/GENAI WRONG vs GENAI RIGHT/HUMAN WRONG (13-signal consensus)")
print("=" * 80)

# Only consider paragraphs where all_maj is not split and h/g disagree with each other or consensus
# "Right"/"wrong" here means matching or missing the pooled 13-signal
# majority; the five lists partition all paragraphs.
h_right_g_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"]]
g_right_h_wrong = [r for r in results if r["all_maj"] != "split" and r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"]]
both_right = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"]]
both_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"]]
has_split = [r for r in results if r["all_maj"] == "split"]

print(f"\n  Both correct: {len(both_right)}")
print(f"  Human right, GenAI wrong: {len(h_right_g_wrong)}")
print(f"  GenAI right, Human wrong: {len(g_right_h_wrong)}")
print(f"  Both wrong: {len(both_wrong)}")
print(f"  13-signal split (no consensus): {len(has_split)}")

# Category breakdown
# Which consensus categories the GenAI side misses.
print("\nCategory breakdown of 'Human right, GenAI wrong':")
cat_dist_hrg = Counter(r["all_maj"] for r in h_right_g_wrong)
for c in CATS:
    n = cat_dist_hrg.get(c, 0)
    if n > 0:
        print(f"  {c}: {n}")

# Which consensus categories the human side misses.
print("\nCategory breakdown of 'GenAI right, Human wrong':")
cat_dist_grh = Counter(r["all_maj"] for r in g_right_h_wrong)
for c in CATS:
    n = cat_dist_grh.get(c, 0)
    if n > 0:
        print(f"  {c}: {n}")

# What did the wrong side predict?
print("\nWhen GenAI is wrong, what does it predict instead?")
wrong_g = Counter(r["g_maj"] for r in h_right_g_wrong)
for label, cnt in wrong_g.most_common():
    print(f"  {label}: {cnt}")

print("\nWhen Human is wrong, what do they predict instead?")
wrong_h = Counter(r["h_maj"] for r in g_right_h_wrong)
for label, cnt in wrong_h.most_common():
    print(f"  {label}: {cnt}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 9. SPECIFICITY BY SOURCE TYPE
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("9. SPECIFICITY BY SOURCE TYPE AND CATEGORY")
print("=" * 80)

# Group models into source types
stage1_models = {"gemini-3.1-flash-lite-preview", "grok-4.1-fast", "mimo-v2-flash"}
frontier_models = {"opus-4.6", "gpt-5.4", "gemini-3.1-pro-preview", "kimi-k2.5"}
budget_models = {"glm-5:exacto", "mimo-v2-pro:exacto", "minimax-m2.7:exacto"}

# Collect specs by source type and category
# Each inner dict maps category → flat list of specificity ratings; the
# synthetic "ALL" key aggregates across categories.
source_specs: dict[str, dict[str, list[int]]] = {
    "Human": defaultdict(list),
    "Stage1": defaultdict(list),
    "Frontier": defaultdict(list),
    "Budget": defaultdict(list),
}

for r in results:
    for name, cat, spec in r["human_labels"]:
        source_specs["Human"][cat].append(spec)
        source_specs["Human"]["ALL"].append(spec)

    for model, cat, spec in r["genai_labels"]:
        if model in stage1_models:
            src = "Stage1"
        elif model in frontier_models:
            src = "Frontier"
        elif model in budget_models:
            src = "Budget"
        else:
            # Unrecognized model names are silently bucketed as Budget.
            src = "Budget"  # fallback
        source_specs[src][cat].append(spec)
        source_specs[src]["ALL"].append(spec)

print("\nMean specificity by source type and category:")
src_order = ["Human", "Stage1", "Frontier", "Budget"]
headers11 = ["Category"] + src_order
rows11 = []
for c in CATS + ["ALL"]:
    row = [c]
    for src in src_order:
        specs = source_specs[src].get(c, [])
        if specs:
            row.append(f"{mean_spec(specs):.3f}")
        else:
            row.append("N/A")
    rows11.append(row)
align11 = ["l"] + ["r"] * len(src_order)
print(fmt_table(headers11, rows11, align11))

# Specificity standard deviation by source
print("\nSpecificity std dev by source type:")
# NOTE(review): mid-file import; conventionally this belongs with the
# imports at the top of the module.
import math
for src in src_order:
    specs = source_specs[src]["ALL"]
    if specs:
        m = mean_spec(specs)
        # Population variance (divides by n, not n-1).
        var = sum((s - m) ** 2 for s in specs) / len(specs)
        std = math.sqrt(var)
        print(f"  {src:10s}: mean={m:.3f} std={std:.3f} n={len(specs)}")

# ── Per-model specificity rankings ───────────────────────────────────────────
# "H:" / "G:" prefixes keep human annotators and GenAI models distinct in
# one ranking table, sorted by ascending mean specificity.
print("\nPer-model mean specificity (all categories):")
model_specs: dict[str, list[int]] = defaultdict(list)
for r in results:
    for name, cat, spec in r["human_labels"]:
        model_specs[f"H:{name}"].append(spec)
    for model, cat, spec in r["genai_labels"]:
        model_specs[f"G:{model}"].append(spec)

headers12 = ["Model", "Mean Spec", "N"]
rows12 = []
for model, specs in sorted(model_specs.items(), key=lambda x: mean_spec(x[1])):
    rows12.append([model, f"{mean_spec(specs):.3f}", len(specs)])
print(fmt_table(headers12, rows12, ["l", "r", "r"]))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("SUMMARY OF KEY FINDINGS")
print("=" * 80)

# Narrative recap interpolating headline numbers computed in sections 1-8;
# this block must therefore run after all of them.
print(f"""
Dataset: {total_paras} paragraphs, 13 signals each (3 human, 10 GenAI)

1. CATEGORY AGREEMENT: Human majority and GenAI majority agree on {diag/total_paras:.1%} of
   paragraphs. The biggest confusions are in the off-diagonal cells above.

2. DIRECTIONAL DISAGREEMENTS: The most common category swaps reveal systematic
   differences in how humans and GenAI interpret boundary cases.

3. PRECISION/RECALL: GenAI macro F1={macro_f1:.3f} against human majority.

4. SPECIFICITY BIAS: Human mean={h_avg:.3f}, GenAI mean={g_avg:.3f}
   (diff={g_avg - h_avg:+.3f}). {"GenAI rates higher" if g_avg > h_avg else "Humans rate higher"} on average.

5. DIFFICULTY: On easy paragraphs (T1, 10+/13 agree), agreement is very high.
   On hard paragraphs, {"humans" if h_odd > g_odd else "GenAI"} are more often the odd-one-out.

6. ANNOTATORS: See table above for individual alignment with GenAI and consensus.

7. TEXT FEATURES: {"Longer" if mean_spec(disagree_wc) > mean_spec(agree_wc) else "Shorter"} paragraphs
   tend to produce more disagreement.

8. RIGHT/WRONG: Human right & GenAI wrong: {len(h_right_g_wrong)}, GenAI right &
   Human wrong: {len(g_right_h_wrong)}. {"Humans are more often right" if len(h_right_g_wrong) > len(g_right_h_wrong) else "GenAI is more often right"} when they disagree.
""")
|