SEC-cyBERT/scripts/cross-analyze-human-vs-genai.py
2026-04-03 14:43:53 -04:00

715 lines
30 KiB
Python

"""
Cross-analysis: Human annotators vs GenAI models on 1,200-paragraph holdout set.
Categories: BG, ID, MR, N/O, RMP, SI, TPR
Specificity: 1-4
13 signals per paragraph: 3 human (BIBD), 3 Stage 1, 1 Opus 4.6, 6 benchmark
"""
import json
import sys
from collections import Counter, defaultdict
from pathlib import Path
# ── Category abbreviation mapping ────────────────────────────────────────────
# Full category names (as they appear in the raw label files) -> short codes.
FULL_TO_ABBR: dict[str, str] = {
    "Board Governance": "BG",
    "Incident Disclosure": "ID",
    "Management Role": "MR",
    "None/Other": "N/O",
    "Risk Management Process": "RMP",
    "Strategy Integration": "SI",
    "Third-Party Risk": "TPR",
}
# Reverse lookup: abbreviation -> full category name.
ABBR_TO_FULL: dict[str, str] = {v: k for k, v in FULL_TO_ABBR.items()}
# Canonical category order used by every table in this report.
CATS: list[str] = ["BG", "ID", "MR", "N/O", "RMP", "SI", "TPR"]
# Root directory for all input JSONL files.
DATA = Path("data")
def abbr(cat: str) -> str:
    """Translate a full category name to its short code; unknown names pass through."""
    try:
        return FULL_TO_ABBR[cat]
    except KeyError:
        return cat
def majority_vote(labels: list[str]) -> str:
    """Return the consensus label for a list of votes.

    Resolution order:
      1. A strict majority (> half the votes) wins outright.
      2. Otherwise a unique plurality wins.
      3. A tie for first place — or an empty vote list — yields "split".

    (The original docstring claimed "majority or split", but the code has
    always returned the plurality winner when there is no tie; the docstring
    now matches the behavior. Empty input previously raised IndexError.)
    """
    if not labels:
        return "split"
    counts = Counter(labels)
    ranked = counts.most_common(2)
    label, n = ranked[0]
    if n > len(labels) / 2:
        return label
    # No strict majority: a tie between the two most common labels is a
    # split; otherwise the unique plurality carries the vote.
    if len(ranked) == 2 and ranked[1][1] == n:
        return "split"
    return label
def median_spec(specs: list[int]) -> float:
    """Return the median of *specs* as a float; 0.0 for an empty list.

    The empty-list guard mirrors mean_spec: both helpers are applied to the
    same, possibly empty, word-count lists in the text-feature section, and
    the original version raised IndexError on [].
    """
    if not specs:
        return 0.0
    s = sorted(specs)
    n = len(s)
    mid = n // 2
    if n % 2 == 1:
        return float(s[mid])
    # Even count: average the two middle elements.
    return (s[mid - 1] + s[mid]) / 2.0
def mean_spec(specs: list[int]) -> float:
    """Arithmetic mean of *specs*; 0.0 for an empty list."""
    if not specs:
        return 0.0
    return sum(specs) / len(specs)
# ── Load data ────────────────────────────────────────────────────────────────
print("Loading data...\n")
# Human labels: paragraphId -> list of (annotatorName, category, specificity).
# Categories are normalized to their abbreviations on the way in.
human_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)
with open(DATA / "gold" / "human-labels-raw.jsonl") as f:
    for line in f:
        d = json.loads(line)
        human_labels[d["paragraphId"]].append(
            (d["annotatorName"], abbr(d["contentCategory"]), d["specificityLevel"])
        )
# The holdout set is defined as exactly the paragraphs the humans labeled.
holdout_pids = sorted(human_labels.keys())
# Sanity check on expected holdout size (NOTE: stripped under `python -O`).
assert len(holdout_pids) == 1200, f"Expected 1200 holdout paragraphs, got {len(holdout_pids)}"
# GenAI labels: paragraphId -> list of (modelName, category, specificity)
genai_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)
# Stage 1 annotations cover more than the holdout, so filter to holdout only.
holdout_set = set(holdout_pids)
with open(DATA / "annotations" / "stage1.patched.jsonl") as f:
    for line in f:
        d = json.loads(line)
        pid = d["paragraphId"]
        if pid in holdout_set:
            # modelId is "provider/model"; keep only the short model name
            # (these short names must match the source-type sets in section 9).
            model = d["provenance"]["modelId"].split("/")[-1]
            genai_labels[pid].append(
                (model, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
# Opus golden annotations (no holdout filter applied here — presumably the
# file is holdout-only; verify against the data pipeline).
with open(DATA / "annotations" / "golden" / "opus.jsonl") as f:
    for line in f:
        d = json.loads(line)
        genai_labels[d["paragraphId"]].append(
            ("opus-4.6", abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
        )
# Six benchmark models run on the holdout set; the model name is derived
# from the file name.
bench_files = [
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
for fname in bench_files:
    fpath = DATA / "annotations" / "bench-holdout" / fname
    model_name = fname.replace(".jsonl", "")
    with open(fpath) as f:
        for line in f:
            d = json.loads(line)
            genai_labels[d["paragraphId"]].append(
                (model_name, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
# Paragraph metadata (text, word count, Stage 1 method, ...) keyed by id,
# restricted to the holdout.
para_meta: dict[str, dict] = {}
with open(DATA / "gold" / "paragraphs-holdout.jsonl") as f:
    for line in f:
        d = json.loads(line)
        if d["id"] in holdout_set:
            para_meta[d["id"]] = d
# ── Compute per-paragraph aggregates ─────────────────────────────────────────
# One dict per holdout paragraph, combining human and GenAI signals.
results = []
for pid in holdout_pids:
    h = human_labels[pid]
    g = genai_labels[pid]
    # Split the (name, category, specificity) triples into parallel lists.
    h_cats = [x[1] for x in h]
    h_specs = [x[2] for x in h]
    g_cats = [x[1] for x in g]
    g_specs = [x[2] for x in g]
    all_cats = h_cats + g_cats
    all_specs = h_specs + g_specs
    # Majority votes within each signal group and across all 13 signals.
    h_maj = majority_vote(h_cats)
    g_maj = majority_vote(g_cats)
    all_maj = majority_vote(all_cats)
    h_mean_spec = mean_spec(h_specs)
    g_mean_spec = mean_spec(g_specs)
    all_mean_spec = mean_spec(all_specs)
    # Agreement count: how many of 13 agree with overall majority
    # (defined as 0 when the 13-signal vote has no winner at all).
    agree_count = sum(1 for c in all_cats if c == all_maj) if all_maj != "split" else 0
    # Metadata may be missing for a paragraph; fall back to empty defaults.
    meta = para_meta.get(pid, {})
    results.append({
        "pid": pid,
        "h_maj": h_maj,
        "g_maj": g_maj,
        "all_maj": all_maj,
        "h_cats": h_cats,
        "g_cats": g_cats,
        "h_specs": h_specs,
        "g_specs": g_specs,
        "h_mean_spec": h_mean_spec,
        "g_mean_spec": g_mean_spec,
        "all_mean_spec": all_mean_spec,
        "agree_count": agree_count,
        "word_count": meta.get("wordCount", 0),
        "text": meta.get("text", ""),
        "human_annotators": [x[0] for x in h],
        "genai_models": [x[0] for x in g],
        "human_labels": h,
        "genai_labels": g,
    })
def fmt_table(headers: list[str], rows: list[list], align: list[str] | None = None):
"""Format a simple text table."""
col_widths = [len(h) for h in headers]
str_rows = []
for row in rows:
sr = [str(x) for x in row]
str_rows.append(sr)
for i, s in enumerate(sr):
col_widths[i] = max(col_widths[i], len(s))
if align is None:
align = ["r"] * len(headers)
def fmt_cell(s, w, a):
return s.rjust(w) if a == "r" else s.ljust(w)
sep = "+-" + "-+-".join("-" * w for w in col_widths) + "-+"
hdr = "| " + " | ".join(fmt_cell(h, col_widths[i], "l") for i, h in enumerate(headers)) + " |"
lines = [sep, hdr, sep]
for sr in str_rows:
line = "| " + " | ".join(fmt_cell(sr[i], col_widths[i], align[i]) for i in range(len(headers))) + " |"
lines.append(line)
lines.append(sep)
return "\n".join(lines)
# ══════════════════════════════════════════════════════════════════════════════
# 1. PER-CATEGORY CONFUSION MATRIX: HUMAN MAJORITY vs GENAI MAJORITY
# ══════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("1. CONFUSION MATRIX: Human Majority (rows) vs GenAI Majority (cols)")
print("=" * 80)
# "split" is treated as an extra pseudo-category on both axes.
cats_plus = CATS + ["split"]
cm = defaultdict(lambda: defaultdict(int))
for r in results:
    cm[r["h_maj"]][r["g_maj"]] += 1
headers = ["H\\G"] + cats_plus + ["Total"]
rows = []
for hc in cats_plus:
    row = [hc]
    total = 0
    for gc in cats_plus:
        v = cm[hc][gc]
        # Render zero cells as "." so the matrix is easier to scan.
        row.append(v if v else ".")
        total += v
    row.append(total)
    rows.append(row)
# Column totals
col_totals = ["Total"]
for gc in cats_plus:
    col_totals.append(sum(cm[hc][gc] for hc in cats_plus))
# Grand total in the bottom-right corner.
col_totals.append(sum(sum(cm[hc][gc] for gc in cats_plus) for hc in cats_plus))
rows.append(col_totals)
align = ["l"] + ["r"] * (len(headers) - 1)
print(fmt_table(headers, rows, align))
# Diagonal agreement: paragraphs where both majorities coincide (including
# the case where both are "split").
diag = sum(cm[c][c] for c in cats_plus)
total_paras = len(results)
print(f"\nDiagonal agreement: {diag}/{total_paras} = {diag/total_paras:.1%}")
print(f"Disagreement: {total_paras - diag}/{total_paras} = {(total_paras - diag)/total_paras:.1%}")
# Over/under prediction: compare row totals (human) vs column totals (GenAI)
# per category.
print("\nGenAI over/under-prediction relative to human majority:")
headers2 = ["Category", "Human N", "GenAI N", "Diff", "Direction"]
rows2 = []
for c in CATS:
    h_n = sum(cm[c][gc] for gc in cats_plus)
    g_n = sum(cm[hc][c] for hc in cats_plus)
    diff = g_n - h_n
    direction = "OVER" if diff > 0 else ("UNDER" if diff < 0 else "MATCH")
    rows2.append([c, h_n, g_n, f"{diff:+d}", direction])
align2 = ["l", "r", "r", "r", "l"]
print(fmt_table(headers2, rows2, align2))
# ══════════════════════════════════════════════════════════════════════════════
# 2. DIRECTIONAL DISAGREEMENT ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("2. DIRECTIONAL DISAGREEMENT: Human Majority -> GenAI Majority transitions")
print("=" * 80)
disagree = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != r["g_maj"]]
print(f"\nTotal disagreements: {len(disagree)}/{total_paras}")
trans = Counter(disagree)
print("\nTop transitions (H_maj -> G_maj):")
headers3 = ["From (Human)", "To (GenAI)", "Count", "Reverse", "Net", "Symmetric?"]
rows3 = []
seen = set()
# Walk transitions by descending count, emitting each unordered pair once:
# the more frequent direction becomes the row; the opposite direction is
# reported in the "Reverse" column.
for (a, b), cnt in sorted(trans.items(), key=lambda x: -x[1]):
    pair = tuple(sorted([a, b]))
    if pair in seen:
        continue
    seen.add(pair)
    rev = trans.get((b, a), 0)
    net = cnt - rev
    # Heuristic symmetry test: net flow is within 30% of the smaller
    # direction, with a floor of 1.
    sym = "Yes" if abs(net) <= max(1, min(cnt, rev) * 0.3) else "No"
    rows3.append([a, b, cnt, rev, f"{net:+d}", sym])
align3 = ["l", "l", "r", "r", "r", "l"]
print(fmt_table(headers3, rows3, align3))
# ══════════════════════════════════════════════════════════════════════════════
# 3. PER-CATEGORY PRECISION/RECALL (Human majority as truth)
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("3. PER-CATEGORY PRECISION/RECALL (Human majority as ground truth)")
print("=" * 80)
# Filter out splits for clean P/R
valid = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != "split" and r["g_maj"] != "split"]
headers4 = ["Category", "TP", "FP", "FN", "Precision", "Recall", "F1"]
rows4 = []
for c in CATS:
    # One-vs-rest counts, treating the human majority as the reference label.
    tp = sum(1 for h, g in valid if h == c and g == c)
    fp = sum(1 for h, g in valid if h != c and g == c)
    fn = sum(1 for h, g in valid if h == c and g != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
align4 = ["l", "r", "r", "r", "r", "r", "r"]
print("\nGenAI predictions evaluated against human majority:")
print(fmt_table(headers4, rows4, align4))
# Macro averages
# NOTE: these re-parse the already-formatted table cells, so they are
# computed from values rounded to 3 decimal places.
macro_p = sum(float(r[4]) for r in rows4) / len(CATS)
macro_r = sum(float(r[5]) for r in rows4) / len(CATS)
macro_f1 = sum(float(r[6]) for r in rows4) / len(CATS)
print(f"\nMacro-avg: P={macro_p:.3f} R={macro_r:.3f} F1={macro_f1:.3f}")
# Vice versa: GenAI as truth
print("\n--- Vice versa: Human predictions evaluated against GenAI majority ---")
rows4b = []
for c in CATS:
    # Same one-vs-rest counts with the truth/prediction roles reversed.
    tp = sum(1 for h, g in valid if g == c and h == c)
    fp = sum(1 for h, g in valid if g != c and h == c)
    fn = sum(1 for h, g in valid if g == c and h != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4b.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
print(fmt_table(headers4, rows4b, align4))
# ══════════════════════════════════════════════════════════════════════════════
# 4. SPECIFICITY SYSTEMATIC BIAS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("4. SPECIFICITY SYSTEMATIC BIAS: Human vs GenAI")
print("=" * 80)
# Overall: pool every individual rating (not per-paragraph means).
all_h_specs = [s for r in results for s in r["h_specs"]]
all_g_specs = [s for r in results for s in r["g_specs"]]
h_avg = mean_spec(all_h_specs)
g_avg = mean_spec(all_g_specs)
print(f"\nOverall mean specificity: Human={h_avg:.3f} GenAI={g_avg:.3f} Diff={g_avg - h_avg:+.3f}")
print(f"Overall median: Human={median_spec(all_h_specs):.1f} GenAI={median_spec(all_g_specs):.1f}")
# Distribution across the four specificity levels.
print("\nSpecificity distribution:")
h_dist = Counter(all_h_specs)
g_dist = Counter(all_g_specs)
headers5 = ["Spec", "Human N", "Human %", "GenAI N", "GenAI %", "Diff %"]
rows5 = []
for s in [1, 2, 3, 4]:
    hn = h_dist.get(s, 0)
    gn = g_dist.get(s, 0)
    hp = hn / len(all_h_specs) * 100
    gp = gn / len(all_g_specs) * 100
    rows5.append([s, hn, f"{hp:.1f}%", gn, f"{gp:.1f}%", f"{gp - hp:+.1f}%"])
print(fmt_table(headers5, rows5, ["r", "r", "r", "r", "r", "r"]))
# By category
print("\nMean specificity by category:")
headers6 = ["Category", "Human", "GenAI", "Diff", "H count", "G count"]
rows6 = []
for c in CATS:
    # The trailing "for s in [ann[2]]" clause just extracts the specificity
    # of annotations whose category matches c.
    h_s = [s for r in results for ann in r["human_labels"] if ann[1] == c for s in [ann[2]]]
    g_s = [s for r in results for ann in r["genai_labels"] if ann[1] == c for s in [ann[2]]]
    if h_s and g_s:
        hm = mean_spec(h_s)
        gm = mean_spec(g_s)
        rows6.append([c, f"{hm:.3f}", f"{gm:.3f}", f"{gm - hm:+.3f}", len(h_s), len(g_s)])
    else:
        # A category with no ratings on one side cannot be compared.
        rows6.append([c, "N/A", "N/A", "N/A", len(h_s), len(g_s)])
print(fmt_table(headers6, rows6, ["l", "r", "r", "r", "r", "r"]))
# Per-paragraph directional bias (0.01 tolerance defines "same").
h_higher = sum(1 for r in results if r["h_mean_spec"] > r["g_mean_spec"])
g_higher = sum(1 for r in results if r["g_mean_spec"] > r["h_mean_spec"])
same = sum(1 for r in results if abs(r["h_mean_spec"] - r["g_mean_spec"]) < 0.01)
print(f"\nPer-paragraph: Human higher spec={h_higher} GenAI higher={g_higher} Same={same}")
# ══════════════════════════════════════════════════════════════════════════════
# 5. DIFFICULTY-STRATIFIED ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("5. DIFFICULTY-STRATIFIED ANALYSIS")
print("=" * 80)
# Difficulty tiers are based on 13-signal agreement with the overall majority:
# Tier 1: 10+ agree, Tier 2: 7-9 agree, Tier 3: 5-6 agree, Tier 4: <5 agree
# ("<5" includes paragraphs whose 13-signal vote was split, since those get
# agree_count = 0 above).
def get_tier(agree_count: int) -> str:
    """Map a 13-signal agreement count to a difficulty-tier label.

    10+ -> T1-Easy, 7-9 -> T2-Medium, 5-6 -> T3-Hard, below 5 -> T4-VHard.
    """
    for threshold, label in ((10, "T1-Easy"), (7, "T2-Medium"), (5, "T3-Hard")):
        if agree_count >= threshold:
            return label
    return "T4-VHard"
# Tag every paragraph with its difficulty tier.
for r in results:
    r["tier"] = get_tier(r["agree_count"])
tier_counts = Counter(r["tier"] for r in results)
print(f"\nTier distribution:")
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    print(f" {t}: {tier_counts.get(t, 0)} paragraphs")
print("\nHuman-GenAI category agreement rate by difficulty tier:")
headers7 = ["Tier", "N", "Agree", "Agree%", "H=consensus%", "G=consensus%"]
rows7 = []
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    tier_r = [r for r in results if r["tier"] == t]
    n = len(tier_r)
    if n == 0:
        # Empty tier: skip so no division by zero below.
        continue
    agree = sum(1 for r in tier_r if r["h_maj"] == r["g_maj"])
    h_match_cons = sum(1 for r in tier_r if r["h_maj"] == r["all_maj"])
    g_match_cons = sum(1 for r in tier_r if r["g_maj"] == r["all_maj"])
    rows7.append([
        t, n, agree, f"{agree/n:.1%}",
        f"{h_match_cons/n:.1%}", f"{g_match_cons/n:.1%}"
    ])
print(fmt_table(headers7, rows7, ["l", "r", "r", "r", "r", "r"]))
# On hard paragraphs, who is the odd one out relative to the 13-signal
# consensus? The four counts below partition the hard set (minus splits).
print("\nOn hard paragraphs (T3+T4), disagreement breakdown:")
hard = [r for r in results if r["tier"] in ("T3-Hard", "T4-VHard")]
h_odd = sum(1 for r in hard if r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"])
g_odd = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"])
both_off = sum(1 for r in hard if r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"])
both_on = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"])
print(f" Human is odd-one-out (GenAI=consensus, Human!=consensus): {h_odd}")
print(f" GenAI is odd-one-out (Human=consensus, GenAI!=consensus): {g_odd}")
print(f" Both match consensus: {both_on}")
print(f" Both differ from consensus: {both_off}")
print(f" Total hard: {len(hard)}")
# ══════════════════════════════════════════════════════════════════════════════
# 6. ANNOTATOR-LEVEL PATTERNS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("6. ANNOTATOR-LEVEL PATTERNS")
print("=" * 80)
# Roster of human annotators; Aaryan is flagged below as excluded/outlier.
annotators = ["Anuj", "Elisabeth", "Joey", "Meghan", "Xander", "Aaryan"]
# For each annotator, compute agreement with GenAI majority
print("\nPer-annotator agreement with GenAI majority (category):")
headers8 = ["Annotator", "N labels", "Agree w/G_maj", "Agree%", "Agree w/13_maj", "13_maj%", "Avg Spec", "Note"]
rows8 = []
for ann in annotators:
    agree_g = 0
    agree_all = 0
    total = 0
    specs = []
    # Scan every paragraph for this annotator's labels and tally agreement
    # with the GenAI majority and the 13-signal majority.
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                total += 1
                specs.append(spec)
                if cat == r["g_maj"]:
                    agree_g += 1
                if cat == r["all_maj"]:
                    agree_all += 1
    if total == 0:
        # Annotator has no labels in the holdout; omit the row entirely.
        continue
    note = "(excluded from aggregates)" if ann == "Aaryan" else ""
    rows8.append([
        ann, total,
        agree_g, f"{agree_g/total:.1%}",
        agree_all, f"{agree_all/total:.1%}",
        f"{mean_spec(specs):.2f}",
        note,
    ])
align8 = ["l", "r", "r", "r", "r", "r", "r", "l"]
print(fmt_table(headers8, rows8, align8))
# Annotator category distributions
print("\nPer-annotator category distribution:")
for ann in annotators:
    cat_counts = Counter()
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                cat_counts[cat] += 1
    if not cat_counts:
        continue
    total = sum(cat_counts.values())
    # One compact line per annotator: count and share per category.
    dist = " ".join(f"{c}:{cat_counts.get(c, 0):3d}({cat_counts.get(c, 0)/total:.0%})" for c in CATS)
    flag = " ** OUTLIER" if ann == "Aaryan" else ""
    print(f" {ann:10s} (n={total:3d}): {dist}{flag}")
# ══════════════════════════════════════════════════════════════════════════════
# 7. TEXT-FEATURE CORRELATIONS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("7. TEXT-FEATURE CORRELATIONS WITH DISAGREEMENT")
print("=" * 80)
agree_r = [r for r in results if r["h_maj"] == r["g_maj"]]
disagree_r = [r for r in results if r["h_maj"] != r["g_maj"]]
# Word count: paragraphs with missing metadata (word_count defaulted to 0)
# are dropped. NOTE: mean_spec/median_spec are reused here as generic
# mean/median helpers despite their names.
agree_wc = [r["word_count"] for r in agree_r if r["word_count"] > 0]
disagree_wc = [r["word_count"] for r in disagree_r if r["word_count"] > 0]
print(f"\nWord count (agree vs disagree):")
print(f" Agreement paragraphs: mean={mean_spec(agree_wc):.1f} median={median_spec(agree_wc):.0f} n={len(agree_wc)}")
print(f" Disagreement paragraphs: mean={mean_spec(disagree_wc):.1f} median={median_spec(disagree_wc):.0f} n={len(disagree_wc)}")
# Word count buckets
print("\nDisagreement rate by word count bucket:")
buckets = [(0, 30, "0-30"), (31, 60, "31-60"), (61, 100, "61-100"), (101, 150, "101-150"), (151, 250, "151-250"), (251, 9999, "251+")]
headers9 = ["WC Bucket", "N", "Disagree", "Disagree%"]
rows9 = []
for lo, hi, label in buckets:
    in_bucket = [r for r in results if lo <= r["word_count"] <= hi]
    dis = sum(1 for r in in_bucket if r["h_maj"] != r["g_maj"])
    if in_bucket:
        rows9.append([label, len(in_bucket), dis, f"{dis/len(in_bucket):.1%}"])
print(fmt_table(headers9, rows9, ["l", "r", "r", "r"]))
# Stage1 method (unanimous vs majority) as proxy for quality tier
print("\nDisagreement rate by Stage 1 confidence method:")
for method in ["unanimous", "majority"]:
    in_method = [r for r in results if para_meta.get(r["pid"], {}).get("stage1Method") == method]
    dis = sum(1 for r in in_method if r["h_maj"] != r["g_maj"])
    if in_method:
        print(f" {method:10s}: {dis}/{len(in_method)} = {dis/len(in_method):.1%} disagree")
# Keyword analysis: case-insensitive substring match against paragraph text.
print("\nDisagreement rate for paragraphs containing key terms:")
keywords = ["material", "NIST", "CISO", "board", "third party", "third-party", "incident",
            "insurance", "audit", "framework", "breach", "ransomware"]
headers10 = ["Keyword", "N", "Disagree", "Disagree%"]
rows10 = []
for kw in keywords:
    matching = [r for r in results if kw.lower() in r["text"].lower()]
    if not matching:
        continue
    dis = sum(1 for r in matching if r["h_maj"] != r["g_maj"])
    rows10.append([kw, len(matching), dis, f"{dis/len(matching):.1%}"])
# Sort by raw disagreement count, descending (x[2] is already an int,
# so the int() conversion is redundant but harmless).
rows10.sort(key=lambda x: -int(x[2]))
print(fmt_table(headers10, rows10, ["l", "r", "r", "r"]))
# ══════════════════════════════════════════════════════════════════════════════
# 8. "HUMAN RIGHT, GenAI WRONG" vs "GenAI RIGHT, HUMAN WRONG"
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("8. HUMAN RIGHT/GENAI WRONG vs GENAI RIGHT/HUMAN WRONG (13-signal consensus)")
print("=" * 80)
# "Right/wrong" is defined relative to the 13-signal consensus (all_maj).
# The five lists below partition all paragraphs: four consensus outcomes
# plus the paragraphs with no 13-signal consensus at all.
h_right_g_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"]]
g_right_h_wrong = [r for r in results if r["all_maj"] != "split" and r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"]]
both_right = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"]]
both_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"]]
has_split = [r for r in results if r["all_maj"] == "split"]
print(f"\n Both correct: {len(both_right)}")
print(f" Human right, GenAI wrong: {len(h_right_g_wrong)}")
print(f" GenAI right, Human wrong: {len(g_right_h_wrong)}")
print(f" Both wrong: {len(both_wrong)}")
print(f" 13-signal split (no consensus): {len(has_split)}")
# Category breakdown: which consensus categories drive each error mode.
print("\nCategory breakdown of 'Human right, GenAI wrong':")
cat_dist_hrg = Counter(r["all_maj"] for r in h_right_g_wrong)
for c in CATS:
    n = cat_dist_hrg.get(c, 0)
    if n > 0:
        print(f" {c}: {n}")
print("\nCategory breakdown of 'GenAI right, Human wrong':")
cat_dist_grh = Counter(r["all_maj"] for r in g_right_h_wrong)
for c in CATS:
    n = cat_dist_grh.get(c, 0)
    if n > 0:
        print(f" {c}: {n}")
# What did the wrong side predict instead of the consensus label?
print("\nWhen GenAI is wrong, what does it predict instead?")
wrong_g = Counter(r["g_maj"] for r in h_right_g_wrong)
for label, cnt in wrong_g.most_common():
    print(f" {label}: {cnt}")
print("\nWhen Human is wrong, what do they predict instead?")
wrong_h = Counter(r["h_maj"] for r in g_right_h_wrong)
for label, cnt in wrong_h.most_common():
    print(f" {label}: {cnt}")
# ══════════════════════════════════════════════════════════════════════════════
# 9. SPECIFICITY BY SOURCE TYPE
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("9. SPECIFICITY BY SOURCE TYPE AND CATEGORY")
print("=" * 80)
# Group models into source types. These short names must match the names
# derived during loading (modelId suffix / bench file stem / "opus-4.6").
stage1_models = {"gemini-3.1-flash-lite-preview", "grok-4.1-fast", "mimo-v2-flash"}
frontier_models = {"opus-4.6", "gpt-5.4", "gemini-3.1-pro-preview", "kimi-k2.5"}
budget_models = {"glm-5:exacto", "mimo-v2-pro:exacto", "minimax-m2.7:exacto"}
# Collect specs by source type and category; "ALL" aggregates every category.
source_specs: dict[str, dict[str, list[int]]] = {
    "Human": defaultdict(list),
    "Stage1": defaultdict(list),
    "Frontier": defaultdict(list),
    "Budget": defaultdict(list),
}
for r in results:
    for name, cat, spec in r["human_labels"]:
        source_specs["Human"][cat].append(spec)
        source_specs["Human"]["ALL"].append(spec)
    for model, cat, spec in r["genai_labels"]:
        if model in stage1_models:
            src = "Stage1"
        elif model in frontier_models:
            src = "Frontier"
        elif model in budget_models:
            src = "Budget"
        else:
            src = "Budget"  # fallback for any model name not listed above
        source_specs[src][cat].append(spec)
        source_specs[src]["ALL"].append(spec)
print("\nMean specificity by source type and category:")
src_order = ["Human", "Stage1", "Frontier", "Budget"]
headers11 = ["Category"] + src_order
rows11 = []
for c in CATS + ["ALL"]:
    row = [c]
    for src in src_order:
        specs = source_specs[src].get(c, [])
        if specs:
            row.append(f"{mean_spec(specs):.3f}")
        else:
            row.append("N/A")
    rows11.append(row)
align11 = ["l"] + ["r"] * len(src_order)
print(fmt_table(headers11, rows11, align11))
# Specificity standard deviation by source
print("\nSpecificity std dev by source type:")
import math  # NOTE(style): mid-file import; harmless, but belongs at the top of the file.
for src in src_order:
    specs = source_specs[src]["ALL"]
    if specs:
        # Population standard deviation (divides by n, not n-1).
        m = mean_spec(specs)
        var = sum((s - m) ** 2 for s in specs) / len(specs)
        std = math.sqrt(var)
        print(f" {src:10s}: mean={m:.3f} std={std:.3f} n={len(specs)}")
# ── Per-model specificity rankings ───────────────────────────────────────────
# Humans and models ranked together, least to most specific on average;
# keys are prefixed "H:" / "G:" to keep the two groups distinguishable.
print("\nPer-model mean specificity (all categories):")
model_specs: dict[str, list[int]] = defaultdict(list)
for r in results:
    for name, cat, spec in r["human_labels"]:
        model_specs[f"H:{name}"].append(spec)
    for model, cat, spec in r["genai_labels"]:
        model_specs[f"G:{model}"].append(spec)
headers12 = ["Model", "Mean Spec", "N"]
rows12 = []
for model, specs in sorted(model_specs.items(), key=lambda x: mean_spec(x[1])):
    rows12.append([model, f"{mean_spec(specs):.3f}", len(specs)])
print(fmt_table(headers12, rows12, ["l", "r", "r"]))
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("SUMMARY OF KEY FINDINGS")
print("=" * 80)
# The f-string body below is printed verbatim, so its lines stay flush-left.
print(f"""
Dataset: {total_paras} paragraphs, 13 signals each (3 human, 10 GenAI)
1. CATEGORY AGREEMENT: Human majority and GenAI majority agree on {diag/total_paras:.1%} of
paragraphs. The biggest confusions are in the off-diagonal cells above.
2. DIRECTIONAL DISAGREEMENTS: The most common category swaps reveal systematic
differences in how humans and GenAI interpret boundary cases.
3. PRECISION/RECALL: GenAI macro F1={macro_f1:.3f} against human majority.
4. SPECIFICITY BIAS: Human mean={h_avg:.3f}, GenAI mean={g_avg:.3f}
(diff={g_avg - h_avg:+.3f}). {"GenAI rates higher" if g_avg > h_avg else "Humans rate higher"} on average.
5. DIFFICULTY: On easy paragraphs (T1, 10+/13 agree), agreement is very high.
On hard paragraphs, {"humans" if h_odd > g_odd else "GenAI"} are more often the odd-one-out.
6. ANNOTATORS: See table above for individual alignment with GenAI and consensus.
7. TEXT FEATURES: {"Longer" if mean_spec(disagree_wc) > mean_spec(agree_wc) else "Shorter"} paragraphs
tend to produce more disagreement.
8. RIGHT/WRONG: Human right & GenAI wrong: {len(h_right_g_wrong)}, GenAI right &
Human wrong: {len(g_right_h_wrong)}. {"Humans are more often right" if len(h_right_g_wrong) > len(g_right_h_wrong) else "GenAI is more often right"} when they disagree.
""")