""" Cross-analysis: Human annotators vs GenAI models on 1,200-paragraph holdout set. Categories: BG, ID, MR, N/O, RMP, SI, TPR Specificity: 1-4 13 signals per paragraph: 3 human (BIBD), 3 Stage 1, 1 Opus 4.6, 6 benchmark """ import json import sys from collections import Counter, defaultdict from pathlib import Path # ── Category abbreviation mapping ──────────────────────────────────────────── FULL_TO_ABBR = { "Board Governance": "BG", "Incident Disclosure": "ID", "Management Role": "MR", "None/Other": "N/O", "Risk Management Process": "RMP", "Strategy Integration": "SI", "Third-Party Risk": "TPR", } ABBR_TO_FULL = {v: k for k, v in FULL_TO_ABBR.items()} CATS = ["BG", "ID", "MR", "N/O", "RMP", "SI", "TPR"] DATA = Path("data") def abbr(cat: str) -> str: return FULL_TO_ABBR.get(cat, cat) def majority_vote(labels: list[str]) -> str: """Return majority label or 'split' if no majority.""" c = Counter(labels) top = c.most_common(1)[0] if top[1] > len(labels) / 2: return top[0] # Check for a plurality with tie-break: if top 2 are tied, it's split if len(c) >= 2: top2 = c.most_common(2) if top2[0][1] == top2[1][1]: return "split" return top[0] def median_spec(specs: list[int]) -> float: s = sorted(specs) n = len(s) if n % 2 == 1: return float(s[n // 2]) return (s[n // 2 - 1] + s[n // 2]) / 2.0 def mean_spec(specs: list[int]) -> float: return sum(specs) / len(specs) if specs else 0.0 # ── Load data ──────────────────────────────────────────────────────────────── print("Loading data...\n") # Human labels: paragraphId → list of (annotatorName, category, specificity) human_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list) with open(DATA / "gold" / "human-labels-raw.jsonl") as f: for line in f: d = json.loads(line) human_labels[d["paragraphId"]].append( (d["annotatorName"], abbr(d["contentCategory"]), d["specificityLevel"]) ) holdout_pids = sorted(human_labels.keys()) assert len(holdout_pids) == 1200, f"Expected 1200 holdout paragraphs, got {len(holdout_pids)}" 
# GenAI labels: paragraphId → list of (modelName, category, specificity)
genai_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)

# Stage 1 (filter to holdout only)
holdout_set = set(holdout_pids)
with open(DATA / "annotations" / "stage1.patched.jsonl") as f:
    for line in f:
        d = json.loads(line)
        pid = d["paragraphId"]
        if pid in holdout_set:
            # modelId looks like "provider/model"; keep only the short model name
            model = d["provenance"]["modelId"].split("/")[-1]
            genai_labels[pid].append(
                (model, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )

# Opus
with open(DATA / "annotations" / "golden" / "opus.jsonl") as f:
    for line in f:
        d = json.loads(line)
        genai_labels[d["paragraphId"]].append(
            ("opus-4.6", abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
        )

# Bench-holdout models (6 benchmark files; model name taken from the filename)
bench_files = [
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
for fname in bench_files:
    fpath = DATA / "annotations" / "bench-holdout" / fname
    model_name = fname.replace(".jsonl", "")
    with open(fpath) as f:
        for line in f:
            d = json.loads(line)
            genai_labels[d["paragraphId"]].append(
                (model_name, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )

# Paragraph metadata (text, word count, stage-1 provenance), holdout rows only
para_meta: dict[str, dict] = {}
with open(DATA / "gold" / "paragraphs-holdout.jsonl") as f:
    for line in f:
        d = json.loads(line)
        if d["id"] in holdout_set:
            para_meta[d["id"]] = d

# ── Compute per-paragraph aggregates ─────────────────────────────────────────
# One record per paragraph with human/GenAI/combined majorities and mean specs.
results = []
for pid in holdout_pids:
    h = human_labels[pid]
    g = genai_labels[pid]
    h_cats = [x[1] for x in h]
    h_specs = [x[2] for x in h]
    g_cats = [x[1] for x in g]
    g_specs = [x[2] for x in g]
    all_cats = h_cats + g_cats
    all_specs = h_specs + g_specs
    h_maj = majority_vote(h_cats)
    g_maj = majority_vote(g_cats)
    all_maj = majority_vote(all_cats)
    h_mean_spec = mean_spec(h_specs)
    g_mean_spec = mean_spec(g_specs)
    all_mean_spec = mean_spec(all_specs)
    # Agreement count: how many of 13 agree with overall majority
    agree_count = sum(1 for c in all_cats if c == all_maj) if all_maj != "split" else 0
    meta = para_meta.get(pid, {})
    results.append({
        "pid": pid,
        "h_maj": h_maj,
        "g_maj": g_maj,
        "all_maj": all_maj,
        "h_cats": h_cats,
        "g_cats": g_cats,
        "h_specs": h_specs,
        "g_specs": g_specs,
        "h_mean_spec": h_mean_spec,
        "g_mean_spec": g_mean_spec,
        "all_mean_spec": all_mean_spec,
        "agree_count": agree_count,
        "word_count": meta.get("wordCount", 0),
        "text": meta.get("text", ""),
        "human_annotators": [x[0] for x in h],
        "genai_models": [x[0] for x in g],
        "human_labels": h,
        "genai_labels": g,
    })


def fmt_table(headers: list[str], rows: list[list], align: list[str] | None = None):
    """Format a simple text table.

    headers: column titles (always left-aligned in the header row).
    rows: row values; each cell is str()-ified for width measurement.
    align: per-column 'l'/'r' for the data rows; defaults to all right-aligned.
    Returns the table as a single newline-joined string.
    """
    col_widths = [len(h) for h in headers]
    str_rows = []
    for row in rows:
        sr = [str(x) for x in row]
        str_rows.append(sr)
        for i, s in enumerate(sr):
            col_widths[i] = max(col_widths[i], len(s))
    if align is None:
        align = ["r"] * len(headers)

    def fmt_cell(s, w, a):
        return s.rjust(w) if a == "r" else s.ljust(w)

    sep = "+-" + "-+-".join("-" * w for w in col_widths) + "-+"
    hdr = "| " + " | ".join(fmt_cell(h, col_widths[i], "l") for i, h in enumerate(headers)) + " |"
    lines = [sep, hdr, sep]
    for sr in str_rows:
        line = "| " + " | ".join(fmt_cell(sr[i], col_widths[i], align[i]) for i in range(len(headers))) + " |"
        lines.append(line)
    lines.append(sep)
    return "\n".join(lines)


# ══════════════════════════════════════════════════════════════════════════════
# 1. PER-CATEGORY CONFUSION MATRIX: HUMAN MAJORITY vs GENAI MAJORITY
# ══════════════════════════════════════════════════════════════════════════════
print("=" * 80)
print("1. CONFUSION MATRIX: Human Majority (rows) vs GenAI Majority (cols)")
print("=" * 80)

cats_plus = CATS + ["split"]
cm = defaultdict(lambda: defaultdict(int))
for r in results:
    cm[r["h_maj"]][r["g_maj"]] += 1

headers = ["H\\G"] + cats_plus + ["Total"]
rows = []
for hc in cats_plus:
    row = [hc]
    total = 0
    for gc in cats_plus:
        v = cm[hc][gc]
        row.append(v if v else ".")  # print "." instead of 0 for readability
        total += v
    row.append(total)
    rows.append(row)
# Column totals
col_totals = ["Total"]
for gc in cats_plus:
    col_totals.append(sum(cm[hc][gc] for hc in cats_plus))
col_totals.append(sum(sum(cm[hc][gc] for gc in cats_plus) for hc in cats_plus))
rows.append(col_totals)
align = ["l"] + ["r"] * (len(headers) - 1)
print(fmt_table(headers, rows, align))

# Diagonal agreement
diag = sum(cm[c][c] for c in cats_plus)
total_paras = len(results)
print(f"\nDiagonal agreement: {diag}/{total_paras} = {diag/total_paras:.1%}")
print(f"Disagreement: {total_paras - diag}/{total_paras} = {(total_paras - diag)/total_paras:.1%}")

# Over/under prediction: row sum = human count, column sum = GenAI count
print("\nGenAI over/under-prediction relative to human majority:")
headers2 = ["Category", "Human N", "GenAI N", "Diff", "Direction"]
rows2 = []
for c in CATS:
    h_n = sum(cm[c][gc] for gc in cats_plus)
    g_n = sum(cm[hc][c] for hc in cats_plus)
    diff = g_n - h_n
    direction = "OVER" if diff > 0 else ("UNDER" if diff < 0 else "MATCH")
    rows2.append([c, h_n, g_n, f"{diff:+d}", direction])
align2 = ["l", "r", "r", "r", "l"]
print(fmt_table(headers2, rows2, align2))

# ══════════════════════════════════════════════════════════════════════════════
# 2. DIRECTIONAL DISAGREEMENT ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("2. DIRECTIONAL DISAGREEMENT: Human Majority -> GenAI Majority transitions")
print("=" * 80)

disagree = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != r["g_maj"]]
print(f"\nTotal disagreements: {len(disagree)}/{total_paras}")

trans = Counter(disagree)
print("\nTop transitions (H_maj -> G_maj):")
headers3 = ["From (Human)", "To (GenAI)", "Count", "Reverse", "Net", "Symmetric?"]
rows3 = []
seen = set()
for (a, b), cnt in sorted(trans.items(), key=lambda x: -x[1]):
    # Report each unordered pair once, alongside its reverse direction
    pair = tuple(sorted([a, b]))
    if pair in seen:
        continue
    seen.add(pair)
    rev = trans.get((b, a), 0)
    net = cnt - rev
    # "Symmetric" if the net flow is within 30% of the smaller direction (min 1)
    sym = "Yes" if abs(net) <= max(1, min(cnt, rev) * 0.3) else "No"
    rows3.append([a, b, cnt, rev, f"{net:+d}", sym])
align3 = ["l", "l", "r", "r", "r", "l"]
print(fmt_table(headers3, rows3, align3))

# ══════════════════════════════════════════════════════════════════════════════
# 3. PER-CATEGORY PRECISION/RECALL (Human majority as truth)
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("3. PER-CATEGORY PRECISION/RECALL (Human majority as ground truth)")
print("=" * 80)

# Filter out splits for clean P/R
valid = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != "split" and r["g_maj"] != "split"]

headers4 = ["Category", "TP", "FP", "FN", "Precision", "Recall", "F1"]
rows4 = []
for c in CATS:
    tp = sum(1 for h, g in valid if h == c and g == c)
    fp = sum(1 for h, g in valid if h != c and g == c)
    fn = sum(1 for h, g in valid if h == c and g != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
align4 = ["l", "r", "r", "r", "r", "r", "r"]
print("\nGenAI predictions evaluated against human majority:")
print(fmt_table(headers4, rows4, align4))

# Macro averages (recomputed from the formatted strings; 3-decimal rounding)
macro_p = sum(float(r[4]) for r in rows4) / len(CATS)
macro_r = sum(float(r[5]) for r in rows4) / len(CATS)
macro_f1 = sum(float(r[6]) for r in rows4) / len(CATS)
print(f"\nMacro-avg: P={macro_p:.3f} R={macro_r:.3f} F1={macro_f1:.3f}")

# Vice versa: GenAI as truth
print("\n--- Vice versa: Human predictions evaluated against GenAI majority ---")
rows4b = []
for c in CATS:
    tp = sum(1 for h, g in valid if g == c and h == c)
    fp = sum(1 for h, g in valid if g != c and h == c)
    fn = sum(1 for h, g in valid if g == c and h != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4b.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
print(fmt_table(headers4, rows4b, align4))

# ══════════════════════════════════════════════════════════════════════════════
# 4. SPECIFICITY SYSTEMATIC BIAS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("4. SPECIFICITY SYSTEMATIC BIAS: Human vs GenAI")
print("=" * 80)

# Overall
all_h_specs = [s for r in results for s in r["h_specs"]]
all_g_specs = [s for r in results for s in r["g_specs"]]
h_avg = mean_spec(all_h_specs)
g_avg = mean_spec(all_g_specs)
print(f"\nOverall mean specificity: Human={h_avg:.3f} GenAI={g_avg:.3f} Diff={g_avg - h_avg:+.3f}")
print(f"Overall median: Human={median_spec(all_h_specs):.1f} GenAI={median_spec(all_g_specs):.1f}")

# Distribution
print("\nSpecificity distribution:")
h_dist = Counter(all_h_specs)
g_dist = Counter(all_g_specs)
headers5 = ["Spec", "Human N", "Human %", "GenAI N", "GenAI %", "Diff %"]
rows5 = []
for s in [1, 2, 3, 4]:
    hn = h_dist.get(s, 0)
    gn = g_dist.get(s, 0)
    hp = hn / len(all_h_specs) * 100
    gp = gn / len(all_g_specs) * 100
    rows5.append([s, hn, f"{hp:.1f}%", gn, f"{gp:.1f}%", f"{gp - hp:+.1f}%"])
print(fmt_table(headers5, rows5, ["r", "r", "r", "r", "r", "r"]))

# By category
print("\nMean specificity by category:")
headers6 = ["Category", "Human", "GenAI", "Diff", "H count", "G count"]
rows6 = []
for c in CATS:
    # Specificity of each individual label whose category matches c
    h_s = [ann[2] for r in results for ann in r["human_labels"] if ann[1] == c]
    g_s = [ann[2] for r in results for ann in r["genai_labels"] if ann[1] == c]
    if h_s and g_s:
        hm = mean_spec(h_s)
        gm = mean_spec(g_s)
        rows6.append([c, f"{hm:.3f}", f"{gm:.3f}", f"{gm - hm:+.3f}", len(h_s), len(g_s)])
    else:
        rows6.append([c, "N/A", "N/A", "N/A", len(h_s), len(g_s)])
print(fmt_table(headers6, rows6, ["l", "r", "r", "r", "r", "r"]))

# Per-paragraph directional bias
h_higher = sum(1 for r in results if r["h_mean_spec"] > r["g_mean_spec"])
g_higher = sum(1 for r in results if r["g_mean_spec"] > r["h_mean_spec"])
same = sum(1 for r in results if abs(r["h_mean_spec"] - r["g_mean_spec"]) < 0.01)
print(f"\nPer-paragraph: Human higher spec={h_higher} GenAI higher={g_higher} Same={same}")

# ══════════════════════════════════════════════════════════════════════════════
# 5. DIFFICULTY-STRATIFIED ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════
print("\n" + "=" * 80)
print("5. DIFFICULTY-STRATIFIED ANALYSIS")
print("=" * 80)

# Tiers based on 13-signal agreement
# Tier 1: 10+ agree, Tier 2: 7-9 agree, Tier 3: 5-6 agree, Tier 4: <5 agree
def get_tier(agree_count: int) -> str:
    """Bucket a paragraph's 13-signal agreement count into a difficulty tier."""
    if agree_count >= 10:
        return "T1-Easy"
    elif agree_count >= 7:
        return "T2-Medium"
    elif agree_count >= 5:
        return "T3-Hard"
    else:
        return "T4-VHard"

for r in results:
    r["tier"] = get_tier(r["agree_count"])

tier_counts = Counter(r["tier"] for r in results)
print(f"\nTier distribution:")
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    print(f" {t}: {tier_counts.get(t, 0)} paragraphs")

print("\nHuman-GenAI category agreement rate by difficulty tier:")
headers7 = ["Tier", "N", "Agree", "Agree%", "H=consensus%", "G=consensus%"]
rows7 = []
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    tier_r = [r for r in results if r["tier"] == t]
    n = len(tier_r)
    if n == 0:
        continue
    agree = sum(1 for r in tier_r if r["h_maj"] == r["g_maj"])
    h_match_cons = sum(1 for r in tier_r if r["h_maj"] == r["all_maj"])
    g_match_cons = sum(1 for r in tier_r if r["g_maj"] == r["all_maj"])
    rows7.append([
        t, n, agree, f"{agree/n:.1%}", f"{h_match_cons/n:.1%}", f"{g_match_cons/n:.1%}"
    ])
print(fmt_table(headers7, rows7, ["l", "r", "r", "r", "r", "r"]))

# On hard paragraphs, who is the odd one out?
print("\nOn hard paragraphs (T3+T4), disagreement breakdown:") hard = [r for r in results if r["tier"] in ("T3-Hard", "T4-VHard")] h_odd = sum(1 for r in hard if r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"]) g_odd = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"]) both_off = sum(1 for r in hard if r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"]) both_on = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"]) print(f" Human is odd-one-out (GenAI=consensus, Human!=consensus): {h_odd}") print(f" GenAI is odd-one-out (Human=consensus, GenAI!=consensus): {g_odd}") print(f" Both match consensus: {both_on}") print(f" Both differ from consensus: {both_off}") print(f" Total hard: {len(hard)}") # ══════════════════════════════════════════════════════════════════════════════ # 6. ANNOTATOR-LEVEL PATTERNS # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "=" * 80) print("6. 
ANNOTATOR-LEVEL PATTERNS") print("=" * 80) annotators = ["Anuj", "Elisabeth", "Joey", "Meghan", "Xander", "Aaryan"] # For each annotator, compute agreement with GenAI majority print("\nPer-annotator agreement with GenAI majority (category):") headers8 = ["Annotator", "N labels", "Agree w/G_maj", "Agree%", "Agree w/13_maj", "13_maj%", "Avg Spec", "Note"] rows8 = [] for ann in annotators: agree_g = 0 agree_all = 0 total = 0 specs = [] for r in results: for name, cat, spec in r["human_labels"]: if name == ann: total += 1 specs.append(spec) if cat == r["g_maj"]: agree_g += 1 if cat == r["all_maj"]: agree_all += 1 if total == 0: continue note = "(excluded from aggregates)" if ann == "Aaryan" else "" rows8.append([ ann, total, agree_g, f"{agree_g/total:.1%}", agree_all, f"{agree_all/total:.1%}", f"{mean_spec(specs):.2f}", note, ]) align8 = ["l", "r", "r", "r", "r", "r", "r", "l"] print(fmt_table(headers8, rows8, align8)) # Annotator category distributions print("\nPer-annotator category distribution:") for ann in annotators: cat_counts = Counter() for r in results: for name, cat, spec in r["human_labels"]: if name == ann: cat_counts[cat] += 1 if not cat_counts: continue total = sum(cat_counts.values()) dist = " ".join(f"{c}:{cat_counts.get(c, 0):3d}({cat_counts.get(c, 0)/total:.0%})" for c in CATS) flag = " ** OUTLIER" if ann == "Aaryan" else "" print(f" {ann:10s} (n={total:3d}): {dist}{flag}") # ══════════════════════════════════════════════════════════════════════════════ # 7. TEXT-FEATURE CORRELATIONS # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "=" * 80) print("7. 
TEXT-FEATURE CORRELATIONS WITH DISAGREEMENT") print("=" * 80) agree_r = [r for r in results if r["h_maj"] == r["g_maj"]] disagree_r = [r for r in results if r["h_maj"] != r["g_maj"]] # Word count agree_wc = [r["word_count"] for r in agree_r if r["word_count"] > 0] disagree_wc = [r["word_count"] for r in disagree_r if r["word_count"] > 0] print(f"\nWord count (agree vs disagree):") print(f" Agreement paragraphs: mean={mean_spec(agree_wc):.1f} median={median_spec(agree_wc):.0f} n={len(agree_wc)}") print(f" Disagreement paragraphs: mean={mean_spec(disagree_wc):.1f} median={median_spec(disagree_wc):.0f} n={len(disagree_wc)}") # Word count buckets print("\nDisagreement rate by word count bucket:") buckets = [(0, 30, "0-30"), (31, 60, "31-60"), (61, 100, "61-100"), (101, 150, "101-150"), (151, 250, "151-250"), (251, 9999, "251+")] headers9 = ["WC Bucket", "N", "Disagree", "Disagree%"] rows9 = [] for lo, hi, label in buckets: in_bucket = [r for r in results if lo <= r["word_count"] <= hi] dis = sum(1 for r in in_bucket if r["h_maj"] != r["g_maj"]) if in_bucket: rows9.append([label, len(in_bucket), dis, f"{dis/len(in_bucket):.1%}"]) print(fmt_table(headers9, rows9, ["l", "r", "r", "r"])) # Stage1 method (unanimous vs majority) as proxy for quality tier print("\nDisagreement rate by Stage 1 confidence method:") for method in ["unanimous", "majority"]: in_method = [r for r in results if para_meta.get(r["pid"], {}).get("stage1Method") == method] dis = sum(1 for r in in_method if r["h_maj"] != r["g_maj"]) if in_method: print(f" {method:10s}: {dis}/{len(in_method)} = {dis/len(in_method):.1%} disagree") # Keyword analysis print("\nDisagreement rate for paragraphs containing key terms:") keywords = ["material", "NIST", "CISO", "board", "third party", "third-party", "incident", "insurance", "audit", "framework", "breach", "ransomware"] headers10 = ["Keyword", "N", "Disagree", "Disagree%"] rows10 = [] for kw in keywords: matching = [r for r in results if kw.lower() in 
r["text"].lower()] if not matching: continue dis = sum(1 for r in matching if r["h_maj"] != r["g_maj"]) rows10.append([kw, len(matching), dis, f"{dis/len(matching):.1%}"]) rows10.sort(key=lambda x: -int(x[2])) print(fmt_table(headers10, rows10, ["l", "r", "r", "r"])) # ══════════════════════════════════════════════════════════════════════════════ # 8. "HUMAN RIGHT, GenAI WRONG" vs "GenAI RIGHT, HUMAN WRONG" # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "=" * 80) print("8. HUMAN RIGHT/GENAI WRONG vs GENAI RIGHT/HUMAN WRONG (13-signal consensus)") print("=" * 80) # Only consider paragraphs where all_maj is not split and h/g disagree with each other or consensus h_right_g_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"]] g_right_h_wrong = [r for r in results if r["all_maj"] != "split" and r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"]] both_right = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"]] both_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"]] has_split = [r for r in results if r["all_maj"] == "split"] print(f"\n Both correct: {len(both_right)}") print(f" Human right, GenAI wrong: {len(h_right_g_wrong)}") print(f" GenAI right, Human wrong: {len(g_right_h_wrong)}") print(f" Both wrong: {len(both_wrong)}") print(f" 13-signal split (no consensus): {len(has_split)}") # Category breakdown print("\nCategory breakdown of 'Human right, GenAI wrong':") cat_dist_hrg = Counter(r["all_maj"] for r in h_right_g_wrong) for c in CATS: n = cat_dist_hrg.get(c, 0) if n > 0: print(f" {c}: {n}") print("\nCategory breakdown of 'GenAI right, Human wrong':") cat_dist_grh = Counter(r["all_maj"] for r in g_right_h_wrong) for c in CATS: n = cat_dist_grh.get(c, 0) if n > 0: print(f" {c}: {n}") # What did the wrong side predict? 
print("\nWhen GenAI is wrong, what does it predict instead?") wrong_g = Counter(r["g_maj"] for r in h_right_g_wrong) for label, cnt in wrong_g.most_common(): print(f" {label}: {cnt}") print("\nWhen Human is wrong, what do they predict instead?") wrong_h = Counter(r["h_maj"] for r in g_right_h_wrong) for label, cnt in wrong_h.most_common(): print(f" {label}: {cnt}") # ══════════════════════════════════════════════════════════════════════════════ # 9. SPECIFICITY BY SOURCE TYPE # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "=" * 80) print("9. SPECIFICITY BY SOURCE TYPE AND CATEGORY") print("=" * 80) # Group models into source types stage1_models = {"gemini-3.1-flash-lite-preview", "grok-4.1-fast", "mimo-v2-flash"} frontier_models = {"opus-4.6", "gpt-5.4", "gemini-3.1-pro-preview", "kimi-k2.5"} budget_models = {"glm-5:exacto", "mimo-v2-pro:exacto", "minimax-m2.7:exacto"} # Collect specs by source type and category source_specs: dict[str, dict[str, list[int]]] = { "Human": defaultdict(list), "Stage1": defaultdict(list), "Frontier": defaultdict(list), "Budget": defaultdict(list), } for r in results: for name, cat, spec in r["human_labels"]: source_specs["Human"][cat].append(spec) source_specs["Human"]["ALL"].append(spec) for model, cat, spec in r["genai_labels"]: if model in stage1_models: src = "Stage1" elif model in frontier_models: src = "Frontier" elif model in budget_models: src = "Budget" else: src = "Budget" # fallback source_specs[src][cat].append(spec) source_specs[src]["ALL"].append(spec) print("\nMean specificity by source type and category:") src_order = ["Human", "Stage1", "Frontier", "Budget"] headers11 = ["Category"] + src_order rows11 = [] for c in CATS + ["ALL"]: row = [c] for src in src_order: specs = source_specs[src].get(c, []) if specs: row.append(f"{mean_spec(specs):.3f}") else: row.append("N/A") rows11.append(row) align11 = ["l"] + ["r"] * len(src_order) print(fmt_table(headers11, rows11, align11)) # 
Specificity standard deviation by source print("\nSpecificity std dev by source type:") import math for src in src_order: specs = source_specs[src]["ALL"] if specs: m = mean_spec(specs) var = sum((s - m) ** 2 for s in specs) / len(specs) std = math.sqrt(var) print(f" {src:10s}: mean={m:.3f} std={std:.3f} n={len(specs)}") # ── Per-model specificity rankings ─────────────────────────────────────────── print("\nPer-model mean specificity (all categories):") model_specs: dict[str, list[int]] = defaultdict(list) for r in results: for name, cat, spec in r["human_labels"]: model_specs[f"H:{name}"].append(spec) for model, cat, spec in r["genai_labels"]: model_specs[f"G:{model}"].append(spec) headers12 = ["Model", "Mean Spec", "N"] rows12 = [] for model, specs in sorted(model_specs.items(), key=lambda x: mean_spec(x[1])): rows12.append([model, f"{mean_spec(specs):.3f}", len(specs)]) print(fmt_table(headers12, rows12, ["l", "r", "r"])) # ══════════════════════════════════════════════════════════════════════════════ # SUMMARY # ══════════════════════════════════════════════════════════════════════════════ print("\n" + "=" * 80) print("SUMMARY OF KEY FINDINGS") print("=" * 80) print(f""" Dataset: {total_paras} paragraphs, 13 signals each (3 human, 10 GenAI) 1. CATEGORY AGREEMENT: Human majority and GenAI majority agree on {diag/total_paras:.1%} of paragraphs. The biggest confusions are in the off-diagonal cells above. 2. DIRECTIONAL DISAGREEMENTS: The most common category swaps reveal systematic differences in how humans and GenAI interpret boundary cases. 3. PRECISION/RECALL: GenAI macro F1={macro_f1:.3f} against human majority. 4. SPECIFICITY BIAS: Human mean={h_avg:.3f}, GenAI mean={g_avg:.3f} (diff={g_avg - h_avg:+.3f}). {"GenAI rates higher" if g_avg > h_avg else "Humans rate higher"} on average. 5. DIFFICULTY: On easy paragraphs (T1, 10+/13 agree), agreement is very high. On hard paragraphs, {"humans" if h_odd > g_odd else "GenAI"} are more often the odd-one-out. 6. 
ANNOTATORS: See table above for individual alignment with GenAI and consensus. 7. TEXT FEATURES: {"Longer" if mean_spec(disagree_wc) > mean_spec(agree_wc) else "Shorter"} paragraphs tend to produce more disagreement. 8. RIGHT/WRONG: Human right & GenAI wrong: {len(h_right_g_wrong)}, GenAI right & Human wrong: {len(g_right_h_wrong)}. {"Humans are more often right" if len(h_right_g_wrong) > len(g_right_h_wrong) else "GenAI is more often right"} when they disagree. """)