715 lines
30 KiB
Python
715 lines
30 KiB
Python
"""
|
|
Cross-analysis: Human annotators vs GenAI models on 1,200-paragraph holdout set.
|
|
|
|
Categories: BG, ID, MR, N/O, RMP, SI, TPR
|
|
Specificity: 1-4
|
|
13 signals per paragraph: 3 human (BIBD), 3 Stage 1, 1 Opus 4.6, 6 benchmark
|
|
"""
|
|
|
|
import json
|
|
import sys
|
|
from collections import Counter, defaultdict
|
|
from pathlib import Path
|
|
|
|
# ── Category abbreviation mapping ────────────────────────────────────────────
# Single source of truth: (full name, abbreviation) pairs for the seven
# content categories; the lookup dicts and CATS are all derived from it.
_CATEGORY_PAIRS = [
    ("Board Governance", "BG"),
    ("Incident Disclosure", "ID"),
    ("Management Role", "MR"),
    ("None/Other", "N/O"),
    ("Risk Management Process", "RMP"),
    ("Strategy Integration", "SI"),
    ("Third-Party Risk", "TPR"),
]
FULL_TO_ABBR = dict(_CATEGORY_PAIRS)
ABBR_TO_FULL = {short: full for full, short in _CATEGORY_PAIRS}
CATS = [short for _, short in _CATEGORY_PAIRS]

DATA = Path("data")
|
|
|
|
|
|
def abbr(cat: str) -> str:
    """Return the short code for *cat*; unknown names pass through unchanged."""
    try:
        return FULL_TO_ABBR[cat]
    except KeyError:
        return cat
|
|
|
|
|
|
def majority_vote(labels: list[str]) -> str:
    """Return the consensus label for *labels*.

    Returns the strict-majority label when one exists (count > half).
    Otherwise falls back to the plurality label, unless the two most
    common labels are tied — or *labels* is empty — in which case
    "split" is returned.
    """
    # Guard: Counter.most_common(1)[0] would raise IndexError on [].
    if not labels:
        return "split"
    c = Counter(labels)
    top = c.most_common(1)[0]
    if top[1] > len(labels) / 2:
        return top[0]
    # No strict majority: a tie between the two most common labels means
    # there is no usable consensus; otherwise accept the plurality winner.
    if len(c) >= 2:
        top2 = c.most_common(2)
        if top2[0][1] == top2[1][1]:
            return "split"
    return top[0]
|
|
|
|
|
|
def median_spec(specs: list[int]) -> float:
    """Return the median of *specs* as a float.

    Returns 0.0 for an empty list (consistent with mean_spec), instead of
    raising IndexError — callers pass filtered lists that can be empty.
    """
    if not specs:
        return 0.0
    s = sorted(specs)
    n = len(s)
    if n % 2 == 1:
        return float(s[n // 2])
    # Even length: average the two middle elements.
    return (s[n // 2 - 1] + s[n // 2]) / 2.0
|
|
|
|
|
|
def mean_spec(specs: list[int]) -> float:
    """Arithmetic mean of *specs*; 0.0 when the list is empty."""
    if not specs:
        return 0.0
    return sum(specs) / len(specs)
|
|
|
|
|
|
# ── Load data ────────────────────────────────────────────────────────────────

print("Loading data...\n")

# Human labels: paragraphId → list of (annotatorName, category, specificity)
# One JSONL record per (annotator, paragraph) label; the file stores full
# category names, normalized here to abbreviations via abbr().
human_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)
with open(DATA / "gold" / "human-labels-raw.jsonl") as f:
    for line in f:
        d = json.loads(line)
        human_labels[d["paragraphId"]].append(
            (d["annotatorName"], abbr(d["contentCategory"]), d["specificityLevel"])
        )

# Sorted so every downstream pass sees paragraphs in a deterministic order.
holdout_pids = sorted(human_labels.keys())
# Sanity check: the holdout set must contain exactly 1,200 paragraphs.
assert len(holdout_pids) == 1200, f"Expected 1200 holdout paragraphs, got {len(holdout_pids)}"
|
|
|
|
# GenAI labels: paragraphId → list of (modelName, category, specificity)
genai_labels: dict[str, list[tuple[str, str, int]]] = defaultdict(list)

# Stage 1 (filter to holdout only)
# The Stage 1 file covers more paragraphs than the holdout, so restrict to
# holdout_pids; set membership keeps the per-line check O(1).
holdout_set = set(holdout_pids)
with open(DATA / "annotations" / "stage1.patched.jsonl") as f:
    for line in f:
        d = json.loads(line)
        pid = d["paragraphId"]
        if pid in holdout_set:
            # modelId looks like "provider/model"; keep only the last path
            # segment as the model name.
            model = d["provenance"]["modelId"].split("/")[-1]
            genai_labels[pid].append(
                (model, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
|
|
|
|
# Opus
# NOTE(review): unlike Stage 1, no holdout filter is applied here —
# presumably this file covers only holdout paragraphs; verify against data.
with open(DATA / "annotations" / "golden" / "opus.jsonl") as f:
    for line in f:
        d = json.loads(line)
        genai_labels[d["paragraphId"]].append(
            ("opus-4.6", abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
        )
|
|
|
|
# Bench-holdout models
# Six benchmark model files; each file name (minus the .jsonl suffix) is
# used verbatim as the model name in genai_labels.
bench_files = [
    "gpt-5.4.jsonl",
    "gemini-3.1-pro-preview.jsonl",
    "glm-5:exacto.jsonl",
    "kimi-k2.5.jsonl",
    "mimo-v2-pro:exacto.jsonl",
    "minimax-m2.7:exacto.jsonl",
]
for fname in bench_files:
    fpath = DATA / "annotations" / "bench-holdout" / fname
    model_name = fname.replace(".jsonl", "")
    with open(fpath) as f:
        for line in f:
            d = json.loads(line)
            genai_labels[d["paragraphId"]].append(
                (model_name, abbr(d["label"]["content_category"]), d["label"]["specificity_level"])
            )
|
|
|
|
# Paragraph metadata
# id → full JSONL record (used later for wordCount, text, stage1Method),
# restricted to the holdout set.
para_meta: dict[str, dict] = {}
with open(DATA / "gold" / "paragraphs-holdout.jsonl") as f:
    for line in f:
        d = json.loads(line)
        if d["id"] in holdout_set:
            para_meta[d["id"]] = d
|
|
|
|
# ── Compute per-paragraph aggregates ─────────────────────────────────────────
# For each holdout paragraph, collapse the 3 human and (nominally) 10 GenAI
# signals into majority votes, mean specificities, and an agreement count.
# Every later section reads from this `results` list of dicts.

results = []
for pid in holdout_pids:
    h = human_labels[pid]
    g = genai_labels[pid]

    # Split (name, category, specificity) tuples into parallel lists.
    h_cats = [x[1] for x in h]
    h_specs = [x[2] for x in h]
    g_cats = [x[1] for x in g]
    g_specs = [x[2] for x in g]

    # Pooled 13-signal views (humans first, then models).
    all_cats = h_cats + g_cats
    all_specs = h_specs + g_specs

    h_maj = majority_vote(h_cats)
    g_maj = majority_vote(g_cats)
    all_maj = majority_vote(all_cats)

    h_mean_spec = mean_spec(h_specs)
    g_mean_spec = mean_spec(g_specs)
    all_mean_spec = mean_spec(all_specs)

    # Agreement count: how many of 13 agree with overall majority
    # (defined as 0 when the pooled vote is "split").
    agree_count = sum(1 for c in all_cats if c == all_maj) if all_maj != "split" else 0

    # Missing metadata degrades gracefully to word_count=0 / text="".
    meta = para_meta.get(pid, {})

    results.append({
        "pid": pid,
        "h_maj": h_maj,            # human majority category (or "split")
        "g_maj": g_maj,            # GenAI majority category (or "split")
        "all_maj": all_maj,        # pooled 13-signal majority (or "split")
        "h_cats": h_cats,
        "g_cats": g_cats,
        "h_specs": h_specs,
        "g_specs": g_specs,
        "h_mean_spec": h_mean_spec,
        "g_mean_spec": g_mean_spec,
        "all_mean_spec": all_mean_spec,
        "agree_count": agree_count,
        "word_count": meta.get("wordCount", 0),
        "text": meta.get("text", ""),
        "human_annotators": [x[0] for x in h],
        "genai_models": [x[0] for x in g],
        "human_labels": h,         # raw (annotator, cat, spec) tuples
        "genai_labels": g,         # raw (model, cat, spec) tuples
    })
|
|
|
|
|
|
def fmt_table(headers: list[str], rows: list[list], align: list[str] | None = None):
|
|
"""Format a simple text table."""
|
|
col_widths = [len(h) for h in headers]
|
|
str_rows = []
|
|
for row in rows:
|
|
sr = [str(x) for x in row]
|
|
str_rows.append(sr)
|
|
for i, s in enumerate(sr):
|
|
col_widths[i] = max(col_widths[i], len(s))
|
|
|
|
if align is None:
|
|
align = ["r"] * len(headers)
|
|
|
|
def fmt_cell(s, w, a):
|
|
return s.rjust(w) if a == "r" else s.ljust(w)
|
|
|
|
sep = "+-" + "-+-".join("-" * w for w in col_widths) + "-+"
|
|
hdr = "| " + " | ".join(fmt_cell(h, col_widths[i], "l") for i, h in enumerate(headers)) + " |"
|
|
lines = [sep, hdr, sep]
|
|
for sr in str_rows:
|
|
line = "| " + " | ".join(fmt_cell(sr[i], col_widths[i], align[i]) for i in range(len(headers))) + " |"
|
|
lines.append(line)
|
|
lines.append(sep)
|
|
return "\n".join(lines)
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 1. PER-CATEGORY CONFUSION MATRIX: HUMAN MAJORITY vs GENAI MAJORITY
# ══════════════════════════════════════════════════════════════════════════════

print("=" * 80)
print("1. CONFUSION MATRIX: Human Majority (rows) vs GenAI Majority (cols)")
print("=" * 80)

# "split" is treated as an extra label so no paragraph is dropped.
cats_plus = CATS + ["split"]
cm = defaultdict(lambda: defaultdict(int))
for r in results:
    cm[r["h_maj"]][r["g_maj"]] += 1

headers = ["H\\G"] + cats_plus + ["Total"]
rows = []
for hc in cats_plus:
    row = [hc]
    total = 0
    for gc in cats_plus:
        # Reading cm[hc][gc] on a defaultdict inserts missing keys; harmless
        # here since we only ever iterate over the fixed cats_plus axes.
        v = cm[hc][gc]
        row.append(v if v else ".")  # "." keeps zero cells visually quiet
        total += v
    row.append(total)
    rows.append(row)

# Column totals
col_totals = ["Total"]
for gc in cats_plus:
    col_totals.append(sum(cm[hc][gc] for hc in cats_plus))
col_totals.append(sum(sum(cm[hc][gc] for gc in cats_plus) for hc in cats_plus))
rows.append(col_totals)

align = ["l"] + ["r"] * (len(headers) - 1)
print(fmt_table(headers, rows, align))

# Diagonal agreement
# NOTE(review): the diagonal includes the split/split cell, i.e. "both
# sides undecided" counts as agreement — confirm that is intended.
diag = sum(cm[c][c] for c in cats_plus)
total_paras = len(results)
print(f"\nDiagonal agreement: {diag}/{total_paras} = {diag/total_paras:.1%}")
print(f"Disagreement: {total_paras - diag}/{total_paras} = {(total_paras - diag)/total_paras:.1%}")

# Over/under prediction
# Row sums = how often humans chose c; column sums = how often GenAI did.
print("\nGenAI over/under-prediction relative to human majority:")
headers2 = ["Category", "Human N", "GenAI N", "Diff", "Direction"]
rows2 = []
for c in CATS:
    h_n = sum(cm[c][gc] for gc in cats_plus)
    g_n = sum(cm[hc][c] for hc in cats_plus)
    diff = g_n - h_n
    direction = "OVER" if diff > 0 else ("UNDER" if diff < 0 else "MATCH")
    rows2.append([c, h_n, g_n, f"{diff:+d}", direction])
align2 = ["l", "r", "r", "r", "l"]
print(fmt_table(headers2, rows2, align2))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 2. DIRECTIONAL DISAGREEMENT ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("2. DIRECTIONAL DISAGREEMENT: Human Majority -> GenAI Majority transitions")
print("=" * 80)

disagree = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != r["g_maj"]]
print(f"\nTotal disagreements: {len(disagree)}/{total_paras}")

trans = Counter(disagree)
print("\nTop transitions (H_maj -> G_maj):")
headers3 = ["From (Human)", "To (GenAI)", "Count", "Reverse", "Net", "Symmetric?"]
rows3 = []
# Each unordered pair {a, b} is reported once, from the direction seen
# first in descending-count order; the opposite direction is folded into
# the "Reverse" column.
seen = set()
for (a, b), cnt in sorted(trans.items(), key=lambda x: -x[1]):
    pair = tuple(sorted([a, b]))
    if pair in seen:
        continue
    seen.add(pair)
    rev = trans.get((b, a), 0)
    net = cnt - rev
    # Heuristic symmetry flag: |net| within 30% of the smaller direction
    # (with a floor of 1) counts as a symmetric confusion.
    sym = "Yes" if abs(net) <= max(1, min(cnt, rev) * 0.3) else "No"
    rows3.append([a, b, cnt, rev, f"{net:+d}", sym])
align3 = ["l", "l", "r", "r", "r", "l"]
print(fmt_table(headers3, rows3, align3))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 3. PER-CATEGORY PRECISION/RECALL (Human majority as truth)
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("3. PER-CATEGORY PRECISION/RECALL (Human majority as ground truth)")
print("=" * 80)

# Filter out splits for clean P/R
valid = [(r["h_maj"], r["g_maj"]) for r in results if r["h_maj"] != "split" and r["g_maj"] != "split"]

headers4 = ["Category", "TP", "FP", "FN", "Precision", "Recall", "F1"]
rows4 = []
for c in CATS:
    # One-vs-rest counts with humans as the reference labels.
    tp = sum(1 for h, g in valid if h == c and g == c)
    fp = sum(1 for h, g in valid if h != c and g == c)
    fn = sum(1 for h, g in valid if h == c and g != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
align4 = ["l", "r", "r", "r", "r", "r", "r"]
print("\nGenAI predictions evaluated against human majority:")
print(fmt_table(headers4, rows4, align4))

# Macro averages
# NOTE(review): these re-parse the 3-decimal display strings, so the macro
# figures carry rounding from the formatting step (≤5e-4 per category).
macro_p = sum(float(r[4]) for r in rows4) / len(CATS)
macro_r = sum(float(r[5]) for r in rows4) / len(CATS)
macro_f1 = sum(float(r[6]) for r in rows4) / len(CATS)
print(f"\nMacro-avg: P={macro_p:.3f} R={macro_r:.3f} F1={macro_f1:.3f}")

# Vice versa: GenAI as truth
# Same computation with the roles swapped (GenAI majority as reference).
print("\n--- Vice versa: Human predictions evaluated against GenAI majority ---")
rows4b = []
for c in CATS:
    tp = sum(1 for h, g in valid if g == c and h == c)
    fp = sum(1 for h, g in valid if g != c and h == c)
    fn = sum(1 for h, g in valid if g == c and h != c)
    prec = tp / (tp + fp) if (tp + fp) > 0 else 0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0
    rows4b.append([c, tp, fp, fn, f"{prec:.3f}", f"{rec:.3f}", f"{f1:.3f}"])
print(fmt_table(headers4, rows4b, align4))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 4. SPECIFICITY SYSTEMATIC BIAS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("4. SPECIFICITY SYSTEMATIC BIAS: Human vs GenAI")
print("=" * 80)

# Overall
# Flatten every individual specificity rating (not per-paragraph means).
all_h_specs = [s for r in results for s in r["h_specs"]]
all_g_specs = [s for r in results for s in r["g_specs"]]
h_avg = mean_spec(all_h_specs)
g_avg = mean_spec(all_g_specs)
print(f"\nOverall mean specificity: Human={h_avg:.3f} GenAI={g_avg:.3f} Diff={g_avg - h_avg:+.3f}")
print(f"Overall median: Human={median_spec(all_h_specs):.1f} GenAI={median_spec(all_g_specs):.1f}")

# Distribution
print("\nSpecificity distribution:")
h_dist = Counter(all_h_specs)
g_dist = Counter(all_g_specs)
headers5 = ["Spec", "Human N", "Human %", "GenAI N", "GenAI %", "Diff %"]
rows5 = []
for s in [1, 2, 3, 4]:
    hn = h_dist.get(s, 0)
    gn = g_dist.get(s, 0)
    hp = hn / len(all_h_specs) * 100
    gp = gn / len(all_g_specs) * 100
    rows5.append([s, hn, f"{hp:.1f}%", gn, f"{gp:.1f}%", f"{gp - hp:+.1f}%"])
print(fmt_table(headers5, rows5, ["r", "r", "r", "r", "r", "r"]))

# By category
print("\nMean specificity by category:")
headers6 = ["Category", "Human", "GenAI", "Diff", "H count", "G count"]
rows6 = []
for c in CATS:
    # Collect the specificity (ann[2]) of every label whose category
    # (ann[1]) is c, across all paragraphs.
    h_s = [s for r in results for ann in r["human_labels"] if ann[1] == c for s in [ann[2]]]
    g_s = [s for r in results for ann in r["genai_labels"] if ann[1] == c for s in [ann[2]]]
    if h_s and g_s:
        hm = mean_spec(h_s)
        gm = mean_spec(g_s)
        rows6.append([c, f"{hm:.3f}", f"{gm:.3f}", f"{gm - hm:+.3f}", len(h_s), len(g_s)])
    else:
        # One side never used this category — no meaningful diff.
        rows6.append([c, "N/A", "N/A", "N/A", len(h_s), len(g_s)])
print(fmt_table(headers6, rows6, ["l", "r", "r", "r", "r", "r"]))

# Per-paragraph directional bias
# "Same" uses a 0.01 tolerance, so the three buckets can overlap slightly
# with the strict > comparisons and need not sum to total_paras.
h_higher = sum(1 for r in results if r["h_mean_spec"] > r["g_mean_spec"])
g_higher = sum(1 for r in results if r["g_mean_spec"] > r["h_mean_spec"])
same = sum(1 for r in results if abs(r["h_mean_spec"] - r["g_mean_spec"]) < 0.01)
print(f"\nPer-paragraph: Human higher spec={h_higher} GenAI higher={g_higher} Same={same}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 5. DIFFICULTY-STRATIFIED ANALYSIS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("5. DIFFICULTY-STRATIFIED ANALYSIS")
print("=" * 80)

# Tiers based on 13-signal agreement
# Tier 1: 10+ agree, Tier 2: 7-9 agree, Tier 3: 5-6 agree, Tier 4: <5 agree
|
|
def get_tier(agree_count: int) -> str:
    """Bucket a 13-signal agreement count into a difficulty-tier label.

    10+ → T1-Easy, 7-9 → T2-Medium, 5-6 → T3-Hard, below 5 → T4-VHard.
    """
    # Thresholds checked highest-first; first floor reached wins.
    for floor, label in ((10, "T1-Easy"), (7, "T2-Medium"), (5, "T3-Hard")):
        if agree_count >= floor:
            return label
    return "T4-VHard"
|
|
|
|
# Tag every paragraph with its difficulty tier (mutates the result dicts).
for r in results:
    r["tier"] = get_tier(r["agree_count"])

tier_counts = Counter(r["tier"] for r in results)
print(f"\nTier distribution:")
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    print(f"  {t}: {tier_counts.get(t, 0)} paragraphs")

print("\nHuman-GenAI category agreement rate by difficulty tier:")
headers7 = ["Tier", "N", "Agree", "Agree%", "H=consensus%", "G=consensus%"]
rows7 = []
for t in ["T1-Easy", "T2-Medium", "T3-Hard", "T4-VHard"]:
    tier_r = [r for r in results if r["tier"] == t]
    n = len(tier_r)
    if n == 0:
        continue  # skip empty tiers rather than divide by zero
    agree = sum(1 for r in tier_r if r["h_maj"] == r["g_maj"])
    # How often each side's majority matches the pooled 13-signal majority.
    h_match_cons = sum(1 for r in tier_r if r["h_maj"] == r["all_maj"])
    g_match_cons = sum(1 for r in tier_r if r["g_maj"] == r["all_maj"])
    rows7.append([
        t, n, agree, f"{agree/n:.1%}",
        f"{h_match_cons/n:.1%}", f"{g_match_cons/n:.1%}"
    ])
print(fmt_table(headers7, rows7, ["l", "r", "r", "r", "r", "r"]))

# On hard paragraphs, who is the odd one out?
# The four counters partition the hard set by which side matches consensus.
print("\nOn hard paragraphs (T3+T4), disagreement breakdown:")
hard = [r for r in results if r["tier"] in ("T3-Hard", "T4-VHard")]
h_odd = sum(1 for r in hard if r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"])
g_odd = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"])
both_off = sum(1 for r in hard if r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"])
both_on = sum(1 for r in hard if r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"])
print(f"  Human is odd-one-out (GenAI=consensus, Human!=consensus): {h_odd}")
print(f"  GenAI is odd-one-out (Human=consensus, GenAI!=consensus): {g_odd}")
print(f"  Both match consensus: {both_on}")
print(f"  Both differ from consensus: {both_off}")
print(f"  Total hard: {len(hard)}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 6. ANNOTATOR-LEVEL PATTERNS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("6. ANNOTATOR-LEVEL PATTERNS")
print("=" * 80)

annotators = ["Anuj", "Elisabeth", "Joey", "Meghan", "Xander", "Aaryan"]

# For each annotator, compute agreement with GenAI majority
print("\nPer-annotator agreement with GenAI majority (category):")
headers8 = ["Annotator", "N labels", "Agree w/G_maj", "Agree%", "Agree w/13_maj", "13_maj%", "Avg Spec", "Note"]
rows8 = []
for ann in annotators:
    agree_g = 0
    agree_all = 0
    total = 0
    specs = []
    # Scan every paragraph for this annotator's labels (an annotator labels
    # only a subset of paragraphs under the BIBD design mentioned in the
    # module docstring).
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                total += 1
                specs.append(spec)
                if cat == r["g_maj"]:
                    agree_g += 1
                if cat == r["all_maj"]:
                    agree_all += 1
    if total == 0:
        continue  # annotator not present in the holdout labels
    note = "(excluded from aggregates)" if ann == "Aaryan" else ""
    rows8.append([
        ann, total,
        agree_g, f"{agree_g/total:.1%}",
        agree_all, f"{agree_all/total:.1%}",
        f"{mean_spec(specs):.2f}",
        note,
    ])
align8 = ["l", "r", "r", "r", "r", "r", "r", "l"]
print(fmt_table(headers8, rows8, align8))

# Annotator category distributions
print("\nPer-annotator category distribution:")
for ann in annotators:
    cat_counts = Counter()
    for r in results:
        for name, cat, spec in r["human_labels"]:
            if name == ann:
                cat_counts[cat] += 1
    if not cat_counts:
        continue
    total = sum(cat_counts.values())
    dist = "  ".join(f"{c}:{cat_counts.get(c, 0):3d}({cat_counts.get(c, 0)/total:.0%})" for c in CATS)
    flag = "  ** OUTLIER" if ann == "Aaryan" else ""
    print(f"  {ann:10s} (n={total:3d}): {dist}{flag}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 7. TEXT-FEATURE CORRELATIONS
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("7. TEXT-FEATURE CORRELATIONS WITH DISAGREEMENT")
print("=" * 80)

agree_r = [r for r in results if r["h_maj"] == r["g_maj"]]
disagree_r = [r for r in results if r["h_maj"] != r["g_maj"]]

# Word count
# mean_spec/median_spec are reused here as generic mean/median helpers on
# word counts, despite the "spec" naming.
# NOTE(review): median_spec raises IndexError on an empty list — this
# assumes both the agree and disagree groups are non-empty.
agree_wc = [r["word_count"] for r in agree_r if r["word_count"] > 0]
disagree_wc = [r["word_count"] for r in disagree_r if r["word_count"] > 0]
print(f"\nWord count (agree vs disagree):")
print(f"  Agreement paragraphs: mean={mean_spec(agree_wc):.1f} median={median_spec(agree_wc):.0f} n={len(agree_wc)}")
print(f"  Disagreement paragraphs: mean={mean_spec(disagree_wc):.1f} median={median_spec(disagree_wc):.0f} n={len(disagree_wc)}")

# Word count buckets
print("\nDisagreement rate by word count bucket:")
buckets = [(0, 30, "0-30"), (31, 60, "31-60"), (61, 100, "61-100"), (101, 150, "101-150"), (151, 250, "151-250"), (251, 9999, "251+")]
headers9 = ["WC Bucket", "N", "Disagree", "Disagree%"]
rows9 = []
for lo, hi, label in buckets:
    in_bucket = [r for r in results if lo <= r["word_count"] <= hi]
    dis = sum(1 for r in in_bucket if r["h_maj"] != r["g_maj"])
    if in_bucket:
        rows9.append([label, len(in_bucket), dis, f"{dis/len(in_bucket):.1%}"])
print(fmt_table(headers9, rows9, ["l", "r", "r", "r"]))

# Stage1 method (unanimous vs majority) as proxy for quality tier
print("\nDisagreement rate by Stage 1 confidence method:")
for method in ["unanimous", "majority"]:
    in_method = [r for r in results if para_meta.get(r["pid"], {}).get("stage1Method") == method]
    dis = sum(1 for r in in_method if r["h_maj"] != r["g_maj"])
    if in_method:
        print(f"  {method:10s}: {dis}/{len(in_method)} = {dis/len(in_method):.1%} disagree")

# Keyword analysis
# Case-insensitive substring match against the paragraph text; a paragraph
# can count toward several keywords.
print("\nDisagreement rate for paragraphs containing key terms:")
keywords = ["material", "NIST", "CISO", "board", "third party", "third-party", "incident",
            "insurance", "audit", "framework", "breach", "ransomware"]
headers10 = ["Keyword", "N", "Disagree", "Disagree%"]
rows10 = []
for kw in keywords:
    matching = [r for r in results if kw.lower() in r["text"].lower()]
    if not matching:
        continue
    dis = sum(1 for r in matching if r["h_maj"] != r["g_maj"])
    rows10.append([kw, len(matching), dis, f"{dis/len(matching):.1%}"])
# Sort by raw disagreement count, descending (x[2] is already an int).
rows10.sort(key=lambda x: -int(x[2]))
print(fmt_table(headers10, rows10, ["l", "r", "r", "r"]))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 8. "HUMAN RIGHT, GenAI WRONG" vs "GenAI RIGHT, HUMAN WRONG"
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("8. HUMAN RIGHT/GENAI WRONG vs GENAI RIGHT/HUMAN WRONG (13-signal consensus)")
print("=" * 80)

# Only consider paragraphs where all_maj is not split and h/g disagree with each other or consensus
# "Right"/"wrong" here means matching or missing the pooled 13-signal
# majority; the five lists partition all paragraphs.
h_right_g_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] != r["all_maj"]]
g_right_h_wrong = [r for r in results if r["all_maj"] != "split" and r["g_maj"] == r["all_maj"] and r["h_maj"] != r["all_maj"]]
both_right = [r for r in results if r["all_maj"] != "split" and r["h_maj"] == r["all_maj"] and r["g_maj"] == r["all_maj"]]
both_wrong = [r for r in results if r["all_maj"] != "split" and r["h_maj"] != r["all_maj"] and r["g_maj"] != r["all_maj"]]
has_split = [r for r in results if r["all_maj"] == "split"]

print(f"\n  Both correct: {len(both_right)}")
print(f"  Human right, GenAI wrong: {len(h_right_g_wrong)}")
print(f"  GenAI right, Human wrong: {len(g_right_h_wrong)}")
print(f"  Both wrong: {len(both_wrong)}")
print(f"  13-signal split (no consensus): {len(has_split)}")

# Category breakdown
# Which consensus categories the GenAI side misses.
print("\nCategory breakdown of 'Human right, GenAI wrong':")
cat_dist_hrg = Counter(r["all_maj"] for r in h_right_g_wrong)
for c in CATS:
    n = cat_dist_hrg.get(c, 0)
    if n > 0:
        print(f"  {c}: {n}")

# Which consensus categories the human side misses.
print("\nCategory breakdown of 'GenAI right, Human wrong':")
cat_dist_grh = Counter(r["all_maj"] for r in g_right_h_wrong)
for c in CATS:
    n = cat_dist_grh.get(c, 0)
    if n > 0:
        print(f"  {c}: {n}")

# What did the wrong side predict?
print("\nWhen GenAI is wrong, what does it predict instead?")
wrong_g = Counter(r["g_maj"] for r in h_right_g_wrong)
for label, cnt in wrong_g.most_common():
    print(f"  {label}: {cnt}")

print("\nWhen Human is wrong, what do they predict instead?")
wrong_h = Counter(r["h_maj"] for r in g_right_h_wrong)
for label, cnt in wrong_h.most_common():
    print(f"  {label}: {cnt}")
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# 9. SPECIFICITY BY SOURCE TYPE
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("9. SPECIFICITY BY SOURCE TYPE AND CATEGORY")
print("=" * 80)

# Group models into source types
stage1_models = {"gemini-3.1-flash-lite-preview", "grok-4.1-fast", "mimo-v2-flash"}
frontier_models = {"opus-4.6", "gpt-5.4", "gemini-3.1-pro-preview", "kimi-k2.5"}
budget_models = {"glm-5:exacto", "mimo-v2-pro:exacto", "minimax-m2.7:exacto"}

# Collect specs by source type and category
# Each inner dict maps category → flat list of specificity ratings; the
# synthetic "ALL" key aggregates across categories.
source_specs: dict[str, dict[str, list[int]]] = {
    "Human": defaultdict(list),
    "Stage1": defaultdict(list),
    "Frontier": defaultdict(list),
    "Budget": defaultdict(list),
}

for r in results:
    for name, cat, spec in r["human_labels"]:
        source_specs["Human"][cat].append(spec)
        source_specs["Human"]["ALL"].append(spec)

    for model, cat, spec in r["genai_labels"]:
        if model in stage1_models:
            src = "Stage1"
        elif model in frontier_models:
            src = "Frontier"
        elif model in budget_models:
            src = "Budget"
        else:
            # Unrecognized model names are silently bucketed as Budget.
            src = "Budget"  # fallback
        source_specs[src][cat].append(spec)
        source_specs[src]["ALL"].append(spec)

print("\nMean specificity by source type and category:")
src_order = ["Human", "Stage1", "Frontier", "Budget"]
headers11 = ["Category"] + src_order
rows11 = []
for c in CATS + ["ALL"]:
    row = [c]
    for src in src_order:
        specs = source_specs[src].get(c, [])
        if specs:
            row.append(f"{mean_spec(specs):.3f}")
        else:
            row.append("N/A")
    rows11.append(row)
align11 = ["l"] + ["r"] * len(src_order)
print(fmt_table(headers11, rows11, align11))

# Specificity standard deviation by source
print("\nSpecificity std dev by source type:")
# NOTE(review): mid-file import; conventionally this belongs with the
# imports at the top of the module.
import math
for src in src_order:
    specs = source_specs[src]["ALL"]
    if specs:
        m = mean_spec(specs)
        # Population variance (divides by n, not n-1).
        var = sum((s - m) ** 2 for s in specs) / len(specs)
        std = math.sqrt(var)
        print(f"  {src:10s}: mean={m:.3f} std={std:.3f} n={len(specs)}")

# ── Per-model specificity rankings ───────────────────────────────────────────
# "H:" / "G:" prefixes keep human annotators and GenAI models distinct in
# one ranking table, sorted by ascending mean specificity.
print("\nPer-model mean specificity (all categories):")
model_specs: dict[str, list[int]] = defaultdict(list)
for r in results:
    for name, cat, spec in r["human_labels"]:
        model_specs[f"H:{name}"].append(spec)
    for model, cat, spec in r["genai_labels"]:
        model_specs[f"G:{model}"].append(spec)

headers12 = ["Model", "Mean Spec", "N"]
rows12 = []
for model, specs in sorted(model_specs.items(), key=lambda x: mean_spec(x[1])):
    rows12.append([model, f"{mean_spec(specs):.3f}", len(specs)])
print(fmt_table(headers12, rows12, ["l", "r", "r"]))
|
|
|
|
|
|
# ══════════════════════════════════════════════════════════════════════════════
# SUMMARY
# ══════════════════════════════════════════════════════════════════════════════

print("\n" + "=" * 80)
print("SUMMARY OF KEY FINDINGS")
print("=" * 80)

# Narrative recap interpolating headline numbers computed in sections 1-8;
# this block must therefore run after all of them.
print(f"""
Dataset: {total_paras} paragraphs, 13 signals each (3 human, 10 GenAI)

1. CATEGORY AGREEMENT: Human majority and GenAI majority agree on {diag/total_paras:.1%} of
   paragraphs. The biggest confusions are in the off-diagonal cells above.

2. DIRECTIONAL DISAGREEMENTS: The most common category swaps reveal systematic
   differences in how humans and GenAI interpret boundary cases.

3. PRECISION/RECALL: GenAI macro F1={macro_f1:.3f} against human majority.

4. SPECIFICITY BIAS: Human mean={h_avg:.3f}, GenAI mean={g_avg:.3f}
   (diff={g_avg - h_avg:+.3f}). {"GenAI rates higher" if g_avg > h_avg else "Humans rate higher"} on average.

5. DIFFICULTY: On easy paragraphs (T1, 10+/13 agree), agreement is very high.
   On hard paragraphs, {"humans" if h_odd > g_odd else "GenAI"} are more often the odd-one-out.

6. ANNOTATORS: See table above for individual alignment with GenAI and consensus.

7. TEXT FEATURES: {"Longer" if mean_spec(disagree_wc) > mean_spec(agree_wc) else "Shorter"} paragraphs
   tend to produce more disagreement.

8. RIGHT/WRONG: Human right & GenAI wrong: {len(h_right_g_wrong)}, GenAI right &
   Human wrong: {len(g_right_h_wrong)}. {"Humans are more often right" if len(h_right_g_wrong) > len(g_right_h_wrong) else "GenAI is more often right"} when they disagree.
""")
|