/**
 * Diff original vs re-run annotations for orphan-word paragraphs.
 *
 * Compares stage1.jsonl (original) against stage1-orphan-rerun.jsonl (patched text)
 * to measure label changes, bias correction, and conflict resolution.
 *
 * Diagnostics about how many paragraphs were loaded go to stderr; the report
 * itself goes to stdout.
 *
 * Usage: bun ts/scripts/diff-orphan-annotations.ts
 */
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// fileURLToPath (rather than URL.pathname) yields a real filesystem path:
// pathname stays percent-encoded (e.g. a space becomes "%20") and is malformed
// on Windows.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;
const PATCHES_PATH = `${DATA_DIR}/paragraphs/patches/orphan-word-patches.jsonl`;

/**
 * Minimum number of agreeing annotations for a field to count as having a
 * majority. NOTE(review): the value 2 presumably assumes three annotators per
 * paragraph — confirm before running on a different annotator count.
 */
const MIN_MAJORITY_VOTES = 2;

/** Specificity levels that are always listed in the report, even when unobserved. */
const BASE_SPECIFICITY_LEVELS = [1, 2, 3, 4];

/** The subset of an annotation record's fields that this script reads. */
interface Annotation {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
  };
  provenance: {
    modelId: string;
  };
}

/**
 * Reads a JSONL file and parses every non-blank line.
 *
 * @param path - File to read (UTF-8).
 * @returns One parsed value per non-blank line, in file order.
 * @throws SyntaxError naming the file and 1-based line number when a line is not valid JSON.
 */
function readJsonl(path: string): unknown[] {
  const records: unknown[] = [];
  const lines = readFileSync(path, "utf-8").split("\n");
  for (let i = 0; i < lines.length; i++) {
    const line = lines[i] ?? "";
    if (!line.trim()) continue;
    try {
      records.push(JSON.parse(line));
    } catch (err: unknown) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${i + 1}: invalid JSON (${reason})`);
    }
  }
  return records;
}

/**
 * Loads annotations from a JSONL file, grouped by paragraph id.
 *
 * Records are not schema-validated; each parsed line is assumed to have the
 * `Annotation` shape.
 *
 * @param path - JSONL file with one annotation per line.
 * @returns Map from `paragraphId` to that paragraph's annotations, in file order.
 */
function loadAnnotations(path: string): Map<string, Annotation[]> {
  const byParagraph = new Map<string, Annotation[]>();
  for (const record of readJsonl(path)) {
    const ann = record as Annotation;
    const group = byParagraph.get(ann.paragraphId);
    if (group) {
      group.push(ann);
    } else {
      byParagraph.set(ann.paragraphId, [ann]);
    }
  }
  return byParagraph;
}

/**
 * Finds the most frequent value of one label field across a set of annotations.
 *
 * Ties are broken in favour of the value that appears first in `annotations`.
 * For an empty list the result is `{ value: "", unanimous: true, count: 0 }`.
 *
 * @param annotations - Annotations of a single paragraph.
 * @param field - Which label field to vote on.
 * @returns The winning value, whether every annotation agreed on it, and how many voted for it.
 */
function majorityVote(
  annotations: Annotation[],
  field: "content_category" | "specificity_level",
): { value: string | number; unanimous: boolean; count: number } {
  const counts = new Map<string | number, number>();
  for (const ann of annotations) {
    const v = ann.label[field];
    counts.set(v, (counts.get(v) ?? 0) + 1);
  }

  let best: string | number = "";
  let bestCount = 0;
  for (const [v, c] of counts) {
    // Strict ">" keeps the first-seen value on ties.
    if (c > bestCount) {
      best = v;
      bestCount = c;
    }
  }
  return { value: best, unanimous: bestCount === annotations.length, count: bestCount };
}

/** Increments the counter stored under `key`, starting from zero. */
function bump<K>(counter: Map<K, number>, key: K): void {
  counter.set(key, (counter.get(key) ?? 0) + 1);
}

/** Formats `count / total` as a percentage with one decimal; "0.0" when `total` is 0 (avoids NaN). */
function pct(count: number, total: number): string {
  return total === 0 ? "0.0" : ((count / total) * 100).toFixed(1);
}

/** Formats a delta with an explicit "+" for positive values. */
function signed(delta: number): string {
  return (delta > 0 ? "+" : "") + delta;
}

/**
 * Prints one "model: changes (percent)" line per model, most changes first.
 *
 * @param changes - Number of changed annotations per model id.
 * @param compared - Number of original/re-run annotation pairs compared per model id.
 * @param fallbackTotal - Denominator used if a model has no entry in `compared`.
 */
function printPerModel(changes: Map<string, number>, compared: Map<string, number>, fallbackTotal: number): void {
  const ranked = [...changes.entries()].sort((a, b) => b[1] - a[1]);
  for (const [model, count] of ranked) {
    // Model ids look like "provider/name"; show only the part after the first slash when present.
    const short = model.split("/")[1] ?? model;
    const total = compared.get(model) ?? fallbackTotal;
    console.log(` ${short}: ${count} (${pct(count, total)}%)`);
  }
}

// ── Main ────────────────────────────────────────────────────────────────

const patchIds = new Set<string>();
for (const record of readJsonl(PATCHES_PATH)) {
  patchIds.add((record as { id: string }).id);
}

const origAll = loadAnnotations(ORIG_PATH);
const rerunAll = loadAnnotations(RERUN_PATH);

// Filter original annotations to only orphan-word paragraphs
const origFiltered = new Map<string, Annotation[]>();
for (const [pid, anns] of origAll) {
  if (patchIds.has(pid)) origFiltered.set(pid, anns);
}

console.error(`Orphan-word paragraphs: ${patchIds.size}`);
console.error(`Original annotations found: ${origFiltered.size} paragraphs`);
console.error(`Re-run annotations found: ${rerunAll.size} paragraphs`);

// Compare paragraphs that have BOTH original and re-run annotations
const comparable = [...rerunAll.keys()].filter((pid) => origFiltered.has(pid));
console.error(`Comparable paragraphs: ${comparable.length}\n`);

// Track changes
let catChanged = 0;
let specChanged = 0;
let eitherChanged = 0;

// Per-model changes, plus how many annotation pairs were compared per model
// (the denominator for the per-model percentages).
const perModelCatChanges = new Map<string, number>();
const perModelSpecChanges = new Map<string, number>();
const perModelCompared = new Map<string, number>();

// Category transition matrix
const catTransitions = new Map<string, Map<string, number>>();

// Consensus changes
let origConflicts = 0;
let rerunConflicts = 0;
let conflictsResolved = 0;
let consensusBroken = 0;

// Category distribution
const origCatDist = new Map<string, number>();
const rerunCatDist = new Map<string, number>();

// Specificity distribution
const origSpecDist = new Map<number, number>();
const rerunSpecDist = new Map<number, number>();

for (const pid of comparable) {
  const origAnns = origFiltered.get(pid);
  const rerunAnns = rerunAll.get(pid);
  // Both lookups succeed by construction of `comparable`; the guard only narrows the types.
  if (!origAnns || !rerunAnns) continue;

  // Per-model comparison
  for (const rerunAnn of rerunAnns) {
    const modelId = rerunAnn.provenance.modelId;
    const origAnn = origAnns.find((a) => a.provenance.modelId === modelId);
    if (!origAnn) continue;
    bump(perModelCompared, modelId);

    if (origAnn.label.content_category !== rerunAnn.label.content_category) {
      bump(perModelCatChanges, modelId);
      // Track transition
      const from = origAnn.label.content_category;
      const to = rerunAnn.label.content_category;
      let row = catTransitions.get(from);
      if (!row) {
        row = new Map<string, number>();
        catTransitions.set(from, row);
      }
      bump(row, to);
    }
    if (origAnn.label.specificity_level !== rerunAnn.label.specificity_level) {
      bump(perModelSpecChanges, modelId);
    }
  }

  // Consensus comparison (majority vote)
  const origCatVote = majorityVote(origAnns, "content_category");
  const rerunCatVote = majorityVote(rerunAnns, "content_category");
  const origSpecVote = majorityVote(origAnns, "specificity_level");
  const rerunSpecVote = majorityVote(rerunAnns, "specificity_level");

  bump(origCatDist, origCatVote.value as string);
  bump(rerunCatDist, rerunCatVote.value as string);
  bump(origSpecDist, origSpecVote.value as number);
  bump(rerunSpecDist, rerunSpecVote.value as number);

  const catDiffers = origCatVote.value !== rerunCatVote.value;
  const specDiffers = origSpecVote.value !== rerunSpecVote.value;
  if (catDiffers) catChanged++;
  if (specDiffers) specChanged++;
  if (catDiffers || specDiffers) eitherChanged++;

  // Conflict tracking (no majority = conflict)
  const origHasConflict = origCatVote.count < MIN_MAJORITY_VOTES || origSpecVote.count < MIN_MAJORITY_VOTES;
  const rerunHasConflict = rerunCatVote.count < MIN_MAJORITY_VOTES || rerunSpecVote.count < MIN_MAJORITY_VOTES;
  if (origHasConflict) origConflicts++;
  if (rerunHasConflict) rerunConflicts++;
  if (origHasConflict && !rerunHasConflict) conflictsResolved++;
  if (!origHasConflict && rerunHasConflict) consensusBroken++;
}

// ── Report ──────────────────────────────────────────────────────────────

const total = comparable.length;

console.log("═══ ORPHAN WORD RE-ANNOTATION DIFF REPORT ═══\n");
console.log(`Paragraphs compared: ${total}`);
console.log(` Category consensus changed: ${catChanged} (${pct(catChanged, total)}%)`);
console.log(` Specificity consensus changed: ${specChanged} (${pct(specChanged, total)}%)`);
console.log(` Either dimension changed: ${eitherChanged} (${pct(eitherChanged, total)}%)`);

console.log(`\n─── Per-Model Category Changes ───`);
printPerModel(perModelCatChanges, perModelCompared, total);

console.log(`\n─── Per-Model Specificity Changes ───`);
printPerModel(perModelSpecChanges, perModelCompared, total);

console.log(`\n─── Conflict Resolution ───`);
console.log(` Original conflicts: ${origConflicts}`);
console.log(` Re-run conflicts: ${rerunConflicts}`);
console.log(` Conflicts resolved (orig conflict → rerun consensus): ${conflictsResolved}`);
console.log(` Consensus broken (orig consensus → rerun conflict): ${consensusBroken}`);
// A net reduction in conflicts is shown with "-", anything else (including zero) with "+".
const netResolved = conflictsResolved - consensusBroken;
console.log(` Net conflict change: ${netResolved > 0 ? "-" : "+"}${Math.abs(netResolved)}`);

console.log(`\n─── Category Distribution (Consensus) ───`);
console.log(` ${"Category".padEnd(30)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
const allCats = new Set([...origCatDist.keys(), ...rerunCatDist.keys()]);
for (const cat of [...allCats].sort()) {
  const orig = origCatDist.get(cat) ?? 0;
  const rerun = rerunCatDist.get(cat) ?? 0;
  console.log(` ${cat.padEnd(30)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Specificity Distribution (Consensus) ───`);
console.log(` ${"Level".padEnd(10)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
// Always list the base levels, and also any other level that actually occurred,
// so unexpected values are visible instead of silently dropped.
const allLevels = new Set([...BASE_SPECIFICITY_LEVELS, ...origSpecDist.keys(), ...rerunSpecDist.keys()]);
for (const level of [...allLevels].sort((a, b) => a - b)) {
  const orig = origSpecDist.get(level) ?? 0;
  const rerun = rerunSpecDist.get(level) ?? 0;
  console.log(` ${String(level).padEnd(10)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Top Category Transitions ───`);
const transitions: [string, string, number][] = [];
for (const [from, tos] of catTransitions) {
  for (const [to, count] of tos) {
    transitions.push([from, to, count]);
  }
}
transitions.sort((a, b) => b[2] - a[2]);
for (const [from, to, count] of transitions.slice(0, 15)) {
  console.log(` ${from} → ${to}: ${count}`);
}