/**
 * Diff original vs re-run annotations for orphan-word paragraphs.
 *
 * Compares stage1.jsonl (original) against stage1-orphan-rerun.jsonl (patched text)
 * to measure label changes, bias correction, and conflict resolution.
 *
 * Usage: bun ts/scripts/diff-orphan-annotations.ts
 */
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// Filesystem location of the `data` directory, resolved relative to this module.
// `fileURLToPath` is used instead of `URL.pathname` because `pathname` stays
// percent-encoded (a space in the checkout path becomes `%20`) and is not a
// valid path on Windows.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));

// Original stage-1 annotations (JSONL).
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;

// Stage-1 annotations re-run on the patched paragraph text (JSONL).
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;

// Orphan-word patch records (JSONL); each record's `id` names a patched paragraph.
const PATCHES_PATH = `${DATA_DIR}/paragraphs/patches/orphan-word-patches.jsonl`;

/**
 * One annotation record, as parsed from a single line of the annotation
 * JSONL files. Only the fields this script reads are declared.
 */
interface Annotation {
  /** Identifier of the annotated paragraph; annotations are grouped by this key. */
  paragraphId: string;
  /** Labels assigned to the paragraph by one model. */
  label: {
    /** Content category label; compared by strict equality between runs. */
    content_category: string;
    /** Numeric specificity level; compared by strict equality between runs. */
    specificity_level: number;
    /** Confidence attached to the category label (not read by this script). */
    category_confidence: string;
    /** Confidence attached to the specificity label (not read by this script). */
    specificity_confidence: string;
  };
  /** Where the annotation came from. */
  provenance: {
    /** Model identifier; used to pair an original annotation with its re-run. */
    modelId: string;
  };
}

/**
 * Reads a JSONL annotation file and groups its records by paragraph.
 *
 * Blank (whitespace-only) lines are skipped. Records are NOT schema-validated:
 * each parsed line is trusted to have the `Annotation` shape.
 *
 * @param path - Path of the JSONL file to read (UTF-8).
 * @returns Map from `paragraphId` to that paragraph's annotations, in file order.
 * @throws SyntaxError if a non-blank line is not valid JSON; the message names
 *   the file and the 1-based line number.
 */
function loadAnnotations(path: string): Map<string, Annotation[]> {
  const byParagraph = new Map<string, Annotation[]>();
  const lines = readFileSync(path, "utf-8").split("\n");

  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;

    let ann: Annotation;
    try {
      ann = JSON.parse(line) as Annotation;
    } catch (err: unknown) {
      // Re-throw as the same error type, but say where the bad record is.
      const detail = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${path}:${index + 1}: invalid JSON (${detail})`);
    }

    let group = byParagraph.get(ann.paragraphId);
    if (!group) {
      group = [];
      byParagraph.set(ann.paragraphId, group);
    }
    group.push(ann);
  }

  return byParagraph;
}

/**
 * Finds the most frequent value of one label field across a set of annotations.
 *
 * This is a plurality vote: the winner needs the highest count, not more than
 * half. Ties go to the value that appears first in `annotations`.
 *
 * @param annotations - Annotations to tally (typically all for one paragraph).
 * @param field - Which label field to vote on.
 * @returns `value` is the winning value (`""` for empty input), `count` is how
 *   many annotations carry it, and `unanimous` is true only when the input is
 *   non-empty and every annotation carries the winning value.
 */
function majorityVote(
  annotations: Annotation[],
  field: "content_category" | "specificity_level",
): { value: string | number; unanimous: boolean; count: number } {
  const counts = new Map<string | number, number>();
  for (const ann of annotations) {
    const v = ann.label[field];
    counts.set(v, (counts.get(v) ?? 0) + 1);
  }

  let best: string | number = "";
  let bestCount = 0;
  for (const [v, c] of counts) {
    // Strict `>` keeps the earliest-seen value on ties (Map iterates in insertion order).
    if (c > bestCount) {
      best = v;
      bestCount = c;
    }
  }

  return {
    value: best,
    // Without the length guard an empty input would report 0 === 0 as unanimous.
    unanimous: annotations.length > 0 && bestCount === annotations.length,
    count: bestCount,
  };
}

// ── Main ────────────────────────────────────────────────────────────────

// IDs of the paragraphs named in the orphan-word patch file (one JSON object
// per non-blank line; only its `id` field is read).
const patchIds = new Set<string>();
for (const line of readFileSync(PATCHES_PATH, "utf-8").split("\n")) {
  if (!line.trim()) continue;
  patchIds.add((JSON.parse(line) as { id: string }).id);
}

const origAll = loadAnnotations(ORIG_PATH);
const rerunAll = loadAnnotations(RERUN_PATH);

// Filter original annotations to only orphan-word paragraphs
const origFiltered = new Map<string, Annotation[]>();
for (const [pid, anns] of origAll) {
  if (patchIds.has(pid)) origFiltered.set(pid, anns);
}

// Progress/diagnostic counts go to stderr so stdout carries only the report.
console.error(`Orphan-word paragraphs: ${patchIds.size}`);
console.error(`Original annotations found: ${origFiltered.size} paragraphs`);
console.error(`Re-run annotations found: ${rerunAll.size} paragraphs`);

// Compare paragraphs that have BOTH original and re-run annotations
const comparable = [...rerunAll.keys()].filter((pid) => origFiltered.has(pid));
console.error(`Comparable paragraphs: ${comparable.length}\n`);

|
// Track changes: paragraphs whose consensus (plurality-vote) label differs
// between the original run and the re-run.
let catChanged = 0;
let specChanged = 0;
let eitherChanged = 0;

// Per-model changes, keyed by provenance.modelId
const perModelCatChanges = new Map<string, number>();
const perModelSpecChanges = new Map<string, number>();
// How many (paragraph, model) pairs were actually compared for each model.
// Used as the percentage denominator: a model missing from either run for some
// paragraphs must not be divided by the full paragraph count.
const perModelCompared = new Map<string, number>();

// Category transition matrix: original category → (re-run category → count),
// counted per model annotation, not per consensus.
const catTransitions = new Map<string, Map<string, number>>();

// Consensus changes
let origConflicts = 0;
let rerunConflicts = 0;
let conflictsResolved = 0;
let consensusBroken = 0;

// Category distribution (of consensus labels)
const origCatDist = new Map<string, number>();
const rerunCatDist = new Map<string, number>();

// Specificity distribution (of consensus labels)
const origSpecDist = new Map<number, number>();
const rerunSpecDist = new Map<number, number>();

/** Adds one to the counter stored under `key`, starting from zero. */
function bump<K>(counter: Map<K, number>, key: K): void {
  counter.set(key, (counter.get(key) ?? 0) + 1);
}

/** Formats `part / total` as a one-decimal percentage, or "n/a" when `total` is 0. */
function formatPct(part: number, total: number): string {
  return total === 0 ? "n/a" : `${((part / total) * 100).toFixed(1)}%`;
}

/** Renders a delta with an explicit "+" for positive values. */
function signed(delta: number): string {
  return (delta > 0 ? "+" : "") + delta;
}

/** Prints one "short model name: changes (percent of that model's compared pairs)" line per model. */
function printPerModelChanges(changes: Map<string, number>): void {
  for (const [model, count] of [...changes.entries()].sort((a, b) => b[1] - a[1])) {
    // Drop the "vendor/" prefix when the id has one.
    const short = model.split("/")[1] ?? model;
    console.log(` ${short}: ${count} (${formatPct(count, perModelCompared.get(model) ?? 0)})`);
  }
}

for (const pid of comparable) {
  const origAnns = origFiltered.get(pid);
  const rerunAnns = rerunAll.get(pid);
  // `comparable` only holds ids present in both maps; this guard just narrows the types.
  if (!origAnns || !rerunAnns) continue;

  // Per-model comparison: pair each re-run annotation with the first original
  // annotation from the same model, skipping models absent from the original.
  for (const rerunAnn of rerunAnns) {
    const modelId = rerunAnn.provenance.modelId;
    const origAnn = origAnns.find((a) => a.provenance.modelId === modelId);
    if (!origAnn) continue;
    bump(perModelCompared, modelId);

    const from = origAnn.label.content_category;
    const to = rerunAnn.label.content_category;
    if (from !== to) {
      bump(perModelCatChanges, modelId);

      // Track transition
      let row = catTransitions.get(from);
      if (!row) {
        row = new Map<string, number>();
        catTransitions.set(from, row);
      }
      bump(row, to);
    }
    if (origAnn.label.specificity_level !== rerunAnn.label.specificity_level) {
      bump(perModelSpecChanges, modelId);
    }
  }

  // Consensus comparison (majority vote)
  const origCatVote = majorityVote(origAnns, "content_category");
  const rerunCatVote = majorityVote(rerunAnns, "content_category");
  const origSpecVote = majorityVote(origAnns, "specificity_level");
  const rerunSpecVote = majorityVote(rerunAnns, "specificity_level");

  bump(origCatDist, origCatVote.value as string);
  bump(rerunCatDist, rerunCatVote.value as string);
  bump(origSpecDist, origSpecVote.value as number);
  bump(rerunSpecDist, rerunSpecVote.value as number);

  const catDiffers = origCatVote.value !== rerunCatVote.value;
  const specDiffers = origSpecVote.value !== rerunSpecVote.value;
  if (catDiffers) catChanged++;
  if (specDiffers) specChanged++;
  if (catDiffers || specDiffers) eitherChanged++;

  // Conflict tracking (no majority = conflict): a dimension is in conflict
  // when no value is shared by at least two annotations.
  const origHasConflict = origCatVote.count < 2 || origSpecVote.count < 2;
  const rerunHasConflict = rerunCatVote.count < 2 || rerunSpecVote.count < 2;
  if (origHasConflict) origConflicts++;
  if (rerunHasConflict) rerunConflicts++;
  if (origHasConflict && !rerunHasConflict) conflictsResolved++;
  if (!origHasConflict && rerunHasConflict) consensusBroken++;
}

// ── Report ──────────────────────────────────────────────────────────────
console.log("═══ ORPHAN WORD RE-ANNOTATION DIFF REPORT ═══\n");

const totalCompared = comparable.length;
console.log(`Paragraphs compared: ${totalCompared}`);
console.log(` Category consensus changed: ${catChanged} (${formatPct(catChanged, totalCompared)})`);
console.log(` Specificity consensus changed: ${specChanged} (${formatPct(specChanged, totalCompared)})`);
console.log(` Either dimension changed: ${eitherChanged} (${formatPct(eitherChanged, totalCompared)})`);

console.log(`\n─── Per-Model Category Changes ───`);
printPerModelChanges(perModelCatChanges);

console.log(`\n─── Per-Model Specificity Changes ───`);
printPerModelChanges(perModelSpecChanges);

console.log(`\n─── Conflict Resolution ───`);
console.log(` Original conflicts: ${origConflicts}`);
console.log(` Re-run conflicts: ${rerunConflicts}`);
console.log(` Conflicts resolved (orig conflict → rerun consensus): ${conflictsResolved}`);
console.log(` Consensus broken (orig consensus → rerun conflict): ${consensusBroken}`);
// More resolved than broken means fewer conflicts overall, hence the "-" sign.
const netResolved = conflictsResolved - consensusBroken;
console.log(` Net conflict change: ${netResolved > 0 ? "-" : "+"}${Math.abs(netResolved)}`);

console.log(`\n─── Category Distribution (Consensus) ───`);
console.log(` ${"Category".padEnd(30)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
const allCats = new Set([...origCatDist.keys(), ...rerunCatDist.keys()]);
for (const cat of [...allCats].sort()) {
  const orig = origCatDist.get(cat) ?? 0;
  const rerun = rerunCatDist.get(cat) ?? 0;
  console.log(` ${cat.padEnd(30)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Specificity Distribution (Consensus) ───`);
console.log(` ${"Level".padEnd(10)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
// Always list levels 1–4, plus any other level that actually occurred so that
// unexpected values are visible instead of silently dropped from the table.
const specLevels = [...new Set([1, 2, 3, 4, ...origSpecDist.keys(), ...rerunSpecDist.keys()])].sort((a, b) => a - b);
for (const level of specLevels) {
  const orig = origSpecDist.get(level) ?? 0;
  const rerun = rerunSpecDist.get(level) ?? 0;
  console.log(` ${String(level).padEnd(10)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Top Category Transitions ───`);
const transitions: [string, string, number][] = [];
for (const [from, tos] of catTransitions) {
  for (const [to, count] of tos) {
    transitions.push([from, to, count]);
  }
}
transitions.sort((a, b) => b[2] - a[2]);
// Only the 15 most frequent transitions are shown.
for (const [from, to, count] of transitions.slice(0, 15)) {
  console.log(` ${from} → ${to}: ${count}`);
}