// SEC-cyBERT/ts/scripts/diff-orphan-annotations.ts

/**
 * Diff original vs re-run annotations for orphan-word paragraphs.
 *
 * Compares stage1.jsonl (original) against stage1-orphan-rerun.jsonl (patched text)
 * to measure label changes, bias correction, and conflict resolution.
 *
 * Usage: bun ts/scripts/diff-orphan-annotations.ts
 */
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// Input locations, resolved relative to this script file rather than the
// current working directory.
//
// fileURLToPath is used instead of URL.pathname because pathname stays
// percent-encoded (a space becomes "%20") and keeps a leading slash before
// Windows drive letters, both of which produce paths that do not exist on disk.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
// Original stage-1 annotations.
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
// Annotations re-run against the patched paragraph text.
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;
// One JSON record per patched paragraph; only each record's `id` is read here.
const PATCHES_PATH = `${DATA_DIR}/paragraphs/patches/orphan-word-patches.jsonl`;

/**
 * Label payload of a single annotation.
 *
 * This script reads only `content_category` and `specificity_level`; the two
 * confidence fields are part of the record shape but are not consulted here.
 */
interface AnnotationLabel {
  content_category: string;
  specificity_level: number;
  category_confidence: string;
  specificity_confidence: string;
}

/** Origin of an annotation; `modelId` is used to pair original and re-run records. */
interface AnnotationProvenance {
  modelId: string;
}

/** One parsed line of an annotation JSONL file, keyed to a paragraph by `paragraphId`. */
interface Annotation {
  paragraphId: string;
  label: AnnotationLabel;
  provenance: AnnotationProvenance;
}

/**
 * Read a JSONL annotation file and group its records by paragraph.
 *
 * Blank (whitespace-only) lines are skipped. Records are appended to their
 * paragraph's list in file order, and map keys follow first-appearance order.
 * Only `paragraphId` is checked; the rest of each record is trusted to match
 * the `Annotation` shape.
 *
 * @param path - Path of the UTF-8 JSONL file to read.
 * @returns Map from `paragraphId` to every annotation recorded for that paragraph.
 * @throws SyntaxError if a line is not valid JSON (message includes `path:line`).
 * @throws Error if a record has no string `paragraphId` (message includes `path:line`).
 */
function loadAnnotations(path: string): Map<string, Annotation[]> {
  const map = new Map<string, Annotation[]>();
  const lines = readFileSync(path, "utf-8").split("\n");
  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;
    // 1-based line number so the location matches what an editor shows.
    const where = `${path}:${index + 1}`;

    let parsed: unknown;
    try {
      parsed = JSON.parse(line);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      throw new SyntaxError(`${where}: invalid JSON (${reason})`);
    }

    // Without this check a record lacking an id would be grouped under the
    // key `undefined` and silently drop out of every later comparison.
    const key = (parsed as { paragraphId?: unknown } | null)?.paragraphId;
    if (typeof key !== "string") {
      throw new Error(`${where}: annotation has no string paragraphId`);
    }

    const ann = parsed as Annotation;
    const group = map.get(key);
    if (group) group.push(ann);
    else map.set(key, [ann]);
  }
  return map;
}

/**
 * Find the most frequent value of one label field across a set of annotations.
 *
 * The winner is the plurality value, which is not necessarily a true majority;
 * callers that need one must inspect `count`. Ties are broken in favour of the
 * smallest value (numeric order for numbers, code-unit order for strings) so the
 * result does not depend on the order in which annotations appear in the input.
 *
 * @param annotations - Annotations to tally (typically all those for one paragraph).
 * @param field - Which label field to tally.
 * @returns `value`: the winning value ("" when `annotations` is empty);
 *   `count`: how many annotations carry it;
 *   `unanimous`: true only when there is at least one annotation and all share `value`.
 */
function majorityVote(annotations: Annotation[], field: "content_category" | "specificity_level"): { value: string | number; unanimous: boolean; count: number } {
  const counts = new Map<string | number, number>();
  for (const ann of annotations) {
    const v = ann.label[field];
    counts.set(v, (counts.get(v) ?? 0) + 1);
  }

  // Ordering used only to break ties between equally frequent values.
  const precedes = (a: string | number, b: string | number): boolean =>
    typeof a === "number" && typeof b === "number" ? a < b : String(a) < String(b);

  let best: string | number = "";
  let bestCount = 0;
  for (const [v, c] of counts) {
    if (c > bestCount || (c === bestCount && precedes(v, best))) {
      best = v;
      bestCount = c;
    }
  }

  return {
    value: best,
    // An empty input has no votes, so it cannot be unanimous.
    unanimous: annotations.length > 0 && bestCount === annotations.length,
    count: bestCount,
  };
}

// ── Main ────────────────────────────────────────────────────────────────
// Collect the id of every patched paragraph; the original annotations are
// restricted to these ids below.
const patchIds = new Set<string>();
for (const raw of readFileSync(PATCHES_PATH, "utf-8").split("\n")) {
  if (raw.trim() === "") continue;
  const { id } = JSON.parse(raw) as { id: string };
  patchIds.add(id);
}

const origAll = loadAnnotations(ORIG_PATH);
const rerunAll = loadAnnotations(RERUN_PATH);

// Original annotations, narrowed to paragraphs that were patched.
const origFiltered = new Map<string, Annotation[]>(
  [...origAll].filter(([pid]) => patchIds.has(pid)),
);

console.error(`Orphan-word paragraphs: ${patchIds.size}`);
console.error(`Original annotations found: ${origFiltered.size} paragraphs`);
console.error(`Re-run annotations found: ${rerunAll.size} paragraphs`);

// Only paragraphs present on both sides can be diffed.
const comparable: string[] = [];
for (const pid of rerunAll.keys()) {
  if (origFiltered.has(pid)) comparable.push(pid);
}
console.error(`Comparable paragraphs: ${comparable.length}\n`);

// Paragraph-level counters: how often the consensus label moved.
let catChanged = 0;
let specChanged = 0;
let eitherChanged = 0;

// Model-level counters: how often each model changed its own label.
const perModelCatChanges = new Map<string, number>();
const perModelSpecChanges = new Map<string, number>();

// from-category → to-category → number of per-model label changes.
const catTransitions = new Map<string, Map<string, number>>();

// Conflict bookkeeping, before and after the re-run.
let origConflicts = 0;
let rerunConflicts = 0;
let conflictsResolved = 0;
let consensusBroken = 0;

// Consensus label histograms, before and after the re-run.
const origCatDist = new Map<string, number>();
const rerunCatDist = new Map<string, number>();
const origSpecDist = new Map<number, number>();
const rerunSpecDist = new Map<number, number>();

/** Add one to the counter stored under `key`, starting from zero if absent. */
const tally = <K>(counter: Map<K, number>, key: K): void => {
  counter.set(key, (counter.get(key) ?? 0) + 1);
};

for (const pid of comparable) {
  const origAnns = origFiltered.get(pid)!;
  const rerunAnns = rerunAll.get(pid)!;

  // Pair each re-run annotation with the original from the same model;
  // models with no original counterpart are skipped.
  for (const after of rerunAnns) {
    const modelId = after.provenance.modelId;
    const before = origAnns.find((a) => a.provenance.modelId === modelId);
    if (before === undefined) continue;

    const from = before.label.content_category;
    const to = after.label.content_category;
    if (from !== to) {
      tally(perModelCatChanges, modelId);

      let row = catTransitions.get(from);
      if (row === undefined) {
        row = new Map<string, number>();
        catTransitions.set(from, row);
      }
      tally(row, to);
    }

    if (before.label.specificity_level !== after.label.specificity_level) {
      tally(perModelSpecChanges, modelId);
    }
  }

  // Consensus label per dimension, before and after.
  const catBefore = majorityVote(origAnns, "content_category");
  const catAfter = majorityVote(rerunAnns, "content_category");
  const specBefore = majorityVote(origAnns, "specificity_level");
  const specAfter = majorityVote(rerunAnns, "specificity_level");

  tally(origCatDist, catBefore.value as string);
  tally(rerunCatDist, catAfter.value as string);
  tally(origSpecDist, specBefore.value as number);
  tally(rerunSpecDist, specAfter.value as number);

  const catDiffers = catBefore.value !== catAfter.value;
  const specDiffers = specBefore.value !== specAfter.value;
  if (catDiffers) catChanged += 1;
  if (specDiffers) specChanged += 1;
  if (catDiffers || specDiffers) eitherChanged += 1;

  // A side is in conflict unless both dimensions have at least two agreeing votes.
  const origHasConflict = !(catBefore.count >= 2 && specBefore.count >= 2);
  const rerunHasConflict = !(catAfter.count >= 2 && specAfter.count >= 2);

  if (origHasConflict) {
    origConflicts += 1;
    if (!rerunHasConflict) conflictsResolved += 1;
  } else if (rerunHasConflict) {
    consensusBroken += 1;
  }
  if (rerunHasConflict) rerunConflicts += 1;
}

// ── Report ──────────────────────────────────────────────────────────────
// Prints the diff summary via console.log, using the counters and histograms
// accumulated above.

/** Format `count / total` as a one-decimal percentage; an empty total yields "0.0" rather than "NaN". */
const pct = (count: number, total: number): string =>
  total === 0 ? "0.0" : ((count / total) * 100).toFixed(1);

/** Render a delta with an explicit "+" for increases; zero and decreases print unchanged. */
const signed = (delta: number): string => (delta > 0 ? `+${delta}` : String(delta));

/** Print one per-model section, most-changed model first. */
const printPerModelChanges = (title: string, changes: Map<string, number>): void => {
  console.log(`\n─── ${title} ───`);
  for (const [model, count] of [...changes.entries()].sort((a, b) => b[1] - a[1])) {
    // Show only the segment after the first "/" when the id has one.
    const short = model.split("/")[1] ?? model;
    console.log(`  ${short}: ${count} (${pct(count, comparable.length)}%)`);
  }
};

console.log("═══ ORPHAN WORD RE-ANNOTATION DIFF REPORT ═══\n");

console.log(`Paragraphs compared: ${comparable.length}`);
console.log(`  Category consensus changed: ${catChanged} (${pct(catChanged, comparable.length)}%)`);
console.log(`  Specificity consensus changed: ${specChanged} (${pct(specChanged, comparable.length)}%)`);
console.log(`  Either dimension changed: ${eitherChanged} (${pct(eitherChanged, comparable.length)}%)`);

printPerModelChanges("Per-Model Category Changes", perModelCatChanges);
printPerModelChanges("Per-Model Specificity Changes", perModelSpecChanges);

console.log(`\n─── Conflict Resolution ───`);
console.log(`  Original conflicts: ${origConflicts}`);
console.log(`  Re-run conflicts: ${rerunConflicts}`);
console.log(`  Conflicts resolved (orig conflict → rerun consensus): ${conflictsResolved}`);
console.log(`  Consensus broken (orig consensus → rerun conflict): ${consensusBroken}`);
// Net change in conflict count: negative means fewer conflicts after the re-run.
// No change prints as "0", matching the Delta columns below.
console.log(`  Net conflict change: ${signed(consensusBroken - conflictsResolved)}`);

console.log(`\n─── Category Distribution (Consensus) ───`);
console.log(`  ${"Category".padEnd(30)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
const allCats = new Set([...origCatDist.keys(), ...rerunCatDist.keys()]);
for (const cat of [...allCats].sort()) {
  const orig = origCatDist.get(cat) ?? 0;
  const rerun = rerunCatDist.get(cat) ?? 0;
  console.log(`  ${cat.padEnd(30)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Specificity Distribution (Consensus) ───`);
console.log(`  ${"Level".padEnd(10)} ${"Original".padStart(8)} ${"Re-run".padStart(8)} ${"Delta".padStart(8)}`);
// Always list levels 1–4, plus any other level that actually occurred, so the
// columns account for every compared paragraph.
const specLevels = [...new Set([1, 2, 3, 4, ...origSpecDist.keys(), ...rerunSpecDist.keys()])].sort((a, b) => a - b);
for (const level of specLevels) {
  const orig = origSpecDist.get(level) ?? 0;
  const rerun = rerunSpecDist.get(level) ?? 0;
  console.log(`  ${String(level).padEnd(10)} ${String(orig).padStart(8)} ${String(rerun).padStart(8)} ${signed(rerun - orig).padStart(8)}`);
}

console.log(`\n─── Top Category Transitions ───`);
// Flatten the nested transition map into [from, to, count] rows; these counts
// come from per-model label changes, not from consensus changes.
const transitions: [string, string, number][] = [];
for (const [from, tos] of catTransitions) {
  for (const [to, count] of tos) {
    transitions.push([from, to, count]);
  }
}
transitions.sort((a, b) => b[2] - a[2]);
for (const [from, to, count] of transitions.slice(0, 15)) {
  console.log(`  ${from} → ${to}: ${count}`);
}