/**
 * Merge original Stage 1 annotations with orphan-word re-run annotations.
 *
 * Records are matched on `paragraphId|modelId`. An original annotation whose
 * key also appears in the re-run file is replaced by the re-run line; every
 * other original annotation is kept as-is. Re-run annotations that match no
 * original are appended at the end of the output.
 * Original stage1.jsonl is NOT modified.
 *
 * Usage: bun ts/scripts/merge-annotations.ts
 *
 * Output: data/annotations/stage1.patched.jsonl
 */
import { readFileSync, writeFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// fileURLToPath (instead of URL#pathname) yields a decoded, platform-native
// path, so checkouts under directories containing spaces or non-ASCII
// characters still resolve.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;
const OUTPUT_PATH = `${DATA_DIR}/annotations/stage1.patched.jsonl`;

interface Annotation {
  paragraphId: string;
  provenance: { modelId: string };
  [key: string]: unknown;
}

/** One non-blank JSONL line together with the fields needed for matching. */
interface AnnotationLine {
  /** Lookup key in the form `paragraphId|modelId`. */
  key: string;
  paragraphId: string;
  /** The line text as read (without its line terminator), written back unchanged. */
  raw: string;
}

/** Runtime check that a parsed JSON value carries the two fields the merge key is built from. */
function isAnnotation(value: unknown): value is Annotation {
  if (typeof value !== "object" || value === null) return false;
  const candidate = value as { paragraphId?: unknown; provenance?: unknown };
  if (typeof candidate.paragraphId !== "string") return false;
  const provenance = candidate.provenance;
  return (
    typeof provenance === "object" &&
    provenance !== null &&
    typeof (provenance as { modelId?: unknown }).modelId === "string"
  );
}

/**
 * Read a JSONL annotation file and return its non-blank lines with their merge keys.
 *
 * @param path - File to read (UTF-8).
 * @returns One entry per non-blank line, in file order.
 * @throws Error naming the file and 1-based line number when a line is not
 *   valid JSON or lacks a string `paragraphId` / `provenance.modelId`.
 */
function readAnnotationLines(path: string): AnnotationLine[] {
  const records: AnnotationLine[] = [];
  // Accept both LF and CRLF so a stray "\r" never ends up in the output lines.
  const lines = readFileSync(path, "utf-8").split(/\r?\n/);

  for (const [index, raw] of lines.entries()) {
    if (!raw.trim()) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(raw);
    } catch (error) {
      const reason = error instanceof Error ? error.message : String(error);
      throw new Error(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
    if (!isAnnotation(parsed)) {
      throw new Error(
        `${path}:${index + 1}: annotation is missing a string paragraphId or provenance.modelId`,
      );
    }

    records.push({
      key: `${parsed.paragraphId}|${parsed.provenance.modelId}`,
      paragraphId: parsed.paragraphId,
      raw,
    });
  }
  return records;
}

// Load re-run annotations, keyed by paragraphId|modelId.
// If the re-run file repeats a key, the last occurrence wins.
const rerunMap = new Map<string, string>(); // key -> raw JSON line
const rerunPids = new Set<string>();
for (const { key, paragraphId, raw } of readAnnotationLines(RERUN_PATH)) {
  rerunMap.set(key, raw);
  rerunPids.add(paragraphId);
}

console.error(`Re-run annotations: ${rerunMap.size} (${rerunPids.size} paragraphs)`);

// Walk the original file in order, replacing where a re-run exists.
let kept = 0;
let replaced = 0;
const output: string[] = [];
for (const { key, raw } of readAnnotationLines(ORIG_PATH)) {
  const rerunLine = rerunMap.get(key);
  if (rerunLine !== undefined) {
    output.push(rerunLine);
    // Consume the entry so that whatever is left afterwards is exactly the set
    // of unmatched re-runs. NOTE: a later original line with the same key is
    // therefore kept unchanged rather than replaced again.
    rerunMap.delete(key);
    replaced++;
  } else {
    output.push(raw);
    kept++;
  }
}

// Any re-run annotations not matched to originals (shouldn't happen, but be safe)
let added = 0;
for (const line of rerunMap.values()) {
  output.push(line);
  added++;
}

writeFileSync(OUTPUT_PATH, output.join("\n") + "\n");

console.error(
  `\nMerge complete:` +
    `\n ${kept} original annotations kept` +
    `\n ${replaced} annotations replaced with re-run` +
    `\n ${added} new annotations added` +
    `\n ${output.length} total annotations` +
    `\n Output: ${OUTPUT_PATH}`,
);