SEC-cyBERT/ts/scripts/merge-annotations.ts
2026-03-29 20:33:39 -04:00

74 lines
2.3 KiB
TypeScript

/**
* Merge original Stage 1 annotations with orphan-word re-run annotations.
*
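* Annotations are matched on the (paragraphId, provenance.modelId) pair. Where a
* re-run annotation exists for a pair, it replaces the original annotation;
* all other original annotations are kept as-is. Re-run annotations that match
* no original annotation are appended at the end of the output.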
* Original stage1.jsonl is NOT modified.
*
* Usage: bun ts/scripts/merge-annotations.ts
*
* Output: data/annotations/stage1.patched.jsonl
*/
import { readFileSync, writeFileSync } from "node:fs";
import { fileURLToPath } from "node:url";
// Absolute filesystem path of the data directory, resolved relative to this
// script's own location (two directories up, then `data`).
// fileURLToPath is used instead of URL#pathname because pathname is a
// percent-encoded URL path (e.g. a space becomes "%20") and is not a valid
// filesystem path on every platform.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
// Input: original Stage 1 annotations (read only, never written).
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
// Input: annotations produced by the orphan-word re-run.
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;
// Output: merged annotations.
const OUTPUT_PATH = `${DATA_DIR}/annotations/stage1.patched.jsonl`;
/** The part of an annotation's provenance that this script reads. */
interface AnnotationProvenance {
  /** Combined with `paragraphId` to form the merge key. */
  modelId: string;
}
/**
 * Minimal view of one annotation record (one JSON object per JSONL line).
 * Only the fields needed to build the merge key are typed; every other
 * property is left opaque.
 */
interface Annotation {
  paragraphId: string;
  provenance: AnnotationProvenance;
  [key: string]: unknown;
}
// Load re-run annotations, keyed by paragraphId|modelId.
// The value is the raw JSON line (trimmed), so it can later be written out
// verbatim without re-serializing. If a key occurs more than once in the
// re-run file, the last occurrence wins.
const rerunMap = new Map<string, string>(); // key -> raw JSON line
const rerunPids = new Set<string>(); // distinct paragraph ids, only used for the log line below
// Split on \r?\n so a CRLF-terminated file does not leave "\r" on every line.
const rerunLines = readFileSync(RERUN_PATH, "utf-8").split(/\r?\n/);
for (const [index, rawLine] of rerunLines.entries()) {
  const line = rawLine.trim();
  if (!line) continue; // skip blank lines (including the one after a trailing newline)
  let ann: Annotation;
  try {
    ann = JSON.parse(line) as Annotation;
  } catch (err) {
    // Re-throw with file and line number so a corrupt record can be located.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${RERUN_PATH}:${index + 1}: invalid JSON (${reason})`);
  }
  // Without both fields the merge key would be meaningless (e.g. "undefined|...").
  if (ann == null || ann.paragraphId == null || ann.provenance?.modelId == null) {
    throw new Error(`${RERUN_PATH}:${index + 1}: missing paragraphId or provenance.modelId`);
  }
  const key = `${ann.paragraphId}|${ann.provenance.modelId}`;
  rerunMap.set(key, line);
  rerunPids.add(ann.paragraphId);
}
console.error(`Re-run annotations: ${rerunMap.size} (${rerunPids.size} paragraphs)`);
// Walk the original annotations in file order, replacing each one that has a
// re-run annotation with the same paragraphId|modelId key.
let kept = 0;
let replaced = 0;
const output: string[] = [];
// Keys of re-run annotations that replaced at least one original. Tracked
// separately (rather than deleting from rerunMap) so that if the original file
// contains the same key more than once, every occurrence is replaced instead
// of the later ones being kept as stale originals.
const usedKeys = new Set<string>();
// Split on \r?\n so a CRLF-terminated file does not leave "\r" on every line.
const origLines = readFileSync(ORIG_PATH, "utf-8").split(/\r?\n/);
for (const [index, rawLine] of origLines.entries()) {
  const line = rawLine.trim();
  if (!line) continue; // skip blank lines (including the one after a trailing newline)
  let ann: Annotation;
  try {
    ann = JSON.parse(line) as Annotation;
  } catch (err) {
    // Re-throw with file and line number so a corrupt record can be located.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${ORIG_PATH}:${index + 1}: invalid JSON (${reason})`);
  }
  // Without both fields the merge key would be meaningless (e.g. "undefined|...").
  if (ann == null || ann.paragraphId == null || ann.provenance?.modelId == null) {
    throw new Error(`${ORIG_PATH}:${index + 1}: missing paragraphId or provenance.modelId`);
  }
  const key = `${ann.paragraphId}|${ann.provenance.modelId}`;
  const rerunLine = rerunMap.get(key);
  if (rerunLine !== undefined) {
    // Trim so the emitted line never carries a stray carriage return.
    output.push(rerunLine.trim());
    usedKeys.add(key);
    replaced++;
  } else {
    output.push(line);
    kept++;
  }
}
// Any re-run annotations not matched to originals (shouldn't happen, but be safe)
let added = 0;
for (const [key, line] of rerunMap) {
  if (usedKeys.has(key)) continue;
  output.push(line.trim());
  added++;
}
// Write the merged annotations as JSONL: one record per line, with a
// terminating newline after the last record.
const serialized = output.join("\n") + "\n";
writeFileSync(OUTPUT_PATH, serialized);

// Report the merge statistics on stderr.
const summaryParts = [
  `\nMerge complete:`,
  `\n ${kept} original annotations kept`,
  `\n ${replaced} annotations replaced with re-run`,
  `\n ${added} new annotations added`,
  `\n ${output.length} total annotations`,
  `\n Output: ${OUTPUT_PATH}`,
];
console.error(summaryParts.join(""));