/**
 * Merge original Stage 1 annotations with orphan-word re-run annotations.
 *
 * For paragraphs that were re-annotated, replaces original annotations with
 * re-run annotations. For all other paragraphs, keeps original annotations.
 * Annotations are matched on the pair (paragraphId, provenance.modelId).
 * Re-run annotations that match no original are appended at the end.
 * Original stage1.jsonl is NOT modified.
 *
 * Usage: bun ts/scripts/merge-annotations.ts
 *
 * Input:  data/annotations/stage1.jsonl
 *         data/annotations/stage1-orphan-rerun.jsonl
 * Output: data/annotations/stage1.patched.jsonl
 */
import { readFileSync, writeFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// Absolute filesystem path of the repository's `data` directory, resolved
// relative to this script's own location (two levels up from the script).
//
// `fileURLToPath` is used instead of `URL.pathname` because `pathname` stays
// percent-encoded (a space in the checkout path becomes `%20`) and, on Windows,
// keeps a leading slash in front of the drive letter — neither of which is a
// valid path for `readFileSync` / `writeFileSync`.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));

// Original Stage 1 annotations (read-only input).
const ORIG_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;

// Re-run annotations that take precedence over matching originals.
const RERUN_PATH = `${DATA_DIR}/annotations/stage1-orphan-rerun.jsonl`;

// Merged result; a separate file so the original is never overwritten.
const OUTPUT_PATH = `${DATA_DIR}/annotations/stage1.patched.jsonl`;

/**
 * Minimal shape of one JSONL annotation record, as far as this script needs it.
 *
 * Only the two fields that make up the merge key are typed explicitly; the
 * script writes records back as their raw JSON line, so every other field is
 * carried through without being inspected.
 */
interface Annotation {
  /** Identifier of the annotated paragraph; first half of the merge key. */
  paragraphId: string;
  /** Provenance metadata; its `modelId` is the second half of the merge key. */
  provenance: { modelId: string };
  /** Any further fields present on the record; not read by this script. */
  [key: string]: unknown;
}

// Load re-run annotations, keyed by `paragraphId|modelId`.
//
// The raw JSON line is stored rather than the parsed object, so a replacement
// is later written out exactly as it appears in the re-run file. If the re-run
// file contains the same key more than once, the last occurrence wins
// (Map.set overwrites).
const rerunMap = new Map<string, string>(); // key -> raw JSON line
const rerunPids = new Set<string>(); // distinct paragraph ids seen in the re-run file

// Split on LF or CRLF so a Windows-edited file does not leave a trailing "\r"
// on every stored line.
for (const [index, rawLine] of readFileSync(RERUN_PATH, "utf-8").split(/\r?\n/).entries()) {
  if (!rawLine.trim()) continue; // blank line (e.g. after the final newline)

  let key: string;
  let paragraphId: string;
  try {
    const ann = JSON.parse(rawLine) as Annotation;
    key = `${ann.paragraphId}|${ann.provenance.modelId}`;
    paragraphId = ann.paragraphId;
  } catch (err) {
    // Malformed JSON or a record without `provenance` would throw anyway;
    // rethrow with the file and 1-based line number so it can be located.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${RERUN_PATH}:${index + 1}: invalid annotation line: ${reason}`);
  }

  rerunMap.set(key, rawLine);
  rerunPids.add(paragraphId);
}

// Progress is reported on stderr.
console.error(`Re-run annotations: ${rerunMap.size} (${rerunPids.size} paragraphs)`);

// Walk the original annotations in file order, substituting the re-run line
// wherever one exists for the same `paragraphId|modelId` key. Output order
// therefore follows the original file.
let kept = 0; // original lines copied through unchanged
let replaced = 0; // original lines swapped for their re-run counterpart
const output: string[] = [];

// Keys that matched at least one original line. They are removed from
// `rerunMap` only AFTER the whole pass: deleting on first match would make a
// second original line with the same key fall through to the "keep" branch and
// leave a stale annotation in the output.
const consumedKeys = new Set<string>();

// Split on LF or CRLF so kept lines do not carry a trailing "\r" into the output.
for (const [index, rawLine] of readFileSync(ORIG_PATH, "utf-8").split(/\r?\n/).entries()) {
  if (!rawLine.trim()) continue; // blank line (e.g. after the final newline)

  let key: string;
  try {
    const ann = JSON.parse(rawLine) as Annotation;
    key = `${ann.paragraphId}|${ann.provenance.modelId}`;
  } catch (err) {
    // Malformed JSON or a record without `provenance` would throw anyway;
    // rethrow with the file and 1-based line number so it can be located.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${ORIG_PATH}:${index + 1}: invalid annotation line: ${reason}`);
  }

  const replacement = rerunMap.get(key);
  if (replacement !== undefined) {
    output.push(replacement);
    consumedKeys.add(key);
    replaced++;
  } else {
    output.push(rawLine);
    kept++;
  }
}

// Mark matched entries as used: whatever remains in `rerunMap` after this is
// a re-run annotation that had no original counterpart.
for (const key of consumedKeys) {
  rerunMap.delete(key);
}

// Append any re-run annotations that were not matched to an original line.
// Entries still present in `rerunMap` at this point are the ones the merge
// pass did not consume. This is not expected to happen, but appending them
// ensures no re-run result is silently dropped.
let added = 0;
for (const unmatchedLine of rerunMap.values()) {
  output.push(unmatchedLine);
  added++;
}

// Write the merged annotations as JSONL: one record per line, newline-terminated.
// With nothing to write, emit an empty file rather than a single blank line.
writeFileSync(OUTPUT_PATH, output.length > 0 ? output.join("\n") + "\n" : "");

// Summary on stderr; kept + replaced + added always equals output.length.
console.error(
  `\nMerge complete:` +
    `\n ${kept} original annotations kept` +
    `\n ${replaced} annotations replaced with re-run` +
    `\n ${added} new annotations added` +
    `\n ${output.length} total annotations` +
    `\n Output: ${OUTPUT_PATH}`,
);