SEC-cyBERT/ts/scripts/judge-diag-batch.ts
2026-03-28 20:39:36 -04:00

115 lines
4.3 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Diagnostic: batch-test the judge model (default: GLM-5) on failing paragraph IDs.
* Runs each paragraph 2 times to measure the intermittent failure rate.
* Usage: bun ts/scripts/judge-diag-batch.ts [model-id]
*/
import { fileURLToPath } from "node:url";
import { generateText, Output } from "ai";
import { openrouter } from "../src/lib/openrouter.ts";
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";
import { LabelOutputRaw } from "../src/schemas/label.ts";
import { SYSTEM_PROMPT, buildJudgePrompt } from "../src/label/prompts.ts";
/** Model id handed to `openrouter(...)`; the first CLI argument overrides the default. */
const MODEL = process.argv[2] ?? "z-ai/glm-5";

/** Paragraph IDs to re-run through the judge (the previously failing set). */
const FAILED_IDS = [
  "25e44b58-e11a-4633-8efe-c63836862cd9",
  "282c982b-35bb-4fa9-82e3-41c748aa0c83",
  "61bcdd6b-cd6b-415e-940a-59c77d8d757a",
  "66b02dbe-e7aa-4b6e-9fb4-47542f0cd980",
  "87b4fd8c-a095-4645-8969-5071a97d84b8",
  "887bc80e-08c5-4337-9a85-1669f8cde071",
  "c0d77667-1134-4347-a84e-cf640b463d7e",
  "c34fb56c-9190-4e93-8e75-c322dbb563ae",
  "c71ebd62-0fef-4ff5-af3f-943c4d7bfdbd",
];

// Input files are resolved relative to this script. `fileURLToPath` is used
// instead of `URL#pathname` because `pathname` keeps percent-escapes (a space
// in a parent directory stays "%20") and is not a valid path on Windows.
const STAGE1_PATH = fileURLToPath(
  new URL("../../data/annotations/stage1.jsonl", import.meta.url),
);
const PARAGRAPHS_PATH = fileURLToPath(
  new URL("../../data/paragraphs/training.jsonl", import.meta.url),
);
/** Label fields carried by a stage-1 annotation. */
interface S1Label {
  content_category: string;
  specificity_level: number;
  reasoning: string;
}

/**
 * Minimal shape this script assumes for a record in the stage-1 annotations
 * file. Records are cast to this type without runtime validation.
 */
interface S1Ann {
  paragraphId: string;
  label: S1Label;
  provenance: { modelId: string };
}
console.error(`Loading data...`);

// Stage-1 annotations: read the raw records, then bucket them by paragraph id
// so each paragraph's prior annotations can be looked up in one step.
const stage1File = await readJsonlRaw(STAGE1_PATH);
const allAnns = stage1File.records;
const s1ByParagraph = new Map<string, S1Ann[]>();
for (const record of allAnns) {
  // Unvalidated cast: the record is trusted to match S1Ann.
  const ann = record as S1Ann;
  const bucket = s1ByParagraph.get(ann.paragraphId);
  if (bucket) {
    bucket.push(ann);
  } else {
    s1ByParagraph.set(ann.paragraphId, [ann]);
  }
}

// Paragraphs: parsed against the Paragraph schema and indexed by id.
const paragraphFile = await readJsonl(PARAGRAPHS_PATH, Paragraph);
const allParagraphs = paragraphFile.records;
const paragraphMap = new Map(allParagraphs.map(p => [p.id, p] as const));
/** How many times each paragraph is sent to the judge; shared by the banner and the loop. */
const ATTEMPTS = 2;

console.error(`Testing ${FAILED_IDS.length} paragraphs × ${ATTEMPTS} attempts each...\n`);

/** One entry per judge call; `modelId` is set on success, `error` on failure. */
const results: { pid: string; attempt: number; success: boolean; modelId?: string; error?: string }[] = [];

for (const pid of FAILED_IDS) {
  const paragraph = paragraphMap.get(pid);
  if (!paragraph) {
    console.error(` Skip ${pid.slice(0, 8)} — not found`);
    continue;
  }

  // Reduce the stage-1 annotations for this paragraph to the three label
  // fields passed to the judge prompt builder.
  const priorAnns = (s1ByParagraph.get(pid) ?? []).map(a => ({
    content_category: a.label.content_category,
    specificity_level: a.label.specificity_level,
    reasoning: a.label.reasoning,
  }));

  // Calls are awaited one at a time, so attempts never overlap.
  for (let attempt = 1; attempt <= ATTEMPTS; attempt++) {
    try {
      const result = await generateText({
        model: openrouter(MODEL),
        output: Output.object({ schema: LabelOutputRaw }),
        system: SYSTEM_PROMPT,
        prompt: buildJudgePrompt(paragraph, priorAnns),
        temperature: 0,
        providerOptions: {
          openrouter: {
            reasoning: { effort: "medium" },
            usage: { include: true },
          },
        },
        // Abort the request after 120 seconds.
        abortSignal: AbortSignal.timeout(120_000),
      });
      const modelId = result.response?.modelId ?? "unknown";
      // NOTE(review): stage-1 labels call this field `specificity_level`;
      // confirm LabelOutputRaw really exposes it as `specificity`.
      console.log(`${pid.slice(0, 8)} #${attempt} — model=${modelId}, cat=${result.output?.content_category}, spec=${result.output?.specificity}`);
      results.push({ pid, attempt, success: true, modelId });
    } catch (err) {
      // Keep at most the first 200 characters of the error message.
      const msg = err instanceof Error ? err.message.slice(0, 200) : String(err);
      // Separator added: previously the message was glued to the attempt number.
      console.log(`${pid.slice(0, 8)} #${attempt} — FAILED: ${msg}`);
      results.push({ pid, attempt, success: false, error: msg });
    }
  }
}
// Overall counts. Every result is either a success or a failure, so the
// failure count is derived instead of filtering a second time.
const total = results.length;
const successes = results.filter(r => r.success).length;
const failures = total - successes;

/** Whole-number percentage of `total`; "0" when nothing ran, so the summary never prints NaN. */
const formatPercent = (count: number): string =>
  total === 0 ? "0" : (count / total * 100).toFixed(0);

console.log(`\n=== SUMMARY ===`);
console.log(`Total: ${total}, Success: ${successes} (${formatPercent(successes)}%), Failed: ${failures} (${formatPercent(failures)}%)`);

// Count successful calls per model id reported in the response.
const byModel = new Map<string, number>();
for (const r of results) {
  if (!r.success) continue;
  // Successful entries always carry a modelId; the fallback only satisfies
  // the optional field type without a non-null assertion.
  const id = r.modelId ?? "unknown";
  byModel.set(id, (byModel.get(id) ?? 0) + 1);
}
console.log(`\nModel IDs seen:`, [...byModel.entries()].map(([m, n]) => `${m} (${n})`).join(", "));

// Per-paragraph success/failure tally, in first-seen order.
const byPid = new Map<string, { ok: number; fail: number }>();
for (const r of results) {
  let tally = byPid.get(r.pid);
  if (!tally) {
    tally = { ok: 0, fail: 0 };
    byPid.set(r.pid, tally);
  }
  if (r.success) tally.ok++;
  else tally.fail++;
}
console.log(`\nPer-paragraph:`);
for (const [pid, { ok, fail }] of byPid) {
  console.log(` ${pid.slice(0, 8)}: ${ok} ok, ${fail} fail`);
}