115 lines
4.3 KiB
TypeScript
115 lines
4.3 KiB
TypeScript
/**
 * Diagnostic: batch-test the judge model (GLM-5 by default) on failing paragraph IDs.
 * Runs each paragraph 2 times to measure the intermittent failure rate.
 * Usage: bun ts/scripts/judge-diag-batch.ts [model-id]
 */
|
||
import { fileURLToPath } from "node:url";

import { generateText, Output } from "ai";
import { openrouter } from "../src/lib/openrouter.ts";
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { LabelOutputRaw } from "@sec-cybert/schemas/label.ts";
import { SYSTEM_PROMPT, buildJudgePrompt } from "../src/label/prompts.ts";
|
||
|
||
/** Model id passed to `openrouter(...)`; the first CLI argument overrides the default. */
const MODEL = process.argv[2] ?? "z-ai/glm-5";

/** Paragraph IDs to re-test (the ones described as failing in the file header). */
const FAILED_IDS = [
  "25e44b58-e11a-4633-8efe-c63836862cd9",
  "282c982b-35bb-4fa9-82e3-41c748aa0c83",
  "61bcdd6b-cd6b-415e-940a-59c77d8d757a",
  "66b02dbe-e7aa-4b6e-9fb4-47542f0cd980",
  "87b4fd8c-a095-4645-8969-5071a97d84b8",
  "887bc80e-08c5-4337-9a85-1669f8cde071",
  "c0d77667-1134-4347-a84e-cf640b463d7e",
  "c34fb56c-9190-4e93-8e75-c322dbb563ae",
  "c71ebd62-0fef-4ff5-af3f-943c4d7bfdbd",
];

// Resolve data files relative to this script. `fileURLToPath` is used instead of
// `URL#pathname` because `pathname` stays percent-encoded (a checkout directory
// containing a space becomes `%20`) and is not a valid path on Windows.
/** Filesystem path of the stage-1 annotations JSONL file. */
const STAGE1_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
/** Filesystem path of the training paragraphs JSONL file. */
const PARAGRAPHS_PATH = fileURLToPath(new URL("../../data/paragraphs/training.jsonl", import.meta.url));
|
||
|
||
/**
 * The subset of a stage-1 annotation record that this script reads.
 * Records are cast to this shape without runtime validation.
 */
interface S1Ann {
  /** ID of the paragraph the annotation belongs to. */
  paragraphId: string;
  /** The label fields that are forwarded to the judge prompt. */
  label: {
    content_category: string;
    specificity_level: number;
    reasoning: string;
  };
  /** Which model produced the annotation. */
  provenance: {
    modelId: string;
  };
}
|
||
|
||
console.error(`Loading data...`);

// Group every stage-1 annotation under its paragraph id so the judge prompt
// can be given all prior annotations for a paragraph in one lookup.
const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
const s1ByParagraph = new Map<string, S1Ann[]>();
for (const record of allAnns) {
  // readJsonlRaw is called without a schema here, so this cast is unchecked.
  const ann = record as S1Ann;
  const bucket = s1ByParagraph.get(ann.paragraphId);
  if (bucket) {
    bucket.push(ann);
  } else {
    s1ByParagraph.set(ann.paragraphId, [ann]);
  }
}

// Paragraphs are parsed against the Paragraph schema and indexed by id.
const { records: allParagraphs } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
const paragraphMap = new Map(allParagraphs.map(paragraph => [paragraph.id, paragraph]));
|
||
|
||
// How many times each paragraph is sent to the judge. Kept in one place so the
// progress message and the loop bound cannot drift apart.
const ATTEMPTS_PER_PARAGRAPH = 2;

console.error(`Testing ${FAILED_IDS.length} paragraphs × ${ATTEMPTS_PER_PARAGRAPH} attempts each...\n`);

// One entry per (paragraph, attempt); consumed by the summary that follows.
const results: { pid: string; attempt: number; success: boolean; modelId?: string; error?: string }[] = [];

for (const pid of FAILED_IDS) {
  const paragraph = paragraphMap.get(pid);
  if (!paragraph) {
    console.error(` Skip ${pid.slice(0, 8)} — not found`);
    continue;
  }

  // Only the label fields of the prior stage-1 annotations go into the prompt.
  const priorAnns = (s1ByParagraph.get(pid) ?? []).map(a => ({
    content_category: a.label.content_category,
    specificity_level: a.label.specificity_level,
    reasoning: a.label.reasoning,
  }));

  // Attempts run sequentially; each one is an independent request.
  for (let attempt = 1; attempt <= ATTEMPTS_PER_PARAGRAPH; attempt++) {
    try {
      const result = await generateText({
        model: openrouter(MODEL),
        output: Output.object({ schema: LabelOutputRaw }),
        system: SYSTEM_PROMPT,
        prompt: buildJudgePrompt(paragraph, priorAnns),
        temperature: 0,
        providerOptions: {
          openrouter: {
            reasoning: { effort: "medium" },
            usage: { include: true },
          },
        },
        // Give up on a single request after two minutes.
        abortSignal: AbortSignal.timeout(120_000),
      });

      const modelId = result.response?.modelId ?? "unknown";
      // NOTE(review): prior annotations name this field `specificity_level`; confirm that
      // LabelOutputRaw really exposes `specificity`, otherwise this prints `undefined`.
      console.log(` ✓ ${pid.slice(0, 8)} #${attempt} — model=${modelId}, cat=${result.output?.content_category}, spec=${result.output?.specificity}`);
      results.push({ pid, attempt, success: true, modelId });
    } catch (err) {
      // Truncate whatever was thrown (Error or not) so one failure stays on one short line.
      const msg = (err instanceof Error ? err.message : String(err)).slice(0, 200);
      console.log(` ✗ ${pid.slice(0, 8)} #${attempt} — ${msg}`);
      results.push({ pid, attempt, success: false, error: msg });
    }
  }
}
|
||
|
||
// ---- Overall totals ----
const total = results.length;
const successes = results.filter(r => r.success).length;
const failures = total - successes;

// Percentage of `total`, as a whole number. Returns "0" when nothing ran (for
// example every paragraph was skipped) instead of printing "NaN".
const pct = (count: number): string =>
  total === 0 ? "0" : (count / total * 100).toFixed(0);

console.log(`\n=== SUMMARY ===`);
console.log(`Total: ${total}, Success: ${successes} (${pct(successes)}%), Failed: ${failures} (${pct(failures)}%)`);

// Group by modelId: how many successful attempts were served by each model id.
const byModel = new Map<string, number>();
for (const r of results) {
  if (!r.success) continue;
  // Successful entries always carry a modelId; the fallback only keeps the types honest.
  const id = r.modelId ?? "unknown";
  byModel.set(id, (byModel.get(id) ?? 0) + 1);
}
console.log(`\nModel IDs seen:`, [...byModel.entries()].map(([m, n]) => `${m} (${n})`).join(", "));

// Per-paragraph failure rate
const byPid = new Map<string, { ok: number; fail: number }>();
for (const r of results) {
  let tally = byPid.get(r.pid);
  if (!tally) {
    tally = { ok: 0, fail: 0 };
    byPid.set(r.pid, tally);
  }
  if (r.success) {
    tally.ok++;
  } else {
    tally.fail++;
  }
}
console.log(`\nPer-paragraph:`);
for (const [pid, { ok, fail }] of byPid) {
  console.log(` ${pid.slice(0, 8)}: ${ok} ok, ${fail} fail`);
}
|