/**
 * Diagnostic: batch-test GLM-5 (or the model given on the command line) on
 * failing paragraph IDs.
 * Runs each paragraph ATTEMPTS_PER_PARAGRAPH (currently 2) times to measure
 * the intermittent failure rate.
 *
 * Progress messages go to stderr; per-attempt results and the summary go to
 * stdout.
 *
 * Usage: bun ts/scripts/judge-diag-batch.ts [model-id]
 */
import { fileURLToPath } from "node:url";
import { generateText, Output } from "ai";
import { openrouter } from "../src/lib/openrouter.ts";
import { readJsonl, readJsonlRaw } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { LabelOutputRaw } from "@sec-cybert/schemas/label.ts";
import { SYSTEM_PROMPT, buildJudgePrompt } from "../src/label/prompts.ts";

/** Model under test; overridable via the first CLI argument. */
const MODEL = process.argv[2] ?? "z-ai/glm-5";

/** Number of judge calls issued per paragraph. */
const ATTEMPTS_PER_PARAGRAPH = 2;

/** Per-request timeout for a single judge call, in milliseconds. */
const REQUEST_TIMEOUT_MS = 120_000;

/** Paragraph IDs to re-run. */
const FAILED_IDS = [
  "25e44b58-e11a-4633-8efe-c63836862cd9",
  "282c982b-35bb-4fa9-82e3-41c748aa0c83",
  "61bcdd6b-cd6b-415e-940a-59c77d8d757a",
  "66b02dbe-e7aa-4b6e-9fb4-47542f0cd980",
  "87b4fd8c-a095-4645-8969-5071a97d84b8",
  "887bc80e-08c5-4337-9a85-1669f8cde071",
  "c0d77667-1134-4347-a84e-cf640b463d7e",
  "c34fb56c-9190-4e93-8e75-c322dbb563ae",
  "c71ebd62-0fef-4ff5-af3f-943c4d7bfdbd",
];

// fileURLToPath (rather than URL#pathname) decodes percent-escapes and yields
// a platform-native path, so the script also works when the checkout path
// contains spaces or non-ASCII characters.
const STAGE1_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
const PARAGRAPHS_PATH = fileURLToPath(new URL("../../data/paragraphs/training.jsonl", import.meta.url));

/** Shape of the stage-1 annotation fields this script reads. */
interface S1Ann {
  paragraphId: string;
  label: { content_category: string; specificity_level: number; reasoning: string };
  provenance: { modelId: string };
}

/** Outcome of one judge call; `success` discriminates which extra field is present. */
type AttemptResult =
  | { pid: string; attempt: number; success: true; modelId: string }
  | { pid: string; attempt: number; success: false; error: string };

console.error(`Loading data...`);

// Group stage-1 annotations by the paragraph they belong to.
const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
const s1ByParagraph = new Map<string, S1Ann[]>();
for (const raw of allAnns) {
  // NOTE(review): raw records are trusted to match S1Ann; nothing validates them here.
  const ann = raw as S1Ann;
  let group = s1ByParagraph.get(ann.paragraphId);
  if (!group) {
    group = [];
    s1ByParagraph.set(ann.paragraphId, group);
  }
  group.push(ann);
}

const { records: allParagraphs } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
const paragraphMap = new Map(allParagraphs.map(p => [p.id, p]));

console.error(`Testing ${FAILED_IDS.length} paragraphs × ${ATTEMPTS_PER_PARAGRAPH} attempts each...\n`);

const results: AttemptResult[] = [];

for (const pid of FAILED_IDS) {
  const paragraph = paragraphMap.get(pid);
  if (!paragraph) {
    console.error(` Skip ${pid.slice(0, 8)} — not found`);
    continue;
  }

  // Only the label fields are forwarded to the judge prompt.
  const priorAnns = (s1ByParagraph.get(pid) ?? []).map(a => ({
    content_category: a.label.content_category,
    specificity_level: a.label.specificity_level,
    reasoning: a.label.reasoning,
  }));

  // Attempts run sequentially so each call is an independent sample.
  for (let attempt = 1; attempt <= ATTEMPTS_PER_PARAGRAPH; attempt++) {
    try {
      const result = await generateText({
        model: openrouter(MODEL),
        output: Output.object({ schema: LabelOutputRaw }),
        system: SYSTEM_PROMPT,
        prompt: buildJudgePrompt(paragraph, priorAnns),
        temperature: 0,
        providerOptions: {
          openrouter: {
            reasoning: { effort: "medium" },
            usage: { include: true },
          },
        },
        abortSignal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
      });
      const modelId = result.response?.modelId ?? "unknown";
      // NOTE(review): stage-1 labels use `specificity_level`; confirm that
      // `specificity` is the correct field name on LabelOutputRaw.
      console.log(` ✓ ${pid.slice(0, 8)} #${attempt} — model=${modelId}, cat=${result.output?.content_category}, spec=${result.output?.specificity}`);
      results.push({ pid, attempt, success: true, modelId });
    } catch (err) {
      // Truncate so one oversized provider error does not flood the log.
      const msg = err instanceof Error ? err.message.slice(0, 200) : String(err);
      console.log(` ✗ ${pid.slice(0, 8)} #${attempt} — ${msg}`);
      results.push({ pid, attempt, success: false, error: msg });
    }
  }
}

const total = results.length;
const successes = results.filter(r => r.success).length;
const failures = total - successes;

/** Whole-number percentage of `count` over all attempts; "0" when nothing ran (avoids NaN). */
const pct = (count: number): string => (total === 0 ? "0" : (count / total * 100).toFixed(0));

console.log(`\n=== SUMMARY ===`);
console.log(`Total: ${total}, Success: ${successes} (${pct(successes)}%), Failed: ${failures} (${pct(failures)}%)`);

// Group by modelId
const byModel = new Map<string, number>();
for (const r of results) {
  if (r.success) byModel.set(r.modelId, (byModel.get(r.modelId) ?? 0) + 1);
}
console.log(`\nModel IDs seen:`, [...byModel.entries()].map(([m, n]) => `${m} (${n})`).join(", "));

// Per-paragraph failure rate
const byPid = new Map<string, { ok: number; fail: number }>();
for (const r of results) {
  let bucket = byPid.get(r.pid);
  if (!bucket) {
    bucket = { ok: 0, fail: 0 };
    byPid.set(r.pid, bucket);
  }
  if (r.success) bucket.ok++;
  else bucket.fail++;
}
console.log(`\nPer-paragraph:`);
for (const [pid, { ok, fail }] of byPid) {
  console.log(` ${pid.slice(0, 8)}: ${ok} ok, ${fail} fail`);
}