/**
 * Run mimo-v2-flash on the same 500-sample pilot set used for prompt iteration.
 * Compares against existing Stage 1 annotations to assess agreement.
 *
 * Usage: bun ts/scripts/mimo-pilot.ts
 */
import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
import { PROMPT_VERSION } from "../src/label/prompts.ts";
import { v4 as uuidv4 } from "uuid";
import { existsSync } from "node:fs";
import { fileURLToPath } from "node:url";
import pLimit from "p-limit";

// Resolve data files relative to this script. fileURLToPath (rather than
// URL#pathname) is used so that percent-encoded characters in the checkout
// path (spaces, non-ASCII) are decoded into a real filesystem path.
const PILOT_SAMPLE = fileURLToPath(new URL("../../data/pilot/pilot-sample-v2.5.jsonl", import.meta.url));
const STAGE1_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
const OUTPUT_PATH = fileURLToPath(new URL("../../data/pilot/pilot-mimo-flash.jsonl", import.meta.url));

const MODEL = "xiaomi/mimo-v2-flash";
const CONCURRENCY = 15;

/** The two label fields this script compares between annotators. */
interface Label {
  content_category: string;
  specificity_level: number;
}

/** The subset of a Stage 1 annotation record that this script reads. */
interface S1Ann {
  paragraphId: string;
  label: Label;
  provenance: { modelId: string };
}

/** One paragraph's mimo label together with its three Stage 1 annotations. */
interface PanelRow {
  mimo: Label;
  gemini: S1Ann;
  nano: S1Ann;
  grok: S1Ann;
}

/**
 * Format `n / total` as a percentage with one decimal place.
 *
 * @returns e.g. `"12.5%"`, or `"n/a"` when `total` is 0 (instead of `"NaN%"`).
 */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}

/**
 * Format the change from `oldCount` to `newCount`, as a share of `total`,
 * in percentage points (one decimal, left-padded to width 5, `pp` suffix).
 *
 * @returns e.g. `"  1.2pp"`, or `"n/a"` when `total` is 0.
 */
function deltaPp(newCount: number, oldCount: number, total: number): string {
  if (total === 0) return "n/a";
  return `${(((newCount - oldCount) / total) * 100).toFixed(1).padStart(5)}pp`;
}

/**
 * Return the first value (in first-occurrence order) that appears at least
 * twice in `values`, or `undefined` when no value repeats.
 */
function majorityOf<T>(values: readonly T[]): T | undefined {
  const freq = new Map<T, number>();
  for (const v of values) freq.set(v, (freq.get(v) ?? 0) + 1);
  for (const [value, count] of freq) {
    if (count >= 2) return value;
  }
  return undefined;
}

/**
 * Pick the gemini / nano / grok annotations out of `s1anns` by substring
 * match on `provenance.modelId`.
 *
 * @returns the assembled row, or `undefined` if any of the three is missing
 *          (e.g. the annotations are duplicates of one model).
 */
function buildPanelRow(mimo: Label, s1anns: readonly S1Ann[]): PanelRow | undefined {
  const byFamily = (family: string): S1Ann | undefined =>
    s1anns.find(a => a.provenance.modelId.includes(family));
  const gemini = byFamily("gemini");
  const nano = byFamily("nano");
  const grok = byFamily("grok");
  if (!gemini || !nano || !grok) return undefined;
  return { mimo, gemini, nano, grok };
}

/**
 * Annotate every not-yet-annotated pilot paragraph with MODEL, appending each
 * result to OUTPUT_PATH, then print agreement statistics between the mimo
 * labels and the Stage 1 annotations to stdout. Progress and errors go to
 * stderr. Individual annotation failures are logged and counted, not thrown.
 */
async function main(): Promise<void> {
  // Load pilot sample paragraphs
  console.error("Loading pilot sample paragraphs...");
  const { records: paragraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
  console.error(` ${paragraphs.length} paragraphs`);
  const pilotIds = new Set(paragraphs.map(p => p.id));

  // Load Stage 1 annotations for these paragraphs
  console.error("Loading Stage 1 annotations for comparison...");
  const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
  const s1ByParagraph = new Map<string, S1Ann[]>();
  for (const raw of allAnns) {
    // NOTE(review): records are cast, not validated — assumes stage1.jsonl rows match S1Ann.
    const a = raw as S1Ann;
    if (!pilotIds.has(a.paragraphId)) continue;
    const bucket = s1ByParagraph.get(a.paragraphId);
    if (bucket) bucket.push(a);
    else s1ByParagraph.set(a.paragraphId, [a]);
  }
  console.error(` ${s1ByParagraph.size} paragraphs with Stage 1 data`);

  // Resume support: paragraphs already present in the output file are skipped.
  const doneKeys = new Set<string>();
  if (existsSync(OUTPUT_PATH)) {
    const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of existing) {
      const a = r as { paragraphId?: string };
      if (a.paragraphId) doneKeys.add(a.paragraphId);
    }
    if (doneKeys.size > 0) console.error(` Resuming: ${doneKeys.size} already done`);
  }

  const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
  console.error(` Running ${remaining.length} annotations...\n`);

  // Run mimo on remaining paragraphs
  const runId = uuidv4();
  const limit = pLimit(CONCURRENCY);
  let completed = 0, failed = 0, totalCost = 0;
  const startTime = Date.now();

  const tasks = remaining.map(p => limit(async () => {
    const opts: AnnotateOpts = {
      modelId: MODEL,
      stage: "benchmark",
      runId,
      promptVersion: PROMPT_VERSION,
      reasoningEffort: "low",
    };
    try {
      const ann = await annotateParagraph(p, opts);
      await appendJsonl(OUTPUT_PATH, ann);
      totalCost += ann.provenance.costUsd;
      completed++;
      // Progress line every 50 completions, rewritten in place via "\r".
      if (completed % 50 === 0) {
        const elapsed = (Date.now() - startTime) / 1000;
        process.stderr.write(`\r ${completed}/${remaining.length} (${(completed / elapsed).toFixed(1)}/s, $${totalCost.toFixed(2)}) `);
      }
    } catch (err) {
      // A failed paragraph is reported and counted; the rest of the run continues.
      failed++;
      console.error(`\n ✖ ${p.id.slice(0, 8)}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }));

  await Promise.all(tasks);
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
  console.error(`\n\n Done: ${completed} completed, ${failed} failed, $${totalCost.toFixed(2)}, ${elapsed}s\n`);

  // ── Analysis ─────────────────────────────────────────────────────────
  // Load all mimo results (including resumed)
  const { records: mimoRaw } = await readJsonlRaw(OUTPUT_PATH);
  const mimoByParagraph = new Map<string, Label>();
  for (const r of mimoRaw) {
    const a = r as { paragraphId: string; label: Label };
    // If a paragraph appears more than once, the last record in the file wins.
    mimoByParagraph.set(a.paragraphId, a.label);
  }

  const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
  // Text after the last "/" (the whole id when there is no "/").
  const shortName = (m: string): string => m.slice(m.lastIndexOf("/") + 1);

  console.log("═══════════════════════════════════════════════════════════");
  console.log(" MIMO-V2-FLASH PILOT COMPARISON (n=" + mimoByParagraph.size + ")");
  console.log("═══════════════════════════════════════════════════════════\n");

  // Pairwise agreement: mimo vs each Stage 1 model
  console.log("── Pairwise Agreement (mimo vs Stage 1 models) ─────────────");
  for (const model of s1Models) {
    let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
    for (const [pid, mimoLabel] of mimoByParagraph) {
      const s1ann = s1ByParagraph.get(pid)?.find(a => a.provenance.modelId === model);
      if (!s1ann) continue;
      total++;
      const sameCat = s1ann.label.content_category === mimoLabel.content_category;
      const sameSpec = s1ann.label.specificity_level === mimoLabel.specificity_level;
      if (sameCat) catAgree++;
      if (sameSpec) specAgree++;
      if (sameCat && sameSpec) bothAgree++;
    }
    console.log(`\n mimo × ${shortName(model)} (n=${total}):`);
    console.log(` Category: ${pct(catAgree, total)} (${catAgree})`);
    console.log(` Specificity: ${pct(specAgree, total)} (${specAgree})`);
    console.log(` Both: ${pct(bothAgree, total)} (${bothAgree})`);
  }

  // Agreement with majority vote (only paragraphs with exactly 3 Stage 1 annotations)
  console.log("\n── Agreement with Stage 1 Majority Vote ───────────────────");
  let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    totalMaj++;
    const majCat = majorityOf(s1anns.map(a => a.label.content_category));
    const majSpec = majorityOf(s1anns.map(a => a.label.specificity_level));
    // No majority (three-way split) counts as a disagreement.
    const catOk = majCat !== undefined && mimoLabel.content_category === majCat;
    const specOk = majSpec !== undefined && mimoLabel.specificity_level === majSpec;
    if (catOk) catMajAgree++;
    if (specOk) specMajAgree++;
    if (catOk && specOk) bothMajAgree++;
  }
  console.log(` mimo vs majority (n=${totalMaj}):`);
  console.log(` Category: ${pct(catMajAgree, totalMaj)} (${catMajAgree})`);
  console.log(` Specificity: ${pct(specMajAgree, totalMaj)} (${specMajAgree})`);
  console.log(` Both: ${pct(bothMajAgree, totalMaj)} (${bothMajAgree})`);

  // Build the gemini/nano/grok panel once; the remaining sections all use it.
  const panel: PanelRow[] = [];
  let panelSkipped = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    const row = buildPanelRow(mimoLabel, s1anns);
    if (row) panel.push(row);
    else panelSkipped++;
  }
  if (panelSkipped > 0) {
    console.error(` Skipped ${panelSkipped} paragraphs lacking a gemini/nano/grok annotation`);
  }
  const nCompare = panel.length;

  // Unanimity: if mimo replaced nano, what would the new unanimity be?
  console.log("\n── Hypothetical: replace nano with mimo ────────────────────");
  let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
  let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;
  for (const { mimo, gemini, nano, grok } of panel) {
    // Old (with nano)
    const oldCU = new Set([gemini, nano, grok].map(a => a.label.content_category)).size === 1;
    const oldSU = new Set([gemini, nano, grok].map(a => a.label.specificity_level)).size === 1;
    if (oldCU) oldCatUnan++;
    if (oldSU) oldSpecUnan++;
    if (oldCU && oldSU) oldBothUnan++;

    // New (with mimo replacing nano)
    const newCU = new Set([gemini.label, mimo, grok.label].map(l => l.content_category)).size === 1;
    const newSU = new Set([gemini.label, mimo, grok.label].map(l => l.specificity_level)).size === 1;
    if (newCU) newCatUnan++;
    if (newSU) newSpecUnan++;
    if (newCU && newSU) newBothUnan++;
  }
  console.log(` n=${nCompare}`);
  console.log(` Old (nano) New (mimo) Delta`);
  console.log(` Category: ${pct(oldCatUnan, nCompare).padStart(6)} ${pct(newCatUnan, nCompare).padStart(6)} ${deltaPp(newCatUnan, oldCatUnan, nCompare)}`);
  console.log(` Specificity: ${pct(oldSpecUnan, nCompare).padStart(6)} ${pct(newSpecUnan, nCompare).padStart(6)} ${deltaPp(newSpecUnan, oldSpecUnan, nCompare)}`);
  console.log(` Both: ${pct(oldBothUnan, nCompare).padStart(6)} ${pct(newBothUnan, nCompare).padStart(6)} ${deltaPp(newBothUnan, oldBothUnan, nCompare)}`);

  // Outlier analysis
  console.log("\n── Outlier Rate (mimo in 3-model panel) ────────────────────");
  let mimoCatOut = 0, mimoSpecOut = 0;
  let nanoCatOut = 0, nanoSpecOut = 0;
  for (const { mimo, gemini, nano, grok } of panel) {
    // A model is an outlier when gemini and grok agree but that model differs.
    if (gemini.label.content_category === grok.label.content_category) {
      if (mimo.content_category !== gemini.label.content_category) mimoCatOut++;
      if (nano.label.content_category !== gemini.label.content_category) nanoCatOut++;
    }
    if (gemini.label.specificity_level === grok.label.specificity_level) {
      if (mimo.specificity_level !== gemini.label.specificity_level) mimoSpecOut++;
      if (nano.label.specificity_level !== gemini.label.specificity_level) nanoSpecOut++;
    }
  }
  // Rates below are over all panel paragraphs (nCompare), not only those where gemini and grok agree.
  console.log(` When gemini×grok agree, mimo disagrees:`);
  console.log(` Category: ${mimoCatOut} (${pct(mimoCatOut, nCompare)})`);
  console.log(` Specificity: ${mimoSpecOut} (${pct(mimoSpecOut, nCompare)})`);

  // For comparison: nano outlier rate on same paragraphs
  console.log(`\n For comparison, nano disagrees when gemini×grok agree:`);
  console.log(` Category: ${nanoCatOut} (${pct(nanoCatOut, nCompare)})`);
  console.log(` Specificity: ${nanoSpecOut} (${pct(nanoSpecOut, nCompare)})`);
}

main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});