SEC-cyBERT/ts/scripts/mimo-pilot.ts
2026-03-28 23:44:37 -04:00

246 lines
12 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Run mimo-v2-flash on the same 500-sample pilot set used for prompt iteration.
* Compares against existing Stage 1 annotations to assess agreement.
*
* Usage: bun ts/scripts/mimo-pilot.ts
*/
import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
import { PROMPT_VERSION } from "../src/label/prompts.ts";
import { v4 as uuidv4 } from "uuid";
import { existsSync } from "node:fs";
import { fileURLToPath } from "node:url";
import pLimit from "p-limit";
/**
 * Resolves a path relative to this script into a filesystem path.
 *
 * `fileURLToPath` is used rather than `URL#pathname` because `pathname` stays
 * percent-encoded (a directory containing a space becomes `%20`) and keeps a
 * leading slash before Windows drive letters, so it is not a usable file path.
 */
const resolveFromScript = (relative: string): string =>
  fileURLToPath(new URL(relative, import.meta.url));

/** Pilot sample paragraphs that mimo is run on. */
const PILOT_SAMPLE = resolveFromScript("../../data/pilot/pilot-sample-v2.5.jsonl");
/** Existing Stage 1 annotations used as the comparison baseline. */
const STAGE1_PATH = resolveFromScript("../../data/annotations/stage1.jsonl");
/** Where mimo annotations are appended; also read back for resume and analysis. */
const OUTPUT_PATH = resolveFromScript("../../data/pilot/pilot-mimo-flash.jsonl");
/** Model identifier passed to the annotator. */
const MODEL = "xiaomi/mimo-v2-flash";
/** Maximum number of annotation requests in flight at once. */
const CONCURRENCY = 15;
/**
 * The subset of a Stage 1 annotation record that this script reads.
 * Records are cast to this shape after loading; they are not validated.
 */
interface S1Ann {
  /** Matched against the `id` of the pilot paragraphs. */
  paragraphId: string;
  /** The two label dimensions compared between annotators. */
  label: {
    content_category: string;
    specificity_level: number;
  };
  /** Identifies which model produced the annotation. */
  provenance: {
    modelId: string;
  };
}
/**
 * Formats `n / total` as a percentage with one decimal place, e.g. `"25.0%"`.
 *
 * @param n - Numerator (number of matching items).
 * @param total - Denominator (size of the compared population).
 * @returns The formatted percentage, or `"n/a"` when `total` is 0 so that an
 *   empty comparison set is not reported as `"NaN%"`.
 */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}
/** Category/specificity label pair compared between annotators. */
type PilotLabel = S1Ann["label"];

/** The gemini, nano and grok Stage 1 annotations for a single paragraph. */
interface Stage1Panel {
  gemini: S1Ann;
  nano: S1Ann;
  grok: S1Ann;
}

/** Returns the first value occurring at least twice in `values`, or `undefined` if there is none. */
function majorityOf<T>(values: readonly T[]): T | undefined {
  const freq = new Map<T, number>();
  for (const v of values) freq.set(v, (freq.get(v) ?? 0) + 1);
  for (const [value, count] of freq) {
    if (count >= 2) return value;
  }
  return undefined;
}

/**
 * Picks the gemini/nano/grok annotations out of a paragraph's Stage 1 set.
 * Returns `undefined` unless there are exactly three annotations and each of
 * the three models is present, so callers never dereference a missing model.
 */
function findStage1Panel(anns: readonly S1Ann[] | undefined): Stage1Panel | undefined {
  if (!anns || anns.length !== 3) return undefined;
  const byModel = (fragment: string) => anns.find(a => a.provenance.modelId.includes(fragment));
  const gemini = byModel("gemini");
  const nano = byModel("nano");
  const grok = byModel("grok");
  if (!gemini || !nano || !grok) return undefined;
  return { gemini, nano, grok };
}

/**
 * Annotates the pilot sample with mimo (resuming from any existing output),
 * then prints an agreement report against the Stage 1 annotations.
 *
 * Progress and diagnostics go to stderr; the comparison report goes to stdout.
 * Individual annotation failures are logged and counted but do not abort the
 * run. If no mimo annotations exist after the run, the report is skipped and
 * `process.exitCode` is set to 1.
 */
async function main(): Promise<void> {
  // Load pilot sample paragraphs
  console.error("Loading pilot sample paragraphs...");
  const { records: paragraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
  console.error(` ${paragraphs.length} paragraphs`);
  const pilotIds = new Set(paragraphs.map(p => p.id));

  // Load Stage 1 annotations for these paragraphs
  console.error("Loading Stage 1 annotations for comparison...");
  const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
  const s1ByParagraph = new Map<string, S1Ann[]>();
  for (const raw of allAnns) {
    // NOTE(review): records are cast, not validated, against S1Ann.
    const a = raw as S1Ann;
    if (!pilotIds.has(a.paragraphId)) continue;
    let arr = s1ByParagraph.get(a.paragraphId);
    if (!arr) {
      arr = [];
      s1ByParagraph.set(a.paragraphId, arr);
    }
    arr.push(a);
  }
  console.error(` ${s1ByParagraph.size} paragraphs with Stage 1 data`);

  // Resume support: paragraphs already present in the output file are skipped.
  const doneKeys = new Set<string>();
  if (existsSync(OUTPUT_PATH)) {
    const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of existing) {
      const a = r as { paragraphId?: string };
      if (a.paragraphId) doneKeys.add(a.paragraphId);
    }
    if (doneKeys.size > 0) console.error(` Resuming: ${doneKeys.size} already done`);
  }
  const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
  console.error(` Running ${remaining.length} annotations...\n`);

  // Run mimo on remaining paragraphs
  const runId = uuidv4();
  const limit = pLimit(CONCURRENCY);
  let completed = 0, failed = 0, totalCost = 0;
  const startTime = Date.now();
  const tasks = remaining.map(p => limit(async () => {
    const opts: AnnotateOpts = {
      modelId: MODEL,
      stage: "benchmark",
      runId,
      promptVersion: PROMPT_VERSION,
      reasoningEffort: "low",
    };
    try {
      const ann = await annotateParagraph(p, opts);
      // Persist each result immediately so an interrupted run can resume.
      await appendJsonl(OUTPUT_PATH, ann);
      totalCost += ann.provenance.costUsd;
      completed++;
      if (completed % 50 === 0) {
        const secondsSoFar = (Date.now() - startTime) / 1000;
        process.stderr.write(`\r ${completed}/${remaining.length} (${(completed / secondsSoFar).toFixed(1)}/s, $${totalCost.toFixed(2)}) `);
      }
    } catch (err) {
      // A single failed paragraph is reported and counted; the run continues.
      failed++;
      console.error(`\n ✖ ${p.id.slice(0, 8)}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }));
  await Promise.all(tasks);
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
  console.error(`\n\n Done: ${completed} completed, ${failed} failed, $${totalCost.toFixed(2)}, ${elapsed}s\n`);

  // ── Analysis ─────────────────────────────────────────────────────────
  // Load all mimo results (including resumed). The file only exists once at
  // least one annotation has been appended, so guard the read.
  const mimoRaw = existsSync(OUTPUT_PATH) ? (await readJsonlRaw(OUTPUT_PATH)).records : [];
  const mimoByParagraph = new Map<string, PilotLabel>();
  for (const r of mimoRaw) {
    const a = r as { paragraphId: string; label: PilotLabel };
    mimoByParagraph.set(a.paragraphId, a.label);
  }
  if (mimoByParagraph.size === 0) {
    console.error(" No mimo annotations available; skipping comparison.");
    process.exitCode = 1;
    return;
  }

  const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
  const shortName = (m: string): string => m.split("/").pop() ?? m;

  /** Prints the Category / Specificity / Both agreement lines for one comparison. */
  const reportAgreement = (cat: number, spec: number, both: number, total: number): void => {
    console.log(` Category: ${pct(cat, total)} (${cat})`);
    console.log(` Specificity: ${pct(spec, total)} (${spec})`);
    console.log(` Both: ${pct(both, total)} (${both})`);
  };

  console.log("═══════════════════════════════════════════════════════════");
  console.log(" MIMO-V2-FLASH PILOT COMPARISON (n=" + mimoByParagraph.size + ")");
  console.log("═══════════════════════════════════════════════════════════\n");

  // Pairwise agreement: mimo vs each Stage 1 model
  console.log("── Pairwise Agreement (mimo vs Stage 1 models) ─────────────");
  for (const model of s1Models) {
    let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
    for (const [pid, mimoLabel] of mimoByParagraph) {
      const s1ann = s1ByParagraph.get(pid)?.find(a => a.provenance.modelId === model);
      if (!s1ann) continue;
      total++;
      const sameCat = s1ann.label.content_category === mimoLabel.content_category;
      const sameSpec = s1ann.label.specificity_level === mimoLabel.specificity_level;
      if (sameCat) catAgree++;
      if (sameSpec) specAgree++;
      if (sameCat && sameSpec) bothAgree++;
    }
    console.log(`\n mimo × ${shortName(model)} (n=${total}):`);
    reportAgreement(catAgree, specAgree, bothAgree, total);
  }

  // Agreement with majority vote
  console.log("\n── Agreement with Stage 1 Majority Vote ───────────────────");
  let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    totalMaj++;
    const majCat = majorityOf(s1anns.map(a => a.label.content_category));
    const majSpec = majorityOf(s1anns.map(a => a.label.specificity_level));
    // A three-way split has no majority and therefore counts as a disagreement.
    const catOk = majCat !== undefined && mimoLabel.content_category === majCat;
    const specOk = majSpec !== undefined && mimoLabel.specificity_level === majSpec;
    if (catOk) catMajAgree++;
    if (specOk) specMajAgree++;
    if (catOk && specOk) bothMajAgree++;
  }
  console.log(` mimo vs majority (n=${totalMaj}):`);
  reportAgreement(catMajAgree, specMajAgree, bothMajAgree, totalMaj);

  // Single pass over paragraphs that have a complete gemini/nano/grok panel.
  // It feeds both the unanimity comparison and the outlier counts below.
  let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
  let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;
  let mimoCatOut = 0, mimoSpecOut = 0;
  let nanoCatOut = 0, nanoSpecOut = 0;
  let nCompare = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const panel = findStage1Panel(s1ByParagraph.get(pid));
    if (!panel) continue;
    nCompare++;
    const geminiLabel = panel.gemini.label;
    const nanoLabel = panel.nano.label;
    const grokLabel = panel.grok.label;

    // Old (with nano)
    const oldCU = new Set([geminiLabel, nanoLabel, grokLabel].map(l => l.content_category)).size === 1;
    const oldSU = new Set([geminiLabel, nanoLabel, grokLabel].map(l => l.specificity_level)).size === 1;
    if (oldCU) oldCatUnan++;
    if (oldSU) oldSpecUnan++;
    if (oldCU && oldSU) oldBothUnan++;

    // New (with mimo replacing nano)
    const newCU = new Set([geminiLabel, mimoLabel, grokLabel].map(l => l.content_category)).size === 1;
    const newSU = new Set([geminiLabel, mimoLabel, grokLabel].map(l => l.specificity_level)).size === 1;
    if (newCU) newCatUnan++;
    if (newSU) newSpecUnan++;
    if (newCU && newSU) newBothUnan++;

    // A model is an outlier when gemini and grok agree but it differs.
    const pairAgreesOnCat = geminiLabel.content_category === grokLabel.content_category;
    const pairAgreesOnSpec = geminiLabel.specificity_level === grokLabel.specificity_level;
    if (pairAgreesOnCat && mimoLabel.content_category !== geminiLabel.content_category) mimoCatOut++;
    if (pairAgreesOnSpec && mimoLabel.specificity_level !== geminiLabel.specificity_level) mimoSpecOut++;
    if (pairAgreesOnCat && nanoLabel.content_category !== geminiLabel.content_category) nanoCatOut++;
    if (pairAgreesOnSpec && nanoLabel.specificity_level !== geminiLabel.specificity_level) nanoSpecOut++;
  }

  /** Percentage-point change between two counts over `nCompare`; "n/a" when nothing was compared. */
  const deltaPp = (newCount: number, oldCount: number): string =>
    nCompare === 0
      ? "n/a".padStart(5)
      : `${((newCount - oldCount) / nCompare * 100).toFixed(1).padStart(5)}pp`;

  // Unanimity: if mimo replaced nano, what would the new unanimity be?
  console.log("\n── Hypothetical: replace nano with mimo ────────────────────");
  console.log(` n=${nCompare}`);
  console.log(` Old (nano) New (mimo) Delta`);
  console.log(` Category: ${pct(oldCatUnan, nCompare).padStart(6)} ${pct(newCatUnan, nCompare).padStart(6)} ${deltaPp(newCatUnan, oldCatUnan)}`);
  console.log(` Specificity: ${pct(oldSpecUnan, nCompare).padStart(6)} ${pct(newSpecUnan, nCompare).padStart(6)} ${deltaPp(newSpecUnan, oldSpecUnan)}`);
  console.log(` Both: ${pct(oldBothUnan, nCompare).padStart(6)} ${pct(newBothUnan, nCompare).padStart(6)} ${deltaPp(newBothUnan, oldBothUnan)}`);

  // Outlier analysis
  console.log("\n── Outlier Rate (mimo in 3-model panel) ────────────────────");
  console.log(` When gemini×grok agree, mimo disagrees:`);
  console.log(` Category: ${mimoCatOut} (${pct(mimoCatOut, nCompare)})`);
  console.log(` Specificity: ${mimoSpecOut} (${pct(mimoSpecOut, nCompare)})`);

  // For comparison: nano outlier rate on same paragraphs
  console.log(`\n For comparison, nano disagrees when gemini×grok agree:`);
  console.log(` Category: ${nanoCatOut} (${pct(nanoCatOut, nCompare)})`);
  console.log(` Specificity: ${nanoSpecOut} (${pct(nanoSpecOut, nCompare)})`);
}
// Entry point: any rejection from main() is printed and the process exits with status 1.
const exitOnFailure = (reason: unknown): never => {
  console.error(reason);
  process.exit(1);
};
main().catch(exitOnFailure);