246 lines
12 KiB
TypeScript
246 lines
12 KiB
TypeScript
/**
 * Run mimo-v2-flash on the same 500-sample pilot set used for prompt iteration.
 * Compares against existing Stage 1 annotations to assess agreement.
 *
 * Usage: bun ts/scripts/mimo-pilot.ts
 */
|
||
import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "../src/schemas/paragraph.ts";
import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
import { PROMPT_VERSION } from "../src/label/prompts.ts";
import { v4 as uuidv4 } from "uuid";
import { existsSync } from "node:fs";
import { fileURLToPath } from "node:url";
import pLimit from "p-limit";
|
||
|
||
/**
 * Resolves a path relative to this script into an absolute filesystem path.
 *
 * `fileURLToPath` is used instead of `URL#pathname` because `pathname` stays
 * percent-encoded (a space becomes `%20`) and is not a valid path on Windows.
 */
const fromHere = (relative: string): string =>
  fileURLToPath(new URL(relative, import.meta.url));

/** Pilot paragraphs that mimo is run on. */
const PILOT_SAMPLE = fromHere("../../data/pilot/pilot-sample-v2.5.jsonl");

/** Existing Stage 1 annotations used as the comparison baseline. */
const STAGE1_PATH = fromHere("../../data/annotations/stage1.jsonl");

/** Append-only output of the mimo run; also read back for resume and analysis. */
const OUTPUT_PATH = fromHere("../../data/pilot/pilot-mimo-flash.jsonl");

/** Model identifier passed to the annotator. */
const MODEL = "xiaomi/mimo-v2-flash";

/** Maximum number of annotation requests in flight at once. */
const CONCURRENCY = 15;
|
||
|
||
/**
 * Subset of an annotation record that this script reads.
 * Records are cast to this shape without runtime validation.
 */
interface S1Ann {
  /** Id of the paragraph the annotation belongs to. */
  paragraphId: string;
  /** The two label dimensions compared between annotators. */
  label: { content_category: string; specificity_level: number };
  /** Identifies which model produced the annotation. */
  provenance: { modelId: string };
}
|
||
|
||
/**
 * Formats `n / total` as a percentage with one decimal place, e.g. `"25.0%"`.
 *
 * @param n - Numerator (count of matching items).
 * @param total - Denominator (count of items considered).
 * @returns The formatted percentage, or `"n/a"` when `total` is not a
 *   positive number, so an empty comparison set never prints `NaN%` or
 *   `Infinity%`.
 */
function pct(n: number, total: number): string {
  // `!(total > 0)` also rejects NaN, which `total <= 0` would let through.
  if (!(total > 0)) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}
|
||
|
||
/** Category/specificity label pair shared by Stage 1 and mimo annotations. */
type PilotLabel = S1Ann["label"];

/** The three Stage 1 annotations that form a complete panel for one paragraph. */
interface Stage1Panel {
  gemini: S1Ann;
  nano: S1Ann;
  grok: S1Ann;
}

/**
 * Returns the first value that occurs at least twice in `values`, or
 * `undefined` when every value is distinct (no majority).
 */
function majorityOf<T>(values: readonly T[]): T | undefined {
  const freq = new Map<T, number>();
  for (const v of values) freq.set(v, (freq.get(v) ?? 0) + 1);
  for (const [value, count] of freq) {
    if (count >= 2) return value;
  }
  return undefined;
}

/**
 * Picks the gemini, nano and grok annotations out of a paragraph's Stage 1
 * annotations by substring match on the model id.
 *
 * @returns The panel, or `undefined` when there are not exactly three
 *   annotations or any of the three models is missing. Callers skip such
 *   paragraphs instead of dereferencing a missing annotation.
 */
function resolvePanel(anns: readonly S1Ann[] | undefined): Stage1Panel | undefined {
  if (!anns || anns.length !== 3) return undefined;
  const byFragment = (fragment: string) =>
    anns.find(a => a.provenance.modelId.includes(fragment));
  const gemini = byFragment("gemini");
  const nano = byFragment("nano");
  const grok = byFragment("grok");
  if (!gemini || !nano || !grok) return undefined;
  return { gemini, nano, grok };
}

/**
 * Annotates the pilot sample with mimo-v2-flash (skipping paragraphs already
 * present in the output file), then prints agreement statistics against the
 * Stage 1 annotations: pairwise agreement, agreement with the Stage 1
 * majority vote, unanimity if mimo replaced nano, and outlier rates.
 *
 * Progress and errors go to stderr; the comparison report goes to stdout.
 */
async function main(): Promise<void> {
  // Load pilot sample paragraphs
  console.error("Loading pilot sample paragraphs...");
  const { records: paragraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
  console.error(` ${paragraphs.length} paragraphs`);

  const pilotIds = new Set(paragraphs.map(p => p.id));

  // Load Stage 1 annotations for these paragraphs
  console.error("Loading Stage 1 annotations for comparison...");
  const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
  const s1ByParagraph = new Map<string, S1Ann[]>();
  for (const raw of allAnns) {
    const a = raw as S1Ann;
    if (!pilotIds.has(a.paragraphId)) continue;
    let arr = s1ByParagraph.get(a.paragraphId);
    if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }
  console.error(` ${s1ByParagraph.size} paragraphs with Stage 1 data`);

  // Resume support: paragraphs already present in the output file are skipped.
  const doneKeys = new Set<string>();
  if (existsSync(OUTPUT_PATH)) {
    const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of existing) {
      const a = r as { paragraphId?: string };
      if (a.paragraphId) doneKeys.add(a.paragraphId);
    }
    if (doneKeys.size > 0) console.error(` Resuming: ${doneKeys.size} already done`);
  }

  const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
  console.error(` Running ${remaining.length} annotations...\n`);

  // Run mimo on remaining paragraphs
  const runId = uuidv4();
  const limit = pLimit(CONCURRENCY);
  let completed = 0, failed = 0, totalCost = 0;
  const startTime = Date.now();

  const tasks = remaining.map(p => limit(async () => {
    const opts: AnnotateOpts = {
      modelId: MODEL,
      stage: "benchmark",
      runId,
      promptVersion: PROMPT_VERSION,
      reasoningEffort: "low",
    };
    try {
      const ann = await annotateParagraph(p, opts);
      // Each result is appended as soon as it arrives so a later run can resume.
      await appendJsonl(OUTPUT_PATH, ann);
      totalCost += ann.provenance.costUsd;
      completed++;
      if (completed % 50 === 0) {
        const elapsed = (Date.now() - startTime) / 1000;
        process.stderr.write(`\r ${completed}/${remaining.length} (${(completed / elapsed).toFixed(1)}/s, $${totalCost.toFixed(2)}) `);
      }
    } catch (err) {
      // A failed paragraph is logged and counted; it does not abort the run.
      failed++;
      console.error(`\n ✖ ${p.id.slice(0, 8)}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }));

  await Promise.all(tasks);
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
  console.error(`\n\n Done: ${completed} completed, ${failed} failed, $${totalCost.toFixed(2)}, ${elapsed}s\n`);

  // ── Analysis ─────────────────────────────────────────────────────────
  // Load all mimo results (including resumed). The file may not exist when
  // nothing has ever been written, so it gets the same existence guard as
  // the resume step above.
  const mimoByParagraph = new Map<string, PilotLabel>();
  if (existsSync(OUTPUT_PATH)) {
    const { records: mimoRaw } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of mimoRaw) {
      const a = r as { paragraphId?: string; label?: PilotLabel };
      // Skip records that cannot be compared rather than crash further down.
      if (!a.paragraphId || !a.label) continue;
      mimoByParagraph.set(a.paragraphId, a.label);
    }
  }

  const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
  const shortName = (m: string) => m.split("/").pop()!;

  console.log("═══════════════════════════════════════════════════════════");
  console.log(" MIMO-V2-FLASH PILOT COMPARISON (n=" + mimoByParagraph.size + ")");
  console.log("═══════════════════════════════════════════════════════════\n");

  // Pairwise agreement: mimo vs each Stage 1 model
  console.log("── Pairwise Agreement (mimo vs Stage 1 models) ─────────────");
  for (const model of s1Models) {
    let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
    for (const [pid, mimoLabel] of mimoByParagraph) {
      const s1ann = s1ByParagraph.get(pid)?.find(a => a.provenance.modelId === model);
      if (!s1ann) continue;
      total++;
      const sameCat = s1ann.label.content_category === mimoLabel.content_category;
      const sameSpec = s1ann.label.specificity_level === mimoLabel.specificity_level;
      if (sameCat) catAgree++;
      if (sameSpec) specAgree++;
      if (sameCat && sameSpec) bothAgree++;
    }
    console.log(`\n mimo × ${shortName(model)} (n=${total}):`);
    console.log(` Category: ${pct(catAgree, total)} (${catAgree})`);
    console.log(` Specificity: ${pct(specAgree, total)} (${specAgree})`);
    console.log(` Both: ${pct(bothAgree, total)} (${bothAgree})`);
  }

  // Agreement with majority vote
  console.log("\n── Agreement with Stage 1 Majority Vote ───────────────────");
  let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    totalMaj++;

    const majCat = majorityOf(s1anns.map(a => a.label.content_category));
    const majSpec = majorityOf(s1anns.map(a => a.label.specificity_level));

    // A three-way split has no majority and therefore counts as a non-match.
    const catOk = majCat !== undefined && mimoLabel.content_category === majCat;
    const specOk = majSpec !== undefined && mimoLabel.specificity_level === majSpec;
    if (catOk) catMajAgree++;
    if (specOk) specMajAgree++;
    if (catOk && specOk) bothMajAgree++;
  }
  console.log(` mimo vs majority (n=${totalMaj}):`);
  console.log(` Category: ${pct(catMajAgree, totalMaj)} (${catMajAgree})`);
  console.log(` Specificity: ${pct(specMajAgree, totalMaj)} (${specMajAgree})`);
  console.log(` Both: ${pct(bothMajAgree, totalMaj)} (${bothMajAgree})`);

  // Panel statistics, gathered in a single pass over paragraphs with a
  // complete gemini/nano/grok panel:
  //  - unanimity of the old panel (with nano) and the new one (mimo replacing nano)
  //  - how often mimo, and for comparison nano, disagrees when gemini and grok agree
  let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
  let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;
  let mimoCatOut = 0, mimoSpecOut = 0;
  let nanoCatOut = 0, nanoSpecOut = 0;
  let nCompare = 0;

  for (const [pid, mimoLabel] of mimoByParagraph) {
    const panel = resolvePanel(s1ByParagraph.get(pid));
    if (!panel) continue;
    nCompare++;
    const { gemini, nano, grok } = panel;

    // Old (with nano)
    const oldCU = new Set([gemini, nano, grok].map(a => a.label.content_category)).size === 1;
    const oldSU = new Set([gemini, nano, grok].map(a => a.label.specificity_level)).size === 1;
    if (oldCU) oldCatUnan++;
    if (oldSU) oldSpecUnan++;
    if (oldCU && oldSU) oldBothUnan++;

    // New (with mimo replacing nano)
    const newCU = new Set([gemini.label.content_category, mimoLabel.content_category, grok.label.content_category]).size === 1;
    const newSU = new Set([gemini.label.specificity_level, mimoLabel.specificity_level, grok.label.specificity_level]).size === 1;
    if (newCU) newCatUnan++;
    if (newSU) newSpecUnan++;
    if (newCU && newSU) newBothUnan++;

    // A model is an outlier when gemini and grok agree but it differs.
    if (gemini.label.content_category === grok.label.content_category) {
      if (mimoLabel.content_category !== gemini.label.content_category) mimoCatOut++;
      if (nano.label.content_category !== gemini.label.content_category) nanoCatOut++;
    }
    if (gemini.label.specificity_level === grok.label.specificity_level) {
      if (mimoLabel.specificity_level !== gemini.label.specificity_level) mimoSpecOut++;
      if (nano.label.specificity_level !== gemini.label.specificity_level) nanoSpecOut++;
    }
  }

  // Unanimity: if mimo replaced nano, what would the new unanimity be?
  console.log("\n── Hypothetical: replace nano with mimo ────────────────────");
  console.log(` n=${nCompare}`);
  console.log(` Old (nano) New (mimo) Delta`);
  console.log(` Category: ${pct(oldCatUnan, nCompare).padStart(6)} ${pct(newCatUnan, nCompare).padStart(6)} ${((newCatUnan - oldCatUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
  console.log(` Specificity: ${pct(oldSpecUnan, nCompare).padStart(6)} ${pct(newSpecUnan, nCompare).padStart(6)} ${((newSpecUnan - oldSpecUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
  console.log(` Both: ${pct(oldBothUnan, nCompare).padStart(6)} ${pct(newBothUnan, nCompare).padStart(6)} ${((newBothUnan - oldBothUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);

  // Outlier analysis. Percentages are taken over all nCompare panel
  // paragraphs, not only over those where gemini and grok agree.
  console.log("\n── Outlier Rate (mimo in 3-model panel) ────────────────────");
  console.log(` When gemini×grok agree, mimo disagrees:`);
  console.log(` Category: ${mimoCatOut} (${pct(mimoCatOut, nCompare)})`);
  console.log(` Specificity: ${mimoSpecOut} (${pct(mimoSpecOut, nCompare)})`);

  // For comparison: nano outlier rate on same paragraphs
  console.log(`\n For comparison, nano disagrees when gemini×grok agree:`);
  console.log(` Category: ${nanoCatOut} (${pct(nanoCatOut, nCompare)})`);
  console.log(` Specificity: ${nanoSpecOut} (${pct(nanoSpecOut, nCompare)})`);
}
|
||
|
||
// Entry point: print any unhandled failure and exit with a non-zero status.
main().catch((failure: unknown) => {
  console.error(failure);
  process.exit(1);
});
|