SEC-cyBERT/ts/scripts/mimo-pilot.ts

/**
 * Run mimo-v2-flash on the same 500-sample pilot set used for prompt iteration.
 * Compares against existing Stage 1 annotations to assess agreement.
 *
 * Usage: bun ts/scripts/mimo-pilot.ts
 */
import { readJsonl, readJsonlRaw, appendJsonl } from "../src/lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { annotateParagraph, type AnnotateOpts } from "../src/label/annotate.ts";
import { PROMPT_VERSION } from "../src/label/prompts.ts";
import { v4 as uuidv4 } from "uuid";
import { existsSync } from "node:fs";
import { fileURLToPath } from "node:url";
import pLimit from "p-limit";

// Data files are resolved relative to this script's own location.
// fileURLToPath is used rather than URL.pathname because pathname is the
// percent-encoded URL path (a space stays "%20", Windows drive paths keep a
// leading "/"), which is not a valid filesystem path in those cases.
/** Pilot sample paragraphs that get annotated by this script. */
const PILOT_SAMPLE = fileURLToPath(new URL("../../data/pilot/pilot-sample-v2.5.jsonl", import.meta.url));
/** Existing Stage 1 annotations used as the comparison baseline. */
const STAGE1_PATH = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
/** Append-only output file; also read back for resume and for the analysis. */
const OUTPUT_PATH = fileURLToPath(new URL("../../data/pilot/pilot-mimo-flash.jsonl", import.meta.url));
/** Model ID passed to annotateParagraph. */
const MODEL = "xiaomi/mimo-v2-flash";
/** Maximum number of annotation calls in flight at once (p-limit). */
const CONCURRENCY = 15;

/** The two label dimensions this script compares between annotators. */
interface S1Label {
  content_category: string;
  specificity_level: number;
}

/** Provenance fields this script reads from a Stage 1 annotation. */
interface S1Provenance {
  modelId: string;
}

/**
 * Subset of a Stage 1 annotation record that this script reads.
 * Raw JSONL records are cast to this shape without runtime validation.
 */
interface S1Ann {
  paragraphId: string;
  label: S1Label;
  provenance: S1Provenance;
}

/**
 * Format `n / total` as a percentage with one decimal place, e.g. "25.0%".
 *
 * @param n - Numerator (count of matching items).
 * @param total - Denominator (count of compared items).
 * @returns The formatted percentage, or "n/a" when `total` is 0 so that an
 *   empty comparison set does not print "NaN%" in the report.
 */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}

/** Stage 1 annotations from the three panel models for a single paragraph. */
interface S1Panel {
  gemini: S1Ann;
  nano: S1Ann;
  grok: S1Ann;
}

/**
 * Pick the gemini / nano / grok annotations out of a paragraph's Stage 1 rows.
 *
 * @param s1anns - Stage 1 annotations recorded for one paragraph, if any.
 * @returns The panel, or `undefined` unless there are exactly three rows and
 *   each of the three models (matched by substring of `modelId`) is present.
 */
function findS1Panel(s1anns: S1Ann[] | undefined): S1Panel | undefined {
  if (!s1anns || s1anns.length !== 3) return undefined;
  const byModel = (needle: string) => s1anns.find(a => a.provenance.modelId.includes(needle));
  const gemini = byModel("gemini");
  const nano = byModel("nano");
  const grok = byModel("grok");
  if (!gemini || !nano || !grok) return undefined;
  return { gemini, nano, grok };
}

/**
 * Annotate the pilot sample with MODEL, then print agreement statistics
 * against the existing Stage 1 annotations.
 *
 * Progress and errors go to stderr; the comparison report goes to stdout.
 * Paragraphs already present in OUTPUT_PATH are not re-annotated, so the
 * script can be re-run to resume. Individual annotation failures are logged
 * and counted but do not abort the run.
 */
async function main() {
  // Load pilot sample paragraphs
  console.error("Loading pilot sample paragraphs...");
  const { records: paragraphs } = await readJsonl(PILOT_SAMPLE, Paragraph);
  console.error(`  ${paragraphs.length} paragraphs`);

  const pilotIds = new Set(paragraphs.map(p => p.id));

  // Load Stage 1 annotations for these paragraphs, grouped by paragraph ID
  console.error("Loading Stage 1 annotations for comparison...");
  const { records: allAnns } = await readJsonlRaw(STAGE1_PATH);
  const s1ByParagraph = new Map<string, S1Ann[]>();
  for (const raw of allAnns) {
    const a = raw as S1Ann;
    if (!pilotIds.has(a.paragraphId)) continue;
    let arr = s1ByParagraph.get(a.paragraphId);
    if (!arr) { arr = []; s1ByParagraph.set(a.paragraphId, arr); }
    arr.push(a);
  }
  console.error(`  ${s1ByParagraph.size} paragraphs with Stage 1 data`);

  // Resume support: skip paragraphs that already have a row in the output file
  const doneKeys = new Set<string>();
  if (existsSync(OUTPUT_PATH)) {
    const { records: existing } = await readJsonlRaw(OUTPUT_PATH);
    for (const r of existing) {
      const a = r as { paragraphId?: string };
      if (a.paragraphId) doneKeys.add(a.paragraphId);
    }
    if (doneKeys.size > 0) console.error(`  Resuming: ${doneKeys.size} already done`);
  }

  const remaining = paragraphs.filter(p => !doneKeys.has(p.id));
  console.error(`  Running ${remaining.length} annotations...\n`);

  // Run mimo on remaining paragraphs
  const runId = uuidv4();
  const limit = pLimit(CONCURRENCY);
  let completed = 0, failed = 0, totalCost = 0;
  const startTime = Date.now();

  const tasks = remaining.map(p => limit(async () => {
    const opts: AnnotateOpts = {
      modelId: MODEL,
      stage: "benchmark",
      runId,
      promptVersion: PROMPT_VERSION,
      reasoningEffort: "low",
    };
    try {
      const ann = await annotateParagraph(p, opts);
      // Persist each result immediately so an interrupted run can resume.
      await appendJsonl(OUTPUT_PATH, ann);
      totalCost += ann.provenance.costUsd;
      completed++;
      if (completed % 50 === 0) {
        const elapsed = (Date.now() - startTime) / 1000;
        process.stderr.write(`\r  ${completed}/${remaining.length} (${(completed / elapsed).toFixed(1)}/s, $${totalCost.toFixed(2)})   `);
      }
    } catch (err) {
      // A single failed paragraph is logged and counted; the run continues.
      failed++;
      console.error(`\n  ✖ ${p.id.slice(0, 8)}: ${err instanceof Error ? err.message : String(err)}`);
    }
  }));

  await Promise.all(tasks);
  const elapsed = ((Date.now() - startTime) / 1000).toFixed(0);
  console.error(`\n\n  Done: ${completed} completed, ${failed} failed, $${totalCost.toFixed(2)}, ${elapsed}s\n`);

  // ── Analysis ─────────────────────────────────────────────────────────
  // Load all mimo results (including resumed); a later row for the same
  // paragraph overwrites an earlier one.
  const { records: mimoRaw } = await readJsonlRaw(OUTPUT_PATH);
  const mimoByParagraph = new Map<string, S1Ann["label"]>();
  for (const r of mimoRaw) {
    const a = r as { paragraphId: string; label: S1Ann["label"] };
    mimoByParagraph.set(a.paragraphId, a.label);
  }

  const s1Models = ["google/gemini-3.1-flash-lite-preview", "openai/gpt-5.4-nano", "x-ai/grok-4.1-fast"];
  const shortName = (m: string) => m.split("/").pop()!;

  console.log("═══════════════════════════════════════════════════════════");
  console.log("  MIMO-V2-FLASH PILOT COMPARISON (n=" + mimoByParagraph.size + ")");
  console.log("═══════════════════════════════════════════════════════════\n");

  // Pairwise agreement: mimo vs each Stage 1 model
  console.log("── Pairwise Agreement (mimo vs Stage 1 models) ─────────────");
  for (const model of s1Models) {
    let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
    for (const [pid, mimoLabel] of mimoByParagraph) {
      const s1anns = s1ByParagraph.get(pid);
      if (!s1anns) continue;
      const s1ann = s1anns.find(a => a.provenance.modelId === model);
      if (!s1ann) continue;
      total++;
      const catSame = s1ann.label.content_category === mimoLabel.content_category;
      const specSame = s1ann.label.specificity_level === mimoLabel.specificity_level;
      if (catSame) catAgree++;
      if (specSame) specAgree++;
      if (catSame && specSame) bothAgree++;
    }
    console.log(`\n  mimo × ${shortName(model)} (n=${total}):`);
    console.log(`    Category:    ${pct(catAgree, total)} (${catAgree})`);
    console.log(`    Specificity: ${pct(specAgree, total)} (${specAgree})`);
    console.log(`    Both:        ${pct(bothAgree, total)} (${bothAgree})`);
  }

  // Agreement with majority vote
  console.log("\n── Agreement with Stage 1 Majority Vote ───────────────────");
  let catMajAgree = 0, specMajAgree = 0, bothMajAgree = 0, totalMaj = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    totalMaj++;

    // Category majority (undefined when all three disagree)
    const cats = s1anns.map(a => a.label.content_category);
    const catFreq = new Map<string, number>();
    for (const c of cats) catFreq.set(c, (catFreq.get(c) ?? 0) + 1);
    const majCat = [...catFreq.entries()].find(([, v]) => v >= 2)?.[0];

    // Specificity majority (undefined when all three disagree)
    const specs = s1anns.map(a => a.label.specificity_level);
    const specFreq = new Map<number, number>();
    for (const s of specs) specFreq.set(s, (specFreq.get(s) ?? 0) + 1);
    const majSpec = [...specFreq.entries()].find(([, v]) => v >= 2)?.[0];

    const catOk = majCat !== undefined && mimoLabel.content_category === majCat;
    const specOk = majSpec !== undefined && mimoLabel.specificity_level === majSpec;
    if (catOk) catMajAgree++;
    if (specOk) specMajAgree++;
    if (catOk && specOk) bothMajAgree++;
  }
  console.log(`  mimo vs majority (n=${totalMaj}):`);
  console.log(`    Category:    ${pct(catMajAgree, totalMaj)} (${catMajAgree})`);
  console.log(`    Specificity: ${pct(specMajAgree, totalMaj)} (${specMajAgree})`);
  console.log(`    Both:        ${pct(bothMajAgree, totalMaj)} (${bothMajAgree})`);

  // Resolve the gemini/nano/grok panel once for every paragraph used by the
  // remaining sections. Paragraphs with three Stage 1 rows that do not cover
  // all three models (e.g. duplicate rows) are skipped instead of crashing.
  const panels: Array<{ mimoLabel: S1Ann["label"]; panel: S1Panel }> = [];
  let incompletePanels = 0;
  for (const [pid, mimoLabel] of mimoByParagraph) {
    const s1anns = s1ByParagraph.get(pid);
    if (!s1anns || s1anns.length !== 3) continue;
    const panel = findS1Panel(s1anns);
    if (!panel) { incompletePanels++; continue; }
    panels.push({ mimoLabel, panel });
  }
  if (incompletePanels > 0) {
    console.error(`  Warning: skipped ${incompletePanels} paragraphs whose 3 Stage 1 annotations do not cover gemini, nano and grok`);
  }
  const nCompare = panels.length;

  // Unanimity: if mimo replaced nano, what would the new unanimity be?
  console.log("\n── Hypothetical: replace nano with mimo ────────────────────");
  let newCatUnan = 0, newSpecUnan = 0, newBothUnan = 0;
  let oldCatUnan = 0, oldSpecUnan = 0, oldBothUnan = 0;

  for (const { mimoLabel, panel: { gemini, nano, grok } } of panels) {
    // Old (with nano)
    const oldCats = [gemini, nano, grok].map(a => a.label.content_category);
    const oldSpecs = [gemini, nano, grok].map(a => a.label.specificity_level);
    const oldCU = new Set(oldCats).size === 1;
    const oldSU = new Set(oldSpecs).size === 1;
    if (oldCU) oldCatUnan++;
    if (oldSU) oldSpecUnan++;
    if (oldCU && oldSU) oldBothUnan++;

    // New (with mimo replacing nano)
    const newCats = [gemini.label.content_category, mimoLabel.content_category, grok.label.content_category];
    const newSpecs = [gemini.label.specificity_level, mimoLabel.specificity_level, grok.label.specificity_level];
    const newCU = new Set(newCats).size === 1;
    const newSU = new Set(newSpecs).size === 1;
    if (newCU) newCatUnan++;
    if (newSU) newSpecUnan++;
    if (newCU && newSU) newBothUnan++;
  }

  console.log(`  n=${nCompare}`);
  console.log(`                    Old (nano)    New (mimo)    Delta`);
  console.log(`    Category:       ${pct(oldCatUnan, nCompare).padStart(6)}        ${pct(newCatUnan, nCompare).padStart(6)}       ${((newCatUnan - oldCatUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
  console.log(`    Specificity:    ${pct(oldSpecUnan, nCompare).padStart(6)}        ${pct(newSpecUnan, nCompare).padStart(6)}       ${((newSpecUnan - oldSpecUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);
  console.log(`    Both:           ${pct(oldBothUnan, nCompare).padStart(6)}        ${pct(newBothUnan, nCompare).padStart(6)}       ${((newBothUnan - oldBothUnan) / nCompare * 100).toFixed(1).padStart(5)}pp`);

  // Outlier analysis: a model is the outlier when gemini and grok agree with
  // each other but the model differs. Rates are relative to nCompare.
  console.log("\n── Outlier Rate (mimo in 3-model panel) ────────────────────");
  let mimoCatOut = 0, mimoSpecOut = 0;
  let nanoCatOut = 0, nanoSpecOut = 0;
  for (const { mimoLabel, panel: { gemini, nano, grok } } of panels) {
    const refCat = gemini.label.content_category;
    const refSpec = gemini.label.specificity_level;
    if (refCat === grok.label.content_category) {
      if (mimoLabel.content_category !== refCat) mimoCatOut++;
      if (nano.label.content_category !== refCat) nanoCatOut++;
    }
    if (refSpec === grok.label.specificity_level) {
      if (mimoLabel.specificity_level !== refSpec) mimoSpecOut++;
      if (nano.label.specificity_level !== refSpec) nanoSpecOut++;
    }
  }
  console.log(`  When gemini×grok agree, mimo disagrees:`);
  console.log(`    Category:    ${mimoCatOut} (${pct(mimoCatOut, nCompare)})`);
  console.log(`    Specificity: ${mimoSpecOut} (${pct(mimoSpecOut, nCompare)})`);

  // For comparison: nano outlier rate on same paragraphs
  console.log(`\n  For comparison, nano disagrees when gemini×grok agree:`);
  console.log(`    Category:    ${nanoCatOut} (${pct(nanoCatOut, nCompare)})`);
  console.log(`    Specificity: ${nanoSpecOut} (${pct(nanoSpecOut, nCompare)})`);
}

// Entry point: print any error that escapes main() and exit with status 1.
main().catch((err) => {
  console.error(err);
  process.exit(1);
});