/**
 * Deep analysis of Stage 1 annotation data.
 *
 * Reads the Stage 1 annotation JSONL file and prints a multi-section report to
 * stdout: cost/latency per model, inter-model agreement, label distributions,
 * confidence vs. agreement, outlier models, majority-vote consensus and a
 * rough Stage 2 workload/cost estimate.
 *
 * Usage: bun ts/scripts/stage1-analyze.ts
 */
import { fileURLToPath } from "node:url";

import { readJsonlRaw } from "../src/lib/jsonl.ts";

// fileURLToPath (instead of URL#pathname) decodes percent-escapes and yields a
// valid native path on every platform.
const INPUT = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));

/** A paragraph enters the agreement analyses only with exactly this many annotations. */
const ANNOTATIONS_PER_PARAGRAPH = 3;

// Rough Stage 2 cost model: stage2 uses sonnet, roughly 3x more expensive per call.
const STAGE2_EXTRA_INPUT_TOKENS = 500; // extra for prior annotation context
const STAGE2_OUTPUT_TOKENS_EST = 150;
const STAGE2_INPUT_USD_PER_MTOK = 3.0; // $3/MTok in (estimate)
const STAGE2_OUTPUT_USD_PER_MTOK = 15.0; // $15/MTok out (estimate)

// ── Types ──────────────────────────────────────────────────────────────

interface Ann {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  provenance: {
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

type ModelAnns = Map<string, Ann[]>; // paragraphId → annotations

interface OutlierTally {
  catOutlier: number;
  specOutlier: number;
  total: number;
}

interface SpecAgreementTally {
  total: number;
  specUnan: number;
  specMaj: number;
}

// ── Helpers ────────────────────────────────────────────────────────────

/** Formats `n / total` as a percentage with one decimal; `"n/a"` when `total` is 0. */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}

/** Median of `arr` (input is not mutated); `NaN` for an empty array. */
function median(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  const sorted = [...arr].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
}

/** Arithmetic mean of `arr`; `NaN` for an empty array. */
function mean(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  return arr.reduce((a, b) => a + b, 0) / arr.length;
}

/** Population standard deviation (divides by `n`, not `n - 1`); `NaN` for an empty array. */
function stddev(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  const m = mean(arr);
  return Math.sqrt(arr.reduce((sum, x) => sum + (x - m) ** 2, 0) / arr.length);
}

/**
 * `p`-th percentile (0–100) of `arr`, linearly interpolated between the two
 * nearest ranks. Returns `NaN` for an empty array.
 */
function percentile(arr: readonly number[], p: number): number {
  if (arr.length === 0) return NaN;
  const sorted = [...arr].sort((a, b) => a - b);
  const idx = (p / 100) * (sorted.length - 1);
  const lo = Math.floor(idx);
  const hi = Math.ceil(idx);
  return lo === hi ? sorted[lo] : sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
}

/** Increments the counter stored under `key`, starting from 0 when absent. */
function bump<K>(counts: Map<K, number>, key: K): void {
  counts.set(key, (counts.get(key) ?? 0) + 1);
}

/** Groups `items` by a string key, preserving first-seen key order and item order. */
function groupBy<T>(items: readonly T[], keyOf: (item: T) => string): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const item of items) {
    const key = keyOf(item);
    const bucket = groups.get(key);
    if (bucket) bucket.push(item);
    else groups.set(key, [item]);
  }
  return groups;
}

/** Frequency table of `values`. */
function countValues<T>(values: readonly T[]): Map<T, number> {
  const freq = new Map<T, number>();
  for (const v of values) bump(freq, v);
  return freq;
}

/** True when some value occurs at least twice (a majority among three annotations). */
function hasMajority<T>(values: readonly T[]): boolean {
  for (const count of countValues(values).values()) {
    if (count >= 2) return true;
  }
  return false;
}

/** First value (in first-seen order) occurring at least twice, or `undefined` if none does. */
function majorityValue<T>(values: readonly T[]): T | undefined {
  for (const [value, count] of countValues(values)) {
    if (count >= 2) return value;
  }
  return undefined;
}

/**
 * Index of the single dissenting value when `values` holds exactly two
 * distinct values and one of them appears only once; otherwise -1.
 */
function oddOneOutIndex<T>(values: readonly T[]): number {
  if (new Set(values).size !== 2) return -1;
  return values.findIndex((v, i) => values.every((other, j) => j === i || other !== v));
}

/** Renders a count table as `key=count, ...`, most frequent first. */
function formatCounts(counts: Map<string, number>): string {
  return [...counts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([k, v]) => `${k}=${v}`)
    .join(", ");
}

// ── Main ───────────────────────────────────────────────────────────────

/** Loads the Stage 1 annotations and prints the full analysis report to stdout. */
async function main(): Promise<void> {
  console.log("Loading annotations...");
  const { records: raw, skipped } = await readJsonlRaw(INPUT);
  // NOTE(review): records are trusted to match `Ann`; nothing validates them at runtime here.
  const anns = raw as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations loaded, ${skipped} skipped\n`);

  if (anns.length === 0) {
    // Every statistic below would be NaN; stop instead of printing a meaningless report.
    console.log(" No annotations to analyze.");
    return;
  }

  const byParagraph = groupBy(anns, a => a.paragraphId);
  const byModel = groupBy(anns, a => a.provenance.modelId);

  const modelNames = [...byModel.keys()].sort();
  const shortName = (m: string): string => m.split("/").pop() ?? m;
  const annsOf = (m: string): Ann[] => byModel.get(m) ?? [];
  const nParagraphs = byParagraph.size;

  // Agreement statistics are only defined for paragraphs with the full set of
  // annotations; the same population is used for every count AND denominator.
  const complete = [...byParagraph.values()].filter(p => p.length === ANNOTATIONS_PER_PARAGRAPH);
  const nComplete = complete.length;

  // ════════════════════════════════════════════════════════════════════
  // 1. OVERVIEW
  // ════════════════════════════════════════════════════════════════════
  console.log("═══════════════════════════════════════════════════════════");
  console.log(" STAGE 1 DEEP ANALYSIS");
  console.log("═══════════════════════════════════════════════════════════\n");

  console.log("── Overview ──────────────────────────────────────────────");
  console.log(` Paragraphs: ${nParagraphs.toLocaleString()}`);
  console.log(` Annotations: ${anns.length.toLocaleString()}`);
  console.log(` Models: ${modelNames.map(shortName).join(", ")}`);
  if (nComplete !== nParagraphs) {
    const excluded = nParagraphs - nComplete;
    console.log(` Note: ${excluded.toLocaleString()} paragraphs without exactly ${ANNOTATIONS_PER_PARAGRAPH} annotations are excluded from agreement statistics`);
  }

  let totalCost = 0, totalInput = 0, totalOutput = 0, totalReasoning = 0;
  for (const { provenance } of anns) {
    totalCost += provenance.costUsd;
    totalInput += provenance.inputTokens;
    totalOutput += provenance.outputTokens;
    totalReasoning += provenance.reasoningTokens;
  }
  console.log(` Total cost: $${totalCost.toFixed(2)}`);
  console.log(` Input tokens: ${(totalInput / 1e6).toFixed(1)}M`);
  console.log(` Output tokens: ${(totalOutput / 1e6).toFixed(1)}M`);
  console.log(` Reasoning: ${(totalReasoning / 1e6).toFixed(1)}M`);

  // ════════════════════════════════════════════════════════════════════
  // 2. PER-MODEL STATS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Per-Model Statistics ───────────────────────────────────");

  for (const model of modelNames) {
    const mas = annsOf(model);
    const costs = mas.map(a => a.provenance.costUsd);
    const latencies = mas.map(a => a.provenance.latencyMs);
    const outputs = mas.map(a => a.provenance.outputTokens);

    console.log(`\n ${shortName(model)} (n=${mas.length.toLocaleString()}):`);
    console.log(` Cost: $${costs.reduce((a, b) => a + b, 0).toFixed(2)} total, $${mean(costs).toFixed(5)}/ann`);
    console.log(` Latency: median ${median(latencies).toFixed(0)}ms, p95 ${percentile(latencies, 95).toFixed(0)}ms, p99 ${percentile(latencies, 99).toFixed(0)}ms`);
    console.log(` Output: median ${median(outputs).toFixed(0)} tokens, mean ${mean(outputs).toFixed(0)}`);

    // Label and confidence distributions for this model
    const catCounts = new Map<string, number>();
    const specCounts = new Map<number, number>();
    const confCatCounts = new Map<string, number>();
    const confSpecCounts = new Map<string, number>();
    for (const { label } of mas) {
      bump(catCounts, label.content_category);
      bump(specCounts, label.specificity_level);
      bump(confCatCounts, label.category_confidence);
      bump(confSpecCounts, label.specificity_confidence);
    }

    const specSummary = [...specCounts.entries()]
      .sort((a, b) => a[0] - b[0])
      .map(([k, v]) => `${k}=${v} (${pct(v, mas.length)})`)
      .join(", ");

    console.log(` Categories: ${formatCounts(catCounts)}`);
    console.log(` Specificity: ${specSummary}`);
    console.log(` Cat confidence: ${formatCounts(confCatCounts)}`);
    console.log(` Spec confidence: ${formatCounts(confSpecCounts)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 3. AGREEMENT ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Agreement Analysis ─────────────────────────────────────");

  let catUnanimous = 0, specUnanimous = 0, bothUnanimous = 0;
  let catMajority = 0, specMajority = 0, bothMajority = 0;
  let catNoMajority = 0, specNoMajority = 0;
  const specSpreads: number[] = [];

  // Category confusion tracking
  const catDisagreementPairs = new Map<string, number>();
  const specDisagreementPatterns = new Map<string, number>();

  for (const panns of complete) {
    const cats = panns.map(a => a.label.content_category);
    const specs = panns.map(a => a.label.specificity_level);

    // Category agreement
    const catUnan = new Set(cats).size === 1;
    const catMaj = hasMajority(cats);
    if (catUnan) catUnanimous++;
    if (catMaj) catMajority++;
    else catNoMajority++;

    // Track category disagreement pairs: one tally per pair of annotations
    // that differ, so a 2-vs-1 split contributes the same pair twice.
    if (!catUnan) {
      const sorted = [...cats].sort();
      for (let i = 0; i < sorted.length; i++) {
        for (let j = i + 1; j < sorted.length; j++) {
          if (sorted[i] !== sorted[j]) {
            bump(catDisagreementPairs, `${sorted[i]} ↔ ${sorted[j]}`);
          }
        }
      }
    }

    // Specificity agreement
    const specUnan = new Set(specs).size === 1;
    const specMaj = hasMajority(specs);
    if (specUnan) specUnanimous++;
    if (specMaj) specMajority++;
    else specNoMajority++;

    // Specificity spread (MAD): mean absolute deviation from the median level
    const specMedian = median(specs);
    specSpreads.push(mean(specs.map(s => Math.abs(s - specMedian))));

    // Track specificity disagreement patterns
    if (!specUnan) {
      const sortedSpecs = [...specs].sort((a, b) => a - b);
      bump(specDisagreementPatterns, `[${sortedSpecs.join(",")}]`);
    }

    // Both
    if (catUnan && specUnan) bothUnanimous++;
    if (catMaj && specMaj) bothMajority++;
  }

  console.log(`\n Unanimity (all 3 agree):`);
  console.log(` Category: ${catUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(catUnanimous, nComplete)})`);
  console.log(` Specificity: ${specUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(specUnanimous, nComplete)})`);
  console.log(` Both: ${bothUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(bothUnanimous, nComplete)})`);

  console.log(`\n Majority (≥2 agree):`);
  console.log(` Category: ${catMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(catMajority, nComplete)})`);
  console.log(` Specificity: ${specMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(specMajority, nComplete)})`);
  console.log(` Both: ${bothMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(bothMajority, nComplete)})`);

  console.log(`\n No majority (3-way split):`);
  console.log(` Category: ${catNoMajority.toLocaleString()} (${pct(catNoMajority, nComplete)})`);
  console.log(` Specificity: ${specNoMajority.toLocaleString()} (${pct(specNoMajority, nComplete)})`);

  console.log(`\n Specificity spread (MAD):`);
  console.log(` Mean: ${mean(specSpreads).toFixed(3)}`);
  console.log(` Median: ${median(specSpreads).toFixed(3)}`);
  console.log(` Std: ${stddev(specSpreads).toFixed(3)}`);

  // Stage 2 need: complete paragraphs that are not unanimous on both labels.
  // Equals catOnly + specOnly + bothDisagree in the workload section below.
  const needsStage2 = nComplete - bothUnanimous;
  console.log(`\n → Need Stage 2 judge: ${needsStage2.toLocaleString()} (${pct(needsStage2, nComplete)})`);

  // ════════════════════════════════════════════════════════════════════
  // 4. DISAGREEMENT BREAKDOWN
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Disagreement Pairs (top 20) ──────────────────");
  const sortedCatDis = [...catDisagreementPairs.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pair, count] of sortedCatDis.slice(0, 20)) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pair}`);
  }

  console.log("\n── Specificity Disagreement Patterns (all) ────────────────");
  const sortedSpecDis = [...specDisagreementPatterns.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pattern, count] of sortedSpecDis) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pattern}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 5. PAIRWISE MODEL AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Pairwise Model Agreement ───────────────────────────────");

  for (let i = 0; i < modelNames.length; i++) {
    for (let j = i + 1; j < modelNames.length; j++) {
      const m1 = modelNames[i], m2 = modelNames[j];
      let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;

      // Uses every paragraph both models annotated, complete or not.
      for (const panns of byParagraph.values()) {
        const a1 = panns.find(a => a.provenance.modelId === m1);
        const a2 = panns.find(a => a.provenance.modelId === m2);
        if (!a1 || !a2) continue;
        total++;
        const sameCat = a1.label.content_category === a2.label.content_category;
        const sameSpec = a1.label.specificity_level === a2.label.specificity_level;
        if (sameCat) catAgree++;
        if (sameSpec) specAgree++;
        if (sameCat && sameSpec) bothAgree++;
      }

      console.log(`\n ${shortName(m1)} × ${shortName(m2)} (n=${total.toLocaleString()}):`);
      console.log(` Category: ${pct(catAgree, total)} (${catAgree.toLocaleString()})`);
      console.log(` Specificity: ${pct(specAgree, total)} (${specAgree.toLocaleString()})`);
      console.log(` Both: ${pct(bothAgree, total)} (${bothAgree.toLocaleString()})`);
    }
  }

  // ════════════════════════════════════════════════════════════════════
  // 6. CATEGORY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Distribution (all annotations) ────────────────");

  const aggCat = countValues(anns.map(a => a.label.content_category));
  const sortedCats = [...aggCat.entries()].sort((a, b) => b[1] - a[1]);
  for (const [cat, count] of sortedCats) {
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${cat}`);
  }

  // Per-model category distribution comparison
  console.log("\n── Category Distribution by Model (%) ─────────────────────");
  const categories = sortedCats.map(([c]) => c);
  const header = "Category".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${header}`);
  for (const cat of categories) {
    const cells = modelNames.map(m => {
      const mas = annsOf(m);
      const count = mas.filter(a => a.label.content_category === cat).length;
      return pct(count, mas.length).padStart(12);
    });
    const row = cat.padEnd(30) + cells.join("");
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 7. SPECIFICITY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Specificity Distribution (all annotations) ──────────────");
  const specLabels = ["Generic Boilerplate", "Domain-Adapted", "Firm-Specific", "Quantified-Verifiable"];
  const aggSpec = countValues(anns.map(a => a.label.specificity_level));
  for (let s = 1; s <= 4; s++) {
    const count = aggSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }

  console.log("\n── Specificity Distribution by Model (%) ──────────────────");
  const specHeader = "Level".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${specHeader}`);
  for (let s = 1; s <= 4; s++) {
    const cells = modelNames.map(m => {
      const mas = annsOf(m);
      const count = mas.filter(a => a.label.specificity_level === s).length;
      return pct(count, mas.length).padStart(12);
    });
    const row = `${s} ${specLabels[s - 1]}`.padEnd(30) + cells.join("");
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 8. CROSS-TABULATION: Category × Specificity
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Category × Specificity Cross-tab (unanimous paragraphs only) ─");
  const crossTab = new Map<string, number>();
  let unanimousCount = 0;
  for (const panns of complete) {
    const cats = panns.map(a => a.label.content_category);
    const specs = panns.map(a => a.label.specificity_level);
    if (new Set(cats).size === 1 && new Set(specs).size === 1) {
      bump(crossTab, `${cats[0]}|${specs[0]}`);
      unanimousCount++;
    }
  }
  console.log(` (${unanimousCount.toLocaleString()} paragraphs with both-unanimous)\n`);

  const levels = [1, 2, 3, 4];
  const ctHeader = "Category".padEnd(30) + levels.map(s => String(s).padStart(8)).join("") + " Total".padStart(8);
  console.log(` ${ctHeader}`);
  for (const cat of categories) {
    const values = levels.map(s => crossTab.get(`${cat}|${s}`) ?? 0);
    const rowTotal = values.reduce((a, b) => a + b, 0);
    const cells = values.map(v => String(v).padStart(8)).join("");
    console.log(` ${cat.padEnd(30)}${cells} ${String(rowTotal).padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 9. CONFIDENCE ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Confidence vs Agreement ─────────────────────────────────");

  // Check if low-confidence predictions are more likely to disagree
  const confBuckets: { label: string; filter: (a: Ann) => boolean }[] = [
    { label: "both high", filter: a => a.label.category_confidence === "high" && a.label.specificity_confidence === "high" },
    { label: "cat low", filter: a => a.label.category_confidence === "low" },
    { label: "spec low", filter: a => a.label.specificity_confidence === "low" },
    { label: "cat medium", filter: a => a.label.category_confidence === "medium" },
    { label: "spec medium", filter: a => a.label.specificity_confidence === "medium" },
  ];

  for (const bucket of confBuckets) {
    // Paragraphs where at least one model reported this confidence
    let totalP = 0, catUnanP = 0, specUnanP = 0, bothUnanP = 0;
    for (const panns of complete) {
      if (!panns.some(bucket.filter)) continue;
      totalP++;
      const catU = new Set(panns.map(a => a.label.content_category)).size === 1;
      const specU = new Set(panns.map(a => a.label.specificity_level)).size === 1;
      if (catU) catUnanP++;
      if (specU) specUnanP++;
      if (catU && specU) bothUnanP++;
    }
    if (totalP === 0) continue;
    console.log(` "${bucket.label}" paragraphs (n=${totalP.toLocaleString()}): cat ${pct(catUnanP, totalP)}, spec ${pct(specUnanP, totalP)}, both ${pct(bothUnanP, totalP)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 10. OUTLIER MODEL ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Outlier Analysis (model is the odd one out) ────────────");
  const outlierCounts = new Map<string, OutlierTally>();
  for (const m of modelNames) {
    outlierCounts.set(m, { catOutlier: 0, specOutlier: 0, total: 0 });
  }

  for (const panns of complete) {
    // Every model's denominator is the number of complete paragraphs.
    for (const tally of outlierCounts.values()) tally.total++;

    // Category: if 2 agree and 1 differs, the differing one is the outlier
    const catOdd = oddOneOutIndex(panns.map(a => a.label.content_category));
    if (catOdd !== -1) {
      const tally = outlierCounts.get(panns[catOdd].provenance.modelId);
      if (tally) tally.catOutlier++;
    }

    // Specificity: if 2 agree and 1 differs
    const specOdd = oddOneOutIndex(panns.map(a => a.label.specificity_level));
    if (specOdd !== -1) {
      const tally = outlierCounts.get(panns[specOdd].provenance.modelId);
      if (tally) tally.specOutlier++;
    }
  }

  for (const m of modelNames) {
    const o = outlierCounts.get(m);
    if (!o) continue;
    console.log(`\n ${shortName(m)}:`);
    console.log(` Category outlier: ${o.catOutlier.toLocaleString()} times (${pct(o.catOutlier, o.total)})`);
    console.log(` Specificity outlier: ${o.specOutlier.toLocaleString()} times (${pct(o.specOutlier, o.total)})`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 11. CATEGORY-SPECIFIC SPECIFICITY AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Specificity Agreement by Category ──────────────────────");
  console.log(" (among paragraphs where all 3 models agree on category)\n");

  const catSpecAgreement = new Map<string, SpecAgreementTally>();
  for (const panns of complete) {
    const cats = panns.map(a => a.label.content_category);
    if (new Set(cats).size !== 1) continue;
    const cat = cats[0];

    let entry = catSpecAgreement.get(cat);
    if (!entry) {
      entry = { total: 0, specUnan: 0, specMaj: 0 };
      catSpecAgreement.set(cat, entry);
    }
    entry.total++;

    const specs = panns.map(a => a.label.specificity_level);
    if (new Set(specs).size === 1) entry.specUnan++;
    if (hasMajority(specs)) entry.specMaj++;
  }

  for (const cat of categories) {
    const e = catSpecAgreement.get(cat);
    if (!e) continue;
    console.log(` ${cat.padEnd(28)} n=${e.total.toLocaleString().padStart(6)} spec-unan: ${pct(e.specUnan, e.total).padStart(6)} spec-maj: ${pct(e.specMaj, e.total).padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 12. CONSENSUS LABELS (majority vote or unanimous)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Consensus Label Distribution (majority vote) ───────────");

  const consensusCat = new Map<string, number>();
  const consensusSpec = new Map<number, number>();
  let noConsensusCat = 0, noConsensusSpec = 0;

  for (const panns of complete) {
    // Category majority
    const majCat = majorityValue(panns.map(a => a.label.content_category));
    if (majCat !== undefined) bump(consensusCat, majCat);
    else noConsensusCat++;

    // Specificity majority
    const majSpec = majorityValue(panns.map(a => a.label.specificity_level));
    if (majSpec !== undefined) bump(consensusSpec, majSpec);
    else noConsensusSpec++;
  }

  console.log("\n Category (majority vote):");
  const sortedConsCat = [...consensusCat.entries()].sort((a, b) => b[1] - a[1]);
  for (const [cat, count] of sortedConsCat) {
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, nComplete).padStart(6)} ${cat}`);
  }
  console.log(` ${noConsensusCat.toLocaleString().padStart(8)} ${pct(noConsensusCat, nComplete).padStart(6)} [no majority]`);

  console.log("\n Specificity (majority vote):");
  for (let s = 1; s <= 4; s++) {
    const count = consensusSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, nComplete).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }
  console.log(` ${noConsensusSpec.toLocaleString().padStart(8)} ${pct(noConsensusSpec, nComplete).padStart(6)} [no majority]`);

  // ════════════════════════════════════════════════════════════════════
  // 13. STAGE 2 WORKLOAD ESTIMATE
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Stage 2 Workload Estimate ───────────────────────────────");

  let catOnly = 0, specOnly = 0, bothDisagree = 0;
  for (const panns of complete) {
    const catU = new Set(panns.map(a => a.label.content_category)).size === 1;
    const specU = new Set(panns.map(a => a.label.specificity_level)).size === 1;
    if (!catU && specU) catOnly++;
    else if (catU && !specU) specOnly++;
    else if (!catU && !specU) bothDisagree++;
  }

  console.log(` Paragraphs needing Stage 2: ${needsStage2.toLocaleString()} (${pct(needsStage2, nComplete)})`);
  console.log(` Cat disagree only: ${catOnly.toLocaleString()}`);
  console.log(` Spec disagree only: ${specOnly.toLocaleString()}`);
  console.log(` Both disagree: ${bothDisagree.toLocaleString()}`);

  // Estimate cost: average input tokens from stage1 + annotations context
  const avgInput = totalInput / anns.length;
  const stage2InputEst = (avgInput + STAGE2_EXTRA_INPUT_TOKENS) * needsStage2;
  const stage2OutputEst = needsStage2 * STAGE2_OUTPUT_TOKENS_EST;
  const stage2CostEst =
    (stage2InputEst / 1e6) * STAGE2_INPUT_USD_PER_MTOK +
    (stage2OutputEst / 1e6) * STAGE2_OUTPUT_USD_PER_MTOK;
  console.log(`\n Estimated Stage 2 cost: ~$${stage2CostEst.toFixed(0)} (rough, Sonnet pricing)`);
}

main().catch((err: unknown) => {
  console.error(err);
  process.exit(1);
});