SEC-cyBERT/ts/scripts/stage1-analyze.ts
2026-04-04 22:49:24 -04:00

539 lines
29 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/**
* Deep analysis of Stage 1 annotation data.
*
* Usage: bun ts/scripts/stage1-analyze.ts
*/
import { fileURLToPath } from "node:url";

import { readJsonlRaw } from "../src/lib/jsonl.ts";
/**
 * Filesystem path of the Stage 1 annotations file, resolved relative to this script.
 *
 * `fileURLToPath` is used instead of `URL.pathname` because `pathname` is still
 * URL-encoded (a space in a parent directory stays `%20`) and is not a valid path
 * on Windows (leading slash before the drive letter).
 */
const INPUT = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
// ── Types ──────────────────────────────────────────────────────────────
/** Label payload of one annotation, as read by this script. */
interface AnnLabel {
  /** Category name; used as a grouping key throughout the report. */
  content_category: string;
  /** Specificity level; the report tabulates levels 1 through 4. */
  specificity_level: number;
  /** Compared against "high", "medium" and "low" in the confidence section. */
  category_confidence: string;
  /** Compared against "high", "medium" and "low" in the confidence section. */
  specificity_confidence: string;
  /** Free-text field; not read by this script. */
  reasoning: string;
}

/** Per-call bookkeeping attached to one annotation. */
interface AnnProvenance {
  /** Model identifier; the segment after the last "/" is used as the display name. */
  modelId: string;
  /** Summed to report total and per-annotation cost. */
  costUsd: number;
  inputTokens: number;
  outputTokens: number;
  reasoningTokens: number;
  /** Reported as median / p95 / p99 with an "ms" suffix. */
  latencyMs: number;
  /** Not read by this script. */
  requestedAt: string;
}

/** One Stage 1 annotation record: a single model's label for a single paragraph. */
interface Ann {
  paragraphId: string;
  label: AnnLabel;
  provenance: AnnProvenance;
}
/**
 * Map from paragraphId to the annotations recorded for that paragraph.
 *
 * NOTE(review): this alias is not referenced anywhere in the visible part of the
 * file; the name suggests a per-model view, but confirm before relying on it.
 */
type ModelAnns = Map<string, Ann[]>;
// ── Helpers ────────────────────────────────────────────────────────────
/**
 * Formats `n / total` as a percentage with one decimal place, e.g. `pct(1, 4)` is "25.0%".
 *
 * @param n - Numerator (count of matching items).
 * @param total - Denominator (size of the population).
 * @returns The formatted percentage, or "n/a" when `total` is 0, because the ratio
 *   is undefined there and would otherwise print as "NaN%" or "Infinity%".
 */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}
/**
 * Median of `arr`, computed on a sorted copy so the caller's array is left untouched.
 * For an even number of elements the two middle values are averaged.
 * An empty array yields NaN.
 */
function median(arr: number[]): number {
  const ordered = arr.slice().sort((x, y) => x - y);
  const half = Math.floor(ordered.length / 2);
  if (ordered.length % 2 === 1) {
    return ordered[half];
  }
  return (ordered[half - 1] + ordered[half]) / 2;
}
/**
 * Arithmetic mean of `arr`, summed left to right.
 * An empty array yields NaN (0 / 0).
 */
function mean(arr: number[]): number {
  let total = 0;
  for (const value of arr) {
    total += value;
  }
  return total / arr.length;
}
/**
 * Population standard deviation of `arr` (squared deviations are divided by N, not N - 1).
 * An empty array yields NaN.
 */
function stddev(arr: number[]): number {
  const center = mean(arr);
  let squaredSum = 0;
  for (const value of arr) {
    squaredSum += (value - center) ** 2;
  }
  return Math.sqrt(squaredSum / arr.length);
}
/**
 * `p`-th percentile of `arr` using linear interpolation between the two nearest ranks.
 * The rank is `(p / 100) * (length - 1)` on a sorted copy of the input.
 *
 * @param arr - Values to rank; not mutated.
 * @param p - Percentile to compute; the rank formula above maps 0 to the smallest
 *   element and 100 to the largest.
 */
function percentile(arr: number[], p: number): number {
  const ordered = arr.slice().sort((x, y) => x - y);
  const rank = (p / 100) * (ordered.length - 1);
  const below = Math.floor(rank);
  const above = Math.ceil(rank);
  if (below === above) {
    // The rank lands exactly on an element; no interpolation needed.
    return ordered[below];
  }
  const weight = rank - below;
  return ordered[below] + (ordered[above] - ordered[below]) * weight;
}
// ── Main ───────────────────────────────────────────────────────────────
/** Per-paragraph summary of the votes cast by the annotating models. */
interface ParagraphVotes {
  /** The annotations for this paragraph, in file order. */
  panns: Ann[];
  /** `content_category` of each annotation, aligned with `panns`. */
  cats: string[];
  /** `specificity_level` of each annotation, aligned with `panns`. */
  specs: number[];
  /** True when every annotation has the same category. */
  catUnan: boolean;
  /** True when every annotation has the same specificity level. */
  specUnan: boolean;
  /** Category held by a strict majority, or undefined when there is none. */
  majCat: string | undefined;
  /** Specificity level held by a strict majority, or undefined when there is none. */
  majSpec: number | undefined;
}

/** Groups `items` by the key returned from `keyOf`, preserving first-seen order. */
function groupBy<T>(items: readonly T[], keyOf: (item: T) => string): Map<string, T[]> {
  const groups = new Map<string, T[]>();
  for (const item of items) {
    const key = keyOf(item);
    const bucket = groups.get(key);
    if (bucket) bucket.push(item);
    else groups.set(key, [item]);
  }
  return groups;
}

/** Counts how often each value occurs; keys keep their first-seen order. */
function tally<K>(values: Iterable<K>): Map<K, number> {
  const counts = new Map<K, number>();
  for (const v of values) counts.set(v, (counts.get(v) ?? 0) + 1);
  return counts;
}

/** Returns the value held by a strict majority of `values`, or undefined when there is none. */
function majorityOf<T>(values: readonly T[]): T | undefined {
  for (const [value, count] of tally(values)) {
    if (count * 2 > values.length) return value;
  }
  return undefined;
}

/** Renders a count map as comma-separated `key=count` pairs, most frequent first. */
function formatByCount(counts: Map<string, number>): string {
  return [...counts.entries()]
    .sort((a, b) => b[1] - a[1])
    .map(([k, v]) => `${k}=${v}`)
    .join(", ");
}

/**
 * Loads the Stage 1 annotations from `INPUT` and prints a multi-section report to
 * stdout: overview, per-model statistics, inter-model agreement, disagreement
 * breakdowns, pairwise agreement, label distributions, a category × specificity
 * cross-tab, confidence vs agreement, outlier counts, consensus labels and a rough
 * Stage 2 workload/cost estimate.
 *
 * Agreement statistics only consider paragraphs that have exactly three annotations
 * ("complete" paragraphs); every percentage in those sections uses the number of
 * complete paragraphs as its denominator so the figures are internally consistent.
 * Paragraphs with any other annotation count are reported separately in the overview.
 */
async function main(): Promise<void> {
  console.log("Loading annotations...");
  const { records: raw, skipped } = await readJsonlRaw(INPUT);
  // NOTE(review): records are trusted to match `Ann`; nothing here validates their shape.
  const anns = raw as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations loaded, ${skipped} skipped\n`);
  if (anns.length === 0) {
    // Every statistic below would be NaN; stop with a clear message instead.
    console.log(" No annotations to analyze.");
    return;
  }

  // The agreement logic ("2 of 3", "odd one out") assumes exactly this many votes.
  const annotatorsPerParagraph = 3;

  const byParagraph = groupBy(anns, a => a.paragraphId);
  const byModel = groupBy(anns, a => a.provenance.modelId);
  const modelNames = [...byModel.keys()].sort();
  const shortName = (m: string): string => m.split("/").pop() ?? m;
  const annsOf = (m: string): Ann[] => byModel.get(m) ?? [];
  const nParagraphs = byParagraph.size;

  // Summarise each complete paragraph once; every agreement section reuses this.
  const votes: ParagraphVotes[] = [...byParagraph.values()]
    .filter(panns => panns.length === annotatorsPerParagraph)
    .map(panns => {
      const cats = panns.map(a => a.label.content_category);
      const specs = panns.map(a => a.label.specificity_level);
      return {
        panns,
        cats,
        specs,
        catUnan: new Set(cats).size === 1,
        specUnan: new Set(specs).size === 1,
        majCat: majorityOf(cats),
        majSpec: majorityOf(specs),
      };
    });
  const nComplete = votes.length;
  const nIncomplete = nParagraphs - nComplete;

  // ════════════════════════════════════════════════════════════════════
  // 1. OVERVIEW
  // ════════════════════════════════════════════════════════════════════
  console.log("═══════════════════════════════════════════════════════════");
  console.log(" STAGE 1 DEEP ANALYSIS");
  console.log("═══════════════════════════════════════════════════════════\n");
  console.log("── Overview ──────────────────────────────────────────────");
  console.log(` Paragraphs: ${nParagraphs.toLocaleString()}`);
  if (nIncomplete > 0) {
    console.log(` Incomplete: ${nIncomplete.toLocaleString()} (not exactly ${annotatorsPerParagraph} annotations; excluded from agreement statistics)`);
  }
  console.log(` Annotations: ${anns.length.toLocaleString()}`);
  console.log(` Models: ${modelNames.map(shortName).join(", ")}`);
  let totalCost = 0, totalInput = 0, totalOutput = 0, totalReasoning = 0;
  for (const a of anns) {
    totalCost += a.provenance.costUsd;
    totalInput += a.provenance.inputTokens;
    totalOutput += a.provenance.outputTokens;
    totalReasoning += a.provenance.reasoningTokens;
  }
  console.log(` Total cost: $${totalCost.toFixed(2)}`);
  console.log(` Input tokens: ${(totalInput / 1e6).toFixed(1)}M`);
  console.log(` Output tokens: ${(totalOutput / 1e6).toFixed(1)}M`);
  console.log(` Reasoning: ${(totalReasoning / 1e6).toFixed(1)}M`);

  // ════════════════════════════════════════════════════════════════════
  // 2. PER-MODEL STATS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Per-Model Statistics ───────────────────────────────────");
  // Kept for the per-model distribution tables in sections 6 and 7.
  const catCountsByModel = new Map<string, Map<string, number>>();
  const specCountsByModel = new Map<string, Map<number, number>>();
  for (const model of modelNames) {
    const mas = annsOf(model);
    const costs = mas.map(a => a.provenance.costUsd);
    const latencies = mas.map(a => a.provenance.latencyMs);
    const outputs = mas.map(a => a.provenance.outputTokens);
    console.log(`\n ${shortName(model)} (n=${mas.length.toLocaleString()}):`);
    console.log(` Cost: $${costs.reduce((a, b) => a + b, 0).toFixed(2)} total, $${mean(costs).toFixed(5)}/ann`);
    console.log(` Latency: median ${median(latencies).toFixed(0)}ms, p95 ${percentile(latencies, 95).toFixed(0)}ms, p99 ${percentile(latencies, 99).toFixed(0)}ms`);
    console.log(` Output: median ${median(outputs).toFixed(0)} tokens, mean ${mean(outputs).toFixed(0)}`);

    const catCounts = tally(mas.map(a => a.label.content_category));
    const specCounts = tally(mas.map(a => a.label.specificity_level));
    const confCatCounts = tally(mas.map(a => a.label.category_confidence));
    const confSpecCounts = tally(mas.map(a => a.label.specificity_confidence));
    catCountsByModel.set(model, catCounts);
    specCountsByModel.set(model, specCounts);

    const specSummary = [...specCounts.entries()]
      .sort((a, b) => a[0] - b[0])
      .map(([k, v]) => `${k}=${v} (${pct(v, mas.length)})`)
      .join(", ");
    console.log(` Categories: ${formatByCount(catCounts)}`);
    console.log(` Specificity: ${specSummary}`);
    console.log(` Cat confidence: ${formatByCount(confCatCounts)}`);
    console.log(` Spec confidence: ${formatByCount(confSpecCounts)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 3. AGREEMENT ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Agreement Analysis ─────────────────────────────────────");
  let catUnanimous = 0, specUnanimous = 0, bothUnanimous = 0;
  let catMajority = 0, specMajority = 0, bothMajority = 0;
  let catNoMajority = 0, specNoMajority = 0;
  const specSpreads: number[] = [];
  const catDisagreementPairs = new Map<string, number>();
  const specDisagreementPatterns = new Map<string, number>();
  for (const v of votes) {
    const catMaj = v.majCat !== undefined;
    const specMaj = v.majSpec !== undefined;

    if (v.catUnan) catUnanimous++;
    if (catMaj) catMajority++;
    else catNoMajority++;

    // One count per pair of annotations that disagree, so a 2-1 split contributes
    // two counts to the same category pair and a 3-way split one to each pair.
    if (!v.catUnan) {
      const sorted = [...v.cats].sort();
      for (let i = 0; i < sorted.length; i++) {
        for (let j = i + 1; j < sorted.length; j++) {
          if (sorted[i] !== sorted[j]) {
            // Explicit separator keeps the printed pair readable and the key unambiguous.
            const key = `${sorted[i]} ↔ ${sorted[j]}`;
            catDisagreementPairs.set(key, (catDisagreementPairs.get(key) ?? 0) + 1);
          }
        }
      }
    }

    if (v.specUnan) specUnanimous++;
    if (specMaj) specMajority++;
    else specNoMajority++;

    // Spread: mean absolute deviation of the three levels from their median.
    const specMedian = median(v.specs);
    specSpreads.push(mean(v.specs.map(s => Math.abs(s - specMedian))));

    if (!v.specUnan) {
      const key = `[${[...v.specs].sort((a, b) => a - b).join(",")}]`;
      specDisagreementPatterns.set(key, (specDisagreementPatterns.get(key) ?? 0) + 1);
    }

    if (v.catUnan && v.specUnan) bothUnanimous++;
    if (catMaj && specMaj) bothMajority++;
  }
  console.log(`\n Unanimity (all 3 agree):`);
  console.log(` Category: ${catUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(catUnanimous, nComplete)})`);
  console.log(` Specificity: ${specUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(specUnanimous, nComplete)})`);
  console.log(` Both: ${bothUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(bothUnanimous, nComplete)})`);
  console.log(`\n Majority (≥2 agree):`);
  console.log(` Category: ${catMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(catMajority, nComplete)})`);
  console.log(` Specificity: ${specMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(specMajority, nComplete)})`);
  console.log(` Both: ${bothMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${pct(bothMajority, nComplete)})`);
  console.log(`\n No majority (3-way split):`);
  console.log(` Category: ${catNoMajority.toLocaleString()} (${pct(catNoMajority, nComplete)})`);
  console.log(` Specificity: ${specNoMajority.toLocaleString()} (${pct(specNoMajority, nComplete)})`);
  console.log(`\n Specificity spread (MAD):`);
  console.log(` Mean: ${mean(specSpreads).toFixed(3)}`);
  console.log(` Median: ${median(specSpreads).toFixed(3)}`);
  console.log(` Std: ${stddev(specSpreads).toFixed(3)}`);
  // Only complete paragraphs can be handed to a judge; incomplete ones are not
  // disagreements, so they are not counted here.
  const needsStage2 = nComplete - bothUnanimous;
  console.log(`\n → Need Stage 2 judge: ${needsStage2.toLocaleString()} (${pct(needsStage2, nComplete)})`);

  // ════════════════════════════════════════════════════════════════════
  // 4. DISAGREEMENT BREAKDOWN
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Disagreement Pairs (top 20) ──────────────────");
  const sortedCatDis = [...catDisagreementPairs.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pair, count] of sortedCatDis.slice(0, 20)) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pair}`);
  }
  console.log("\n── Specificity Disagreement Patterns (all) ────────────────");
  const sortedSpecDis = [...specDisagreementPatterns.entries()].sort((a, b) => b[1] - a[1]);
  for (const [pattern, count] of sortedSpecDis) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pattern}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 5. PAIRWISE MODEL AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Pairwise Model Agreement ───────────────────────────────");
  for (let i = 0; i < modelNames.length; i++) {
    for (let j = i + 1; j < modelNames.length; j++) {
      const m1 = modelNames[i], m2 = modelNames[j];
      if (m1 === undefined || m2 === undefined) continue;
      let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
      // Uses every paragraph both models annotated, complete or not.
      for (const panns of byParagraph.values()) {
        const a1 = panns.find(a => a.provenance.modelId === m1);
        const a2 = panns.find(a => a.provenance.modelId === m2);
        if (!a1 || !a2) continue;
        total++;
        const ca = a1.label.content_category === a2.label.content_category;
        const sa = a1.label.specificity_level === a2.label.specificity_level;
        if (ca) catAgree++;
        if (sa) specAgree++;
        if (ca && sa) bothAgree++;
      }
      console.log(`\n ${shortName(m1)} × ${shortName(m2)} (n=${total.toLocaleString()}):`);
      if (total === 0) {
        console.log(" (no paragraphs annotated by both models)");
        continue;
      }
      console.log(` Category: ${pct(catAgree, total)} (${catAgree.toLocaleString()})`);
      console.log(` Specificity: ${pct(specAgree, total)} (${specAgree.toLocaleString()})`);
      console.log(` Both: ${pct(bothAgree, total)} (${bothAgree.toLocaleString()})`);
    }
  }

  // ════════════════════════════════════════════════════════════════════
  // 6. CATEGORY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Distribution (all annotations) ────────────────");
  const aggCat = tally(anns.map(a => a.label.content_category));
  const sortedCats = [...aggCat.entries()].sort((a, b) => b[1] - a[1]);
  for (const [cat, count] of sortedCats) {
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${cat}`);
  }
  console.log("\n── Category Distribution by Model (%) ─────────────────────");
  const categories = sortedCats.map(([c]) => c);
  const header = "Category".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${header}`);
  for (const cat of categories) {
    const row = cat.padEnd(30) + modelNames.map(m => {
      const count = catCountsByModel.get(m)?.get(cat) ?? 0;
      return pct(count, annsOf(m).length).padStart(12);
    }).join("");
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 7. SPECIFICITY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Specificity Distribution (all annotations) ──────────────");
  const specLabels = ["Generic Boilerplate", "Domain-Adapted", "Firm-Specific", "Quantified-Verifiable"];
  const aggSpec = tally(anns.map(a => a.label.specificity_level));
  for (let s = 1; s <= 4; s++) {
    const count = aggSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }
  console.log("\n── Specificity Distribution by Model (%) ──────────────────");
  const specHeader = "Level".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${specHeader}`);
  for (let s = 1; s <= 4; s++) {
    const row = `${s} ${specLabels[s - 1]}`.padEnd(30) + modelNames.map(m => {
      const count = specCountsByModel.get(m)?.get(s) ?? 0;
      return pct(count, annsOf(m).length).padStart(12);
    }).join("");
    console.log(` ${row}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 8. CROSS-TABULATION: Category × Specificity
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Category × Specificity Cross-tab (unanimous paragraphs only) ─");
  const crossTab = new Map<string, number>();
  let unanimousCount = 0;
  for (const v of votes) {
    if (!(v.catUnan && v.specUnan)) continue;
    // When a vote is unanimous its majority value is the shared label.
    const key = `${v.majCat}|${v.majSpec}`;
    crossTab.set(key, (crossTab.get(key) ?? 0) + 1);
    unanimousCount++;
  }
  console.log(` (${unanimousCount.toLocaleString()} paragraphs with both-unanimous)\n`);
  const ctHeader = "Category".padEnd(30) + [1, 2, 3, 4].map(s => `${s}`.padStart(8)).join("") + " Total".padStart(8);
  console.log(` ${ctHeader}`);
  for (const cat of categories) {
    let rowTotal = 0;
    const cells = [1, 2, 3, 4].map(s => {
      const count = crossTab.get(`${cat}|${s}`) ?? 0;
      rowTotal += count;
      return `${count}`.padStart(8);
    }).join("");
    console.log(` ${cat.padEnd(30)}${cells} ${`${rowTotal}`.padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 9. CONFIDENCE ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Confidence vs Agreement ─────────────────────────────────");
  // For each bucket: among paragraphs where at least one model reported that
  // confidence, how often did the models agree?
  const confBuckets: { label: string; filter: (a: Ann) => boolean }[] = [
    { label: "both high", filter: a => a.label.category_confidence === "high" && a.label.specificity_confidence === "high" },
    { label: "cat low", filter: a => a.label.category_confidence === "low" },
    { label: "spec low", filter: a => a.label.specificity_confidence === "low" },
    { label: "cat medium", filter: a => a.label.category_confidence === "medium" },
    { label: "spec medium", filter: a => a.label.specificity_confidence === "medium" },
  ];
  for (const bucket of confBuckets) {
    let totalP = 0, catUnanP = 0, specUnanP = 0, bothUnanP = 0;
    for (const v of votes) {
      if (!v.panns.some(bucket.filter)) continue;
      totalP++;
      if (v.catUnan) catUnanP++;
      if (v.specUnan) specUnanP++;
      if (v.catUnan && v.specUnan) bothUnanP++;
    }
    if (totalP === 0) continue;
    console.log(` "${bucket.label}" paragraphs (n=${totalP.toLocaleString()}): cat ${pct(catUnanP, totalP)}, spec ${pct(specUnanP, totalP)}, both ${pct(bothUnanP, totalP)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 10. OUTLIER MODEL ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Outlier Analysis (model is the odd one out) ────────────");
  const outlierCounts = new Map<string, { catOutlier: number; specOutlier: number; total: number }>();
  for (const m of modelNames) {
    // Every model is scored against all complete paragraphs.
    outlierCounts.set(m, { catOutlier: 0, specOutlier: 0, total: nComplete });
  }
  for (const v of votes) {
    // A 2-1 split is "not unanimous but has a majority"; the dissenter is the outlier.
    if (!v.catUnan && v.majCat !== undefined) {
      for (const a of v.panns) {
        if (a.label.content_category === v.majCat) continue;
        const entry = outlierCounts.get(a.provenance.modelId);
        if (entry) entry.catOutlier++;
      }
    }
    if (!v.specUnan && v.majSpec !== undefined) {
      for (const a of v.panns) {
        if (a.label.specificity_level === v.majSpec) continue;
        const entry = outlierCounts.get(a.provenance.modelId);
        if (entry) entry.specOutlier++;
      }
    }
  }
  for (const [m, o] of outlierCounts) {
    console.log(`\n ${shortName(m)}:`);
    console.log(` Category outlier: ${o.catOutlier.toLocaleString()} times (${pct(o.catOutlier, o.total)})`);
    console.log(` Specificity outlier: ${o.specOutlier.toLocaleString()} times (${pct(o.specOutlier, o.total)})`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 11. CATEGORY-SPECIFIC SPECIFICITY AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Specificity Agreement by Category ──────────────────────");
  console.log(" (among paragraphs where all 3 models agree on category)\n");
  const catSpecAgreement = new Map<string, { total: number; specUnan: number; specMaj: number }>();
  for (const v of votes) {
    if (!v.catUnan || v.majCat === undefined) continue;
    let entry = catSpecAgreement.get(v.majCat);
    if (!entry) {
      entry = { total: 0, specUnan: 0, specMaj: 0 };
      catSpecAgreement.set(v.majCat, entry);
    }
    entry.total++;
    if (v.specUnan) entry.specUnan++;
    if (v.majSpec !== undefined) entry.specMaj++;
  }
  for (const cat of categories) {
    const e = catSpecAgreement.get(cat);
    if (!e) continue;
    console.log(` ${cat.padEnd(28)} n=${e.total.toLocaleString().padStart(6)} spec-unan: ${pct(e.specUnan, e.total).padStart(6)} spec-maj: ${pct(e.specMaj, e.total).padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 12. CONSENSUS LABELS (majority vote or unanimous)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Consensus Label Distribution (majority vote) ───────────");
  const consensusCat = new Map<string, number>();
  const consensusSpec = new Map<number, number>();
  let noConsensusCat = 0, noConsensusSpec = 0;
  for (const v of votes) {
    // Compare against undefined explicitly so a falsy label ("" or 0) still counts.
    if (v.majCat !== undefined) {
      consensusCat.set(v.majCat, (consensusCat.get(v.majCat) ?? 0) + 1);
    } else {
      noConsensusCat++;
    }
    if (v.majSpec !== undefined) {
      consensusSpec.set(v.majSpec, (consensusSpec.get(v.majSpec) ?? 0) + 1);
    } else {
      noConsensusSpec++;
    }
  }
  console.log("\n Category (majority vote):");
  const sortedConsCat = [...consensusCat.entries()].sort((a, b) => b[1] - a[1]);
  for (const [cat, count] of sortedConsCat) {
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, nComplete).padStart(6)} ${cat}`);
  }
  console.log(` ${noConsensusCat.toLocaleString().padStart(8)} ${pct(noConsensusCat, nComplete).padStart(6)} [no majority]`);
  console.log("\n Specificity (majority vote):");
  for (let s = 1; s <= 4; s++) {
    const count = consensusSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, nComplete).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }
  console.log(` ${noConsensusSpec.toLocaleString().padStart(8)} ${pct(noConsensusSpec, nComplete).padStart(6)} [no majority]`);

  // ════════════════════════════════════════════════════════════════════
  // 13. STAGE 2 WORKLOAD ESTIMATE
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Stage 2 Workload Estimate ───────────────────────────────");
  let catOnly = 0, specOnly = 0, bothDisagree = 0;
  for (const v of votes) {
    if (!v.catUnan && v.specUnan) catOnly++;
    if (v.catUnan && !v.specUnan) specOnly++;
    if (!v.catUnan && !v.specUnan) bothDisagree++;
  }
  console.log(` Paragraphs needing Stage 2: ${needsStage2.toLocaleString()} (${pct(needsStage2, nComplete)})`);
  console.log(` Cat disagree only: ${catOnly.toLocaleString()}`);
  console.log(` Spec disagree only: ${specOnly.toLocaleString()}`);
  console.log(` Both disagree: ${bothDisagree.toLocaleString()}`);
  // Rough cost model: Stage 1 average input plus 500 tokens of prior-annotation
  // context per call, 150 output tokens per call, at $3 / MTok in and $15 / MTok out.
  const avgInput = totalInput / anns.length;
  const stage2InputEst = (avgInput + 500) * needsStage2;
  const stage2CostEst = (stage2InputEst / 1e6) * 3.0 + (needsStage2 * 150 / 1e6) * 15.0;
  console.log(`\n Estimated Stage 2 cost: ~$${stage2CostEst.toFixed(0)} (rough, Sonnet pricing)`);
}
// Entry point: run the report; on any rejection log the error and exit with status 1.
main().catch((error: unknown) => {
  console.error(error);
  process.exit(1);
});