539 lines
29 KiB
TypeScript
539 lines
29 KiB
TypeScript
/**
 * Deep analysis of Stage 1 annotation data.
 *
 * Usage: bun ts/scripts/stage1-analyze.ts
 */
|
||
import { fileURLToPath } from "node:url";

import { readJsonlRaw } from "../src/lib/jsonl.ts";
|
||
|
||
// Filesystem path of the Stage 1 annotations file, resolved relative to this script.
// fileURLToPath is used instead of URL#pathname because pathname keeps percent-escapes
// (a directory named "my dir" would come back as "my%20dir") and is not a valid path on Windows.
const INPUT = fileURLToPath(new URL("../../data/annotations/stage1.jsonl", import.meta.url));
|
||
|
||
// ── Types ──────────────────────────────────────────────────────────────
|
||
/** One model's annotation of one paragraph, as read from the Stage 1 JSONL file. */
interface Ann {
  /** Key used to group the annotations that different models gave the same paragraph. */
  paragraphId: string;
  /** Labels assigned by the model. */
  label: {
    content_category: string;
    /** Numeric level; the report tabulates levels 1 through 4. */
    specificity_level: number;
    /** The confidence analysis compares this against "high", "medium" and "low". */
    category_confidence: string;
    /** The confidence analysis compares this against "high", "medium" and "low". */
    specificity_confidence: string;
    reasoning: string;
  };
  /** Bookkeeping about the request that produced the label. */
  provenance: {
    /** Annotations are grouped per model by this id; the report shows the part after the last "/". */
    modelId: string;
    costUsd: number;
    inputTokens: number;
    outputTokens: number;
    reasoningTokens: number;
    latencyMs: number;
    requestedAt: string;
  };
}

type ModelAnns = Map<string, Ann[]>; // paragraphId → annotations
|
||
|
||
// ── Helpers ────────────────────────────────────────────────────────────
|
||
/**
 * Formats `n` as a percentage of `total` with one decimal place, e.g. `pct(1, 4)` → `"25.0%"`.
 *
 * @param n     The part.
 * @param total The whole.
 * @returns The formatted percentage, or `"n/a"` when `total` is 0 (instead of "NaN%" / "Infinity%").
 */
function pct(n: number, total: number): string {
  if (total === 0) return "n/a";
  return `${((n / total) * 100).toFixed(1)}%`;
}
|
||
|
||
/**
 * Median of a list of numbers. The input is not modified.
 *
 * @param arr Values in any order.
 * @returns The middle value (mean of the two middle values for an even count),
 *          or `NaN` when `arr` is empty.
 */
function median(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  const sorted = [...arr].sort((a, b) => a - b);
  const mid = Math.floor(sorted.length / 2);
  return sorted.length % 2 === 1 ? sorted[mid] : (sorted[mid - 1] + sorted[mid]) / 2;
}
|
||
|
||
/**
 * Arithmetic mean of a list of numbers.
 *
 * @param arr Values to average.
 * @returns The mean, or `NaN` when `arr` is empty.
 */
function mean(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  let sum = 0;
  for (const x of arr) sum += x;
  return sum / arr.length;
}
|
||
|
||
/**
 * Population standard deviation (divides by the number of values, not by n - 1).
 *
 * @param arr Values to measure.
 * @returns The standard deviation, or `NaN` when `arr` is empty.
 */
function stddev(arr: readonly number[]): number {
  if (arr.length === 0) return NaN;
  const m = mean(arr);
  const sumSquares = arr.reduce((sum, x) => sum + (x - m) ** 2, 0);
  return Math.sqrt(sumSquares / arr.length);
}
|
||
|
||
/**
 * `p`-th percentile of a list of numbers, using linear interpolation between the
 * two closest ranks of the sorted values. The input is not modified.
 *
 * @param arr Values in any order.
 * @param p   Percentile in the range 0–100 (0 → minimum, 100 → maximum).
 * @returns The interpolated percentile, or `NaN` when `arr` is empty.
 * @throws RangeError when `p` is not a number within [0, 100].
 */
function percentile(arr: readonly number[], p: number): number {
  if (!(p >= 0 && p <= 100)) {
    throw new RangeError(`percentile must be within [0, 100], got ${p}`);
  }
  if (arr.length === 0) return NaN;
  const sorted = [...arr].sort((a, b) => a - b);
  // Fractional rank into the sorted array; its integer neighbours are the interpolation endpoints.
  const idx = (p / 100) * (sorted.length - 1);
  const lo = Math.floor(idx);
  const hi = Math.ceil(idx);
  return lo === hi ? sorted[lo] : sorted[lo] + (sorted[hi] - sorted[lo]) * (idx - lo);
}
|
||
|
||
// ── Main ───────────────────────────────────────────────────────────────
|
||
/** Number of annotations a paragraph must have to take part in the agreement statistics. */
const EXPECTED_ANNOTATIONS = 3;

/** Agreement facts computed once for a paragraph that has exactly EXPECTED_ANNOTATIONS annotations. */
interface ParagraphVotes {
  anns: Ann[];
  cats: string[];
  specs: number[];
  catUnanimous: boolean;
  specUnanimous: boolean;
  /** Category chosen by at least two annotations, if any. */
  majorityCat: string | undefined;
  /** Specificity level chosen by at least two annotations, if any. */
  majoritySpec: number | undefined;
}

/** Counts how often each distinct value occurs, keeping first-seen order of the keys. */
function tally<T>(values: Iterable<T>): Map<T, number> {
  const counts = new Map<T, number>();
  for (const v of values) counts.set(v, (counts.get(v) ?? 0) + 1);
  return counts;
}

/** Adds one to the counter stored under `key`. */
function bump<K>(counts: Map<K, number>, key: K): void {
  counts.set(key, (counts.get(key) ?? 0) + 1);
}

/** Appends `item` to the list stored under `key`, creating the list on first use. */
function groupPush<K, V>(groups: Map<K, V[]>, key: K, item: V): void {
  const list = groups.get(key);
  if (list) list.push(item);
  else groups.set(key, [item]);
}

/** Returns the first value that occurs at least twice in `votes`, or undefined when none does. */
function majorityValue<T>(votes: readonly T[]): T | undefined {
  for (const [value, count] of tally(votes)) {
    if (count >= 2) return value;
  }
  return undefined;
}

/** Entries of a counter map ordered by descending count. */
function byCountDesc<K>(counts: Map<K, number>): [K, number][] {
  return [...counts.entries()].sort((a, b) => b[1] - a[1]);
}

/** Derives the unanimity / majority facts for one paragraph's annotations. */
function summarizeVotes(panns: Ann[]): ParagraphVotes {
  const cats = panns.map(a => a.label.content_category);
  const specs = panns.map(a => a.label.specificity_level);
  return {
    anns: panns,
    cats,
    specs,
    catUnanimous: new Set(cats).size === 1,
    specUnanimous: new Set(specs).size === 1,
    majorityCat: majorityValue(cats),
    majoritySpec: majorityValue(specs),
  };
}

/**
 * Loads the Stage 1 annotations from INPUT and prints a multi-section report to stdout:
 * overview, per-model statistics, inter-model agreement, disagreement breakdowns,
 * label distributions, confidence vs agreement, outlier models, consensus labels
 * and a rough Stage 2 workload / cost estimate.
 *
 * Agreement statistics only consider paragraphs that have exactly EXPECTED_ANNOTATIONS
 * annotations, and their percentages are computed against that same population.
 * Pairwise agreement and the aggregate distributions use every annotation.
 */
async function main(): Promise<void> {
  console.log("Loading annotations...");
  const { records: raw, skipped } = await readJsonlRaw(INPUT);
  // NOTE(review): records are trusted to match `Ann`; nothing here validates their shape.
  const anns = raw as Ann[];
  console.log(` ${anns.length.toLocaleString()} annotations loaded, ${skipped} skipped\n`);

  if (anns.length === 0) {
    console.log(" No annotations found; nothing to analyze.");
    return;
  }

  // Percentage that stays readable when the denominator is zero
  // (no fully annotated paragraphs, or two models with no paragraph in common).
  const share = (n: number, total: number): string => (total > 0 ? pct(n, total) : "n/a");

  // Group by paragraph and by model in a single pass.
  const byParagraph = new Map<string, Ann[]>();
  const byModel = new Map<string, Ann[]>();
  for (const a of anns) {
    groupPush(byParagraph, a.paragraphId, a);
    groupPush(byModel, a.provenance.modelId, a);
  }

  const shortName = (m: string): string => m.slice(m.lastIndexOf("/") + 1);
  const modelNames = [...byModel.keys()].sort();

  // Per-model label tallies are computed once and reused by the distribution tables.
  const models = modelNames.map(id => {
    const modelAnns = byModel.get(id) ?? [];
    return {
      id,
      anns: modelAnns,
      catCounts: tally(modelAnns.map(a => a.label.content_category)),
      specCounts: tally(modelAnns.map(a => a.label.specificity_level)),
    };
  });

  const nParagraphs = byParagraph.size;
  const votes = [...byParagraph.values()]
    .filter(panns => panns.length === EXPECTED_ANNOTATIONS)
    .map(summarizeVotes);
  const nComplete = votes.length;

  // ════════════════════════════════════════════════════════════════════
  // 1. OVERVIEW
  // ════════════════════════════════════════════════════════════════════
  console.log("═══════════════════════════════════════════════════════════");
  console.log(" STAGE 1 DEEP ANALYSIS");
  console.log("═══════════════════════════════════════════════════════════\n");

  console.log("── Overview ──────────────────────────────────────────────");
  console.log(` Paragraphs: ${nParagraphs.toLocaleString()}`);
  console.log(` Annotations: ${anns.length.toLocaleString()}`);
  console.log(` Models: ${modelNames.map(shortName).join(", ")}`);

  let totalCost = 0, totalInput = 0, totalOutput = 0, totalReasoning = 0;
  for (const a of anns) {
    totalCost += a.provenance.costUsd;
    totalInput += a.provenance.inputTokens;
    totalOutput += a.provenance.outputTokens;
    totalReasoning += a.provenance.reasoningTokens;
  }
  console.log(` Total cost: $${totalCost.toFixed(2)}`);
  console.log(` Input tokens: ${(totalInput / 1e6).toFixed(1)}M`);
  console.log(` Output tokens: ${(totalOutput / 1e6).toFixed(1)}M`);
  console.log(` Reasoning: ${(totalReasoning / 1e6).toFixed(1)}M`);

  // ════════════════════════════════════════════════════════════════════
  // 2. PER-MODEL STATS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Per-Model Statistics ───────────────────────────────────");
  const formatCounts = (counts: Map<string, number>): string =>
    byCountDesc(counts).map(([k, v]) => `${k}=${v}`).join(", ");

  for (const model of models) {
    const mas = model.anns;
    const costs = mas.map(a => a.provenance.costUsd);
    const latencies = mas.map(a => a.provenance.latencyMs);
    const outputs = mas.map(a => a.provenance.outputTokens);
    console.log(`\n ${shortName(model.id)} (n=${mas.length.toLocaleString()}):`);
    console.log(` Cost: $${costs.reduce((a, b) => a + b, 0).toFixed(2)} total, $${mean(costs).toFixed(5)}/ann`);
    console.log(` Latency: median ${median(latencies).toFixed(0)}ms, p95 ${percentile(latencies, 95).toFixed(0)}ms, p99 ${percentile(latencies, 99).toFixed(0)}ms`);
    console.log(` Output: median ${median(outputs).toFixed(0)} tokens, mean ${mean(outputs).toFixed(0)}`);

    const confCatCounts = tally(mas.map(a => a.label.category_confidence));
    const confSpecCounts = tally(mas.map(a => a.label.specificity_confidence));
    const specSummary = [...model.specCounts.entries()]
      .sort((a, b) => a[0] - b[0])
      .map(([k, v]) => `${k}=${v} (${pct(v, mas.length)})`)
      .join(", ");
    console.log(` Categories: ${formatCounts(model.catCounts)}`);
    console.log(` Specificity: ${specSummary}`);
    console.log(` Cat confidence: ${formatCounts(confCatCounts)}`);
    console.log(` Spec confidence: ${formatCounts(confSpecCounts)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 3. AGREEMENT ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Agreement Analysis ─────────────────────────────────────");

  let catUnanimous = 0, specUnanimous = 0, bothUnanimous = 0;
  let catMajority = 0, specMajority = 0, bothMajority = 0;
  let catNoMajority = 0, specNoMajority = 0;
  const specSpreads: number[] = [];
  const catDisagreementPairs = new Map<string, number>();
  const specDisagreementPatterns = new Map<string, number>();

  for (const v of votes) {
    const catMaj = v.majorityCat !== undefined;
    const specMaj = v.majoritySpec !== undefined;

    if (v.catUnanimous) catUnanimous++;
    if (catMaj) catMajority++;
    else catNoMajority++;

    // Every unordered pair of differing categories within the paragraph counts once.
    if (!v.catUnanimous) {
      const sorted = [...v.cats].sort();
      for (let i = 0; i < sorted.length; i++) {
        for (let j = i + 1; j < sorted.length; j++) {
          if (sorted[i] !== sorted[j]) bump(catDisagreementPairs, `${sorted[i]} ↔ ${sorted[j]}`);
        }
      }
    }

    if (v.specUnanimous) specUnanimous++;
    if (specMaj) specMajority++;
    else specNoMajority++;

    // Spread = mean absolute deviation of the levels around their median.
    const specMedian = median(v.specs);
    specSpreads.push(mean(v.specs.map(s => Math.abs(s - specMedian))));

    if (!v.specUnanimous) {
      const sortedSpecs = [...v.specs].sort((a, b) => a - b);
      bump(specDisagreementPatterns, `[${sortedSpecs.join(",")}]`);
    }

    if (v.catUnanimous && v.specUnanimous) bothUnanimous++;
    if (catMaj && specMaj) bothMajority++;
  }

  if (nComplete !== nParagraphs) {
    console.log(`\n Note: ${(nParagraphs - nComplete).toLocaleString()} paragraph(s) without exactly ${EXPECTED_ANNOTATIONS} annotations are excluded from agreement statistics`);
  }

  console.log(`\n Unanimity (all 3 agree):`);
  console.log(` Category: ${catUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${share(catUnanimous, nComplete)})`);
  console.log(` Specificity: ${specUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${share(specUnanimous, nComplete)})`);
  console.log(` Both: ${bothUnanimous.toLocaleString()} / ${nComplete.toLocaleString()} (${share(bothUnanimous, nComplete)})`);

  console.log(`\n Majority (≥2 agree):`);
  console.log(` Category: ${catMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${share(catMajority, nComplete)})`);
  console.log(` Specificity: ${specMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${share(specMajority, nComplete)})`);
  console.log(` Both: ${bothMajority.toLocaleString()} / ${nComplete.toLocaleString()} (${share(bothMajority, nComplete)})`);

  console.log(`\n No majority (3-way split):`);
  console.log(` Category: ${catNoMajority.toLocaleString()} (${share(catNoMajority, nComplete)})`);
  console.log(` Specificity: ${specNoMajority.toLocaleString()} (${share(specNoMajority, nComplete)})`);

  console.log(`\n Specificity spread (MAD):`);
  console.log(` Mean: ${mean(specSpreads).toFixed(3)}`);
  console.log(` Median: ${median(specSpreads).toFixed(3)}`);
  console.log(` Std: ${stddev(specSpreads).toFixed(3)}`);

  // Stage 2 need: fully annotated paragraphs that are not unanimous on both labels.
  const needsStage2 = nComplete - bothUnanimous;
  console.log(`\n → Need Stage 2 judge: ${needsStage2.toLocaleString()} (${share(needsStage2, nComplete)})`);

  // ════════════════════════════════════════════════════════════════════
  // 4. DISAGREEMENT BREAKDOWN
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Disagreement Pairs (top 20) ──────────────────");
  for (const [pair, count] of byCountDesc(catDisagreementPairs).slice(0, 20)) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pair}`);
  }

  console.log("\n── Specificity Disagreement Patterns (all) ────────────────");
  for (const [pattern, count] of byCountDesc(specDisagreementPatterns)) {
    console.log(` ${count.toLocaleString().padStart(6)} ${pattern}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 5. PAIRWISE MODEL AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Pairwise Model Agreement ───────────────────────────────");
  for (let i = 0; i < modelNames.length; i++) {
    for (let j = i + 1; j < modelNames.length; j++) {
      const m1 = modelNames[i], m2 = modelNames[j];
      let catAgree = 0, specAgree = 0, bothAgree = 0, total = 0;
      // Uses every paragraph both models annotated, not only the fully annotated ones.
      for (const panns of byParagraph.values()) {
        const a1 = panns.find(a => a.provenance.modelId === m1);
        const a2 = panns.find(a => a.provenance.modelId === m2);
        if (!a1 || !a2) continue;
        total++;
        const ca = a1.label.content_category === a2.label.content_category;
        const sa = a1.label.specificity_level === a2.label.specificity_level;
        if (ca) catAgree++;
        if (sa) specAgree++;
        if (ca && sa) bothAgree++;
      }
      console.log(`\n ${shortName(m1)} × ${shortName(m2)} (n=${total.toLocaleString()}):`);
      console.log(` Category: ${share(catAgree, total)} (${catAgree.toLocaleString()})`);
      console.log(` Specificity: ${share(specAgree, total)} (${specAgree.toLocaleString()})`);
      console.log(` Both: ${share(bothAgree, total)} (${bothAgree.toLocaleString()})`);
    }
  }

  // ════════════════════════════════════════════════════════════════════
  // 6. CATEGORY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Category Distribution (all annotations) ────────────────");
  const sortedCats = byCountDesc(tally(anns.map(a => a.label.content_category)));
  for (const [cat, count] of sortedCats) {
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${cat}`);
  }

  // Per-model category distribution comparison
  console.log("\n── Category Distribution by Model (%) ─────────────────────");
  const categories = sortedCats.map(([c]) => c);
  const header = "Category".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${header}`);
  for (const cat of categories) {
    const cells = models
      .map(m => pct(m.catCounts.get(cat) ?? 0, m.anns.length).padStart(12))
      .join("");
    console.log(` ${cat.padEnd(30) + cells}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 7. SPECIFICITY DISTRIBUTION (AGGREGATE)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Specificity Distribution (all annotations) ──────────────");
  const specLabels = ["Generic Boilerplate", "Domain-Adapted", "Firm-Specific", "Quantified-Verifiable"];
  const specLevels = [1, 2, 3, 4];
  const aggSpec = tally(anns.map(a => a.label.specificity_level));
  for (const s of specLevels) {
    const count = aggSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${pct(count, anns.length).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }

  console.log("\n── Specificity Distribution by Model (%) ──────────────────");
  const specHeader = "Level".padEnd(30) + modelNames.map(m => shortName(m).padStart(12)).join("");
  console.log(` ${specHeader}`);
  for (const s of specLevels) {
    const cells = models
      .map(m => pct(m.specCounts.get(s) ?? 0, m.anns.length).padStart(12))
      .join("");
    console.log(` ${`${s} ${specLabels[s - 1]}`.padEnd(30) + cells}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 8. CROSS-TABULATION: Category × Specificity
  // ════════════════════════════════════════════════════════════════════
  console.log("\n── Category × Specificity Cross-tab (unanimous paragraphs only) ─");
  const crossTab = new Map<string, number>();
  for (const v of votes) {
    if (v.catUnanimous && v.specUnanimous) bump(crossTab, `${v.cats[0]}|${v.specs[0]}`);
  }
  console.log(` (${bothUnanimous.toLocaleString()} paragraphs with both-unanimous)\n`);
  const ctHeader = "Category".padEnd(30) + specLevels.map(s => `${s}`.padStart(8)).join("") + " Total".padStart(8);
  console.log(` ${ctHeader}`);
  for (const cat of categories) {
    const rowCounts = specLevels.map(s => crossTab.get(`${cat}|${s}`) ?? 0);
    const rowTotal = rowCounts.reduce((a, b) => a + b, 0);
    const cells = rowCounts.map(v => `${v}`.padStart(8)).join("");
    console.log(` ${cat.padEnd(30)}${cells} ${`${rowTotal}`.padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 9. CONFIDENCE ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Confidence vs Agreement ─────────────────────────────────");
  // Check if low-confidence predictions are more likely to disagree
  const confBuckets: { label: string; filter: (a: Ann) => boolean }[] = [
    { label: "both high", filter: a => a.label.category_confidence === "high" && a.label.specificity_confidence === "high" },
    { label: "cat low", filter: a => a.label.category_confidence === "low" },
    { label: "spec low", filter: a => a.label.specificity_confidence === "low" },
    { label: "cat medium", filter: a => a.label.category_confidence === "medium" },
    { label: "spec medium", filter: a => a.label.specificity_confidence === "medium" },
  ];

  for (const bucket of confBuckets) {
    // Paragraphs where at least one model reported this confidence
    const matching = votes.filter(v => v.anns.some(bucket.filter));
    const totalP = matching.length;
    if (totalP === 0) continue;
    const catUnanP = matching.filter(v => v.catUnanimous).length;
    const specUnanP = matching.filter(v => v.specUnanimous).length;
    const bothUnanP = matching.filter(v => v.catUnanimous && v.specUnanimous).length;
    console.log(` "${bucket.label}" paragraphs (n=${totalP.toLocaleString()}): cat ${pct(catUnanP, totalP)}, spec ${pct(specUnanP, totalP)}, both ${pct(bothUnanP, totalP)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 10. OUTLIER MODEL ANALYSIS
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Outlier Analysis (model is the odd one out) ────────────");
  const outlierCounts = new Map<string, { catOutlier: number; specOutlier: number }>();
  for (const m of modelNames) outlierCounts.set(m, { catOutlier: 0, specOutlier: 0 });

  for (const v of votes) {
    // A 2-vs-1 split: whoever differs from the majority label is the outlier.
    if (v.majorityCat !== undefined && !v.catUnanimous) {
      for (const a of v.anns) {
        if (a.label.content_category === v.majorityCat) continue;
        const entry = outlierCounts.get(a.provenance.modelId);
        if (entry) entry.catOutlier++;
      }
    }
    if (v.majoritySpec !== undefined && !v.specUnanimous) {
      for (const a of v.anns) {
        if (a.label.specificity_level === v.majoritySpec) continue;
        const entry = outlierCounts.get(a.provenance.modelId);
        if (entry) entry.specOutlier++;
      }
    }
  }

  for (const m of modelNames) {
    const o = outlierCounts.get(m) ?? { catOutlier: 0, specOutlier: 0 };
    console.log(`\n ${shortName(m)}:`);
    console.log(` Category outlier: ${o.catOutlier.toLocaleString()} times (${share(o.catOutlier, nComplete)})`);
    console.log(` Specificity outlier: ${o.specOutlier.toLocaleString()} times (${share(o.specOutlier, nComplete)})`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 11. CATEGORY-SPECIFIC SPECIFICITY AGREEMENT
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Specificity Agreement by Category ──────────────────────");
  console.log(" (among paragraphs where all 3 models agree on category)\n");
  const catSpecAgreement = new Map<string, { total: number; specUnan: number; specMaj: number }>();
  for (const v of votes) {
    if (!v.catUnanimous) continue;
    const cat = v.cats[0];
    let entry = catSpecAgreement.get(cat);
    if (!entry) {
      entry = { total: 0, specUnan: 0, specMaj: 0 };
      catSpecAgreement.set(cat, entry);
    }
    entry.total++;
    if (v.specUnanimous) entry.specUnan++;
    if (v.majoritySpec !== undefined) entry.specMaj++;
  }

  for (const cat of categories) {
    const e = catSpecAgreement.get(cat);
    if (!e) continue;
    console.log(` ${cat.padEnd(28)} n=${e.total.toLocaleString().padStart(6)} spec-unan: ${pct(e.specUnan, e.total).padStart(6)} spec-maj: ${pct(e.specMaj, e.total).padStart(6)}`);
  }

  // ════════════════════════════════════════════════════════════════════
  // 12. CONSENSUS LABELS (majority vote or unanimous)
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Consensus Label Distribution (majority vote) ───────────");
  const consensusCat = new Map<string, number>();
  const consensusSpec = new Map<number, number>();
  for (const v of votes) {
    // Compared against undefined so that an empty-string category still counts as a majority.
    if (v.majorityCat !== undefined) bump(consensusCat, v.majorityCat);
    if (v.majoritySpec !== undefined) bump(consensusSpec, v.majoritySpec);
  }

  console.log("\n Category (majority vote):");
  for (const [cat, count] of byCountDesc(consensusCat)) {
    console.log(` ${count.toLocaleString().padStart(8)} ${share(count, nComplete).padStart(6)} ${cat}`);
  }
  console.log(` ${catNoMajority.toLocaleString().padStart(8)} ${share(catNoMajority, nComplete).padStart(6)} [no majority]`);

  console.log("\n Specificity (majority vote):");
  for (const s of specLevels) {
    const count = consensusSpec.get(s) ?? 0;
    console.log(` ${count.toLocaleString().padStart(8)} ${share(count, nComplete).padStart(6)} ${s} (${specLabels[s - 1]})`);
  }
  console.log(` ${specNoMajority.toLocaleString().padStart(8)} ${share(specNoMajority, nComplete).padStart(6)} [no majority]`);

  // ════════════════════════════════════════════════════════════════════
  // 13. STAGE 2 WORKLOAD ESTIMATE
  // ════════════════════════════════════════════════════════════════════
  console.log("\n\n── Stage 2 Workload Estimate ───────────────────────────────");
  let catOnly = 0, specOnly = 0, bothDisagree = 0;
  for (const v of votes) {
    if (!v.catUnanimous && v.specUnanimous) catOnly++;
    else if (v.catUnanimous && !v.specUnanimous) specOnly++;
    else if (!v.catUnanimous && !v.specUnanimous) bothDisagree++;
  }
  console.log(` Paragraphs needing Stage 2: ${needsStage2.toLocaleString()} (${share(needsStage2, nComplete)})`);
  console.log(` Cat disagree only: ${catOnly.toLocaleString()}`);
  console.log(` Spec disagree only: ${specOnly.toLocaleString()}`);
  console.log(` Both disagree: ${bothDisagree.toLocaleString()}`);

  // Estimate cost: stage2 uses sonnet, roughly 3x more expensive per call
  // Average input tokens from stage1 + annotations context
  const avgInput = totalInput / anns.length;
  const stage2InputEst = (avgInput + 500) * needsStage2; // extra for prior annotation context
  const stage2CostEst = (stage2InputEst / 1e6) * 3.0 + (needsStage2 * 150 / 1e6) * 15.0; // $3/MTok in, $15/MTok out estimate
  console.log(`\n Estimated Stage 2 cost: ~$${stage2CostEst.toFixed(0)} (rough, Sonnet pricing)`);
}
|
||
|
||
// Script entry point: run the report; if it rejects, print the failure and exit with status 1.
main().catch(failure => {
  console.error(failure);
  process.exit(1);
});
|