/**
 * `bun sec` command-line entry point.
 *
 * Dispatches on the first CLI argument to one of the extraction commands
 * (`extract:*`) or labeling commands (`label:*`). All paths are resolved
 * relative to `DATA`. Progress and diagnostics go to stderr; only the cost
 * report (`label:cost`) is printed on stdout.
 *
 * NOTE(review): `Paragraph` and `Annotation` are used both as runtime schemas
 * (passed to `readJsonl`) and as static types — this assumes the schema
 * modules export a type with the same name as the validator; confirm.
 */
import { appendJsonl, readJsonl, readJsonlRaw } from "./lib/jsonl.ts";
import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { Annotation } from "@sec-cybert/schemas/annotation.ts";
import { STAGE1_MODELS } from "./lib/openrouter.ts";
import { runBatch } from "./label/batch.ts";
import { runGoldenBatch } from "./label/golden.ts";
import { computeConsensus } from "./label/consensus.ts";
import { judgeParagraph } from "./label/annotate.ts";
import { v4 as uuidv4 } from "uuid";
import { PROMPT_VERSION } from "./label/prompts.ts";
import { extract10K, extract8K, reparse10K, reparse8K, mergeTrainingData } from "./extract/pipeline.ts";

const DATA = "../data";

/** Print the command overview to stderr and terminate with exit code 1. */
function usage(): never {
  console.error(`Usage: bun sec <command> [options]

Commands:
  extract:10k [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--limit N]
  extract:8k [--start-date YYYY-MM-DD] [--end-date YYYY-MM-DD] [--limit N]
  extract:reparse      Re-parse cached 10-K HTML files with current parser (no network)
  extract:reparse-8k   Re-parse cached 8-K HTML files with current parser (no network)
  extract:merge        Merge 10-K + 8-K, remove truncated filings, dedup → training.jsonl
  label:annotate --model <id> [--limit N] [--concurrency N]
  label:annotate-all [--limit N] [--concurrency N]
  label:consensus
  label:judge [--concurrency N]
  label:golden [--paragraphs <path>] [--limit N] [--delay N]  (Opus via Agent SDK)
  label:cost`);
  process.exit(1);
}

const [command, ...rest] = process.argv.slice(2);
if (!command) usage();

/** Print a CLI error to stderr and terminate with exit code 1. */
function fail(message: string): never {
  console.error(message);
  process.exit(1);
}

/**
 * Look up the value following `--<name>` in the arguments after the command.
 *
 * @returns the value, or `undefined` when the flag is not present at all.
 *
 * Exits with an error when the flag is present but has no value (it is the
 * last argument, or is directly followed by another `--flag`). Without this
 * check a trailing `--limit` would be treated as "no limit".
 */
function flag(name: string): string | undefined {
  const idx = rest.indexOf(`--${name}`);
  if (idx === -1) return undefined;
  const value = rest[idx + 1];
  if (value === undefined || value.startsWith("--")) {
    fail(`--${name} requires a value`);
  }
  return value;
}

/**
 * Integer value of `--<name>`, or `undefined` when the flag is absent.
 * Exits with an error when the value is not a plain base-10 integer, so that
 * `NaN` or a partially parsed value (e.g. "12abc") never reaches a batch run.
 */
function optionalFlagInt(name: string): number | undefined {
  const raw = flag(name);
  if (raw === undefined) return undefined;
  if (!/^[+-]?\d+$/.test(raw)) {
    fail(`--${name} expects an integer, got "${raw}"`);
  }
  return Number.parseInt(raw, 10);
}

/** Integer value of `--<name>`, or `fallback` when the flag is absent. */
function flagInt(name: string, fallback: number): number {
  return optionalFlagInt(name) ?? fallback;
}

const PARAGRAPHS_PATH = `${DATA}/paragraphs/paragraphs.jsonl`;
const PARAGRAPHS_8K_PATH = `${DATA}/paragraphs/paragraphs-8k.jsonl`;
const SESSIONS_PATH = `${DATA}/metadata/sessions.jsonl`;
const STAGE1_DIR = `${DATA}/annotations/stage1`;
const CONSENSUS_PATH = `${DATA}/annotations/consensus.jsonl`;
const JUDGE_PATH = `${DATA}/annotations/stage2/judge.jsonl`;

/**
 * Number of stage-1 annotations a paragraph needs before consensus/judging.
 * NOTE(review): kept as the literal 3 used by the original checks rather than
 * `STAGE1_MODELS.length`; confirm whether the two are meant to be tied.
 */
const STAGE1_ANNOTATIONS_REQUIRED = 3;

type CostSummary = { cost: number; count: number };

/** Narrow an untyped JSONL record to a plain object before reading its fields. */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/**
 * File-name stem for a model id: the segment after the first "/"
 * (e.g. "provider/model" → "model"). Exits with an error when that segment is
 * missing, instead of producing paths such as "undefined.jsonl".
 */
function modelShortName(modelId: string): string {
  const short = modelId.split("/")[1];
  if (!short) {
    fail(`Invalid model id "${modelId}": expected "<provider>/<model>"`);
  }
  return short;
}

/** Path of the stage-1 annotation file for a model. */
function stage1Path(modelId: string): string {
  return `${STAGE1_DIR}/${modelShortName(modelId)}.jsonl`;
}

/** Load and validate the 10-K paragraphs; exits with code 1 when none are found. */
async function loadParagraphs(): Promise<Paragraph[]> {
  const { records, skipped } = await readJsonl(PARAGRAPHS_PATH, Paragraph);
  if (skipped > 0) process.stderr.write(` ⚠ Skipped ${skipped} invalid paragraph lines\n`);
  if (records.length === 0) {
    process.stderr.write(` ✖ No paragraphs found at ${PARAGRAPHS_PATH}\n`);
    process.exit(1);
  }
  process.stderr.write(` Loaded ${records.length} paragraphs\n`);
  return records;
}

/**
 * Read every stage-1 annotation file listed in `STAGE1_MODELS` and group the
 * annotations by paragraph id.
 *
 * @param logCounts when true, report the number of annotations loaded per model.
 */
async function loadStage1Annotations(logCounts: boolean): Promise<Map<string, Annotation[]>> {
  const byParagraph = new Map<string, Annotation[]>();
  for (const modelId of STAGE1_MODELS) {
    const { records } = await readJsonl(stage1Path(modelId), Annotation);
    if (logCounts) {
      process.stderr.write(` Loaded ${records.length} annotations from ${modelShortName(modelId)}\n`);
    }
    for (const ann of records) {
      const existing = byParagraph.get(ann.paragraphId) ?? [];
      existing.push(ann);
      byParagraph.set(ann.paragraphId, existing);
    }
  }
  return byParagraph;
}

/** Run one stage-1 batch for `modelId` over `paragraphs`. */
async function runStage1(
  paragraphs: Paragraph[],
  modelId: string,
  concurrency: number,
  limit: number | undefined,
): Promise<void> {
  const modelShort = modelShortName(modelId);
  await runBatch(paragraphs, {
    modelId,
    stage: "stage1",
    outputPath: `${STAGE1_DIR}/${modelShort}.jsonl`,
    errorsPath: `${STAGE1_DIR}/${modelShort}-errors.jsonl`,
    sessionsPath: SESSIONS_PATH,
    concurrency,
    limit,
  });
}

/** `label:annotate` — annotate all paragraphs with the single model given by `--model`. */
async function cmdAnnotate(): Promise<void> {
  const modelId = flag("model");
  if (!modelId) fail("--model is required");
  // Validate the id before the (potentially slow) paragraph load.
  modelShortName(modelId);

  const paragraphs = await loadParagraphs();
  await runStage1(paragraphs, modelId, flagInt("concurrency", 12), optionalFlagInt("limit"));
}

/** `label:annotate-all` — run the stage-1 batch for every model in `STAGE1_MODELS`, one after another. */
async function cmdAnnotateAll(): Promise<void> {
  const paragraphs = await loadParagraphs();
  const concurrency = flagInt("concurrency", 12);
  const limit = optionalFlagInt("limit");

  for (const modelId of STAGE1_MODELS) {
    process.stderr.write(`\n ═══ ${modelId} ═══\n`);
    await runStage1(paragraphs, modelId, concurrency, limit);
  }
}

/**
 * `label:consensus` — compute a consensus record for every paragraph that has
 * exactly the required number of stage-1 annotations and append it to the
 * consensus file.
 *
 * NOTE(review): results are appended, so running this command twice writes
 * each paragraph twice; `label:judge` de-duplicates ids when reading.
 */
async function cmdConsensus(): Promise<void> {
  const allAnnotations = await loadStage1Annotations(true);

  let consensus = 0;
  let needsJudge = 0;
  for (const [paragraphId, anns] of allAnnotations) {
    if (anns.length !== STAGE1_ANNOTATIONS_REQUIRED) continue;
    const { result, needsJudge: needs } = computeConsensus(paragraphId, anns);
    await appendJsonl(CONSENSUS_PATH, result);
    if (needs) needsJudge++;
    else consensus++;
  }

  const total = consensus + needsJudge;
  // Avoid printing "NaN%" when no paragraph was eligible.
  const agreedPct = total === 0 ? "0.0" : ((consensus / total) * 100).toFixed(1);
  process.stderr.write(
    `\n ✓ Consensus: ${consensus}/${total} (${agreedPct}%) agreed\n` +
      ` ${needsJudge} paragraphs need Stage 2 judge\n`,
  );
}

/**
 * `label:judge` — send every paragraph whose consensus record has
 * `method === "unresolved"`, and that is not yet present in the judge output,
 * to the stage-2 judge together with its stage-1 labels.
 *
 * Honors `--concurrency N` (default 1, i.e. sequential). A failure for one
 * paragraph is logged and does not stop the run.
 */
async function cmdJudge(): Promise<void> {
  const paragraphs = await loadParagraphs();
  const paragraphMap = new Map(paragraphs.map((p) => [p.id, p]));

  const { records: rawConsensus } = await readJsonlRaw(CONSENSUS_PATH);
  const stage1Map = await loadStage1Annotations(false);

  // A Set, because the consensus file is append-only and may list an id twice.
  const unresolvedIds = new Set<string>();
  for (const raw of rawConsensus) {
    if (!isRecord(raw)) continue;
    const { method, paragraphId } = raw;
    if (method === "unresolved" && typeof paragraphId === "string" && paragraphId) {
      unresolvedIds.add(paragraphId);
    }
  }

  // Check what's already judged so the command can be resumed.
  const { records: existing } = await readJsonlRaw(JUDGE_PATH);
  const judgedIds = new Set<string>();
  for (const record of existing) {
    if (isRecord(record) && typeof record.paragraphId === "string") {
      judgedIds.add(record.paragraphId);
    }
  }

  const toJudge = [...unresolvedIds].filter((id) => !judgedIds.has(id));
  process.stderr.write(` ${toJudge.length} paragraphs to judge (${judgedIds.size} already done)\n`);

  const runId = uuidv4();
  const concurrency = Math.max(1, flagInt("concurrency", 1));
  let processed = 0;
  let skipped = 0;
  let failed = 0;
  let next = 0;

  // Each worker pulls the next unclaimed id until the list is exhausted.
  const worker = async (): Promise<void> => {
    for (;;) {
      const paragraphId = toJudge[next++];
      if (paragraphId === undefined) return;

      const paragraph = paragraphMap.get(paragraphId);
      const stage1Anns = stage1Map.get(paragraphId);
      if (!paragraph || !stage1Anns || stage1Anns.length < STAGE1_ANNOTATIONS_REQUIRED) {
        skipped++;
        continue;
      }

      const priorLabels = stage1Anns.map((a) => ({
        content_category: a.label.content_category,
        specificity_level: a.label.specificity_level,
        reasoning: a.label.reasoning,
      }));

      try {
        const judgeAnn = await judgeParagraph(paragraph, priorLabels, {
          runId,
          promptVersion: PROMPT_VERSION,
        });
        await appendJsonl(JUDGE_PATH, judgeAnn);
        processed++;
        if (processed % 10 === 0) {
          process.stderr.write(` Judged ${processed}/${toJudge.length}\n`);
        }
      } catch (error) {
        failed++;
        process.stderr.write(
          ` ✖ Judge error for ${paragraphId}: ${error instanceof Error ? error.message : String(error)}\n`,
        );
      }
    }
  };

  await Promise.all(Array.from({ length: Math.min(concurrency, toJudge.length) }, () => worker()));

  process.stderr.write(`\n ✓ Judged ${processed} paragraphs\n`);
  if (skipped > 0 || failed > 0) {
    process.stderr.write(
      ` ⚠ ${skipped} skipped (missing paragraph or stage 1 annotations), ${failed} failed\n`,
    );
  }
}

/**
 * `label:golden` — run the golden batch over the paragraphs whose ids are
 * listed in the labelapp sample file.
 *
 * Flags: `--paragraphs <path>` (paragraph source), `--limit N`,
 * `--delay N` (passed through as `delayMs`, default 1000).
 */
async function cmdGolden(): Promise<void> {
  // Load the 1,200 human-labeled paragraph IDs from the labelapp sample
  const sampledIdsPath = "../labelapp/.sampled-ids.json";
  const { readFile } = await import("node:fs/promises");
  const parsed: unknown = JSON.parse(await readFile(sampledIdsPath, "utf-8"));
  if (!Array.isArray(parsed) || !parsed.every((id): id is string => typeof id === "string")) {
    process.stderr.write(` ✖ ${sampledIdsPath} must contain a JSON array of string IDs\n`);
    process.exit(1);
  }
  const sampledIds = new Set<string>(parsed);
  process.stderr.write(` Loaded ${sampledIds.size} sampled IDs from ${sampledIdsPath}\n`);

  // Load patched paragraphs and filter to the human-labeled set
  const paragraphsPath = flag("paragraphs") ?? `${DATA}/paragraphs/paragraphs-clean.patched.jsonl`;
  const { records: allParagraphs, skipped } = await readJsonl(paragraphsPath, Paragraph);
  if (skipped > 0) process.stderr.write(` ⚠ Skipped ${skipped} invalid paragraph lines\n`);

  const paragraphs = allParagraphs.filter((p) => sampledIds.has(p.id));
  process.stderr.write(` Matched ${paragraphs.length}/${sampledIds.size} paragraphs from ${paragraphsPath}\n`);
  if (paragraphs.length === 0) {
    process.stderr.write(" ✖ No matching paragraphs found\n");
    process.exit(1);
  }

  await runGoldenBatch(paragraphs, {
    outputPath: `${DATA}/annotations/golden/opus.jsonl`,
    errorsPath: `${DATA}/annotations/golden/opus-errors.jsonl`,
    limit: optionalFlagInt("limit"),
    delayMs: flagInt("delay", 1000),
  });
}

/**
 * `label:cost` — sum `provenance.costUsd` over the stage-1 and stage-2
 * annotation files and print the totals per model and per stage to stdout.
 */
async function cmdCost(): Promise<void> {
  const modelCosts: Record<string, CostSummary> = {};
  const stageCosts: Record<string, CostSummary> = {};

  // Stage 1
  for (const modelId of STAGE1_MODELS) {
    const { records } = await readJsonl(stage1Path(modelId), Annotation);
    const cost = records.reduce((sum, a) => sum + a.provenance.costUsd, 0);
    modelCosts[modelId] = { cost, count: records.length };

    const stage = stageCosts["stage1"] ?? { cost: 0, count: 0 };
    stage.cost += cost;
    stage.count += records.length;
    stageCosts["stage1"] = stage;
  }

  // Stage 2
  const { records: judgeRecords } = await readJsonl(JUDGE_PATH, Annotation);
  const judgeCost = judgeRecords.reduce((sum, a) => sum + a.provenance.costUsd, 0);
  modelCosts["anthropic/claude-sonnet-4.6 (judge)"] = {
    cost: judgeCost,
    count: judgeRecords.length,
  };
  stageCosts["stage2"] = { cost: judgeCost, count: judgeRecords.length };

  // Print
  console.log("\n Cost Summary");
  console.log(" " + "─".repeat(60));

  console.log("\n By Model:");
  for (const [model, { cost, count }] of Object.entries(modelCosts)) {
    console.log(` ${model.padEnd(45)} $${cost.toFixed(4)} (${count} annotations)`);
  }

  console.log("\n By Stage:");
  let total = 0;
  for (const [stage, { cost, count }] of Object.entries(stageCosts)) {
    console.log(` ${stage.padEnd(45)} $${cost.toFixed(4)} (${count} annotations)`);
    total += cost;
  }
  console.log(`\n Total: $${total.toFixed(4)}`);
}

/** `extract:10k` — extract 10-K paragraphs for the given date window. */
async function cmdExtract10K(): Promise<void> {
  await extract10K({
    outputPath: PARAGRAPHS_PATH,
    errorsPath: `${DATA}/extracted/item1c/errors.jsonl`,
    filingType: "10-K",
    startDate: flag("start-date") ?? "2023-12-15",
    endDate: flag("end-date") ?? "2025-12-31",
    limit: optionalFlagInt("limit"),
  });
}

/** `extract:8k` — extract 8-K paragraphs for the given date window. */
async function cmdExtract8K(): Promise<void> {
  await extract8K({
    outputPath: PARAGRAPHS_8K_PATH,
    errorsPath: `${DATA}/extracted/item105/errors.jsonl`,
    filingType: "8-K",
    startDate: flag("start-date") ?? "2023-12-18",
    endDate: flag("end-date") ?? "2025-12-31",
    limit: optionalFlagInt("limit"),
  });
}

/** `extract:reparse` — re-parse the 10-K filings into the 10-K paragraph file. */
async function cmdReparse10K(): Promise<void> {
  await reparse10K({ outputPath: PARAGRAPHS_PATH });
}

/** `extract:reparse-8k` — re-parse the 8-K filings into the 8-K paragraph file. */
async function cmdReparse8K(): Promise<void> {
  await reparse8K({ outputPath: PARAGRAPHS_8K_PATH });
}

// Dispatch
switch (command) {
  case "extract:10k":
    await cmdExtract10K();
    break;
  case "extract:8k":
    await cmdExtract8K();
    break;
  case "extract:reparse":
    await cmdReparse10K();
    break;
  case "extract:reparse-8k":
    await cmdReparse8K();
    break;
  case "extract:merge":
    await mergeTrainingData({
      tenKPath: PARAGRAPHS_PATH,
      eightKPath: PARAGRAPHS_8K_PATH,
      outputPath: `${DATA}/paragraphs/training.jsonl`,
    });
    break;
  case "label:annotate":
    await cmdAnnotate();
    break;
  case "label:annotate-all":
    await cmdAnnotateAll();
    break;
  case "label:consensus":
    await cmdConsensus();
    break;
  case "label:judge":
    await cmdJudge();
    break;
  case "label:golden":
    await cmdGolden();
    break;
  case "label:cost":
    await cmdCost();
    break;
  default:
    console.error(`Unknown command: ${command}`);
    usage();
}