opus golden set scaffolding

Joey Eamigh 2026-03-30 22:02:52 -04:00
parent 7b660fe361
commit 32cd5ecfa8
No known key found for this signature in database
GPG Key ID: CE8C05DFFC53C9CB
5 changed files with 405 additions and 0 deletions

View File

@@ -51,6 +51,7 @@
"ts": {
"name": "sec-cybert",
"dependencies": {
"@anthropic-ai/claude-agent-sdk": "^0.2.88",
"@openrouter/ai-sdk-provider": "^2.3.3",
"@sec-cybert/schemas": "workspace:*",
"ai": "^6.0.141",
@@ -82,6 +83,10 @@
"@alloc/quick-lru": ["@alloc/quick-lru@5.2.0", "", {}, "sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw=="],
"@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.88", "", { "dependencies": { "@anthropic-ai/sdk": "^0.74.0", "@modelcontextprotocol/sdk": "^1.27.1" }, "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-hm9AYD8UGpGouOlmWB6kMRjIUCMtO13N3HDsviu7/htOXJZ/KKypgEd5yW04Ro6421SwX4KfQNrwayJ6R227+g=="],
"@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.74.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-srbJV7JKsc5cQ6eVuFzjZO7UR3xEPJqPamHFIe29bs38Ij2IripoAhC0S5NslNbaFUYqBKypmmpzMTpqfHEUDw=="],
"@babel/code-frame": ["@babel/code-frame@7.29.0", "", { "dependencies": { "@babel/helper-validator-identifier": "^7.28.5", "js-tokens": "^4.0.0", "picocolors": "^1.1.1" } }, "sha512-9NhCeYjq9+3uxgdtp20LSiJXJvN0FeCtNGpJxuMFZ1Kv3cWUNb6DOhJwUvcVCzKGR66cw4njwM6hrJLqgOwbcw=="],
"@babel/compat-data": ["@babel/compat-data@7.29.0", "", {}, "sha512-T1NCJqT/j9+cn8fvkt7jtwbLBfLC/1y1c7NtCeXFRgzGTsafi68MRv8yzkYSapBnFA6L3U2VSc02ciDzoAJhJg=="],
@@ -1004,6 +1009,8 @@
"json-schema": ["json-schema@0.4.0", "", {}, "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="],
"json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="],
"json-schema-traverse": ["json-schema-traverse@0.4.1", "", {}, "sha512-xbbCH5dCYU5T8LcEhhuh7HJ88HXuW3qsI3Y0zOZFKfZEHcpWiHU/Jxzk629Brsab/mMiHQti9wMP+845RPe3Vg=="],
"json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="],
@@ -1374,6 +1381,8 @@
"tough-cookie": ["tough-cookie@6.0.1", "", { "dependencies": { "tldts": "^7.0.5" } }, "sha512-LktZQb3IeoUWB9lqR5EWTHgW/VTITCXg4D21M+lvybRVdylLrRMnqaIONLVb5mav8vM19m44HIcGq4qASeu2Qw=="],
"ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="],
"ts-api-utils": ["ts-api-utils@2.5.0", "", { "peerDependencies": { "typescript": ">=4.8.4" } }, "sha512-OJ/ibxhPlqrMM0UiNHJ/0CKQkoKF243/AEmplt3qpRgkW8VG7IfOS41h7V8TjITqdByHzrjcS/2si+y4lIh8NA=="],
"ts-morph": ["ts-morph@26.0.0", "", { "dependencies": { "@ts-morph/common": "~0.27.0", "code-block-writer": "^13.0.3" } }, "sha512-ztMO++owQnz8c/gIENcM9XfCEzgoGphTv+nKpYNM1bgsdOVC/jRZuEBf6N+mLLDNg68Kl+GgUZfOySaRiG1/Ug=="],

View File

@@ -509,6 +509,24 @@ The admin page shows disputed paragraphs with all 3 labels side-by-side, annotat
| Migration transition script | `labelapp/scripts/ensure-migration-baseline.ts` |
| Docker entrypoint | `labelapp/entrypoint.sh` |
### Opus Golden Labeling
With the human gold set nearing completion, we added a parallel labeling pass using Claude Opus 4.6 as an additional expert annotator. The motivation is empirical: the GenAI pipeline's Stage 1 consensus + Stage 2 judge combination has shown strong alignment with the codebook throughout development, and Opus represents a significant capability jump over the models used in Stages 1 and 2. Having an independent Opus annotation for every gold-set paragraph gives us a third perspective alongside the human labels and the existing pipeline labels — useful for adjudication, for measuring human-vs-model agreement, and as an upper bound on what automated annotation can achieve.
**Implementation:** Rather than routing through OpenRouter (which would cost ~$27-80 depending on the model), we used the Claude Agent SDK (`@anthropic-ai/claude-agent-sdk`) to call Opus 4.6 through the existing Claude Code subscription. The Agent SDK's `query()` function accepts a custom system prompt and structured output schema, so we configured it as a fully isolated classifier: no tools, no hooks, no settings, no session persistence — just a system prompt and a JSON schema response.
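Condensed, the per-paragraph call looks like this (a sketch mirroring the runner in `ts/src/label/golden.ts`; `OutputSchema`, `systemPrompt`, and `paragraphText` are stand-ins for the real schema and prompt builders):
```ts
import { query } from "@anthropic-ai/claude-agent-sdk";
import { z } from "zod";

const OutputSchema = z.object({ specificity: z.string(), reasoning: z.string() }); // stand-in schema
const systemPrompt = "..."; // full codebook + v2.5 prompt + JSON output schema
const paragraphText = "..."; // one gold-set paragraph

for await (const message of query({
  prompt: paragraphText,
  options: {
    model: "claude-opus-4-6",
    systemPrompt,
    outputFormat: { type: "json_schema", schema: z.toJSONSchema(OutputSchema) },
    allowedTools: [], // pure classification: no tools,
    hooks: {}, // no hooks,
    settingSources: [], // no user/project settings,
    persistSession: false, // no session persistence
    maxTurns: 1, // one prompt in, one structured response out
  },
})) {
  const msg = message as Record<string, unknown>;
  if (msg.type === "result" && msg.subtype === "success") {
    console.log(msg.structured_output); // schema-validated label JSON
  }
}
```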
**Key design decisions:**
1. **Full codebook as system prompt.** The Stage 1/2 pipeline uses a condensed v2.5 operational prompt (~4KB). For Opus, we feed the entire labeling codebook (`docs/LABELING-CODEBOOK.md`, ~42KB) plus the operational prompt plus the JSON output schema. Opus has the context window and reasoning depth to actually use the worked examples, borderline cases, and decision rules that cheaper models would ignore.
2. **Reasoning traces saved.** Opus's adaptive thinking produces step-by-step codebook application (e.g., "Count QV-eligible facts: specific date (2020), 24 years (quantified)... two hard verifiable facts → Quantified-Verifiable"). These are saved in the `golden.thinking` field alongside each annotation — valuable both for adjudication and for understanding where the codebook's boundaries create ambiguity.
3. **Raw confidence preserved.** Opus returns numeric confidence (0-1) rather than the categorical high/medium/low that cheaper models produce. We save the raw values (`golden.rawCategoryConfidence`, `golden.rawSpecificityConfidence`) before coercing them through the existing `Confidence` transform. This gives a finer-grained signal for weighting or analysis; a sketch of the coercion appears after this list.
4. **Serial execution at 1 req/s.** The Claude Code subscription has rate limits, so the batch runs serially with a 1-second delay between requests. At ~4 paragraphs/minute (including Opus thinking time), the full 1,200-paragraph set completes in ~5 hours. Crash-safe JSONL checkpoint resume means it can be interrupted and restarted without re-running completed paragraphs.
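To illustrate decision 3, the coercion works roughly as below. The thresholds are hypothetical: the real cutoffs live in the `Confidence` transform in `@sec-cybert/schemas`.
```ts
import { z } from "zod";

// Illustrative only: the real Confidence transform (and its cutoffs) lives in
// @sec-cybert/schemas. Hypothetical thresholds shown to convey the idea.
const Confidence = z
  .union([z.number(), z.enum(["high", "medium", "low"])])
  .transform((v) => {
    if (typeof v === "string") return v; // cheaper models emit categorical labels
    if (v >= 0.85) return "high" as const; // Opus emits numeric 0-1
    if (v >= 0.6) return "medium" as const;
    return "low" as const;
  });

Confidence.parse(0.92); // => "high"; the raw 0.92 is saved in golden.rawCategoryConfidence
```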
**Output:** `data/annotations/golden/opus.jsonl` — standard `Annotation` records (compatible with the existing pipeline) plus a `golden` block containing thinking traces, raw confidence values, and the model's specific fact extractions. The `provenance.promptVersion` is tagged `v2.5+codebook` to distinguish from standard Stage 1/2 annotations.
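For orientation, a single `opus.jsonl` record has roughly this shape (field values are illustrative placeholders; the exact types are `Annotation` plus the `golden` block defined in `ts/src/label/golden.ts`):
```ts
// Illustrative placeholder values, not real data.
const exampleRecord = {
  paragraphId: "…",
  label: { /* standard LabelOutput: category, facts, specificity, confidence */ },
  provenance: {
    modelId: "anthropic/claude-opus-4-6",
    stage: "benchmark",
    promptVersion: "v2.5+codebook",
    costUsd: 0, // subscription, no per-request cost
    /* tokens, latency, timestamps, ... */
  },
  golden: {
    thinking: "Count QV-eligible facts: specific date (2020), 24 years…",
    rawCategoryConfidence: 0.92,
    rawSpecificityConfidence: 0.88,
    rawSpecificFacts: [{ fact: "…", type: "…" }],
  },
};
```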
---
## Phase 9: Pre-Training Strategy — DAPT + TAPT
@@ -963,6 +981,8 @@ Three models from three providers — minimizes correlated errors.
| Judge benchmarking | `ts/scripts/judge-bench.ts` | Supports structured/tool modes, gold label comparison |
| Judge diagnostics | `ts/scripts/judge-diag.ts`, `judge-diag-batch.ts` | GLM-5 failure investigation |
| Model benchmarking | `ts/scripts/model-bench.ts` | Stage 1 candidate evaluation |
| Golden annotation (Opus) | `ts/src/label/golden.ts` | Agent SDK runner for gold set, saves reasoning traces |
| Golden annotations | `data/annotations/golden/opus.jsonl` | Opus 4.6 labels + thinking + raw confidence |
---

View File

@@ -16,6 +16,7 @@
"typescript": "^5"
},
"dependencies": {
"@anthropic-ai/claude-agent-sdk": "^0.2.88",
"@openrouter/ai-sdk-provider": "^2.3.3",
"@sec-cybert/schemas": "workspace:*",
"ai": "^6.0.141",

View File

@@ -3,6 +3,7 @@ import { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { Annotation } from "@sec-cybert/schemas/annotation.ts";
import { STAGE1_MODELS } from "./lib/openrouter.ts";
import { runBatch } from "./label/batch.ts";
import { runGoldenBatch } from "./label/golden.ts";
import { computeConsensus } from "./label/consensus.ts";
import { judgeParagraph } from "./label/annotate.ts";
import { appendJsonl, readJsonlRaw } from "./lib/jsonl.ts";
@@ -25,6 +26,7 @@ Commands:
label:annotate-all [--limit N] [--concurrency N]
label:consensus
label:judge [--concurrency N]
label:golden [--paragraphs <path>] [--limit N] [--delay N] (Opus via Agent SDK)
label:cost`);
process.exit(1);
}
@@ -220,6 +222,35 @@ async function cmdJudge(): Promise<void> {
process.stderr.write(`\n ✓ Judged ${processed} paragraphs\n`);
}
async function cmdGolden(): Promise<void> {
// Load the 1,200 human-labeled paragraph IDs from the labelapp sample
const sampledIdsPath = "../labelapp/.sampled-ids.json";
const { readFile } = await import("node:fs/promises");
const sampledIds = new Set<string>(JSON.parse(await readFile(sampledIdsPath, "utf-8")));
process.stderr.write(` Loaded ${sampledIds.size} sampled IDs from ${sampledIdsPath}\n`);
// Load patched paragraphs and filter to the human-labeled set
const paragraphsPath = flag("paragraphs") ?? `${DATA}/paragraphs/paragraphs-clean.patched.jsonl`;
const { records: allParagraphs, skipped } = await readJsonl(paragraphsPath, Paragraph);
if (skipped > 0) process.stderr.write(` ⚠ Skipped ${skipped} invalid paragraph lines\n`);
const paragraphs = allParagraphs.filter((p) => sampledIds.has(p.id));
process.stderr.write(` Matched ${paragraphs.length}/${sampledIds.size} paragraphs from ${paragraphsPath}\n`);
if (paragraphs.length === 0) {
process.stderr.write(" ✖ No matching paragraphs found\n");
process.exit(1);
}
await runGoldenBatch(paragraphs, {
outputPath: `${DATA}/annotations/golden/opus.jsonl`,
errorsPath: `${DATA}/annotations/golden/opus-errors.jsonl`,
limit: flag("limit") !== undefined ? flagInt("limit", 50) : undefined,
delayMs: flag("delay") !== undefined ? flagInt("delay", 1000) : 1000,
});
}
async function cmdCost(): Promise<void> {
const modelCosts: Record<string, { cost: number; count: number }> = {};
const stageCosts: Record<string, { cost: number; count: number }> = {};
@@ -325,6 +356,9 @@ switch (command) {
case "label:judge":
await cmdJudge();
break;
case "label:golden":
await cmdGolden();
break;
case "label:cost":
await cmdCost();
break;

341
ts/src/label/golden.ts Normal file
View File

@@ -0,0 +1,341 @@
/**
* Golden set annotation via Claude Agent SDK.
*
* Uses the user's Claude Code subscription (OAuth) instead of API keys,
* calling Opus 4.6 through the Agent SDK's `query()` with structured output.
* Designed for the ~1,200 human-labeled paragraphs.
*
* Key differences from Stage 1/2 (OpenRouter):
* - Full codebook (docs/LABELING-CODEBOOK.md) + v2.5 prompt as system prompt
* - Saves reasoning traces (Opus adaptive thinking) alongside annotations
* - Saves raw confidence values before coercion
* - No API cost: runs on the Max subscription
*/
import { readFile } from "node:fs/promises";
import { query } from "@anthropic-ai/claude-agent-sdk";
import { z } from "zod";
import { v4 as uuidv4 } from "uuid";
import {
LabelOutputRaw,
ContentCategory,
SpecificityLabel,
FactType,
toLabelOutput,
} from "@sec-cybert/schemas/label.ts";
import type { Annotation } from "@sec-cybert/schemas/annotation.ts";
import type { Paragraph } from "@sec-cybert/schemas/paragraph.ts";
import { SYSTEM_PROMPT, buildUserPrompt, PROMPT_VERSION } from "./prompts.ts";
import { loadCompletedIds } from "../lib/checkpoint.ts";
import { appendJsonl } from "../lib/jsonl.ts";
/**
* JSON-Schema-safe version of LabelOutputRaw.
*
* The real LabelOutputRaw uses .transform() and .pipe() (for Confidence
* and SpecificFact) which cannot convert to JSON Schema. We define a
* plain schema here for the Agent SDK's outputFormat, then parse the
* result through the real LabelOutputRaw for validation + transforms.
*
* Confidence is a number (0-1) here since Opus naturally outputs numeric
* confidence. We save the raw value separately and coerce via LabelOutputRaw.
*/
const GoldenOutputSchema = z.object({
content_category: z.enum(ContentCategory.options),
specific_facts: z.array(
z.object({
fact: z.string(),
type: z.enum(FactType.options),
}),
),
specificity: z.enum(SpecificityLabel.options),
category_confidence: z.number(),
specificity_confidence: z.number(),
reasoning: z.string(),
});
/** Extra fields saved alongside annotation but NOT in the label block. */
interface GoldenExtras {
/** Opus's reasoning trace from adaptive thinking. */
thinking: string;
/** Raw confidence values before coercion to high/medium/low. */
rawCategoryConfidence: number;
rawSpecificityConfidence: number;
/** Specific facts as returned by the model (before any transform). */
rawSpecificFacts: Array<{ fact: string; type: string }>;
}
/** What we write to the JSONL — Annotation + extras at top level. */
type GoldenAnnotation = Annotation & { golden: GoldenExtras };
export interface GoldenBatchOpts {
outputPath: string;
errorsPath: string;
limit?: number;
/** Delay between requests in ms. Default 1000 (1 req/s). */
delayMs?: number;
}
/** Build the enhanced system prompt: full codebook + v2.5 operational prompt + JSON schema. */
async function buildGoldenSystemPrompt(): Promise<string> {
const codebookPath = new URL("../../../docs/LABELING-CODEBOOK.md", import.meta.url).pathname;
const codebook = await readFile(codebookPath, "utf-8");
// Strip the old "LLM Response Schema" section from the codebook to avoid
// conflicting with the actual JSON schema we enforce via outputFormat.
// The old section uses specificity_level (integer) instead of specificity (string label).
const schemaHeading = "## LLM Response Schema";
const codebookTrimmed = codebook.includes(schemaHeading)
? codebook.slice(0, codebook.indexOf(schemaHeading)).trimEnd()
: codebook;
const jsonSchema = JSON.stringify(z.toJSONSchema(GoldenOutputSchema), null, 2);
return `${codebookTrimmed}
OPERATIONAL PROMPT (v${PROMPT_VERSION})
The codebook above is the authoritative reference. The prompt below
is the condensed operational version used for all annotation.
${SYSTEM_PROMPT}
OUTPUT JSON SCHEMA
You MUST return JSON matching this exact schema. Use text labels for
specificity (not integers). Confidence is a number 0-1.
${jsonSchema}`;
}
/** Fields extracted from Agent SDK result messages. */
interface QueryResult {
structuredOutput: unknown;
thinking: string;
inputTokens: number;
outputTokens: number;
}
/**
* Annotate a single paragraph via Claude Agent SDK.
* Each call spawns a fresh, isolated query(): no tools, no hooks,
* no settings, no session persistence.
*/
async function annotateGolden(
paragraph: Paragraph,
runId: string,
systemPrompt: string,
): Promise<GoldenAnnotation> {
const requestedAt = new Date().toISOString();
const start = Date.now();
const result: QueryResult = {
structuredOutput: null,
thinking: "",
inputTokens: 0,
outputTokens: 0,
};
for await (const message of query({
prompt: buildUserPrompt(paragraph),
options: {
model: "claude-opus-4-6",
systemPrompt,
outputFormat: {
type: "json_schema",
schema: z.toJSONSchema(GoldenOutputSchema),
},
// No tools — pure classification
allowedTools: [],
disallowedTools: ["Bash", "Read", "Write", "Edit", "Glob", "Grep", "WebSearch", "WebFetch", "Agent", "AskUserQuestion"],
// Isolation: no hooks, no settings, no session persistence
hooks: {},
settingSources: [],
persistSession: false,
// Single-turn: one prompt → one structured response
maxTurns: 1,
permissionMode: "dontAsk",
},
})) {
const msg = message as Record<string, unknown>;
// Extract thinking from assistant messages
if (msg.type === "assistant") {
const inner = msg.message as { content?: Array<{ type: string; thinking?: string }> };
if (inner.content) {
for (const block of inner.content) {
if (block.type === "thinking" && block.thinking) {
result.thinking += block.thinking;
}
}
}
}
// Extract structured output and usage from result message
if (msg.type === "result" && msg.subtype === "success") {
if (msg.structured_output) {
result.structuredOutput = msg.structured_output;
} else if (typeof msg.result === "string") {
const raw = msg.result;
const fenceMatch = raw.match(/```(?:json)?\s*\n([\s\S]*?)\n```/);
const jsonStr = fenceMatch ? fenceMatch[1]! : raw;
try {
result.structuredOutput = JSON.parse(jsonStr);
} catch {
// not valid JSON
}
}
// Token usage from modelUsage (more detailed than top-level usage)
const modelUsage = msg.modelUsage as Record<string, { inputTokens?: number; outputTokens?: number; cacheReadInputTokens?: number }> | undefined;
const opusUsage = modelUsage?.["claude-opus-4-6"];
if (opusUsage) {
result.inputTokens = (opusUsage.inputTokens ?? 0) + (opusUsage.cacheReadInputTokens ?? 0);
result.outputTokens = opusUsage.outputTokens ?? 0;
}
}
}
const latencyMs = Date.now() - start;
if (!result.structuredOutput) {
throw new Error(`No structured output from Opus for ${paragraph.id}`);
}
// Save raw values before coercion
const raw = result.structuredOutput as Record<string, unknown>;
const rawCategoryConfidence = typeof raw.category_confidence === "number" ? raw.category_confidence : 0;
const rawSpecificityConfidence = typeof raw.specificity_confidence === "number" ? raw.specificity_confidence : 0;
const rawSpecificFacts = Array.isArray(raw.specific_facts)
? (raw.specific_facts as Array<{ fact: string; type: string }>)
: [];
// Parse through the real schema with transforms (Confidence coercion, etc.)
const parsed = LabelOutputRaw.parse(result.structuredOutput);
return {
paragraphId: paragraph.id,
label: toLabelOutput(parsed),
provenance: {
modelId: "anthropic/claude-opus-4-6",
provider: "anthropic",
generationId: "agent-sdk",
stage: "benchmark",
runId,
promptVersion: `${PROMPT_VERSION}+codebook`,
inputTokens: result.inputTokens,
outputTokens: result.outputTokens,
reasoningTokens: 0, // included in outputTokens, not broken out by SDK
costUsd: 0, // subscription — no per-request cost
latencyMs,
requestedAt,
},
golden: {
thinking: result.thinking,
rawCategoryConfidence,
rawSpecificityConfidence,
rawSpecificFacts,
},
};
}
/**
* Run golden set annotation: serial 1-req/s through the Agent SDK.
* Crash-safe with JSONL checkpoint resume.
*/
export async function runGoldenBatch(
paragraphs: Paragraph[],
opts: GoldenBatchOpts,
): Promise<void> {
const { outputPath, errorsPath, limit, delayMs = 1000 } = opts;
const runId = uuidv4();
// Build system prompt once (codebook + operational prompt)
const systemPrompt = await buildGoldenSystemPrompt();
process.stderr.write(` System prompt: ${(systemPrompt.length / 1024).toFixed(1)}KB\n`);
// Resume support
const { completedIds, skippedLines } = await loadCompletedIds(outputPath);
if (skippedLines > 0) {
process.stderr.write(` ⚠ Skipped ${skippedLines} corrupt lines in ${outputPath}\n`);
}
let remaining = paragraphs.filter((p) => !completedIds.has(p.id));
if (limit !== undefined) remaining = remaining.slice(0, limit);
const total = remaining.length;
if (total === 0) {
process.stderr.write(` ✓ All ${paragraphs.length} paragraphs already completed\n`);
return;
}
process.stderr.write(
` Starting golden annotation │ Opus 4.6 (Agent SDK) │ ${total} remaining of ${paragraphs.length}\n`,
);
let processed = 0;
let errored = 0;
const startTime = Date.now();
// Graceful shutdown
let stopping = false;
const onSignal = () => {
if (stopping) return;
stopping = true;
process.stderr.write("\n ⏸ Stopping — finishing current request...\n");
};
process.on("SIGINT", onSignal);
process.on("SIGTERM", onSignal);
for (const paragraph of remaining) {
if (stopping) break;
try {
const annotation = await annotateGolden(paragraph, runId, systemPrompt);
await appendJsonl(outputPath, annotation);
processed++;
if (processed % 10 === 0 || processed === total) {
const elapsed = (Date.now() - startTime) / 1000;
const rate = (processed / elapsed) * 60;
const etaMin = Math.round((total - processed) / rate);
process.stderr.write(
` ${processed}/${total} (${((processed / total) * 100).toFixed(1)}%) │ ${rate.toFixed(1)} para/min │ ETA ${etaMin}m │ ${errored} errors\n`,
);
}
} catch (error) {
errored++;
await appendJsonl(errorsPath, {
paragraphId: paragraph.id,
error: error instanceof Error ? error.message : String(error),
modelId: "anthropic/claude-opus-4-6",
timestamp: new Date().toISOString(),
});
process.stderr.write(
` ✖ Error on ${paragraph.id}: ${error instanceof Error ? error.message : String(error)}\n`,
);
// 5 consecutive errors with no successes = likely systemic
if (errored >= 5 && processed === 0) {
process.stderr.write(" ✖ 5 errors with no successes. Stopping.\n");
break;
}
}
// Pace requests (default 1 req/s)
if (!stopping) {
await new Promise((r) => setTimeout(r, delayMs));
}
}
// Cleanup
process.off("SIGINT", onSignal);
process.off("SIGTERM", onSignal);
process.stderr.write(
`\n ✓ Golden annotation done: ${processed} processed, ${errored} errors\n`,
);
}