import { generateText, Output } from "ai"; import { openrouter, providerOf } from "../lib/openrouter.ts"; import { LabelOutputRaw, toLabelOutput } from "../schemas/label.ts"; import type { Annotation } from "../schemas/annotation.ts"; import type { Paragraph } from "../schemas/paragraph.ts"; import { SYSTEM_PROMPT, buildUserPrompt, buildJudgePrompt, PROMPT_VERSION } from "./prompts.ts"; import { withRetry } from "../lib/retry.ts"; /** OpenRouter reasoning effort levels. */ type ReasoningEffort = "low" | "medium" | "high"; /** Build providerOptions for OpenRouter with reasoning + usage tracking. */ function buildProviderOptions(effort: ReasoningEffort) { return { openrouter: { reasoning: { effort }, usage: { include: true as const }, }, }; } /** Extract cost from the result, checking both raw usage and providerMetadata. */ function extractCost(result: { usage: unknown; providerMetadata?: unknown }): number { // Primary: raw usage.cost (always present in our smoke test) const raw = result.usage as { raw?: { cost?: number } }; if (raw.raw?.cost !== undefined) return raw.raw.cost; // Fallback: providerMetadata.openrouter.usage.cost const meta = result.providerMetadata as | { openrouter?: { usage?: { cost?: number } } } | undefined; return meta?.openrouter?.usage?.cost ?? 0; } export interface AnnotateOpts { modelId: string; stage: "stage1" | "stage2-judge" | "benchmark"; runId: string; promptVersion?: string; reasoningEffort?: ReasoningEffort; } /** * Annotate a single paragraph with one model. * Returns a full Annotation record with provenance. */ export async function annotateParagraph( paragraph: Paragraph, opts: AnnotateOpts, ): Promise { const { modelId, stage, runId, promptVersion = PROMPT_VERSION, reasoningEffort = "low", } = opts; const requestedAt = new Date().toISOString(); const start = Date.now(); const result = await withRetry( () => generateText({ model: openrouter(modelId), output: Output.object({ schema: LabelOutputRaw }), system: SYSTEM_PROMPT, prompt: buildUserPrompt(paragraph), temperature: 0, providerOptions: buildProviderOptions(reasoningEffort), abortSignal: AbortSignal.timeout(120_000), }), { label: `${modelId}:${paragraph.id}` }, ); const latencyMs = Date.now() - start; const rawOutput = result.output; if (!rawOutput) throw new Error(`No output from ${modelId} for ${paragraph.id}`); return { paragraphId: paragraph.id, label: toLabelOutput(rawOutput), provenance: { modelId, provider: providerOf(modelId), generationId: result.response?.id ?? "unknown", stage, runId, promptVersion, inputTokens: result.usage.inputTokens ?? 0, outputTokens: result.usage.outputTokens ?? 0, reasoningTokens: result.usage.outputTokenDetails?.reasoningTokens ?? 0, costUsd: extractCost(result), latencyMs, requestedAt, }, }; } export interface JudgeOpts { runId: string; promptVersion?: string; } /** * Run the Stage 2 judge on a paragraph where Stage 1 models disagreed. * Receives the paragraph + all 3 prior annotations in randomized order. */ export async function judgeParagraph( paragraph: Paragraph, priorAnnotations: Array<{ content_category: string; specificity_level: number; reasoning: string; }>, opts: JudgeOpts, ): Promise { const { runId, promptVersion = PROMPT_VERSION } = opts; const modelId = "anthropic/claude-sonnet-4.6"; const requestedAt = new Date().toISOString(); const start = Date.now(); const result = await withRetry( () => generateText({ model: openrouter(modelId), output: Output.object({ schema: LabelOutputRaw }), system: SYSTEM_PROMPT, prompt: buildJudgePrompt(paragraph, priorAnnotations), temperature: 0, providerOptions: buildProviderOptions("medium"), abortSignal: AbortSignal.timeout(240_000), }), { label: `judge:${paragraph.id}` }, ); const latencyMs = Date.now() - start; const rawOutput = result.output; if (!rawOutput) throw new Error(`No judge output for ${paragraph.id}`); return { paragraphId: paragraph.id, label: toLabelOutput(rawOutput), provenance: { modelId, provider: providerOf(modelId), generationId: result.response?.id ?? "unknown", stage: "stage2-judge", runId, promptVersion, inputTokens: result.usage.inputTokens ?? 0, outputTokens: result.usage.outputTokens ?? 0, reasoningTokens: result.usage.outputTokenDetails?.reasoningTokens ?? 0, costUsd: extractCost(result), latencyMs, requestedAt, }, }; }