import { z } from "zod";

/**
 * Performance metrics for a single model on the gold set.
 *
 * Runtime schema: use `BenchmarkResult.parse(value)` / `.safeParse(value)` to
 * validate an untyped value. The type alias of the same name (declared below)
 * is the static shape inferred from this schema.
 */
export const BenchmarkResult = z.object({
  modelId: z.string(),
  // Which pipeline stage produced this result.
  stage: z.enum(["stage1", "stage2-judge", "benchmark"]),
  promptVersion: z.string(),
  metrics: z.object({
    // Scores for the "category" prediction.
    category: z.object({
      macroF1: z.number(),
      // F1 keyed by an arbitrary string (presumably the class label — confirm with producer).
      perClassF1: z.record(z.string(), z.number()),
      mcc: z.number(), // presumably Matthews correlation coefficient — confirm
      accuracy: z.number(),
      krippendorphAlpha: z.number(),
    }),
    // Scores for the "specificity" prediction.
    specificity: z.object({
      macroF1: z.number(),
      mae: z.number(), // mean absolute error
      spearman: z.number(),
      krippendorphAlpha: z.number(),
    }),
  }),
  totalAnnotations: z.number().int(), // must be an integer
  totalCostUsd: z.number(),
  avgLatencyMs: z.number(),
  runAt: z.iso.datetime(), // string validated as an ISO datetime
});

/** Static type of a validated benchmark result, inferred from the schema above. */
export type BenchmarkResult = z.infer<typeof BenchmarkResult>;