2026-03-28 23:44:37 -04:00

29 lines
831 B
TypeScript

import { z } from "zod";
// Scores reported under `metrics.category`.
const categoryMetrics = z.object({
  macroF1: z.number(),
  // String-keyed map holding one F1 value per class.
  perClassF1: z.record(z.string(), z.number()),
  mcc: z.number(),
  accuracy: z.number(),
  krippendorphAlpha: z.number(),
});

// Scores reported under `metrics.specificity`.
const specificityMetrics = z.object({
  macroF1: z.number(),
  mae: z.number(), // mean absolute error
  spearman: z.number(),
  krippendorphAlpha: z.number(),
});

/**
 * Schema describing how a single model performed on the gold set.
 *
 * Besides identifying the run (model, stage, prompt version), it carries two
 * metric groups (`category` and `specificity`) and run-level totals for
 * annotation count, cost, latency, and the run timestamp.
 */
export const BenchmarkResult = z.object({
  modelId: z.string(),
  stage: z.enum(["stage1", "stage2-judge", "benchmark"]),
  promptVersion: z.string(),
  metrics: z.object({
    category: categoryMetrics,
    specificity: specificityMetrics,
  }),
  // Must be an integer; no further range restriction is applied here.
  totalAnnotations: z.number().int(),
  totalCostUsd: z.number(),
  avgLatencyMs: z.number(),
  // Validated as an ISO datetime string.
  runAt: z.iso.datetime(),
});
/** Static type of a value that has been parsed by the `BenchmarkResult` schema. */
export type BenchmarkResult = z.output<typeof BenchmarkResult>;