29 lines
831 B
TypeScript
29 lines
831 B
TypeScript
import { z } from "zod";
|
|
|
|
// Allowed values for the `stage` field of a benchmark result.
const stageSchema = z.enum(["stage1", "stage2-judge", "benchmark"]);

// Shape of `metrics.category`: all numeric, with `perClassF1` mapping
// string keys to numbers.
const categoryMetricsSchema = z.object({
  macroF1: z.number(),
  perClassF1: z.record(z.string(), z.number()),
  mcc: z.number(),
  accuracy: z.number(),
  krippendorphAlpha: z.number(),
});

// Shape of `metrics.specificity`: all numeric.
const specificityMetricsSchema = z.object({
  macroF1: z.number(),
  mae: z.number(), // mean absolute error
  spearman: z.number(),
  krippendorphAlpha: z.number(),
});

/**
 * Performance metrics for a single model on the gold set.
 *
 * Identifies the run (`modelId`, `stage`, `promptVersion`, `runAt`), carries
 * the `category` and `specificity` metric groups under `metrics`, and records
 * run-level totals (`totalAnnotations`, `totalCostUsd`, `avgLatencyMs`).
 */
export const BenchmarkResult = z.object({
  modelId: z.string(),
  stage: stageSchema,
  promptVersion: z.string(),
  metrics: z.object({
    category: categoryMetricsSchema,
    specificity: specificityMetricsSchema,
  }),
  totalAnnotations: z.number().int(), // must be an integer
  totalCostUsd: z.number(),
  avgLatencyMs: z.number(),
  runAt: z.iso.datetime(), // ISO datetime string
});
|
|
/** Static type inferred from the `BenchmarkResult` schema via `z.infer`. */
export type BenchmarkResult = z.infer<typeof BenchmarkResult>;
|