SEC-cyBERT/labelapp/lib/__test__/metrics.test.ts
2026-03-29 00:32:24 -04:00

174 lines
4.9 KiB
TypeScript

import { describe, test, expect } from "bun:test";
import {
cohensKappa,
krippendorffsAlpha,
agreementRate,
confusionMatrix,
perCategoryAgreement,
} from "../metrics";
describe("cohensKappa", () => {
  test("perfect agreement returns 1", () => {
    // Identical rating sequences must yield the maximum kappa of 1.
    const ratings = ["A", "B", "A", "B"];
    expect(cohensKappa(ratings, [...ratings])).toBeCloseTo(1.0);
  });

  test("known example", () => {
    // From Wikipedia Cohen's kappa example: 20 items, two raters.
    const yes = (n: number): string[] => Array(n).fill("Yes");
    const no = (n: number): string[] => Array(n).fill("No");
    const rater1 = [...yes(10), ...no(10)];
    const rater2 = [...yes(6), ...no(4), ...yes(2), ...no(8)];
    // Observed agreement p_o = 14/20 = 0.7
    // Chance agreement p_e = (10/20)(8/20) + (10/20)(12/20) = 0.5
    // kappa = (0.7 - 0.5) / (1 - 0.5) = 0.4
    expect(cohensKappa(rater1, rater2)).toBeCloseTo(0.4);
  });

  test("complete disagreement returns negative kappa", () => {
    // Second rater always picks the opposite label.
    const rater1 = ["A", "A", "B", "B"];
    const rater2 = rater1.map((label) => (label === "A" ? "B" : "A"));
    // p_o = 0, p_e = (2/4)(2/4) + (2/4)(2/4) = 0.5
    // kappa = (0 - 0.5) / (1 - 0.5) = -1
    expect(cohensKappa(rater1, rater2)).toBeCloseTo(-1.0);
  });

  test("throws on mismatched lengths", () => {
    // Rating vectors of different lengths are a caller error.
    expect(() => cohensKappa(["A"], ["A", "B"])).toThrow();
  });
});
describe("krippendorffsAlpha", () => {
  test("perfect agreement returns 1", () => {
    // Three raters giving identical scores across four items.
    const row = [1, 2, 3, 4];
    expect(krippendorffsAlpha([row, [...row], [...row]])).toBeCloseTo(1.0);
  });

  test("handles missing data", () => {
    // 3 raters x 4 items with nulls marking unrated cells; alpha must
    // be computed from the pairs that are present.
    const withGaps = [
      [1, 2, null, 4],
      [1, 2, 3, 4],
      [1, null, 3, 4],
    ];
    const alpha = krippendorffsAlpha(withGaps);
    expect(alpha).toBeGreaterThan(0.5);
    expect(alpha).toBeLessThanOrEqual(1.0);
  });

  test("random ratings return alpha near 0", () => {
    // Deliberately uncorrelated rating patterns across three raters.
    const scrambled = [
      [1, 2, 3, 4, 1, 2, 3, 4, 1, 2],
      [4, 3, 2, 1, 4, 3, 2, 1, 4, 3],
      [2, 4, 1, 3, 2, 4, 1, 3, 2, 4],
    ];
    // Chance-level agreement should land close to 0 (possibly negative).
    expect(krippendorffsAlpha(scrambled)).toBeLessThan(0.3);
  });

  test("all same value returns 1", () => {
    // Constant ratings everywhere — treated as full agreement.
    const constant = (): number[] => Array.from({ length: 4 }, () => 3);
    expect(krippendorffsAlpha([constant(), constant()])).toBeCloseTo(1.0);
  });

  test("throws with fewer than 2 raters", () => {
    // Agreement is undefined for a single rater.
    expect(() => krippendorffsAlpha([[1, 2, 3]])).toThrow();
  });
});
describe("confusionMatrix", () => {
  test("produces correct counts", () => {
    const categories = ["A", "B", "C"];
    const truth = ["A", "A", "B", "B", "C"];
    const guesses = ["A", "B", "B", "C", "C"];
    // Rows index the actual label, columns the predicted label:
    //   A -> {A:1, B:1, C:0}
    //   B -> {A:0, B:1, C:1}
    //   C -> {A:0, B:0, C:1}
    expect(confusionMatrix(truth, guesses, categories)).toEqual([
      [1, 1, 0],
      [0, 1, 1],
      [0, 0, 1],
    ]);
  });

  test("perfect prediction has diagonal only", () => {
    // Passing the same vector twice puts every count on the diagonal.
    const sequence = ["X", "Y", "X", "Y"];
    const result = confusionMatrix(sequence, sequence, ["X", "Y"]);
    expect(result).toEqual([
      [2, 0],
      [0, 2],
    ]);
  });
});
describe("agreementRate", () => {
  test("all agree", () => {
    // Every item has unanimous ratings -> rate 1.
    const unanimous = [
      ["A", "A", "A"],
      ["B", "B", "B"],
    ];
    expect(agreementRate(unanimous)).toBeCloseTo(1.0);
  });

  test("none agree", () => {
    // A single item where all three raters differ -> rate 0.
    expect(agreementRate([["A", "B", "C"]])).toBeCloseTo(0.0);
  });

  test("partial agreement", () => {
    // One unanimous item plus one split item -> 1/2.
    const mixed = [
      ["A", "A", "A"],
      ["A", "B", "A"],
    ];
    expect(agreementRate(mixed)).toBeCloseTo(0.5);
  });

  test("empty input", () => {
    // No items at all yields 0 rather than NaN.
    expect(agreementRate([])).toBe(0);
  });
});
describe("perCategoryAgreement", () => {
  // Small factory to keep the fixture table readable.
  const label = (category: string, annotatorId: string, paragraphId: string) => ({
    category,
    annotatorId,
    paragraphId,
  });

  test("computes per-category rates", () => {
    const labels = [
      label("A", "r1", "p1"),
      label("A", "r2", "p1"),
      label("A", "r1", "p2"),
      label("B", "r2", "p2"),
      label("B", "r1", "p3"),
      label("B", "r2", "p3"),
    ];
    const result = perCategoryAgreement(labels, ["A", "B"]);
    // Category A: p1 agrees (A/A), p2 disagrees (A/B) -> 0.5
    // Category B: p2 disagrees (A/B), p3 agrees (B/B) -> 0.5
    expect(result["A"]).toBeCloseTo(0.5);
    expect(result["B"]).toBeCloseTo(0.5);
  });

  test("category with no ratings returns 0", () => {
    // A requested category absent from the labels defaults to 0.
    expect(perCategoryAgreement([], ["A"])["A"]).toBe(0);
  });
});