// Tests for inter-rater agreement metrics (Cohen's kappa, Krippendorff's alpha, etc.).
import { describe, test, expect } from "bun:test";
|
|
import {
|
|
cohensKappa,
|
|
krippendorffsAlpha,
|
|
agreementRate,
|
|
confusionMatrix,
|
|
perCategoryAgreement,
|
|
} from "../metrics";
describe("cohensKappa", () => {
|
|
test("perfect agreement returns 1", () => {
|
|
const r1 = ["A", "B", "A", "B"];
|
|
const r2 = ["A", "B", "A", "B"];
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("known example", () => {
|
|
// From Wikipedia Cohen's kappa example
|
|
const r1 = [
|
|
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes",
|
|
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
|
|
];
|
|
const r2 = [
|
|
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", "No", "No",
|
|
"Yes", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",
|
|
];
|
|
// p_o = 14/20 = 0.7
|
|
// p_e = (10/20 * 8/20) + (10/20 * 12/20) = 0.2 + 0.3 = 0.5
|
|
// kappa = (0.7 - 0.5) / (1 - 0.5) = 0.4
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(0.4);
|
|
});
|
|
|
|
test("complete disagreement returns negative kappa", () => {
|
|
const r1 = ["A", "A", "B", "B"];
|
|
const r2 = ["B", "B", "A", "A"];
|
|
// p_o = 0, p_e = (2/4 * 2/4) + (2/4 * 2/4) = 0.5
|
|
// kappa = (0 - 0.5) / (1 - 0.5) = -1
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(-1.0);
|
|
});
|
|
|
|
test("throws on mismatched lengths", () => {
|
|
expect(() => cohensKappa(["A"], ["A", "B"])).toThrow();
|
|
});
|
|
});
describe("krippendorffsAlpha", () => {
|
|
test("perfect agreement returns 1", () => {
|
|
const ratings = [
|
|
[1, 2, 3, 4],
|
|
[1, 2, 3, 4],
|
|
[1, 2, 3, 4],
|
|
];
|
|
expect(krippendorffsAlpha(ratings)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("handles missing data", () => {
|
|
// 3 raters, 4 items, some missing
|
|
const ratings = [
|
|
[1, 2, null, 4],
|
|
[1, 2, 3, 4],
|
|
[1, null, 3, 4],
|
|
];
|
|
// Should still compute with available pairs
|
|
const alpha = krippendorffsAlpha(ratings);
|
|
expect(alpha).toBeGreaterThan(0.5);
|
|
expect(alpha).toBeLessThanOrEqual(1.0);
|
|
});
|
|
|
|
test("random ratings return alpha near 0", () => {
|
|
// Create ratings that are essentially random
|
|
const ratings = [
|
|
[1, 2, 3, 4, 1, 2, 3, 4, 1, 2],
|
|
[4, 3, 2, 1, 4, 3, 2, 1, 4, 3],
|
|
[2, 4, 1, 3, 2, 4, 1, 3, 2, 4],
|
|
];
|
|
const alpha = krippendorffsAlpha(ratings);
|
|
// Random ratings should produce alpha close to 0 or negative
|
|
expect(alpha).toBeLessThan(0.3);
|
|
});
|
|
|
|
test("all same value returns 1", () => {
|
|
const ratings = [
|
|
[3, 3, 3, 3],
|
|
[3, 3, 3, 3],
|
|
];
|
|
expect(krippendorffsAlpha(ratings)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("throws with fewer than 2 raters", () => {
|
|
expect(() => krippendorffsAlpha([[1, 2, 3]])).toThrow();
|
|
});
|
|
});
describe("confusionMatrix", () => {
|
|
test("produces correct counts", () => {
|
|
const actual = ["A", "A", "B", "B", "C"];
|
|
const predicted = ["A", "B", "B", "C", "C"];
|
|
const labels = ["A", "B", "C"];
|
|
const matrix = confusionMatrix(actual, predicted, labels);
|
|
|
|
// Row=actual, Col=predicted
|
|
// A: predicted A=1, predicted B=1, predicted C=0
|
|
// B: predicted A=0, predicted B=1, predicted C=1
|
|
// C: predicted A=0, predicted B=0, predicted C=1
|
|
expect(matrix).toEqual([
|
|
[1, 1, 0],
|
|
[0, 1, 1],
|
|
[0, 0, 1],
|
|
]);
|
|
});
|
|
|
|
test("perfect prediction has diagonal only", () => {
|
|
const labels = ["X", "Y"];
|
|
const vals = ["X", "Y", "X", "Y"];
|
|
const matrix = confusionMatrix(vals, vals, labels);
|
|
expect(matrix).toEqual([
|
|
[2, 0],
|
|
[0, 2],
|
|
]);
|
|
});
|
|
});
describe("agreementRate", () => {
|
|
test("all agree", () => {
|
|
expect(
|
|
agreementRate([
|
|
["A", "A", "A"],
|
|
["B", "B", "B"],
|
|
]),
|
|
).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("none agree", () => {
|
|
expect(agreementRate([["A", "B", "C"]])).toBeCloseTo(0.0);
|
|
});
|
|
|
|
test("partial agreement", () => {
|
|
expect(
|
|
agreementRate([
|
|
["A", "A", "A"],
|
|
["A", "B", "A"],
|
|
]),
|
|
).toBeCloseTo(0.5);
|
|
});
|
|
|
|
test("empty input", () => {
|
|
expect(agreementRate([])).toBe(0);
|
|
});
|
|
});
describe("perCategoryAgreement", () => {
|
|
test("computes per-category rates", () => {
|
|
const labels = [
|
|
{ category: "A", annotatorId: "r1", paragraphId: "p1" },
|
|
{ category: "A", annotatorId: "r2", paragraphId: "p1" },
|
|
{ category: "A", annotatorId: "r1", paragraphId: "p2" },
|
|
{ category: "B", annotatorId: "r2", paragraphId: "p2" },
|
|
{ category: "B", annotatorId: "r1", paragraphId: "p3" },
|
|
{ category: "B", annotatorId: "r2", paragraphId: "p3" },
|
|
];
|
|
const result = perCategoryAgreement(labels, ["A", "B"]);
|
|
|
|
// Category A: p1 (both A = agree), p2 (A vs B = disagree) => 1/2 = 0.5
|
|
// Category B: p2 (A vs B = disagree), p3 (both B = agree) => 1/2 = 0.5
|
|
expect(result["A"]).toBeCloseTo(0.5);
|
|
expect(result["B"]).toBeCloseTo(0.5);
|
|
});
|
|
|
|
test("category with no ratings returns 0", () => {
|
|
const result = perCategoryAgreement([], ["A"]);
|
|
expect(result["A"]).toBe(0);
|
|
});
|
|
});