import { describe, test, expect } from "bun:test";
import {
  cohensKappa,
  krippendorffsAlpha,
  agreementRate,
  confusionMatrix,
  perCategoryAgreement,
} from "../metrics";

describe("cohensKappa", () => {
  test("perfect agreement returns 1", () => {
    const identical = ["A", "B", "A", "B"];
    expect(cohensKappa(identical, [...identical])).toBeCloseTo(1.0);
  });

  test("known example", () => {
    // Worked example from the Wikipedia article on Cohen's kappa.
    // Rater 1: Yes for the first 10 items, No for the last 10.
    const raterOne = [...Array(10).fill("Yes"), ...Array(10).fill("No")];
    // Rater 2: Yes x6, No x4, Yes x2, No x8.
    const raterTwo = [
      ...Array(6).fill("Yes"),
      ...Array(4).fill("No"),
      ...Array(2).fill("Yes"),
      ...Array(8).fill("No"),
    ];
    // Observed agreement p_o = 14/20 = 0.7
    // Chance agreement   p_e = (10/20)(8/20) + (10/20)(12/20) = 0.2 + 0.3 = 0.5
    // kappa = (0.7 - 0.5) / (1 - 0.5) = 0.4
    expect(cohensKappa(raterOne, raterTwo)).toBeCloseTo(0.4);
  });

  test("complete disagreement returns negative kappa", () => {
    const raterOne = ["A", "A", "B", "B"];
    const raterTwo = ["B", "B", "A", "A"];
    // p_o = 0, p_e = (2/4)(2/4) + (2/4)(2/4) = 0.5
    // kappa = (0 - 0.5) / (1 - 0.5) = -1
    expect(cohensKappa(raterOne, raterTwo)).toBeCloseTo(-1.0);
  });

  test("throws on mismatched lengths", () => {
    expect(() => cohensKappa(["A"], ["A", "B"])).toThrow();
  });
});

describe("krippendorffsAlpha", () => {
  test("perfect agreement returns 1", () => {
    const row = [1, 2, 3, 4];
    expect(krippendorffsAlpha([row, [...row], [...row]])).toBeCloseTo(1.0);
  });

  test("handles missing data", () => {
    // 3 raters x 4 items; null marks a missing rating.
    const ratings = [
      [1, 2, null, 4],
      [1, 2, 3, 4],
      [1, null, 3, 4],
    ];
    // Alpha should still be computable from the pairs that are present.
    const alpha = krippendorffsAlpha(ratings);
    expect(alpha).toBeGreaterThan(0.5);
    expect(alpha).toBeLessThanOrEqual(1.0);
  });

  test("random ratings return alpha near 0", () => {
    // Deliberately uncorrelated rating patterns across three raters.
    const ratings = [
      [1, 2, 3, 4, 1, 2, 3, 4, 1, 2],
      [4, 3, 2, 1, 4, 3, 2, 1, 4, 3],
      [2, 4, 1, 3, 2, 4, 1, 3, 2, 4],
    ];
    // Chance-level agreement should land close to zero (possibly negative).
    expect(krippendorffsAlpha(ratings)).toBeLessThan(0.3);
  });

  test("all same value returns 1", () => {
    const constantRatings = [Array(4).fill(3), Array(4).fill(3)];
    expect(krippendorffsAlpha(constantRatings)).toBeCloseTo(1.0);
  });

  test("throws with fewer than 2 raters", () => {
    expect(() => krippendorffsAlpha([[1, 2, 3]])).toThrow();
  });
});

describe("confusionMatrix", () => {
  test("produces correct counts", () => {
    const actual = ["A", "A", "B", "B", "C"];
    const predicted = ["A", "B", "B", "C", "C"];
    const matrix = confusionMatrix(actual, predicted, ["A", "B", "C"]);
    // Rows index actual labels, columns index predicted labels:
    // actual A -> predicted {A:1, B:1, C:0}
    // actual B -> predicted {A:0, B:1, C:1}
    // actual C -> predicted {A:0, B:0, C:1}
    expect(matrix).toEqual([
      [1, 1, 0],
      [0, 1, 1],
      [0, 0, 1],
    ]);
  });

  test("perfect prediction has diagonal only", () => {
    const sequence = ["X", "Y", "X", "Y"];
    const matrix = confusionMatrix(sequence, sequence, ["X", "Y"]);
    expect(matrix).toEqual([
      [2, 0],
      [0, 2],
    ]);
  });
});

describe("agreementRate", () => {
  test("all agree", () => {
    const unanimous = [
      ["A", "A", "A"],
      ["B", "B", "B"],
    ];
    expect(agreementRate(unanimous)).toBeCloseTo(1.0);
  });

  test("none agree", () => {
    expect(agreementRate([["A", "B", "C"]])).toBeCloseTo(0.0);
  });

  test("partial agreement", () => {
    // First item unanimous, second item split => 1 of 2 items agree.
    const split = [
      ["A", "A", "A"],
      ["A", "B", "A"],
    ];
    expect(agreementRate(split)).toBeCloseTo(0.5);
  });

  test("empty input", () => {
    expect(agreementRate([])).toBe(0);
  });
});

describe("perCategoryAgreement", () => {
  // Small builder to keep the fixture readable.
  const label = (category, annotatorId, paragraphId) => ({
    category,
    annotatorId,
    paragraphId,
  });

  test("computes per-category rates", () => {
    const labels = [
      label("A", "r1", "p1"),
      label("A", "r2", "p1"),
      label("A", "r1", "p2"),
      label("B", "r2", "p2"),
      label("B", "r1", "p3"),
      label("B", "r2", "p3"),
    ];
    const result = perCategoryAgreement(labels, ["A", "B"]);
    // Category A: p1 agrees (A/A), p2 disagrees (A vs B) => 1/2
    // Category B: p2 disagrees (A vs B), p3 agrees (B/B)  => 1/2
    expect(result["A"]).toBeCloseTo(0.5);
    expect(result["B"]).toBeCloseTo(0.5);
  });

  test("category with no ratings returns 0", () => {
    const result = perCategoryAgreement([], ["A"]);
    expect(result["A"]).toBe(0);
  });
});