// Tests for inter-rater agreement metrics (Cohen's kappa, Krippendorff's alpha, etc.).
import { describe, test, expect } from "bun:test";
|
|
import {
|
|
cohensKappa,
|
|
krippendorffsAlpha,
|
|
agreementRate,
|
|
confusionMatrix,
|
|
perCategoryAgreement,
|
|
} from "../metrics";
describe("cohensKappa", () => {
|
|
test("perfect agreement returns 1", () => {
|
|
const r1 = ["A", "B", "A", "B"];
|
|
const r2 = ["A", "B", "A", "B"];
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("known example", () => {
|
|
// From Wikipedia Cohen's kappa example
|
|
const r1 = [
|
|
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "Yes",
|
|
"No", "No", "No", "No", "No", "No", "No", "No", "No", "No",
|
|
];
|
|
const r2 = [
|
|
"Yes", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", "No", "No",
|
|
"Yes", "Yes", "No", "No", "No", "No", "No", "No", "No", "No",
|
|
];
|
|
// p_o = 14/20 = 0.7
|
|
// p_e = (10/20 * 8/20) + (10/20 * 12/20) = 0.2 + 0.3 = 0.5
|
|
// kappa = (0.7 - 0.5) / (1 - 0.5) = 0.4
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(0.4);
|
|
});
|
|
|
|
test("complete disagreement returns negative kappa", () => {
|
|
const r1 = ["A", "A", "B", "B"];
|
|
const r2 = ["B", "B", "A", "A"];
|
|
// p_o = 0, p_e = (2/4 * 2/4) + (2/4 * 2/4) = 0.5
|
|
// kappa = (0 - 0.5) / (1 - 0.5) = -1
|
|
expect(cohensKappa(r1, r2)).toBeCloseTo(-1.0);
|
|
});
|
|
|
|
test("throws on mismatched lengths", () => {
|
|
expect(() => cohensKappa(["A"], ["A", "B"])).toThrow();
|
|
});
|
|
});
describe("krippendorffsAlpha", () => {
|
|
test("perfect agreement returns 1", () => {
|
|
const ratings = [
|
|
[1, 2, 3, 4],
|
|
[1, 2, 3, 4],
|
|
[1, 2, 3, 4],
|
|
];
|
|
expect(krippendorffsAlpha(ratings)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("handles missing data", () => {
|
|
// 3 raters, 4 items, some missing
|
|
const ratings = [
|
|
[1, 2, null, 4],
|
|
[1, 2, 3, 4],
|
|
[1, null, 3, 4],
|
|
];
|
|
// Should still compute with available pairs
|
|
const alpha = krippendorffsAlpha(ratings);
|
|
expect(alpha).toBeGreaterThan(0.5);
|
|
expect(alpha).toBeLessThanOrEqual(1.0);
|
|
});
|
|
|
|
test("random ratings return alpha near 0", () => {
|
|
// Create ratings that are essentially random
|
|
const ratings = [
|
|
[1, 2, 3, 4, 1, 2, 3, 4, 1, 2],
|
|
[4, 3, 2, 1, 4, 3, 2, 1, 4, 3],
|
|
[2, 4, 1, 3, 2, 4, 1, 3, 2, 4],
|
|
];
|
|
const alpha = krippendorffsAlpha(ratings);
|
|
// Random ratings should produce alpha close to 0 or negative
|
|
expect(alpha).toBeLessThan(0.3);
|
|
});
|
|
|
|
test("all same value returns 1", () => {
|
|
const ratings = [
|
|
[3, 3, 3, 3],
|
|
[3, 3, 3, 3],
|
|
];
|
|
expect(krippendorffsAlpha(ratings)).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("throws with fewer than 2 raters", () => {
|
|
expect(() => krippendorffsAlpha([[1, 2, 3]])).toThrow();
|
|
});
|
|
});
describe("confusionMatrix", () => {
|
|
test("produces correct counts", () => {
|
|
const actual = ["A", "A", "B", "B", "C"];
|
|
const predicted = ["A", "B", "B", "C", "C"];
|
|
const labels = ["A", "B", "C"];
|
|
const matrix = confusionMatrix(actual, predicted, labels);
|
|
|
|
// Row=actual, Col=predicted
|
|
// A: predicted A=1, predicted B=1, predicted C=0
|
|
// B: predicted A=0, predicted B=1, predicted C=1
|
|
// C: predicted A=0, predicted B=0, predicted C=1
|
|
expect(matrix).toEqual([
|
|
[1, 1, 0],
|
|
[0, 1, 1],
|
|
[0, 0, 1],
|
|
]);
|
|
});
|
|
|
|
test("perfect prediction has diagonal only", () => {
|
|
const labels = ["X", "Y"];
|
|
const vals = ["X", "Y", "X", "Y"];
|
|
const matrix = confusionMatrix(vals, vals, labels);
|
|
expect(matrix).toEqual([
|
|
[2, 0],
|
|
[0, 2],
|
|
]);
|
|
});
|
|
});
describe("agreementRate", () => {
|
|
test("all agree", () => {
|
|
expect(
|
|
agreementRate([
|
|
["A", "A", "A"],
|
|
["B", "B", "B"],
|
|
]),
|
|
).toBeCloseTo(1.0);
|
|
});
|
|
|
|
test("none agree", () => {
|
|
expect(agreementRate([["A", "B", "C"]])).toBeCloseTo(0.0);
|
|
});
|
|
|
|
test("partial agreement", () => {
|
|
expect(
|
|
agreementRate([
|
|
["A", "A", "A"],
|
|
["A", "B", "A"],
|
|
]),
|
|
).toBeCloseTo(0.5);
|
|
});
|
|
|
|
test("empty input", () => {
|
|
expect(agreementRate([])).toBe(0);
|
|
});
|
|
});
describe("perCategoryAgreement", () => {
|
|
test("computes per-category rates", () => {
|
|
const labels = [
|
|
{ category: "A", annotatorId: "r1", paragraphId: "p1" },
|
|
{ category: "A", annotatorId: "r2", paragraphId: "p1" },
|
|
{ category: "A", annotatorId: "r1", paragraphId: "p2" },
|
|
{ category: "B", annotatorId: "r2", paragraphId: "p2" },
|
|
{ category: "B", annotatorId: "r1", paragraphId: "p3" },
|
|
{ category: "B", annotatorId: "r2", paragraphId: "p3" },
|
|
];
|
|
const result = perCategoryAgreement(labels, ["A", "B"]);
|
|
|
|
// Category A: p1 (both A = agree), p2 (A vs B = disagree) => 1/2 = 0.5
|
|
// Category B: p2 (A vs B = disagree), p3 (both B = agree) => 1/2 = 0.5
|
|
expect(result["A"]).toBeCloseTo(0.5);
|
|
expect(result["B"]).toBeCloseTo(0.5);
|
|
});
|
|
|
|
test("category with no ratings returns 0", () => {
|
|
const result = perCategoryAgreement([], ["A"]);
|
|
expect(result["A"]).toBe(0);
|
|
});
|
|
});