SEC-cyBERT/labelapp/lib/metrics.ts
2026-03-29 00:32:24 -04:00

275 lines
7.8 KiB
TypeScript

/**
* Statistical metrics for inter-rater reliability analysis.
*
* - cohensKappa: nominal agreement between two raters
* - krippendorffsAlpha: ordinal agreement with multiple raters (handles missing data)
* - confusionMatrix: contingency table for two sets of ratings
* - agreementRate: raw proportion of items where all raters agree
* - perCategoryAgreement: per-category agreement rates
*/
/**
 * Cohen's Kappa for two raters on nominal data.
 *
 * Computes κ = (p_o - p_e) / (1 - p_e), where p_o is the fraction of
 * items both raters labelled identically and p_e is the agreement
 * expected by chance from each rater's marginal label frequencies.
 *
 * @param ratings1 - labels assigned by the first rater
 * @param ratings2 - labels assigned by the second rater (same item order)
 * @returns kappa in (-inf, 1]; 0 for empty input, 1 when chance
 *          agreement is total (both raters used one identical category)
 * @throws Error when the two arrays differ in length
 */
export function cohensKappa(ratings1: string[], ratings2: string[]): number {
  if (ratings1.length !== ratings2.length) {
    throw new Error("Rating arrays must have the same length");
  }
  const total = ratings1.length;
  if (total === 0) return 0;
  // Single pass: count exact matches and per-rater label frequencies.
  const freq1 = new Map<string, number>();
  const freq2 = new Map<string, number>();
  let matched = 0;
  ratings1.forEach((label, idx) => {
    const other = ratings2[idx];
    if (label === other) matched++;
    freq1.set(label, (freq1.get(label) ?? 0) + 1);
    freq2.set(other, (freq2.get(other) ?? 0) + 1);
  });
  const observed = matched / total;
  // Chance agreement: product of marginal proportions, summed over the
  // union of categories either rater used (absent categories add 0).
  const allCategories = new Set([...freq1.keys(), ...freq2.keys()]);
  let chance = 0;
  for (const category of allCategories) {
    chance +=
      ((freq1.get(category) ?? 0) / total) *
      ((freq2.get(category) ?? 0) / total);
  }
  // Degenerate case: both raters used the same single category.
  if (chance === 1) return 1;
  return (observed - chance) / (1 - chance);
}
/**
 * Krippendorff's Alpha for numeric ratings with multiple raters.
 *
 * Uses the coincidence-matrix formulation with the squared-difference
 * metric d(c, k) = (v_c - v_k)^2 on the actual rating values — this is
 * Krippendorff's *interval* metric, commonly applied to equally spaced
 * ordinal scales. Missing ratings (null, and — for robustness against
 * jagged rater arrays — undefined) are skipped; items rated by fewer
 * than two raters contribute nothing.
 *
 * @param ratings - raters x items matrix, null = missing
 * @returns alpha coefficient (-inf to 1, where 1 = perfect agreement);
 *          0 when no item has two or more ratings
 * @throws Error if fewer than 2 raters are given
 */
export function krippendorffsAlpha(ratings: (number | null)[][]): number {
  const nRaters = ratings.length;
  if (nRaters < 2) throw new Error("Need at least 2 raters");
  // Tolerate jagged input: iterate to the longest row. Out-of-range
  // reads yield undefined, which is treated as missing everywhere below.
  const nItems = Math.max(...ratings.map((row) => row.length));
  if (nItems === 0) return 0;
  // Collect all unique values across all ratings.
  // `!= null` deliberately filters both null (explicit missing) and
  // undefined (hole in a jagged array) — `!== null` would let undefined
  // through and poison the distance computation with NaN.
  const valueSet = new Set<number>();
  for (let r = 0; r < nRaters; r++) {
    for (let i = 0; i < nItems; i++) {
      const v = ratings[r][i];
      if (v != null) valueSet.add(v);
    }
  }
  const values = [...valueSet].sort((a, b) => a - b);
  const valueIndex = new Map<number, number>();
  for (let i = 0; i < values.length; i++) {
    valueIndex.set(values[i], i);
  }
  const nValues = values.length;
  if (nValues < 2) return 1; // All non-missing ratings share one value
  // Build coincidence matrix:
  // o[c][k] = weighted count of (c, k) pairs observed within one item.
  const o: number[][] = Array.from({ length: nValues }, () =>
    new Array(nValues).fill(0),
  );
  let totalPairable = 0;
  for (let i = 0; i < nItems; i++) {
    // Non-missing values for this item
    const itemValues: number[] = [];
    for (let r = 0; r < nRaters; r++) {
      const v = ratings[r][i];
      if (v != null) itemValues.push(v);
    }
    const mi = itemValues.length;
    if (mi < 2) continue; // Item needs at least 2 ratings to be pairable
    // Each ordered pair contributes 1/(m_i - 1), so an item's total
    // contribution is m_i regardless of how many raters covered it.
    const weight = 1 / (mi - 1);
    for (let a = 0; a < mi; a++) {
      for (let b = 0; b < mi; b++) {
        if (a === b) continue;
        const ci = valueIndex.get(itemValues[a])!;
        const ki = valueIndex.get(itemValues[b])!;
        o[ci][ki] += weight;
      }
    }
    totalPairable += mi;
  }
  if (totalPairable === 0) return 0; // No item had 2+ ratings
  // Marginal frequencies: n_c = sum of row c of the coincidence matrix
  const nc: number[] = new Array(nValues).fill(0);
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      nc[c] += o[c][k];
    }
  }
  const nTotal = nc.reduce((sum, v) => sum + v, 0);
  if (nTotal === 0) return 0;
  // Interval metric: squared difference of the actual rating values
  // (NOT of the category indices).
  const dist = (c: number, k: number): number => {
    return (values[c] - values[k]) ** 2;
  };
  // Observed disagreement: D_o = (1/n) * sum_{c!=k} o[c][k] * d(c,k)
  let dObserved = 0;
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      if (c !== k) {
        dObserved += o[c][k] * dist(c, k);
      }
    }
  }
  dObserved /= nTotal;
  // Expected disagreement: D_e = (1/(n*(n-1))) * sum_{c!=k} n_c * n_k * d(c,k)
  let dExpected = 0;
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      if (c !== k) {
        dExpected += nc[c] * nc[k] * dist(c, k);
      }
    }
  }
  dExpected /= nTotal * (nTotal - 1);
  if (dExpected === 0) return 1; // Chance disagreement impossible
  return 1 - dObserved / dExpected;
}
/**
 * Build a confusion matrix for two sets of ratings.
 *
 * Pairs whose actual or predicted label is absent from `labels` are
 * silently skipped.
 *
 * @param actual - ground truth labels
 * @param predicted - predicted labels
 * @param labels - ordered list of label values (defines row/column order)
 * @returns 2D array where result[i][j] = count of (actual=labels[i], predicted=labels[j])
 * @throws Error when actual and predicted differ in length
 */
export function confusionMatrix(
  actual: string[],
  predicted: string[],
  labels: string[],
): number[][] {
  if (actual.length !== predicted.length) {
    throw new Error("Arrays must have the same length");
  }
  // Map each label to its row/column position.
  const position = new Map(labels.map((label, i) => [label, i] as const));
  const size = labels.length;
  const matrix = labels.map(() => new Array<number>(size).fill(0));
  actual.forEach((truth, i) => {
    const row = position.get(truth);
    const col = position.get(predicted[i]);
    if (row !== undefined && col !== undefined) {
      matrix[row][col] += 1;
    }
  });
  return matrix;
}
/**
 * Raw agreement rate: proportion of items where ALL raters agree.
 *
 * Items with zero ratings stay in the denominator but can never count
 * as agreements; single-rating items count as full agreement.
 *
 * @param labels - items x raters matrix (each inner array is the ratings for one item)
 * @returns proportion of items with complete agreement (0 to 1)
 */
export function agreementRate(labels: string[][]): number {
  if (labels.length === 0) return 0;
  const unanimous = labels.filter(
    (item) => item.length > 0 && item.every((r) => r === item[0]),
  ).length;
  return unanimous / labels.length;
}
/**
 * Per-category agreement: for each category, what proportion of items
 * assigned that category by at least one rater have full agreement?
 *
 * Categories never assigned to any paragraph get a rate of 0.
 *
 * @param labels - flat array of label records with category, annotatorId, paragraphId
 * @param categories - list of categories to compute agreement for
 * @returns record mapping each category to its agreement rate (0 to 1)
 */
export function perCategoryAgreement(
  labels: {
    category: string;
    annotatorId: string;
    paragraphId: string;
  }[],
  categories: string[],
): Record<string, number> {
  // Bucket category assignments by paragraph.
  const grouped = new Map<string, string[]>();
  for (const { paragraphId, category } of labels) {
    const bucket = grouped.get(paragraphId);
    if (bucket) {
      bucket.push(category);
    } else {
      grouped.set(paragraphId, [category]);
    }
  }
  const result: Record<string, number> = {};
  for (const category of categories) {
    let seen = 0;
    let unanimous = 0;
    for (const assigned of grouped.values()) {
      // Only paragraphs where at least one rater used this category count.
      if (!assigned.some((c) => c === category)) continue;
      seen += 1;
      // Full agreement means every rater assigned exactly this category.
      if (assigned.every((c) => c === category)) unanimous += 1;
    }
    result[category] = seen === 0 ? 0 : unanimous / seen;
  }
  return result;
}