SEC-cyBERT/labelapp/lib/metrics.ts
2026-03-29 00:32:24 -04:00

275 lines
7.8 KiB
TypeScript

/**
* Statistical metrics for inter-rater reliability analysis.
*
* - cohensKappa: nominal agreement between two raters
* - krippendorffsAlpha: ordinal agreement with multiple raters (handles missing data)
* - confusionMatrix: contingency table for two sets of ratings
* - agreementRate: raw proportion of items where all raters agree
* - perCategoryAgreement: per-category agreement rates
*/
/**
 * Cohen's Kappa for two raters on nominal data.
 *
 * Computes κ = (p_o - p_e) / (1 - p_e), where p_o is the fraction of
 * items both raters labelled identically and p_e is the agreement
 * expected by chance from each rater's marginal label frequencies.
 *
 * @param ratings1 - labels assigned by the first rater
 * @param ratings2 - labels assigned by the second rater (same item order)
 * @returns kappa in (-inf, 1]; 0 for empty input, 1 when chance
 *          agreement is total (both raters used one identical category)
 * @throws Error when the two arrays differ in length
 */
export function cohensKappa(ratings1: string[], ratings2: string[]): number {
  if (ratings1.length !== ratings2.length) {
    throw new Error("Rating arrays must have the same length");
  }
  const total = ratings1.length;
  if (total === 0) return 0;
  // Single pass: count exact matches and per-rater label frequencies.
  const freq1 = new Map<string, number>();
  const freq2 = new Map<string, number>();
  let matched = 0;
  ratings1.forEach((label, idx) => {
    const other = ratings2[idx];
    if (label === other) matched++;
    freq1.set(label, (freq1.get(label) ?? 0) + 1);
    freq2.set(other, (freq2.get(other) ?? 0) + 1);
  });
  const observed = matched / total;
  // Chance agreement: product of marginal proportions, summed over the
  // union of categories either rater used (absent categories add 0).
  const allCategories = new Set([...freq1.keys(), ...freq2.keys()]);
  let chance = 0;
  for (const category of allCategories) {
    chance +=
      ((freq1.get(category) ?? 0) / total) *
      ((freq2.get(category) ?? 0) / total);
  }
  // Degenerate case: both raters used the same single category.
  if (chance === 1) return 1;
  return (observed - chance) / (1 - chance);
}
/**
 * Krippendorff's Alpha for numeric ratings with multiple raters.
 *
 * Uses the coincidence-matrix formulation with the squared-difference
 * metric d(c, k) = (v_c - v_k)^2 on the actual rating values — this is
 * Krippendorff's *interval* metric, commonly applied to equally spaced
 * ordinal scales. Missing ratings (null, and — for robustness against
 * jagged rater arrays — undefined) are skipped; items rated by fewer
 * than two raters contribute nothing.
 *
 * @param ratings - raters x items matrix, null = missing
 * @returns alpha coefficient (-inf to 1, where 1 = perfect agreement);
 *          0 when no item has two or more ratings
 * @throws Error if fewer than 2 raters are given
 */
export function krippendorffsAlpha(ratings: (number | null)[][]): number {
  const nRaters = ratings.length;
  if (nRaters < 2) throw new Error("Need at least 2 raters");
  // Tolerate jagged input: iterate to the longest row. Out-of-range
  // reads yield undefined, which is treated as missing everywhere below.
  const nItems = Math.max(...ratings.map((row) => row.length));
  if (nItems === 0) return 0;
  // Collect all unique values across all ratings.
  // `!= null` deliberately filters both null (explicit missing) and
  // undefined (hole in a jagged array) — `!== null` would let undefined
  // through and poison the distance computation with NaN.
  const valueSet = new Set<number>();
  for (let r = 0; r < nRaters; r++) {
    for (let i = 0; i < nItems; i++) {
      const v = ratings[r][i];
      if (v != null) valueSet.add(v);
    }
  }
  const values = [...valueSet].sort((a, b) => a - b);
  const valueIndex = new Map<number, number>();
  for (let i = 0; i < values.length; i++) {
    valueIndex.set(values[i], i);
  }
  const nValues = values.length;
  if (nValues < 2) return 1; // All non-missing ratings share one value
  // Build coincidence matrix:
  // o[c][k] = weighted count of (c, k) pairs observed within one item.
  const o: number[][] = Array.from({ length: nValues }, () =>
    new Array(nValues).fill(0),
  );
  let totalPairable = 0;
  for (let i = 0; i < nItems; i++) {
    // Non-missing values for this item
    const itemValues: number[] = [];
    for (let r = 0; r < nRaters; r++) {
      const v = ratings[r][i];
      if (v != null) itemValues.push(v);
    }
    const mi = itemValues.length;
    if (mi < 2) continue; // Item needs at least 2 ratings to be pairable
    // Each ordered pair contributes 1/(m_i - 1), so an item's total
    // contribution is m_i regardless of how many raters covered it.
    const weight = 1 / (mi - 1);
    for (let a = 0; a < mi; a++) {
      for (let b = 0; b < mi; b++) {
        if (a === b) continue;
        const ci = valueIndex.get(itemValues[a])!;
        const ki = valueIndex.get(itemValues[b])!;
        o[ci][ki] += weight;
      }
    }
    totalPairable += mi;
  }
  if (totalPairable === 0) return 0; // No item had 2+ ratings
  // Marginal frequencies: n_c = sum of row c of the coincidence matrix
  const nc: number[] = new Array(nValues).fill(0);
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      nc[c] += o[c][k];
    }
  }
  const nTotal = nc.reduce((sum, v) => sum + v, 0);
  if (nTotal === 0) return 0;
  // Interval metric: squared difference of the actual rating values
  // (NOT of the category indices).
  const dist = (c: number, k: number): number => {
    return (values[c] - values[k]) ** 2;
  };
  // Observed disagreement: D_o = (1/n) * sum_{c!=k} o[c][k] * d(c,k)
  let dObserved = 0;
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      if (c !== k) {
        dObserved += o[c][k] * dist(c, k);
      }
    }
  }
  dObserved /= nTotal;
  // Expected disagreement: D_e = (1/(n*(n-1))) * sum_{c!=k} n_c * n_k * d(c,k)
  let dExpected = 0;
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      if (c !== k) {
        dExpected += nc[c] * nc[k] * dist(c, k);
      }
    }
  }
  dExpected /= nTotal * (nTotal - 1);
  if (dExpected === 0) return 1; // Chance disagreement impossible
  return 1 - dObserved / dExpected;
}
/**
 * Build a confusion matrix for two sets of ratings.
 *
 * Pairs whose actual or predicted label is absent from `labels` are
 * silently skipped.
 *
 * @param actual - ground truth labels
 * @param predicted - predicted labels
 * @param labels - ordered list of label values (defines row/column order)
 * @returns 2D array where result[i][j] = count of (actual=labels[i], predicted=labels[j])
 * @throws Error when actual and predicted differ in length
 */
export function confusionMatrix(
  actual: string[],
  predicted: string[],
  labels: string[],
): number[][] {
  if (actual.length !== predicted.length) {
    throw new Error("Arrays must have the same length");
  }
  // Map each label to its row/column position.
  const position = new Map(labels.map((label, i) => [label, i] as const));
  const size = labels.length;
  const matrix = labels.map(() => new Array<number>(size).fill(0));
  actual.forEach((truth, i) => {
    const row = position.get(truth);
    const col = position.get(predicted[i]);
    if (row !== undefined && col !== undefined) {
      matrix[row][col] += 1;
    }
  });
  return matrix;
}
/**
 * Raw agreement rate: proportion of items where ALL raters agree.
 *
 * Items with zero ratings stay in the denominator but can never count
 * as agreements; single-rating items count as full agreement.
 *
 * @param labels - items x raters matrix (each inner array is the ratings for one item)
 * @returns proportion of items with complete agreement (0 to 1)
 */
export function agreementRate(labels: string[][]): number {
  if (labels.length === 0) return 0;
  const unanimous = labels.filter(
    (item) => item.length > 0 && item.every((r) => r === item[0]),
  ).length;
  return unanimous / labels.length;
}
/**
 * Per-category agreement: for each category, what proportion of items
 * assigned that category by at least one rater have full agreement?
 *
 * Categories never assigned to any paragraph get a rate of 0.
 *
 * @param labels - flat array of label records with category, annotatorId, paragraphId
 * @param categories - list of categories to compute agreement for
 * @returns record mapping each category to its agreement rate (0 to 1)
 */
export function perCategoryAgreement(
  labels: {
    category: string;
    annotatorId: string;
    paragraphId: string;
  }[],
  categories: string[],
): Record<string, number> {
  // Bucket category assignments by paragraph.
  const grouped = new Map<string, string[]>();
  for (const { paragraphId, category } of labels) {
    const bucket = grouped.get(paragraphId);
    if (bucket) {
      bucket.push(category);
    } else {
      grouped.set(paragraphId, [category]);
    }
  }
  const result: Record<string, number> = {};
  for (const category of categories) {
    let seen = 0;
    let unanimous = 0;
    for (const assigned of grouped.values()) {
      // Only paragraphs where at least one rater used this category count.
      if (!assigned.some((c) => c === category)) continue;
      seen += 1;
      // Full agreement means every rater assigned exactly this category.
      if (assigned.every((c) => c === category)) unanimous += 1;
    }
    result[category] = seen === 0 ? 0 : unanimous / seen;
  }
  return result;
}