/**
 * Statistical metrics for inter-rater reliability analysis.
 *
 * - cohensKappa: nominal agreement between two raters
 * - krippendorffsAlpha: agreement on numerically coded (ordinal) ratings with
 *   multiple raters (handles missing data)
 * - confusionMatrix: contingency table for two sets of ratings
 * - agreementRate: raw proportion of items where all raters agree
 * - perCategoryAgreement: per-category agreement rates
 */

/**
 * Cohen's Kappa for two raters on nominal data.
 *
 * κ = (p_o - p_e) / (1 - p_e)
 * where p_o = observed agreement, p_e = expected agreement by chance.
 *
 * @param ratings1 - labels assigned by the first rater, one per item
 * @param ratings2 - labels assigned by the second rater, aligned with `ratings1`
 * @returns kappa; 0 for empty input, 1 when both raters used one identical
 *   category for every item (p_e = 1, where the formula is undefined)
 * @throws Error if the two arrays differ in length
 */
export function cohensKappa(ratings1: string[], ratings2: string[]): number {
  if (ratings1.length !== ratings2.length) {
    throw new Error("Rating arrays must have the same length");
  }

  const n = ratings1.length;
  if (n === 0) return 0;

  // Single pass: collect categories, count agreements and per-rater marginals.
  const categories = new Set<string>();
  const count1 = new Map<string, number>();
  const count2 = new Map<string, number>();
  let agreements = 0;
  for (let i = 0; i < n; i++) {
    const a = ratings1[i];
    const b = ratings2[i];
    categories.add(a);
    categories.add(b);
    if (a === b) agreements++;
    count1.set(a, (count1.get(a) ?? 0) + 1);
    count2.set(b, (count2.get(b) ?? 0) + 1);
  }

  const po = agreements / n;

  // Expected agreement by chance: sum over categories of the product of the
  // two raters' marginal proportions. A category unused by one rater adds 0.
  let pe = 0;
  for (const c of categories) {
    pe += ((count1.get(c) ?? 0) / n) * ((count2.get(c) ?? 0) / n);
  }

  if (pe === 1) return 1; // Both raters used the same single category

  return (po - pe) / (1 - pe);
}

/**
 * Krippendorff's Alpha for numerically coded ratings with multiple raters.
 *
 * Uses the coincidence matrix approach with the squared-difference distance
 * d(c,k) = (c-k)^2 on the rating values themselves. This is Krippendorff's
 * interval metric; the rank-based ordinal metric is not computed here, so
 * ordinal categories are assumed to be coded as (roughly) equally spaced numbers.
 *
 * Handles missing data: `null` entries are skipped, and so are entries that
 * are absent because a rater's row is shorter than the longest row.
 *
 * @param ratings - raters x items matrix, null = missing
 * @returns alpha coefficient (-inf to 1, where 1 = perfect agreement).
 *   Returns 0 when there are no items or no item was rated by at least two
 *   raters, and 1 when fewer than two distinct values occur at all.
 * @throws Error if fewer than 2 raters are supplied
 */
export function krippendorffsAlpha(ratings: (number | null)[][]): number {
  const nRaters = ratings.length;
  if (nRaters < 2) throw new Error("Need at least 2 raters");

  // Use the longest row so a short row is treated as missing data rather than
  // truncating the analysis or injecting `undefined` as a rating value.
  const nItems = ratings.reduce((max, row) => Math.max(max, row.length), 0);
  if (nItems === 0) return 0;

  // Collect all unique values across all ratings (`!= null` skips undefined too).
  const valueSet = new Set<number>();
  for (const row of ratings) {
    for (const v of row) {
      if (v != null) valueSet.add(v);
    }
  }

  const values = [...valueSet].sort((a, b) => a - b);
  const nValues = values.length;
  if (nValues < 2) return 1; // All non-null ratings are the same value

  const valueIndex = new Map<number, number>();
  values.forEach((v, i) => valueIndex.set(v, i));

  // Build coincidence matrix
  // o[c][k] = number of coincidences between values c and k
  const o: number[][] = Array.from({ length: nValues }, () =>
    new Array<number>(nValues).fill(0),
  );

  let totalPairable = 0;
  for (let i = 0; i < nItems; i++) {
    // Matrix indices of the non-missing values given to this item
    const itemIdx: number[] = [];
    for (const row of ratings) {
      const v = row[i];
      if (v == null) continue;
      const idx = valueIndex.get(v);
      if (idx !== undefined) itemIdx.push(idx);
    }

    const mi = itemIdx.length;
    if (mi < 2) continue; // Need at least 2 raters on this item

    // Each ordered pair of coders contributes 1/(m_i - 1) to the coincidence matrix
    const weight = 1 / (mi - 1);
    for (let a = 0; a < mi; a++) {
      for (let b = 0; b < mi; b++) {
        if (a !== b) o[itemIdx[a]][itemIdx[b]] += weight;
      }
    }
    totalPairable += mi;
  }

  if (totalPairable === 0) return 0;

  // Marginal frequencies from coincidence matrix: n_c = sum of row c
  const nc = o.map((row) => row.reduce((sum, v) => sum + v, 0));
  const nTotal = nc.reduce((sum, v) => sum + v, 0);
  if (nTotal === 0) return 0;

  // Squared-difference distance between the values at matrix indices c and k
  const dist = (c: number, k: number): number => (values[c] - values[k]) ** 2;

  // Observed disagreement: D_o = (1/n) * sum_c sum_k o[c][k] * d(c,k)
  let dObserved = 0;
  // Expected disagreement: D_e = (1/(n*(n-1))) * sum_c sum_k n_c * n_k * d(c,k)
  let dExpected = 0;
  for (let c = 0; c < nValues; c++) {
    for (let k = 0; k < nValues; k++) {
      if (c === k) continue; // d(c,c) = 0
      const d = dist(c, k);
      dObserved += o[c][k] * d;
      dExpected += nc[c] * nc[k] * d;
    }
  }
  dObserved /= nTotal;
  dExpected /= nTotal * (nTotal - 1);

  if (dExpected === 0) return 1; // No expected disagreement possible

  return 1 - dObserved / dExpected;
}

/**
 * Build a confusion matrix for two sets of ratings.
 *
 * Pairs in which either label is not present in `labels` are skipped.
 *
 * @param actual - ground truth labels
 * @param predicted - predicted labels
 * @param labels - ordered list of label values (defines row/column order)
 * @returns 2D array where result[i][j] = count of (actual=labels[i], predicted=labels[j])
 * @throws Error if `actual` and `predicted` differ in length
 */
export function confusionMatrix(
  actual: string[],
  predicted: string[],
  labels: string[],
): number[][] {
  if (actual.length !== predicted.length) {
    throw new Error("Arrays must have the same length");
  }

  const labelIndex = new Map<string, number>();
  labels.forEach((label, i) => labelIndex.set(label, i));

  const matrix: number[][] = Array.from({ length: labels.length }, () =>
    new Array<number>(labels.length).fill(0),
  );

  for (let i = 0; i < actual.length; i++) {
    const ai = labelIndex.get(actual[i]);
    const pi = labelIndex.get(predicted[i]);
    if (ai !== undefined && pi !== undefined) {
      matrix[ai][pi]++;
    }
  }

  return matrix;
}

/**
 * Raw agreement rate: proportion of items where ALL raters agree.
 *
 * An item with a single rating counts as an agreement. An item with no
 * ratings never counts as an agreement but still counts in the denominator.
 *
 * @param labels - items x raters matrix (each inner array is the ratings for one item)
 * @returns proportion of items with complete agreement (0 to 1); 0 for no items
 */
export function agreementRate(labels: string[][]): number {
  if (labels.length === 0) return 0;

  let agreements = 0;
  for (const itemRatings of labels) {
    if (itemRatings.length === 0) continue;
    const first = itemRatings[0];
    if (itemRatings.every((r) => r === first)) agreements++;
  }

  return agreements / labels.length;
}

/**
 * Per-category agreement: for each category, what proportion of items
 * assigned that category by at least one rater have full agreement?
 *
 * Labels are grouped by `paragraphId`; `annotatorId` is not consulted, so
 * every label record counts as one rating for its paragraph.
 *
 * @param labels - flat array of label records with category, annotatorId, paragraphId
 * @param categories - list of categories to compute agreement for
 * @returns record mapping each category to its agreement rate (0 to 1);
 *   0 for a category that no paragraph was assigned
 */
export function perCategoryAgreement(
  labels: {
    category: string;
    annotatorId: string;
    paragraphId: string;
  }[],
  categories: string[],
): Record<string, number> {
  // Group labels by paragraph
  const byParagraph = new Map<string, string[]>();
  for (const label of labels) {
    const existing = byParagraph.get(label.paragraphId);
    if (existing === undefined) {
      byParagraph.set(label.paragraphId, [label.category]);
    } else {
      existing.push(label.category);
    }
  }

  const result: Record<string, number> = {};
  for (const category of categories) {
    let relevant = 0;
    let agreed = 0;
    for (const ratings of byParagraph.values()) {
      // Only paragraphs where this category appears in at least one rating
      if (!ratings.includes(category)) continue;
      relevant++;
      // Full agreement means ALL ratings for the paragraph are this category
      if (ratings.every((r) => r === category)) agreed++;
    }
    result[category] = relevant > 0 ? agreed / relevant : 0;
  }

  return result;
}