/**
 * Analyze the 348 annotated paragraphs with no cybersecurity keywords.
 * Reports label distribution to decide: keep or exclude from training.
 *
 * Usage: bun ts/scripts/analyze-no-cyber.ts
 */
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// `URL.pathname` stays percent-encoded (a space becomes "%20"), which is not a
// usable filesystem path; `fileURLToPath` decodes it and handles drive letters.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
const QUALITY_PATH = `${DATA_DIR}/paragraphs/quality/quality-scores.jsonl`;
const ANNOTATIONS_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
const TRAINING_PATH = `${DATA_DIR}/paragraphs/training.patched.jsonl`;

/** One row of the quality-scores file (only the fields this script declares). */
interface QualityScore {
  id: string;
  issues: string[];
  quality_tier: string;
}

/** One row of the stage-1 annotations file: a single label for a paragraph. */
interface Annotation {
  paragraphId: string;
  label: {
    content_category: string;
    specificity_level: number;
    category_confidence: string;
    specificity_confidence: string;
    reasoning: string;
  };
  provenance: { modelId: string };
}

/**
 * Lazily yield every non-blank line of a JSONL file, parsed as `T`.
 *
 * The cast is unchecked: rows are trusted to match `T`, nothing is validated.
 *
 * @param path - Filesystem path of the JSONL file.
 * @throws If the file cannot be read or a non-blank line is not valid JSON.
 */
function* readJsonl<T>(path: string): Generator<T> {
  for (const line of readFileSync(path, "utf-8").split("\n")) {
    if (!line.trim()) continue;
    yield JSON.parse(line) as T;
  }
}

// Load quality scores — find no-cyber paragraphs
const noCyberIds = new Set<string>();
for (const score of readJsonl<QualityScore>(QUALITY_PATH)) {
  if (score.issues.includes("no_cyber_keywords")) noCyberIds.add(score.id);
}
console.error(`No-cyber paragraphs (all): ${noCyberIds.size}`);

// Load training set IDs
const trainingIds = new Set<string>();
for (const paragraph of readJsonl<{ id: string }>(TRAINING_PATH)) {
  trainingIds.add(paragraph.id);
}

// Filter to annotated no-cyber paragraphs (those also present in the training file)
const annotatedNoCyber = new Set([...noCyberIds].filter((id) => trainingIds.has(id)));
console.error(`No-cyber paragraphs (annotated): ${annotatedNoCyber.size}`);

// Load annotations for these paragraphs, grouped by paragraph id
const annotations = new Map<string, Annotation[]>();
for (const ann of readJsonl<Annotation>(ANNOTATIONS_PATH)) {
  if (!annotatedNoCyber.has(ann.paragraphId)) continue;
  const group = annotations.get(ann.paragraphId);
  if (group) group.push(ann);
  else annotations.set(ann.paragraphId, [ann]);
}
console.error(`Paragraphs with annotations: ${annotations.size}\n`);

/**
 * Majority vote: find the most frequent value in `items`.
 *
 * Ties go to the value that occurs first in `items`: the tally map iterates in
 * insertion order and only a strictly larger count replaces the current best.
 *
 * @param items - Votes to tally; must contain at least one element.
 * @returns The winning value and how many times it occurred.
 * @throws Error if `items` is empty.
 */
function majority<T>(items: readonly T[]): { value: T; count: number } {
  const counts = new Map<T, number>();
  for (const item of items) counts.set(item, (counts.get(item) ?? 0) + 1);

  let best: { value: T; count: number } | undefined;
  for (const [value, count] of counts) {
    if (best === undefined || count > best.count) best = { value, count };
  }
  if (best === undefined) {
    throw new Error("majority() requires at least one item");
  }
  return best;
}

/** Increment the counter stored under `key`, starting from zero. */
function bump<K>(tally: Map<K, number>, key: K): void {
  tally.set(key, (tally.get(key) ?? 0) + 1);
}

/** Format `part / whole` as a one-decimal percentage; "0.0" when `whole` is 0 (avoids "NaN"). */
function pct(part: number, whole: number): string {
  return whole === 0 ? "0.0" : ((part / whole) * 100).toFixed(1);
}

/** A paragraph whose consensus category is something other than "None/Other". */
type NonOtherCase = { pid: string; cat: string; spec: number; anns: Annotation[] };

// Category distribution (consensus)
const catDist = new Map<string, number>();
const specDist = new Map<number, number>();
const confDist = new Map<string, number>();
let conflicts = 0;

// Per-paragraph details for interesting cases
const nonOther: NonOtherCase[] = [];

for (const [pid, anns] of annotations) {
  const votes: Annotation[] = anns;
  const catVote = majority<string>(votes.map((a) => a.label.content_category));
  const specVote = majority<number>(votes.map((a) => a.label.specificity_level));

  bump(catDist, catVote.value);
  bump(specDist, specVote.value);

  // A conflict means no category received at least two votes.
  // NOTE(review): presumably assumes three annotators per paragraph — confirm.
  if (catVote.count < 2) conflicts++;

  // Track confidence across every individual annotation, not just the consensus
  for (const ann of votes) bump(confDist, ann.label.category_confidence);

  if (catVote.value !== "None/Other") {
    nonOther.push({ pid, cat: catVote.value, spec: specVote.value, anns: votes });
  }
}

// ── Report ──────────────────────────────────────────────────────────────
const paragraphCount = annotations.size;

console.log("═══ NO-CYBER-KEYWORD PARAGRAPH ANALYSIS ═══\n");
console.log(`Total annotated no-cyber paragraphs: ${paragraphCount}`);
console.log(`Conflicts (no majority): ${conflicts}\n`);

console.log("─── Category Distribution (Consensus) ───");
const categoriesByCount = [...catDist.entries()].sort((a, b) => b[1] - a[1]);
for (const [cat, count] of categoriesByCount) {
  console.log(` ${cat.padEnd(30)} ${count} (${pct(count, paragraphCount)}%)`);
}

console.log("\n─── Specificity Distribution (Consensus) ───");
for (const level of [1, 2, 3, 4]) {
  const count = specDist.get(level) ?? 0;
  console.log(` Level ${level}: ${count} (${pct(count, paragraphCount)}%)`);
}

console.log("\n─── Confidence Distribution (All Models) ───");
// Loop-invariant: total number of individual annotations tallied above.
const confTotal = [...confDist.values()].reduce((a, b) => a + b, 0);
for (const conf of ["high", "medium", "low"]) {
  const count = confDist.get(conf) ?? 0;
  console.log(` ${conf}: ${count} (${pct(count, confTotal)}%)`);
}

console.log(`\n─── Non-"None/Other" Paragraphs: ${nonOther.length} ───`);
if (nonOther.length > 0) {
  console.log("These are the concerning ones — labeled as real categories but have no cyber keywords.\n");

  // Load actual paragraph text for these
  const wantedIds = new Set(nonOther.map((n) => n.pid));
  const textMap = new Map<string, string>();
  for (const line of readFileSync(TRAINING_PATH, "utf-8").split("\n")) {
    if (!line.trim()) continue;
    const p = JSON.parse(line) as { id: string; text: string };
    if (wantedIds.has(p.id)) textMap.set(p.id, p.text);
  }

  // Show samples (first 10 only)
  for (const item of nonOther.slice(0, 10)) {
    const text = textMap.get(item.pid) ?? "(text not found)";
    const modelVotes = item.anns
      .map((a) => {
        const { modelId } = a.provenance;
        // Use the segment after the first "/"; fall back to the whole id when
        // there is no "/" so the report never prints "undefined".
        const shortName = modelId.split("/")[1] ?? modelId;
        return `${shortName}: ${a.label.content_category}`;
      })
      .join(", ");
    console.log(` [${item.cat} / Spec ${item.spec}] ${item.pid}`);
    console.log(` Models: ${modelVotes}`);
    console.log(` Text: ${text.substring(0, 150)}...`);
    console.log();
  }
}

// Summary recommendation
const noneOtherCount = catDist.get("None/Other") ?? 0;
const noneOtherPct = pct(noneOtherCount, paragraphCount);

console.log("─── RECOMMENDATION ───");
if (nonOther.length < 50) {
  console.log(` ${noneOtherPct}% labeled None/Other. Only ${nonOther.length} labeled as real categories.`);
  console.log(` → EXCLUDE ${nonOther.length} non-None/Other paragraphs from training (likely section bleed).`);
  console.log(` → KEEP ${noneOtherCount} None/Other paragraphs (correct labels for non-cyber content).`);
} else {
  console.log(` WARNING: ${nonOther.length} paragraphs labeled as real categories — investigate further.`);
}