SEC-cyBERT/ts/scripts/analyze-no-cyber.ts
2026-03-29 20:33:39 -04:00

165 lines
6.4 KiB
TypeScript

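/**
* Analyze the annotated paragraphs that contain no cybersecurity keywords
* (348 when this script was written; the actual count is recomputed on
* every run). Reports their consensus label distribution to decide:
* keep or exclude from training.
*
* Progress counts are written to stderr; the report is written to stdout.
*
* Usage: bun ts/scripts/analyze-no-cyber.ts
*/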
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";
// Resolve the `data` directory two levels above this script.
// fileURLToPath (rather than URL#pathname) decodes percent-escapes, so a
// checkout path containing spaces or non-ASCII characters still maps to a
// real filesystem path.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
// Per-paragraph quality records (JSONL); each record carries issue flags.
const QUALITY_PATH = `${DATA_DIR}/paragraphs/quality/quality-scores.jsonl`;
// Stage-1 annotations (JSONL), one annotation object per line.
const ANNOTATIONS_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
// Training-set paragraphs (JSONL) with at least `id` and `text` fields.
const TRAINING_PATH = `${DATA_DIR}/paragraphs/training.patched.jsonl`;
/** One record of the quality-scores JSONL file. */
interface QualityScore {
  /** Paragraph identifier; matched against training-set paragraph IDs. */
  id: string;
  /** Issue flags raised for the paragraph, e.g. "no_cyber_keywords". */
  issues: Array<string>;
  /** Quality tier label (not read by this script). */
  quality_tier: string;
}
/** Label payload attached to a single annotation. */
interface AnnotationLabel {
  /** Category name; this script treats "None/Other" as the non-cyber class. */
  content_category: string;
  /** Specificity level; the report tabulates levels 1 through 4. */
  specificity_level: number;
  /** Confidence in the category; the report tallies "high", "medium", "low". */
  category_confidence: string;
  /** Confidence in the specificity level (not read by this script). */
  specificity_confidence: string;
  /** Free-text rationale (not read by this script). */
  reasoning: string;
}

/** One record of the stage-1 annotations JSONL file. */
interface Annotation {
  /** ID of the annotated paragraph; used to group annotations. */
  paragraphId: string;
  label: AnnotationLabel;
  /** Which model produced the annotation; `modelId` is split on "/" for display. */
  provenance: { modelId: string };
}
// Scan the quality-score records and collect the ID of every paragraph
// flagged with the "no_cyber_keywords" issue.
const noCyberIds = new Set<string>();
const qualityLines = readFileSync(QUALITY_PATH, "utf-8").split("\n");
for (const rawLine of qualityLines) {
  // Skip blank lines (including the trailing one after the final newline).
  if (rawLine.trim() === "") continue;
  const score = JSON.parse(rawLine) as QualityScore;
  if (!score.issues.includes("no_cyber_keywords")) continue;
  noCyberIds.add(score.id);
}
console.error(`No-cyber paragraphs (all): ${noCyberIds.size}`);
// Collect the ID of every paragraph present in the training set.
const trainingIds = new Set<string>(
  readFileSync(TRAINING_PATH, "utf-8")
    .split("\n")
    .filter((rawLine) => rawLine.trim() !== "")
    .map((rawLine) => (JSON.parse(rawLine) as { id: string }).id),
);
// Keep only the no-cyber paragraphs that also appear in the training set.
const annotatedNoCyber = new Set<string>();
for (const candidateId of noCyberIds) {
  if (trainingIds.has(candidateId)) annotatedNoCyber.add(candidateId);
}
console.error(`No-cyber paragraphs (annotated): ${annotatedNoCyber.size}`);
// Group the stage-1 annotations by paragraph ID, keeping only those that
// belong to the annotated no-cyber paragraphs.
const annotations = new Map<string, Annotation[]>();
const annotationLines = readFileSync(ANNOTATIONS_PATH, "utf-8").split("\n");
for (const rawLine of annotationLines) {
  if (rawLine.trim() === "") continue;
  const parsed = JSON.parse(rawLine) as Annotation;
  const pid = parsed.paragraphId;
  if (!annotatedNoCyber.has(pid)) continue;
  const bucket = annotations.get(pid);
  if (bucket === undefined) {
    annotations.set(pid, [parsed]);
  } else {
    bucket.push(parsed);
  }
}
console.error(`Paragraphs with annotations: ${annotations.size}\n`);
/**
 * Plurality vote: returns the most frequent value in `items` together with
 * the number of times it occurs.
 *
 * Ties go to the value that appears first in `items`. For an empty array
 * the result is `{ value: undefined, count: 0 }`.
 */
function majority<T>(items: T[]): { value: T; count: number } {
  // Phase 1: tally occurrences. Map preserves first-insertion order, which
  // is what makes the tie-break below deterministic.
  const tally = new Map<T, number>();
  items.forEach((entry) => {
    const seen = tally.get(entry);
    tally.set(entry, seen === undefined ? 1 : seen + 1);
  });

  // Phase 2: pick the entry with the strictly highest tally.
  let winner: { value: T; count: number } = { value: items[0]!, count: 0 };
  tally.forEach((occurrences, candidate) => {
    if (occurrences > winner.count) {
      winner = { value: candidate, count: occurrences };
    }
  });
  return winner;
}
// Consensus tallies across paragraphs: winning category, winning
// specificity level, and every individual category-confidence vote.
const catDist = new Map<string, number>();
const specDist = new Map<number, number>();
const confDist = new Map<string, number>();
// Number of paragraphs whose winning category lacks a strict majority.
let conflicts = 0;
// Paragraphs whose consensus category is something other than "None/Other".
const nonOther: { pid: string; cat: string; spec: number; anns: Annotation[] }[] = [];

for (const [pid, anns] of annotations) {
  const catVote = majority(anns.map((a) => a.label.content_category));
  const specVote = majority(anns.map((a) => a.label.specificity_level));
  catDist.set(catVote.value, (catDist.get(catVote.value) ?? 0) + 1);
  specDist.set(specVote.value, (specDist.get(specVote.value) ?? 0) + 1);

  // "No majority" means the winner holds at most half of this paragraph's
  // votes. Comparing against anns.length (instead of a fixed 2) keeps the
  // count correct for any number of annotations per paragraph.
  if (catVote.count * 2 <= anns.length) conflicts++;

  // Confidence is tallied per annotation, not per paragraph.
  for (const ann of anns) {
    const confidence = ann.label.category_confidence;
    confDist.set(confidence, (confDist.get(confidence) ?? 0) + 1);
  }

  if (catVote.value !== "None/Other") {
    nonOther.push({ pid, cat: catVote.value, spec: specVote.value, anns });
  }
}
// ── Report ──────────────────────────────────────────────────────────────
// Formats count/total as a one-decimal percentage. An empty denominator
// yields "0.0" instead of "NaN", so the report stays readable when no
// annotations were loaded.
const formatShare = (count: number, total: number): string =>
  (total === 0 ? 0 : (count / total) * 100).toFixed(1);

console.log("═══ NO-CYBER-KEYWORD PARAGRAPH ANALYSIS ═══\n");
console.log(`Total annotated no-cyber paragraphs: ${annotations.size}`);
console.log(`Conflicts (no majority): ${conflicts}\n`);

console.log("─── Category Distribution (Consensus) ───");
// Most frequent category first.
for (const [cat, count] of [...catDist.entries()].sort((a, b) => b[1] - a[1])) {
  console.log(` ${cat.padEnd(30)} ${count} (${formatShare(count, annotations.size)}%)`);
}

console.log("\n─── Specificity Distribution (Consensus) ───");
for (const level of [1, 2, 3, 4]) {
  const count = specDist.get(level) ?? 0;
  console.log(` Level ${level}: ${count} (${formatShare(count, annotations.size)}%)`);
}

console.log("\n─── Confidence Distribution (All Models) ───");
// The denominator is every confidence vote recorded, computed once up front.
const confidenceVotes = [...confDist.values()].reduce((a, b) => a + b, 0);
for (const conf of ["high", "medium", "low"]) {
  const count = confDist.get(conf) ?? 0;
  console.log(` ${conf}: ${count} (${formatShare(count, confidenceVotes)}%)`);
}
console.log(`\n─── Non-"None/Other" Paragraphs: ${nonOther.length} ───`);
if (nonOther.length > 0) {
  console.log("These are the concerning ones — labeled as real categories but have no cyber keywords.\n");

  // Only the first few paragraphs are printed, so only their text is loaded.
  const samples = nonOther.slice(0, 10);
  const previewChars = 150;
  const samplePids = new Set(samples.map((n) => n.pid));
  const textMap = new Map<string, string>();
  for (const line of readFileSync(TRAINING_PATH, "utf-8").split("\n")) {
    if (!line.trim()) continue;
    const p = JSON.parse(line) as { id: string; text: string };
    if (samplePids.has(p.id)) textMap.set(p.id, p.text);
  }

  // Show samples
  for (const item of samples) {
    const text = textMap.get(item.pid) ?? "(text not found)";
    const modelVotes = item.anns
      .map((a) => {
        const modelId = a.provenance.modelId;
        // Show the segment after the first "/"; fall back to the full ID
        // when there is no slash, so "undefined" is never printed.
        const shortName = modelId.split("/")[1] ?? modelId;
        return `${shortName}: ${a.label.content_category}`;
      })
      .join(", ");
    // Append an ellipsis only when the text was actually cut.
    const preview = text.length > previewChars ? `${text.substring(0, previewChars)}...` : text;
    console.log(` [${item.cat} / Spec ${item.spec}] ${item.pid}`);
    console.log(` Models: ${modelVotes}`);
    console.log(` Text: ${preview}`);
    console.log();
  }
}
// Summary recommendation
// Below this many real-category paragraphs the script recommends excluding
// them outright; at or above it, it asks for further investigation.
const nonOtherExcludeThreshold = 50;
const noneOtherCount = catDist.get("None/Other") ?? 0;
// Guard the denominator so an empty run reports "0.0" rather than "NaN".
const noneOtherPct = (
  annotations.size === 0 ? 0 : (noneOtherCount / annotations.size) * 100
).toFixed(1);
console.log("─── RECOMMENDATION ───");
if (annotations.size === 0) {
  // No data: an exclude/keep recommendation would be meaningless.
  console.log(" No annotated no-cyber paragraphs found — nothing to recommend.");
} else if (nonOther.length < nonOtherExcludeThreshold) {
  console.log(` ${noneOtherPct}% labeled None/Other. Only ${nonOther.length} labeled as real categories.`);
  console.log(` → EXCLUDE ${nonOther.length} non-None/Other paragraphs from training (likely section bleed).`);
  console.log(` → KEEP ${noneOtherCount} None/Other paragraphs (correct labels for non-cyber content).`);
} else {
  console.log(` WARNING: ${nonOther.length} paragraphs labeled as real categories — investigate further.`);
}