165 lines
6.4 KiB
TypeScript
165 lines
6.4 KiB
TypeScript
/**
 * Analyze the 348 annotated paragraphs with no cybersecurity keywords.
 * Reports the label distribution to help decide whether to keep them in
 * training or exclude them.
 *
 * Usage: bun ts/scripts/analyze-no-cyber.ts
 */
|
|
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";
|
|
|
|
// Resolve the data directory relative to this module.
// fileURLToPath() is used instead of URL.pathname because pathname stays
// percent-encoded (a space becomes "%20") and is not a valid filesystem path
// on Windows; fileURLToPath() decodes it into a real platform path.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));

// JSONL file whose rows carry `id` and `issues` (read as QualityScore).
const QUALITY_PATH = `${DATA_DIR}/paragraphs/quality/quality-scores.jsonl`;

// JSONL file of annotations, one per line (read as Annotation).
const ANNOTATIONS_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;

// JSONL file of training paragraphs; rows are read for `id` and `text`.
const TRAINING_PATH = `${DATA_DIR}/paragraphs/training.patched.jsonl`;
|
|
|
|
/** One row of the quality-scores JSONL file, as consumed by this script. */
interface QualityScore {
  /** Paragraph identifier; matched against the `id` of training rows. */
  id: string;

  /** Issue flags; this script looks for the "no_cyber_keywords" flag. */
  issues: string[];

  /** Quality tier label. Not read by this script. */
  quality_tier: string;
}
|
|
|
|
/** Label payload attached to a single annotation. */
interface AnnotationLabel {
  /** Category name; the consensus value is compared against "None/Other". */
  content_category: string;

  /** Specificity level; the report prints levels 1 through 4. */
  specificity_level: number;

  /** Confidence in the category; the report prints "high", "medium" and "low". */
  category_confidence: string;

  /** Confidence in the specificity level. Not read by this script. */
  specificity_confidence: string;

  /** Free-text rationale. Not read by this script. */
  reasoning: string;
}

/** One row of the stage-1 annotations JSONL file, as consumed by this script. */
interface Annotation {
  /** Identifier of the annotated paragraph. */
  paragraphId: string;

  /** The labels assigned by this annotation. */
  label: AnnotationLabel;

  /** Origin of the annotation; `modelId` is split on "/" for display. */
  provenance: { modelId: string };
}
|
|
|
|
/**
 * Lazily read a JSON Lines file, yielding one parsed row per non-blank line.
 *
 * Rows are cast to `T` without runtime validation, so the caller is
 * responsible for the file actually having that shape.
 *
 * @param path Filesystem path of the JSONL file (read synchronously as UTF-8).
 * @returns A generator over the parsed rows, in file order.
 * @throws Error naming the file and 1-based line number when a line is not valid JSON.
 */
function* readJsonl<T>(path: string): Generator<T> {
  const lines = readFileSync(path, "utf-8").split("\n");
  for (const [index, line] of lines.entries()) {
    if (!line.trim()) continue;
    let row: T;
    try {
      row = JSON.parse(line) as T;
    } catch (err: unknown) {
      // Re-throw with location so a corrupt row can be found in a large file.
      const reason = err instanceof Error ? err.message : String(err);
      throw new Error(`${path}:${index + 1}: invalid JSON (${reason})`);
    }
    yield row;
  }
}

// Load quality scores — collect IDs flagged with "no_cyber_keywords".
const noCyberIds = new Set<string>();
for (const score of readJsonl<QualityScore>(QUALITY_PATH)) {
  if (score.issues.includes("no_cyber_keywords")) noCyberIds.add(score.id);
}
console.error(`No-cyber paragraphs (all): ${noCyberIds.size}`);

// Load training set IDs.
const trainingIds = new Set<string>();
for (const paragraph of readJsonl<{ id: string }>(TRAINING_PATH)) {
  trainingIds.add(paragraph.id);
}

// Keep only the no-cyber paragraphs that are present in the training file;
// the log line below reports these as "annotated".
const annotatedNoCyber = new Set([...noCyberIds].filter((id) => trainingIds.has(id)));
console.error(`No-cyber paragraphs (annotated): ${annotatedNoCyber.size}`);

// Group the annotations of those paragraphs by paragraph ID.
const annotations = new Map<string, Annotation[]>();
for (const ann of readJsonl<Annotation>(ANNOTATIONS_PATH)) {
  if (!annotatedNoCyber.has(ann.paragraphId)) continue;
  const bucket = annotations.get(ann.paragraphId);
  if (bucket) {
    bucket.push(ann);
  } else {
    annotations.set(ann.paragraphId, [ann]);
  }
}
console.error(`Paragraphs with annotations: ${annotations.size}\n`);
|
|
|
|
/**
 * Majority vote: find the most frequent value in `items`.
 *
 * Values are compared the way `Map` keys are (SameValueZero), so this is
 * intended for primitives such as strings and numbers. When several values
 * share the highest count, the one that appears first in `items` wins.
 *
 * @param items Non-empty list of votes.
 * @returns The winning value and how many times it occurred.
 * @throws RangeError if `items` is empty — there is no value to return, and
 *   handing back `undefined` typed as `T` would silently corrupt callers.
 */
function majority<T>(items: T[]): { value: T; count: number } {
  const tally = new Map<T, number>();
  for (const item of items) tally.set(item, (tally.get(item) ?? 0) + 1);

  // Map iteration follows first-insertion order, and only a strictly larger
  // count replaces the leader, so ties resolve to the earliest-seen value.
  let winner: { value: T; count: number } | undefined;
  for (const [value, count] of tally) {
    if (winner === undefined || count > winner.count) winner = { value, count };
  }

  if (winner === undefined) {
    throw new RangeError("majority() requires at least one item");
  }
  return winner;
}
|
|
|
|
// Consensus tallies: one entry per paragraph for category and specificity,
// one entry per individual annotation for confidence.
const catDist = new Map<string, number>();
const specDist = new Map<number, number>();
const confDist = new Map<string, number>();
let conflicts = 0;

// Paragraphs whose consensus category is something other than "None/Other",
// kept with their raw annotations so the report can show samples.
const nonOther: { pid: string; cat: string; spec: number; anns: Annotation[] }[] = [];

/** Add one to the counter stored under `key`, starting from zero. */
const bump = <K>(dist: Map<K, number>, key: K): void => {
  dist.set(key, (dist.get(key) ?? 0) + 1);
};

annotations.forEach((anns, pid) => {
  const categories = anns.map((a) => a.label.content_category);
  const levels = anns.map((a) => a.label.specificity_level);
  const catVote = majority(categories);
  const specVote = majority(levels);

  bump(catDist, catVote.value);
  bump(specDist, specVote.value);

  // A paragraph counts as a conflict when fewer than two annotations agree
  // on the winning category.
  // NOTE(review): this equals "no majority" only for a specific number of
  // annotators per paragraph — confirm how many stage-1 annotators there are.
  if (catVote.count < 2) {
    conflicts += 1;
  }

  // Confidence is tallied across every annotation, not per paragraph.
  anns.forEach((a) => bump(confDist, a.label.category_confidence));

  if (catVote.value !== "None/Other") {
    nonOther.push({ pid, cat: catVote.value, spec: specVote.value, anns });
  }
});
|
|
|
|
// ── Report ──────────────────────────────────────────────────────────────

/**
 * Format `part / whole` as a percentage with one decimal place.
 * Returns "0.0" when `whole` is 0 so an empty data set prints "0.0%"
 * instead of "NaN%".
 */
const percentOf = (part: number, whole: number): string =>
  (whole === 0 ? 0 : (part / whole) * 100).toFixed(1);

const paragraphCount = annotations.size;

console.log("═══ NO-CYBER-KEYWORD PARAGRAPH ANALYSIS ═══\n");
console.log(`Total annotated no-cyber paragraphs: ${paragraphCount}`);
console.log(`Conflicts (no majority): ${conflicts}\n`);

// Categories, most frequent first.
console.log("─── Category Distribution (Consensus) ───");
for (const [cat, count] of [...catDist.entries()].sort((a, b) => b[1] - a[1])) {
  console.log(` ${cat.padEnd(30)} ${count} (${percentOf(count, paragraphCount)}%)`);
}

console.log("\n─── Specificity Distribution (Consensus) ───");
for (const level of [1, 2, 3, 4]) {
  const count = specDist.get(level) ?? 0;
  console.log(` Level ${level}: ${count} (${percentOf(count, paragraphCount)}%)`);
}

console.log("\n─── Confidence Distribution (All Models) ───");
// The denominator is the number of individual annotations; computed once
// because it does not change between rows.
const confidenceTotal = [...confDist.values()].reduce((a, b) => a + b, 0);
for (const conf of ["high", "medium", "low"]) {
  const count = confDist.get(conf) ?? 0;
  console.log(` ${conf}: ${count} (${percentOf(count, confidenceTotal)}%)`);
}

console.log(`\n─── Non-"None/Other" Paragraphs: ${nonOther.length} ───`);
if (nonOther.length > 0) {
  console.log("These are the concerning ones — labeled as real categories but have no cyber keywords.\n");

  const sampleLimit = 10;
  const previewChars = 150;

  // Load the paragraph text for the flagged paragraphs from the training file.
  const nonOtherIds = new Set(nonOther.map((n) => n.pid));
  const textMap = new Map<string, string>();
  for (const line of readFileSync(TRAINING_PATH, "utf-8").split("\n")) {
    if (!line.trim()) continue;
    const p = JSON.parse(line) as { id: string; text: string };
    if (nonOtherIds.has(p.id)) textMap.set(p.id, p.text);
  }

  // Show the first few samples with each model's category vote.
  for (const item of nonOther.slice(0, sampleLimit)) {
    const text = textMap.get(item.pid) ?? "(text not found)";
    const modelVotes = item.anns
      .map((a) => {
        // Show the segment after the first "/" of the model ID; fall back to
        // the full ID when it has no "/" so the output never reads "undefined".
        const [, shortName] = a.provenance.modelId.split("/");
        return `${shortName ?? a.provenance.modelId}: ${a.label.content_category}`;
      })
      .join(", ");
    // Only add an ellipsis when the text was actually cut.
    const preview = text.length > previewChars ? `${text.substring(0, previewChars)}...` : text;

    console.log(` [${item.cat} / Spec ${item.spec}] ${item.pid}`);
    console.log(` Models: ${modelVotes}`);
    console.log(` Text: ${preview}`);
    console.log();
  }
}
|
|
|
|
// Summary recommendation.
// At or above this many non-"None/Other" paragraphs, the script stops
// recommending an automatic exclusion and asks for manual investigation.
const REVIEW_THRESHOLD = 50;

const noneOtherCount = catDist.get("None/Other") ?? 0;
// Guard the denominator so an empty data set prints "0.0%" instead of "NaN%".
const noneOtherPct =
  annotations.size === 0 ? "0.0" : ((noneOtherCount / annotations.size) * 100).toFixed(1);

console.log("─── RECOMMENDATION ───");
if (annotations.size === 0) {
  // With no data, an EXCLUDE/KEEP recommendation would be meaningless.
  console.log(" No annotated no-cyber paragraphs found — nothing to recommend.");
} else if (nonOther.length < REVIEW_THRESHOLD) {
  console.log(` ${noneOtherPct}% labeled None/Other. Only ${nonOther.length} labeled as real categories.`);
  console.log(` → EXCLUDE ${nonOther.length} non-None/Other paragraphs from training (likely section bleed).`);
  console.log(` → KEEP ${noneOtherCount} None/Other paragraphs (correct labels for non-cyber content).`);
} else {
  console.log(` WARNING: ${nonOther.length} paragraphs labeled as real categories — investigate further.`);
}
|