// SEC-cyBERT/ts/scripts/analyze-no-cyber.ts

/**
 * Analyze the 348 annotated paragraphs with no cybersecurity keywords.
 * Reports label distribution to decide: keep or exclude from training.
 *
 * Usage: bun ts/scripts/analyze-no-cyber.ts
 */
import { readFileSync } from "node:fs";
import { fileURLToPath } from "node:url";

// Resolve the data directory relative to this script's location.
// fileURLToPath is used instead of URL#pathname because pathname stays
// percent-encoded (a space in a parent directory becomes "%20") and is not a
// valid native path on Windows; fileURLToPath decodes it into a usable path.
const DATA_DIR = fileURLToPath(new URL("../../data", import.meta.url));
const QUALITY_PATH = `${DATA_DIR}/paragraphs/quality/quality-scores.jsonl`;
const ANNOTATIONS_PATH = `${DATA_DIR}/annotations/stage1.jsonl`;
const TRAINING_PATH = `${DATA_DIR}/paragraphs/training.patched.jsonl`;

/** Shape assumed for one parsed line of the quality-scores JSONL file. */
interface QualityScore {
  /** Paragraph ID; compared against the `id` of training-set records. */
  id: string;
  /** Issue tags for the paragraph; this script only checks for "no_cyber_keywords". */
  issues: string[];
  /** Not read by this script. */
  quality_tier: string;
}

/**
 * Shape assumed for one parsed line of the stage-1 annotations JSONL file.
 * A paragraph may have several of these (presumably one per model — confirm
 * against the annotation pipeline).
 */
interface Annotation {
  /** ID of the annotated paragraph; used to group annotations per paragraph. */
  paragraphId: string;
  label: {
    /** Category label; "None/Other" is the value this script treats as non-cyber content. */
    content_category: string;
    /** Specificity label; the report tabulates levels 1 through 4. */
    specificity_level: number;
    /** Confidence in the category; the report tabulates "high", "medium" and "low". */
    category_confidence: string;
    /** Not read by this script. */
    specificity_confidence: string;
    /** Not read by this script. */
    reasoning: string;
  };
  /** The report prints the segment of `modelId` after the first "/" (presumably "provider/model" form). */
  provenance: { modelId: string };
}

// Scan the quality-score records and collect the ID of every paragraph
// tagged with the "no_cyber_keywords" issue.
const noCyberIds = new Set<string>();
const qualityLines = readFileSync(QUALITY_PATH, "utf-8").split("\n");
for (const rawLine of qualityLines) {
  // Blank lines (e.g. after the final newline) carry no record.
  if (rawLine.trim() === "") continue;
  const score = JSON.parse(rawLine) as QualityScore;
  if (!score.issues.includes("no_cyber_keywords")) continue;
  noCyberIds.add(score.id);
}
console.error(`No-cyber paragraphs (all): ${noCyberIds.size}`);

// Gather the ID of every record in the training-set file.
const trainingIds = new Set<string>(
  readFileSync(TRAINING_PATH, "utf-8")
    .split("\n")
    .filter((row) => row.trim() !== "") // ignore blank lines
    .map((row) => (JSON.parse(row) as { id: string }).id),
);

// Intersect the two ID sets: keep only the no-cyber paragraphs that also
// appear in the training set (treated here as the annotated paragraphs).
const annotatedNoCyber = new Set<string>();
for (const candidateId of noCyberIds) {
  if (trainingIds.has(candidateId)) annotatedNoCyber.add(candidateId);
}
console.error(`No-cyber paragraphs (annotated): ${annotatedNoCyber.size}`);

// Read the stage-1 annotations and group them by paragraph ID, keeping only
// annotations that belong to the selected no-cyber paragraphs.
const annotations = new Map<string, Annotation[]>();
for (const rawLine of readFileSync(ANNOTATIONS_PATH, "utf-8").split("\n")) {
  if (rawLine.trim().length === 0) continue;
  const parsed = JSON.parse(rawLine) as Annotation;
  const key = parsed.paragraphId;
  if (!annotatedNoCyber.has(key)) continue;

  const bucket = annotations.get(key);
  if (bucket === undefined) {
    annotations.set(key, [parsed]);
  } else {
    bucket.push(parsed);
  }
}
console.error(`Paragraphs with annotations: ${annotations.size}\n`);

/**
 * Plurality vote: returns the most frequent value in `items` together with
 * how many times it occurs.
 *
 * Ties go to the value that appears first in `items`. For an empty array the
 * result has `count` 0 and a `value` that is `undefined` at runtime.
 */
function majority<T>(items: T[]): { value: T; count: number } {
  // Tally occurrences; Map iteration order is first-insertion order, which is
  // what makes the tie-break "first seen wins".
  const tally = new Map<T, number>();
  items.forEach((item) => {
    tally.set(item, (tally.get(item) ?? 0) + 1);
  });

  let winner: { value: T; count: number } = { value: items[0]!, count: 0 };
  tally.forEach((votes, candidate) => {
    // Strict comparison so that a later value with an equal tally never displaces an earlier one.
    if (votes > winner.count) winner = { value: candidate, count: votes };
  });
  return winner;
}

// Consensus tallies: one count per paragraph for category and specificity,
// and one count per individual annotation for category confidence.
const catDist = new Map<string, number>();
const specDist = new Map<number, number>();
const confDist = new Map<string, number>();
let conflicts = 0;

// Paragraphs whose consensus category is anything other than "None/Other",
// kept with their annotations so they can be printed in detail.
const nonOther: { pid: string; cat: string; spec: number; anns: Annotation[] }[] = [];

annotations.forEach((anns, pid) => {
  const { value: cat, count: catVotes } = majority(anns.map((a) => a.label.content_category));
  const { value: spec } = majority(anns.map((a) => a.label.specificity_level));

  catDist.set(cat, (catDist.get(cat) ?? 0) + 1);
  specDist.set(spec, (specDist.get(spec) ?? 0) + 1);

  // A paragraph counts as a conflict when no category received at least two votes.
  // NOTE(review): this equals "no majority" only for up to three annotations
  // per paragraph — confirm the annotator count before reusing the threshold.
  if (catVotes <= 1) conflicts += 1;

  // Confidence is tallied across every annotation, not per paragraph.
  anns.forEach(({ label }) => {
    const confidence = label.category_confidence;
    confDist.set(confidence, (confDist.get(confidence) ?? 0) + 1);
  });

  if (cat === "None/Other") return;
  nonOther.push({ pid, cat, spec, anns });
});

// ── Report ──────────────────────────────────────────────────────────────

/**
 * Formats `count / total` as a percentage with one decimal place.
 * Returns "0.0" when `total` is 0 so the report never prints "NaN".
 */
function formatPct(count: number, total: number): string {
  return (total === 0 ? 0 : (count / total) * 100).toFixed(1);
}

console.log("═══ NO-CYBER-KEYWORD PARAGRAPH ANALYSIS ═══\n");
console.log(`Total annotated no-cyber paragraphs: ${annotations.size}`);
console.log(`Conflicts (no majority): ${conflicts}\n`);

// Categories are listed from most to least frequent.
console.log("─── Category Distribution (Consensus) ───");
for (const [cat, count] of [...catDist.entries()].sort((a, b) => b[1] - a[1])) {
  console.log(`  ${cat.padEnd(30)} ${count} (${formatPct(count, annotations.size)}%)`);
}

// Only levels 1–4 are reported; any other consensus level is not shown.
console.log("\n─── Specificity Distribution (Consensus) ───");
for (const level of [1, 2, 3, 4]) {
  const count = specDist.get(level) ?? 0;
  console.log(`  Level ${level}: ${count} (${formatPct(count, annotations.size)}%)`);
}

// The denominator is the number of individual annotations (all confidence
// values, including any outside high/medium/low); computed once, not per row.
console.log("\n─── Confidence Distribution (All Models) ───");
const confTotal = [...confDist.values()].reduce((a, b) => a + b, 0);
for (const conf of ["high", "medium", "low"]) {
  const count = confDist.get(conf) ?? 0;
  console.log(`  ${conf}: ${count} (${formatPct(count, confTotal)}%)`);
}

// Detail section: paragraphs whose consensus category is a real category
// even though they contain no cybersecurity keywords.
console.log(`\n─── Non-"None/Other" Paragraphs: ${nonOther.length} ───`);
if (nonOther.length > 0) {
  console.log("These are the concerning ones — labeled as real categories but have no cyber keywords.\n");

  // Load the paragraph text, but only for the flagged paragraph IDs.
  const flaggedIds = new Set(nonOther.map((n) => n.pid));
  const textMap = new Map<string, string>();
  for (const line of readFileSync(TRAINING_PATH, "utf-8").split("\n")) {
    if (!line.trim()) continue;
    const p = JSON.parse(line) as { id: string; text: string };
    if (flaggedIds.has(p.id)) textMap.set(p.id, p.text);
  }

  // Show at most the first 10 flagged paragraphs, each with a short text preview.
  const previewChars = 150;
  for (const item of nonOther.slice(0, 10)) {
    const text = textMap.get(item.pid) ?? "(text not found)";
    const modelVotes = item.anns
      .map((a) => {
        const { modelId } = a.provenance;
        // Print the segment after the first "/"; fall back to the full ID
        // when it has no "/" rather than printing "undefined".
        const shortName = modelId.split("/")[1] ?? modelId;
        return `${shortName}: ${a.label.content_category}`;
      })
      .join(", ");
    // Append the ellipsis only when the text was actually cut.
    const preview = text.length > previewChars ? `${text.substring(0, previewChars)}...` : text;
    console.log(`  [${item.cat} / Spec ${item.spec}] ${item.pid}`);
    console.log(`    Models: ${modelVotes}`);
    console.log(`    Text: ${preview}`);
    console.log();
  }
}

// Summary recommendation: when fewer than 50 paragraphs have a real consensus
// category, suggest excluding those and keeping the "None/Other" ones;
// otherwise warn that the count is high enough to need investigation.
const noneOtherCount = catDist.get("None/Other") ?? 0;
// Guard the division so an empty annotation set prints "0.0%" instead of "NaN%".
const noneOtherPct =
  annotations.size === 0 ? "0.0" : ((noneOtherCount / annotations.size) * 100).toFixed(1);
console.log("─── RECOMMENDATION ───");
if (nonOther.length < 50) {
  console.log(`  ${noneOtherPct}% labeled None/Other. Only ${nonOther.length} labeled as real categories.`);
  console.log(`  → EXCLUDE ${nonOther.length} non-None/Other paragraphs from training (likely section bleed).`);
  console.log(`  → KEEP ${noneOtherCount} None/Other paragraphs (correct labels for non-cyber content).`);
} else {
  console.log(`  WARNING: ${nonOther.length} paragraphs labeled as real categories — investigate further.`);
}