From 2e932bc327f228065d12fd712ce254d0fc60f479 Mon Sep 17 00:00:00 2001
From: Joey Eamigh <55670930+JoeyEamigh@users.noreply.github.com>
Date: Sun, 5 Apr 2026 15:37:50 -0400
Subject: [PATCH] working model!!!!!
---
.dockerignore | 4 +-
docs/NARRATIVE.md | 131 ++++
docs/SPECIFICITY-IMPROVEMENT-PLAN.md | 21 +
docs/STATUS.md | 32 +-
labelapp/Dockerfile | 9 +-
labelapp/app/api/metrics/route.ts | 3 +-
labelapp/app/codebook/page.tsx | 7 +
labelapp/app/label/page.tsx | 3 +
labelapp/lib/onboarding-content.ts | 19 +-
labelapp/scripts/assign.ts | 6 +-
labelapp/scripts/dump-all.ts | 12 +-
python/main.py | 47 +-
python/pyproject.toml | 3 +
python/scripts/generate-comparison-figures.py | 226 ++++++
python/src/finetune/eval.py | 724 ++++++++++++++++++
results/eval/comparison/comparison_table.png | Bin 0 -> 103751 bytes
.../coral_vs_independent_all_metrics.png | Bin 0 -> 165311 bytes
.../comparison/coral_vs_independent_f1.png | Bin 0 -> 226637 bytes
results/eval/comparison/improvement_delta.png | Bin 0 -> 113820 bytes
.../comparison/spec_confusion_comparison.png | Bin 0 -> 119468 bytes
.../figures/calibration_cat_gpt-5.4.png | Bin 0 -> 53327 bytes
.../figures/calibration_cat_opus-4.6.png | Bin 0 -> 53774 bytes
.../figures/confusion_cat_gpt-5.4.png | Bin 0 -> 126943 bytes
.../figures/confusion_cat_opus-4.6.png | Bin 0 -> 127459 bytes
.../figures/confusion_spec_gpt-5.4.png | Bin 0 -> 90038 bytes
.../figures/confusion_spec_opus-4.6.png | Bin 0 -> 91169 bytes
.../figures/model_comparison.png | Bin 0 -> 76023 bytes
.../figures/per_class_f1_gpt-5.4.png | Bin 0 -> 111770 bytes
.../figures/per_class_f1_opus-4.6.png | Bin 0 -> 113281 bytes
.../figures/speed_comparison.png | Bin 0 -> 61822 bytes
results/eval/coral-baseline/metrics.json | 298 +++++++
results/eval/coral-baseline/report_gpt-54.txt | 54 ++
.../eval/coral-baseline/report_opus-46.txt | 54 ++
.../figures/calibration_cat_gpt-5.4.png | Bin 0 -> 53974 bytes
.../figures/calibration_cat_opus-4.6.png | Bin 0 -> 53535 bytes
.../figures/confusion_cat_gpt-5.4.png | Bin 0 -> 122535 bytes
.../figures/confusion_cat_opus-4.6.png | Bin 0 -> 120672 bytes
.../figures/confusion_spec_gpt-5.4.png | Bin 0 -> 84454 bytes
.../figures/confusion_spec_opus-4.6.png | Bin 0 -> 85511 bytes
.../figures/model_comparison.png | Bin 0 -> 66118 bytes
.../figures/per_class_f1_gpt-5.4.png | Bin 0 -> 106750 bytes
.../figures/per_class_f1_opus-4.6.png | Bin 0 -> 107875 bytes
.../figures/speed_comparison.png | Bin 0 -> 54818 bytes
results/eval/iter1-independent/metrics.json | 298 +++++++
.../eval/iter1-independent/report_gpt-54.txt | 54 ++
.../eval/iter1-independent/report_opus-46.txt | 54 ++
46 files changed, 2025 insertions(+), 34 deletions(-)
create mode 100644 python/scripts/generate-comparison-figures.py
create mode 100644 python/src/finetune/eval.py
create mode 100644 results/eval/comparison/comparison_table.png
create mode 100644 results/eval/comparison/coral_vs_independent_all_metrics.png
create mode 100644 results/eval/comparison/coral_vs_independent_f1.png
create mode 100644 results/eval/comparison/improvement_delta.png
create mode 100644 results/eval/comparison/spec_confusion_comparison.png
create mode 100644 results/eval/coral-baseline/figures/calibration_cat_gpt-5.4.png
create mode 100644 results/eval/coral-baseline/figures/calibration_cat_opus-4.6.png
create mode 100644 results/eval/coral-baseline/figures/confusion_cat_gpt-5.4.png
create mode 100644 results/eval/coral-baseline/figures/confusion_cat_opus-4.6.png
create mode 100644 results/eval/coral-baseline/figures/confusion_spec_gpt-5.4.png
create mode 100644 results/eval/coral-baseline/figures/confusion_spec_opus-4.6.png
create mode 100644 results/eval/coral-baseline/figures/model_comparison.png
create mode 100644 results/eval/coral-baseline/figures/per_class_f1_gpt-5.4.png
create mode 100644 results/eval/coral-baseline/figures/per_class_f1_opus-4.6.png
create mode 100644 results/eval/coral-baseline/figures/speed_comparison.png
create mode 100644 results/eval/coral-baseline/metrics.json
create mode 100644 results/eval/coral-baseline/report_gpt-54.txt
create mode 100644 results/eval/coral-baseline/report_opus-46.txt
create mode 100644 results/eval/iter1-independent/figures/calibration_cat_gpt-5.4.png
create mode 100644 results/eval/iter1-independent/figures/calibration_cat_opus-4.6.png
create mode 100644 results/eval/iter1-independent/figures/confusion_cat_gpt-5.4.png
create mode 100644 results/eval/iter1-independent/figures/confusion_cat_opus-4.6.png
create mode 100644 results/eval/iter1-independent/figures/confusion_spec_gpt-5.4.png
create mode 100644 results/eval/iter1-independent/figures/confusion_spec_opus-4.6.png
create mode 100644 results/eval/iter1-independent/figures/model_comparison.png
create mode 100644 results/eval/iter1-independent/figures/per_class_f1_gpt-5.4.png
create mode 100644 results/eval/iter1-independent/figures/per_class_f1_opus-4.6.png
create mode 100644 results/eval/iter1-independent/figures/speed_comparison.png
create mode 100644 results/eval/iter1-independent/metrics.json
create mode 100644 results/eval/iter1-independent/report_gpt-54.txt
create mode 100644 results/eval/iter1-independent/report_opus-46.txt
diff --git a/.dockerignore b/.dockerignore
index 1fa5fa9..17e8804 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -3,7 +3,6 @@
# Allow only what the labelapp Dockerfile needs
!package.json
-!bun.lock
!packages/schemas/
!ts/package.json
@@ -14,9 +13,10 @@ labelapp/.env*
labelapp/playwright-report/
labelapp/test-results/
-# Seed data (only the two JSONL files we need)
+# Seed data
!data/paragraphs/paragraphs-clean.jsonl
!data/annotations/stage1.jsonl
+!data/gold/v2-holdout-ids.json
# Git/IDE
.git
diff --git a/docs/NARRATIVE.md b/docs/NARRATIVE.md
index 7246ef7..7d8b9ac 100644
--- a/docs/NARRATIVE.md
+++ b/docs/NARRATIVE.md
@@ -575,6 +575,137 @@ As a proxy before human labels arrive, evaluation against GPT-5.4 and Opus bench
---
+## Phase 9: Holdout Evaluation — Proxy Gold Results
+
+### Evaluation Setup
+
+Built a comprehensive evaluation pipeline (`python/src/finetune/eval.py`) to test the trained model on the 1,200-paragraph holdout set. Since human gold labels were not yet available, we used two frontier API models as proxy references:
+
+- **GPT-5.4** (1,200 labels, ~$3,400/1M texts, ~2,900ms/sample)
+- **Opus-4.6** (1,200 labels, ~$5,000/1M texts, ~6,000ms/sample)
+
+Both references used the same v4.5 prompt as the Grok training labels but are different model families — they provide independent validation that the fine-tuned model learned the construct, not just Grok's idiosyncrasies.
+
+The evaluation computed: macro/weighted F1, per-class F1, precision, recall, MCC, AUC (one-vs-rest), QWK, MAE, Krippendorff's alpha (nominal for category, ordinal for specificity), confusion matrices, and calibration (ECE).
+
+### Results: Independent Thresholds (Epoch 8, Best Model)
+
+| Metric | vs GPT-5.4 | vs Opus-4.6 |
+|--------|-----------|-------------|
+| **Cat Macro F1** | **0.934** | **0.923** |
+| **Spec Macro F1** | **0.895** | **0.883** |
+| Cat MCC | 0.923 | 0.909 |
+| Cat AUC (OvR) | 0.992 | 0.994 |
+| Spec QWK | 0.932 | 0.923 |
+| Spec MAE | 0.118 | 0.136 |
+| Cat Kripp α | 0.922 | 0.909 |
+| Spec Kripp α | 0.918 | 0.907 |
+| Cat ECE | 0.054 | 0.066 |
+| Throughput | **178 samples/sec** | — |
+| Latency | **5.6ms/sample** | — |
+
+Both heads pass the 0.80 macro F1 target by wide margins on held-out data against independent reference models.
+
+Per-class category F1 (vs GPT-5.4): Board Gov. 0.972, Incident Disc. 0.961, Mgmt Role 0.941, None/Other 0.888, Risk Mgmt Proc. 0.856, Strategy Int. 0.958, Third-Party 0.959. RMP is the weakest category (0.856) due to MR↔RMP boundary ambiguity, but still comfortably above target.
+
+Per-class specificity F1 (vs GPT-5.4): L1 0.936, L2 0.798, L3 0.894, L4 0.954. L2 is the weakest level — analyzed in detail below.
+
+### Results: CORAL Baseline (Epoch 5) — For Comparison
+
+| Metric | vs GPT-5.4 | vs Opus-4.6 |
+|--------|-----------|-------------|
+| Cat Macro F1 | 0.936 | 0.928 |
+| **Spec Macro F1** | **0.597** | **0.596** |
+| Spec QWK | 0.876 | 0.872 |
+
+The category heads are essentially identical between models — the backbone handles category well regardless of specificity architecture. The +0.298 spec F1 improvement is entirely attributable to the independent threshold heads.
+
+CORAL's confusion matrix reveals the mechanism: it collapses L2 (F1=0.407) and L3 (F1=0.369) into L1 and L4, predicting extreme levels because the shared weight vector can't represent the intermediate transitions. The independent threshold model's confusion matrix shows clean diagonals across all four levels.
+
+### Reference Agreement Ceiling
+
+A critical finding: **the model agrees with the references more than the references agree with each other.**
+
+| Comparison | Macro Spec F1 | L2 F1 |
+|-----------|---------------|-------|
+| GPT-5.4 vs Opus-4.6 | **0.885** | **0.805** |
+| Our model vs GPT-5.4 | **0.895** | 0.798 |
+| Our model vs Opus-4.6 | 0.883 | 0.776 |
+| Stage 1 Consensus vs GPT-5.4 | 0.911 | 0.845 |
+
+Our model's macro spec F1 (0.895) exceeds the inter-reference agreement (0.885). This means the model learned a "consensus position" that is more consistent than either individual reference. Further improvements against these proxy references are not meaningful — they would represent overfitting to one reference's idiosyncrasies rather than genuine improvement.
+
+The L2 F1 of 0.798 is within 0.007 of the reference ceiling (0.805). The L1↔L2 boundary is the hardest in the construct — it hinges on whether language is "domain-specific" enough to qualify (the ERM test). Paragraphs using quasi-domain language (e.g., "risk management program for cybersecurity") sit in a genuine gray zone where even frontier models disagree.
+
+### L2 Error Analysis
+
+The L2 confusion is directional. Against GPT-5.4:
+- 29 L2 paragraphs misclassified as L1 (model under-calls domain terminology)
+- 23 L1 paragraphs misclassified as L2 (model over-calls domain terminology)
+- Only 7 L2→L3 and 2 L2→L4 errors (higher transitions are clean)
+
+This is the L1↔L2 boundary problem in isolation — the model handles L2↔L3 and L3↔L4 transitions with high accuracy. The ERM test ("would an employee relations manager understand this language?") is inherently subjective at the margin.
+
+### Category × Specificity Joint Distribution
+
+The holdout set reveals strong correlation between category and specificity:
+
+| Category | L1 | L2 | L3 | L4 |
+|---------|-----|-----|-----|-----|
+| None/Other | **100%** | 0% | 0% | 0% |
+| Strategy Integration | **85%** | 10% | 2% | 3% |
+| Third-Party Risk | 62% | **22%** | 12% | 5% |
+| Risk Mgmt Process | 34% | **44%** | 16% | 6% |
+| Board Governance | 42% | 4% | **45%** | 9% |
+| Management Role | 13% | 3% | 29% | **54%** |
+| Incident Disclosure | 0% | 8% | 2% | **90%** |
+
+Despite this correlation, the current architecture treats specificity as category-independent (by design — per the codebook, specificity measures "how specific" regardless of "what about"). Making specificity category-dependent was considered but rejected: the cell sizes for many (category, spec_level) combinations are too small for reliable conditional modeling, and error propagation from category mistakes would corrupt specificity predictions. The strong correlations are already captured implicitly by the shared backbone. This remains a potential direction for future investigation with a larger dataset.
+
+### Sequence Length Analysis
+
+At max_seq_length=512, truncation is negligible:
+
+| Dataset | Mean tokens | P95 | P99 | Max | Truncated (>512) |
+|---------|------------|-----|-----|-----|-----------------|
+| All paragraphs (72K) | 114.6 | 240 | 350 | 678 | 139 (0.19%) |
+| Holdout (1,200) | 117.9 | 236 | 329 | 603 | 1 (0.08%) |
+
+SEC cybersecurity disclosure paragraphs are short by nature (median ~100 tokens). The 512-token limit is more than sufficient — increasing to 1024 would affect only 139 training paragraphs and 1 holdout paragraph.
+
+### Speed and Cost Comparison
+
+| System | Latency | Throughput | Cost/1M texts | Reproducible |
+|--------|---------|-----------|---------------|-------------|
+| **Fine-tuned specialist** | **5.6ms** | **178/sec** | **~$5** | **Yes** |
+| GPT-5.4 (API) | ~2,900ms | ~0.3/sec | ~$3,400 | No |
+| Opus-4.6 (API) | ~6,000ms | ~0.2/sec | ~$5,000 | No |
+
+The fine-tuned model is **520× faster** than GPT-5.4 and **1,070× faster** than Opus-4.6, at **~680-1,000× lower cost**, with comparable or better accuracy and full determinism.
+
+### Calibration
+
+The model is well-calibrated for category (ECE=0.054 vs GPT-5.4) and reasonably calibrated for specificity (ECE=0.071). The calibration plot shows slight overconfidence in the 0.7-0.9 range — consistent with the "benign overfitting" observed during training where the model became more confident without changing decision boundaries. Temperature scaling could improve calibration without affecting predictions (a single scalar adjustment on validation logits), which would be valuable for deployment confidence thresholds.
+
+### Remaining Opportunities
+
+**Threshold tuning (free, post-gold):** Once human gold labels arrive, grid-search the per-threshold sigmoid cutoffs. Currently all thresholds use 0.5 — the optimal L1→L2 cutoff may differ. This requires no retraining and could gain +0.01-0.02 on L2 F1.
+
+**Ensemble (3 seeds, +0.01-0.03 F1):** Train 3 models with seeds 42/43/44, average sigmoid outputs. Reduces variance on boundary cases and provides confidence intervals for reported metrics. Cost: 3× training time (~24h total), 3× inference time (~17ms/sample).
+
+**Temperature scaling (free, improves calibration only):** Fit a single temperature parameter on the validation set. Reduces ECE without changing predictions — relevant for deployment where confidence scores matter.
+
+**Larger specificity MLP (future investigation):** The current 256-dim MLP is efficient but may not capture the full complexity of subtle specificity distinctions. Larger heads (512-dim or 3-layer) could help if the dataset grows, but risk overfitting at current data scale.
+
+### Figures Generated
+
+All evaluation figures saved to `results/eval/`:
+- `iter1-independent/figures/` — confusion matrices (cat + spec), calibration reliability diagrams, per-class F1 bar charts (vs GPT-5.4 and Opus-4.6 separately)
+- `coral-baseline/figures/` — same set for CORAL baseline comparison
+- `comparison/` — side-by-side CORAL vs Independent (per-class F1 bars, all-metrics comparison, improvement delta chart, confusion matrix comparison, summary table)
+
+---
+
## v1 Reference
The complete v1 narrative — Stage 1 prompt engineering (12+ iterations), model benchmarking (21+ models, 12 providers), human labeling webapp, gold set adjudication (13-signal cross-analysis), codebook iterations v1.0–v3.5 — is preserved at `docs/NARRATIVE-v1.md`.
diff --git a/docs/SPECIFICITY-IMPROVEMENT-PLAN.md b/docs/SPECIFICITY-IMPROVEMENT-PLAN.md
index f58f05f..a3acbb3 100644
--- a/docs/SPECIFICITY-IMPROVEMENT-PLAN.md
+++ b/docs/SPECIFICITY-IMPROVEMENT-PLAN.md
@@ -129,3 +129,24 @@ eval F1 improvement. Best checkpoint: epoch 8 (spec F1=0.945).
Independent thresholds were the key insight — CORAL's shared weight vector was
the primary bottleneck. Attention pooling, MLP heads, and confidence filtering
all contributed. Tier 2 and Tier 3 ideas were not needed.
+
+### Holdout Evaluation (1,200 paragraphs, proxy gold)
+
+Validated on held-out data against two independent frontier model references:
+
+| Model | Ref | Cat F1 | Spec F1 | L2 F1 | Spec QWK |
+|-------|-----|--------|---------|-------|----------|
+| Independent (ep8) | GPT-5.4 | 0.934 | **0.895** | 0.798 | 0.932 |
+| Independent (ep8) | Opus-4.6 | 0.923 | **0.883** | 0.776 | 0.923 |
+| CORAL (ep5) | GPT-5.4 | 0.936 | 0.597 | 0.407 | 0.876 |
+| CORAL (ep5) | Opus-4.6 | 0.928 | 0.596 | 0.418 | 0.872 |
+| GPT-5.4 | Opus-4.6 | — | **0.885** | **0.805** | 0.919 |
+
+**Key finding:** The model's holdout spec F1 (0.895) exceeds the inter-reference
+agreement (0.885 between GPT-5.4 and Opus-4.6). The model has reached the
+construct reliability ceiling — further improvement requires cleaner reference
+labels, not a better model.
+
+**L2 is at ceiling:** Model L2 F1 (0.798) is within 0.007 of reference agreement
+(0.805). The L1↔L2 boundary is genuinely ambiguous. Remaining opportunity:
+per-threshold sigmoid tuning against human gold labels (potential +0.01-0.02).
diff --git a/docs/STATUS.md b/docs/STATUS.md
index 80bc802..b92a147 100644
--- a/docs/STATUS.md
+++ b/docs/STATUS.md
@@ -1,6 +1,6 @@
# Project Status — v2 Pipeline
-**Deadline:** 2026-04-24 | **Started:** 2026-04-03 | **Updated:** 2026-04-05 (Fine-tuning done: cat F1=0.943, spec F1=0.945)
+**Deadline:** 2026-04-24 | **Started:** 2026-04-03 | **Updated:** 2026-04-05 (Holdout eval done: cat F1=0.934, spec F1=0.895 vs GPT-5.4 proxy gold)
---
@@ -142,14 +142,27 @@
- **Improvement plan:** `docs/SPECIFICITY-IMPROVEMENT-PLAN.md`
### 13. Evaluation & Paper ← CURRENT
-- [ ] Proxy eval: run fine-tuned model on holdout, compare against GPT-5.4 and Opus benchmark labels
+- [x] Proxy eval: fine-tuned model on 1,200 holdout vs GPT-5.4 and Opus-4.6 proxy gold
+- [x] Full metrics suite: macro/per-class F1, precision, recall, MCC, AUC, QWK, MAE, Krippendorff's α, ECE, confusion matrices
+- [x] CORAL baseline comparison: same eval pipeline on CORAL epoch 5 checkpoint
+- [x] Figures: confusion matrices, calibration diagrams, per-class F1 bars, CORAL vs Independent comparison, speed/cost table
+- [x] Reference ceiling analysis: GPT-5.4 vs Opus-4.6 agreement = 0.885 macro spec F1 (our model exceeds this at 0.895)
+- [x] L2 error analysis: model L2 F1 (0.798) within 0.007 of reference ceiling (0.805)
+- [x] Sequence length analysis: only 139/72K paragraphs (0.19%) truncated at 512 tokens — negligible impact
+- [x] Opus labels completed: 1,200/1,200 (filled 16 missing from initial run)
- [ ] Macro F1 on holdout gold (target > 0.80 both heads) — blocked on human labels
-- [ ] Per-class F1 breakdown + GenAI benchmark table
-- [ ] Error analysis, cost comparison, IGNITE slides
+- [ ] Per-threshold sigmoid tuning against human gold (potential +0.01-0.02 on L2 F1)
+- [ ] Temperature scaling for improved calibration (ECE reduction without changing predictions)
+- [ ] Ensemble of 3 seeds for confidence intervals and potential +0.01-0.03 F1
+- [ ] Error analysis against human gold, IGNITE slides
- [ ] Note in paper: specificity is paragraph-level (presence check), not category-conditional — acknowledge as limitation/future work
- [ ] Note in paper: DAPT/TAPT did not improve fine-tuning — noteworthy null result
- [ ] Note in paper: CORAL ordinal regression insufficient for multi-signal ordinal classification
-- **Next:** evaluate fine-tuned model on holdout using GPT-5.4 + Opus labels as proxy gold
+- [ ] Note in paper: model exceeds inter-reference agreement — approaches ceiling of construct reliability
+- **Proxy gold results (vs GPT-5.4):** Cat F1=0.934, Spec F1=0.895, MCC=0.923/0.866, AUC=0.992/0.982, QWK=0.932
+- **Proxy gold results (vs Opus-4.6):** Cat F1=0.923, Spec F1=0.883, QWK=0.923
+- **Speed:** 5.6ms/sample (178/sec) — 520× faster than GPT-5.4, 1,070× faster than Opus
+- **Next:** deploy labelapp for human annotation, then gold evaluation + threshold tuning
---
@@ -177,7 +190,7 @@
| v2 holdout reference | `data/annotations/v2-bench/gpt-5.4.jsonl` (v4.5, 1,200 paragraphs) |
| v2 iteration archive | `data/annotations/v2-bench/gpt-5.4.v4.{0,1,2,3,4}.jsonl` |
| v4.5 boundary test | `data/annotations/v2-bench/v45-test/gpt-5.4.jsonl` (50 paragraphs) |
-| Opus prompt-only | `data/annotations/v2-bench/opus-4.6.jsonl` (1,184 paragraphs) |
+| Opus prompt-only | `data/annotations/v2-bench/opus-4.6.jsonl` (1,200 paragraphs) |
| Opus +codebook | `data/annotations/golden/opus.jsonl` (includes v1 + v2 runs) |
| Grok self-consistency test | `data/annotations/v2-bench/grok-rerun/grok-4.1-fast.jsonl` (47 paragraphs) |
| Benchmark analysis | `scripts/analyze-v2-bench.py` |
@@ -199,6 +212,13 @@
| CORAL baseline (ablation winner) | `checkpoints/finetune/best-base_weighted_ce-ep5/final/` (cat=0.932, spec=0.517) |
| Ablation results | `checkpoints/finetune/ablation/ablation_results.json` |
| Spec improvement plan | `docs/SPECIFICITY-IMPROVEMENT-PLAN.md` |
+| Best model iter1 config | `python/configs/finetune/iter1-independent.yaml` |
+| Eval script | `python/src/finetune/eval.py` |
+| Eval results (best model) | `results/eval/iter1-independent/metrics.json` |
+| Eval results (CORAL) | `results/eval/coral-baseline/metrics.json` |
+| Comparison figures | `results/eval/comparison/` (5 charts) |
+| Per-model eval figures | `results/eval/iter1-independent/figures/` + `results/eval/coral-baseline/figures/` |
+| Comparison figure script | `python/scripts/generate-comparison-figures.py` |
### v2 Stage 1 Distribution (72,045 paragraphs, v4.5 prompt, Grok ×3 consensus + GPT-5.4 judge)
diff --git a/labelapp/Dockerfile b/labelapp/Dockerfile
index 047e9b4..6439774 100644
--- a/labelapp/Dockerfile
+++ b/labelapp/Dockerfile
@@ -1,14 +1,15 @@
# Build context: monorepo root (run: docker build -f labelapp/Dockerfile .)
-FROM oven/bun:1 AS base
+FROM oven/bun:1.3.8 AS base
# -- Install dependencies --
FROM base AS deps
WORKDIR /app
-COPY package.json bun.lock ./
+COPY package.json ./
COPY packages/schemas/package.json packages/schemas/
COPY ts/package.json ts/
COPY labelapp/package.json labelapp/
-RUN bun install --frozen-lockfile
+# bun.lock intentionally excluded — bun hangs parsing the binary lockfile in Docker (bun bug)
+RUN bun install
# -- Build Next.js --
FROM base AS builder
@@ -16,7 +17,7 @@ WORKDIR /app
COPY --from=deps /app/node_modules ./node_modules
COPY --from=deps /app/packages/schemas/node_modules ./packages/schemas/node_modules
COPY --from=deps /app/labelapp/node_modules ./labelapp/node_modules
-COPY package.json bun.lock ./
+COPY package.json ./
COPY packages/schemas/ packages/schemas/
COPY labelapp/ labelapp/
ENV NEXT_TELEMETRY_DISABLED=1
diff --git a/labelapp/app/api/metrics/route.ts b/labelapp/app/api/metrics/route.ts
index a486109..12ee914 100644
--- a/labelapp/app/api/metrics/route.ts
+++ b/labelapp/app/api/metrics/route.ts
@@ -100,7 +100,6 @@ export async function GET() {
// Filter to non-admin annotators for per-annotator stats
const perAnnotator = allAnnotators
- .filter((a) => a.id !== "joey")
.map((a) => ({
id: a.id,
displayName: a.displayName,
@@ -132,7 +131,7 @@ export async function GET() {
// Collect all annotator IDs that have labels (excluding admin)
const annotatorIds = [
...new Set(allLabels.map((l) => l.annotatorId)),
- ].filter((id) => id !== "joey");
+ ];
annotatorIds.sort();
// For each annotator pair, collect shared paragraph ratings
diff --git a/labelapp/app/codebook/page.tsx b/labelapp/app/codebook/page.tsx
index 0d85ff3..6a1497c 100644
--- a/labelapp/app/codebook/page.tsx
+++ b/labelapp/app/codebook/page.tsx
@@ -797,6 +797,13 @@ export default function CodebookPage() {
how company-specific the disclosure is. Apply the decision test in
order — stop at the first “yes.”
+
+ Specificity rates the ENTIRE paragraph — not just
+ the parts related to the content category you chose. If a Board
+ Governance paragraph also mentions CrowdStrike Falcon or the
+ CISO’s 20 years of experience, those facts count. Scan
+ everything, don’t filter by category.
+
{/* Decision Test */}
diff --git a/labelapp/app/label/page.tsx b/labelapp/app/label/page.tsx
index a0a5aa1..79ed36e 100644
--- a/labelapp/app/label/page.tsx
+++ b/labelapp/app/label/page.tsx
@@ -489,6 +489,9 @@ function CodebookSidebar() {
desc="Contains 1+ QV-eligible facts: specific numbers, dates, named external entities, named tools/products, verifiable certifications."
/>
+
+ Specificity rates the WHOLE paragraph — not just the category-relevant parts. Scan everything.
+
diff --git a/labelapp/lib/onboarding-content.ts b/labelapp/lib/onboarding-content.ts
index 9394e69..42dba5c 100644
--- a/labelapp/lib/onboarding-content.ts
+++ b/labelapp/lib/onboarding-content.ts
@@ -29,7 +29,7 @@ export const ONBOARDING_STEPS: OnboardingStep[] = [
"Management Role is broader: it now covers how management is ORGANIZED to handle cybersecurity — role allocation, committee structure, reporting lines — not just \"who a specific person is.\" Paragraphs about management structure without named individuals can be MR.",
"Specificity Level 2 is broader: renamed from \"Sector-Adapted\" to \"Domain-Adapted.\" Cybersecurity terms like penetration testing, vulnerability scanning, SIEM, and SOC now trigger Level 2. In v1, these were incorrectly classified as Level 1.",
"Level 4 requires just 1 QV fact (was 2+). No more counting. If an external party could verify even one claim in the paragraph — a dollar amount, a named tool, a specific date — it's Level 4.",
- "You'll be labeling 1,200 holdout paragraphs total. There are 5 annotators, with 3 labeling each paragraph. You'll see roughly 720.",
+ "You'll be labeling 1,200 holdout paragraphs total. There are 6 annotators, with 3 labeling each paragraph. You'll see roughly 600.",
],
keyPoints: [
"Same 7 categories, same 4 specificity levels — the framework is unchanged.",
@@ -49,11 +49,12 @@ export const ONBOARDING_STEPS: OnboardingStep[] = [
"Question 1 — Content Category: \"What is this paragraph about?\" Pick the best of 7 options.",
"Question 2 — Specificity Level: \"How company-specific is this paragraph?\" Pick a level from 1 to 4.",
"These are independent dimensions. A materiality disclaimer can be Strategy Integration (category) at Level 1 (generic boilerplate). An incident report can be Incident Disclosure at Level 4 (specific dates and firms).",
+ "Important: specificity rates THE WHOLE PARAGRAPH, not just the category-relevant parts. If a Board Governance paragraph mentions the CISO by name and describes penetration testing, those facts count for specificity even though they're not \"board\" content. Scan the entire paragraph for the most specific fact present — don't filter by category first.",
],
keyPoints: [
"One content category (of 7) — pick the dominant one.",
- "One specificity level (1–4) — determined by the most specific fact present.",
- "Category and specificity are independent — don't let one influence the other.",
+ "One specificity level (1–4) — determined by the most specific fact in THE WHOLE PARAGRAPH.",
+ "Specificity rates the paragraph, not the category. A Board Governance paragraph that mentions CrowdStrike Falcon is Level 4.",
],
},
@@ -142,6 +143,7 @@ export const ONBOARDING_STEPS: OnboardingStep[] = [
subtitle: "How company-specific is this paragraph?",
content: [
"Specificity measures how much this paragraph tells you about THIS specific company versus generic filler any company could use.",
+ "Critical: specificity rates the ENTIRE paragraph — not just the parts related to the category you chose. If you categorize a paragraph as Board Governance but it also mentions CrowdStrike Falcon or the CISO's 20 years of experience, those facts still count. Scan everything.",
"Think of it as a waterfall — check from the top and stop at the first yes:",
"Level 4 — Quantified-Verifiable: Can an external party verify at least one claim? (a specific number, date, named tool/firm, verifiable certification) → Level 4.",
"Level 3 — Firm-Specific: Does it contain at least one fact unique to THIS company? (CISO title, named non-generic committee, named individual, 24/7 SOC) → Level 3.",
@@ -218,6 +220,13 @@ export const ONBOARDING_STEPS: OnboardingStep[] = [
explanation:
"BG because the Audit Committee is the subject (oversight). CISO is a firm-specific fact → Level 3. No QV facts (no numbers, dates, named firms).",
},
+ {
+ text: "The Board oversees our cybersecurity program, which is led by our CISO and includes penetration testing and vulnerability assessments using CrowdStrike Falcon.",
+ category: "Board Governance",
+ specificity: "Level 4 — Quantified-Verifiable",
+ explanation:
+ "BG because the Board is the subject. But specificity rates THE WHOLE PARAGRAPH — not just the board content. CrowdStrike Falcon is a named tool (QV-eligible), so Level 4. Don't be tempted to rate only the \"board\" parts as generic — the paragraph as a whole contains a verifiable fact.",
+ },
{
text: "Under the leadership of our CISO, we have implemented network segmentation, endpoint detection and response, data loss prevention, and SIEM. Our team monitors critical systems continuously and conducts quarterly tabletop exercises.",
category: "Risk Management Process",
@@ -248,8 +257,8 @@ export const ONBOARDING_STEPS: OnboardingStep[] = [
},
],
keyPoints: [
- "Category and specificity are independent. Don't let one influence the other.",
- "The person-removal test and specificity waterfall work together — use both.",
+ "Specificity rates the WHOLE paragraph — not just the parts related to the category. Scan everything.",
+ "A Board Governance paragraph that mentions CrowdStrike Falcon → still Level 4. Don't filter facts by category.",
"When in doubt on category: which question does the paragraph answer?",
"When in doubt on specificity: check the waterfall top-down (QV → IS → Domain → Generic).",
],
diff --git a/labelapp/scripts/assign.ts b/labelapp/scripts/assign.ts
index 84cad3a..7f7438e 100644
--- a/labelapp/scripts/assign.ts
+++ b/labelapp/scripts/assign.ts
@@ -1,7 +1,6 @@
process.env.DATABASE_URL ??=
"postgresql://sec_cybert:sec_cybert@localhost:5432/sec_cybert";
-import { ne } from "drizzle-orm";
import { db } from "../db";
import * as schema from "../db/schema";
import { generateAssignments, printAssignmentStats } from "../lib/assignment";
@@ -15,12 +14,11 @@ async function main() {
const paragraphIds = rows.map((r) => r.id);
console.log(` ${paragraphIds.length} paragraphs`);
- // 2. Read annotator IDs from DB (exclude joey — admin)
+ // 2. Read annotator IDs from DB (all annotators, including joey)
console.log("Loading annotators...");
const annotators = await db
.select({ id: schema.annotators.id })
- .from(schema.annotators)
- .where(ne(schema.annotators.id, "joey"));
+ .from(schema.annotators);
const annotatorIds = annotators.map((a) => a.id).sort();
console.log(` ${annotatorIds.length} annotators: ${annotatorIds.join(", ")}`);
diff --git a/labelapp/scripts/dump-all.ts b/labelapp/scripts/dump-all.ts
index 5f1bacc..341c193 100644
--- a/labelapp/scripts/dump-all.ts
+++ b/labelapp/scripts/dump-all.ts
@@ -53,16 +53,14 @@ async function main() {
db.select().from(schema.adjudications),
]);
- const nonAdminAnnotators = allAnnotators.filter((a) => a.id !== "joey");
- const annotatorIds = nonAdminAnnotators.map((a) => a.id).sort();
+ const annotatorIds = allAnnotators.map((a) => a.id).sort();
const annotatorNames = new Map(allAnnotators.map((a) => [a.id, a.displayName]));
- // Filter to non-admin labels only
- const labels = allLabels.filter((l) => l.annotatorId !== "joey");
+ const labels = allLabels;
- console.log(` ${labels.length} human labels (non-admin)`);
+ console.log(` ${labels.length} human labels`);
console.log(` ${allParagraphs.length} paragraphs`);
- console.log(` ${nonAdminAnnotators.length} annotators`);
+ console.log(` ${allAnnotators.length} annotators`);
console.log(` ${allQuizSessions.length} quiz sessions`);
console.log(` ${allAdjudications.length} adjudications`);
@@ -108,7 +106,7 @@ async function main() {
// ── 3. Annotators JSON ──
console.log("\nExporting annotator profiles...");
- const annotatorProfiles = nonAdminAnnotators.map((a) => ({
+ const annotatorProfiles = allAnnotators.map((a: { id: string; displayName: string; onboardedAt: Date | null }) => ({
id: a.id,
displayName: a.displayName,
onboardedAt: a.onboardedAt?.toISOString() ?? null,
diff --git a/python/main.py b/python/main.py
index 2e3b83f..48a5f25 100644
--- a/python/main.py
+++ b/python/main.py
@@ -44,6 +44,35 @@ def cmd_finetune(args: argparse.Namespace) -> None:
train(config)
+def cmd_eval(args: argparse.Namespace) -> None:
+ from src.finetune.eval import EvalConfig, evaluate
+
+ benchmark_paths = {}
+ if args.benchmark:
+ for name, path in args.benchmark:
+ benchmark_paths[name] = path
+ else:
+ # Default benchmarks
+ benchmark_paths = {
+ "GPT-5.4": "../data/annotations/v2-bench/gpt-5.4.jsonl",
+ "Opus-4.6": "../data/annotations/v2-bench/opus-4.6.jsonl",
+ }
+
+ config = EvalConfig(
+ checkpoint_path=args.checkpoint,
+ paragraphs_path=args.paragraphs,
+ holdout_path=args.holdout,
+ benchmark_paths=benchmark_paths,
+ output_dir=args.output_dir,
+ max_seq_length=args.max_seq_length,
+ batch_size=args.batch_size,
+ specificity_head=args.spec_head,
+ spec_mlp_dim=args.spec_mlp_dim,
+ pooling=args.pooling,
+ )
+ evaluate(config)
+
+
def cmd_ablate(args: argparse.Namespace) -> None:
from src.common.config import FinetuneConfig
from src.finetune.train import ablate
@@ -94,10 +123,20 @@ def main() -> None:
ab.add_argument("--epochs", type=int, help="Override epochs per ablation run (default: config value)")
ab.set_defaults(func=cmd_ablate)
- # ── eval (placeholder) ──
- ev = sub.add_parser("eval", help="Evaluate a trained model")
- ev.add_argument("--config", required=True, help="Path to YAML config file")
- ev.set_defaults(func=lambda args: print("Evaluation not yet implemented."))
+ # ── eval ──
+ ev = sub.add_parser("eval", help="Evaluate a trained model on holdout set")
+ ev.add_argument("--checkpoint", required=True, help="Path to model checkpoint directory")
+ ev.add_argument("--paragraphs", default="../data/paragraphs/paragraphs-clean.patched.jsonl")
+ ev.add_argument("--holdout", default="../data/gold/v2-holdout-ids.json")
+ ev.add_argument("--benchmark", action="append", nargs=2, metavar=("NAME", "PATH"),
+ help="Benchmark reference: NAME PATH (can repeat)")
+ ev.add_argument("--output-dir", default="../results/eval")
+ ev.add_argument("--max-seq-length", type=int, default=512)
+ ev.add_argument("--batch-size", type=int, default=64)
+ ev.add_argument("--spec-head", default="independent", choices=["coral", "independent", "softmax"])
+ ev.add_argument("--spec-mlp-dim", type=int, default=256)
+ ev.add_argument("--pooling", default="attention", choices=["cls", "attention"])
+ ev.set_defaults(func=cmd_eval)
args = parser.parse_args()
args.func(args)
diff --git a/python/pyproject.toml b/python/pyproject.toml
index d71ba5d..235c9c3 100644
--- a/python/pyproject.toml
+++ b/python/pyproject.toml
@@ -15,6 +15,9 @@ dependencies = [
"unsloth==2026.3.11",
"coral-pytorch>=1.4.0",
"scikit-learn>=1.8.0",
+ "krippendorff>=0.8.2",
+ "matplotlib>=3.10.8",
+ "seaborn>=0.13.2",
]
[project.scripts]
diff --git a/python/scripts/generate-comparison-figures.py b/python/scripts/generate-comparison-figures.py
new file mode 100644
index 0000000..acdc5dc
--- /dev/null
+++ b/python/scripts/generate-comparison-figures.py
@@ -0,0 +1,226 @@
+"""Generate side-by-side comparison figures: CORAL baseline vs Independent threshold model."""
+
+import json
+from pathlib import Path
+
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+
+RESULTS_DIR = Path(__file__).resolve().parent.parent.parent / "results" / "eval"
+OUTPUT_DIR = RESULTS_DIR / "comparison"
+
+CATEGORIES = [
+ "Board Gov.",
+ "Incident Disc.",
+ "Mgmt Role",
+ "None/Other",
+ "Risk Mgmt Proc.",
+ "Strategy Int.",
+ "Third-Party",
+]
+SPEC_LABELS = ["L1: Generic", "L2: Domain", "L3: Firm-Spec.", "L4: Quantified"]
+
+
+def load_metrics(model_dir: str) -> dict:
+ with open(RESULTS_DIR / model_dir / "metrics.json") as f:
+ return json.load(f)
+
+
+def main():
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
+ sns.set_theme(style="whitegrid", font_scale=1.1)
+
+ coral = load_metrics("coral-baseline")
+ indep = load_metrics("iter1-independent")
+
+ # Use GPT-5.4 as the reference (1200 samples, complete)
+ coral_gpt = coral["best-base_weighted_ce-ep5_vs_GPT-5.4"]
+ indep_gpt = indep["iter1-independent_vs_GPT-5.4"]
+
+ # ── 1. Side-by-side per-class F1 (Category) ─────────────────────────────
+ # Keys come from eval.py: name.replace(" ", "").replace("/", "")[:8]
+ cat_keys = ["BoardGov", "Incident", "Manageme", "NoneOthe", "RiskMana", "Strategy", "Third-Pa"]
+
+ coral_cat_f1 = [coral_gpt.get(f"cat_f1_{k}", 0) for k in cat_keys]
+ indep_cat_f1 = [indep_gpt.get(f"cat_f1_{k}", 0) for k in cat_keys]
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))
+
+ x = np.arange(len(CATEGORIES))
+ width = 0.35
+ bars1 = ax1.bar(x - width/2, coral_cat_f1, width, label="CORAL (Epoch 5)", color="#DD8452", alpha=0.85)
+ bars2 = ax1.bar(x + width/2, indep_cat_f1, width, label="Independent (Epoch 8)", color="#4C72B0", alpha=0.85)
+ ax1.axhline(0.80, color="red", linestyle="--", alpha=0.5, label="Target (0.80)")
+ ax1.set_ylabel("F1 Score")
+ ax1.set_title("Category F1 by Class")
+ ax1.set_xticks(x)
+ ax1.set_xticklabels(CATEGORIES, rotation=25, ha="right")
+ ax1.set_ylim(0, 1.05)
+ ax1.legend(loc="lower right")
+
+ for bar, v in zip(bars1, coral_cat_f1):
+ ax1.text(bar.get_x() + bar.get_width()/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=7)
+ for bar, v in zip(bars2, indep_cat_f1):
+ ax1.text(bar.get_x() + bar.get_width()/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=7)
+
+ # ── Specificity F1 side-by-side ──────────────────────────────────────────
+ # Keys come from eval.py: name.replace(" ", "").replace(":", "")[:8]
+ spec_keys = ["L1Generi", "L2Domain", "L3Firm-S", "L4Quanti"]
+
+ coral_spec_f1 = [coral_gpt.get(f"spec_f1_{k}", 0) for k in spec_keys]
+ indep_spec_f1 = [indep_gpt.get(f"spec_f1_{k}", 0) for k in spec_keys]
+
+ x2 = np.arange(len(SPEC_LABELS))
+ bars3 = ax2.bar(x2 - width/2, coral_spec_f1, width, label="CORAL (Epoch 5)", color="#DD8452", alpha=0.85)
+ bars4 = ax2.bar(x2 + width/2, indep_spec_f1, width, label="Independent (Epoch 8)", color="#4C72B0", alpha=0.85)
+ ax2.axhline(0.80, color="red", linestyle="--", alpha=0.5, label="Target (0.80)")
+ ax2.set_ylabel("F1 Score")
+ ax2.set_title("Specificity F1 by Level")
+ ax2.set_xticks(x2)
+ ax2.set_xticklabels(SPEC_LABELS)
+ ax2.set_ylim(0, 1.05)
+ ax2.legend(loc="lower right")
+
+ for bar, v in zip(bars3, coral_spec_f1):
+ ax2.text(bar.get_x() + bar.get_width()/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=8)
+ for bar, v in zip(bars4, indep_spec_f1):
+ ax2.text(bar.get_x() + bar.get_width()/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=8)
+
+ plt.suptitle("CORAL Baseline vs Independent Thresholds — Holdout Set (vs GPT-5.4)", fontsize=14, fontweight="bold")
+ plt.tight_layout()
+ fig.savefig(OUTPUT_DIR / "coral_vs_independent_f1.png", dpi=200)
+ plt.close(fig)
+ print(f" Saved: coral_vs_independent_f1.png")
+
+ # ── 2. Summary metrics comparison ────────────────────────────────────────
+ metrics_to_compare = {
+ "Cat Macro F1": ("cat_macro_f1", "cat_macro_f1"),
+ "Spec Macro F1": ("spec_macro_f1", "spec_macro_f1"),
+ "Cat MCC": ("cat_mcc", "cat_mcc"),
+ "Spec MCC": ("spec_mcc", "spec_mcc"),
+ "Cat AUC": ("cat_auc", "cat_auc"),
+ "Spec AUC": ("spec_auc", "spec_auc"),
+ "Spec QWK": ("spec_qwk", "spec_qwk"),
+ "Cat Kripp α": ("cat_kripp_alpha", "cat_kripp_alpha"),
+ "Spec Kripp α": ("spec_kripp_alpha", "spec_kripp_alpha"),
+ }
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+ labels = list(metrics_to_compare.keys())
+ coral_vals = [coral_gpt.get(v[0], 0) for v in metrics_to_compare.values()]
+ indep_vals = [indep_gpt.get(v[1], 0) for v in metrics_to_compare.values()]
+
+ x = np.arange(len(labels))
+ width = 0.35
+ ax.bar(x - width/2, coral_vals, width, label="CORAL (Epoch 5)", color="#DD8452", alpha=0.85)
+ ax.bar(x + width/2, indep_vals, width, label="Independent (Epoch 8)", color="#4C72B0", alpha=0.85)
+ ax.axhline(0.80, color="red", linestyle="--", alpha=0.5)
+ ax.set_ylabel("Score")
+ ax.set_title("CORAL vs Independent — All Metrics (Holdout vs GPT-5.4)")
+ ax.set_xticks(x)
+ ax.set_xticklabels(labels, rotation=30, ha="right")
+ ax.set_ylim(0, 1.1)
+ ax.legend()
+
+ for i, (cv, iv) in enumerate(zip(coral_vals, indep_vals)):
+ ax.text(i - width/2, cv + 0.01, f"{cv:.3f}", ha="center", va="bottom", fontsize=7)
+ ax.text(i + width/2, iv + 0.01, f"{iv:.3f}", ha="center", va="bottom", fontsize=7)
+
+ plt.tight_layout()
+ fig.savefig(OUTPUT_DIR / "coral_vs_independent_all_metrics.png", dpi=200)
+ plt.close(fig)
+ print(f" Saved: coral_vs_independent_all_metrics.png")
+
+ # ── 3. Delta chart (improvement from CORAL → Independent) ────────────────
+ deltas = [iv - cv for cv, iv in zip(coral_vals, indep_vals)]
+ colors = ["#55a868" if d >= 0 else "#c44e52" for d in deltas]
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+ ax.barh(labels, deltas, color=colors, alpha=0.85)
+ ax.axvline(0, color="black", linewidth=0.8)
+ ax.set_xlabel("Improvement (Independent − CORAL)")
+ ax.set_title("Metric Improvement: Independent Thresholds over CORAL")
+ for i, (d, label) in enumerate(zip(deltas, labels)):
+ ax.text(d + 0.003 if d >= 0 else d - 0.003, i, f"{d:+.3f}",
+ va="center", ha="left" if d >= 0 else "right", fontsize=9)
+
+ plt.tight_layout()
+ fig.savefig(OUTPUT_DIR / "improvement_delta.png", dpi=200)
+ plt.close(fig)
+ print(f" Saved: improvement_delta.png")
+
+ # ── 4. Specificity confusion matrix side-by-side ─────────────────────────
+ spec_labels_short = ["L1", "L2", "L3", "L4"]
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(13, 5))
+
+ cm_coral = np.array(coral_gpt["spec_confusion_matrix"])
+ cm_indep = np.array(indep_gpt["spec_confusion_matrix"])
+
+ cm_coral_norm = cm_coral.astype(float) / cm_coral.sum(axis=1, keepdims=True).clip(min=1)
+ cm_indep_norm = cm_indep.astype(float) / cm_indep.sum(axis=1, keepdims=True).clip(min=1)
+
+ sns.heatmap(cm_coral_norm, annot=cm_coral, fmt="d", cmap="Oranges",
+ xticklabels=spec_labels_short, yticklabels=spec_labels_short,
+ ax=ax1, vmin=0, vmax=1, cbar=False)
+ ax1.set_title("CORAL (Epoch 5) — Spec F1=0.597")
+ ax1.set_xlabel("Predicted")
+ ax1.set_ylabel("GPT-5.4 Reference")
+
+ sns.heatmap(cm_indep_norm, annot=cm_indep, fmt="d", cmap="Blues",
+ xticklabels=spec_labels_short, yticklabels=spec_labels_short,
+ ax=ax2, vmin=0, vmax=1, cbar=False)
+ ax2.set_title("Independent (Epoch 8) — Spec F1=0.895")
+ ax2.set_xlabel("Predicted")
+ ax2.set_ylabel("GPT-5.4 Reference")
+
+ plt.suptitle("Specificity Confusion Matrices — CORAL vs Independent", fontsize=13, fontweight="bold")
+ plt.tight_layout()
+ fig.savefig(OUTPUT_DIR / "spec_confusion_comparison.png", dpi=200)
+ plt.close(fig)
+ print(f" Saved: spec_confusion_comparison.png")
+
+ # ── 5. Cost/speed comparison table figure ────────────────────────────────
+ fig, ax = plt.subplots(figsize=(10, 4))
+ ax.axis("off")
+
+ table_data = [
+ ["Metric", "CORAL (Ep5)", "Independent (Ep8)", "GPT-5.4 (API)", "Opus-4.6 (API)"],
+ ["Cat Macro F1", f"{coral_gpt['cat_macro_f1']:.4f}", f"{indep_gpt['cat_macro_f1']:.4f}", "—(reference)", "—(reference)"],
+ ["Spec Macro F1", f"{coral_gpt['spec_macro_f1']:.4f}", f"{indep_gpt['spec_macro_f1']:.4f}", "—(reference)", "—(reference)"],
+ ["Spec QWK", f"{coral_gpt['spec_qwk']:.4f}", f"{indep_gpt['spec_qwk']:.4f}", "—", "—"],
+ ["MCC (Cat)", f"{coral_gpt['cat_mcc']:.4f}", f"{indep_gpt['cat_mcc']:.4f}", "—", "—"],
+ ["Latency/sample", "5.6ms", "5.6ms", "~2,900ms", "~6,000ms"],
+ ["Cost/1M texts", "~$5", "~$5", "~$3,400", "~$5,000*"],
+ ["Reproducible", "Yes", "Yes", "No", "No"],
+ ]
+
+ table = ax.table(cellText=table_data[1:], colLabels=table_data[0],
+ cellLoc="center", loc="center")
+ table.auto_set_font_size(False)
+ table.set_fontsize(9)
+ table.scale(1, 1.5)
+
+ # Style header
+ for j in range(len(table_data[0])):
+ table[0, j].set_facecolor("#4C72B0")
+ table[0, j].set_text_props(color="white", fontweight="bold")
+
+ # Highlight best specialist column
+ for i in range(1, len(table_data)):
+ table[i, 2].set_facecolor("#d4edda")
+
+ ax.set_title("Model Comparison Summary", fontsize=13, fontweight="bold", pad=20)
+ plt.tight_layout()
+ fig.savefig(OUTPUT_DIR / "comparison_table.png", dpi=200)
+ plt.close(fig)
+ print(f" Saved: comparison_table.png")
+
+ print(f"\n All figures saved to {OUTPUT_DIR}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/python/src/finetune/eval.py b/python/src/finetune/eval.py
new file mode 100644
index 0000000..4a94a21
--- /dev/null
+++ b/python/src/finetune/eval.py
@@ -0,0 +1,724 @@
+"""Holdout evaluation for trained DualHeadModernBERT models.
+
+Loads a trained checkpoint, runs inference on the 1,200-paragraph holdout set,
+and compares predictions against proxy gold labels (GPT-5.4, Opus-4.6) or
+real human gold labels when available.
+
+Reports: macro/per-class F1, precision, recall, MCC, AUC (one-vs-rest),
+QWK, MAE, Krippendorff's alpha, confusion matrices, calibration (ECE),
+cost/latency comparison, and generates publication-ready figures.
+"""
+
+import json
+import time
+from dataclasses import dataclass
+from pathlib import Path
+
+import krippendorff
+import matplotlib
+matplotlib.use("Agg")
+import matplotlib.pyplot as plt
+import numpy as np
+import seaborn as sns
+import torch
+import torch.nn.functional as F
+from safetensors.torch import load_file
+from sklearn.metrics import (
+ f1_score,
+ precision_score,
+ recall_score,
+ matthews_corrcoef,
+ roc_auc_score,
+ cohen_kappa_score,
+ confusion_matrix,
+ classification_report,
+ mean_absolute_error,
+)
+from transformers import AutoModel, AutoTokenizer
+
+from .data import CATEGORIES, CAT2ID, NUM_CATEGORIES, NUM_SPECIFICITY
+from .model import DualHeadModernBERT, ordinal_predict, softmax_predict
+
+SPEC_LABELS = ["L1: Generic", "L2: Domain", "L3: Firm-Specific", "L4: Quantified"]
+
+
+@dataclass
+class EvalConfig:
+ """Configuration for holdout evaluation."""
+ checkpoint_path: str
+ paragraphs_path: str
+ holdout_path: str
+ benchmark_paths: dict[str, str] # name → path to benchmark JSONL
+ output_dir: str
+ max_seq_length: int = 512
+ batch_size: int = 64
+ # Architecture params (must match training)
+ specificity_head: str = "independent"
+ spec_mlp_dim: int = 256
+ pooling: str = "attention"
+
+
+def _load_jsonl(path: str | Path) -> list[dict]:
+ records = []
+ with open(path) as f:
+ for line in f:
+ line = line.strip()
+ if line:
+ records.append(json.loads(line))
+ return records
+
+
+def load_holdout_data(
+ paragraphs_path: str,
+ holdout_path: str,
+ benchmark_paths: dict[str, str],
+) -> dict:
+ """Load holdout paragraphs and benchmark labels, joined by paragraph ID."""
+ paragraphs = {p["id"]: p for p in _load_jsonl(paragraphs_path)}
+
+ with open(holdout_path) as f:
+ holdout_ids = json.load(f)
+
+ # Load benchmark labels
+ benchmarks = {}
+ for name, path in benchmark_paths.items():
+ labels = {r["paragraphId"]: r for r in _load_jsonl(path)}
+ benchmarks[name] = labels
+
+ # Build joined dataset
+ records = []
+ for pid in holdout_ids:
+ para = paragraphs.get(pid)
+ if para is None:
+ continue
+
+ bench_labels = {}
+ for name, labels in benchmarks.items():
+ if pid in labels:
+ lab = labels[pid]["label"]
+ bench_labels[name] = {
+ "category": lab["content_category"],
+ "specificity": lab["specificity_level"],
+ }
+
+ records.append({
+ "id": pid,
+ "text": para["text"],
+ "benchmark_labels": bench_labels,
+ })
+
+ return records
+
+
+def load_model(config: EvalConfig, device: torch.device) -> tuple:
+ """Load trained DualHeadModernBERT from checkpoint."""
+ checkpoint = Path(config.checkpoint_path)
+
+ tokenizer = AutoTokenizer.from_pretrained(str(checkpoint))
+
+ # Load backbone (we need the architecture, then overwrite weights)
+ try:
+ import flash_attn # noqa: F401
+ attn_impl = "flash_attention_2"
+ except ImportError:
+ attn_impl = "sdpa"
+
+ backbone = AutoModel.from_pretrained(
+ "answerdotai/ModernBERT-large",
+ trust_remote_code=True,
+ attn_implementation=attn_impl,
+ dtype=torch.bfloat16,
+ )
+ hidden_size = backbone.config.hidden_size
+
+ model = DualHeadModernBERT(
+ backbone=backbone,
+ hidden_size=hidden_size,
+ num_categories=NUM_CATEGORIES,
+ num_specificity=NUM_SPECIFICITY,
+ specificity_head_type=config.specificity_head,
+ spec_mlp_dim=config.spec_mlp_dim,
+ pooling=config.pooling,
+ )
+
+ # Load trained weights (filter out loss function buffers not needed for inference)
+ state_dict = load_file(str(checkpoint / "model.safetensors"))
+ model.load_state_dict(state_dict, strict=False)
+ model = model.to(device).to(torch.bfloat16)
+ model.eval()
+
+ return model, tokenizer
+
+
+@torch.no_grad()
+def run_inference(
+ model: DualHeadModernBERT,
+ tokenizer,
+ records: list[dict],
+ max_seq_length: int,
+ batch_size: int,
+ device: torch.device,
+ spec_head_type: str = "independent",
+) -> dict:
+ """Run model inference on holdout records. Returns predictions + timing."""
+ texts = [r["text"] for r in records]
+ all_cat_logits = []
+ all_spec_logits = []
+ total_time = 0.0
+
+ for i in range(0, len(texts), batch_size):
+ batch_texts = texts[i : i + batch_size]
+ encoded = tokenizer(
+ batch_texts,
+ truncation=True,
+ max_length=max_seq_length,
+ padding="longest",
+ return_tensors="pt",
+ ).to(device)
+
+ start = time.perf_counter()
+ outputs = model(
+ input_ids=encoded["input_ids"],
+ attention_mask=encoded["attention_mask"],
+ )
+ torch.cuda.synchronize() if device.type == "cuda" else None
+ total_time += time.perf_counter() - start
+
+ all_cat_logits.append(outputs["category_logits"].float().cpu())
+ all_spec_logits.append(outputs["specificity_logits"].float().cpu())
+
+ cat_logits = torch.cat(all_cat_logits, dim=0)
+ spec_logits = torch.cat(all_spec_logits, dim=0)
+
+ cat_probs = F.softmax(cat_logits, dim=1).numpy()
+ cat_preds = cat_logits.argmax(dim=1).numpy()
+
+ if spec_head_type == "softmax":
+ spec_preds = softmax_predict(spec_logits).numpy()
+ spec_probs = F.softmax(spec_logits, dim=1).numpy()
+ else:
+ spec_preds = ordinal_predict(spec_logits).numpy()
+ # Convert ordinal logits to class probs for AUC
+ spec_probs = _ordinal_to_class_probs(spec_logits).numpy()
+
+ return {
+ "cat_preds": cat_preds,
+ "cat_probs": cat_probs,
+ "cat_logits": cat_logits.numpy(),
+ "spec_preds": spec_preds,
+ "spec_probs": spec_probs,
+ "spec_logits": spec_logits.numpy(),
+ "total_time_s": total_time,
+ "num_samples": len(texts),
+ "avg_ms_per_sample": (total_time / len(texts)) * 1000,
+ }
+
+
+def _ordinal_to_class_probs(logits: torch.Tensor) -> torch.Tensor:
+ """Convert ordinal threshold logits to per-class probabilities.
+
+ P(class=0) = 1 - P(>=1)
+ P(class=k) = P(>=k) - P(>=k+1) for 0 < k < K-1
+ P(class=K-1) = P(>=K-1)
+ """
+ probs = torch.sigmoid(logits) # (N, K-1)
+ num_classes = probs.shape[1] + 1
+ class_probs = torch.zeros(probs.shape[0], num_classes)
+
+ class_probs[:, 0] = 1.0 - probs[:, 0]
+ for k in range(1, num_classes - 1):
+ class_probs[:, k] = probs[:, k - 1] - probs[:, k]
+ class_probs[:, -1] = probs[:, -1]
+
+ # Clamp for numerical stability
+ class_probs = class_probs.clamp(min=0.0)
+ class_probs = class_probs / class_probs.sum(dim=1, keepdim=True)
+
+ return class_probs
+
+
+def compute_ece(probs: np.ndarray, labels: np.ndarray, n_bins: int = 15) -> tuple[float, dict]:
+ """Compute Expected Calibration Error and per-bin data for reliability diagram."""
+ confidences = np.max(probs, axis=1)
+ predictions = np.argmax(probs, axis=1)
+ accuracies = (predictions == labels).astype(float)
+
+ bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
+ bin_data = {"bin_centers": [], "bin_accs": [], "bin_confs": [], "bin_counts": []}
+ ece = 0.0
+
+ for i in range(n_bins):
+ mask = (confidences > bin_edges[i]) & (confidences <= bin_edges[i + 1])
+ count = mask.sum()
+ if count > 0:
+ avg_conf = confidences[mask].mean()
+ avg_acc = accuracies[mask].mean()
+ ece += (count / len(labels)) * abs(avg_acc - avg_conf)
+ bin_data["bin_centers"].append((bin_edges[i] + bin_edges[i + 1]) / 2)
+ bin_data["bin_accs"].append(avg_acc)
+ bin_data["bin_confs"].append(avg_conf)
+ bin_data["bin_counts"].append(int(count))
+
+ return ece, bin_data
+
+
+def compute_all_metrics(
+ preds: np.ndarray,
+ labels: np.ndarray,
+ probs: np.ndarray,
+ label_names: list[str],
+ task_name: str,
+ is_ordinal: bool = False,
+) -> dict:
+ """Compute comprehensive metrics for a single classification task."""
+ num_classes = len(label_names)
+
+ # Basic classification metrics
+ macro_f1 = f1_score(labels, preds, average="macro", labels=range(num_classes))
+ weighted_f1 = f1_score(labels, preds, average="weighted", labels=range(num_classes))
+ per_class_f1 = f1_score(labels, preds, average=None, labels=range(num_classes))
+ per_class_precision = precision_score(labels, preds, average=None, labels=range(num_classes), zero_division=0)
+ per_class_recall = recall_score(labels, preds, average=None, labels=range(num_classes), zero_division=0)
+ macro_precision = precision_score(labels, preds, average="macro", labels=range(num_classes), zero_division=0)
+ macro_recall = recall_score(labels, preds, average="macro", labels=range(num_classes), zero_division=0)
+
+ # MCC (multiclass)
+ mcc = matthews_corrcoef(labels, preds)
+
+ # AUC (one-vs-rest, macro)
+ try:
+ auc = roc_auc_score(labels, probs, multi_class="ovr", average="macro", labels=range(num_classes))
+ except ValueError:
+ auc = float("nan")
+
+ # Confusion matrix
+ cm = confusion_matrix(labels, preds, labels=range(num_classes))
+
+ # Calibration
+ ece, bin_data = compute_ece(probs, labels)
+
+ metrics = {
+ f"{task_name}_macro_f1": macro_f1,
+ f"{task_name}_weighted_f1": weighted_f1,
+ f"{task_name}_macro_precision": macro_precision,
+ f"{task_name}_macro_recall": macro_recall,
+ f"{task_name}_mcc": mcc,
+ f"{task_name}_auc": auc,
+ f"{task_name}_ece": ece,
+ f"{task_name}_confusion_matrix": cm.tolist(),
+ f"{task_name}_calibration": bin_data,
+ }
+
+ for i, name in enumerate(label_names):
+ short = name.replace(" ", "").replace("/", "").replace(":", "")[:8]
+ metrics[f"{task_name}_f1_{short}"] = per_class_f1[i]
+ metrics[f"{task_name}_prec_{short}"] = per_class_precision[i]
+ metrics[f"{task_name}_recall_{short}"] = per_class_recall[i]
+
+ # Ordinal-specific metrics
+ if is_ordinal:
+ metrics[f"{task_name}_qwk"] = cohen_kappa_score(labels, preds, weights="quadratic")
+ metrics[f"{task_name}_mae"] = mean_absolute_error(labels, preds)
+
+ # Krippendorff's alpha (treat model + reference as two coders)
+ reliability_data = np.array([labels, preds])
+ if is_ordinal:
+ metrics[f"{task_name}_kripp_alpha"] = krippendorff.alpha(
+ reliability_data=reliability_data, level_of_measurement="ordinal"
+ )
+ else:
+ metrics[f"{task_name}_kripp_alpha"] = krippendorff.alpha(
+ reliability_data=reliability_data, level_of_measurement="nominal"
+ )
+
+ return metrics
+
+
+def generate_figures(
+ results: dict,
+ output_dir: Path,
+ model_name: str,
+ ref_name: str,
+) -> list[str]:
+ """Generate publication-ready figures. Returns list of saved file paths."""
+ figures_dir = output_dir / "figures"
+ figures_dir.mkdir(parents=True, exist_ok=True)
+ saved = []
+
+ sns.set_theme(style="whitegrid", font_scale=1.1)
+ palette = sns.color_palette("viridis", 7)
+
+ # 1. Category confusion matrix
+ cm = np.array(results["cat_confusion_matrix"])
+ fig, ax = plt.subplots(figsize=(10, 8))
+ cm_norm = cm.astype(float) / cm.sum(axis=1, keepdims=True)
+ sns.heatmap(
+ cm_norm, annot=cm, fmt="d", cmap="Blues",
+ xticklabels=[c[:12] for c in CATEGORIES],
+ yticklabels=[c[:12] for c in CATEGORIES],
+ ax=ax, vmin=0, vmax=1, cbar_kws={"label": "Proportion"},
+ )
+ ax.set_xlabel(f"Predicted ({model_name})")
+ ax.set_ylabel(f"Reference ({ref_name})")
+ ax.set_title(f"Category Confusion Matrix — {model_name} vs {ref_name}")
+ plt.tight_layout()
+ path = figures_dir / f"confusion_cat_{ref_name.lower().replace(' ', '_')}.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ # 2. Specificity confusion matrix
+ cm_spec = np.array(results["spec_confusion_matrix"])
+ fig, ax = plt.subplots(figsize=(7, 6))
+ cm_spec_norm = cm_spec.astype(float) / cm_spec.sum(axis=1, keepdims=True).clip(min=1)
+ sns.heatmap(
+ cm_spec_norm, annot=cm_spec, fmt="d", cmap="Oranges",
+ xticklabels=SPEC_LABELS,
+ yticklabels=SPEC_LABELS,
+ ax=ax, vmin=0, vmax=1, cbar_kws={"label": "Proportion"},
+ )
+ ax.set_xlabel(f"Predicted ({model_name})")
+ ax.set_ylabel(f"Reference ({ref_name})")
+ ax.set_title(f"Specificity Confusion Matrix — {model_name} vs {ref_name}")
+ plt.tight_layout()
+ path = figures_dir / f"confusion_spec_{ref_name.lower().replace(' ', '_')}.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ # 3. Calibration reliability diagram (category)
+ cal = results["cat_calibration"]
+ fig, ax = plt.subplots(figsize=(6, 6))
+ ax.bar(cal["bin_centers"], cal["bin_accs"], width=0.06, alpha=0.7, label="Accuracy", color="#4C72B0")
+ ax.plot([0, 1], [0, 1], "k--", alpha=0.5, label="Perfect calibration")
+ ax.set_xlabel("Confidence")
+ ax.set_ylabel("Accuracy")
+ ax.set_title(f"Category Calibration — ECE={results['cat_ece']:.4f}")
+ ax.legend()
+ ax.set_xlim(0, 1)
+ ax.set_ylim(0, 1)
+ plt.tight_layout()
+ path = figures_dir / f"calibration_cat_{ref_name.lower().replace(' ', '_')}.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ # 4. Per-class F1 bar chart
+ cat_f1s = [results.get(f"cat_f1_{c.replace(' ', '').replace('/', '')[:8]}", 0) for c in CATEGORIES]
+ spec_f1s = [results.get(f"spec_f1_{s.replace(' ', '').replace(':', '')[:8]}", 0) for s in SPEC_LABELS]
+
+ fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))
+
+ bars = ax1.barh([c[:15] for c in CATEGORIES], cat_f1s, color=palette)
+ ax1.axvline(0.80, color="red", linestyle="--", alpha=0.7, label="Target (0.80)")
+ ax1.set_xlabel("F1 Score")
+ ax1.set_title(f"Per-Category F1 ({model_name} vs {ref_name})")
+ ax1.set_xlim(0, 1)
+ ax1.legend()
+ for bar, v in zip(bars, cat_f1s):
+ ax1.text(v + 0.01, bar.get_y() + bar.get_height() / 2, f"{v:.3f}", va="center", fontsize=9)
+
+ bars2 = ax2.barh(SPEC_LABELS, spec_f1s, color=sns.color_palette("YlOrRd", 4))
+ ax2.axvline(0.80, color="red", linestyle="--", alpha=0.7, label="Target (0.80)")
+ ax2.set_xlabel("F1 Score")
+ ax2.set_title(f"Per-Level Specificity F1 ({model_name} vs {ref_name})")
+ ax2.set_xlim(0, 1)
+ ax2.legend()
+ for bar, v in zip(bars2, spec_f1s):
+ ax2.text(v + 0.01, bar.get_y() + bar.get_height() / 2, f"{v:.3f}", va="center", fontsize=9)
+
+ plt.tight_layout()
+ path = figures_dir / f"per_class_f1_{ref_name.lower().replace(' ', '_')}.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ return saved
+
+
+def generate_comparison_figures(
+ all_results: dict[str, dict],
+ output_dir: Path,
+) -> list[str]:
+ """Generate figures comparing multiple models/references."""
+ figures_dir = output_dir / "figures"
+ figures_dir.mkdir(parents=True, exist_ok=True)
+ saved = []
+
+ sns.set_theme(style="whitegrid", font_scale=1.1)
+
+ # 1. Model comparison bar chart (if we have CORAL vs Independent)
+ model_names = list(all_results.keys())
+ if len(model_names) >= 2:
+ metric_keys = ["cat_macro_f1", "spec_macro_f1", "cat_mcc", "spec_qwk", "spec_mae"]
+ metric_labels = ["Cat Macro F1", "Spec Macro F1", "Cat MCC", "Spec QWK", "Spec MAE"]
+
+ fig, ax = plt.subplots(figsize=(12, 6))
+ x = np.arange(len(metric_labels))
+ width = 0.8 / len(model_names)
+
+ for i, name in enumerate(model_names):
+ vals = []
+ for k in metric_keys:
+ v = all_results[name].get(k, 0)
+ vals.append(v if not np.isnan(v) else 0)
+ bars = ax.bar(x + i * width - 0.4 + width / 2, vals, width, label=name, alpha=0.85)
+ for bar, v in zip(bars, vals):
+ ax.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + 0.01,
+ f"{v:.3f}", ha="center", va="bottom", fontsize=8)
+
+ ax.set_xticks(x)
+ ax.set_xticklabels(metric_labels)
+ ax.set_ylabel("Score")
+ ax.set_title("Model Comparison — All Metrics")
+ ax.legend()
+ ax.axhline(0.80, color="red", linestyle="--", alpha=0.5, label="F1 Target")
+ plt.tight_layout()
+ path = figures_dir / "model_comparison.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ # 2. Speed/cost comparison
+ speed_data = {}
+ for name, res in all_results.items():
+ if "avg_ms_per_sample" in res:
+ speed_data[name] = res["avg_ms_per_sample"]
+
+ if speed_data:
+ # Add GenAI panel reference speeds from docs
+ genai_speeds = {
+ "GPT-5.4 (API)": 2900, # from provenance data
+ "Opus-4.6 (API)": 6000, # from provenance data
+ }
+
+ fig, ax = plt.subplots(figsize=(10, 5))
+ all_names = list(speed_data.keys()) + list(genai_speeds.keys())
+ all_speeds = list(speed_data.values()) + list(genai_speeds.values())
+ colors = ["#4C72B0"] * len(speed_data) + ["#DD8452"] * len(genai_speeds)
+
+ bars = ax.barh(all_names, all_speeds, color=colors)
+ ax.set_xlabel("Latency (ms per paragraph)")
+ ax.set_title("Inference Speed Comparison")
+ ax.set_xscale("log")
+ for bar, v in zip(bars, all_speeds):
+ ax.text(v * 1.1, bar.get_y() + bar.get_height() / 2,
+ f"{v:.1f}ms", va="center", fontsize=9)
+
+ plt.tight_layout()
+ path = figures_dir / "speed_comparison.png"
+ fig.savefig(path, dpi=150)
+ plt.close(fig)
+ saved.append(str(path))
+
+ return saved
+
+
+def format_report(
+ model_name: str,
+ ref_name: str,
+ metrics: dict,
+ inference_stats: dict,
+) -> str:
+ """Format a human-readable evaluation report."""
+ lines = []
+ lines.append(f"\n{'='*70}")
+ lines.append(f" HOLDOUT EVALUATION: {model_name} vs {ref_name}")
+ lines.append(f"{'='*70}\n")
+
+ lines.append(f" Samples evaluated: {inference_stats['num_samples']}")
+ lines.append(f" Total inference time: {inference_stats['total_time_s']:.2f}s")
+ lines.append(f" Avg latency: {inference_stats['avg_ms_per_sample']:.2f}ms/sample")
+ lines.append(f" Throughput: {1000 / inference_stats['avg_ms_per_sample']:.0f} samples/sec\n")
+
+ # Category metrics
+ lines.append(f" {'─'*50}")
+ lines.append(f" CATEGORY CLASSIFICATION")
+ lines.append(f" {'─'*50}")
+ lines.append(f" Macro F1: {metrics['cat_macro_f1']:.4f} {'✓' if metrics['cat_macro_f1'] >= 0.80 else '✗'} (target: 0.80)")
+ lines.append(f" Weighted F1: {metrics['cat_weighted_f1']:.4f}")
+ lines.append(f" Macro Prec: {metrics['cat_macro_precision']:.4f}")
+ lines.append(f" Macro Recall: {metrics['cat_macro_recall']:.4f}")
+ lines.append(f" MCC: {metrics['cat_mcc']:.4f}")
+ lines.append(f" AUC (OvR): {metrics['cat_auc']:.4f}")
+ lines.append(f" ECE: {metrics['cat_ece']:.4f}")
+ lines.append(f" Kripp Alpha: {metrics['cat_kripp_alpha']:.4f}")
+ lines.append("")
+
+ lines.append(f" {'Category':<25} {'F1':>8} {'Prec':>8} {'Recall':>8}")
+ lines.append(f" {'-'*25} {'-'*8} {'-'*8} {'-'*8}")
+ for c in CATEGORIES:
+ short = c.replace(" ", "").replace("/", "")[:8]
+ f1 = metrics.get(f"cat_f1_{short}", 0)
+ prec = metrics.get(f"cat_prec_{short}", 0)
+ rec = metrics.get(f"cat_recall_{short}", 0)
+ lines.append(f" {c:<25} {f1:>8.4f} {prec:>8.4f} {rec:>8.4f}")
+
+ # Specificity metrics
+ lines.append(f"\n {'─'*50}")
+ lines.append(f" SPECIFICITY CLASSIFICATION")
+ lines.append(f" {'─'*50}")
+ lines.append(f" Macro F1: {metrics['spec_macro_f1']:.4f} {'✓' if metrics['spec_macro_f1'] >= 0.80 else '✗'} (target: 0.80)")
+ lines.append(f" Weighted F1: {metrics['spec_weighted_f1']:.4f}")
+ lines.append(f" Macro Prec: {metrics['spec_macro_precision']:.4f}")
+ lines.append(f" Macro Recall: {metrics['spec_macro_recall']:.4f}")
+ lines.append(f" MCC: {metrics['spec_mcc']:.4f}")
+ lines.append(f" AUC (OvR): {metrics['spec_auc']:.4f}")
+ lines.append(f" QWK: {metrics['spec_qwk']:.4f}")
+ lines.append(f" MAE: {metrics['spec_mae']:.4f}")
+ lines.append(f" ECE: {metrics['spec_ece']:.4f}")
+ lines.append(f" Kripp Alpha: {metrics['spec_kripp_alpha']:.4f}")
+ lines.append("")
+
+ lines.append(f" {'Level':<25} {'F1':>8} {'Prec':>8} {'Recall':>8}")
+ lines.append(f" {'-'*25} {'-'*8} {'-'*8} {'-'*8}")
+ for s in SPEC_LABELS:
+ short = s.replace(" ", "").replace(":", "")[:8]
+ f1 = metrics.get(f"spec_f1_{short}", 0)
+ prec = metrics.get(f"spec_prec_{short}", 0)
+ rec = metrics.get(f"spec_recall_{short}", 0)
+ lines.append(f" {s:<25} {f1:>8.4f} {prec:>8.4f} {rec:>8.4f}")
+
+ lines.append(f"\n{'='*70}\n")
+
+ return "\n".join(lines)
+
+
+def evaluate(config: EvalConfig) -> dict:
+ """Run full holdout evaluation. Returns all metrics + generates figures."""
+ output_dir = Path(config.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"\n Device: {device}")
+
+ # Load holdout data
+ print(" Loading holdout data...")
+ records = load_holdout_data(
+ config.paragraphs_path,
+ config.holdout_path,
+ config.benchmark_paths,
+ )
+ print(f" Holdout paragraphs: {len(records)}")
+
+ # Load model
+ print(f" Loading model from {config.checkpoint_path}...")
+ model, tokenizer = load_model(config, device)
+ param_count = sum(p.numel() for p in model.parameters()) / 1e6
+ print(f" Model parameters: {param_count:.0f}M")
+
+ # Run inference
+ print(" Running inference...")
+ inference = run_inference(
+ model, tokenizer, records,
+ config.max_seq_length, config.batch_size,
+ device, config.specificity_head,
+ )
+ print(f" Inference complete: {inference['total_time_s']:.2f}s ({inference['avg_ms_per_sample']:.2f}ms/sample)")
+
+ # Evaluate against each benchmark
+ all_results = {}
+ all_figures = []
+
+ model_name = Path(config.checkpoint_path).parent.name
+ if model_name == "final":
+ model_name = Path(config.checkpoint_path).parent.parent.name
+
+ for ref_name, ref_path in config.benchmark_paths.items():
+ print(f"\n Evaluating against {ref_name}...")
+
+ # Get reference labels for paragraphs that have them
+ cat_labels = []
+ spec_labels = []
+ cat_preds = []
+ spec_preds = []
+ cat_probs = []
+ spec_probs = []
+
+ for i, rec in enumerate(records):
+ bench = rec["benchmark_labels"].get(ref_name)
+ if bench is None:
+ continue
+ cat_labels.append(CAT2ID[bench["category"]])
+ spec_labels.append(bench["specificity"] - 1) # 0-indexed
+ cat_preds.append(inference["cat_preds"][i])
+ spec_preds.append(inference["spec_preds"][i])
+ cat_probs.append(inference["cat_probs"][i])
+ spec_probs.append(inference["spec_probs"][i])
+
+ cat_labels = np.array(cat_labels)
+ spec_labels = np.array(spec_labels)
+ cat_preds = np.array(cat_preds)
+ spec_preds = np.array(spec_preds)
+ cat_probs = np.array(cat_probs)
+ spec_probs = np.array(spec_probs)
+
+ print(f" Matched samples: {len(cat_labels)}")
+
+ # Compute metrics
+ cat_metrics = compute_all_metrics(
+ cat_preds, cat_labels, cat_probs, CATEGORIES, "cat", is_ordinal=False
+ )
+ spec_metrics = compute_all_metrics(
+ spec_preds, spec_labels, spec_probs, SPEC_LABELS, "spec", is_ordinal=True
+ )
+
+ combined = {**cat_metrics, **spec_metrics, **inference}
+ combined["combined_macro_f1"] = (combined["cat_macro_f1"] + combined["spec_macro_f1"]) / 2
+
+ # Print report
+ report = format_report(model_name, ref_name, combined, inference)
+ print(report)
+
+ # Save report
+ report_path = output_dir / f"report_{ref_name.lower().replace(' ', '_').replace('.', '')}.txt"
+ with open(report_path, "w") as f:
+ f.write(report)
+
+ # Generate figures
+ figs = generate_figures(combined, output_dir, model_name, ref_name)
+ all_figures.extend(figs)
+
+ all_results[f"{model_name}_vs_{ref_name}"] = combined
+
+ # Cross-model comparison figures
+ comp_figs = generate_comparison_figures(all_results, output_dir)
+ all_figures.extend(comp_figs)
+
+ # Save all metrics as JSON
+ serializable = {}
+ for k, v in all_results.items():
+ serializable[k] = {
+ mk: mv for mk, mv in v.items()
+ if isinstance(mv, (int, float, str, list, bool))
+ }
+ metrics_path = output_dir / "metrics.json"
+ with open(metrics_path, "w") as f:
+ json.dump(serializable, f, indent=2, default=str)
+
+ print(f"\n Results saved to {output_dir}")
+ print(f" Figures: {len(all_figures)} generated")
+ for fig_path in all_figures:
+ print(f" {fig_path}")
+
+ return all_results
+
+
+def evaluate_comparison(
+ configs: list[EvalConfig],
+ output_dir: str,
+) -> dict:
+ """Evaluate multiple model checkpoints and generate cross-model comparisons."""
+ output_dir = Path(output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ all_results = {}
+ for config in configs:
+ results = evaluate(config)
+ all_results.update(results)
+
+ # Generate comparison figures across all models
+ comp_figs = generate_comparison_figures(all_results, output_dir)
+ print(f"\n Comparison figures: {len(comp_figs)}")
+
+ return all_results
diff --git a/results/eval/comparison/comparison_table.png b/results/eval/comparison/comparison_table.png
new file mode 100644
index 0000000000000000000000000000000000000000..c848429146a24348a0c700fd490ba35f39adf629
GIT binary patch
literal 103751
zcmeFZ_g9l?*ES3~!YBediV6sfil|hjD%A=iO?n5F-dkv)*%?7VsnS7uXi5tm6%`>s
zsG)=aiqsI05=tQO?aR5J=Uwaj1KuCrb!ROnl`Ci2``E{E>@)M?uBHm>0j>iK3=FKQ
zw{Pk&Ffey9Fzo&NPbPRJU~t6czQXw
zxk`v$x+r>C_-}g@%H2y=Ow8qfKOySoX(#s4)VmX|vfur-FjvVGG~#
zKR5R{39|)cX8&{hWq6f==YO8S{o$+3|L57&zZg;f=NSY4f8S*I?;;=p@&Aj2=%4;I
z+{o)6f~>y2>>hk?m8!Vgv`g1o9}OtQ1qE(SCeX&8u%{^QemfIo-motP;7?ioZV6$n_H?=G{!RXFwwrpo*4T)1$dt*uSx>eYvjpFGhowNQ|9
z8#UX@%%)dtQYq@|^*b37{`?)-;|I#n)H(6JGlhnwjW>(0`Ol9wnDPa!zK$0*{_$%w
zJDdOP*|QGQovBIUHenAQJdkwkJ5LH+=}4%~`7lO&Z`HEeDYt0<{nInHbS*VCT@#a}
zNFKS|vug47?Qbs4_2xxc1TNizoB0pI9%Ok$rF5$I!oA_E2yPicC8e?16@P#K;R=V|
zv(AS{F4?plC3)gRG@ndIrd;;8uBAk;;0>6+G&wb8@tMGA7^x7jX8-NgDdhb(9k%qP
z7CtU{zjtdCqVHkT-&wmM^!GzHxW3snPfyPZ{}tOjyxVYvDtt%6yIbdR%zS|vh8F&}
zo;`cUF}#Bv3+~N|QWPv7GHVstTO!=r+FCoE77&}3W|2wJ)zLxW71qy7NR&sL{>F<9
zp;OP~;M(rbj4K`EWPRpme$_}h5zKxNKE(O2OwHoNnkr0rj~>mgTc}kF+FH|hb}o{a
zm#=UeQ|x$i>4wtwuP0$)Va^0g*xy|B#J7jfD4St&Cr%e?XFXq=AM&b6_PDi4BbUa-
z##)yP|NZyX=I5-vtTH1RZ{D1E_VnrbD_4w@rCoZa(?d!~b-u)+kh#7KL3HW>Z82mw
z;6gGoZHDTnZyq5JwmV?TL-IZx{9@+2BShlX+lu`*a$Zv}(?WLq>n2&brTgmrS5_ey
z^Y1r5UtV8NNKa2Usj$yXut2Lq;KepIHIaPMgVs0;Mi`IV;?rEP&f6PH=&3}zzPGnn
zo`r_udV6(WzkXf2P)n9{8J3iC?E6+sM~1*qa`KNZh_Q!FHOE9u2IWT^gX)@(Hgj@v
z4!^ew?I?>EF}3Kc`dG8rxTR@qY%Gnte&dGJ{ZD_~QdX8Fl~}bzqQ5^oJ|CZc#Y;K-
zm^i?WOBddW6ATLv*G&*Lm-PE}4}!O*0bgG)PhaiTGO2dYglNlKT}9~^o9I)9D|`Pr
zd>&%4;kZ`>DCPzt-EyJb+S(qyxl*V8w3OHyp0F
zuRxa)ue34#1Y=U^m`EH$^=fd*`4nlUT+W>9FD!H+RymF}HsJ2vdyy*V>j(id{OVG`2YYeY9>!(l%6Ie(2$Pv_<5pO#q{A~{pkJ^byIaD*N%;mr+s|OD%{FWYswzT5$18;AISJ-!-rT|#wX{8$&
z7{tF+&U*a#@lhBF#Mj(&Zr69*DBbwBjrQ9*i=z$RKR)lrO-Q>88#OYAvhnmpDedHQ
z2pJd=UPMM3tS^wH9(?|@KVLh`%*96qFbtxj6t-E;JkM#MgiF%?Rl99@$MUC7*Xz6&
z>sK_3O|hM}Lx3LSitOigWEtnduBoN2%p26N+-|p{cQz*2>Dw##u&5}*J9okm+|JeD
zjyQRxXJN74c}QF%UD2F~bu;wzEQ8?CZwd}#Wv
z@coV6T#er0N+)l~^?l6Dg^;XVvZ(x$rX6%L9g2gBZmxPagiG9;H~AkvT>bL(D>C=4
zSEp_qJ#k{@*9L*@B$8vh-I!T
zp4lB`1C}A?*b&e0M*5m567#a2KmQ2~3KE$fnC;Cg%*gm)n+z9}ZNGc>!E8?sZM|vN
zrM&3c6DI46NDl5W{h5IPe25o7qaJmx-+TF|Do$L5akXcb2zw}aG?oUiErnjS&&kal
zR`7+lk(vedhX7R**$6sZQba<3yUK;QI%pL&^v3}Ig;S@lmD_goz>K{I%zfkPzWN3R
z)~5w7t5U|A#%%p9EYifq#Zeo-?juOskt{PnbZaVx$`7^GbFsSy4sg<
z?a9dS=)H?GkRv)qMsFTiS&@dPKEF7Erc@48ShxOFH?5oa+#c*91~zot#1O
zIXF1bns&F8ijCfX`o6Mn-@ZFqS_6$iTar!$;Yz0g&BdnOMvP(Uf&q7LPY;6P_7LRp
zHc-IsXlhom%+~;n@}PgbIER|)I#oA)fQ2P4At4X)%6?_4BT3RB+O_&SQ^V3%?&Xyg
zubMHnrNr0S9ur?@VNuFh?k*6qT3c(B-k+^8)F0tuj)U*+PmcP&c=zt&=GK;woE*`1
zlfFJy8n95Kf*nN{Hn+0}2L}UNUl$A<2|o*EN=tzn8!u+*TD}bnE9>#&=~PE@-nsNZ
z9bb}n0&LXkd^z6n2F|F3dX|b`{
zz(QaluCp8y?y6gC6w=bt3Zl*C(ICfMn0S9qPR>GD442u2H}38P1TB3zjxn#RYMw|`
z7iies*+OPZx_VKyt+SKx`iu(d=eNTm#$`7rzDD%QE!JnT@yJ$F(U9M1=(!6WNm95n
zVI?KY+1Xh~VpTDMt(z;;L}v-#1+xUk)v@5M4%Z~8n8G7
zm!F)TCOj3lD$gS=&(AwYa7rjc<>~wNy_wp2F5Mosz;#zdEEFm%
zoou;V#*$_#Bq-Q}r1MzFt~o-d&41Nb0K=Rq#*TWDH*Cv|fu^N=jpB1VA}{7LTJKN$
z8KbodQ*;6f%4SDNxE#3gj@SR^VHky6;0^^lM%Y-IFb}pZJ`3ygk(saW;}d3*$!@Y+
zLwvoylR`GUmhYradE9*QpOG=$Tq%w`d^C4cn4o5=RW>WGga2c
zKLz3xDqGD+t}p-s0a96Kp$bN04DjK3;zVnEd*2Z)`OK7*li0OM*SivzE@?+8ti2&Q
z>l*ez3IiNLd2Xh-2s#dwnET-721-M;BjaAZ%Gg?LvIIguIb4#FaTTw$TOTiKK1ZPS
z=4oMDACIdC2RAM>tXuFX1`Za}gLgKkU0DZTg(+n%fb2z6`wc4t)?MqS1vHZMG9MkS
z^IfdUt`rR~zs5tSQDdT_v}z)^piT<`$u%S1?rDn?N@Y7;t(Ts-vEc^&QIfg{$g6#Ki0H5=1=Ps}G8(KOn|7>+Ft_#@>4oz=`PtCv%csTJ9#3dx8
z9D4Y%QIy*d4G4QeDpfy5zubQTc1{z|A!JxmQ5TP*4ENuUN4A6^MB!h`DfBRltB!Di
zsD@WeXghv&tjZ-B((WEsUQ|?*FL)!9P-0#P)!Px0U~UW;E}C1~xy#Tp$OgNOK>hXW
zkDIGB62)&pYV|@gQs)Xw1^M|uQ`ThfT(Ycm)Ng>iFu}T}R=JWeOA{@HQ1+zBDf0fy
zkDs6x+>gEy#?P-T`@ynPBB#H9&jf~V-bj;CNmVN+rXDK(t~LQzyjw9UZ`!@
z9-;`Dh%L44NXmgX9iU(*Njb$g4>W-+#9Bt_ZJS%xc;+BV8>*NZvaUJw$evQU`5pkl?Bg{Fl-gK8%)~Ar$RB966U>m%L*dt++KWQK#2{h$#S0se2z7o;`ow
zHZWi)Vp7os)reLBIfjOk
zO<6NcOKd|_J_hLqyQ
z?PWD7mtj3tF3B#SfO#ss{x_o(Y5C?R6g7I{{y+chioKwV4C8{J;GLbF9mw8-kKY4{KJ!|I
zR~wiGPs#3x7k;)|g-;)ast61WL;<;+8{6H%nzp!ss6%E?qmmH`4WZl*&{G6@s|~IP
zyA5SOe}f!(9FC|J`pK9B;SD4nY|p{iL+
zC%eQ$jdOxzlE#7a)1Rlshb1Dm%m0?FQ+Ea=eWms`$W!q#F)=)Z;h>cc8IF@D=iQTG
z5N5E+(m2>oXC1k`?ChBjkM{Zy6<69N*q&SkB=#x{o(H8nKX-xs#Ko&H@Jdse%a;cQ
zuAWvcC@5gFn>Q3tp{9|FDW<$lJG4Sl8L+x5Mbkp6@87@wjd=0+M2VK-2?fx+;wPNl
zBmjKRyScg5d8_To&8f8S{^XiiccBfSsR%+>E^uLIYZ3d$7c~HYWZ8KT|LE0j73$`6
zIx<5<-8WyY;SY83_4TO$w}Oe4>nEYMM6W>Si|8SshDv1khgEyMI6GTl8N7L`+zwAb
z7Fs(?+2O-OMxU`th#OADz-m&>b-_-|VJ7Fp=O%54z&wn6$7pfsTRtp*NH*bp|Tv3xNnmaVzR+XQR3xjJMjmr&KK(!
z8o2U={QUMB;j~ljiCLgcoS^Qp=`IEM`(F?iZnrh|7e97c&Nud-h=@wuh$jd=K;4hF
zLtz|3^F3J05IS{I6p!H0`0}ZLiN~pv-t&By-LI-d@m3~R8mFyNV9hF677$Yd*{9lw
zZhO>;`6{+KIZP$@ks}jj$Eve9V{)!wR6)b*=KWTzu0x=3aq^wd++!mf<*CCY1aa$Z
zB6|**a$mgn(EP&|)LJmP-ry(@V&E2eh2nTvE8L}twX4pyt@Mxkz*6v`rF^{CB{g7X
zg_T!S9G#VZ^ytxnKmUBCmojt@z<~xr2_dxhGhLZJWI3)A-)}>9;H35rb7pnhAO8mr
zoLq%3D0_##1^Ukd5iF~aVNRgPeI#snYGTtlTdDWbdZhBs4t2aK&Hwi8wgRKW)-b7D-&uElrN+zmXV=4;xKOk%Orb?Vw$
zm2*O3VgaDG{imnCE)zYEs;Ig1gtzlG9Kb^
zJ8Xy%_+c!%Ty`e1Cr!>|AQ8yku!z@Dl#d+``RG7o&zpoegVApF{+>I`cll?j1C|2_
z>`Yai74>m%?`}6yp)}r&-17#?yk|`W_(MRRih=(Q3TvsWqc+#~@89ns+uYNAaJAjJl!L-Z~sJH-MpB=_&I>ENtKdx3uL^5$!ZyCEMz^&GIu>?#WHNnp6_qH8e
z#bz}+;m5@cz?mRP1KR+?-C*HA8OI{K_RKofUGI#A=QiC#^3cbj+J(lZ1kI1pxIezTe
z99UWUAlX+xv546Nck=n@_2gp)kPXA?)JWU?hI;qzW#nq6VnAe?LLiF{XsD`o1CxI%
zw@`iKjPgtO6T(fg>4{})=7h182l+ZVVF>igjQZU`yf6v8{KPw+NZUEN2jYn-ioT~9
zSC<1rQW{#0FPnR<1vMFsQ9(-+XQQsCu450uX$W!%|%x^txX=%xN
z@^Y80acc&PDh&XP($h5tb~}gTk>3#*$~l7sAO|8f;)il|JYMCY=0=vY-4aeRTLN&m
zjdW`NYL{(ryR}BvR3NtHOPF$A^D`XAx|M=eH!Lg%QYbn)I;C@7m{N{c?1jZU%E6H{
z+7Lht>$^}yC^jtwBiwPcz6LZ+A2@RdD$B(ofkwx4wG?t7)Ad4jHuFHh
zVEPLUiE*y)s_r*4ffCjMZ>P+kU3Ah;Il9?|5C^K5JQ1a2praEF^x0}t#(&v5LE>ja
zLj&UeTo4h_0B1U}!;3H^Y330Zy?wn+I(B-?>r$iBW38DtLLKM&3Z!v}Wm{riH=N@pbK)R$6WnWl$L9Z*%xi#w_D8Pz
z9{p3)je(^7&;h{7#???ZwVWULnCZ@9{*H^WR17h%i7fqQi8n
z-9UQ+-|Gh=L>Wzg`t%QtR9SuC+f6oLfw!A|6SnvD_wz&SZ=cEYVAhm?Sc$D^$XX>X
z5FYqS&73=X_DgY)SqG>wD#XT+FbgYdf3?R1vXphym6a7CRLXMTxnwD)T;&L^Av5M(
z@I4i_=F3gWtY380rpkI_`l4AOS{X!H|I_n1Qf>=&dmP&{(xA;6pw1=eiRa$F6p
zi0216A7Hn`#TwlUdH6!M-#0UiLd)ddO7*SqV?N~elq^6e0C4SetbcXT%90B}$A-o?eqow+P0XS%esgpBR$OHQj5I+(=-
zq#t9K9%O*f39BdtTPz)fn+vaHK{}_(wO`QF=qw>cUKT7JpQ*fp$qZVXHxAvB_nwP|
ztZS2B>vsjT^hs1j?6=U5D=8@{2iWhNU-Vs$H+6mQpAsLR3sM9pdZgLXVf+}fcI*E(
zoJ;<$bnGuHeBxSMPsN|TaQgIhfV@i69q`d)H4+uTOy#-a`PtU^_U(U4YNohwRv_~#
zorii{i85n>z4s5Hpq0e-WI2aA)7>mOz6+hI-{cO_X5`(S#a$ExEyioQ6T&e@`Q?dB
z2(fto6c%rARtEQtfFbG)ZcgpNw;@0~X~4iJ@#~9YW1^;4cX#L%f+_5HGv(|o5C+8c
znU81ZlKkcnuyY4wXAW5HgWtWt#x93eh9p;iV^9FjN~IuO4Q~=3rX*t-xb%$VX-J(Z
z)+ZE^k6Q+6gFYt!!~Jct&JVsvh*NJZG1B%^yz5FFEzzSd4boz^a=^mNIBy#$DyY!X
z`FP*CzE;=7tl>3);&M}4>0~Of$Ao_C{EZcTlUXOV7hs)?e6MUyU9|CUih07sS|%5Q
zNHXcDy?gfbm0C7=QjWuVf3V#9pT4LS&mrOB`f|LQQJ$aZdpkTY5C(OygpeNR^73*8
zR9iZDU)?Zi3ZV%$9t?N;@_BGmu%^H`ylcrNbrk?_(7O}_OEl)379nVzl=t@STX7;6
zv?)M*4GH6b06_z373BL&d!j@e9&ZS~Hm~0}V{*kPxIRb^0mj+}G%YM5q8|!^8$z@!
zz;AA7q|FKO01JdhUmLhfh+WZjgMBtaX7nBLD~Qlt`eFzr-gjYS=(~Ccv^6zL8`hH6
z7RO4rw*!IgC?lN(P`-{bR?;B)!I?GuVmWhm4UC!D*}QZD)Py)N0s%OJfOaF5oI+?=
z8+HO^>EM6z(4l+SPQ(C$oYMyOe+MchJj?{*?gW$fAsa{rp>9#$11vxDCi+^kX1ZcAv|eoN
z?4TKA&Lv^{@;Mt1rlLYJ+2tV9liwaGr4TVVaL}`Ui%9u%T4{QknvsY}0}bE1P+Tze
z$9?yCt)`m2fADNHa76>0zw<&uY6y!$EGnq+J>L~Yzpjf*O6m<*1Qg@EOMqN@_8TDX
z?yrGstjHmd@dLWGY_*kPOkL1#FW`2o?{;kpW#{ubQ7uvoL#qXEh!T(Rbs7W#!p4#(
z$pfXnCdq^ME>@>p5>!R}5NPgL#iU(1d-fJsCU24Mcmh#z`&YBH_j|EiqHCQ7NZ}rE
zW?ka*a&sL)b|CR@y5$cBv3{M8ODMi!heqZC*`gt0TRv3O8gmYj6Qb*6kU8C1Dsl%O
z(BqIEh=gssdrc%*xx7KE+Q~LRHM*c_;94#$BV&kgF$=%%&%~olWxLSt;Ru2Xz;zF}
zBhF9=8qGi*i+KegT!cp!HY08$@e=qfAYl>JDI_L_pQ?endRg6Vv7N_vBol#uk;UQ4
zLIfqTfj!x(zdd(Pz#!PPx}cH5*X2$G0HQ3n#?7zALD|rYIT%|Y3~B6bw3hi9aav|u
z{(u#BceuD0If_&Lsro>)q2xnRjy!`!=yg9EUV)ww&R%j@VH)PiuDEd(%qCZgQ&+|{
zw%*1B1FWlg5RfV#DFUviDI|mxu-Gs;JaxTcdl`o~5rVMB0dFp!=9llxhq15!|a_JDNxBlYeRNu
zke^4nxJ;&K;1i(7A2CaUI|1FHE9PQw0o^=J{TKm_l#&Us9>MLOgM~%Ob&bF%kYfVS
z(-!hF2SWx~9U55mLdA;S{_-HkljQx&;0VUVnlxfu!C4H%M~SO(1=nvCAcvS!<%`Uc
z28fE7Tv+0YVOOc8QV@s^HYe?bKYP!!;dp+Q0;`7D+!q8bG=N#p+EHlT
zPrXyDt%N4GGBjX{aVw`g1HiT*(E@;CeY{`rJWj~a;-zity^F}%5VrKX+1zANTqP1v
zS114e_sGURq?c}9OJzw6vUtn?N&ysp2dEOYc;W8OR5_%(n%H3r_9Jo*0I}Pc*m>uD
zhDQbmjgVGu(iN{U(w4ruN
z1#Y;5`+gI`w-?x=TR9!a=el9MSf;fjm
zAh%&F^_qg|Kgnd%so>H+ygD*`u=9~pg}`~0+gJ&xmkV&zU}$Ja{KUZnY-|SLl5t5n
zW&;7t2Yuw)U5lD$=
z=asMSi9!-@yazcgIE5qq2~QXp+9se7Ruy{f5KP^!q#i!N5J!O}>jRQAlZS2po;-nv87DQ<iY=z66b|N9(_(jG*$!R_G@@E)1jPTsY(%>_~g3{4zS
zKgcN}5B5OF-VQe)F$&4G%?c-nw-qNK`;T87vs9wJY$F`?G(u10IfF
zhcjN@aDXILA;16#Ie4Ag;B6rmBEoy1znf6^0zSte_#$-3S=w?z8F})bn|sacPyanZ
z`QMYGfVcl$g#RwWZ&dhiB)~HKcM<-(2>;f||39W0COt!2fs~G
zTJn_H%ix~)MqQu;?m2#Jvojc;>)d5<_eLIW6mF_9cZtZ$|D5FZtTAO|sJ?l?Qzv}4
zd6SQ!L@;HSaeVGG@=){lL%mnY0*M_7b)2Gfkv%gOji($HH#*?!7(SdWa~N)pZbv>P
z`1^1F{eyr1mT0FK5PYcLGWNTt%)c@F`@avH_WTcf{r&tO2Bc&2Key=rrI*Mnf4}&@
z@+LnXM`9Q+%{3;uH&E}&)J=+oG?;MvrdOE;uSaNjuSikn-
zUHG_1O@f98V)a#!mgqpgt`(DaZu0$6}$!R)vMBdCJnx=E}To4q$
zP?~YOfYV<#P~3cF;AD5M!rB0X3NP=l&VYAhxx7X7!as_e_rq~6u8RYAcW1`FEpMa^
z9G^kmi8O8T3jH{q8MQ;yt<1V1>DGTLXlF$_w8dM+#!_uNP+p2cWfVUlZ&uqnQfFwuQsr2&5+*9w
zMbwwuY_?lgdB7*Xp!U+K<#no#kL#l~KCRxxLUt$WT8uP#G*H8w6l5ilWBqgPBlh?O
zA?!OH;vTcvdU@8l!J?#E>`Uzq$7=^zm*b)mx;@zsh0GMe#dEFl#80{|&ez`;PqYzF
zjs7Ie{@WGN2EQ2u?J0kh@j
zw$4j_Q_=ZS%Au{lO`Y9Ex3&;<7Xz^tLRI{i&DU4*X!*+48Iw!Qy3!7<$^%V&qk-=V
zlLiVMu_M;vh2N|Vg0H>qDPOK!GzzmVy_`=9)paHa1!<&+hhTF>2hXT-t=ts*qR&o%
ziIRKMt7@reMzg@K#XyDJYT&t#ht$d?VkA2%gL!Ky|3V>^DX_QfhITC62q3#
zhJT9pJ}itX#cE?$OG=Z`ef>fc1A6Bl?%)|U^dMKS_&=isb-WJmY75(HA`x`0Z
zJ~Apg_FmL+;ZmN1lwBRFd1Mb&+`c{El1fWMmSeuzArsWtEOZ)jYCPf%z8!KT%Zx5`9|+66ceTH(t7BIJk*r4a*edRzM6z5SPe=DP
z=@!FJxjIj<@``i4TwazG>>4wbI=#2Q(1o=@$v?}$#Wy%YQ2VZZj~gVg-`Mk7$}p2j
z)ogXNj*#2w1Al|-I5KtqF7L_l;_A8M)3yZxg(noG>nrQk_9eBEH)hol@Qg-XJXI{7
z#KbUZfHtVo3k+`S6uAr2Jg~eT&%f!%N&WEZn4qw@L*o;+^)1*|?z;Z>#_?%!c~2#b
zm)9f(r3JrvN@&Xn=q>K%)mhNcm8ALSJ$E`|qJMtZ4m@(0_lI5YkS}0(h089d%b<7L
zY;bMm%lM{J3hT4ds62FKai)JETb*YaI-R#ED3L>}LE;qG;vT#U*eCbEB&
zF6x*rG23|Sv<$XM!XDUK?u`Fz>E!0+jfVKb7_z>b?P}UgLd*IH
z-e4Kv(Kpb1)V%VNDm)vzI>I80?~7!6zrFk6g`UmyAbV27D=%g8^TR$*f(752Pn@rG
z)6vjb`tWy5`0E37Swfj9A={_=LDu6Dwuh6)k2>IA{|LAUf!a8#d`snzHhfEO+5Ap&
zGWlBp%7tjSR(Wr8>PT(CU5R}(|JjeGCUd)rCQ}A-3}z>f7rIT|(JME3gY_@iyw+-v$}5tE2d!%bDkH&|JVpk$kWyu>DJukZ1`L}
za4DqYJ}bHRYNKLWt=(OT208_e9cHW!N3gGY?&ATrUl5FqHpVNp`Zy`45_v~`mfG?0
z8h(0(AKwLa1~Y>oxxR}1X^RR;Z=>>D2;E+(Vvs&1izoB8m}JUP@90}oQzpmL^J8Di
z<9vW8RMcEkYyH+gt>o65#BsEI{dr7?IJ(#skJ&^85Oh
z7TztO%<8|t0WdFbciAxt(c7&qdHs@!ot2`S4n#^X`q!Z*mIG;OJt7TrIKKtnf+BKf
zjwX2wvKhC_el~k{2yY-Lk$06b*aA(?EQxp0Nneu^VMM8#yj#nMIE?8Y&pg#IH~Zv5
zvZzDdqo(BH?vFIQ%k-0J|IKGK+Xc}MOE06%X{K9Tuv=9@MGH+e&(7ZHe<)g6iJI`B
z$fnGr#&Yo7Y0B&*0yfj;Lq0$2g0>+F=}H
z0EJ_>v8JX&p;tp;u9o8}Ug&(!az~3)hMG%%6statc6hi}thKm`A>ZtHIRxOu_frwI
zm2n+KOJRQZZnLNze9-bp0v+_p#AfoC;OTZPKKeS>K4w4geKUf2oN7{^eHlN+I%{mG
zvy{=pVVasi&FAIK$o2k2iQn?$?5{H~vZc>jNWZ>uu4lf4^${%RgPW$6E+@|xRIlP#
z&7JDvsI%&hB~|lRimM{B`fB{W^-H-jV(iLB99hOAS+=J37(R-w<%Wnd>BAOJTGKb|
zPnREJ$dJ^^x^||-$=N@}Pg)Hh^1kga`Ghd1A06iYx5wIvHpWwv4tl$q({cuZ^%1q?
z=K}F9P~gyO<#8qr^les?)@8S;g+7^;PUic~Hv)y^hke(=p2)iBTh(c*|I_Pjb$XAB
z>v^`5T9w;PuIl-7Ezb?|S&x5e!R0fW|KR6nl5yI7T-oxY$Tp)cAS)&%t&oQ~Ol~--
zkIi?}myy9t_WLO~Sd6KTRs8{d==tWRuwRFiaJ#!kzJ21R0z!FLLyY#tp8mA$Tty6!
zy!FB^yi}WHqAVo&>yz$9FwVLgu!}c&tbPsN)dZ?V745
zK@DDp2g2KFJL@Fr@_HwI_Ns$_H@ta!W{*fgc(}Ul*|ZK(tMKKTmD;ai2iU8}>5}=J
zwl!KWl>ENhI7IRmTm5vRjv9~kt~zoHX<2=#i(Xj{U){tLW7E8YU!j^!?z_4*jX2)x
z{oq)6Da=x?#F%v{CiSx#qWV{Cq!hROlgm8A%TxJte$M25381#r
zX3_3{PxH*tUF;Db_KYxYpl2UAkZO0PzUqdC8iSHd>D?Bs^u^xtg}TCj+EexfNxGDW
z6^Um_3bzCph|l!j`_=WK#EkoH4gA(N_CACj#^3d(2M&GM$^H~nb754N?R0qDomgm^l
z{1kbV0iXI%c4%m&Ud-@!!Sj+1xU^pMbR8?&+0TjUU>Uf#HfWbKG^>=GZ~&QpKfkD1
z<~2nq#KI*nD(Tt5rrzH?wV56+&-E20eY(+!w@z-fPwzDuZz@$R`s$%yL3S_mIe9s2
z@43#r18&5NJ-G{G?KaG2oeOV(%axFi=FRDcyD`yxF*&MUzP0N5GwPnoC8zN@p6uu2
z(nv|(c9u}ZMw9Ec(|1Y}Y^F4JH7iB^iyJ#T3WY9bYy=TZ>+G-HHo->4B$KX4i)7
zgzMkA-PY$4#GMbd2^c&Xb*E|vYHXA6Q0Yh@aU#I)wuiw$XY}|}Il`iPg;{7RK(kiJ
zu*v>Xrs`VBLhceQ@0@_DY)n#CNJG`thzX3n{UXir=y3r)v8>2*Hjdr3R!MbpubRE=
zdiRgIMB{F-66+jkV~Wi!Q71Yu4^C_V|{2g3lWCTE2XU#1YOun2i3u`L#VySQ$Rx*v*ux$u_)baux5Fd)3}#d_5q{ZFK_4HYW$4fTQ3!ftkbd@3nI@-tJD
z;Vmyb9vIN{oTjp?9G!iZI|>2yzCxEGCe#bv4#$t;$sNPK!uk>keD7K@59qRLv4M&p
zwB`s0I}|BSCXa1%7Ci)U&jOh=^Q=whXz^?%`;2lPD#keSnIUBlBD4+Ho8QRey1xWi+7cu@_fD&CbA!6V3YS(OgZZEDUwaiDZ@yCTw?6t%E}T7J*I73
z#QazwZ!-VgH}4DC%lUlVm`9{=J3m|NY12Cv`cJ*xvx*oi3FXbcJemXuwAEE5qMD$n
z)nKJfMZAlw#rLYCN0VaR1#q5GKc*+PcQ6&^p`C&XLY5yHM-jG$c}|m+x7=IpCs^l}
zySg~9Pt=?vIC1Bzt;`)@S<}CRE+Nd{?u!av8@7HHXWc6!sw$(X*t@VnXyR6*^{H)a
zp&jo=)y{QYzSDnZ&*Zd=p2Z_oJDu`M?_JIgqgqSVJT<*!V6YTg
z(Jo)N70Ri`u4mswm-}kG(H)larjIqdIPlBO#L8AX1
ze0#a|r{#5TCFh#^hHt~vZ8=$a=Xx7+1v*Hxo);PJ^8V5FNZeV;PkC==ghd(Ko6yh=
zL4mi`D`$oBKG^EHI`~?Q8DTAmA8d_{-j&Kq7U!~Q6gmt|P0;SOa(i+12{RjerD06no(Bkvgph{XKe;|!R`LKn*277xKVyJVjUKfxm5l`46Tw@37KxXV<~R6^#Egd=B8GZvt5DR1y!lj
zPf!6(%5y#PCisw=I|T*~n}wmwgYE9K0|oIrUZzg^``5HR(P@z4eXF;&PIg8Q1&;^5
zDzqwhq|HwkOP?8Kj2(L@>QTEpbu50BsN=yDQ7aRQxmJ8PVrSGR{vNnF4S|;D)Xr7&
z-!fPl&`=;3+~{?bO`YFfx3|xTvZ%$z%*C~JYM&9m3#!N4B{T_^PPuq^s$L?`Jmx%o
z1xjpRla5F3E7?mO{qG&T8)SncIzI3`30AC1%X2M$&{wSgGT69G>&WzF
zv9pIyj%Rg8Q7iWMYyep(*I)%*sG${7d!4zM>7i;FXP3!a{|1bsMJXoOGH7u2efE#q
zr|csmBe~DrL8_a`{G&%^dOb%HHE>fmn_eA$@atFC#F^olBr&j;+c-dFTpxKPyOXheUr-LdnU>!gTy&YiTvH(O&I{
zADC}A`>t+XPLt}jf1qGLkm)KAR(=wyOd4+UR7krlX8{V;``Pf!cWrt1qNiDp9-MVF
zU@1#H+=%M+D2~w51G8pwo$j6LXnvQ0)N6kis%-h3E^f(JJTd&-E0m*wIb2Ovy11l9
z$b4Xa-`yHf+c9m$J_0Jdfzn;t_kG7#fGPTPF!pLU-N~&ytBEEIc0$x
z+^mpRMpA|Q3ROUWZVj~^Sbw1;S&?)0P#~nIiNQLL>V+%uN2fc;263&mVBbDR6PQ4&D+${e4#Ar>?{@NJ`Th3}^}(E$kg
zMPTHSfkcmiY{CBiMvvIZ6j4<*nPcfs5}f62Yd%8O>){EPXbpeu<*;$E^}d0cu^!4%
z-to1Q<2YK)Td=k8lU38rcA>uU{Y&L1V3MI^uc*A&p6)dvV0eqpDOdEbSQeEPxtZ^p
z2{!V^|AYBa`iL}_Z;Ct7AnqEmmcW(oBYEk@ty=<|5lqkaZDMToUPbbY53=Ovzq-l?
zI_p?;n9~hzR;FaH4JnrgR9rM&Hui3y=+ct(Twu{p?!n@e>G^xt#P
zig30&BCq5vtu{Nf597a3|9!%yZ(`TJ@JL?56}sz~mz6GjS*iw3-^4ZzM6Pgodh$Ra
zCS+{nI^V8)FkzKK=N|e4-z&a?61-yC|A{VAYIVDmcIR@;q+~u)R+rbrq1$OT)hpLHqe5dj>P73y=Fz`^WaRx*F+Y|&Q8(~7L5k$OxRJn
zcm`w)>|aIMJPSexZ`ss9lUe_21IF=^0t~gca|PC+nBj>T?`9Y?P_asA!?Aa54K?
zhg&Gm7Eb2d`nOh;x1)i)QQ@ry4Aecf`3GwqhzY5jbq(ez>o1_zeVB1sDPi9`$NcoP
zEkouAbjXVX;iB8yim*J%jVNCaXoIqCbmISY&-C0Qj5FH0^QK
zqEgl~a-_lKhrF)+1J;u0nFei=K2|y3#u#WC^Z~r7Uq5>f`+sZr+b2f$dpu*UYQdE@
z&%=R>#HH=ZQ(&VfucgGpzE%aPItIDkbHv7X(`W+}jJ9@>+;axmyU$O@sMfKi>l2l~9B(jSJza5+qFwp!j@yiLY;Xb;uw
zi%)LcN&O*e%%C9Nj?eX)bH6^6ny}C}wY-M1SnslzoJlylwc0U~3aGxjgQpihN$L}2
z`-v^$%e3`w7AgO$SdJ&>hwvXNM-C9vf2W29^B4RwP8ceNo%UStcaJuJ0w
zKEa3@-{a@c6&wA!(C}+<_oIm~sb(@hrCErIG|s!VUS6WD{
z>Go(E6MedhFgKL@{G`L!<+kY20c&wGsHIA4D-YQ(U4Qd6cv{-^zu5m8QViVFeBUPG
zxVsWW7-B9B39;P~T<8jz9n}nr5L6Ufbv$0kw=Aj7JmV{56^U4T+YngK67@xLD
z2*PA^Wn0F}Ha2;FttA906f*qvP;CNm;zR_K2nnTb6+HNM`+5wRBQ(V9R}23!Rtyxg
z4=7l&s>?+0tf@220rByepI6+~ag5Xv_k^Y%)-2}QH#jTGQyX|GQXFt!tLe|WRPMj72|3ywMoS1LGkv!aV#5P@vZ6Ec=F;uRHCuh5UV|FoH2
zJps8E7gEv~DkqhZ)K~KMUB-$_@@2VE#8v6#-l_0QSEM@>rkz~LlU|TD|6avdjpjVa
zRJa>ZZPY8JHL>ZUFftO6*%!9pYG`sT6KMeUOQaXOzc~%-twP~SZOSyNp+2?wss_Sc
zgZ3T$K2uso@OTW>e(vq*81sgA;opC?uKqyVPP@MCEnlWCRdh&8Jd`!cp{nEC1{Th=
z-odx^e)3jX#T!nMUueaXOAM#UTbp$v
zvWc4xM6r||2&-^)-+`Z9`!s&tuD<0ztz$ENJZW`_Jvr|C-}a|ld-e;c)5J%Na0dx9h$VFcivPR*NfIXYex>}Bnn4GFtOwQ9`uRG5_V_|4^!t-atgIsy91Mr
znM%_lS^xw&5J2gRac^R8)@%4UZzKCxabO)ZlP!fIX1`KV3)j#MkQ1a9B^;iDH0+Kq%LQh
z*B^qsyZwd7kx!n``N!!*4)p5IMgJp-+vbb&2%PTD0%L~e#%N{GW
z;xeuU`Pib*<;L+X4a6_QBk5OuP;4qTP5Yr$z>*fS-E5nj5h_
z`D9fvx(8=XMd(E6J94c|97`VRrf|#cc9`(hGU3In8H8;H}^sQ+Fy-b{I=2~k%
zN!t1fhwG_Ho_0F*zU~sZO%&}%5J%hK{o>UvU!Z@x>)qlc+PoR!oGr72Eq>fXZ=go)
zM2rmQQ5Ss-2cC~otR@|#BmxurgYX#2ITzYDnsJAouRs?b=jmc^a?T9cG4VGSTqR)1
zIC`3?-F$>#Ts&X(Avp0ma!;W=<5t6;UvBn=JPA|m?NtpL+%tC}}C=jd|%J$BF}
z_+rcR*#o;kQz#X4cd96p0RQhj_oFm-0W~`vFX4jG2d6vpP-)cZYaO&+=~ucxLZ1bI
zd&7z=>Xc?ZLscs{$UrC+8oc$KkzrxTaBltH>(-)|^MPu4P1Kr_T77!YUT~&$jE^5p
zs+hhVcGf#n>i?tet;4EXzpmkps7UJ(L_|Ollm=y$?rs(7
zl9ul7*whC0JJ=k^ZR7kivI+ZzD}8xs4U2X0
zmAW7P>9v-2(U58H76wcNe)
zZP)YMDp@U{8?EZul+InC&W5&p8a0WYobv|azT}KGdB3pq&Eqc)I|R;rY|pAm7%Ufd
zaG?}X-G85|VaCgSLl{Bil=K&;W~#q=b7O+jv@Xs?w7^dPdDT4Pr|Mw+D*JB^nf+|d
z@hq+z!u=2O0&X;`hPE23{fc5ms9Wv*yXFwP<|8b+|4g6<7RGW`dne**9yupHfoE>t
zngP=fd&&&P;;pUaBCa8EbrP81RB2}RUB&XJ{x)lB&%#z8l@3Z@B$QMN&bIVZ)mH;3
zEVptBH#m2~r&ozC+r`QM#vqYnwo{Ud@KSDzv67nncIk`NI{Ho>pMuA!w2nC&B5zJv
z%O;wJ5uIV=ZnKzZoGwOo&a2(^mrJ-_1?xO@_}Zq?Xj!k=CDeB=
z`q);bWXIwfudV@d1&1r&Yk?DoVV5MMR6&eFY*$IgwcQ<_EH$)&%#9ajE8)>13t2vv
zQqhte3Vs_@W3of8nQWOAO_PpTIJ(lDUs$PFV(R_mb~yFi65zO|?%?w3`#DzeYL8-L
zWt@Ut`DVAQMvhQb`5~T6jGYTJCA$nbdk~weopW%PD8i;u4+~}S76n!*mj)+`!iLAf
zVoA8#I_ys{B{jHNZ_d@w^IAB4(~GBN(m%P*P2rDztqAsM*UAQ(!*}A3xRU*~Y(lvs?Oz^c
zFY~@UD!r2E6JZq?Hn+N-nVC;opxK<Mw^S&Ttqcjy*s5TgA#$+0pMRv
zn((bKfj9N35L9v_bkSGn**mn8w9qnu@$+0;a&VR&a9u{w^VPJw@=Gk|@;mci0#}PEf=&?l>
z=O2wZZM{S3MG^W$gSdCu51gBO%x-0=w4r->M`Xl;b7pLZaF$lRGpv45;l&wd+^E
zZd7Mh&f9Og6wXIyp12zExG~cIX|`#KRnOf0UOY0(pvBAB_3Ab`sgh&(l}K0cK?i
zQT5u&6l?aC`jATW^n$xPy#=8~&I6tM9lnaXbaTD(EEqxf(@M}mSARJbZ&r|n0(WYr
z0v2B4Ym7y``Wy3uyiI2wW<8N&@axUU>2WT#+yl<
z7d~p$*X+A<$h(8y@89mDJa&|Mqb+s~^
zLe92ygEkJ*5?w_*CpUe$b1K-RyN9ZBW`0u7Wpk57X7qii3)9%+bG6pJIn;8jb!Tp#
zH};ml)RNQ8uy80O*Ts~}h-szrBYBF#+b88XR2-wYKFHsjcQFGe!wYz*jEzdcX=1}C
z^XujX_=0$c9$N7n>r0tL1EL3~dBCf>U$`6d<8jRU(14GZgb%0$8qA+*RyQdgI}hW;
zm^d3gwla;0H0CMnvYJ_)&CE*l5e#$l(R3O>Ih$X}h17-ILAl(a0OmNkRQK_$5y#(U
z4Fx9N&DVT6*}j3y@TmYR>qFkMd@y>
z98I>CursAF*ovzRg3-0MBCB6lNv8g`a%+xMV7`vZn6^tj1yDD?PiJr
zLV4$A&r(*f#m(R%&uXu%P%N1hEpY+WfBhISql-n
z@n=(Q*d4taabL>0c#xw#=HmC^#z0<@HKuElLR>pM2zq-?nso)@7L-F5#b_0v{@2|o
zJ3e+>iRRQ?~ez|?5>j*-C}aga0p
z{qeA;>H5vl5y73nvX)q2q|!0xP-Xy$c%gI~7pjP&&(v~zrFZH|;mzaxC3PV#BvKIu
zR{uDNehpAQV=K;Lh=)nX*80uChQ;0&?(R3xBlSY7zz>vqwd1$nO1kCCF>bGGM7*Rm
zT4ZhUG~P$6CSPY=9!kl3Vm3W82QEddR3p`OgH~eZ0-G3&2TX;mL@c&TOQ3se<5){y
zL0?;02W>#jo_uHhajd+6*q5%+_KbpC34!tE^n!HHNl7hJyL~6pe$jn7@fJZZS?A81
zhYz}*=7~!+77HnJ@2arX9H#9r=%GvO51869o-p(VQu
z)ft85Xud|ur$~9yQlNG`*USd9YppOt@yy2BBuYwsWQ6+T%5c^Dl$o_V$cRc^Y@ysc
z2~%if4KzH_X5JZF3#xslym#2#PQ8&1xHo8NQd0TaS_F(#lraPZ$c~g?oa{Ya<9l({
zoC|3yOT#C=705%ga0l4RiJzUUQO_K937osB2stWt=jWnWsfEH{NKuKFxO%TYaC=%!
zN`cp--JhNHQ)ES8Y#;I;jE)tg$laVMPjsP`FLaWh=*_Nsx@T*wi4C{6EO{}MNobKj
zcSs{1rhRa0kU^_WT2)80ZI|*dbg3aHwi_&Y;_>%c&JH1SL$Kr_Hk(GQ6A;LaIMNR<
zXwc9o`KwxBXLVdr=`hl%4)iYDyeNY7ks-}BF;cCIy*G$yEGFrtk@@A`gDqDF#@n5D
z`;f|Xd3=+P@m~J(7mq|EoYume_}J974+JtK3>g?ix|h!rb8z_G7|1eo_u3MVE^;8S
z^4Sz}6g)bq6yWNZ+&BUfm6iK0!%*=^tuXQNLh4yW2eeVFZxM^IR`zC7|^GI%Ezrk#ttRnWMS={mI;WANb%3>ue
zuZH1io^o4DS9z@GCe@e@hj*z~>vAQGGWY!0Se%??^Hhk`{Gt~ouYGD0l4A%%Z4=B*
zrgTjZ0L*(lny;>AoyS=G+Vy5T(1aqfY7WO~b?rj4yrcEQc~J4HFyHJUn_nOwN%0+a
zW*naH*aV`>N2f%=b@H!@lD)xn-#yROrA(&_7*Cd8D+?y`W#@rWDT)(dl%mB>^1?;;
z!>BZm>wF@ij|c*G#IPr%@k?2+k-=qHL$_gW)G$ZspL?;b{k=iWaq9#qo#@JwR}kD(
zLmYpG!4Syak+QebiPQTX0$G^3d%4?CQ_zwUl4eu1`Wp+tYaj@}@cala?2AMHGHA#y
z2!?)TvQf$&8g0}d^ZKNBu8h5a&A%k!b)3OmrU2ckFGvYY%&x5cObHe?ld5oY;7);mj9QV)&
z#(vBD^N?0%T`Tj@wy;jcpk=hrfIDd{j_ef0+}&o4FZlJdDZU3n2E!E9I##gu}
zHG5qR;}C##_2Mr|s1pN0JH6)}eKKkT?YoQ(BZKcw+@tlHv^lx{P35HDS2T^+Z7?3BoO
zf$Zf(+mi@NVFY)a?4CN!IK4;aN{KuoQruZchfPFb5YC{I0%+{-jwE)|KtYMd7DCmaOYOn<}?W_>oDOU0OrMUd{#^uz4HaK+>zYO%OXa&37HrK{zZH&gMu4+dI`
zhqEZjUpgjm+6=nHl>8c8tV_7I*_S4g&W64Dk!ZK?a1o
zz?*!h8s51|_kj~*bFo)Yh8`0vS!bhz8LeuV--a2cDqX&Yz)TnPt{%?e~O1!=eTcX%>j!cT6?3t9)qb9keB95Dy+YN4uI8PGg~9hucX
zd=qB7thjez3qFTNb__}7*FBwDUUTm#bQS7HICLX3;9(TU%D`mDK+d(CV<3QEhru}(
zWCX(HrqkH%y=4V$-WU5M)g#qG7OIMI-!8|*#3p}SKA!w07Prw(J!{rDnzEe>u`{uD
zMJDW6)0qzfn1w1O^|RHPtKo*KRE1_SH~8XC2EDw6=hd6UTwx5vH78vD&ax!zjs6zm+D%y%zZgLktsn{PFAsR(w&eC6^e$&J6DjnVsG
zNZL8ISag#LeA8t-tuu_gtsiuh;)H`p-a`Q^F}o%%!Q8__&qq<+J`-uGqY7ER%~}>@
z>JqnKPXNeU{hLX|HtD4U!+T|f&+o-|4q9Fvw1DC)i<&wqiLMy>4#NXaYMxJajAx>S
zt~x&%8nI9k(z*Rw6n=Oapjz%^R0e!}#yg{RXSArVABJ_j>NE=${PR|IWFWFIb9E@d
zxO!U~)1T1aPPvor(u6}a_FAiDzdxno{X6fu{#-ei>S@+fF<3u2zSUKElvLU~rL-6i&IC0(wO69N
z;Is~i6ZIsaDe|u!AM2*GCjX*pRxQlFsy!SW-}WaGh^#o69KbXwHa;r(YQ;9{^_V+i
zLp|OvCJ*M^UN*ljzR{l?96gws1qpP1fno((_|S&aAsnQ!KXkQvi9X8@QYg{K`61!D
zV--PzvCDjqHvi7vnVl`|pC1Ma;SSpNfBveD*QE5{M4I^*opN(wPAnU)4?uc&Xk0!V
zv*N*Y`MLr`W2(kI{onG@NkUFI)G+
zO;^!~vKNa?rRLX}q+q?wCq=$y-Zc9Tx7zayzXhC(SQ#peSQ%p7i0y>u1FZ0|(YPYp
z_?%lrDB9VKdWiZ58xpt@U9V*-@2;$j)l{tvTv8smtPevfCdP{_0&MElB=;-axz;M;;
z+00|>ffu^%MHk|vR`3a*{OuXC*qj0ctkpa_A&W+H6HH#%xC5(0Ymar>VU9Tl5=b9zR
z^d=FT9B5kqjmjuCuOuL4qWLyft4bV{K(4=hz!ZEQUgvlWH4CXL-S6(JfGy4#Y}ITU
zr~+pH{w)p%8?uh3_&Hp`WiGNM&74$*%kd14{WFYUV7Lfnud<8+IarW#$#cywHDYA>&~x@Aq(~!O)JZjvH1ZiFsyg?=1g{zqXXtE^)|Z1=
zjD4jjZi28+kj1kP8PLS8*4LjZ`HIi{epk$HyWgLve^CJ)s-kma8HzG?$3W&Z8(DwR
z<9bVaax+4A@5l2d20!N;j<75H3x2w6!|(t5yrmJLN(9mmYCCOYKriG9r*y}Cy?&Ct
z3M(kSHrtQhd_MsXJ&-@j;TSlTx6YlQ`ba(t1ew!r%ya#iJa;S%zKr&l1Y5q+#2dU!dfYsh4H^X-@
zK~tlOjKlwX7+B}$Lard3hMG|^gM&1zM(gz-;&Y8yoo#$I+{{gDa8YBsa6FP(F!KG8
zbmhk0`tuV(7?nM(cxg*%37<|uYF!747a)BqJ93%C{66$&>*mlS<&yP8j8l~Md6ciO
z7{@TjzoVLiY9^Lg_H$k$1C^TgDDQ!?^8icTqqN_$5F;vnuUIs+OzV6OlPikrWddPV
zc&D<1t;!C!YA}L+{ej(6I5S+=k&A=U5Q_emcw`w+oUjLVhF-kyrVXC0g5R+VW1e!o
z)9ig}Ij8zMden)k)rg}ORo4I!?cdxk6C1fBm~hes>OL%B6@`mpAA7_5!I?4fLGt9}
z_Q;=dBeo+qPh`Xac#^uY&Y$Eb;Zt#S!XZF>sVEpZLFSSps|6@htdk{Wh)248
z&pZd0&b?w8pPO`Lx+A0yEVM61a|v#w3bb#4_u(}sc|?Ws)rZqF&!P1V7<3@*#oWA&
zd?nGcs(x6BRiC}JP-HL-%I<~i!i=RKeJP;x8=q7&iBUF9y!bU+DrLFB#APTcmN<83g5j
zc7|1%e^2gv{GW&TmA8*3LA(7~@Xg8&Yx|B}2mT-AU{-@(;-Ff@L+PmepGWflJz>vR
zW^MO@Q)yI9LDw`Z^={
zpQro(hYo<`e@syR>z}`(|NrGL!-P3pabIpet5UdIoL~h?Z6REa-$QgA}M&)MyZ`m>$S<<*xgc1m;R+TAC0?p4aSTxNglsZf&2s(2q`kb9*jb`LsoZ!>27A(*HBkF1$G}OVwUE+yh
zpEsRnO!7z>aWT)Q`ar7YW~`b%nyXdwc-J>79oLg2gR}nOw4&-jo~0-8S@UJpkC$94
zf?ADxlZFXP+`4E59k%>C?tV#1%FgWL)peR(=V!Xb`NQLYyS8B3uYOPtVSw4;-9e3##
zYb)*6ufu@4`&kh9UU{9Kl4+Mmbj3^e=45i0yzDwBg5Gzu7)*Ny{*QG>AFp-l-I5|8
zCRJ8JGsX$We;@8Mwb&jL+gtPG$h2?2%Qp)42upG4dP8XgR18w8*%FN4T#4T-ztoJc
zjBZ}L;NiR$6Ocz~G<=ti`Eiie-;MMVKTi@yF%E^Fi9ZQ05wH%e;=QVWgUIoZRoEX7
z)t@;_y}1V}2#D2_%jN`PAu<56nILlsyNJot=6BaIibW
zVrEW4>TjG|s<1-5wm^aMSr{UtxH|oTRt?1L*t!!$y|qA%wi0pQ1S!hrm6etM%7f1T
zMVw_|lc`Rxi-?FgAMcriYNg`(h(JQP1K1scPIdCH=1L_V5Jm*Y58Zd=9N&P@5>
zd8A^Pjhd>ahgO+Gm3L;(juQ}Bx%47&188VUVHtV+FUV(_Z(l3Lo
z-~gz`-vOaiFe{UbeV~ZMqm6p353*&zH6vm@O(7u%}L$==u#_w?_WIDTD6L2
zIAo_;#VFJ;
zC--EePxwsOTogi`43LA?|9qsrnw_0ps(h2(Ja{vXfI}MnGU$Xq2mN-AR{MMv5T>p`
zyc!W3qZ?vAC-XKWuj=u;u)e;r*lqR4#7;Wb5O*ke${bQ2ATEmrI~O=bz`(@CrNmLY
z7e?c;bX%$yw(e0R{!@^jmQu10&{dZiQtAYYNh%Cvb7%KOWd_$lcT~@Fofk~@WWqq&81Whzw`(8<1KlbuMDLZaxEX_9O-IZOle8X`iqDQ34`qIrrnm`b1fK?*6?5sljhU
zqF=!;Y#m$IXVV0akXja8G7%5Lme4!oH*b1L1#5C8Lxh#d$;mSz(;Ne;o^?1yl%OHamUoO`SSsMT(35wsaL07Y
z4}l1DJlt)FD917bB`j^AdYTU!wcs@CoTXP=bp}>Px*K9NG}k#ehP1=tYpvh1#pR!64nh`g5?Qg_96fEFYHmWzvs%;S4Ld4~Be`7owa*2MC;t0yFlI_&HP*&nD
z@5@2S!cxl-kxA^r%;Z*e`MQeI%re(OqywMChkcsQ^y54;*FLNg)mRtMn>VyMw;lTK
zZwS>j?P6ySSv)U2qBj&@t2GKyvdGS|DCG?pOj~-MJ*mX2m7OYWUCJ-k&sb=_cEyw7
zANogc#l@RiH$%L-&FL2Dr?VF@M{avPvtvWf<3rv%ZaN17LyL8yL*tUSm@ey8x{x!g
z9ZUy3bygm*w{yA`scwU^nW!-L5EEpM=8vhB7;c--JTvW7K^Ik;+>ucmO7Wmv68q7C
z-&a9lA3~QJ&{JfcO*C-jihJnq#>V@AjI6AzUXE9xxrJ=Om2J7Sw)T`;Xft~a1X#}_
zGOpfTB3D8qUpHaZKQ#HeD>A;T7Ip+N?uFjel#dLvS#>3UM@PBABN){d$&s*f2#%#-
zC2$24Yw37+zQPJBGH!jGsg&vldiAnDZ6vj>eDj(Dk!(f~rA`G2TJZC_*D&-vQ;A`0
zd>o&ccplcF=|NJh*AW+>id2yD1y7#_L&P5-N54KE#9-|~Lm2dF3rb5H8yfteEbq-!
zzDGdDXi&BRUK1KMZug;Fmgvxfs>+ZbFnW1*@Dx
z3}$v_<}s-89xcS!%9?;-LdVX0`}V=XJ!a+rQ0`Sqe0)I^Vi%dUC?SZQpZ^Ymx_hfb
zA)o*)(zhGfvN{dIwkXqPhgY!zj$!u{Xu#m23D)#W@Z^bJXCSqSWL9mnqMWUW|NQw?
zeZ3fWCMi(jHC8>;=yTE|4w;_vi}e)oWnGn&h$UY056tRX`t|JWIMJ&^{GP$j
z7NbdMgj>M2Rls5H5@0ps*06W*0Q$hY07fss)OHq3V{&w>Bgk3Q3S8E;K=PUC`F9k&
z{$0z#oQJ*puR(L(5G+~WzI{72HTAG3Ny7bL+|_&_i=2;-k7)$7kZ*zEktu}Z{yg0n
zCgn8bQ-d3DYrKdMcrT=Zv(sE>%+Q)yY)>U)G8iYrx@iV9Az@%}3rxY>+}(S=Kf*7ZC>C~vH*2lua$k!a9-F9a(h05!&ZONO$O6FHjPf$P72|Na9S{&0g)Enn`DBJ$OMM9!Z-
zfBEv|AK*8wQDOHYi)RafF=9gsMQ<3dr5gBLHG$XU-#Tw0uti>ke47s2iD~8ShY!0T
zrGA;5TmnR@@rj7&ea}uF_9}aTF)x4W1s7M>bhqu9^LTj0-~d!+HT)RdYbXIA8F{N=
z1EKutg*!}!!432mwgxjF)LeU{2RbqI{wtcxmp|KhHi?+x^pO@kg90o@PrurQZJe(=
zUZfqi`NvP6QY1s^nBb&%R~&BGhCMqwyH2%B5G0S;%}oUzotxnBodz1(nPt-V5gqkR
zhz8N5o*(^d0&LDtqJD6Pq&ITWREC{f8p=--Q`2Vfv^KIo!J563{u{KX{!YKrE=oYG
z`;VV)vvI-edk#){q5U0W@csqWK566#m|FR
z!3(%D9@~jn0FwArR8%)_-ptS_q=Py%R?L^?72N-GDc>w_@Y~$vHl+_L&dbXqW7GNy
zZojaFn&8(`Zz`d_H#&T{4$ws`yuovLj5NY-ufUIo$qn&essqh;LqkJ5C#MvHpKpZS
zwqAh~lq0w?gDKPx@a1cPUs*07K;8)kGXBBAclGPM+&n!Uw`U~4X!3P_K1a4%t~9Uy
zeubkV*sdo5PKG;944<{8rUqVhZl*C%Hj3*VSW0smH4zbzGaG{w#ZX@7e0$_DxT2q9
zqzk(nguQ{4i08ZTXFjSL!t`gdPj>e9?lUf6Z+)HDg6yK-YoF8O?P0To7iMO758=*$
zCyEik^^R!XK+u4nU0IO^(R1_hs?ueqJWwLv0=av!o}(BZ^Irg>|3akk!y}cy!in>@
z!v2nniwlp{-^<{UP;3o0v$$enVsImxAZ;*dm*p(~1-GT8;o^G`xs5?tUpAVj)yWFp
z?*v}+@87>$pv?a#Pge+XoTQ{AvKLN3Wu6G`E?8%!R^UUe6Uw14s+OlC1qi*;7hAVX
z`0Lw!qeJi#vvYAV1!$iZjsdSQR?VWfU}$$AtjZkb+azEIBrI`z>Y!IrYulcQ&zV@O<%6jbcoAvG@%wj@2hkGO8cg?MOAjwZYL>}y`<=CXkkUkb2WY2;p*gUv-SyfKCf5Q&>$
zTmUIu6UZZ`UD_Y8u#{sh!7UXYq95GNEih8-O_CtIwLF~)L5a(_34$SbCP0kh29E*onTW|6H))Sh
zgp7~aW3+vC-dj=xM_pyuwNWCTd{5pJ-4FL2tISwhdg-yZ^b?|ghJL*$g5}Lsm!QU@
z1W+R6vi2Qd2V|VV{)0Jin)n8o)%2zrVl@Ebm$13Dwsxa`6I2-*4qUlz6%}UQGd8
z2bPiqn4Kw|zglG4X(25`LqTCC?T+C&Iy{`3o_-`Lc?q(Noc$UYH|x~6g#u6Z?G5|F
z+Gylgxp)eO{zBEaE
zU-XJ90OXjYPxi7T1O$d)1p-m#JwyaYu#^Yy7is7TNx|-dC!2P!NJ9L)p%KkFgUF^s
z+s)5TYtz2%X^p%wERE6>cm|DQsiw_Xa>vLhz2@}$XSBnvEo8!YmxxKGD<%~K=Yzb3
zeWo&)+(_)u4!6>SQ|pO)s~WR$GR6x?1RS;XLOJMm=!CqFc|t=&HMF&rD+-Qsc|L&w
z!Q=V1@YgAeQN=hKmu`1NU6`&s}+R!ebE3mIHgAg+|mf(dvG}cvMO(#
z5930NaK7f(pWTogap81NV@`(08TIDgR=3v(*s?=HP6ZnYzuhX$I)V
zG~`~rJ-zh-t&zgk`8(ameB&|l<
z%n3-h!&T1NOwQg83-P~30e}FYuBfioUULK_Qczrchl?u=*~;#^ArAKTiLhkM&&-JA
zKl%$>hHT)6M?5PI(m)(va&mLQ1D(It13d$<4N8bBfH=TN_7LENy|bLw_O}z9f@fg-
z{uYdwioljh^myqGk{JMrfvFRB`2C4H8xyrM5DB7qEw90jZJd~(;o#r^e{1ZweQPYx
zcAVFVDC_jDgsKnEu5HKlY}Z?dkiheH+}mt8d0i&!6Yur4*M~*VNQh1lf}Zqm`p^
zKY)Tm9-sz5F9qisz{7C&(>>}9ex_IpLt8;^Q>0a=AC}eYl*;>#^_4fNspT;EF{`w%
zAS5pPIq&E6erZwFz?jd>l+J?v+G<98*9DLj9E8EiP_;lZT6
zrsk|Zo;82jn}Y9qvW*9|w<&Dgw2!3_Y-eES8-|RO9Qz(@=mt{@J$?8P1@6DFDD4(H
zDEsy0gST{&Vf#Kk!`yf-iO_EP{)94HQnxdNGQ~
z;}rAtdEX8wP^PP^Ypm8Y5{kb-6B(F1mNiUeZ%um
zd(8rvHa$q@;@W7}1Hc1_5=_noz6rZ?fyqu1{Frf9?C@Rz`9mO-I`dM%r-&cG|1@CP
zSV89iE|WInB|#)L2FnMFOFX>-frl2ux%{oxsEKBuRkx6}_TqubO2`+1f>m3Rwc}Bz
z4yeFra6OBGt!0lD{Gfc-EO6azLGD;|Q}q_@c-@9n+1juMm*Z_2Q`WgoQ`^v126Syh
zR~Lz^0x61sjICuT`Qr8}sn~4w3zl7DOR6qp#(}|M>Uwwc78lq2`
z*05$`Z8RMn@3fX{B0kBDRZWu@TNmS4@0U?hP&|XU$re#uSojmND%jPjP!e>ad2ikN
zv4=4^+?u{tY))CM8MSJ;L;kvz(s!3$pZJL0QL)J?0D342+n$U9vo(>a-Mu3tHz+75
zz+KdA#6w1WPVoj-x$lr;ryeOt1dJ>`fBt+K))%*v5g{q*JwZXi#=0UDQWKpVZjaTt
zb2nnZNir2sHmi2oCtqbq1ufw$6#O=0Q&8HfWITTjF!;*#>yiiwKRzo0MJx1PiojHj
zjAdtiZH-Yii;O|~s{uS4rj_fo!jddgVb>ggCS+u!oKo~*w6Etc+ACKC`q#7|unlr|
zpX^T);e~~Th-rjJ(*rHM2KK=0{CuNr{pm%GvuQ(lC=R0dtZyJF
zTGTrlX~SQyhGN;KEIu-l0+yQivu9Vy$Wl;M{+B)rSYAt_OZKm!GB50?^-mGGqFbyv
zaqm>omM>f3<4)?|DwnH~-TF-82oULLHD
zz8$|*s&SWxKz6ZRol)#F&)FnNwZvhrT_4R#h=Y>@
zz2hM3SS7j5$gJ>f(bqAF{FSui5{ZTj2Od0ml6V!482K!ehvXLV8+QFhSOw8&OR+)T
z0DohZr|0k#{O((CevvPsTGR$07m$!Jsl5E<{x{_;6{bK;jVQj6{_W-6)AB`g%e^ot%^;tMCbWxap&5{w?iEz&jisA0vI2;^rrhAIoOuGpl5LPW;!f6KD#=zis$_
z-_Q_+WZO4G+1Ji2T=PJO~aB25e~u$*AvJ
zypD#(w`F_-BQ3Wpp+mZ%VT}(eMniVb%sOf6$z1}qZ%akx1w^uiCh{FvjZ6Fu;>;S#
zeZ-zWmKjX@H*W7#%xojpZDITT)Y`{BqV@ZiSy)(1bFqFo!^Npt6uYab8C`kcz$xXLJDV}eQKaG~J+`&Xdu
zghpR0{QP?wH}uZVucoH=P$9AF)ks!4%y&cx+0UFOAt3=W>A;tuNVru;<@LD>58#^=
z>^cS4AHpuV1qe&nedn`?=L$vz&;z((p*S%ZNpU?Zq_(g;iHwXo>I{}RGlBBAsjV#l
zP%9T0rt!1~Q}G9b?>#`T<>BJ=4lKdbm@V()g9Lb>{4@HLK
z>9I4kv|Er02jD;zzoHQqM>>){ZsibO8K^K^6vUCV1K4=t@xoFp22lQ*m#qWs;yCpK
zr|*no*&Uk2z_0$hLCN<*vU5yN5QK+XA1&vv)lZY9M3`UzKwt%Te{|#5JHn5Ffp@gatkP(u
zAc})wAMGn76IGa9hhmIW@)$?GhGYL{V
z(?t0f^Lb-5wnlZzant;P?sL85YqIQ~Aoh5vk%R)fRyxWmQHXO<
z_ciP{Cq(*EFtd!XLzbt9E)ZV(#)c3tupwliJHaOUDn?55ru5wY&H$lH7n)_;Pw7-SF(oEqQ*=KC1Q>#aB{Y1-_a>5YOYfs4wU#|-Kiog$dhzQ#
z3d+j}o47e#EL3+&Yp;EPGXldsL#^$&lO|ge6BCcm22fbD>er3!-Fo==@i{QmScJ;w
z*Wh3TU`W#pEbN0BKT=WD6!g&K?2UM@6R`sSi4=8N1#wzUv*=zv;87&6#8K*l@)
zAMoYBkUgRV9PgsQT~|`|0CE_7;*SLd1!SA$76SyYz+|t;c2cwt11^b4Ky4v^>vgoV
z4MvqQ!2%B+Yys1R!wK^rDzg~ij}D~AFk(({JE6|^eS
z=j>=P1d9ViVv>XBe4p=FZ7NziII_}1-y0kp^y_OiBWqZ+&n{C}>u9eWK)6vI=#A8HT=Xpv+@B
z=?@u!jI455C-?!?o@S*3!$rTJPF_H@c>yK@0fgkt#)do);qYKMP0;5qk&+@_^Kk$1g@lB_humtv
zrfIJmd;qmxGtyl%KN$kpY5SbO)!pV8DkJE^QVO}Sf-|cbpckliW|x;+YdsHSafsl@
zkN4MseAHDM*aY-J!J(G|oMGC%P8bn!Kis+vnG@<;duRZtDA+ZY>V0X|)zvKqvchfR!Wd-074<3%%e6qaR2vobVB)pgRGb2p|tRfmaHk;IIFX
zBmmaHNIMI7t{%Am&{W;p+oK0>b~1l+u*%JKr+g1mcQ&d2h1jr`{F?IwM25zLL`Ly+q!qh6zGb~UPnsW
zcp$yWp;=}EcpZ3zW}Z52P0ejsh>2p7sMI+w{Nu*-N|e^Qx!S-7V6tpxx7fzuYX{Y~^sEEjgxh08__Lj9
z=;d8&-#&~jPgeWSai6u_@rJFsC)*ug2%nifdxQJNKP0gEj=KDNxicIg&uX&u+R-}R
zA-}w0?*=s=Dv^&Dlow~>S?;|Udh)TxlN7SH=vwa}87Kd7iLbiN`?vNTo0p6(e!Aru
zyk_j^I_qJ;Nt#4!U}+{(j#JyAhx=n)yBx=^H3`pHVs0K5GcwJzg+|ZAteHx_-f7|J
z%fTE<92^|kc;P4*!`lYV;R{rq!0^e{6GIE|2NVqv(|pxV%a5TH2K;2SHTClowNONj
z`|dwCZdiC+9%^p>0QHNgn!dM}7gB#h4ML58Ehu2gm{t6d)d9s0q$%h~86ZnonlLuL
zIWEw_+1UhUU=TVE_%+3PA8!H*_D=wX)bn(1^YLBN*4Bnun1_XijUDm3@Od99Er?&a
zBEtiDo#Kd}5iqcSeep@{R$3fvAp!(XJQp`sfj6)5ejipoT5e`=!%BIbSydC%=KLLE%9%EbA3Z1!a
zG&)rx&{s)>1mGwCgqt5^J3By$Fy$ARoy`J(bfLIgRHjf%?9STyI$%u#Dt;Sd$bQgI
zbp!?hXx~i5lEz1m=q(B3M-zW_riD}ppu|lrFu-p5<
zuvE}#$-v$>8{yU$6MVU#g#k9RjX=~Pc^R-&B5--fJ-6u$pPCM_eu}>z%uA?VWIIap
z2G_5kuryWWT`v#j`-Op(G#@u+HZ?Uhult556P8G`(JD5b+MAM(HO|GmJ1CaQJ5L-{
zkh5#*C=C=t>$Cfhk4m2EJ2p!`hwZ%!=Qwn3%kZgN@MkAb)>?f$h+*eO)%_Op_uGrJ#loXyR&dm^mo$JHEdtCdv0eALl@v*BRS!|NbTkA
z{V(0-dEk@82cu3Bxmu;)fQtuia|W*Y@9?lq>_n?C$uE*W9E%xCV$u(>3?V9R4
zbadU4vkvIJQ~me=fCmVz3%Li;eKyQ^h(-XhT-!Y#A`vl_j9=I7IoXx+7V&guo>
z7meQg!~Fe+6cD=q{R)kY4+gnnGjrzZ~`3jE*W46oQ>mf$~M
z`EReWX+1A6c%+uAB>|JVyBop6uOUgnSRsi;f5xM5X4QCCSMfQ9#BJ<+`nJ9vJ;C=t
z|Nr`GVfhI_=)O@CL>ydT=Xc7D8Fg1@!#lIQbR#k1tPrkIYXZDL$TC`tNt8
z+i6245_GGaUM%;f0tY9&xzS>T(5b_)!I8vcJ5keJEkw5y)^mv6={@A&xG}moj`QCa
zdL=X+OyAF-7lL(neL|xQA@c!v(eG}&_pkX4HztW8kt4ARq|huL!I_n!P`3-|-6bxdh$|WGb5dIQ2n&_sDs44_IC?+TOHX*Qa13q?I?S#fN(vnAp
zbPzt?H#dYM8m4rLAfOUO((=5#3?q9bZes?Ve;~aA_xvkUIlZ8Cfb$+52Bi?Fy;{jN_3^YrJX7TeJS^H?;KX1b3WyCg2U^PE)3Zd-=nn4q42*Cc>
z<0G4inpPO{Y^n3cRJd-^K)f}Ap{<(jCMxW4r9J~5DNPFGT`yp}!18{<-SwHc-s2ls
zeR&|tB$8XgPFU{BMgQ*4RE81-t-^ixZnf*CDGxOUbmw`NO6KDD
zxNj%$HI!V&mmoizM_>T8Cn$#K>A>&^U?iS#2LY%Pb`Rg;Uwz;(*V4BV;a_t%DLHxh
zZ{Z#2p)~`iNhc9-`O1}_@gkmoi_J*AkM|Ox3mY=90PTC6mS|oUKs!9zccDvgN#Zwb@8URYWNsz0)&+Z><$HmMkVM4j4c
zM%%a07ebrec?Uzf-S~M)Y8L=b9!SW)t%Gccmg$>x2&sOr)m{MB~@*Y|7Kcg
zmExcxS)*FNOa}g$qGF9I{9He1W4iSAg&Rd%LYCf-cPyy6j<_#Y0%INft=tN2g<=bj
zh{U|^f9h*K!LjR%AkRN{RAIk<(bsnJ!({Mj$@(^a?nt>$jwM7IMT8ZDp*|S0OE>#R
z7#@+={;*lBh$n)0k;xYr({bOKI~wXf&C1lN8
zn2T~|*nBt(a3_E=#Tw#kzez{1wnFuqP~ih?#J^taYr_hK>U29*hcNvi;B)c-X-oiU
zpKA*ze1(j(z>vXRg?L$rB7jeyz8~ee_y>?{2f%f16|=LGqqHgae7#y^$Nq2Ye&S5wl8~EF!2^Xi*NDW8o7_MZ_kW~UZ)`c(^S`Xy|hk4@9Rvyv5g5bEN=
zgK2f2>DTb?){3G|-}@+w0xB+kNT_qulo((*79e=|{tc>>e8%NzN!8)GJLQ~}X{u~S
z#>V&EJaiUJ@rGIme3a!Z?%Y;g_;By{gUU1;{1){X+u0w8NfAjd)|;rP-s@1x`OQd^
z^nAcdX*TcL<>@~b5O4HVn|W@}qeX{ToOm9ja<85V55lK{0zz646%X%ZXwlxd^Qqc1
z3$n)bVpqS#&MTGAM$e3v2x834*WyH#2AF5t+=R#duYBsGPz*ABt}s~G^g%ISO6JAH
zgYckruN-C&AP;t83&WD)0v)`=#$xP}M$FmW6tu+;_4ju4jiHcT_~BryPP=5uXSC3?
z6^A?TlMaSDi{x%dcxVL!0e=geOAbf5MnQKt9bYePHOMY^FX5V86+>o=DEsJHWe
zW#^+?{N#QGwK`O3S0LqWe0X5&A`|u?BoT?
z*InoR+kU;a(CMD6+^`G0GhOw;NNO^4@5dKq(=PZ*$za%iHtEW*^$*$_s$AC$ZWM|*
zzqG{d5RHXRH0JXqP=jeump;k6fn~8fCZ%~%>!5CMQ
zaB4AU{ozUm1?TVm=2`
zth>rQ?u6+?cidHEQaHJ~wY_d-W4ieUXRC%xgXAWC?@-4bYI6JZwLGBl
zBC8^!%}gyXrde-OQX-4lYDxoyvz$gs@6r>r=7p=9n+_HnvZ&2_&YFUX5z0(XNpaHe
zcJOzAwSmIwXnp3;MN{m4o5=3>oM~`WjaB0q1;UMVnR*S`jG*&?Z?
zAS{w6(=CmmF67|9x&9(VgRNQ|9)a^%LkB8z0K}L#3|Hp3n&>JDeYdT|m@N{0HAIsp
zZ71}QnpyUHv50X=Ng*~l@f2ulnc03(%mn5ihb~czR$FdmWp&`-K|J-_#e7e^U>Cm`#Byu7;s0_P%@Wf(PQ1mZIj
z>{CsR2^zsmyqZKKH+0OW-UCUp=yR5X9T5EjY(pWv(~yU(uvt(NJH+H5ltE)=Kx5dE
zb8B%DcIc)(LoL|^hVUMtqzOxe?lspR{jRvFkVZ+z0XQNAB<2D~`Qq}+jT?0!+DYu&
z7b#(qG0496#+gQqXzc3!4JlrLVLoF_PJh7#>P35oQ`Md0+lm9JW$j%)i$MYqF{|&D
z?9JdWiYaC6AquZ+O_yBY!M^HwocB%-a1zS(3jgZy5Mha-e&6L48L2`%Dp?tnL(Q25
zm~3{IKVsq+kCmzDc)Ee@99!Z|PW2=;;Vh2bu8--2@j6*YEMyDY-pR!<4s{?Cw3986
zAE5;L04Y!_a9hu}G^aV!j#X5j9uNUp15dp?CTurhRGsk^?<@A})tKm#wGA}q$utW?
znW@&r&;)d^3Ib)pbqtn13Thhil2`A1!P7YK@f^5-I;djboQc1U$Co{nA59ZJ5aUNIABF-5$__#lyxUx;J|
z{VizTT~x9{muda_Jp4ZbSS|x;jL907Bj^`$Kq>`d*I^CkNBz}_wT2yXRoPuhF0NnX
zos{v}Tf0=$Q9KZ&);p=s7_SGrl(kTv-k+BrD
zPf@6>toJzgCET-V0IWn3LI=wvc<~8d%)cSokm)-d)u?`GRT3bv-CMosft#D+%=$xz
zH~?@g(-OY2%A2RcY4LhiB|XQ%#=Ywp8ApbPH)BlzG>0m710?-Ylrk0gP6fUyF9_9#
z0l8F)ivxeJ&*?bR%Y*+*gi27at7ll^8eomoX7jT9RoAfPic^M5I}GF9kM3fi#9mUi
zE8QJEbL3`1cj0l1FAk2IrdL)@hS88=+`tQUcG>Gp=k}co%9yN}bmDtMd8@pY!okUD
zYP-pD>zDC*7P$(I&urAKJW~$+rt~P4eJ-sws2mzwp2^`G9n(K(LijDyJu#tuk1J(b
zq$&CuU4$@kALEC{r;vCa_Cqq^Kud+0W!vDO$qYx<^|k}#7GTQt-fnW&;Rj5)b#>rE
zCrC{&hWS}gSL$zGTl;~FJ}gi2s1j9;R{i82Q$K<`K*~eJrFug#h32W8>xU=xGxRiO
z!HY`Z8t}|=V<^rpWB-_p5&*H?pe{Hh4uk+%fj`m()n{vG=U*u);&4%rR}9xV6t!n~
z^w|3`C4yq(5d&LzAJc=5wl;9E;+W>@7eD01g64~=Ml-{*TMxvA6u+MY~VF
zI
zZ%nF474~+(_~pQX1HaR)0w^F;
zXIqkbH51M*c!t*PHdH^#V%b+n6&t@;ynn;wt>^
zA8c}IGF&U(Da)yQ&Xhuh0AO;riOc0^U#Z%Djo6@zM-LwfyhwLmX~i?ndUNNdbEjm+
z?W|k&f8{@46gi((^-8^({wHVar`fO@?nkFDokrmcRQKtp_of@(3
zP|_Synp4erwjOcMfyN2pgiIl-S7d>`{`KZ862w|YBw+|4!3i@oC;}w%QbR1VihJ;M
zC_TX$e#(CNH##P#P*aTJU0VnE4a9U$bLKuzYLiSSX#W3{|{S#Fz%|0&DOff51sCD@(Zur`iD19BU{CNYNbC}{-N
z?^J6o;B;D|1#0iFd)s5OP?hOnd`fZOxVzXwbP>_gF?ox~j0a0%bT)S_8(Rs!M;b6W
z6udg3w5gle*{9%1Jcun8HVu80)L!$NQ~zgRd=V3X@e~H^I&e8Y<+5I_(5vKbHx`0X
zK%Q_1$Cjs(H{1PGWQ3cB0fCbS77GsSDYx*ia7hD9&XvU9p%iF?M+$-=1}8LHpIp~Q
zw5GXD#@O>At}M&>Xd
zqIm`j1&Yoh7~`aCs{(d7_Z}sioM=*a8fq=>JzZNUmWJ-siPZ`P91AH%M{4&;oIrW>
z#`w*Bn>8O{Q%JSrSwunG8gJXVzkSi+mF*KP&!$J~6i>Z-v?eqsh?Y;8>ghFWu-mrP
zN#hwd=23;v^3pT=9=%LIEozn<_r-8-`-s`doXmvygh>-PUAprbAO13RT9lK1-Lcs=
z@}%T$@T52wHFUd($U0ne?7~m<>!4DdNWG!A*>IsqLQoX`1Zk%-GKpum&g}h9cu(z2Pfs5M_2CV_A;{`8@2>jS
zRaCrB7{mI4lCQ*wb?w?ePd7A|>=ziM5fB59!sm1=Qy^YiXD`z!yq-rfe+1&zvgHa-
zPp^4~%@Os4RLv{~2xQ~{WGvaVWi_|-7QIHl&dSWA?r0isLHNc77Gcqi0TdT5j-8!d
z^(wBtxJHCYz>QlkWw3z=6W<&hFJg9$MMcZyS--$(
z)`|6*!u-R7eXlQu@!-p;Y6YdzQ7PBH?ZW5}W~-FiZj_KP5*^jm{Dd-jJ=o%I+C53O<}D-X&XEtqT+Z-c@V|in{z3v{8&<7WJ|6%0oRM~mblMdmB5T#c{Yki<>$rp%bWVd9Q~d%-Ww?|h`Byicnf~b
z=DVZSlj(vsuMZ!jbj^v}`D^M+!_{q8Bc-(WezG1=(`DMFDKhr+)qduo-O!b|Iu1?}
z;nQj%r!nIw5C%;BXyn)S$j3sv&@Bn@fb(zy9D+$GmgSw9{}Gw~!f=5<6=h(@;9yAK
zJV=^F;kA+@3SEJ)c&y+qvkROZXv2D2KlAxKm4b1L^5ivv=rv@-9f@&CXD`5yX$^k>
zyz90d`wYwkU1<(R?L=dR=1%;HM6-cL9Mt4CTUqn9h*e`N+obO9rJN`$VR4Abv&99&
zW4G&4mJ&2%<=JdFpjwQg*T~zcG`jyQ$zGK5n`F;{W7gk<4T2qE&cR2*kRXTN4_zMQ
z{`~PJ0ocIV6yb)Oq-_l5sE|IE#{?;7Or!0h;l?
zniA~`9g_wfQVM5(KVwUmEP3phO{ya8Ju#T$p6mZ7lzmJoU+F#=W^rS|pF2991pF5$
zJ1Y+l50V`euz&V8qzFR3)5iwHzR%Jfecefn2AIBA*PAv<^qMw?eVS4uw)SYItz9fX
zF!&6=GSq_EHZAs@JUlgIDo(a=j2tiota^2Fs~gQ4V*Z8ipa@hAR2;1gt?xMb
zVoADHq*Bq9j96@-(kDa9XkA~t@SQlTH00(wV#^_ZH{G>x%olwDWV2xX6}viwVc(6k
zY{qK&L*)X`OiT?n;#Jy#c54-~+0G|wE*NWDbWdz5XIDA9w<;oWI>!s$ETtP9519rA%^_Q*=_Dp$YrO!|{H3Ut=
znhhHY;5|G=`#CHuOyH~d>JRzL9XiV5;W%fWEE6g?Gt@#q0q&s^Hxea$3HG&DxZC%t
zQx=c+*D-?J1p`Q?a3EB|z!IQ~fe2gxKn7pZgJFqO*$eS?e
z%^Pnuys>54tR3-~G8nvX!blIDx8ev_nF%`DN@cI{e?k%;Qm
zm`;qF>QYQODI_>1(FwcxBI`a<=qD5gx45zG|F%4R?bUdtGxAmO?r-w5Sl8*>KIO@t
zu3^RU*EvyKmYP0U8Y1r0w#dF-_U8oKqw)*fGBO-cokwjWjb<;G9%QdNvnBNLRLiV~
z$H3Mh9ucc5rLO{-!4`vDv@9Zn`>GxLu`Me86VZO^??ijGU%AFhp6LVLsMO|um31=qg(XLc|GUc4iGg78QP9jR0IfPo|;vN$92-Bt8D
ztMM#+{PvCP7lG0o>(*%>b5O@9!lEG^;s9`?R=l%>GJ}>9Bw)~Zsl-}P5z+-T0knfK
z-PJ!n?aE`ISmo^PXU?I
z(pk-#HB;Zzb)KRVwTvg2P&)mU&64EGa)MCMs8(LA(
zlA-~)dU$%?uP3IUq+}%?K?vXPtw*?U^+eANLZp+B03l}DOyw;c04Z4Rty1v2_>Xk_Yu$gQ
z<2i)?8+3feZj(k9XDG;zo9qnnkW;W!<@Gb%J(`-97L?U^E{z9l+)8OVy=~w)t%Ia733F$Uktq5@|RIFfm0{|ezuxj26FlAv93q~`v
z`XxjaBVA^ap{g50>-Nd;G4)*ossK(0M5=FmhvY
zQ+k*316?Hnp&?H!eg|KlHXOKuM^6omg2SCXt`P0{VjjI$`%f0o-g#5W#6!_TJz|
zFpJpB{JY`vTG_DVZ#LX*on=4m9L{zevK9>soLE^|R3)jrPoIQtbjqW|KHOCrh>pLE
z(P`JPinBg!Srj4=HAxnkuEr2iA)amq;t14TH^V%uQ9(NNM`UkuhJsI?P;`>F!sTY6
z5lllE$IN^81t(ilt)QKu5;rJT
zq`&UoeQDidtx^w_8Ezy@mA_gTVY!d613T{!pe;BCeUOG&?yFbal%E-?Mz#ndXzqA<
zF_>ol45+)d*Fn;7s0jtGZcjZFuT|Vld|+e!+S3TckLf##GS$XLdO_U`b<4IfFA(Yo))P65QM
zj)xqaQ5WubD${VRF7f80-ZSTo*xe!pwK|Gdfg!JqA2Q&b$+~{FO7|Q*Bhz!~f^Xw@
zXEy<-DhWpifZ>mh5Q_*?qfjK484Ao1PjOGer#PJcjjUfc^@`_dT(PT+!wl%Hj}SNE
z#j|)Yo(8}p61~l*ygZnOIGP=22k29-mHU;7zB6uM#z+l2U2kQW3-9X#Kgqc6^GWw%ai5M$LY?g_a&;ZUP^%X$JAvN;e^OP*b(!~$~?#x
zjOGTb8@e6)>FTk>^q#+AX&1t-1>#6mz!x
z*4#UU$R|j=nAjJl+ToFr%K&wF`yd;;;fa5XsR5!9_Co_LL_Q7*AOhC|Yyt7(<>FF6
z9Zx0uGv+$x5Vy}?xDW-1DeWe-3@E(Hgph!~r;m{!Tu_f(U6)ngS&r^T7iuTI4jiF7
z(ECu3HMJYOKRmsAQ9C_EN+2tO2Z1Sr4$Xn7>hi~XCo`-^=r4UlROGT%tMoA78Xsub
z@OzfCe8mcpmxZL512AcGTLq2MG%MbBg4&=;BOeb+`VC^%;b?_E8HxOysJ@&1wLZ8{
zBS`%MzQa;Ii>k5{Yb*kbf-#dx#B5-+yP^UT#IjY&^K^FI-TK9N8U-Nsz^luO6gEG0mF3hs)x#O_7;c
zv+W$ZJXnJqU>z$)WGd}%lUo-6(<9b%T`ii*@6C_l;
z&hvWLlwsE37q9adQ?Yt)k=VTKPOe<8_Fa~#j+CgGk=_kW%Q87`Q+@oDrY3mwRi1dB
zJ8p${l?O>Vs6j2L5gWe0o=aR`khbj_cz?Q9h|5_6dv}yoT=_C2tJnV#I7m74zE`JX
zHfCjIk>wq(oybeQYq1VkP-mEE%#nQs=F3T`|&s-G$dVx*cR|q;gwNPQsRcW5OFbl2MupklK~0FXi85d{gLkW7FM9XZ*eL_+et`h
zhqez+NBi2f5&&NiE+5yKgA8PmFMhCt_~lW)a#5EesU|8hCB*;%6-mYo0XV{kGzDbd
zZ8yPB*!mqOoFRWzqV*H2HYMQ}AWDSc+X9y9{+m*73-gj8l5SKn+!y~9VC~lz0kEqC
zm5^cW+bx(Skxmm8*J^QbaSUM(VUtld)LPg8hvzmZZP1*m5i*4fDxziz<(csB_)l=Z
zs6&G~it!$?4G<_L3TFV~x!V3M?oz?GZ{ml8njqJXBJb`kBBCI^<>S?a+VBz|pHN!(
zPP*b_b76NU!B=>>QOev$3K=27q1NN0e2!3NL%Q_L8DjwF%TTrEV}T;BJ(OFW_@5E8
z$Dx=D4C;ggkmnhNdT@R#NPJj|gS6)yAKtmMn3yMU3lYxNJl=W+{s-dGfF`VBKFgq!
z_J&{c)KKWPB80B)M{7%BWJs^X&reUrzTzfi!=*-MCO~hs2)vLeWScKRCKOr5cFR`N
zf5`tDaBBUr>i#>-YOjN5+M>oSdTQ=JsBxGw&$BHLkvNz$T_Uk}-^ny>);_VQqfOg?
zzKvH%ZEtAK4Y}+Gk9yHrOVheskJk#X#WGfYk39WR9O{)ZJNu?nPfIF`e1^8qI}god
z*QOfo`$ObwHYNOq`=^?i_?l>rZ~7^oxjOB}&-geMNFEdA`-WP7{3tm-lC#~@a2WDCd6rBZ@~cbIN0-^FHwGZcQ+wk2|e^81t*QQ
zK1o-8d9IQ)w{08~$Z-BA);?o~G*EaiZ4kJ?icN)pK2mg;iK3drYrz%A+A$USaVwo$
za*Q=Vt+hS|;by6}GYf($hp||Jzv>uXzWM*165{oq|X_3o;=+w5F%iOQJl
zDE@k