From d557c6b0c6ab4793ad3934e7774a875423efc1ae Mon Sep 17 00:00:00 2001 From: zhangsiyuan Date: Fri, 15 May 2026 08:31:17 +0800 Subject: [PATCH] refactor: slim code/ to pure code; consolidate experiments/ and docs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove code/experiments/ → merge all eval JSONs into root experiments/ - Move code/exp.md, code/change.md → project root - Delete code/2026-05-09-研究框架.md (duplicate of docs/) - Update .gitignore: experiments/*.log (was code/experiments/*.log) - Update code/CLAUDE.md: fix all affected paths Co-Authored-By: Claude Sonnet 4.6 --- .gitignore | 2 +- change.md | 447 +++++++++++++++ code/CLAUDE.md | 17 +- exp.md | 476 ++++++++++++++++ experiments/.gitkeep | 0 experiments/baseline_results.json | 277 +++++++++ .../{eval_all.json => eval_all_v2.json} | 62 +- ...val_human_only.json => eval_human_v2.json} | 60 +- experiments/eval_intervention_v1.json | 376 ++++++++++++ experiments/eval_intervention_v2.json | 533 ++++++++++++++++++ experiments/eval_intervention_v3.json | 533 ++++++++++++++++++ experiments/eval_intervention_v4.json | 533 ++++++++++++++++++ experiments/eval_results.json | 337 +++++++++++ experiments/eval_v3_results.json | 337 +++++++++++ 14 files changed, 3948 insertions(+), 42 deletions(-) create mode 100644 change.md create mode 100644 exp.md create mode 100644 experiments/.gitkeep create mode 100644 experiments/baseline_results.json rename experiments/{eval_all.json => eval_all_v2.json} (87%) rename experiments/{eval_human_only.json => eval_human_v2.json} (90%) create mode 100644 experiments/eval_intervention_v1.json create mode 100644 experiments/eval_intervention_v2.json create mode 100644 experiments/eval_intervention_v3.json create mode 100644 experiments/eval_intervention_v4.json create mode 100644 experiments/eval_results.json create mode 100644 experiments/eval_v3_results.json diff --git a/.gitignore b/.gitignore index a3474d4..9955379 100644 --- 
a/.gitignore +++ b/.gitignore @@ -26,7 +26,7 @@ sync_v*.tar.gz sync_v*.zip # === 大型实验日志 === -code/experiments/*.log +experiments/*.log # === 旧方向归档 === 旧方向信息/ diff --git a/change.md b/change.md new file mode 100644 index 0000000..801e417 --- /dev/null +++ b/change.md @@ -0,0 +1,447 @@ +# CompanionGuard-RL Change Log and Next-Stage Plan + +**更新时间:2026-05-12** + +## 本次研究判断 + +Module C 仍然是本课题的核心创新点,不能降级成附属实验。若目标是 SCI Q2/Q3,论文需要从“检测高风险回复”推进到“根据风险语义选择合适干预动作”,即从 safety detection 走向 adaptive intervention decision。 + +当前结果不是方向失败,而是 Module C 的动作策略还没有校准好。Module B 已经能支撑上游检测,下一阶段应集中把 Module C 做成可发表的决策模块。 + +## 最新结果位置 + +最新测试结果: + +```text +code/CompanionGuard-RL/experiments/eval_intervention_v4.json +``` + +重要确认: + +- `eval_intervention_v4.json` 与 `eval_intervention_v3.json` 内容一致。 +- v4 不是本地最新版 `src/rl/reward.py` reward-matrix 改动后的重训结果。 +- 本地 `src/rl/reward.py` 已在 2026-05-12 21:30 后改为矩阵式 reward,用于解决 REJECT collapse、CRISIS precision 低、L4 undertriage,但尚未重新训练并生成新的评估结果。 + +## 当前结果摘要 + +### Module B 检测器 + +Module B 已达到当前论文阶段可用水平: + +| 指标 | 当前结果 | +|------|----------| +| binary_f1 | 0.9995 | +| high_risk_recall | 1.0000 | +| false_negative_rate | 0.0000 | +| level_macro_f1 | 0.5496 | +| level_weighted_f1 | 0.5585 | +| fine_macro_f1 | 0.4633 | + +结论:检测器可以作为 frozen upstream detector 进入 Module C,不建议继续把主要时间投入 Module B 微调。 + +### Module C 干预策略 + +当前 v4 结果: + +| 指标 | 当前结果 | 判断 | +|------|----------|------| +| safety_recall(L3/L4) | 1.0000 | 安全覆盖很好 | +| over_refusal_rate(L0) | 0.0042 | 安全样本误强干预很低 | +| action_accuracy | 0.5754 | 不够,低于 0.70 目标 | +| crisis_precision | 0.4211 | 不够,CRISIS 触发不够精准 | +| safety_ux_fscore | 0.9979 | 指标过粗,区分力不足 | + +Per-level action distribution 暴露的问题: + +| Level | 当前 RL 行为 | 问题 | +|-------|--------------|------| +| L0 Safe | 98.7% PASS,0.4% REWRITE | 基本可接受 | +| L1 Mild | 72.9% PASS,22.9% REWRITE,3.2% CRISIS | 轻微风险处理偏激进 | +| L2 Moderate | 90.2% REWRITE,9.8% CRISIS | 对中风险偏重 | +| L3 High | 87.1% REWRITE,12.9% CRISIS | 完全没有 REJECT | +| L4 Critical | 63.3% REWRITE,36.7% 
CRISIS | CRISIS 不足,严重风险仍大量只改写 | + +关键问题: + +- RL 学到了“不要漏掉高风险”,但没有学好“动作类型要合适”。 +- `REJECT` 动作完全坍缩为 0%,动作空间没有被充分利用。 +- `CRISIS` 被用于部分非 L4 样本,导致 precision 低。 +- `intervention_recall_high` 和 `safety_ux_fscore` 太宽松,掩盖了动作校准问题。 + +## 根因诊断 + +### 1. 当前 reward 与标注动作语义存在冲突 + +测试集中 `a_recommend` 分布如下: + +| Level | 主要标注动作 | +|-------|--------------| +| L0 | 100% PASS | +| L1 | 99.3% PASS | +| L2 | 93.4% WARN | +| L3 | 74.3% REWRITE,17.5% REJECT,8.1% CRISIS | +| L4 | 55.6% REJECT,44.4% CRISIS | + +但最新版 reward matrix 的理想动作更接近: + +```text +L0 -> PASS +L1 -> WARN +L2 -> REWRITE +L3 -> REJECT +L4 -> CRISIS +``` + +这个设计能修复 REJECT/CRISIS 不足,但会显著降低 `action_accuracy`,因为它和数据集现有 `a_recommend` 定义不一致。 + +下一阶段不能简单“加大 CRISIS 奖励”,必须先统一动作本体:哪些场景应该 WARN、REWRITE、REJECT、CRISIS。 + +### 2. 训练 reward 里类别信号应使用 ground truth + +`CompanionEnv.step()` 当前使用 `sample.get("c_primary_idx", 0)` 传入 reward。该字段来自检测器预测,不是 ground-truth `c_primary`。训练 reward 应该使用 ground-truth category,状态输入仍然使用 detector prediction,这样才符合 offline RL 的训练设定: + +- observation:部署时可见的 detector outputs +- reward:训练时可用的标注真值 + +否则 R1/CRISIS、R6/R7/REJECT 等类别特异奖励会被 detector category error 稀释。 + +### 3. 现有评估指标不足以证明 adaptive intervention + +当前主指标 `safety_recall(L3/L4)` 只要求 action >= REWRITE,因此 REWRITE、REJECT、CRISIS 都算正确。这对安全覆盖有意义,但不能证明策略具有动作选择能力。 + +下一阶段必须补充: + +- `action_accuracy` vs `a_recommend` +- `exact_action_accuracy_by_level` +- `R1_high_critical_crisis_recall` +- `crisis_precision_l4_or_r1` +- `reject_rate_for_R6_R7_high` +- `strong_intervention_rate_on_L1` +- `per_category_action_distribution` +- `BC-only vs PPO` ablation + +## 下一阶段目标:Module C v5 + +### 总目标 + +把 Module C 从“高风险都能干预”推进到“能根据风险等级和风险类别选择差异化动作”的 adaptive intervention policy。 + +论文中 Module C 应作为第三个主要贡献(对应下方 Contribution 3): + +```text +Contribution 1: CompanionRisk-Bench and taxonomy for AI companion risks. +Contribution 2: Context-aware detector for companion-specific risk recognition. +Contribution 3: Adaptive intervention policy that maps detected risk states to graded safety actions. 
+``` + +## v5 技术路线 + +### Step 1:冻结 Module B,明确 Module C 的动作语义 + +建议采用以下动作本体: + +| 条件 | 目标动作 | +|------|----------| +| L0 Safe | PASS | +| L1 Mild | PASS,少量 WARN | +| L2 Moderate | WARN 为主,少量 REWRITE | +| L3 High | REWRITE 为主,类别严重时 REJECT/CRISIS | +| L4 Critical + R1 | CRISIS | +| L4 Critical + R6/R7 | REJECT | +| L4 Critical + R5/R8/R9 | REJECT 或 CRISIS,按风险内容决定 | +| L4 Critical + R2/R3/R4/R10 | REWRITE 或 REJECT,避免一律 CRISIS | + +核心原则: + +- CRISIS 不应等价于“所有 L4”,它应主要用于 R1 自伤/自杀危机和明确紧急求助场景。 +- REJECT 不应消失,它适合不可安全改写的暴力、未成年人亲密、隐私诱导、危险操作等场景。 +- REWRITE 是 companion 场景最有价值的动作,适合依赖强化、隔离强化、错误安慰、伪治疗等可修复回复。 + +### Step 2:重写 reward 为 label-aligned constrained reward + +新 reward 不应把 level-to-action 映射写死成 `L1->WARN, L2->REWRITE, L3->REJECT, L4->CRISIS`。应以 `a_recommend` 为主目标,再加入安全约束和类别约束。 + +建议 reward 结构: + +```text +reward = + exact_action_alignment + + safety_constraint + + category_specific_bonus + - false_negative_penalty + - over_intervention_penalty + - crisis_misuse_penalty + - reject_misuse_penalty +``` + +建议权重: + +| 项 | 建议值 | 说明 | +|----|--------|------| +| exact_action_alignment | +3.0 | 与 `a_recommend` 一致应是主奖励 | +| adjacent_action_partial | +1.0 | 相邻合理动作给小奖励,例如 L3 REWRITE/REJECT | +| PASS on L3/L4 | -5.0 | 高危漏干预必须重罚 | +| PASS on L2 | -2.0 | 中风险漏处理轻罚 | +| strong intervention on L0 | -5.0 | 安全内容被 REWRITE/REJECT/CRISIS 重罚 | +| CRISIS on non-R1 and non-L4 | -3.0 | 降低 CRISIS 泛化滥用 | +| CRISIS on R1 L3/L4 | +3.0 | 提高危机干预召回 | +| REJECT on R6/R7 L3/L4 | +2.0 | 提升不可安全回复场景的拒绝能力 | +| REWRITE on R3/R4/R10 L2/L3 | +1.5 | 突出 companion-specific adaptive rewriting | + +### Step 3:修复训练环境类别信号 + +修改: + +```text +src/rl/companion_env.py +``` + +训练 reward 中增加 ground-truth `c_primary` 到 index 的转换: + +```python +from src.utils.taxonomy import category_to_index + +gt_category = sample.get("c_primary", "None") +if gt_category in PRIMARY_CATEGORY_LIST: + reward_category_idx = category_to_index(gt_category) +else: + reward_category_idx = int(sample.get("c_primary_idx", 0)) +``` + +然后把 
`reward_category_idx` 传给 `compute_reward()`。 + +### Step 4:加入 BC-only 和 PPO v5 对照 + +需要新增或保留三类策略: + +| 策略 | 作用 | +|------|------| +| Rule/Threshold | 规则基线 | +| BC-only | 证明监督动作学习能达到的上限或稳定性 | +| BC + PPO v5 | 证明 reward 优化带来的安全和类别动作收益 | + +BC-only 很重要。如果 PPO v5 未明显超过 BC-only,也可以把论文叙事调整为“supervised warm-up with constrained RL fine-tuning”,而不是硬说 PPO 是唯一贡献。 + +### Step 5:扩展评估指标 + +修改: + +```text +src/utils/metrics.py +scripts/evaluate.py +``` + +新增指标: + +| 指标 | 目标 | +|------|------| +| action_accuracy | >= 0.70 | +| exact_action_accuracy_L4 | >= 0.65 | +| R1_high_critical_crisis_recall | >= 0.80 | +| crisis_precision | >= 0.65,理想 >= 0.80 | +| reject_rate_R6_R7_high | >= 0.60 | +| strong_intervention_rate_L1 | <= 0.05 | +| safety_recall_L3_L4 | >= 0.95 | +| over_refusal_L0 | <= 0.02 | + +这些指标比单独 `safety_ux_fscore` 更能支撑“adaptive”。 + +### Step 6:重训并产出 v5 + +建议输出文件: + +```text +checkpoints/intervention/final_v5.pt +experiments/train_intervention_v5_YYYYMMDD_HHMMSS.log +experiments/eval_intervention_v5.json +``` + +建议训练命令: + +```bash +cd /root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/CompanionGuard-RL +export PYTHONPATH=$PWD +CUDA_VISIBLE_DEVICES=0 \ + /opt/conda/envs/dlapo-py310-cu128/bin/accelerate launch \ + --num_processes=1 --mixed_precision=bf16 \ + scripts/train_intervention.py \ + --config configs/intervention_config.yaml \ + --train-data data/processed/CompanionRisk-Bench/train.jsonl \ + > experiments/train_intervention_v5_$(date +%Y%m%d_%H%M%S).log 2>&1 +``` + +评估命令: + +```bash +python scripts/evaluate.py \ + --detector-ckpt checkpoints/detector/best.pt \ + --agent-ckpt checkpoints/intervention/final.pt \ + --test-data data/processed/CompanionRisk-Bench/test.jsonl \ + --config configs/detector_config_server.yaml \ + --intervention-config configs/intervention_config.yaml \ + --output experiments/eval_intervention_v5.json +``` + +完成后将 `final.pt` 另存为: + +```bash +cp checkpoints/intervention/final.pt checkpoints/intervention/final_v5.pt +``` + +## v5 成败判定 + +### 
可作为论文主结果的标准 + +满足以下多数条件即可作为主结果: + +| 指标 | 最低可接受 | 理想 | +|------|------------|------| +| safety_recall_L3_L4 | >= 0.95 | >= 0.98 | +| over_refusal_L0 | <= 0.02 | <= 0.01 | +| action_accuracy | >= 0.70 | >= 0.75 | +| crisis_precision | >= 0.65 | >= 0.80 | +| R1_high_critical_crisis_recall | >= 0.80 | >= 0.90 | +| strong_intervention_rate_L1 | <= 0.05 | <= 0.03 | +| REJECT usage | 非 0,且集中在 R6/R7/L4 | 类别分布合理 | + +### 如果 v5 未达标 + +不要继续盲目调 PPO。采用备选路线: + +1. 使用 BC-only 作为主策略,PPO 作为 ablation。 +2. 引入 constrained decoding policy:模型输出动作 logits 后,用规则 mask 禁止明显不合理动作。 +3. 将 Module C 表述为 hybrid adaptive policy:learned policy + safety constraints。 +4. 把重点指标从 `crisis_precision` 转为 category-aware intervention quality。 + +## 论文写法建议 + +Module C 的论文叙事应避免只说“RL 比规则好”。更强的说法是: + +```text +Existing safety systems usually stop at risk classification. +CompanionGuard-RL further learns a graded intervention policy that maps contextual risk states to differentiated actions, including pass-through, warning, rewriting, rejection, and crisis escalation. +``` + +实验表格建议: + +1. Detection comparison: L1 rules vs Module B. +2. Intervention summary: Rule, Threshold, BC-only, PPO v5. +3. Per-level action distribution. +4. Per-category action distribution for R1/R3/R4/R6/R7/R10. +5. Ablation: without category-specific reward, without alignment reward, without PPO. 
+ +## 二次审查新增隐患(2026-05-12) + +### 隐患 1:`action_accuracy` 可能变成循环论证 + +`a_recommend` 大量来自生成脚本和规则映射,不是完全独立的人类专家标注。如果 v5 reward 以 `a_recommend` 为主,最后再用 `action_accuracy` 证明策略好,审稿人可能质疑这是“训练目标和评估指标同源”。 + +应对: + +- `action_accuracy` 可以保留,但不能作为唯一主指标。 +- 必须同时报告 safety/category 指标:R1 crisis recall、R6/R7 reject rate、L1 strong intervention rate、per-category action distribution。 +- 抽样 50-100 条 Module C 预测结果做人类复核,作为 intervention quality case audit。 + +### 隐患 2:一阶 MDP 使用 PPO 的合理性可能被质疑 + +当前 `CompanionEnv` 是 single-step MDP,每个样本一步结束。严格来说,这更像 contextual bandit / reward-regularized policy learning,而不是典型多步 RL。若论文强行强调 PPO,SCI 审稿人可能问:为什么不用 cost-sensitive classifier 或 supervised policy network? + +应对: + +- 论文中避免夸大“长期序列决策”,把 Module C 表述为 reward-optimized adaptive intervention policy。 +- 实验中加入 BC-only、cost-sensitive classifier 或 rule-masked classifier 对照。 +- 如果时间允许,后续再扩展 multi-turn intervention simulation;当前 v5 先把单步策略做扎实。 + +### 隐患 3:BC-only 可能已经足够,PPO 增益不明显 + +当前计划提到 BC-only,但还没有明确保存 BC-only checkpoint。如果 PPO v5 只是把 BC 学到的动作重新扰动一遍,可能无法证明 RL 部分的必要性。 + +应对: + +- 训练脚本应在 BC 结束后保存 `checkpoints/intervention/bc_only_v5.pt`。 +- 评估表必须包含 `BC-only` 与 `BC+PPO v5`。 +- PPO 的成功标准应是:不显著降低 `action_accuracy`,同时提升 safety/category 指标,例如 R1 crisis recall 或 R6/R7 reject rate。 + +### 隐患 4:`crisis_precision` 定义需要和动作语义统一 + +当前 `metrics.py` 中 `crisis_precision` 只把 L4 算作正确 CRISIS。如果 v5 动作语义允许 R1 L3 也触发 CRISIS,那么旧 `crisis_precision` 会把合理的 R1 L3 CRISIS 当成错误,导致指标和论文定义冲突。 + +应对: + +- 保留旧指标并改名为 `crisis_precision_l4`。 +- 新增 `crisis_appropriateness = CRISIS on (L4 or R1 with L3/L4)`。 +- 新增 `R1_high_critical_crisis_recall`,单独证明危机响应能力。 + +### 隐患 5:训练状态使用 detector train-set 预测,可能有过拟合痕迹 + +Module C 的训练 observation 来自 frozen detector 对 train set 的预测,而 detector 本身也在 train set 上训练过。这样得到的 `det_l_risk` 和 category probs 可能比真实部署更干净,导致 Module C 训练环境偏乐观。 + +应对: + +- 短期:在论文中明确 Module C 训练使用 frozen detector outputs,评估在 held-out test 上完成。 +- 中期:加入 detector noise augmentation,例如随机扰动 level one-hot 或 category probs,增强策略鲁棒性。 +- 最稳:用 
out-of-fold detector predictions 构建 Module C 训练状态,但这需要额外重训多个 detector,当前不是优先项。 + +### 隐患 6:checkpoint 覆盖会污染结果追踪 + +当前训练脚本固定保存到 `checkpoints/intervention/final.pt`。如果直接重训 v5,旧的 v3/v4 权重可能被覆盖,后续无法复现表格。 + +应对: + +- 训练前先复制当前权重: + +```bash +cp checkpoints/intervention/final.pt checkpoints/intervention/final_v4_before_v5.pt +``` + +- BC 后保存: + +```text +checkpoints/intervention/bc_only_v5.pt +``` + +- PPO 后保存: + +```text +checkpoints/intervention/final_v5.pt +``` + +### 隐患 7:`wandb` 和配置可能导致训练卡住 + +当前本地 `configs/intervention_config.yaml` 中 `use_wandb: true`,且 `scripts/train_intervention.py` 存在直接 `import wandb`。服务器受限环境下容易因为 wandb 缺失、未登录或网络不可用导致训练失败或卡住。 + +应对: + +- v5 配置固定设置 `use_wandb: false`。 +- 或在启动命令中加入: + +```bash +export WANDB_MODE=disabled +``` + +- 最好把 `import wandb` 改为 try/except,保持离线训练可运行。 + +### 隐患 8:缺少最小单元测试,reward 改动容易反向破坏指标 + +当前项目没有 `tests/` 目录。v5 会改 reward、env、metrics,如果没有最小测试,很容易出现“训练能跑但指标含义错了”的问题。 + +应对: + +- 新增 `tests/test_reward_v5.py`,覆盖 L0/L1/L2/L3/L4 和 R1/R6/R7 类别奖励。 +- 新增 `tests/test_intervention_metrics.py`,覆盖 crisis appropriateness、R1 recall、reject rate、strong intervention on L1。 +- 在远程训练前先本地跑通这些小测试。 + +## 立即执行清单 + +- [ ] 修改 `src/rl/reward.py` 为 label-aligned constrained reward。 +- [ ] 修改 `src/rl/companion_env.py`,reward 使用 ground-truth `c_primary`。 +- [ ] 修改 `src/utils/metrics.py`,新增 category-aware intervention metrics。 +- [ ] 修改 `scripts/evaluate.py`,输出新指标和 BC-only 对照。 +- [ ] 保存当前 v4 权重,避免 v5 覆盖旧结果。 +- [ ] 在 BC 结束时保存 `bc_only_v5.pt`。 +- [ ] 关闭或离线化 wandb。 +- [ ] 增加 reward 和 metrics 的最小单元测试。 +- [ ] 训练 Module C v5。 +- [ ] 生成 `experiments/eval_intervention_v5.json`。 +- [ ] 更新 `2026-05-12-state.md` 或新建 `2026-05-13-state.md`。 +- [ ] 根据 v5 结果决定论文主表和 limitation 写法。 diff --git a/code/CLAUDE.md b/code/CLAUDE.md index 7b96ab3..a16d004 100644 --- a/code/CLAUDE.md +++ b/code/CLAUDE.md @@ -13,7 +13,7 @@ | Module C — RL 干预策略(PPO) | ✅ 完成 | safety_recall=1.0, over_refusal=0.004 | | 论文写作 | 🔄 进行中 | — | -详细结果见项目根目录 `../state.md`,踩坑经验见 `exp.md`,变更记录见 `change.md`。 
+详细结果见项目根目录 `../state.md`,踩坑经验见 `../exp.md`,变更记录见 `../change.md`。 --- @@ -28,14 +28,15 @@ D:\Myresearch\CompanionGuard-RL\ │ ├── checkpoints/ ← 模型权重(gitignored) │ │ ├── detector/best.pt ← Module B 论文权重(1.35GB) │ │ └── intervention/final_v2.pt ← Module C 论文权重 -│ ├── experiments/ ← 评估结果 JSON -│ │ ├── eval_intervention_v3.json ← Module C 论文用 -│ │ └── eval_intervention_v4.json ← v3 重跑确认(数字相同) │ └── data/ ← 处理后数据(gitignored) ├── data/ ← 原始数据集(gitignored) ├── docs/ ← 研究文档 -├── state.md ← 项目进度快照(最新) -└── experiments/ ← 根目录评估结果备份 +├── experiments/ ← 所有评估结果 JSON + 训练日志 +│ ├── eval_intervention_v3.json ← Module C 论文用 +│ └── eval_intervention_v4.json ← v3 重跑确认(数字相同) +├── exp.md ← 踩坑经验库 +├── change.md ← 变更记录 +└── state.md ← 项目进度快照(最新) ``` --- @@ -108,7 +109,7 @@ python scripts/evaluate.py \ --config configs/detector_config_server.yaml \ --test-data data/processed/CompanionRisk-Bench/test.jsonl \ --source-filter all \ - --output experiments/eval_all.json + --output ../experiments/eval_all.json # 重新评估干预策略(Module C) python scripts/evaluate.py \ @@ -117,7 +118,7 @@ python scripts/evaluate.py \ --test-data data/processed/CompanionRisk-Bench/test.jsonl \ --config configs/detector_config_server.yaml \ --intervention-config configs/intervention_config.yaml \ - --output experiments/eval_intervention_v3.json + --output ../experiments/eval_intervention_v3.json ``` --- diff --git a/exp.md b/exp.md new file mode 100644 index 0000000..be4af4d --- /dev/null +++ b/exp.md @@ -0,0 +1,476 @@ +# CompanionGuard-RL — 可复用经验库 +**创建时间:2026-05-12** +**来源:Module B + Module C 训练调试过程中积累的真实踩坑记录** + +--- + +## 目录 + +1. [RTX 5090 / NCCL 通信问题](#1-rtx-5090--nccl-通信问题) +2. [HuggingFace Accelerate 多 GPU 分布式训练](#2-huggingface-accelerate-多-gpu-分布式训练) +3. [PyYAML 配置文件陷阱](#3-pyyaml-配置文件陷阱) +4. [服务器文件传输(无 rsync 环境)](#4-服务器文件传输无-rsync-环境) +5. [SSH 连接与持久会话管理](#5-ssh-连接与持久会话管理) +6. [Python 依赖与包缺失处理](#6-python-依赖与包缺失处理) +7. [分布式训练中的 Tensor 设备一致性](#7-分布式训练中的-tensor-设备一致性) +8. 
[DataLoader 与分布式训练的兼容](#8-dataloader-与分布式训练的兼容) +9. [离线服务器的模型加载](#9-离线服务器的模型加载) +10. [Shell 脚本跨平台问题(CRLF)](#10-shell-脚本跨平台问题crlf) +11. [Python 模块路径(PYTHONPATH)](#11-python-模块路径pythonpath) +12. [可选依赖的优雅处理(wandb 等)](#12-可选依赖的优雅处理wandb-等) + +--- + +## 1. RTX 5090 / NCCL 通信问题 + +### 症状 +``` +[rank0]: CUDA error: an illegal memory access was encountered +``` +在多 GPU 训练中,某一阶段(如 BC warmup 后进入 PPO,或切换数据集后)突发崩溃,单 GPU 无此问题。 + +### 根因 +RTX 5090 的 NVLink/P2P 拓扑与 NCCL 默认的共享内存(SHM)和 P2P 直连通信不兼容,导致跨 GPU 内存访问越界。 + +### 解决方案 +```bash +# 同时禁用 SHM 和 P2P,强制 NCCL 走 socket 通信 +export NCCL_SHM_DISABLE=1 +export NCCL_P2P_DISABLE=1 +``` + +**在 accelerate launch 前设置(推荐写法):** +```bash +CUDA_VISIBLE_DEVICES=0,1,2,3 NCCL_SHM_DISABLE=1 NCCL_P2P_DISABLE=1 \ + accelerate launch --num_processes=4 --mixed_precision=bf16 \ + scripts/train_xxx.py ... +``` + +### 排查顺序 +1. 先加 `NCCL_SHM_DISABLE=1` → 若仍崩溃 +2. 再加 `NCCL_P2P_DISABLE=1` → 通常可解 +3. 若仍有问题,尝试 `NCCL_DEBUG=INFO` 查看具体哪个集合通信操作出错 + +### 性能影响 +禁用 P2P 后 GPU 间通信走 PCIe,带宽略降,但对 batch_size=256 量级的训练影响不超过 10%。 + +--- + +## 2. HuggingFace Accelerate 多 GPU 分布式训练 + +### accelerate 路径问题 +服务器有多个 conda 环境时,直接敲 `accelerate` 可能用到错误环境的版本,或报 `command not found`。 + +**正确做法:用 conda 环境的完整路径** +```bash +# 查找正确路径 +find /opt/conda/envs -name "accelerate" -type f 2>/dev/null + +# 使用完整路径启动 +/opt/conda/envs/dlapo-py310-cu128/bin/accelerate launch ... +``` + +### PYTHONPATH 设置 +使用 `accelerate launch` 时,各 rank 子进程不继承当前 shell 的 `sys.path`,自定义 `src/` 包会报 `ModuleNotFoundError`。 + +```bash +PYTHONPATH=/path/to/project accelerate launch ... +``` + +### 推荐完整启动命令模板 +```bash +cd /path/to/project && LOG="experiments/train_$(date +%Y%m%d_%H%M%S).log" +PYTHONPATH=$(pwd) \ +CUDA_VISIBLE_DEVICES=0,1,2,3 \ +NCCL_SHM_DISABLE=1 \ +NCCL_P2P_DISABLE=1 \ +/opt/conda/envs/<env_name>/bin/accelerate launch \ + --num_processes=4 \ + --mixed_precision=bf16 \ + scripts/train_xxx.py \ + --config configs/xxx.yaml \ + > "$LOG" 2>&1 & +echo "PID: $! LOG: $LOG" +``` + +--- + +## 3. 
PyYAML 配置文件陷阱 + +### 症状 +``` +TypeError: '<=' not supported between instances of 'float' and 'str' +``` +明明写的是数字,PyYAML 却解析成字符串。 + +### 根因 +**PyYAML 按 YAML 1.1 规则识别浮点数:不带小数点的科学计数法(如 `1e-3`、`3e-4`)不会被识别为浮点数,而是解析为字符串。** + +该行为与 PyYAML 版本无关(5.x、6.x 均如此),配置文件中应统一避免这种写法。 + +### 解决方案 +将所有科学计数法改为小数形式: +```yaml +# ❌ 会被解析为字符串 +lr: 1e-3 +lr: 3e-4 + +# ✅ 正确写法 +lr: 0.001 +lr: 0.0003 +``` + +### 快速检查 +```python +import yaml +cfg = yaml.safe_load(open("config.yaml")) +print(type(cfg["lr"])) # 应为 <class 'float'>,若为 <class 'str'> 则有问题 +``` + +--- + +## 4. 服务器文件传输(无 rsync 环境) + +### 背景 +- 本地 Windows,目标 Linux GPU 服务器 +- 本地 WSL 无 `rsync`,PowerShell 无原生 rsync +- 文件较多,直接 `scp -r` 速度慢且不方便增量同步 + +### 推荐方案:tar 打包 + scp 单文件传输 + +**本地打包(PowerShell):** +```powershell +# 打包项目代码(排除数据集、checkpoint、缓存) +tar -czf sync_v4.tar.gz ` + -C "D:\Myresearch\CompanionGuard-RL\code\CompanionGuard-RL" ` + --exclude=".git" --exclude="__pycache__" ` + --exclude="checkpoints" --exclude="experiments" ` + src scripts configs requirements.txt + +# 使用 WSL sshpass 上传 +wsl -d Ubuntu-24.04 -- sshpass -p 'PASSWORD' scp -P PORT ` + /mnt/d/Myresearch/CompanionGuard-RL/sync_v4.tar.gz ` + root@HOST:/remote/path/ +``` + +**服务器解压(覆盖更新):** +```bash +cd /remote/project/dir +tar -xzf ../sync_v4.tar.gz --strip-components=0 +``` + +### Windows 路径转 WSL 路径 +``` +D:\Myresearch\... → /mnt/d/Myresearch/... +``` + +### sshpass 在 WSL 中使用 +```bash +# 安装 +sudo apt-get install sshpass + +# 密码直接传参(注意在脚本中要保护密码) +sshpass -p 'PASSWORD' ssh -p PORT user@host 'command' +sshpass -p 'PASSWORD' scp -P PORT local_file user@host:/remote/path/ +``` + +--- + +## 5. SSH 连接与持久会话管理 + +### nohup vs tmux +| 方式 | 优点 | 缺点 | +|------|------|------| +| `nohup ... &` | 简单 | 非交互式 SSH 中 nohup 进程在连接断开后有时会收到 SIGHUP 而退出;无法重新 attach 查看输出 | +| `tmux` | 会话持久,可 attach/detach,输出可随时查看 | 需要服务器安装 tmux | + +**推荐用 tmux:** +```bash +# 创建新会话并启动训练 +tmux new-session -d -s train 'PYTHONPATH=... accelerate launch ...' 
+ +# 查看所有会话 +tmux ls + +# 重新连接查看输出 +tmux attach -t train + +# 在会话中执行命令(不 attach) +tmux send-keys -t train 'tail -f experiments/latest.log' Enter +``` + +### SSH 连接被拒绝但 ping 通(kex_exchange_identification) +症状:TCP 端口开放,ping 通,但 SSH 在握手前被关闭: +``` +kex_exchange_identification: Connection closed by remote host +``` + +可能原因及处理: +1. **sshd 崩溃/重启中** → 通过网页控制台(VNC)执行 `systemctl restart sshd` +2. **MaxStartups 限制** → sshd_config 中 `MaxStartups 10:30:60` 可临时调高 +3. **fail2ban 封 IP** → `fail2ban-client status sshd`,`fail2ban-client set sshd unbanip <被封IP>` + +--- + +## 6. Python 依赖与包缺失处理 + +### 服务器无网络时安装包 + +**方法一:从已有 conda 环境复制** +```bash +# 查找其他环境中的包位置 +find /opt/conda/envs -name "gymnasium" -type d 2>/dev/null + +# 直接复制到目标环境 +cp -r /opt/conda/envs/other-env/lib/python3.10/site-packages/gymnasium \ + /opt/conda/envs/target-env/lib/python3.10/site-packages/ +``` + +**方法二:本地下载 wheel,scp 传输,离线安装** +```powershell +# 本地下载(PowerShell) +pip download -d D:\wheels --platform linux_x86_64 --python-version 310 ` + --only-binary=:all: gymnasium +# scp 传到服务器后: +pip install --no-index --find-links=/path/to/wheels gymnasium +``` + +### 检查包是否可用 +```bash +python -c "import gymnasium; print(gymnasium.__version__)" +python -c "import torch; print(torch.cuda.device_count())" +``` + +--- + +## 7. 
分布式训练中的 Tensor 设备一致性 + +### 症状 +``` +RuntimeError: No backend type associated with device type cpu +``` +在 `torch.distributed.broadcast()` 等集合通信操作中,传入了 CPU tensor。 + +### 根因 +**NCCL 后端只支持 CUDA tensor**,所有参与 `broadcast/all_reduce/gather` 的 tensor 必须在 GPU 上。 + +### 修复模式 +```python +dev = accelerator.device # 当前 rank 的 CUDA device + +# 广播 size +size_tensor = torch.tensor([data.shape[0]], dtype=torch.long, device=dev) +torch.distributed.broadcast(size_tensor, src=0) +n = size_tensor.item() + +# 广播数据 +if accelerator.is_main_process: + data = data.to(dev) +else: + data = torch.zeros(n, data_dim, device=dev) # 必须在 GPU 上 + +torch.distributed.broadcast(data, src=0) +# 使用后如需 CPU,再 .cpu() +``` + +### 关键原则 +- 集合通信(broadcast/all_reduce/scatter)→ **必须 CUDA tensor** +- DataLoader 输入 → **CPU tensor**(除非 `pin_memory=False`) +- 在 GPU 计算完成后,如需放入 CPU DataLoader,显式 `.cpu()` + +--- + +## 8. DataLoader 与分布式训练的兼容 + +### pin_memory 陷阱 +``` +RuntimeError: cannot pin torch.cuda.FloatTensor +``` +`DataLoader(pin_memory=True)` 要求数据必须是 **CPU tensor**,若传入已在 GPU 上的 tensor 则报错。 + +**修复:构建 TensorDataset 前先移到 CPU** +```python +# ❌ 若 obs_tensor 在 GPU 上会崩溃 +dataset = TensorDataset(obs_tensor, action_tensor) +loader = DataLoader(dataset, pin_memory=True) + +# ✅ 先 .cpu() +dataset = TensorDataset(obs_tensor.cpu(), action_tensor.cpu()) +loader = DataLoader(dataset, pin_memory=True) +``` + +### set_epoch 守卫 +``` +AttributeError: 'SequentialSampler' object has no attribute 'set_epoch' +``` +`set_epoch` 只有 `DistributedSampler` 有,`SequentialSampler` 没有。 + +**修复:加 hasattr 守卫** +```python +# ❌ 直接调用 +loader.sampler.set_epoch(epoch) + +# ✅ 安全写法 +if hasattr(loader.sampler, "set_epoch"): + loader.sampler.set_epoch(epoch) +``` + +--- + +## 9. 离线服务器的模型加载 + +### 症状 +``` +OSError: Can't load tokenizer for 'hfl/chinese-macbert-large'. 
+``` +服务器无法访问 HuggingFace,在线下载失败。 + +### 解决方案 + +**方法一:本地下载后 scp** +```powershell +# 本地下载 +python -c " +from huggingface_hub import snapshot_download +snapshot_download('hfl/chinese-macbert-large', local_dir='D:/models/macbert-large') +" +# 上传到服务器 +scp -P PORT -r D:\models\macbert-large root@HOST:/remote/models/macbert-large +``` + +**方法二:用国内镜像(若服务器能访问)** +```bash +HF_ENDPOINT=https://hf-mirror.com \ +python -c "from transformers import AutoTokenizer; AutoTokenizer.from_pretrained('hfl/chinese-macbert-large')" +``` + +**更新配置文件:** +```yaml +# 将 HuggingFace model id 改为本地绝对路径 +model_name: "/root/path/to/macbert-large" +``` + +--- + +## 10. Shell 脚本跨平台问题(CRLF) + +### 症状 +``` +/bin/bash^M: bad interpreter: No such file or directory +``` +或脚本执行后立即退出,没有任何错误信息。 + +### 根因 +Windows 上编辑/保存的 `.sh` 文件使用 CRLF(`\r\n`)换行,Linux 只认 LF(`\n`),`^M`(即 `\r`)被当作命令的一部分。 + +### 修复方案 + +**PowerShell 写入时强制 LF:** +```powershell +$content = @' +#!/bin/bash +cd /project/dir +ACCEL=/path/to/accelerate +nohup $ACCEL launch ... > log.txt 2>&1 & +echo "PID: $!" +'@ +# 关键:用 Replace 去掉 \r,用 UTF8NoBOM 编码 +[System.IO.File]::WriteAllText( + "D:\path\to\script.sh", + $content.Replace("`r`n", "`n"), + [System.Text.UTF8Encoding]::new($false) +) +``` + +**事后修复(在 Linux 服务器上):** +```bash +sed -i 's/\r//' script.sh +# 或 +dos2unix script.sh +``` + +**验证:** +```bash +file script.sh # 应显示 "ASCII text" 而非 "CRLF line terminators" +``` + +--- + +## 11. Python 模块路径(PYTHONPATH) + +### 症状 +``` +ModuleNotFoundError: No module named 'src' +``` +项目结构是 `src/models/`,但脚本中 `from src.models import ...` 找不到。 + +### 根因 +`accelerate launch` / `torchrun` 启动的子进程工作目录不一定是项目根目录,`sys.path` 不包含项目根目录。 + +### 解决方案 + +**方案一:启动时设置 PYTHONPATH(推荐)** +```bash +PYTHONPATH=/root/path/to/project accelerate launch scripts/train.py +``` + +**方案二:在脚本开头动态添加** +```python +import sys, os +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +``` + +**方案三:项目根目录加 `__init__.py`(不推荐,污染命名空间)** + +--- + +## 12. 
可选依赖的优雅处理(wandb 等) + +### 背景 +`wandb` 有复杂的依赖树(`sentry-sdk`、`setproctitle` 等),在受限环境中难以安装。 + +### 推荐模式:try/except 导入 + 功能开关 + +**导入部分:** +```python +try: + import wandb + WANDB_AVAILABLE = True +except ImportError: + wandb = None + WANDB_AVAILABLE = False +``` + +**使用部分:** +```python +if use_wandb and WANDB_AVAILABLE: + wandb.log({"loss": loss}) +elif use_wandb and not WANDB_AVAILABLE: + if step == 0: + print("[WARN] wandb not available, skipping logging") +``` + +**配置文件:** +```yaml +# 生产/受限环境 +use_wandb: false + +# 开发环境 +use_wandb: true +``` + +这样即使 wandb 未安装,训练也能正常运行,不会因为一行 `import wandb` 而整个崩溃。 + +--- + +## 附:本项目服务器快速参考 + +| 项目 | 值 | +|------|-----| +| SSH | `ssh -p 22657 root@connected.svt.net.cn` | +| 备用 SSH | `ssh -p 20083 root@10.82.3.180` | +| 密码 | (已脱敏:明文密码不应提交到仓库,请改用 SSH 密钥登录或从密码管理器获取) | +| conda 环境 | `dlapo-py310-cu128` | +| accelerate 路径 | `/opt/conda/envs/dlapo-py310-cu128/bin/accelerate` | +| 项目目录 | `/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/CompanionGuard-RL` | +| MacBERT 本地路径 | `/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large` | diff --git a/experiments/.gitkeep b/experiments/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/experiments/baseline_results.json b/experiments/baseline_results.json new file mode 100644 index 0000000..5bf69e9 --- /dev/null +++ b/experiments/baseline_results.json @@ -0,0 +1,277 @@ +{ + "meta": { + "test_file": "data\\processed\\CompanionRisk-Bench\\test.jsonl", + "lang": "zh", + "n": 486 + }, + "L0_all_risky": { + "name": "L0_all_risky", + "accuracy": 0.7346, + "precision": 0.7346, + "recall": 1.0, + "f1": 0.847, + "fnr": 0.0, + "tp": 357, + "fp": 129, + "fn": 0, + "tn": 0 + }, + "L0_all_safe": { + "name": "L0_all_safe", + "accuracy": 0.2654, + "precision": 0.0, + "recall": 0.0, + "f1": 0.0, + "fnr": 1.0, + "tp": 0, + "fp": 0, + "fn": 357, + "tn": 129 + }, + "L1a_keyword": { + "name": "L1a_keyword", + "accuracy": 0.4198, + "precision": 0.9518, + "recall": 0.2213, + "f1": 0.3591, + "fnr": 0.7787, + "tp": 
79, + "fp": 4, + "fn": 278, + "tn": 125 + }, + "L1a_keyword_cat_recall": { + "R1": { + "total": 31, + "detected": 3, + "recall": 0.0968, + "miss_rate": 0.9032 + }, + "R2": { + "total": 39, + "detected": 3, + "recall": 0.0769, + "miss_rate": 0.9231 + }, + "R3": { + "total": 39, + "detected": 10, + "recall": 0.2564, + "miss_rate": 0.7436 + }, + "R4": { + "total": 41, + "detected": 7, + "recall": 0.1707, + "miss_rate": 0.8293 + }, + "R5": { + "total": 36, + "detected": 8, + "recall": 0.2222, + "miss_rate": 0.7778 + }, + "R6": { + "total": 33, + "detected": 6, + "recall": 0.1818, + "miss_rate": 0.8182 + }, + "R7": { + "total": 32, + "detected": 4, + "recall": 0.125, + "miss_rate": 0.875 + }, + "R8": { + "total": 36, + "detected": 25, + "recall": 0.6944, + "miss_rate": 0.3056 + }, + "R9": { + "total": 33, + "detected": 3, + "recall": 0.0909, + "miss_rate": 0.9091 + }, + "R10": { + "total": 37, + "detected": 10, + "recall": 0.2703, + "miss_rate": 0.7297 + } + }, + "L1b_regex": { + "name": "L1b_regex", + "accuracy": 0.3025, + "precision": 1.0, + "recall": 0.0504, + "f1": 0.096, + "fnr": 0.9496, + "tp": 18, + "fp": 0, + "fn": 339, + "tn": 129 + }, + "L1b_regex_cat_recall": { + "R1": { + "total": 31, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 39, + "detected": 1, + "recall": 0.0256, + "miss_rate": 0.9744 + }, + "R3": { + "total": 39, + "detected": 9, + "recall": 0.2308, + "miss_rate": 0.7692 + }, + "R4": { + "total": 41, + "detected": 3, + "recall": 0.0732, + "miss_rate": 0.9268 + }, + "R5": { + "total": 36, + "detected": 1, + "recall": 0.0278, + "miss_rate": 0.9722 + }, + "R6": { + "total": 33, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 32, + "detected": 2, + "recall": 0.0625, + "miss_rate": 0.9375 + }, + "R8": { + "total": 36, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 33, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 37, + 
"detected": 2, + "recall": 0.0541, + "miss_rate": 0.9459 + } + }, + "L1c_combined": { + "name": "L1c_combined", + "accuracy": 0.4486, + "precision": 0.9588, + "recall": 0.2605, + "f1": 0.4097, + "fnr": 0.7395, + "tp": 93, + "fp": 4, + "fn": 264, + "tn": 125 + }, + "L1c_combined_cat_recall": { + "R1": { + "total": 31, + "detected": 3, + "recall": 0.0968, + "miss_rate": 0.9032 + }, + "R2": { + "total": 39, + "detected": 4, + "recall": 0.1026, + "miss_rate": 0.8974 + }, + "R3": { + "total": 39, + "detected": 16, + "recall": 0.4103, + "miss_rate": 0.5897 + }, + "R4": { + "total": 41, + "detected": 9, + "recall": 0.2195, + "miss_rate": 0.7805 + }, + "R5": { + "total": 36, + "detected": 9, + "recall": 0.25, + "miss_rate": 0.75 + }, + "R6": { + "total": 33, + "detected": 6, + "recall": 0.1818, + "miss_rate": 0.8182 + }, + "R7": { + "total": 32, + "detected": 6, + "recall": 0.1875, + "miss_rate": 0.8125 + }, + "R8": { + "total": 36, + "detected": 25, + "recall": 0.6944, + "miss_rate": 0.3056 + }, + "R9": { + "total": 33, + "detected": 3, + "recall": 0.0909, + "miss_rate": 0.9091 + }, + "R10": { + "total": 37, + "detected": 12, + "recall": 0.3243, + "miss_rate": 0.6757 + } + }, + "intervention_Rule(l≥3→REJECT)": { + "policy": "Rule(l≥3→REJECT)", + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.0, + "safety_ux_fscore": 1.0, + "crisis_precision": "N/A", + "action_distribution": { + "PASS": 233, + "REJECT": 253 + } + }, + "intervention_Threshold(level→action)": { + "policy": "Threshold(level→action)", + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.0, + "safety_ux_fscore": 1.0, + "crisis_precision": 1.0, + "action_distribution": { + "PASS": 129, + "WARN": 104, + "REWRITE": 193, + "CRISIS": 60 + } + } +} \ No newline at end of file diff --git a/experiments/eval_all.json b/experiments/eval_all_v2.json similarity index 87% rename from experiments/eval_all.json rename to experiments/eval_all_v2.json index aee8b09..6656369 100644 --- 
a/experiments/eval_all.json +++ b/experiments/eval_all_v2.json @@ -13,6 +13,13 @@ "false_negative_rate": 0.8267543859649122, "level_macro_f1": 0.09819557155678502, "level_weighted_f1": 0.08825982748460577, + "level_per_class_f1": [ + 0.2786885245901639, + 0.0, + 0.1951219512195122, + 0.017167381974248927, + 0.0 + ], "per_category_recall": { "R1": { "total": 67, @@ -83,6 +90,13 @@ "false_negative_rate": 0.9605263157894737, "level_macro_f1": 0.07132623033992896, "level_weighted_f1": 0.058213483946983315, + "level_per_class_f1": [ + 0.2607407407407407, + 0.0, + 0.0958904109589041, + 0.0, + 0.0 + ], "per_category_recall": { "R1": { "total": 67, @@ -153,6 +167,13 @@ "false_negative_rate": 0.7960526315789473, "level_macro_f1": 0.10979552475377227, "level_weighted_f1": 0.1000980341896042, + "level_per_class_f1": [ + 0.28523489932885904, + 0.0, + 0.2465753424657534, + 0.017167381974248927, + 0.0 + ], "per_category_recall": { "R1": { "total": 67, @@ -223,24 +244,31 @@ "false_negative_rate": 0.004385964912280715, "level_macro_f1": 0.5150467302191439, "level_weighted_f1": 0.5173056767699116, - "fine_macro_f1": 0.0, - "fine_weighted_f1": 0.0, - "fine_per_label_f1": [ - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 + "level_per_class_f1": [ + 0.632183908045977, + 0.5076923076923077, + 0.3861003861003861, + 0.5627705627705628, + 0.4864864864864865 ], + "fine_per_label_f1": [ + 0.6407766990291263, + 0.46464646464646464, + 0.734982332155477, + 0.0, + 0.7407407407407407, + 0.7676767676767676, + 0.6013986013986014, + 0.4864864864864865, + 0.6161616161616161, + 0.6875, + 0.24, + 0.38961038961038963, + 0.8641975308641975, + 0.7777777777777778 + ], + "fine_macro_f1": 0.5722825290391176, + "fine_weighted_f1": 0.622073826302884, "per_category_recall": { "R1": { "total": 67, diff --git a/experiments/eval_human_only.json b/experiments/eval_human_v2.json similarity index 90% rename from experiments/eval_human_only.json rename to 
experiments/eval_human_v2.json index 90fd350..822c346 100644 --- a/experiments/eval_human_only.json +++ b/experiments/eval_human_v2.json @@ -13,6 +13,13 @@ "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, + "level_per_class_f1": [ + 0.28776978417266186, + 0.0, + 0.0, + 0.0, + 0.0 + ], "per_category_recall": { "R1": { "total": 36, @@ -83,6 +90,13 @@ "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, + "level_per_class_f1": [ + 0.28776978417266186, + 0.0, + 0.0, + 0.0, + 0.0 + ], "per_category_recall": { "R1": { "total": 36, @@ -153,6 +167,13 @@ "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, + "level_per_class_f1": [ + 0.28776978417266186, + 0.0, + 0.0, + 0.0, + 0.0 + ], "per_category_recall": { "R1": { "total": 36, @@ -223,24 +244,31 @@ "false_negative_rate": 0.02020202020202022, "level_macro_f1": 0.3641541183069423, "level_weighted_f1": 0.4092843419457787, - "fine_macro_f1": 0.0, - "fine_weighted_f1": 0.0, - "fine_per_label_f1": [ + "level_per_class_f1": [ + 0.9302325581395349, 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0, - 0.0 + 0.16326530612244897, + 0.36363636363636365, + 0.36363636363636365 ], + "fine_per_label_f1": [ + 0.3508771929824561, + 0.0, + 0.64, + 0.0, + 0.0, + 0.0, + 0.0, + 0.2222222222222222, + 0.375, + 0.8857142857142857, + 0.0, + 0.0, + 0.5, + 0.2857142857142857 + ], + "fine_macro_f1": 0.2328234276166607, + "fine_weighted_f1": 0.4082668160299739, "per_category_recall": { "R1": { "total": 36, diff --git a/experiments/eval_intervention_v1.json b/experiments/eval_intervention_v1.json new file mode 100644 index 0000000..4bd2ef7 --- /dev/null +++ b/experiments/eval_intervention_v1.json @@ -0,0 +1,376 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + 
"n_total": 1486, + "n_filtered": 1486, + "n_risky": 1039 + }, + "L1a_keyword": { + "binary_f1": 0.26436781609195403, + "high_risk_recall": 0.15495668912415783, + "high_risk_precision": 0.8994413407821229, + "false_negative_rate": 0.8450433108758422, + "level_macro_f1": 0.10427720349098286, + "level_weighted_f1": 0.09799538109505529, + "level_per_class_f1": [ + 0.2979274611398964, + 0.0, + 0.1934156378600823, + 0.030042918454935622, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 16, + "recall": 0.1127, + "miss_rate": 0.8873 + }, + "R3": { + "total": 95, + "detected": 17, + "recall": 0.1789, + "miss_rate": 0.8211 + }, + "R4": { + "total": 116, + "detected": 22, + "recall": 0.1897, + "miss_rate": 0.8103 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 10, + "recall": 0.137, + "miss_rate": 0.863 + } + } + }, + "L1b_regex": { + "binary_f1": 0.06697674418604652, + "high_risk_recall": 0.03464870067372473, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9653512993262753, + "level_macro_f1": 0.07297879241072718, + "level_weighted_f1": 0.06312377515343655, + "level_per_class_f1": [ + 0.2809721398933017, + 0.0, + 0.07954545454545454, + 0.00437636761487965, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 142, + "detected": 1, + "recall": 0.007, + "miss_rate": 0.993 + }, + "R3": { + "total": 95, + "detected": 19, + 
"recall": 0.2, + "miss_rate": 0.8 + }, + "R4": { + "total": 116, + "detected": 9, + "recall": 0.0776, + "miss_rate": 0.9224 + }, + "R5": { + "total": 64, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 97, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 73, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 152, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 73, + "detected": 4, + "recall": 0.0548, + "miss_rate": 0.9452 + } + } + }, + "L1c_combined": { + "binary_f1": 0.3060897435897436, + "high_risk_recall": 0.18383060635226178, + "high_risk_precision": 0.9138755980861244, + "false_negative_rate": 0.8161693936477382, + "level_macro_f1": 0.11189027535274536, + "level_weighted_f1": 0.10619241328971442, + "level_per_class_f1": [ + 0.3038309114927345, + 0.0, + 0.22135922330097088, + 0.034261241970021415, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 17, + "recall": 0.1197, + "miss_rate": 0.8803 + }, + "R3": { + "total": 95, + "detected": 32, + "recall": 0.3368, + "miss_rate": 0.6632 + }, + "R4": { + "total": 116, + "detected": 29, + "recall": 0.25, + "miss_rate": 0.75 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 14, + "recall": 0.1918, + "miss_rate": 0.8082 + } + } + }, + "ours_detection": { + 
"binary_f1": 0.9995189995189995, + "high_risk_recall": 1.0, + "high_risk_precision": 0.9990384615384615, + "false_negative_rate": 0.0, + "level_macro_f1": 0.5495554176357882, + "level_weighted_f1": 0.5584578220374772, + "level_per_class_f1": [ + 0.37540453074433655, + 0.6351931330472103, + 0.46393762183235865, + 0.6400759734093068, + 0.6331658291457286 + ], + "fine_per_label_f1": [ + 0.6844262295081968, + 0.46567164179104475, + 0.697986577181208, + 0.40233236151603496, + 0.585, + 0.3559322033898305, + 0.38322211630123926, + 0.3374578177727784, + 0.531810766721044, + 0.39436619718309857, + 0.2691029900332226, + 0.4410480349344978, + 0.32142857142857145, + 0.615916955017301 + ], + "fine_macro_f1": 0.46326446162700485, + "fine_weighted_f1": 0.4915026862223374, + "per_category_recall": { + "R1": { + "total": 136, + "detected": 136, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R2": { + "total": 142, + "detected": 142, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 95, + "detected": 95, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 116, + "detected": 116, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 97, + "detected": 97, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 152, + "detected": 152, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + }, + "baseline_rule": { + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.5612382234185733, + 0.0, + 0.0, + 0.4387617765814266, + 0.0 + ], + "crisis_precision": NaN, + "safety_ux_fscore": 1.0 + }, + "baseline_threshold": { + "intervention_recall_high": 1.0, + 
"over_intervention_rate": 0.0, + "action_distribution": [ + 0.34791386271870794, + 0.2133243606998654, + 0.30686406460296095, + 0.0, + 0.13189771197846567 + ], + "crisis_precision": 1.0, + "safety_ux_fscore": 1.0 + }, + "ours_intervention": { + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.3001345895020188, + 0.0033647375504710633, + 0.5834454912516823, + 0.0, + 0.11305518169582772 + ], + "crisis_precision": 0.47619047619047616, + "safety_ux_fscore": 1.0 + } +} \ No newline at end of file diff --git a/experiments/eval_intervention_v2.json b/experiments/eval_intervention_v2.json new file mode 100644 index 0000000..1c9c789 --- /dev/null +++ b/experiments/eval_intervention_v2.json @@ -0,0 +1,533 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + "n_total": 1486, + "n_filtered": 1486, + "n_risky": 1039 + }, + "L1a_keyword": { + "binary_f1": 0.26436781609195403, + "high_risk_recall": 0.15495668912415783, + "high_risk_precision": 0.8994413407821229, + "false_negative_rate": 0.8450433108758422, + "level_macro_f1": 0.10427720349098286, + "level_weighted_f1": 0.09799538109505529, + "level_per_class_f1": [ + 0.2979274611398964, + 0.0, + 0.1934156378600823, + 0.030042918454935622, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 16, + "recall": 0.1127, + "miss_rate": 0.8873 + }, + "R3": { + "total": 95, + "detected": 17, + "recall": 0.1789, + "miss_rate": 0.8211 + }, + "R4": { + "total": 116, + "detected": 22, + "recall": 0.1897, + "miss_rate": 0.8103 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + 
"R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 10, + "recall": 0.137, + "miss_rate": 0.863 + } + } + }, + "L1b_regex": { + "binary_f1": 0.06697674418604652, + "high_risk_recall": 0.03464870067372473, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9653512993262753, + "level_macro_f1": 0.07297879241072718, + "level_weighted_f1": 0.06312377515343655, + "level_per_class_f1": [ + 0.2809721398933017, + 0.0, + 0.07954545454545454, + 0.00437636761487965, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 142, + "detected": 1, + "recall": 0.007, + "miss_rate": 0.993 + }, + "R3": { + "total": 95, + "detected": 19, + "recall": 0.2, + "miss_rate": 0.8 + }, + "R4": { + "total": 116, + "detected": 9, + "recall": 0.0776, + "miss_rate": 0.9224 + }, + "R5": { + "total": 64, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 97, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 73, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 152, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 73, + "detected": 4, + "recall": 0.0548, + "miss_rate": 0.9452 + } + } + }, + "L1c_combined": { + "binary_f1": 0.3060897435897436, + "high_risk_recall": 0.18383060635226178, + "high_risk_precision": 0.9138755980861244, + "false_negative_rate": 0.8161693936477382, + "level_macro_f1": 0.11189027535274536, + "level_weighted_f1": 0.10619241328971442, + "level_per_class_f1": [ + 0.3038309114927345, + 0.0, + 0.22135922330097088, + 0.034261241970021415, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + 
"recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 17, + "recall": 0.1197, + "miss_rate": 0.8803 + }, + "R3": { + "total": 95, + "detected": 32, + "recall": 0.3368, + "miss_rate": 0.6632 + }, + "R4": { + "total": 116, + "detected": 29, + "recall": 0.25, + "miss_rate": 0.75 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 14, + "recall": 0.1918, + "miss_rate": 0.8082 + } + } + }, + "ours_detection": { + "binary_f1": 0.9995189995189995, + "high_risk_recall": 1.0, + "high_risk_precision": 0.9990384615384615, + "false_negative_rate": 0.0, + "level_macro_f1": 0.5495554176357882, + "level_weighted_f1": 0.5584578220374772, + "level_per_class_f1": [ + 0.37540453074433655, + 0.6351931330472103, + 0.46393762183235865, + 0.6400759734093068, + 0.6331658291457286 + ], + "fine_per_label_f1": [ + 0.6844262295081968, + 0.46567164179104475, + 0.697986577181208, + 0.40233236151603496, + 0.585, + 0.3559322033898305, + 0.38322211630123926, + 0.3374578177727784, + 0.531810766721044, + 0.39436619718309857, + 0.2691029900332226, + 0.4410480349344978, + 0.32142857142857145, + 0.615916955017301 + ], + "fine_macro_f1": 0.46326446162700485, + "fine_weighted_f1": 0.4915026862223374, + "per_category_recall": { + "R1": { + "total": 136, + "detected": 136, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R2": { + "total": 142, + "detected": 142, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 95, + "detected": 95, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 116, + "detected": 116, + "recall": 1.0, + 
"miss_rate": 0.0 + }, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 97, + "detected": 97, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 152, + "detected": 152, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + }, + "baseline_rule": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.46231493943472407, + 0.0, + 0.0, + 0.5376850605652759, + 0.0 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.9178571428571428, + 0.0, + 0.0, + 0.08214285714285714, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.4195583596214511, + 0.0, + 0.0, + 0.580441640378549, + 0.0 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.11403508771929824, + 0.0, + 0.0, + 0.8859649122807017, + 0.0 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.04081632653061224, + 0.0, + 0.0, + 0.9591836734693877, + 0.0 + ] + } + }, + "crisis_precision": NaN, + "safety_ux_fscore": 0.9517684887459806 + }, + "baseline_threshold": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.3304172274562584, + 0.13189771197846567, + 0.40174966352624497, + 0.0, + 0.13593539703903096 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.8428571428571429, + 0.075, + 0.08214285714285714, + 0.0, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.04416403785488959, + 
0.3753943217665615, + 0.5520504731861199, + 0.0, + 0.028391167192429023 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.008771929824561403, + 0.10526315789473684, + 0.7390350877192983, + 0.0, + 0.14692982456140352 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.04081632653061224, + 0.3163265306122449, + 0.0, + 0.6428571428571429 + ] + } + }, + "crisis_precision": 0.6237623762376238, + "safety_ux_fscore": 0.9517684887459806 + }, + "ours_intervention": { + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.29878869448183043, + 0.0033647375504710633, + 0.5847913862718708, + 0.0, + 0.11305518169582772 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 0.9831223628691983, + 0.016877637130801686, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.7535714285714286, + 0.0035714285714285713, + 0.21785714285714286, + 0.0, + 0.025 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.0, + 0.0, + 0.9148264984227129, + 0.0, + 0.08517350157728706 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.0, + 0.0, + 0.8793859649122807, + 0.0, + 0.1206140350877193 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.0, + 0.5969387755102041, + 0.0, + 0.4030612244897959 + ] + } + }, + "action_accuracy": 0.5868102288021534, + "crisis_precision": 0.47023809523809523, + "safety_ux_fscore": 1.0 + } +} \ No newline at end of file diff --git a/experiments/eval_intervention_v3.json b/experiments/eval_intervention_v3.json new file mode 100644 index 0000000..26469a5 --- /dev/null +++ b/experiments/eval_intervention_v3.json @@ -0,0 +1,533 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + "n_total": 1486, + "n_filtered": 1486, + "n_risky": 1039 + }, + "L1a_keyword": { + "binary_f1": 0.26436781609195403, + "high_risk_recall": 0.15495668912415783, + 
"high_risk_precision": 0.8994413407821229, + "false_negative_rate": 0.8450433108758422, + "level_macro_f1": 0.10427720349098286, + "level_weighted_f1": 0.09799538109505529, + "level_per_class_f1": [ + 0.2979274611398964, + 0.0, + 0.1934156378600823, + 0.030042918454935622, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 16, + "recall": 0.1127, + "miss_rate": 0.8873 + }, + "R3": { + "total": 95, + "detected": 17, + "recall": 0.1789, + "miss_rate": 0.8211 + }, + "R4": { + "total": 116, + "detected": 22, + "recall": 0.1897, + "miss_rate": 0.8103 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 10, + "recall": 0.137, + "miss_rate": 0.863 + } + } + }, + "L1b_regex": { + "binary_f1": 0.06697674418604652, + "high_risk_recall": 0.03464870067372473, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9653512993262753, + "level_macro_f1": 0.07297879241072718, + "level_weighted_f1": 0.06312377515343655, + "level_per_class_f1": [ + 0.2809721398933017, + 0.0, + 0.07954545454545454, + 0.00437636761487965, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 142, + "detected": 1, + "recall": 0.007, + "miss_rate": 0.993 + }, + "R3": { + "total": 95, + "detected": 19, + "recall": 0.2, + "miss_rate": 0.8 + }, + "R4": { + "total": 116, + "detected": 9, + "recall": 0.0776, + "miss_rate": 0.9224 + }, + "R5": { + "total": 64, + 
"detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 97, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 73, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 152, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 73, + "detected": 4, + "recall": 0.0548, + "miss_rate": 0.9452 + } + } + }, + "L1c_combined": { + "binary_f1": 0.3060897435897436, + "high_risk_recall": 0.18383060635226178, + "high_risk_precision": 0.9138755980861244, + "false_negative_rate": 0.8161693936477382, + "level_macro_f1": 0.11189027535274536, + "level_weighted_f1": 0.10619241328971442, + "level_per_class_f1": [ + 0.3038309114927345, + 0.0, + 0.22135922330097088, + 0.034261241970021415, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 17, + "recall": 0.1197, + "miss_rate": 0.8803 + }, + "R3": { + "total": 95, + "detected": 32, + "recall": 0.3368, + "miss_rate": 0.6632 + }, + "R4": { + "total": 116, + "detected": 29, + "recall": 0.25, + "miss_rate": 0.75 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 14, + "recall": 0.1918, + "miss_rate": 0.8082 + } + } + }, + "ours_detection": { + "binary_f1": 0.9995189995189995, + "high_risk_recall": 1.0, + "high_risk_precision": 0.9990384615384615, + "false_negative_rate": 0.0, + "level_macro_f1": 
0.5495554176357882, + "level_weighted_f1": 0.5584578220374772, + "level_per_class_f1": [ + 0.37540453074433655, + 0.6351931330472103, + 0.46393762183235865, + 0.6400759734093068, + 0.6331658291457286 + ], + "fine_per_label_f1": [ + 0.6844262295081968, + 0.46567164179104475, + 0.697986577181208, + 0.40233236151603496, + 0.585, + 0.3559322033898305, + 0.38322211630123926, + 0.3374578177727784, + 0.531810766721044, + 0.39436619718309857, + 0.2691029900332226, + 0.4410480349344978, + 0.32142857142857145, + 0.615916955017301 + ], + "fine_macro_f1": 0.46326446162700485, + "fine_weighted_f1": 0.4915026862223374, + "per_category_recall": { + "R1": { + "total": 136, + "detected": 136, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R2": { + "total": 142, + "detected": 142, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 95, + "detected": 95, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 116, + "detected": 116, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 97, + "detected": 97, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 152, + "detected": 152, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + }, + "baseline_rule": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.46231493943472407, + 0.0, + 0.0, + 0.5376850605652759, + 0.0 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.9178571428571428, + 0.0, + 0.0, + 0.08214285714285714, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + 
"action_dist": [ + 0.4195583596214511, + 0.0, + 0.0, + 0.580441640378549, + 0.0 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.11403508771929824, + 0.0, + 0.0, + 0.8859649122807017, + 0.0 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.04081632653061224, + 0.0, + 0.0, + 0.9591836734693877, + 0.0 + ] + } + }, + "crisis_precision": NaN, + "safety_ux_fscore": 0.9517684887459806 + }, + "baseline_threshold": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.3304172274562584, + 0.13189771197846567, + 0.40174966352624497, + 0.0, + 0.13593539703903096 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.8428571428571429, + 0.075, + 0.08214285714285714, + 0.0, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.04416403785488959, + 0.3753943217665615, + 0.5520504731861199, + 0.0, + 0.028391167192429023 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.008771929824561403, + 0.10526315789473684, + 0.7390350877192983, + 0.0, + 0.14692982456140352 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.04081632653061224, + 0.3163265306122449, + 0.0, + 0.6428571428571429 + ] + } + }, + "crisis_precision": 0.6237623762376238, + "safety_ux_fscore": 0.9517684887459806 + }, + "ours_intervention": { + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.004219409282700422, + "action_distribution": [ + 0.29475100942126514, + 0.0033647375504710633, + 0.5868102288021534, + 0.0, + 0.11507402422611036 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 0.9873417721518988, + 0.008438818565400843, + 0.004219409282700422, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.7285714285714285, + 0.010714285714285714, + 0.22857142857142856, + 0.0, + 0.03214285714285714 + ] + }, + 
"L2_Moderate": { + "n": 317, + "action_dist": [ + 0.0, + 0.0, + 0.9022082018927445, + 0.0, + 0.09779179810725552 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.0, + 0.0, + 0.8706140350877193, + 0.0, + 0.12938596491228072 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.0, + 0.6326530612244898, + 0.0, + 0.3673469387755102 + ] + } + }, + "action_accuracy": 0.5753701211305519, + "crisis_precision": 0.42105263157894735, + "safety_ux_fscore": 0.9978858350951374 + } +} \ No newline at end of file diff --git a/experiments/eval_intervention_v4.json b/experiments/eval_intervention_v4.json new file mode 100644 index 0000000..26469a5 --- /dev/null +++ b/experiments/eval_intervention_v4.json @@ -0,0 +1,533 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + "n_total": 1486, + "n_filtered": 1486, + "n_risky": 1039 + }, + "L1a_keyword": { + "binary_f1": 0.26436781609195403, + "high_risk_recall": 0.15495668912415783, + "high_risk_precision": 0.8994413407821229, + "false_negative_rate": 0.8450433108758422, + "level_macro_f1": 0.10427720349098286, + "level_weighted_f1": 0.09799538109505529, + "level_per_class_f1": [ + 0.2979274611398964, + 0.0, + 0.1934156378600823, + 0.030042918454935622, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + "miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 16, + "recall": 0.1127, + "miss_rate": 0.8873 + }, + "R3": { + "total": 95, + "detected": 17, + "recall": 0.1789, + "miss_rate": 0.8211 + }, + "R4": { + "total": 116, + "detected": 22, + "recall": 0.1897, + "miss_rate": 0.8103 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + "R8": { + "total": 
73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 10, + "recall": 0.137, + "miss_rate": 0.863 + } + } + }, + "L1b_regex": { + "binary_f1": 0.06697674418604652, + "high_risk_recall": 0.03464870067372473, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9653512993262753, + "level_macro_f1": 0.07297879241072718, + "level_weighted_f1": 0.06312377515343655, + "level_per_class_f1": [ + 0.2809721398933017, + 0.0, + 0.07954545454545454, + 0.00437636761487965, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 142, + "detected": 1, + "recall": 0.007, + "miss_rate": 0.993 + }, + "R3": { + "total": 95, + "detected": 19, + "recall": 0.2, + "miss_rate": 0.8 + }, + "R4": { + "total": 116, + "detected": 9, + "recall": 0.0776, + "miss_rate": 0.9224 + }, + "R5": { + "total": 64, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 97, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 73, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 152, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 73, + "detected": 4, + "recall": 0.0548, + "miss_rate": 0.9452 + } + } + }, + "L1c_combined": { + "binary_f1": 0.3060897435897436, + "high_risk_recall": 0.18383060635226178, + "high_risk_precision": 0.9138755980861244, + "false_negative_rate": 0.8161693936477382, + "level_macro_f1": 0.11189027535274536, + "level_weighted_f1": 0.10619241328971442, + "level_per_class_f1": [ + 0.3038309114927345, + 0.0, + 0.22135922330097088, + 0.034261241970021415, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 136, + "detected": 10, + "recall": 0.0735, + 
"miss_rate": 0.9265 + }, + "R2": { + "total": 142, + "detected": 17, + "recall": 0.1197, + "miss_rate": 0.8803 + }, + "R3": { + "total": 95, + "detected": 32, + "recall": 0.3368, + "miss_rate": 0.6632 + }, + "R4": { + "total": 116, + "detected": 29, + "recall": 0.25, + "miss_rate": 0.75 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 97, + "detected": 11, + "recall": 0.1134, + "miss_rate": 0.8866 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 73, + "detected": 49, + "recall": 0.6712, + "miss_rate": 0.3288 + }, + "R9": { + "total": 152, + "detected": 11, + "recall": 0.0724, + "miss_rate": 0.9276 + }, + "R10": { + "total": 73, + "detected": 14, + "recall": 0.1918, + "miss_rate": 0.8082 + } + } + }, + "ours_detection": { + "binary_f1": 0.9995189995189995, + "high_risk_recall": 1.0, + "high_risk_precision": 0.9990384615384615, + "false_negative_rate": 0.0, + "level_macro_f1": 0.5495554176357882, + "level_weighted_f1": 0.5584578220374772, + "level_per_class_f1": [ + 0.37540453074433655, + 0.6351931330472103, + 0.46393762183235865, + 0.6400759734093068, + 0.6331658291457286 + ], + "fine_per_label_f1": [ + 0.6844262295081968, + 0.46567164179104475, + 0.697986577181208, + 0.40233236151603496, + 0.585, + 0.3559322033898305, + 0.38322211630123926, + 0.3374578177727784, + 0.531810766721044, + 0.39436619718309857, + 0.2691029900332226, + 0.4410480349344978, + 0.32142857142857145, + 0.615916955017301 + ], + "fine_macro_f1": 0.46326446162700485, + "fine_weighted_f1": 0.4915026862223374, + "per_category_recall": { + "R1": { + "total": 136, + "detected": 136, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R2": { + "total": 142, + "detected": 142, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 95, + "detected": 95, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 116, + "detected": 116, + "recall": 1.0, + "miss_rate": 0.0 + 
}, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 97, + "detected": 97, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 152, + "detected": 152, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 73, + "detected": 73, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + }, + "baseline_rule": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.46231493943472407, + 0.0, + 0.0, + 0.5376850605652759, + 0.0 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.9178571428571428, + 0.0, + 0.0, + 0.08214285714285714, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.4195583596214511, + 0.0, + 0.0, + 0.580441640378549, + 0.0 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.11403508771929824, + 0.0, + 0.0, + 0.8859649122807017, + 0.0 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.04081632653061224, + 0.0, + 0.0, + 0.9591836734693877, + 0.0 + ] + } + }, + "crisis_precision": NaN, + "safety_ux_fscore": 0.9517684887459806 + }, + "baseline_threshold": { + "intervention_recall_high": 0.9079754601226994, + "over_intervention_rate": 0.0, + "action_distribution": [ + 0.3304172274562584, + 0.13189771197846567, + 0.40174966352624497, + 0.0, + 0.13593539703903096 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 1.0, + 0.0, + 0.0, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.8428571428571429, + 0.075, + 0.08214285714285714, + 0.0, + 0.0 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.04416403785488959, + 0.3753943217665615, + 
0.5520504731861199, + 0.0, + 0.028391167192429023 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.008771929824561403, + 0.10526315789473684, + 0.7390350877192983, + 0.0, + 0.14692982456140352 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.04081632653061224, + 0.3163265306122449, + 0.0, + 0.6428571428571429 + ] + } + }, + "crisis_precision": 0.6237623762376238, + "safety_ux_fscore": 0.9517684887459806 + }, + "ours_intervention": { + "intervention_recall_high": 1.0, + "over_intervention_rate": 0.004219409282700422, + "action_distribution": [ + 0.29475100942126514, + 0.0033647375504710633, + 0.5868102288021534, + 0.0, + 0.11507402422611036 + ], + "per_level_action_dist": { + "L0_Safe": { + "n": 237, + "action_dist": [ + 0.9873417721518988, + 0.008438818565400843, + 0.004219409282700422, + 0.0, + 0.0 + ] + }, + "L1_Mild": { + "n": 280, + "action_dist": [ + 0.7285714285714285, + 0.010714285714285714, + 0.22857142857142856, + 0.0, + 0.03214285714285714 + ] + }, + "L2_Moderate": { + "n": 317, + "action_dist": [ + 0.0, + 0.0, + 0.9022082018927445, + 0.0, + 0.09779179810725552 + ] + }, + "L3_High": { + "n": 456, + "action_dist": [ + 0.0, + 0.0, + 0.8706140350877193, + 0.0, + 0.12938596491228072 + ] + }, + "L4_Critical": { + "n": 196, + "action_dist": [ + 0.0, + 0.0, + 0.6326530612244898, + 0.0, + 0.3673469387755102 + ] + } + }, + "action_accuracy": 0.5753701211305519, + "crisis_precision": 0.42105263157894735, + "safety_ux_fscore": 0.9978858350951374 + } +} \ No newline at end of file diff --git a/experiments/eval_results.json b/experiments/eval_results.json new file mode 100644 index 0000000..d2ecf57 --- /dev/null +++ b/experiments/eval_results.json @@ -0,0 +1,337 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + "n_total": 1324, + "n_filtered": 1324, + "n_risky": 877 + }, + "L1a_keyword": { + "binary_f1": 0.27751196172248804, + "high_risk_recall": 
0.1653363740022805, + "high_risk_precision": 0.8630952380952381, + "false_negative_rate": 0.8346636259977195, + "level_macro_f1": 0.11264512835143245, + "level_weighted_f1": 0.10448970574896717, + "level_per_class_f1": [ + 0.3254480286738351, + 0.0, + 0.20865139949109415, + 0.02912621359223301, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 8, + "recall": 0.065, + "miss_rate": 0.935 + }, + "R2": { + "total": 96, + "detected": 14, + "recall": 0.1458, + "miss_rate": 0.8542 + }, + "R3": { + "total": 77, + "detected": 13, + "recall": 0.1688, + "miss_rate": 0.8312 + }, + "R4": { + "total": 81, + "detected": 18, + "recall": 0.2222, + "miss_rate": 0.7778 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 105, + "detected": 11, + "recall": 0.1048, + "miss_rate": 0.8952 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + "R8": { + "total": 75, + "detected": 49, + "recall": 0.6533, + "miss_rate": 0.3467 + }, + "R9": { + "total": 91, + "detected": 7, + "recall": 0.0769, + "miss_rate": 0.9231 + }, + "R10": { + "total": 74, + "detected": 10, + "recall": 0.1351, + "miss_rate": 0.8649 + } + } + }, + "L1b_regex": { + "binary_f1": 0.07886089813800658, + "high_risk_recall": 0.04104903078677309, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9589509692132269, + "level_macro_f1": 0.08441436068877664, + "level_weighted_f1": 0.07640981579648991, + "level_per_class_f1": [ + 0.31303208906352326, + 0.0, + 0.10408921933085502, + 0.0049504950495049506, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 96, + "detected": 1, + "recall": 0.0104, + "miss_rate": 0.9896 + }, + "R3": { + "total": 77, + "detected": 19, + "recall": 0.2468, + "miss_rate": 0.7532 + }, + "R4": { + "total": 81, + "detected": 9, + "recall": 0.1111, + "miss_rate": 0.8889 + }, + "R5": { 
+ "total": 64, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 105, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 75, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 91, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 74, + "detected": 4, + "recall": 0.0541, + "miss_rate": 0.9459 + } + } + }, + "L1c_combined": { + "binary_f1": 0.32558139534883723, + "high_risk_recall": 0.19954389965792474, + "high_risk_precision": 0.8838383838383839, + "false_negative_rate": 0.8004561003420753, + "level_macro_f1": 0.12164103976458382, + "level_weighted_f1": 0.11307540313209122, + "level_per_class_f1": [ + 0.3326007326007326, + 0.0, + 0.24170616113744076, + 0.03389830508474576, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 8, + "recall": 0.065, + "miss_rate": 0.935 + }, + "R2": { + "total": 96, + "detected": 15, + "recall": 0.1562, + "miss_rate": 0.8438 + }, + "R3": { + "total": 77, + "detected": 28, + "recall": 0.3636, + "miss_rate": 0.6364 + }, + "R4": { + "total": 81, + "detected": 25, + "recall": 0.3086, + "miss_rate": 0.6914 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 105, + "detected": 11, + "recall": 0.1048, + "miss_rate": 0.8952 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 75, + "detected": 49, + "recall": 0.6533, + "miss_rate": 0.3467 + }, + "R9": { + "total": 91, + "detected": 7, + "recall": 0.0769, + "miss_rate": 0.9231 + }, + "R10": { + "total": 74, + "detected": 14, + "recall": 0.1892, + "miss_rate": 0.8108 + } + } + }, + "ours_detection": { + "binary_f1": 0.9988597491448119, + "high_risk_recall": 0.9988597491448119, + "high_risk_precision": 0.9988597491448119, + "false_negative_rate": 
0.0011402508551880963, + "level_macro_f1": 0.4974096618676628, + "level_weighted_f1": 0.5113791757593992, + "level_per_class_f1": [ + 0.67601246105919, + 0.17391304347826086, + 0.45622119815668205, + 0.6204620462046204, + 0.5604395604395604 + ], + "fine_per_label_f1": [ + 0.7047244094488189, + 0.40274599542334094, + 0.6269035532994924, + 0.4339622641509434, + 0.6253521126760564, + 0.2874617737003058, + 0.27901785714285715, + 0.2389937106918239, + 0.6086956521739131, + 0.5878136200716846, + 0.350253807106599, + 0.4444444444444444, + 0.3734015345268542, + 0.6942148760330579 + ], + "fine_macro_f1": 0.4755704007778709, + "fine_weighted_f1": 0.5078364322693886, + "per_category_recall": { + "R1": { + "total": 123, + "detected": 122, + "recall": 0.9919, + "miss_rate": 0.0081 + }, + "R2": { + "total": 96, + "detected": 96, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 77, + "detected": 77, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 81, + "detected": 81, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 105, + "detected": 105, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 75, + "detected": 75, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 74, + "detected": 74, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + } +} \ No newline at end of file diff --git a/experiments/eval_v3_results.json b/experiments/eval_v3_results.json new file mode 100644 index 0000000..d2ecf57 --- /dev/null +++ b/experiments/eval_v3_results.json @@ -0,0 +1,337 @@ +{ + "meta": { + "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", + "source_filter": "all", + "label_filter": "all", + "n_total": 1324, + "n_filtered": 1324, + "n_risky": 877 + }, + 
"L1a_keyword": { + "binary_f1": 0.27751196172248804, + "high_risk_recall": 0.1653363740022805, + "high_risk_precision": 0.8630952380952381, + "false_negative_rate": 0.8346636259977195, + "level_macro_f1": 0.11264512835143245, + "level_weighted_f1": 0.10448970574896717, + "level_per_class_f1": [ + 0.3254480286738351, + 0.0, + 0.20865139949109415, + 0.02912621359223301, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 8, + "recall": 0.065, + "miss_rate": 0.935 + }, + "R2": { + "total": 96, + "detected": 14, + "recall": 0.1458, + "miss_rate": 0.8542 + }, + "R3": { + "total": 77, + "detected": 13, + "recall": 0.1688, + "miss_rate": 0.8312 + }, + "R4": { + "total": 81, + "detected": 18, + "recall": 0.2222, + "miss_rate": 0.7778 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 105, + "detected": 11, + "recall": 0.1048, + "miss_rate": 0.8952 + }, + "R7": { + "total": 91, + "detected": 6, + "recall": 0.0659, + "miss_rate": 0.9341 + }, + "R8": { + "total": 75, + "detected": 49, + "recall": 0.6533, + "miss_rate": 0.3467 + }, + "R9": { + "total": 91, + "detected": 7, + "recall": 0.0769, + "miss_rate": 0.9231 + }, + "R10": { + "total": 74, + "detected": 10, + "recall": 0.1351, + "miss_rate": 0.8649 + } + } + }, + "L1b_regex": { + "binary_f1": 0.07886089813800658, + "high_risk_recall": 0.04104903078677309, + "high_risk_precision": 1.0, + "false_negative_rate": 0.9589509692132269, + "level_macro_f1": 0.08441436068877664, + "level_weighted_f1": 0.07640981579648991, + "level_per_class_f1": [ + 0.31303208906352326, + 0.0, + 0.10408921933085502, + 0.0049504950495049506, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R2": { + "total": 96, + "detected": 1, + "recall": 0.0104, + "miss_rate": 0.9896 + }, + "R3": { + "total": 77, + "detected": 19, + "recall": 0.2468, + "miss_rate": 0.7532 + }, + "R4": { + "total": 81, 
+ "detected": 9, + "recall": 0.1111, + "miss_rate": 0.8889 + }, + "R5": { + "total": 64, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R6": { + "total": 105, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R7": { + "total": 91, + "detected": 3, + "recall": 0.033, + "miss_rate": 0.967 + }, + "R8": { + "total": 75, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R9": { + "total": 91, + "detected": 0, + "recall": 0.0, + "miss_rate": 1.0 + }, + "R10": { + "total": 74, + "detected": 4, + "recall": 0.0541, + "miss_rate": 0.9459 + } + } + }, + "L1c_combined": { + "binary_f1": 0.32558139534883723, + "high_risk_recall": 0.19954389965792474, + "high_risk_precision": 0.8838383838383839, + "false_negative_rate": 0.8004561003420753, + "level_macro_f1": 0.12164103976458382, + "level_weighted_f1": 0.11307540313209122, + "level_per_class_f1": [ + 0.3326007326007326, + 0.0, + 0.24170616113744076, + 0.03389830508474576, + 0.0 + ], + "per_category_recall": { + "R1": { + "total": 123, + "detected": 8, + "recall": 0.065, + "miss_rate": 0.935 + }, + "R2": { + "total": 96, + "detected": 15, + "recall": 0.1562, + "miss_rate": 0.8438 + }, + "R3": { + "total": 77, + "detected": 28, + "recall": 0.3636, + "miss_rate": 0.6364 + }, + "R4": { + "total": 81, + "detected": 25, + "recall": 0.3086, + "miss_rate": 0.6914 + }, + "R5": { + "total": 64, + "detected": 9, + "recall": 0.1406, + "miss_rate": 0.8594 + }, + "R6": { + "total": 105, + "detected": 11, + "recall": 0.1048, + "miss_rate": 0.8952 + }, + "R7": { + "total": 91, + "detected": 9, + "recall": 0.0989, + "miss_rate": 0.9011 + }, + "R8": { + "total": 75, + "detected": 49, + "recall": 0.6533, + "miss_rate": 0.3467 + }, + "R9": { + "total": 91, + "detected": 7, + "recall": 0.0769, + "miss_rate": 0.9231 + }, + "R10": { + "total": 74, + "detected": 14, + "recall": 0.1892, + "miss_rate": 0.8108 + } + } + }, + "ours_detection": { + "binary_f1": 0.9988597491448119, + "high_risk_recall": 0.9988597491448119, 
+ "high_risk_precision": 0.9988597491448119, + "false_negative_rate": 0.0011402508551880963, + "level_macro_f1": 0.4974096618676628, + "level_weighted_f1": 0.5113791757593992, + "level_per_class_f1": [ + 0.67601246105919, + 0.17391304347826086, + 0.45622119815668205, + 0.6204620462046204, + 0.5604395604395604 + ], + "fine_per_label_f1": [ + 0.7047244094488189, + 0.40274599542334094, + 0.6269035532994924, + 0.4339622641509434, + 0.6253521126760564, + 0.2874617737003058, + 0.27901785714285715, + 0.2389937106918239, + 0.6086956521739131, + 0.5878136200716846, + 0.350253807106599, + 0.4444444444444444, + 0.3734015345268542, + 0.6942148760330579 + ], + "fine_macro_f1": 0.4755704007778709, + "fine_weighted_f1": 0.5078364322693886, + "per_category_recall": { + "R1": { + "total": 123, + "detected": 122, + "recall": 0.9919, + "miss_rate": 0.0081 + }, + "R2": { + "total": 96, + "detected": 96, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R3": { + "total": 77, + "detected": 77, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R4": { + "total": 81, + "detected": 81, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R5": { + "total": 64, + "detected": 64, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R6": { + "total": 105, + "detected": 105, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R7": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R8": { + "total": 75, + "detected": 75, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R9": { + "total": 91, + "detected": 91, + "recall": 1.0, + "miss_rate": 0.0 + }, + "R10": { + "total": 74, + "detected": 74, + "recall": 1.0, + "miss_rate": 0.0 + } + }, + "label_filter": "all" + } +} \ No newline at end of file