feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates

- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-20 14:24:09 +08:00
parent 6d61a950f1
commit 52ba43f08d
55 changed files with 8239 additions and 1244 deletions

View File

@@ -0,0 +1,337 @@
{
"meta": {
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
"source_filter": "all",
"label_filter": "all",
"n_total": 1486,
"n_filtered": 1486,
"n_risky": 1039
},
"L1a_keyword": {
"binary_f1": 0.26436781609195403,
"high_risk_recall": 0.15495668912415783,
"high_risk_precision": 0.8994413407821229,
"false_negative_rate": 0.8450433108758422,
"level_macro_f1": 0.10427720349098286,
"level_weighted_f1": 0.09799538109505529,
"level_per_class_f1": [
0.2979274611398964,
0.0,
0.1934156378600823,
0.030042918454935622,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 10,
"recall": 0.0735,
"miss_rate": 0.9265
},
"R2": {
"total": 142,
"detected": 16,
"recall": 0.1127,
"miss_rate": 0.8873
},
"R3": {
"total": 95,
"detected": 17,
"recall": 0.1789,
"miss_rate": 0.8211
},
"R4": {
"total": 116,
"detected": 22,
"recall": 0.1897,
"miss_rate": 0.8103
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 97,
"detected": 11,
"recall": 0.1134,
"miss_rate": 0.8866
},
"R7": {
"total": 91,
"detected": 6,
"recall": 0.0659,
"miss_rate": 0.9341
},
"R8": {
"total": 73,
"detected": 49,
"recall": 0.6712,
"miss_rate": 0.3288
},
"R9": {
"total": 152,
"detected": 11,
"recall": 0.0724,
"miss_rate": 0.9276
},
"R10": {
"total": 73,
"detected": 10,
"recall": 0.137,
"miss_rate": 0.863
}
}
},
"L1b_regex": {
"binary_f1": 0.06697674418604652,
"high_risk_recall": 0.03464870067372473,
"high_risk_precision": 1.0,
"false_negative_rate": 0.9653512993262753,
"level_macro_f1": 0.07297879241072718,
"level_weighted_f1": 0.06312377515343655,
"level_per_class_f1": [
0.2809721398933017,
0.0,
0.07954545454545454,
0.00437636761487965,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R2": {
"total": 142,
"detected": 1,
"recall": 0.007,
"miss_rate": 0.993
},
"R3": {
"total": 95,
"detected": 19,
"recall": 0.2,
"miss_rate": 0.8
},
"R4": {
"total": 116,
"detected": 9,
"recall": 0.0776,
"miss_rate": 0.9224
},
"R5": {
"total": 64,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R6": {
"total": 97,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R7": {
"total": 91,
"detected": 3,
"recall": 0.033,
"miss_rate": 0.967
},
"R8": {
"total": 73,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R9": {
"total": 152,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R10": {
"total": 73,
"detected": 4,
"recall": 0.0548,
"miss_rate": 0.9452
}
}
},
"L1c_combined": {
"binary_f1": 0.3060897435897436,
"high_risk_recall": 0.18383060635226178,
"high_risk_precision": 0.9138755980861244,
"false_negative_rate": 0.8161693936477382,
"level_macro_f1": 0.11189027535274536,
"level_weighted_f1": 0.10619241328971442,
"level_per_class_f1": [
0.3038309114927345,
0.0,
0.22135922330097088,
0.034261241970021415,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 10,
"recall": 0.0735,
"miss_rate": 0.9265
},
"R2": {
"total": 142,
"detected": 17,
"recall": 0.1197,
"miss_rate": 0.8803
},
"R3": {
"total": 95,
"detected": 32,
"recall": 0.3368,
"miss_rate": 0.6632
},
"R4": {
"total": 116,
"detected": 29,
"recall": 0.25,
"miss_rate": 0.75
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 97,
"detected": 11,
"recall": 0.1134,
"miss_rate": 0.8866
},
"R7": {
"total": 91,
"detected": 9,
"recall": 0.0989,
"miss_rate": 0.9011
},
"R8": {
"total": 73,
"detected": 49,
"recall": 0.6712,
"miss_rate": 0.3288
},
"R9": {
"total": 152,
"detected": 11,
"recall": 0.0724,
"miss_rate": 0.9276
},
"R10": {
"total": 73,
"detected": 14,
"recall": 0.1918,
"miss_rate": 0.8082
}
}
},
"ours_detection": {
"binary_f1": 0.9990384615384615,
"high_risk_recall": 1.0,
"high_risk_precision": 0.9980787704130644,
"false_negative_rate": 0.0,
"level_macro_f1": 0.5860991390886783,
"level_weighted_f1": 0.582784705099023,
"level_per_class_f1": [
0.6617647058823529,
0.4826086956521739,
0.5482866043613707,
0.6062567421790723,
0.631578947368421
],
"fine_per_label_f1": [
0.7073684210526315,
0.49836065573770494,
0.5233830845771145,
0.5355648535564853,
0.6518105849582173,
0.38675496688741723,
0.38927507447864945,
0.3337278106508876,
0.576271186440678,
0.5234899328859061,
0.39902676399026765,
0.47783251231527096,
0.3211009174311927,
0.7107438016528925
],
"fine_macro_f1": 0.5024793261868082,
"fine_weighted_f1": 0.5040116197046451,
"per_category_recall": {
"R1": {
"total": 136,
"detected": 136,
"recall": 1.0,
"miss_rate": 0.0
},
"R2": {
"total": 142,
"detected": 142,
"recall": 1.0,
"miss_rate": 0.0
},
"R3": {
"total": 95,
"detected": 95,
"recall": 1.0,
"miss_rate": 0.0
},
"R4": {
"total": 116,
"detected": 116,
"recall": 1.0,
"miss_rate": 0.0
},
"R5": {
"total": 64,
"detected": 64,
"recall": 1.0,
"miss_rate": 0.0
},
"R6": {
"total": 97,
"detected": 97,
"recall": 1.0,
"miss_rate": 0.0
},
"R7": {
"total": 91,
"detected": 91,
"recall": 1.0,
"miss_rate": 0.0
},
"R8": {
"total": 73,
"detected": 73,
"recall": 1.0,
"miss_rate": 0.0
},
"R9": {
"total": 152,
"detected": 152,
"recall": 1.0,
"miss_rate": 0.0
},
"R10": {
"total": 73,
"detected": 73,
"recall": 1.0,
"miss_rate": 0.0
}
},
"label_filter": "all"
}
}