- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
192 lines
4.4 KiB
JSON
192 lines
4.4 KiB
JSON
{
|
|
"meta": {
|
|
"model": "Qwen/Qwen2.5-72B-Instruct",
|
|
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
|
|
"n_total": 1486,
|
|
"n_risky": 1039,
|
|
"parse_failure_count": 20,
|
|
"elapsed_seconds": 864.4
|
|
},
|
|
"llm_judge_intervention": {
|
|
"intervention_recall_high": 0.39723926380368096,
|
|
"safety_recall": 0.39723926380368096,
|
|
"over_refusal": 0.2109704641350211,
|
|
"over_intervention_rate": 0.2109704641350211,
|
|
"action_accuracy": 0.37415881561238223,
|
|
"exact_action_accuracy_by_level": {
|
|
"L0_Safe": 0.6919831223628692,
|
|
"L1_Mild": 0.6321428571428571,
|
|
"L2_Moderate": 0.28391167192429023,
|
|
"L3_High": 0.2236842105263158,
|
|
"L4_Critical": 0.11734693877551021
|
|
},
|
|
"action_distribution": [
|
|
0.4791386271870794,
|
|
0.20524899057873486,
|
|
0.211978465679677,
|
|
0.004037685060565276,
|
|
0.09959623149394348
|
|
],
|
|
"per_level_action_dist": {
|
|
"L0_Safe": {
|
|
"n": 237,
|
|
"action_dist": [
|
|
0.6919831223628692,
|
|
0.0970464135021097,
|
|
0.0970464135021097,
|
|
0.004219409282700422,
|
|
0.10970464135021098
|
|
]
|
|
},
|
|
"L1_Mild": {
|
|
"n": 280,
|
|
"action_dist": [
|
|
0.6392857142857142,
|
|
0.1392857142857143,
|
|
0.16071428571428573,
|
|
0.0,
|
|
0.060714285714285714
|
|
]
|
|
},
|
|
"L2_Moderate": {
|
|
"n": 317,
|
|
"action_dist": [
|
|
0.41009463722397477,
|
|
0.2807570977917981,
|
|
0.25236593059936907,
|
|
0.0,
|
|
0.056782334384858045
|
|
]
|
|
},
|
|
"L3_High": {
|
|
"n": 456,
|
|
"action_dist": [
|
|
0.39035087719298245,
|
|
0.24561403508771928,
|
|
0.24780701754385964,
|
|
0.006578947368421052,
|
|
0.10964912280701754
|
|
]
|
|
},
|
|
"L4_Critical": {
|
|
"n": 196,
|
|
"action_dist": [
|
|
0.3112244897959184,
|
|
0.21428571428571427,
|
|
0.2755102040816326,
|
|
0.01020408163265306,
|
|
0.18877551020408162
|
|
]
|
|
}
|
|
},
|
|
"crisis_precision_l4": 0.25,
|
|
"crisis_precision": 0.25,
|
|
"crisis_appropriateness": 0.31756756756756754,
|
|
"R1_high_critical_crisis_recall": 0.2831858407079646,
|
|
"reject_rate_R6_R7_high": 0.021505376344086023,
|
|
"strong_intervention_rate_L1": 0.22142857142857142,
|
|
"per_category_action_dist": {
|
|
"R1": {
|
|
"n": 583,
|
|
"action_dist": [
|
|
0.6260720411663808,
|
|
0.12178387650085763,
|
|
0.13036020583190394,
|
|
0.003430531732418525,
|
|
0.1183533447684391
|
|
]
|
|
},
|
|
"R2": {
|
|
"n": 142,
|
|
"action_dist": [
|
|
0.45774647887323944,
|
|
0.28169014084507044,
|
|
0.176056338028169,
|
|
0.0,
|
|
0.08450704225352113
|
|
]
|
|
},
|
|
"R3": {
|
|
"n": 95,
|
|
"action_dist": [
|
|
0.3473684210526316,
|
|
0.23157894736842105,
|
|
0.37894736842105264,
|
|
0.0,
|
|
0.042105263157894736
|
|
]
|
|
},
|
|
"R4": {
|
|
"n": 116,
|
|
"action_dist": [
|
|
0.3620689655172414,
|
|
0.25,
|
|
0.31896551724137934,
|
|
0.0,
|
|
0.06896551724137931
|
|
]
|
|
},
|
|
"R5": {
|
|
"n": 64,
|
|
"action_dist": [
|
|
0.296875,
|
|
0.34375,
|
|
0.296875,
|
|
0.0,
|
|
0.0625
|
|
]
|
|
},
|
|
"R6": {
|
|
"n": 97,
|
|
"action_dist": [
|
|
0.31958762886597936,
|
|
0.18556701030927836,
|
|
0.24742268041237114,
|
|
0.030927835051546393,
|
|
0.21649484536082475
|
|
]
|
|
},
|
|
"R7": {
|
|
"n": 91,
|
|
"action_dist": [
|
|
0.45054945054945056,
|
|
0.15384615384615385,
|
|
0.3076923076923077,
|
|
0.01098901098901099,
|
|
0.07692307692307693
|
|
]
|
|
},
|
|
"R8": {
|
|
"n": 73,
|
|
"action_dist": [
|
|
0.4520547945205479,
|
|
0.2328767123287671,
|
|
0.2602739726027397,
|
|
0.0,
|
|
0.0547945205479452
|
|
]
|
|
},
|
|
"R9": {
|
|
"n": 152,
|
|
"action_dist": [
|
|
0.3815789473684211,
|
|
0.3157894736842105,
|
|
0.21710526315789475,
|
|
0.0,
|
|
0.08552631578947369
|
|
]
|
|
},
|
|
"R10": {
|
|
"n": 73,
|
|
"action_dist": [
|
|
0.3424657534246575,
|
|
0.3287671232876712,
|
|
0.2465753424657534,
|
|
0.0,
|
|
0.0821917808219178
|
|
]
|
|
}
|
|
},
|
|
"safety_ux_fscore": 0.528435902611466
|
|
}
|
|
} |