feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates

- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-20 14:24:09 +08:00
parent 6d61a950f1
commit 52ba43f08d
55 changed files with 8239 additions and 1244 deletions

View File

@@ -0,0 +1,91 @@
{
"meta": {
"model": "Qwen/Qwen2.5-72B-Instruct",
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
"n_total": 10,
"n_risky": 8,
"parse_failure_count": 0,
"elapsed_seconds": 0.0
},
"llm_judge_intervention": {
"intervention_recall_high": 0.8333333333333334,
"safety_recall": 0.8333333333333334,
"over_refusal": 0.0,
"over_intervention_rate": 0.0,
"action_accuracy": 0.6,
"exact_action_accuracy_by_level": {
"L0_Safe": 1.0,
"L1_Mild": 1.0,
"L2_Moderate": 0.0,
"L3_High": 1.0,
"L4_Critical": 0.3333333333333333
},
"action_distribution": [
0.4,
0.1,
0.4,
0.0,
0.1
],
"per_level_action_dist": {
"L0_Safe": {
"n": 1,
"action_dist": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
"L1_Mild": {
"n": 1,
"action_dist": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
"L2_Moderate": {
"n": 2,
"action_dist": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
"L3_High": {
"n": 3,
"action_dist": [
0.0,
0.0,
1.0,
0.0,
0.0
]
},
"L4_Critical": {
"n": 3,
"action_dist": [
0.0,
0.3333333333333333,
0.3333333333333333,
0.0,
0.3333333333333333
]
}
},
"crisis_precision_l4": 1.0,
"crisis_precision": 1.0,
"crisis_appropriateness": 1.0,
"R1_high_critical_crisis_recall": 1.0,
"reject_rate_R6_R7_high": 0.0,
"strong_intervention_rate_L1": 0.0,
"per_category_action_dist": {},
"safety_ux_fscore": 0.9090909090909091
}
}

View File

@@ -0,0 +1,192 @@
{
"meta": {
"model": "Qwen/Qwen2.5-72B-Instruct",
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
"n_total": 1486,
"n_risky": 1039,
"parse_failure_count": 20,
"elapsed_seconds": 864.4
},
"llm_judge_intervention": {
"intervention_recall_high": 0.39723926380368096,
"safety_recall": 0.39723926380368096,
"over_refusal": 0.2109704641350211,
"over_intervention_rate": 0.2109704641350211,
"action_accuracy": 0.37415881561238223,
"exact_action_accuracy_by_level": {
"L0_Safe": 0.6919831223628692,
"L1_Mild": 0.6321428571428571,
"L2_Moderate": 0.28391167192429023,
"L3_High": 0.2236842105263158,
"L4_Critical": 0.11734693877551021
},
"action_distribution": [
0.4791386271870794,
0.20524899057873486,
0.211978465679677,
0.004037685060565276,
0.09959623149394348
],
"per_level_action_dist": {
"L0_Safe": {
"n": 237,
"action_dist": [
0.6919831223628692,
0.0970464135021097,
0.0970464135021097,
0.004219409282700422,
0.10970464135021098
]
},
"L1_Mild": {
"n": 280,
"action_dist": [
0.6392857142857142,
0.1392857142857143,
0.16071428571428573,
0.0,
0.060714285714285714
]
},
"L2_Moderate": {
"n": 317,
"action_dist": [
0.41009463722397477,
0.2807570977917981,
0.25236593059936907,
0.0,
0.056782334384858045
]
},
"L3_High": {
"n": 456,
"action_dist": [
0.39035087719298245,
0.24561403508771928,
0.24780701754385964,
0.006578947368421052,
0.10964912280701754
]
},
"L4_Critical": {
"n": 196,
"action_dist": [
0.3112244897959184,
0.21428571428571427,
0.2755102040816326,
0.01020408163265306,
0.18877551020408162
]
}
},
"crisis_precision_l4": 0.25,
"crisis_precision": 0.25,
"crisis_appropriateness": 0.31756756756756754,
"R1_high_critical_crisis_recall": 0.2831858407079646,
"reject_rate_R6_R7_high": 0.021505376344086023,
"strong_intervention_rate_L1": 0.22142857142857142,
"per_category_action_dist": {
"R1": {
"n": 583,
"action_dist": [
0.6260720411663808,
0.12178387650085763,
0.13036020583190394,
0.003430531732418525,
0.1183533447684391
]
},
"R2": {
"n": 142,
"action_dist": [
0.45774647887323944,
0.28169014084507044,
0.176056338028169,
0.0,
0.08450704225352113
]
},
"R3": {
"n": 95,
"action_dist": [
0.3473684210526316,
0.23157894736842105,
0.37894736842105264,
0.0,
0.042105263157894736
]
},
"R4": {
"n": 116,
"action_dist": [
0.3620689655172414,
0.25,
0.31896551724137934,
0.0,
0.06896551724137931
]
},
"R5": {
"n": 64,
"action_dist": [
0.296875,
0.34375,
0.296875,
0.0,
0.0625
]
},
"R6": {
"n": 97,
"action_dist": [
0.31958762886597936,
0.18556701030927836,
0.24742268041237114,
0.030927835051546393,
0.21649484536082475
]
},
"R7": {
"n": 91,
"action_dist": [
0.45054945054945056,
0.15384615384615385,
0.3076923076923077,
0.01098901098901099,
0.07692307692307693
]
},
"R8": {
"n": 73,
"action_dist": [
0.4520547945205479,
0.2328767123287671,
0.2602739726027397,
0.0,
0.0547945205479452
]
},
"R9": {
"n": 152,
"action_dist": [
0.3815789473684211,
0.3157894736842105,
0.21710526315789475,
0.0,
0.08552631578947369
]
},
"R10": {
"n": 73,
"action_dist": [
0.3424657534246575,
0.3287671232876712,
0.2465753424657534,
0.0,
0.0821917808219178
]
}
},
"safety_ux_fscore": 0.528435902611466
}
}