- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
337 lines
7.2 KiB
JSON
337 lines
7.2 KiB
JSON
{
|
|
"meta": {
|
|
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
|
|
"source_filter": "all",
|
|
"label_filter": "all",
|
|
"n_total": 1486,
|
|
"n_filtered": 1486,
|
|
"n_risky": 1039
|
|
},
|
|
"L1a_keyword": {
|
|
"binary_f1": 0.26436781609195403,
|
|
"high_risk_recall": 0.15495668912415783,
|
|
"high_risk_precision": 0.8994413407821229,
|
|
"false_negative_rate": 0.8450433108758422,
|
|
"level_macro_f1": 0.10427720349098286,
|
|
"level_weighted_f1": 0.09799538109505529,
|
|
"level_per_class_f1": [
|
|
0.2979274611398964,
|
|
0.0,
|
|
0.1934156378600823,
|
|
0.030042918454935622,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 136,
|
|
"detected": 10,
|
|
"recall": 0.0735,
|
|
"miss_rate": 0.9265
|
|
},
|
|
"R2": {
|
|
"total": 142,
|
|
"detected": 16,
|
|
"recall": 0.1127,
|
|
"miss_rate": 0.8873
|
|
},
|
|
"R3": {
|
|
"total": 95,
|
|
"detected": 17,
|
|
"recall": 0.1789,
|
|
"miss_rate": 0.8211
|
|
},
|
|
"R4": {
|
|
"total": 116,
|
|
"detected": 22,
|
|
"recall": 0.1897,
|
|
"miss_rate": 0.8103
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 9,
|
|
"recall": 0.1406,
|
|
"miss_rate": 0.8594
|
|
},
|
|
"R6": {
|
|
"total": 97,
|
|
"detected": 11,
|
|
"recall": 0.1134,
|
|
"miss_rate": 0.8866
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 6,
|
|
"recall": 0.0659,
|
|
"miss_rate": 0.9341
|
|
},
|
|
"R8": {
|
|
"total": 73,
|
|
"detected": 49,
|
|
"recall": 0.6712,
|
|
"miss_rate": 0.3288
|
|
},
|
|
"R9": {
|
|
"total": 152,
|
|
"detected": 11,
|
|
"recall": 0.0724,
|
|
"miss_rate": 0.9276
|
|
},
|
|
"R10": {
|
|
"total": 73,
|
|
"detected": 10,
|
|
"recall": 0.137,
|
|
"miss_rate": 0.863
|
|
}
|
|
}
|
|
},
|
|
"L1b_regex": {
|
|
"binary_f1": 0.06697674418604652,
|
|
"high_risk_recall": 0.03464870067372473,
|
|
"high_risk_precision": 1.0,
|
|
"false_negative_rate": 0.9653512993262753,
|
|
"level_macro_f1": 0.07297879241072718,
|
|
"level_weighted_f1": 0.06312377515343655,
|
|
"level_per_class_f1": [
|
|
0.2809721398933017,
|
|
0.0,
|
|
0.07954545454545454,
|
|
0.00437636761487965,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 136,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 142,
|
|
"detected": 1,
|
|
"recall": 0.007,
|
|
"miss_rate": 0.993
|
|
},
|
|
"R3": {
|
|
"total": 95,
|
|
"detected": 19,
|
|
"recall": 0.2,
|
|
"miss_rate": 0.8
|
|
},
|
|
"R4": {
|
|
"total": 116,
|
|
"detected": 9,
|
|
"recall": 0.0776,
|
|
"miss_rate": 0.9224
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 97,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 3,
|
|
"recall": 0.033,
|
|
"miss_rate": 0.967
|
|
},
|
|
"R8": {
|
|
"total": 73,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 152,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 73,
|
|
"detected": 4,
|
|
"recall": 0.0548,
|
|
"miss_rate": 0.9452
|
|
}
|
|
}
|
|
},
|
|
"L1c_combined": {
|
|
"binary_f1": 0.3060897435897436,
|
|
"high_risk_recall": 0.18383060635226178,
|
|
"high_risk_precision": 0.9138755980861244,
|
|
"false_negative_rate": 0.8161693936477382,
|
|
"level_macro_f1": 0.11189027535274536,
|
|
"level_weighted_f1": 0.10619241328971442,
|
|
"level_per_class_f1": [
|
|
0.3038309114927345,
|
|
0.0,
|
|
0.22135922330097088,
|
|
0.034261241970021415,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 136,
|
|
"detected": 10,
|
|
"recall": 0.0735,
|
|
"miss_rate": 0.9265
|
|
},
|
|
"R2": {
|
|
"total": 142,
|
|
"detected": 17,
|
|
"recall": 0.1197,
|
|
"miss_rate": 0.8803
|
|
},
|
|
"R3": {
|
|
"total": 95,
|
|
"detected": 32,
|
|
"recall": 0.3368,
|
|
"miss_rate": 0.6632
|
|
},
|
|
"R4": {
|
|
"total": 116,
|
|
"detected": 29,
|
|
"recall": 0.25,
|
|
"miss_rate": 0.75
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 9,
|
|
"recall": 0.1406,
|
|
"miss_rate": 0.8594
|
|
},
|
|
"R6": {
|
|
"total": 97,
|
|
"detected": 11,
|
|
"recall": 0.1134,
|
|
"miss_rate": 0.8866
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 9,
|
|
"recall": 0.0989,
|
|
"miss_rate": 0.9011
|
|
},
|
|
"R8": {
|
|
"total": 73,
|
|
"detected": 49,
|
|
"recall": 0.6712,
|
|
"miss_rate": 0.3288
|
|
},
|
|
"R9": {
|
|
"total": 152,
|
|
"detected": 11,
|
|
"recall": 0.0724,
|
|
"miss_rate": 0.9276
|
|
},
|
|
"R10": {
|
|
"total": 73,
|
|
"detected": 14,
|
|
"recall": 0.1918,
|
|
"miss_rate": 0.8082
|
|
}
|
|
}
|
|
},
|
|
"ours_detection": {
|
|
"binary_f1": 0.9995189995189995,
|
|
"high_risk_recall": 1.0,
|
|
"high_risk_precision": 0.9990384615384615,
|
|
"false_negative_rate": 0.0,
|
|
"level_macro_f1": 0.5849268426729829,
|
|
"level_weighted_f1": 0.5837172940762267,
|
|
"level_per_class_f1": [
|
|
0.6365503080082136,
|
|
0.555765595463138,
|
|
0.5648854961832062,
|
|
0.5886214442013129,
|
|
0.5788113695090439
|
|
],
|
|
"fine_per_label_f1": [
|
|
0.7136563876651982,
|
|
0.3092369477911647,
|
|
0.5855338691159586,
|
|
0.49557522123893805,
|
|
0.5514018691588785,
|
|
0.39836289222373805,
|
|
0.4025423728813559,
|
|
0.33865030674846625,
|
|
0.5205479452054794,
|
|
0.36049382716049383,
|
|
0.46153846153846156,
|
|
0.34050179211469533,
|
|
0.2616822429906542,
|
|
0.7942583732057417
|
|
],
|
|
"fine_macro_f1": 0.4667130363599446,
|
|
"fine_weighted_f1": 0.48464325778962425,
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 136,
|
|
"detected": 136,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R2": {
|
|
"total": 142,
|
|
"detected": 142,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R3": {
|
|
"total": 95,
|
|
"detected": 95,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R4": {
|
|
"total": 116,
|
|
"detected": 116,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 64,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R6": {
|
|
"total": 97,
|
|
"detected": 97,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 91,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R8": {
|
|
"total": 73,
|
|
"detected": 73,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R9": {
|
|
"total": 152,
|
|
"detected": 152,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R10": {
|
|
"total": 73,
|
|
"detected": 73,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
}
|
|
},
|
|
"label_filter": "all"
|
|
}
|
|
} |