Files
CompanionGuard-RL/code/experiments/eval_v3_results.json
zhangsiyuan bd1f51c496 chore: initial commit — unified project repo
Merged code repo (CompanionGuard-RL) into single project-level git.
Reorganized root: docs/, reference/, experiments/, tmp/active|archives/.
Gitignored: data/, checkpoints/, .venv, experiment logs, tmp/archives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:28:42 +08:00

337 lines
7.2 KiB
JSON

{
"meta": {
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
"source_filter": "all",
"label_filter": "all",
"n_total": 1324,
"n_filtered": 1324,
"n_risky": 877
},
"L1a_keyword": {
"binary_f1": 0.27751196172248804,
"high_risk_recall": 0.1653363740022805,
"high_risk_precision": 0.8630952380952381,
"false_negative_rate": 0.8346636259977195,
"level_macro_f1": 0.11264512835143245,
"level_weighted_f1": 0.10448970574896717,
"level_per_class_f1": [
0.3254480286738351,
0.0,
0.20865139949109415,
0.02912621359223301,
0.0
],
"per_category_recall": {
"R1": {
"total": 123,
"detected": 8,
"recall": 0.065,
"miss_rate": 0.935
},
"R2": {
"total": 96,
"detected": 14,
"recall": 0.1458,
"miss_rate": 0.8542
},
"R3": {
"total": 77,
"detected": 13,
"recall": 0.1688,
"miss_rate": 0.8312
},
"R4": {
"total": 81,
"detected": 18,
"recall": 0.2222,
"miss_rate": 0.7778
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 105,
"detected": 11,
"recall": 0.1048,
"miss_rate": 0.8952
},
"R7": {
"total": 91,
"detected": 6,
"recall": 0.0659,
"miss_rate": 0.9341
},
"R8": {
"total": 75,
"detected": 49,
"recall": 0.6533,
"miss_rate": 0.3467
},
"R9": {
"total": 91,
"detected": 7,
"recall": 0.0769,
"miss_rate": 0.9231
},
"R10": {
"total": 74,
"detected": 10,
"recall": 0.1351,
"miss_rate": 0.8649
}
}
},
"L1b_regex": {
"binary_f1": 0.07886089813800658,
"high_risk_recall": 0.04104903078677309,
"high_risk_precision": 1.0,
"false_negative_rate": 0.9589509692132269,
"level_macro_f1": 0.08441436068877664,
"level_weighted_f1": 0.07640981579648991,
"level_per_class_f1": [
0.31303208906352326,
0.0,
0.10408921933085502,
0.0049504950495049506,
0.0
],
"per_category_recall": {
"R1": {
"total": 123,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R2": {
"total": 96,
"detected": 1,
"recall": 0.0104,
"miss_rate": 0.9896
},
"R3": {
"total": 77,
"detected": 19,
"recall": 0.2468,
"miss_rate": 0.7532
},
"R4": {
"total": 81,
"detected": 9,
"recall": 0.1111,
"miss_rate": 0.8889
},
"R5": {
"total": 64,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R6": {
"total": 105,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R7": {
"total": 91,
"detected": 3,
"recall": 0.033,
"miss_rate": 0.967
},
"R8": {
"total": 75,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R9": {
"total": 91,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R10": {
"total": 74,
"detected": 4,
"recall": 0.0541,
"miss_rate": 0.9459
}
}
},
"L1c_combined": {
"binary_f1": 0.32558139534883723,
"high_risk_recall": 0.19954389965792474,
"high_risk_precision": 0.8838383838383839,
"false_negative_rate": 0.8004561003420753,
"level_macro_f1": 0.12164103976458382,
"level_weighted_f1": 0.11307540313209122,
"level_per_class_f1": [
0.3326007326007326,
0.0,
0.24170616113744076,
0.03389830508474576,
0.0
],
"per_category_recall": {
"R1": {
"total": 123,
"detected": 8,
"recall": 0.065,
"miss_rate": 0.935
},
"R2": {
"total": 96,
"detected": 15,
"recall": 0.1562,
"miss_rate": 0.8438
},
"R3": {
"total": 77,
"detected": 28,
"recall": 0.3636,
"miss_rate": 0.6364
},
"R4": {
"total": 81,
"detected": 25,
"recall": 0.3086,
"miss_rate": 0.6914
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 105,
"detected": 11,
"recall": 0.1048,
"miss_rate": 0.8952
},
"R7": {
"total": 91,
"detected": 9,
"recall": 0.0989,
"miss_rate": 0.9011
},
"R8": {
"total": 75,
"detected": 49,
"recall": 0.6533,
"miss_rate": 0.3467
},
"R9": {
"total": 91,
"detected": 7,
"recall": 0.0769,
"miss_rate": 0.9231
},
"R10": {
"total": 74,
"detected": 14,
"recall": 0.1892,
"miss_rate": 0.8108
}
}
},
"ours_detection": {
"binary_f1": 0.9988597491448119,
"high_risk_recall": 0.9988597491448119,
"high_risk_precision": 0.9988597491448119,
"false_negative_rate": 0.0011402508551880963,
"level_macro_f1": 0.4974096618676628,
"level_weighted_f1": 0.5113791757593992,
"level_per_class_f1": [
0.67601246105919,
0.17391304347826086,
0.45622119815668205,
0.6204620462046204,
0.5604395604395604
],
"fine_per_label_f1": [
0.7047244094488189,
0.40274599542334094,
0.6269035532994924,
0.4339622641509434,
0.6253521126760564,
0.2874617737003058,
0.27901785714285715,
0.2389937106918239,
0.6086956521739131,
0.5878136200716846,
0.350253807106599,
0.4444444444444444,
0.3734015345268542,
0.6942148760330579
],
"fine_macro_f1": 0.4755704007778709,
"fine_weighted_f1": 0.5078364322693886,
"per_category_recall": {
"R1": {
"total": 123,
"detected": 122,
"recall": 0.9919,
"miss_rate": 0.0081
},
"R2": {
"total": 96,
"detected": 96,
"recall": 1.0,
"miss_rate": 0.0
},
"R3": {
"total": 77,
"detected": 77,
"recall": 1.0,
"miss_rate": 0.0
},
"R4": {
"total": 81,
"detected": 81,
"recall": 1.0,
"miss_rate": 0.0
},
"R5": {
"total": 64,
"detected": 64,
"recall": 1.0,
"miss_rate": 0.0
},
"R6": {
"total": 105,
"detected": 105,
"recall": 1.0,
"miss_rate": 0.0
},
"R7": {
"total": 91,
"detected": 91,
"recall": 1.0,
"miss_rate": 0.0
},
"R8": {
"total": 75,
"detected": 75,
"recall": 1.0,
"miss_rate": 0.0
},
"R9": {
"total": 91,
"detected": 91,
"recall": 1.0,
"miss_rate": 0.0
},
"R10": {
"total": 74,
"detected": 74,
"recall": 1.0,
"miss_rate": 0.0
}
},
"label_filter": "all"
}
}