Merged code repo (CompanionGuard-RL) into single project-level git. Reorganized root: docs/, reference/, experiments/, tmp/active|archives/. Gitignored: data/, checkpoints/, .venv, experiment logs, tmp/archives. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
337 lines
7.2 KiB
JSON
337 lines
7.2 KiB
JSON
{
|
|
"meta": {
|
|
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
|
|
"source_filter": "all",
|
|
"label_filter": "all",
|
|
"n_total": 1324,
|
|
"n_filtered": 1324,
|
|
"n_risky": 877
|
|
},
|
|
"L1a_keyword": {
|
|
"binary_f1": 0.27751196172248804,
|
|
"high_risk_recall": 0.1653363740022805,
|
|
"high_risk_precision": 0.8630952380952381,
|
|
"false_negative_rate": 0.8346636259977195,
|
|
"level_macro_f1": 0.11264512835143245,
|
|
"level_weighted_f1": 0.10448970574896717,
|
|
"level_per_class_f1": [
|
|
0.3254480286738351,
|
|
0.0,
|
|
0.20865139949109415,
|
|
0.02912621359223301,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 123,
|
|
"detected": 8,
|
|
"recall": 0.065,
|
|
"miss_rate": 0.935
|
|
},
|
|
"R2": {
|
|
"total": 96,
|
|
"detected": 14,
|
|
"recall": 0.1458,
|
|
"miss_rate": 0.8542
|
|
},
|
|
"R3": {
|
|
"total": 77,
|
|
"detected": 13,
|
|
"recall": 0.1688,
|
|
"miss_rate": 0.8312
|
|
},
|
|
"R4": {
|
|
"total": 81,
|
|
"detected": 18,
|
|
"recall": 0.2222,
|
|
"miss_rate": 0.7778
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 9,
|
|
"recall": 0.1406,
|
|
"miss_rate": 0.8594
|
|
},
|
|
"R6": {
|
|
"total": 105,
|
|
"detected": 11,
|
|
"recall": 0.1048,
|
|
"miss_rate": 0.8952
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 6,
|
|
"recall": 0.0659,
|
|
"miss_rate": 0.9341
|
|
},
|
|
"R8": {
|
|
"total": 75,
|
|
"detected": 49,
|
|
"recall": 0.6533,
|
|
"miss_rate": 0.3467
|
|
},
|
|
"R9": {
|
|
"total": 91,
|
|
"detected": 7,
|
|
"recall": 0.0769,
|
|
"miss_rate": 0.9231
|
|
},
|
|
"R10": {
|
|
"total": 74,
|
|
"detected": 10,
|
|
"recall": 0.1351,
|
|
"miss_rate": 0.8649
|
|
}
|
|
}
|
|
},
|
|
"L1b_regex": {
|
|
"binary_f1": 0.07886089813800658,
|
|
"high_risk_recall": 0.04104903078677309,
|
|
"high_risk_precision": 1.0,
|
|
"false_negative_rate": 0.9589509692132269,
|
|
"level_macro_f1": 0.08441436068877664,
|
|
"level_weighted_f1": 0.07640981579648991,
|
|
"level_per_class_f1": [
|
|
0.31303208906352326,
|
|
0.0,
|
|
0.10408921933085502,
|
|
0.0049504950495049506,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 123,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 96,
|
|
"detected": 1,
|
|
"recall": 0.0104,
|
|
"miss_rate": 0.9896
|
|
},
|
|
"R3": {
|
|
"total": 77,
|
|
"detected": 19,
|
|
"recall": 0.2468,
|
|
"miss_rate": 0.7532
|
|
},
|
|
"R4": {
|
|
"total": 81,
|
|
"detected": 9,
|
|
"recall": 0.1111,
|
|
"miss_rate": 0.8889
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 105,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 3,
|
|
"recall": 0.033,
|
|
"miss_rate": 0.967
|
|
},
|
|
"R8": {
|
|
"total": 75,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 91,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 74,
|
|
"detected": 4,
|
|
"recall": 0.0541,
|
|
"miss_rate": 0.9459
|
|
}
|
|
}
|
|
},
|
|
"L1c_combined": {
|
|
"binary_f1": 0.32558139534883723,
|
|
"high_risk_recall": 0.19954389965792474,
|
|
"high_risk_precision": 0.8838383838383839,
|
|
"false_negative_rate": 0.8004561003420753,
|
|
"level_macro_f1": 0.12164103976458382,
|
|
"level_weighted_f1": 0.11307540313209122,
|
|
"level_per_class_f1": [
|
|
0.3326007326007326,
|
|
0.0,
|
|
0.24170616113744076,
|
|
0.03389830508474576,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 123,
|
|
"detected": 8,
|
|
"recall": 0.065,
|
|
"miss_rate": 0.935
|
|
},
|
|
"R2": {
|
|
"total": 96,
|
|
"detected": 15,
|
|
"recall": 0.1562,
|
|
"miss_rate": 0.8438
|
|
},
|
|
"R3": {
|
|
"total": 77,
|
|
"detected": 28,
|
|
"recall": 0.3636,
|
|
"miss_rate": 0.6364
|
|
},
|
|
"R4": {
|
|
"total": 81,
|
|
"detected": 25,
|
|
"recall": 0.3086,
|
|
"miss_rate": 0.6914
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 9,
|
|
"recall": 0.1406,
|
|
"miss_rate": 0.8594
|
|
},
|
|
"R6": {
|
|
"total": 105,
|
|
"detected": 11,
|
|
"recall": 0.1048,
|
|
"miss_rate": 0.8952
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 9,
|
|
"recall": 0.0989,
|
|
"miss_rate": 0.9011
|
|
},
|
|
"R8": {
|
|
"total": 75,
|
|
"detected": 49,
|
|
"recall": 0.6533,
|
|
"miss_rate": 0.3467
|
|
},
|
|
"R9": {
|
|
"total": 91,
|
|
"detected": 7,
|
|
"recall": 0.0769,
|
|
"miss_rate": 0.9231
|
|
},
|
|
"R10": {
|
|
"total": 74,
|
|
"detected": 14,
|
|
"recall": 0.1892,
|
|
"miss_rate": 0.8108
|
|
}
|
|
}
|
|
},
|
|
"ours_detection": {
|
|
"binary_f1": 0.9988597491448119,
|
|
"high_risk_recall": 0.9988597491448119,
|
|
"high_risk_precision": 0.9988597491448119,
|
|
"false_negative_rate": 0.0011402508551880963,
|
|
"level_macro_f1": 0.4974096618676628,
|
|
"level_weighted_f1": 0.5113791757593992,
|
|
"level_per_class_f1": [
|
|
0.67601246105919,
|
|
0.17391304347826086,
|
|
0.45622119815668205,
|
|
0.6204620462046204,
|
|
0.5604395604395604
|
|
],
|
|
"fine_per_label_f1": [
|
|
0.7047244094488189,
|
|
0.40274599542334094,
|
|
0.6269035532994924,
|
|
0.4339622641509434,
|
|
0.6253521126760564,
|
|
0.2874617737003058,
|
|
0.27901785714285715,
|
|
0.2389937106918239,
|
|
0.6086956521739131,
|
|
0.5878136200716846,
|
|
0.350253807106599,
|
|
0.4444444444444444,
|
|
0.3734015345268542,
|
|
0.6942148760330579
|
|
],
|
|
"fine_macro_f1": 0.4755704007778709,
|
|
"fine_weighted_f1": 0.5078364322693886,
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 123,
|
|
"detected": 122,
|
|
"recall": 0.9919,
|
|
"miss_rate": 0.0081
|
|
},
|
|
"R2": {
|
|
"total": 96,
|
|
"detected": 96,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R3": {
|
|
"total": 77,
|
|
"detected": 77,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R4": {
|
|
"total": 81,
|
|
"detected": 81,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R5": {
|
|
"total": 64,
|
|
"detected": 64,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R6": {
|
|
"total": 105,
|
|
"detected": 105,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R7": {
|
|
"total": 91,
|
|
"detected": 91,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R8": {
|
|
"total": 75,
|
|
"detected": 75,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R9": {
|
|
"total": 91,
|
|
"detected": 91,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R10": {
|
|
"total": 74,
|
|
"detected": 74,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
}
|
|
},
|
|
"label_filter": "all"
|
|
}
|
|
} |