{ "meta": { "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", "source_filter": "all", "label_filter": "all", "n_total": 1324, "n_filtered": 1324, "n_risky": 877 }, "L1a_keyword": { "binary_f1": 0.27751196172248804, "high_risk_recall": 0.1653363740022805, "high_risk_precision": 0.8630952380952381, "false_negative_rate": 0.8346636259977195, "level_macro_f1": 0.11264512835143245, "level_weighted_f1": 0.10448970574896717, "level_per_class_f1": [ 0.3254480286738351, 0.0, 0.20865139949109415, 0.02912621359223301, 0.0 ], "per_category_recall": { "R1": { "total": 123, "detected": 8, "recall": 0.065, "miss_rate": 0.935 }, "R2": { "total": 96, "detected": 14, "recall": 0.1458, "miss_rate": 0.8542 }, "R3": { "total": 77, "detected": 13, "recall": 0.1688, "miss_rate": 0.8312 }, "R4": { "total": 81, "detected": 18, "recall": 0.2222, "miss_rate": 0.7778 }, "R5": { "total": 64, "detected": 9, "recall": 0.1406, "miss_rate": 0.8594 }, "R6": { "total": 105, "detected": 11, "recall": 0.1048, "miss_rate": 0.8952 }, "R7": { "total": 91, "detected": 6, "recall": 0.0659, "miss_rate": 0.9341 }, "R8": { "total": 75, "detected": 49, "recall": 0.6533, "miss_rate": 0.3467 }, "R9": { "total": 91, "detected": 7, "recall": 0.0769, "miss_rate": 0.9231 }, "R10": { "total": 74, "detected": 10, "recall": 0.1351, "miss_rate": 0.8649 } } }, "L1b_regex": { "binary_f1": 0.07886089813800658, "high_risk_recall": 0.04104903078677309, "high_risk_precision": 1.0, "false_negative_rate": 0.9589509692132269, "level_macro_f1": 0.08441436068877664, "level_weighted_f1": 0.07640981579648991, "level_per_class_f1": [ 0.31303208906352326, 0.0, 0.10408921933085502, 0.0049504950495049506, 0.0 ], "per_category_recall": { "R1": { "total": 123, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 96, "detected": 1, "recall": 0.0104, "miss_rate": 0.9896 }, "R3": { "total": 77, "detected": 19, "recall": 0.2468, "miss_rate": 0.7532 }, "R4": { "total": 81, "detected": 9, "recall": 0.1111, "miss_rate": 0.8889 }, "R5": { "total": 64, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 105, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 91, "detected": 3, "recall": 0.033, "miss_rate": 0.967 }, "R8": { "total": 75, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 91, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 74, "detected": 4, "recall": 0.0541, "miss_rate": 0.9459 } } }, "L1c_combined": { "binary_f1": 0.32558139534883723, "high_risk_recall": 0.19954389965792474, "high_risk_precision": 0.8838383838383839, "false_negative_rate": 0.8004561003420753, "level_macro_f1": 0.12164103976458382, "level_weighted_f1": 0.11307540313209122, "level_per_class_f1": [ 0.3326007326007326, 0.0, 0.24170616113744076, 0.03389830508474576, 0.0 ], "per_category_recall": { "R1": { "total": 123, "detected": 8, "recall": 0.065, "miss_rate": 0.935 }, "R2": { "total": 96, "detected": 15, "recall": 0.1562, "miss_rate": 0.8438 }, "R3": { "total": 77, "detected": 28, "recall": 0.3636, "miss_rate": 0.6364 }, "R4": { "total": 81, "detected": 25, "recall": 0.3086, "miss_rate": 0.6914 }, "R5": { "total": 64, "detected": 9, "recall": 0.1406, "miss_rate": 0.8594 }, "R6": { "total": 105, "detected": 11, "recall": 0.1048, "miss_rate": 0.8952 }, "R7": { "total": 91, "detected": 9, "recall": 0.0989, "miss_rate": 0.9011 }, "R8": { "total": 75, "detected": 49, "recall": 0.6533, "miss_rate": 0.3467 }, "R9": { "total": 91, "detected": 7, "recall": 0.0769, "miss_rate": 0.9231 }, "R10": { "total": 74, "detected": 14, "recall": 0.1892, "miss_rate": 0.8108 } } }, "ours_detection": { "binary_f1": 0.9988597491448119, "high_risk_recall": 0.9988597491448119, "high_risk_precision": 0.9988597491448119, "false_negative_rate": 0.0011402508551880963, "level_macro_f1": 0.4974096618676628, "level_weighted_f1": 0.5113791757593992, "level_per_class_f1": [ 0.67601246105919, 0.17391304347826086, 0.45622119815668205, 0.6204620462046204, 0.5604395604395604 ], "fine_per_label_f1": [ 0.7047244094488189, 0.40274599542334094, 0.6269035532994924, 0.4339622641509434, 0.6253521126760564, 0.2874617737003058, 0.27901785714285715, 0.2389937106918239, 0.6086956521739131, 0.5878136200716846, 0.350253807106599, 0.4444444444444444, 0.3734015345268542, 0.6942148760330579 ], "fine_macro_f1": 0.4755704007778709, "fine_weighted_f1": 0.5078364322693886, "per_category_recall": { "R1": { "total": 123, "detected": 122, "recall": 0.9919, "miss_rate": 0.0081 }, "R2": { "total": 96, "detected": 96, "recall": 1.0, "miss_rate": 0.0 }, "R3": { "total": 77, "detected": 77, "recall": 1.0, "miss_rate": 0.0 }, "R4": { "total": 81, "detected": 81, "recall": 1.0, "miss_rate": 0.0 }, "R5": { "total": 64, "detected": 64, "recall": 1.0, "miss_rate": 0.0 }, "R6": { "total": 105, "detected": 105, "recall": 1.0, "miss_rate": 0.0 }, "R7": { "total": 91, "detected": 91, "recall": 1.0, "miss_rate": 0.0 }, "R8": { "total": 75, "detected": 75, "recall": 1.0, "miss_rate": 0.0 }, "R9": { "total": 91, "detected": 91, "recall": 1.0, "miss_rate": 0.0 }, "R10": { "total": 74, "detected": 74, "recall": 1.0, "miss_rate": 0.0 } }, "label_filter": "all" } }