{ "meta": { "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", "source_filter": "human", "n_total": 605, "n_filtered": 119, "n_risky": 99 }, "L1a_keyword": { "binary_f1": 0.0, "high_risk_recall": 0.0, "high_risk_precision": 0.0, "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, "level_per_class_f1": [ 0.28776978417266186, 0.0, 0.0, 0.0, 0.0 ], "per_category_recall": { "R1": { "total": 36, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 6, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R3": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R4": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R5": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 31, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 5, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R8": { "total": 2, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 19, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 } } }, "L1b_regex": { "binary_f1": 0.0, "high_risk_recall": 0.0, "high_risk_precision": 0.0, "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, "level_per_class_f1": [ 0.28776978417266186, 0.0, 0.0, 0.0, 0.0 ], "per_category_recall": { "R1": { "total": 36, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 6, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R3": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R4": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R5": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 31, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 5, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R8": { "total": 2, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 19, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 } } }, "L1c_combined": { "binary_f1": 0.0, "high_risk_recall": 0.0, "high_risk_precision": 0.0, "false_negative_rate": 1.0, "level_macro_f1": 0.05755395683453237, "level_weighted_f1": 0.04836466960885073, "level_per_class_f1": [ 0.28776978417266186, 0.0, 0.0, 0.0, 0.0 ], "per_category_recall": { "R1": { "total": 36, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 6, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R3": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R4": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R5": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 31, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 5, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R8": { "total": 2, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 19, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 } } }, "ours_detection": { "binary_f1": 0.9847715736040609, "high_risk_recall": 0.9797979797979798, "high_risk_precision": 0.9897959183673469, "false_negative_rate": 0.02020202020202022, "level_macro_f1": 0.3641541183069423, "level_weighted_f1": 0.4092843419457787, "level_per_class_f1": [ 0.9302325581395349, 0.0, 0.16326530612244897, 0.36363636363636365, 0.36363636363636365 ], "fine_per_label_f1": [ 0.3508771929824561, 0.0, 0.64, 0.0, 0.0, 0.0, 0.0, 0.2222222222222222, 0.375, 0.8857142857142857, 0.0, 0.0, 0.5, 0.2857142857142857 ], "fine_macro_f1": 0.2328234276166607, "fine_weighted_f1": 0.4082668160299739, "per_category_recall": { "R1": { "total": 36, "detected": 35, "recall": 0.9722, "miss_rate": 0.0278 }, "R2": { "total": 6, "detected": 5, "recall": 0.8333, "miss_rate": 0.1667 }, "R3": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R4": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R5": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 31, "detected": 31, "recall": 1.0, "miss_rate": 0.0 }, "R7": { "total": 5, "detected": 5, "recall": 1.0, "miss_rate": 0.0 }, "R8": { "total": 2, "detected": 2, "recall": 1.0, "miss_rate": 0.0 }, "R9": { "total": 19, "detected": 19, "recall": 1.0, "miss_rate": 0.0 }, "R10": { "total": 0, "detected": 0, "recall": 0.0, "miss_rate": 1.0 } } } }