{ "meta": { "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", "source_filter": "all", "n_total": 605, "n_filtered": 605, "n_risky": 456 }, "L1a_keyword": { "binary_f1": 0.29313543599257885, "high_risk_recall": 0.17324561403508773, "high_risk_precision": 0.9518072289156626, "false_negative_rate": 0.8267543859649122, "level_macro_f1": 0.09819557155678502, "level_weighted_f1": 0.08825982748460577, "per_category_recall": { "R1": { "total": 67, "detected": 3, "recall": 0.0448, "miss_rate": 0.9552 }, "R2": { "total": 45, "detected": 3, "recall": 0.0667, "miss_rate": 0.9333 }, "R3": { "total": 39, "detected": 10, "recall": 0.2564, "miss_rate": 0.7436 }, "R4": { "total": 41, "detected": 7, "recall": 0.1707, "miss_rate": 0.8293 }, "R5": { "total": 36, "detected": 8, "recall": 0.2222, "miss_rate": 0.7778 }, "R6": { "total": 64, "detected": 6, "recall": 0.0938, "miss_rate": 0.9062 }, "R7": { "total": 37, "detected": 4, "recall": 0.1081, "miss_rate": 0.8919 }, "R8": { "total": 38, "detected": 25, "recall": 0.6579, "miss_rate": 0.3421 }, "R9": { "total": 52, "detected": 3, "recall": 0.0577, "miss_rate": 0.9423 }, "R10": { "total": 37, "detected": 10, "recall": 0.2703, "miss_rate": 0.7297 } } }, "L1b_regex": { "binary_f1": 0.0759493670886076, "high_risk_recall": 0.039473684210526314, "high_risk_precision": 1.0, "false_negative_rate": 0.9605263157894737, "level_macro_f1": 0.07132623033992896, "level_weighted_f1": 0.058213483946983315, "per_category_recall": { "R1": { "total": 67, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 45, "detected": 1, "recall": 0.0222, "miss_rate": 0.9778 }, "R3": { "total": 39, "detected": 9, "recall": 0.2308, "miss_rate": 0.7692 }, "R4": { "total": 41, "detected": 3, "recall": 0.0732, "miss_rate": 0.9268 }, "R5": { "total": 36, "detected": 1, "recall": 0.0278, "miss_rate": 0.9722 }, "R6": { "total": 64, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 37, "detected": 2, "recall": 0.0541, "miss_rate": 0.9459 }, "R8": { "total": 38, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 52, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 37, "detected": 2, "recall": 0.0541, "miss_rate": 0.9459 } } }, "L1c_combined": { "binary_f1": 0.33634719710669075, "high_risk_recall": 0.20394736842105263, "high_risk_precision": 0.9587628865979382, "false_negative_rate": 0.7960526315789473, "level_macro_f1": 0.10979552475377227, "level_weighted_f1": 0.1000980341896042, "per_category_recall": { "R1": { "total": 67, "detected": 3, "recall": 0.0448, "miss_rate": 0.9552 }, "R2": { "total": 45, "detected": 4, "recall": 0.0889, "miss_rate": 0.9111 }, "R3": { "total": 39, "detected": 16, "recall": 0.4103, "miss_rate": 0.5897 }, "R4": { "total": 41, "detected": 9, "recall": 0.2195, "miss_rate": 0.7805 }, "R5": { "total": 36, "detected": 9, "recall": 0.25, "miss_rate": 0.75 }, "R6": { "total": 64, "detected": 6, "recall": 0.0938, "miss_rate": 0.9062 }, "R7": { "total": 37, "detected": 6, "recall": 0.1622, "miss_rate": 0.8378 }, "R8": { "total": 38, "detected": 25, "recall": 0.6579, "miss_rate": 0.3421 }, "R9": { "total": 52, "detected": 3, "recall": 0.0577, "miss_rate": 0.9423 }, "R10": { "total": 37, "detected": 12, "recall": 0.3243, "miss_rate": 0.6757 } } }, "ours_detection": { "binary_f1": 0.9967069154774972, "high_risk_recall": 0.9956140350877193, "high_risk_precision": 0.9978021978021978, "false_negative_rate": 0.004385964912280715, "level_macro_f1": 0.5150467302191439, "level_weighted_f1": 0.5173056767699116, "fine_macro_f1": 0.0, "fine_weighted_f1": 0.0, "fine_per_label_f1": [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ], "per_category_recall": { "R1": { "total": 67, "detected": 66, "recall": 0.9851, "miss_rate": 0.0149 }, "R2": { "total": 45, "detected": 44, "recall": 0.9778, "miss_rate": 0.0222 }, "R3": { "total": 39, "detected": 39, "recall": 1.0, "miss_rate": 0.0 }, "R4": { "total": 41, "detected": 41, "recall": 1.0, "miss_rate": 0.0 }, "R5": { "total": 36, "detected": 36, "recall": 1.0, "miss_rate": 0.0 }, "R6": { "total": 64, "detected": 64, "recall": 1.0, "miss_rate": 0.0 }, "R7": { "total": 37, "detected": 37, "recall": 1.0, "miss_rate": 0.0 }, "R8": { "total": 38, "detected": 38, "recall": 1.0, "miss_rate": 0.0 }, "R9": { "total": 52, "detected": 52, "recall": 1.0, "miss_rate": 0.0 }, "R10": { "total": 37, "detected": 37, "recall": 1.0, "miss_rate": 0.0 } } } }