{ "meta": { "test_file": "data/processed/CompanionRisk-Bench/test.jsonl", "source_filter": "all", "label_filter": "all", "n_total": 1486, "n_filtered": 1486, "n_risky": 1039 }, "L1a_keyword": { "binary_f1": 0.26436781609195403, "high_risk_recall": 0.15495668912415783, "high_risk_precision": 0.8994413407821229, "false_negative_rate": 0.8450433108758422, "level_macro_f1": 0.10427720349098286, "level_weighted_f1": 0.09799538109505529, "level_per_class_f1": [ 0.2979274611398964, 0.0, 0.1934156378600823, 0.030042918454935622, 0.0 ], "per_category_recall": { "R1": { "total": 136, "detected": 10, "recall": 0.0735, "miss_rate": 0.9265 }, "R2": { "total": 142, "detected": 16, "recall": 0.1127, "miss_rate": 0.8873 }, "R3": { "total": 95, "detected": 17, "recall": 0.1789, "miss_rate": 0.8211 }, "R4": { "total": 116, "detected": 22, "recall": 0.1897, "miss_rate": 0.8103 }, "R5": { "total": 64, "detected": 9, "recall": 0.1406, "miss_rate": 0.8594 }, "R6": { "total": 97, "detected": 11, "recall": 0.1134, "miss_rate": 0.8866 }, "R7": { "total": 91, "detected": 6, "recall": 0.0659, "miss_rate": 0.9341 }, "R8": { "total": 73, "detected": 49, "recall": 0.6712, "miss_rate": 0.3288 }, "R9": { "total": 152, "detected": 11, "recall": 0.0724, "miss_rate": 0.9276 }, "R10": { "total": 73, "detected": 10, "recall": 0.137, "miss_rate": 0.863 } } }, "L1b_regex": { "binary_f1": 0.06697674418604652, "high_risk_recall": 0.03464870067372473, "high_risk_precision": 1.0, "false_negative_rate": 0.9653512993262753, "level_macro_f1": 0.07297879241072718, "level_weighted_f1": 0.06312377515343655, "level_per_class_f1": [ 0.2809721398933017, 0.0, 0.07954545454545454, 0.00437636761487965, 0.0 ], "per_category_recall": { "R1": { "total": 136, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R2": { "total": 142, "detected": 1, "recall": 0.007, "miss_rate": 0.993 }, "R3": { "total": 95, "detected": 19, "recall": 0.2, "miss_rate": 0.8 }, "R4": { "total": 116, "detected": 9, "recall": 0.0776, "miss_rate": 
0.9224 }, "R5": { "total": 64, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R6": { "total": 97, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R7": { "total": 91, "detected": 3, "recall": 0.033, "miss_rate": 0.967 }, "R8": { "total": 73, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R9": { "total": 152, "detected": 0, "recall": 0.0, "miss_rate": 1.0 }, "R10": { "total": 73, "detected": 4, "recall": 0.0548, "miss_rate": 0.9452 } } }, "L1c_combined": { "binary_f1": 0.3060897435897436, "high_risk_recall": 0.18383060635226178, "high_risk_precision": 0.9138755980861244, "false_negative_rate": 0.8161693936477382, "level_macro_f1": 0.11189027535274536, "level_weighted_f1": 0.10619241328971442, "level_per_class_f1": [ 0.3038309114927345, 0.0, 0.22135922330097088, 0.034261241970021415, 0.0 ], "per_category_recall": { "R1": { "total": 136, "detected": 10, "recall": 0.0735, "miss_rate": 0.9265 }, "R2": { "total": 142, "detected": 17, "recall": 0.1197, "miss_rate": 0.8803 }, "R3": { "total": 95, "detected": 32, "recall": 0.3368, "miss_rate": 0.6632 }, "R4": { "total": 116, "detected": 29, "recall": 0.25, "miss_rate": 0.75 }, "R5": { "total": 64, "detected": 9, "recall": 0.1406, "miss_rate": 0.8594 }, "R6": { "total": 97, "detected": 11, "recall": 0.1134, "miss_rate": 0.8866 }, "R7": { "total": 91, "detected": 9, "recall": 0.0989, "miss_rate": 0.9011 }, "R8": { "total": 73, "detected": 49, "recall": 0.6712, "miss_rate": 0.3288 }, "R9": { "total": 152, "detected": 11, "recall": 0.0724, "miss_rate": 0.9276 }, "R10": { "total": 73, "detected": 14, "recall": 0.1918, "miss_rate": 0.8082 } } }, "ours_detection": { "binary_f1": 0.9995189995189995, "high_risk_recall": 1.0, "high_risk_precision": 0.9990384615384615, "false_negative_rate": 0.0, "level_macro_f1": 0.5495554176357882, "level_weighted_f1": 0.5584578220374772, "level_per_class_f1": [ 0.37540453074433655, 0.6351931330472103, 0.46393762183235865, 0.6400759734093068, 0.6331658291457286 ], "fine_per_label_f1": [ 
0.6844262295081968, 0.46567164179104475, 0.697986577181208, 0.40233236151603496, 0.585, 0.3559322033898305, 0.38322211630123926, 0.3374578177727784, 0.531810766721044, 0.39436619718309857, 0.2691029900332226, 0.4410480349344978, 0.32142857142857145, 0.615916955017301 ], "fine_macro_f1": 0.46326446162700485, "fine_weighted_f1": 0.4915026862223374, "per_category_recall": { "R1": { "total": 136, "detected": 136, "recall": 1.0, "miss_rate": 0.0 }, "R2": { "total": 142, "detected": 142, "recall": 1.0, "miss_rate": 0.0 }, "R3": { "total": 95, "detected": 95, "recall": 1.0, "miss_rate": 0.0 }, "R4": { "total": 116, "detected": 116, "recall": 1.0, "miss_rate": 0.0 }, "R5": { "total": 64, "detected": 64, "recall": 1.0, "miss_rate": 0.0 }, "R6": { "total": 97, "detected": 97, "recall": 1.0, "miss_rate": 0.0 }, "R7": { "total": 91, "detected": 91, "recall": 1.0, "miss_rate": 0.0 }, "R8": { "total": 73, "detected": 73, "recall": 1.0, "miss_rate": 0.0 }, "R9": { "total": 152, "detected": 152, "recall": 1.0, "miss_rate": 0.0 }, "R10": { "total": 73, "detected": 73, "recall": 1.0, "miss_rate": 0.0 } }, "label_filter": "all" }, "baseline_rule": { "intervention_recall_high": 1.0, "over_intervention_rate": 0.0, "action_distribution": [ 0.5612382234185733, 0.0, 0.0, 0.4387617765814266, 0.0 ], "crisis_precision": null, "safety_ux_fscore": 1.0 }, "baseline_threshold": { "intervention_recall_high": 1.0, "over_intervention_rate": 0.0, "action_distribution": [ 0.34791386271870794, 0.2133243606998654, 0.30686406460296095, 0.0, 0.13189771197846567 ], "crisis_precision": 1.0, "safety_ux_fscore": 1.0 }, "ours_intervention": { "intervention_recall_high": 1.0, "over_intervention_rate": 0.0, "action_distribution": [ 0.3001345895020188, 0.0033647375504710633, 0.5834454912516823, 0.0, 0.11305518169582772 ], "crisis_precision": 0.47619047619047616, "safety_ux_fscore": 1.0 } }