Files
CompanionGuard-RL/code/experiments/eval_intervention_v2.json
zhangsiyuan bd1f51c496 chore: initial commit — unified project repo
Merged code repo (CompanionGuard-RL) into single project-level git.
Reorganized root: docs/, reference/, experiments/, tmp/active|archives/.
Gitignored: data/, checkpoints/, .venv, experiment logs, tmp/archives.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-14 11:28:42 +08:00

533 lines
11 KiB
JSON

{
"meta": {
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
"source_filter": "all",
"label_filter": "all",
"n_total": 1486,
"n_filtered": 1486,
"n_risky": 1039
},
"L1a_keyword": {
"binary_f1": 0.26436781609195403,
"high_risk_recall": 0.15495668912415783,
"high_risk_precision": 0.8994413407821229,
"false_negative_rate": 0.8450433108758422,
"level_macro_f1": 0.10427720349098286,
"level_weighted_f1": 0.09799538109505529,
"level_per_class_f1": [
0.2979274611398964,
0.0,
0.1934156378600823,
0.030042918454935622,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 10,
"recall": 0.0735,
"miss_rate": 0.9265
},
"R2": {
"total": 142,
"detected": 16,
"recall": 0.1127,
"miss_rate": 0.8873
},
"R3": {
"total": 95,
"detected": 17,
"recall": 0.1789,
"miss_rate": 0.8211
},
"R4": {
"total": 116,
"detected": 22,
"recall": 0.1897,
"miss_rate": 0.8103
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 97,
"detected": 11,
"recall": 0.1134,
"miss_rate": 0.8866
},
"R7": {
"total": 91,
"detected": 6,
"recall": 0.0659,
"miss_rate": 0.9341
},
"R8": {
"total": 73,
"detected": 49,
"recall": 0.6712,
"miss_rate": 0.3288
},
"R9": {
"total": 152,
"detected": 11,
"recall": 0.0724,
"miss_rate": 0.9276
},
"R10": {
"total": 73,
"detected": 10,
"recall": 0.137,
"miss_rate": 0.863
}
}
},
"L1b_regex": {
"binary_f1": 0.06697674418604652,
"high_risk_recall": 0.03464870067372473,
"high_risk_precision": 1.0,
"false_negative_rate": 0.9653512993262753,
"level_macro_f1": 0.07297879241072718,
"level_weighted_f1": 0.06312377515343655,
"level_per_class_f1": [
0.2809721398933017,
0.0,
0.07954545454545454,
0.00437636761487965,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R2": {
"total": 142,
"detected": 1,
"recall": 0.007,
"miss_rate": 0.993
},
"R3": {
"total": 95,
"detected": 19,
"recall": 0.2,
"miss_rate": 0.8
},
"R4": {
"total": 116,
"detected": 9,
"recall": 0.0776,
"miss_rate": 0.9224
},
"R5": {
"total": 64,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R6": {
"total": 97,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R7": {
"total": 91,
"detected": 3,
"recall": 0.033,
"miss_rate": 0.967
},
"R8": {
"total": 73,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R9": {
"total": 152,
"detected": 0,
"recall": 0.0,
"miss_rate": 1.0
},
"R10": {
"total": 73,
"detected": 4,
"recall": 0.0548,
"miss_rate": 0.9452
}
}
},
"L1c_combined": {
"binary_f1": 0.3060897435897436,
"high_risk_recall": 0.18383060635226178,
"high_risk_precision": 0.9138755980861244,
"false_negative_rate": 0.8161693936477382,
"level_macro_f1": 0.11189027535274536,
"level_weighted_f1": 0.10619241328971442,
"level_per_class_f1": [
0.3038309114927345,
0.0,
0.22135922330097088,
0.034261241970021415,
0.0
],
"per_category_recall": {
"R1": {
"total": 136,
"detected": 10,
"recall": 0.0735,
"miss_rate": 0.9265
},
"R2": {
"total": 142,
"detected": 17,
"recall": 0.1197,
"miss_rate": 0.8803
},
"R3": {
"total": 95,
"detected": 32,
"recall": 0.3368,
"miss_rate": 0.6632
},
"R4": {
"total": 116,
"detected": 29,
"recall": 0.25,
"miss_rate": 0.75
},
"R5": {
"total": 64,
"detected": 9,
"recall": 0.1406,
"miss_rate": 0.8594
},
"R6": {
"total": 97,
"detected": 11,
"recall": 0.1134,
"miss_rate": 0.8866
},
"R7": {
"total": 91,
"detected": 9,
"recall": 0.0989,
"miss_rate": 0.9011
},
"R8": {
"total": 73,
"detected": 49,
"recall": 0.6712,
"miss_rate": 0.3288
},
"R9": {
"total": 152,
"detected": 11,
"recall": 0.0724,
"miss_rate": 0.9276
},
"R10": {
"total": 73,
"detected": 14,
"recall": 0.1918,
"miss_rate": 0.8082
}
}
},
"ours_detection": {
"binary_f1": 0.9995189995189995,
"high_risk_recall": 1.0,
"high_risk_precision": 0.9990384615384615,
"false_negative_rate": 0.0,
"level_macro_f1": 0.5495554176357882,
"level_weighted_f1": 0.5584578220374772,
"level_per_class_f1": [
0.37540453074433655,
0.6351931330472103,
0.46393762183235865,
0.6400759734093068,
0.6331658291457286
],
"fine_per_label_f1": [
0.6844262295081968,
0.46567164179104475,
0.697986577181208,
0.40233236151603496,
0.585,
0.3559322033898305,
0.38322211630123926,
0.3374578177727784,
0.531810766721044,
0.39436619718309857,
0.2691029900332226,
0.4410480349344978,
0.32142857142857145,
0.615916955017301
],
"fine_macro_f1": 0.46326446162700485,
"fine_weighted_f1": 0.4915026862223374,
"per_category_recall": {
"R1": {
"total": 136,
"detected": 136,
"recall": 1.0,
"miss_rate": 0.0
},
"R2": {
"total": 142,
"detected": 142,
"recall": 1.0,
"miss_rate": 0.0
},
"R3": {
"total": 95,
"detected": 95,
"recall": 1.0,
"miss_rate": 0.0
},
"R4": {
"total": 116,
"detected": 116,
"recall": 1.0,
"miss_rate": 0.0
},
"R5": {
"total": 64,
"detected": 64,
"recall": 1.0,
"miss_rate": 0.0
},
"R6": {
"total": 97,
"detected": 97,
"recall": 1.0,
"miss_rate": 0.0
},
"R7": {
"total": 91,
"detected": 91,
"recall": 1.0,
"miss_rate": 0.0
},
"R8": {
"total": 73,
"detected": 73,
"recall": 1.0,
"miss_rate": 0.0
},
"R9": {
"total": 152,
"detected": 152,
"recall": 1.0,
"miss_rate": 0.0
},
"R10": {
"total": 73,
"detected": 73,
"recall": 1.0,
"miss_rate": 0.0
}
},
"label_filter": "all"
},
"baseline_rule": {
"intervention_recall_high": 0.9079754601226994,
"over_intervention_rate": 0.0,
"action_distribution": [
0.46231493943472407,
0.0,
0.0,
0.5376850605652759,
0.0
],
"per_level_action_dist": {
"L0_Safe": {
"n": 237,
"action_dist": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
"L1_Mild": {
"n": 280,
"action_dist": [
0.9178571428571428,
0.0,
0.0,
0.08214285714285714,
0.0
]
},
"L2_Moderate": {
"n": 317,
"action_dist": [
0.4195583596214511,
0.0,
0.0,
0.580441640378549,
0.0
]
},
"L3_High": {
"n": 456,
"action_dist": [
0.11403508771929824,
0.0,
0.0,
0.8859649122807017,
0.0
]
},
"L4_Critical": {
"n": 196,
"action_dist": [
0.04081632653061224,
0.0,
0.0,
0.9591836734693877,
0.0
]
}
},
"crisis_precision": null,
"safety_ux_fscore": 0.9517684887459806
},
"baseline_threshold": {
"intervention_recall_high": 0.9079754601226994,
"over_intervention_rate": 0.0,
"action_distribution": [
0.3304172274562584,
0.13189771197846567,
0.40174966352624497,
0.0,
0.13593539703903096
],
"per_level_action_dist": {
"L0_Safe": {
"n": 237,
"action_dist": [
1.0,
0.0,
0.0,
0.0,
0.0
]
},
"L1_Mild": {
"n": 280,
"action_dist": [
0.8428571428571429,
0.075,
0.08214285714285714,
0.0,
0.0
]
},
"L2_Moderate": {
"n": 317,
"action_dist": [
0.04416403785488959,
0.3753943217665615,
0.5520504731861199,
0.0,
0.028391167192429023
]
},
"L3_High": {
"n": 456,
"action_dist": [
0.008771929824561403,
0.10526315789473684,
0.7390350877192983,
0.0,
0.14692982456140352
]
},
"L4_Critical": {
"n": 196,
"action_dist": [
0.0,
0.04081632653061224,
0.3163265306122449,
0.0,
0.6428571428571429
]
}
},
"crisis_precision": 0.6237623762376238,
"safety_ux_fscore": 0.9517684887459806
},
"ours_intervention": {
"intervention_recall_high": 1.0,
"over_intervention_rate": 0.0,
"action_distribution": [
0.29878869448183043,
0.0033647375504710633,
0.5847913862718708,
0.0,
0.11305518169582772
],
"per_level_action_dist": {
"L0_Safe": {
"n": 237,
"action_dist": [
0.9831223628691983,
0.016877637130801686,
0.0,
0.0,
0.0
]
},
"L1_Mild": {
"n": 280,
"action_dist": [
0.7535714285714286,
0.0035714285714285713,
0.21785714285714286,
0.0,
0.025
]
},
"L2_Moderate": {
"n": 317,
"action_dist": [
0.0,
0.0,
0.9148264984227129,
0.0,
0.08517350157728706
]
},
"L3_High": {
"n": 456,
"action_dist": [
0.0,
0.0,
0.8793859649122807,
0.0,
0.1206140350877193
]
},
"L4_Critical": {
"n": 196,
"action_dist": [
0.0,
0.0,
0.5969387755102041,
0.0,
0.4030612244897959
]
}
},
"action_accuracy": 0.5868102288021534,
"crisis_precision": 0.47023809523809523,
"safety_ux_fscore": 1.0
}
}