- Remove code/experiments/ → merge all eval JSONs into root experiments/ - Move code/exp.md, code/change.md → project root - Delete code/2026-05-09-研究框架.md (duplicate of docs/) - Update .gitignore: experiments/*.log (was code/experiments/*.log) - Update code/CLAUDE.md: fix all affected paths Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
335 lines
6.5 KiB
JSON
335 lines
6.5 KiB
JSON
{
|
|
"meta": {
|
|
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
|
|
"source_filter": "human",
|
|
"n_total": 605,
|
|
"n_filtered": 119,
|
|
"n_risky": 99
|
|
},
|
|
"L1a_keyword": {
|
|
"binary_f1": 0.0,
|
|
"high_risk_recall": 0.0,
|
|
"high_risk_precision": 0.0,
|
|
"false_negative_rate": 1.0,
|
|
"level_macro_f1": 0.05755395683453237,
|
|
"level_weighted_f1": 0.04836466960885073,
|
|
"level_per_class_f1": [
|
|
0.28776978417266186,
|
|
0.0,
|
|
0.0,
|
|
0.0,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 36,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 6,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R3": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R4": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R5": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 31,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 5,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R8": {
|
|
"total": 2,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 19,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
}
|
|
}
|
|
},
|
|
"L1b_regex": {
|
|
"binary_f1": 0.0,
|
|
"high_risk_recall": 0.0,
|
|
"high_risk_precision": 0.0,
|
|
"false_negative_rate": 1.0,
|
|
"level_macro_f1": 0.05755395683453237,
|
|
"level_weighted_f1": 0.04836466960885073,
|
|
"level_per_class_f1": [
|
|
0.28776978417266186,
|
|
0.0,
|
|
0.0,
|
|
0.0,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 36,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 6,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R3": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R4": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R5": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 31,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 5,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R8": {
|
|
"total": 2,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 19,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
}
|
|
}
|
|
},
|
|
"L1c_combined": {
|
|
"binary_f1": 0.0,
|
|
"high_risk_recall": 0.0,
|
|
"high_risk_precision": 0.0,
|
|
"false_negative_rate": 1.0,
|
|
"level_macro_f1": 0.05755395683453237,
|
|
"level_weighted_f1": 0.04836466960885073,
|
|
"level_per_class_f1": [
|
|
0.28776978417266186,
|
|
0.0,
|
|
0.0,
|
|
0.0,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 36,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 6,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R3": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R4": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R5": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 31,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 5,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R8": {
|
|
"total": 2,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 19,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
}
|
|
}
|
|
},
|
|
"ours_detection": {
|
|
"binary_f1": 0.9847715736040609,
|
|
"high_risk_recall": 0.9797979797979798,
|
|
"high_risk_precision": 0.9897959183673469,
|
|
"false_negative_rate": 0.02020202020202022,
|
|
"level_macro_f1": 0.3641541183069423,
|
|
"level_weighted_f1": 0.4092843419457787,
|
|
"level_per_class_f1": [
|
|
0.9302325581395349,
|
|
0.0,
|
|
0.16326530612244897,
|
|
0.36363636363636365,
|
|
0.36363636363636365
|
|
],
|
|
"fine_per_label_f1": [
|
|
0.3508771929824561,
|
|
0.0,
|
|
0.64,
|
|
0.0,
|
|
0.0,
|
|
0.0,
|
|
0.0,
|
|
0.2222222222222222,
|
|
0.375,
|
|
0.8857142857142857,
|
|
0.0,
|
|
0.0,
|
|
0.5,
|
|
0.2857142857142857
|
|
],
|
|
"fine_macro_f1": 0.2328234276166607,
|
|
"fine_weighted_f1": 0.4082668160299739,
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 36,
|
|
"detected": 35,
|
|
"recall": 0.9722,
|
|
"miss_rate": 0.0278
|
|
},
|
|
"R2": {
|
|
"total": 6,
|
|
"detected": 5,
|
|
"recall": 0.8333,
|
|
"miss_rate": 0.1667
|
|
},
|
|
"R3": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R4": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R5": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R6": {
|
|
"total": 31,
|
|
"detected": 31,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R7": {
|
|
"total": 5,
|
|
"detected": 5,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R8": {
|
|
"total": 2,
|
|
"detected": 2,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R9": {
|
|
"total": 19,
|
|
"detected": 19,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R10": {
|
|
"total": 0,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
}
|
|
}
|
|
}
|
|
} |