Merged code repo (CompanionGuard-RL) into single project-level git. Reorganized root: docs/, reference/, experiments/, tmp/active|archives/. Gitignored: data/, checkpoints/, .venv, experiment logs, tmp/archives. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
335 lines
7.1 KiB
JSON
335 lines
7.1 KiB
JSON
{
|
|
"meta": {
|
|
"test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
|
|
"source_filter": "all",
|
|
"n_total": 605,
|
|
"n_filtered": 605,
|
|
"n_risky": 456
|
|
},
|
|
"L1a_keyword": {
|
|
"binary_f1": 0.29313543599257885,
|
|
"high_risk_recall": 0.17324561403508773,
|
|
"high_risk_precision": 0.9518072289156626,
|
|
"false_negative_rate": 0.8267543859649122,
|
|
"level_macro_f1": 0.09819557155678502,
|
|
"level_weighted_f1": 0.08825982748460577,
|
|
"level_per_class_f1": [
|
|
0.2786885245901639,
|
|
0.0,
|
|
0.1951219512195122,
|
|
0.017167381974248927,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 67,
|
|
"detected": 3,
|
|
"recall": 0.0448,
|
|
"miss_rate": 0.9552
|
|
},
|
|
"R2": {
|
|
"total": 45,
|
|
"detected": 3,
|
|
"recall": 0.0667,
|
|
"miss_rate": 0.9333
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 10,
|
|
"recall": 0.2564,
|
|
"miss_rate": 0.7436
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 7,
|
|
"recall": 0.1707,
|
|
"miss_rate": 0.8293
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 8,
|
|
"recall": 0.2222,
|
|
"miss_rate": 0.7778
|
|
},
|
|
"R6": {
|
|
"total": 64,
|
|
"detected": 6,
|
|
"recall": 0.0938,
|
|
"miss_rate": 0.9062
|
|
},
|
|
"R7": {
|
|
"total": 37,
|
|
"detected": 4,
|
|
"recall": 0.1081,
|
|
"miss_rate": 0.8919
|
|
},
|
|
"R8": {
|
|
"total": 38,
|
|
"detected": 25,
|
|
"recall": 0.6579,
|
|
"miss_rate": 0.3421
|
|
},
|
|
"R9": {
|
|
"total": 52,
|
|
"detected": 3,
|
|
"recall": 0.0577,
|
|
"miss_rate": 0.9423
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 10,
|
|
"recall": 0.2703,
|
|
"miss_rate": 0.7297
|
|
}
|
|
}
|
|
},
|
|
"L1b_regex": {
|
|
"binary_f1": 0.0759493670886076,
|
|
"high_risk_recall": 0.039473684210526314,
|
|
"high_risk_precision": 1.0,
|
|
"false_negative_rate": 0.9605263157894737,
|
|
"level_macro_f1": 0.07132623033992896,
|
|
"level_weighted_f1": 0.058213483946983315,
|
|
"level_per_class_f1": [
|
|
0.2607407407407407,
|
|
0.0,
|
|
0.0958904109589041,
|
|
0.0,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 67,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 45,
|
|
"detected": 1,
|
|
"recall": 0.0222,
|
|
"miss_rate": 0.9778
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 9,
|
|
"recall": 0.2308,
|
|
"miss_rate": 0.7692
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 3,
|
|
"recall": 0.0732,
|
|
"miss_rate": 0.9268
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 1,
|
|
"recall": 0.0278,
|
|
"miss_rate": 0.9722
|
|
},
|
|
"R6": {
|
|
"total": 64,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 37,
|
|
"detected": 2,
|
|
"recall": 0.0541,
|
|
"miss_rate": 0.9459
|
|
},
|
|
"R8": {
|
|
"total": 38,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 52,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 2,
|
|
"recall": 0.0541,
|
|
"miss_rate": 0.9459
|
|
}
|
|
}
|
|
},
|
|
"L1c_combined": {
|
|
"binary_f1": 0.33634719710669075,
|
|
"high_risk_recall": 0.20394736842105263,
|
|
"high_risk_precision": 0.9587628865979382,
|
|
"false_negative_rate": 0.7960526315789473,
|
|
"level_macro_f1": 0.10979552475377227,
|
|
"level_weighted_f1": 0.1000980341896042,
|
|
"level_per_class_f1": [
|
|
0.28523489932885904,
|
|
0.0,
|
|
0.2465753424657534,
|
|
0.017167381974248927,
|
|
0.0
|
|
],
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 67,
|
|
"detected": 3,
|
|
"recall": 0.0448,
|
|
"miss_rate": 0.9552
|
|
},
|
|
"R2": {
|
|
"total": 45,
|
|
"detected": 4,
|
|
"recall": 0.0889,
|
|
"miss_rate": 0.9111
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 16,
|
|
"recall": 0.4103,
|
|
"miss_rate": 0.5897
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 9,
|
|
"recall": 0.2195,
|
|
"miss_rate": 0.7805
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 9,
|
|
"recall": 0.25,
|
|
"miss_rate": 0.75
|
|
},
|
|
"R6": {
|
|
"total": 64,
|
|
"detected": 6,
|
|
"recall": 0.0938,
|
|
"miss_rate": 0.9062
|
|
},
|
|
"R7": {
|
|
"total": 37,
|
|
"detected": 6,
|
|
"recall": 0.1622,
|
|
"miss_rate": 0.8378
|
|
},
|
|
"R8": {
|
|
"total": 38,
|
|
"detected": 25,
|
|
"recall": 0.6579,
|
|
"miss_rate": 0.3421
|
|
},
|
|
"R9": {
|
|
"total": 52,
|
|
"detected": 3,
|
|
"recall": 0.0577,
|
|
"miss_rate": 0.9423
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 12,
|
|
"recall": 0.3243,
|
|
"miss_rate": 0.6757
|
|
}
|
|
}
|
|
},
|
|
"ours_detection": {
|
|
"binary_f1": 0.9967069154774972,
|
|
"high_risk_recall": 0.9956140350877193,
|
|
"high_risk_precision": 0.9978021978021978,
|
|
"false_negative_rate": 0.004385964912280715,
|
|
"level_macro_f1": 0.5150467302191439,
|
|
"level_weighted_f1": 0.5173056767699116,
|
|
"level_per_class_f1": [
|
|
0.632183908045977,
|
|
0.5076923076923077,
|
|
0.3861003861003861,
|
|
0.5627705627705628,
|
|
0.4864864864864865
|
|
],
|
|
"fine_per_label_f1": [
|
|
0.6407766990291263,
|
|
0.46464646464646464,
|
|
0.734982332155477,
|
|
0.0,
|
|
0.7407407407407407,
|
|
0.7676767676767676,
|
|
0.6013986013986014,
|
|
0.4864864864864865,
|
|
0.6161616161616161,
|
|
0.6875,
|
|
0.24,
|
|
0.38961038961038963,
|
|
0.8641975308641975,
|
|
0.7777777777777778
|
|
],
|
|
"fine_macro_f1": 0.5722825290391176,
|
|
"fine_weighted_f1": 0.622073826302884,
|
|
"per_category_recall": {
|
|
"R1": {
|
|
"total": 67,
|
|
"detected": 66,
|
|
"recall": 0.9851,
|
|
"miss_rate": 0.0149
|
|
},
|
|
"R2": {
|
|
"total": 45,
|
|
"detected": 44,
|
|
"recall": 0.9778,
|
|
"miss_rate": 0.0222
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 39,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 41,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 36,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R6": {
|
|
"total": 64,
|
|
"detected": 64,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R7": {
|
|
"total": 37,
|
|
"detected": 37,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R8": {
|
|
"total": 38,
|
|
"detected": 38,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R9": {
|
|
"total": 52,
|
|
"detected": 52,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 37,
|
|
"recall": 1.0,
|
|
"miss_rate": 0.0
|
|
}
|
|
}
|
|
}
|
|
} |