Merged code repo (CompanionGuard-RL) into single project-level git. Reorganized root: docs/, reference/, experiments/, tmp/active|archives/. Gitignored: data/, checkpoints/, .venv, experiment logs, tmp/archives. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
277 lines
5.0 KiB
JSON
277 lines
5.0 KiB
JSON
{
|
|
"meta": {
|
|
"test_file": "data\\processed\\CompanionRisk-Bench\\test.jsonl",
|
|
"lang": "zh",
|
|
"n": 486
|
|
},
|
|
"L0_all_risky": {
|
|
"name": "L0_all_risky",
|
|
"accuracy": 0.7346,
|
|
"precision": 0.7346,
|
|
"recall": 1.0,
|
|
"f1": 0.847,
|
|
"fnr": 0.0,
|
|
"tp": 357,
|
|
"fp": 129,
|
|
"fn": 0,
|
|
"tn": 0
|
|
},
|
|
"L0_all_safe": {
|
|
"name": "L0_all_safe",
|
|
"accuracy": 0.2654,
|
|
"precision": 0.0,
|
|
"recall": 0.0,
|
|
"f1": 0.0,
|
|
"fnr": 1.0,
|
|
"tp": 0,
|
|
"fp": 0,
|
|
"fn": 357,
|
|
"tn": 129
|
|
},
|
|
"L1a_keyword": {
|
|
"name": "L1a_keyword",
|
|
"accuracy": 0.4198,
|
|
"precision": 0.9518,
|
|
"recall": 0.2213,
|
|
"f1": 0.3591,
|
|
"fnr": 0.7787,
|
|
"tp": 79,
|
|
"fp": 4,
|
|
"fn": 278,
|
|
"tn": 125
|
|
},
|
|
"L1a_keyword_cat_recall": {
|
|
"R1": {
|
|
"total": 31,
|
|
"detected": 3,
|
|
"recall": 0.0968,
|
|
"miss_rate": 0.9032
|
|
},
|
|
"R2": {
|
|
"total": 39,
|
|
"detected": 3,
|
|
"recall": 0.0769,
|
|
"miss_rate": 0.9231
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 10,
|
|
"recall": 0.2564,
|
|
"miss_rate": 0.7436
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 7,
|
|
"recall": 0.1707,
|
|
"miss_rate": 0.8293
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 8,
|
|
"recall": 0.2222,
|
|
"miss_rate": 0.7778
|
|
},
|
|
"R6": {
|
|
"total": 33,
|
|
"detected": 6,
|
|
"recall": 0.1818,
|
|
"miss_rate": 0.8182
|
|
},
|
|
"R7": {
|
|
"total": 32,
|
|
"detected": 4,
|
|
"recall": 0.125,
|
|
"miss_rate": 0.875
|
|
},
|
|
"R8": {
|
|
"total": 36,
|
|
"detected": 25,
|
|
"recall": 0.6944,
|
|
"miss_rate": 0.3056
|
|
},
|
|
"R9": {
|
|
"total": 33,
|
|
"detected": 3,
|
|
"recall": 0.0909,
|
|
"miss_rate": 0.9091
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 10,
|
|
"recall": 0.2703,
|
|
"miss_rate": 0.7297
|
|
}
|
|
},
|
|
"L1b_regex": {
|
|
"name": "L1b_regex",
|
|
"accuracy": 0.3025,
|
|
"precision": 1.0,
|
|
"recall": 0.0504,
|
|
"f1": 0.096,
|
|
"fnr": 0.9496,
|
|
"tp": 18,
|
|
"fp": 0,
|
|
"fn": 339,
|
|
"tn": 129
|
|
},
|
|
"L1b_regex_cat_recall": {
|
|
"R1": {
|
|
"total": 31,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R2": {
|
|
"total": 39,
|
|
"detected": 1,
|
|
"recall": 0.0256,
|
|
"miss_rate": 0.9744
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 9,
|
|
"recall": 0.2308,
|
|
"miss_rate": 0.7692
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 3,
|
|
"recall": 0.0732,
|
|
"miss_rate": 0.9268
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 1,
|
|
"recall": 0.0278,
|
|
"miss_rate": 0.9722
|
|
},
|
|
"R6": {
|
|
"total": 33,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R7": {
|
|
"total": 32,
|
|
"detected": 2,
|
|
"recall": 0.0625,
|
|
"miss_rate": 0.9375
|
|
},
|
|
"R8": {
|
|
"total": 36,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R9": {
|
|
"total": 33,
|
|
"detected": 0,
|
|
"recall": 0.0,
|
|
"miss_rate": 1.0
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 2,
|
|
"recall": 0.0541,
|
|
"miss_rate": 0.9459
|
|
}
|
|
},
|
|
"L1c_combined": {
|
|
"name": "L1c_combined",
|
|
"accuracy": 0.4486,
|
|
"precision": 0.9588,
|
|
"recall": 0.2605,
|
|
"f1": 0.4097,
|
|
"fnr": 0.7395,
|
|
"tp": 93,
|
|
"fp": 4,
|
|
"fn": 264,
|
|
"tn": 125
|
|
},
|
|
"L1c_combined_cat_recall": {
|
|
"R1": {
|
|
"total": 31,
|
|
"detected": 3,
|
|
"recall": 0.0968,
|
|
"miss_rate": 0.9032
|
|
},
|
|
"R2": {
|
|
"total": 39,
|
|
"detected": 4,
|
|
"recall": 0.1026,
|
|
"miss_rate": 0.8974
|
|
},
|
|
"R3": {
|
|
"total": 39,
|
|
"detected": 16,
|
|
"recall": 0.4103,
|
|
"miss_rate": 0.5897
|
|
},
|
|
"R4": {
|
|
"total": 41,
|
|
"detected": 9,
|
|
"recall": 0.2195,
|
|
"miss_rate": 0.7805
|
|
},
|
|
"R5": {
|
|
"total": 36,
|
|
"detected": 9,
|
|
"recall": 0.25,
|
|
"miss_rate": 0.75
|
|
},
|
|
"R6": {
|
|
"total": 33,
|
|
"detected": 6,
|
|
"recall": 0.1818,
|
|
"miss_rate": 0.8182
|
|
},
|
|
"R7": {
|
|
"total": 32,
|
|
"detected": 6,
|
|
"recall": 0.1875,
|
|
"miss_rate": 0.8125
|
|
},
|
|
"R8": {
|
|
"total": 36,
|
|
"detected": 25,
|
|
"recall": 0.6944,
|
|
"miss_rate": 0.3056
|
|
},
|
|
"R9": {
|
|
"total": 33,
|
|
"detected": 3,
|
|
"recall": 0.0909,
|
|
"miss_rate": 0.9091
|
|
},
|
|
"R10": {
|
|
"total": 37,
|
|
"detected": 12,
|
|
"recall": 0.3243,
|
|
"miss_rate": 0.6757
|
|
}
|
|
},
|
|
"intervention_Rule(l≥3→REJECT)": {
|
|
"policy": "Rule(l≥3→REJECT)",
|
|
"intervention_recall_high": 1.0,
|
|
"over_intervention_rate": 0.0,
|
|
"safety_ux_fscore": 1.0,
|
|
"crisis_precision": "N/A",
|
|
"action_distribution": {
|
|
"PASS": 233,
|
|
"REJECT": 253
|
|
}
|
|
},
|
|
"intervention_Threshold(level→action)": {
|
|
"policy": "Threshold(level→action)",
|
|
"intervention_recall_high": 1.0,
|
|
"over_intervention_rate": 0.0,
|
|
"safety_ux_fscore": 1.0,
|
|
"crisis_precision": 1.0,
|
|
"action_distribution": {
|
|
"PASS": 129,
|
|
"WARN": 104,
|
|
"REWRITE": 193,
|
|
"CRISIS": 60
|
|
}
|
|
}
|
|
} |