CompanionGuard-RL/code/experiments/baseline_results.json

{
  "meta": {
    "test_file": "data\\processed\\CompanionRisk-Bench\\test.jsonl",
    "lang": "zh",
    "n": 486
  },
  "L0_all_risky": {
    "name": "L0_all_risky",
    "accuracy": 0.7346,
    "precision": 0.7346,
    "recall": 1.0,
    "f1": 0.847,
    "fnr": 0.0,
    "tp": 357,
    "fp": 129,
    "fn": 0,
    "tn": 0
  },
  "L0_all_safe": {
    "name": "L0_all_safe",
    "accuracy": 0.2654,
    "precision": 0.0,
    "recall": 0.0,
    "f1": 0.0,
    "fnr": 1.0,
    "tp": 0,
    "fp": 0,
    "fn": 357,
    "tn": 129
  },
  "L1a_keyword": {
    "name": "L1a_keyword",
    "accuracy": 0.4198,
    "precision": 0.9518,
    "recall": 0.2213,
    "f1": 0.3591,
    "fnr": 0.7787,
    "tp": 79,
    "fp": 4,
    "fn": 278,
    "tn": 125
  },
  "L1a_keyword_cat_recall": {
    "R1": {
      "total": 31,
      "detected": 3,
      "recall": 0.0968,
      "miss_rate": 0.9032
    },
    "R2": {
      "total": 39,
      "detected": 3,
      "recall": 0.0769,
      "miss_rate": 0.9231
    },
    "R3": {
      "total": 39,
      "detected": 10,
      "recall": 0.2564,
      "miss_rate": 0.7436
    },
    "R4": {
      "total": 41,
      "detected": 7,
      "recall": 0.1707,
      "miss_rate": 0.8293
    },
    "R5": {
      "total": 36,
      "detected": 8,
      "recall": 0.2222,
      "miss_rate": 0.7778
    },
    "R6": {
      "total": 33,
      "detected": 6,
      "recall": 0.1818,
      "miss_rate": 0.8182
    },
    "R7": {
      "total": 32,
      "detected": 4,
      "recall": 0.125,
      "miss_rate": 0.875
    },
    "R8": {
      "total": 36,
      "detected": 25,
      "recall": 0.6944,
      "miss_rate": 0.3056
    },
    "R9": {
      "total": 33,
      "detected": 3,
      "recall": 0.0909,
      "miss_rate": 0.9091
    },
    "R10": {
      "total": 37,
      "detected": 10,
      "recall": 0.2703,
      "miss_rate": 0.7297
    }
  },
  "L1b_regex": {
    "name": "L1b_regex",
    "accuracy": 0.3025,
    "precision": 1.0,
    "recall": 0.0504,
    "f1": 0.096,
    "fnr": 0.9496,
    "tp": 18,
    "fp": 0,
    "fn": 339,
    "tn": 129
  },
  "L1b_regex_cat_recall": {
    "R1": {
      "total": 31,
      "detected": 0,
      "recall": 0.0,
      "miss_rate": 1.0
    },
    "R2": {
      "total": 39,
      "detected": 1,
      "recall": 0.0256,
      "miss_rate": 0.9744
    },
    "R3": {
      "total": 39,
      "detected": 9,
      "recall": 0.2308,
      "miss_rate": 0.7692
    },
    "R4": {
      "total": 41,
      "detected": 3,
      "recall": 0.0732,
      "miss_rate": 0.9268
    },
    "R5": {
      "total": 36,
      "detected": 1,
      "recall": 0.0278,
      "miss_rate": 0.9722
    },
    "R6": {
      "total": 33,
      "detected": 0,
      "recall": 0.0,
      "miss_rate": 1.0
    },
    "R7": {
      "total": 32,
      "detected": 2,
      "recall": 0.0625,
      "miss_rate": 0.9375
    },
    "R8": {
      "total": 36,
      "detected": 0,
      "recall": 0.0,
      "miss_rate": 1.0
    },
    "R9": {
      "total": 33,
      "detected": 0,
      "recall": 0.0,
      "miss_rate": 1.0
    },
    "R10": {
      "total": 37,
      "detected": 2,
      "recall": 0.0541,
      "miss_rate": 0.9459
    }
  },
  "L1c_combined": {
    "name": "L1c_combined",
    "accuracy": 0.4486,
    "precision": 0.9588,
    "recall": 0.2605,
    "f1": 0.4097,
    "fnr": 0.7395,
    "tp": 93,
    "fp": 4,
    "fn": 264,
    "tn": 125
  },
  "L1c_combined_cat_recall": {
    "R1": {
      "total": 31,
      "detected": 3,
      "recall": 0.0968,
      "miss_rate": 0.9032
    },
    "R2": {
      "total": 39,
      "detected": 4,
      "recall": 0.1026,
      "miss_rate": 0.8974
    },
    "R3": {
      "total": 39,
      "detected": 16,
      "recall": 0.4103,
      "miss_rate": 0.5897
    },
    "R4": {
      "total": 41,
      "detected": 9,
      "recall": 0.2195,
      "miss_rate": 0.7805
    },
    "R5": {
      "total": 36,
      "detected": 9,
      "recall": 0.25,
      "miss_rate": 0.75
    },
    "R6": {
      "total": 33,
      "detected": 6,
      "recall": 0.1818,
      "miss_rate": 0.8182
    },
    "R7": {
      "total": 32,
      "detected": 6,
      "recall": 0.1875,
      "miss_rate": 0.8125
    },
    "R8": {
      "total": 36,
      "detected": 25,
      "recall": 0.6944,
      "miss_rate": 0.3056
    },
    "R9": {
      "total": 33,
      "detected": 3,
      "recall": 0.0909,
      "miss_rate": 0.9091
    },
    "R10": {
      "total": 37,
      "detected": 12,
      "recall": 0.3243,
      "miss_rate": 0.6757
    }
  },
  "intervention_Rule(l≥3→REJECT)": {
    "policy": "Rule(l≥3→REJECT)",
    "intervention_recall_high": 1.0,
    "over_intervention_rate": 0.0,
    "safety_ux_fscore": 1.0,
    "crisis_precision": "N/A",
    "action_distribution": {
      "PASS": 233,
      "REJECT": 253
    }
  },
  "intervention_Threshold(level→action)": {
    "policy": "Threshold(level→action)",
    "intervention_recall_high": 1.0,
    "over_intervention_rate": 0.0,
    "safety_ux_fscore": 1.0,
    "crisis_precision": 1.0,
    "action_distribution": {
      "PASS": 129,
      "WARN": 104,
      "REWRITE": 193,
      "CRISIS": 60
    }
  }
}