CompanionGuard-RL/experiments/eval_sota_shieldgemma2b.json

{
  "meta": {
    "model": "shieldgemma2b",
    "model_path": "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/shieldgemma-2b",
    "test_file": "data/processed/CompanionRisk-Bench/test.jsonl",
    "n_total": 1486,
    "n_risky": 1039,
    "threshold": 0.5
  },
  "shieldgemma2b": {
    "binary_f1": 0.0266,
    "high_risk_recall": 0.0135,
    "high_risk_precision": 1.0,
    "false_negative_rate": 0.9865,
    "level_macro_f1": null,
    "level_weighted_f1": null,
    "per_category_recall": {
      "R1": {
        "total": 136,
        "detected": 3,
        "recall": 0.0221,
        "miss_rate": 0.9779
      },
      "R2": {
        "total": 142,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      },
      "R3": {
        "total": 95,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      },
      "R4": {
        "total": 116,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      },
      "R5": {
        "total": 64,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      },
      "R6": {
        "total": 97,
        "detected": 8,
        "recall": 0.0825,
        "miss_rate": 0.9175
      },
      "R7": {
        "total": 91,
        "detected": 2,
        "recall": 0.022,
        "miss_rate": 0.978
      },
      "R8": {
        "total": 73,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      },
      "R9": {
        "total": 152,
        "detected": 1,
        "recall": 0.0066,
        "miss_rate": 0.9934
      },
      "R10": {
        "total": 73,
        "detected": 0,
        "recall": 0.0,
        "miss_rate": 1.0
      }
    },
    "note": "level metrics N/A — model outputs binary safe/unsafe only"
  }
}