feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates

- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 14:24:09 +08:00
parent 6d61a950f1
commit 52ba43f08d
55 changed files with 8239 additions and 1244 deletions
--- a/code/scripts/evaluate.py
+++ b/code/scripts/evaluate.py
@@ -146,6 +146,7 @@ def run_neural_detection(
    cfg: Dict,
    device: str,
    label_filter: str = "all",
+    ablation_mode: str = "full",
 ) -> Dict:
    """Run the neural detector on test samples, compute binary + level + category metrics."""
    model.eval()
@@ -166,6 +167,7 @@ def run_neural_detection(
            sample["persona"], sample["history"],
            sample["user_input"], sample["ai_response"],
            max_history_turns=data_cfg.get("max_history_turns", 5),
+            ablation_mode=ablation_mode,
        )

        def enc(text, max_len):
@@ -301,6 +303,9 @@ def _collect_c_primary_idx(processed_samples: List[Dict]) -> List[int]:
    return result


+_ACTION_WARN    = 1
+_ACTION_REWRITE = 2
+
 def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Dict:
    agent.eval()
    y_risk_true, l_risk_true, a_pred, a_recommend = [], [], [], []
@@ -310,9 +315,16 @@ def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Di
        with torch.no_grad():
            action, _, _, _ = agent.get_action(obs, deterministic=True)

+        action_val = action.item()
+        # Safety floor: a WARN on high-risk content (risk level >= 3) is upgraded to REWRITE.
+        # Prefer det_l_risk (detector prediction, as in deployment); fall back to the l_risk label if absent.
+        det_l = int(s.get("det_l_risk", s["l_risk"]))
+        if action_val == _ACTION_WARN and det_l >= 3:
+            action_val = _ACTION_REWRITE
+
        y_risk_true.append(int(s["y_risk"]))
        l_risk_true.append(int(s["l_risk"]))
-        a_pred.append(action.item())
+        a_pred.append(action_val)
        a_recommend.append(ACTION_NAME_TO_ID.get(s.get("a_recommend", "PASS"), 0))

    c_primary_idx = _collect_c_primary_idx(processed_samples)
@@ -431,6 +443,14 @@ def main():
                            "public=10个通用标签(R1/R2/R5-R9，人工子集可用), "
                            "companion=4个companion专属标签(R3/R4/R10)"
                        ))
+    parser.add_argument("--ablation-mode", default="full",
+                        choices=["full", "response_only", "history_r"],
+                        help=(
+                            "Module B 消融模式: "
+                            "full=全输入流（默认）, "
+                            "history_r=无Persona（History+Response）, "
+                            "response_only=仅Response"
+                        ))
    args = parser.parse_args()

    with open(args.config) as f:
@@ -523,6 +543,7 @@ def main():
    neural_m = run_neural_detection(
        detector, tokenizer, samples, cfg, device,
        label_filter=args.label_filter,
+        ablation_mode=args.ablation_mode,
    )
    print_metrics("Ours: CompanionRiskDetector", neural_m)
    all_results["ours_detection"] = neural_m