feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates

- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-20 14:24:09 +08:00
parent 6d61a950f1
commit 52ba43f08d
55 changed files with 8239 additions and 1244 deletions

View File

@@ -146,6 +146,7 @@ def run_neural_detection(
cfg: Dict,
device: str,
label_filter: str = "all",
ablation_mode: str = "full",
) -> Dict:
"""Run the neural detector on test samples, compute binary + level + category metrics."""
model.eval()
@@ -166,6 +167,7 @@ def run_neural_detection(
sample["persona"], sample["history"],
sample["user_input"], sample["ai_response"],
max_history_turns=data_cfg.get("max_history_turns", 5),
ablation_mode=ablation_mode,
)
def enc(text, max_len):
@@ -301,6 +303,9 @@ def _collect_c_primary_idx(processed_samples: List[Dict]) -> List[int]:
return result
_ACTION_WARN = 1
_ACTION_REWRITE = 2
def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Dict:
agent.eval()
y_risk_true, l_risk_true, a_pred, a_recommend = [], [], [], []
@@ -310,9 +315,16 @@ def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Di
with torch.no_grad():
action, _, _, _ = agent.get_action(obs, deterministic=True)
action_val = action.item()
# Safety floor: WARN is not a real intervention on high-risk content.
# Use det_l_risk (detector prediction) consistent with deployment.
det_l = int(s.get("det_l_risk", s["l_risk"]))
if action_val == _ACTION_WARN and det_l >= 3:
action_val = _ACTION_REWRITE
y_risk_true.append(int(s["y_risk"]))
l_risk_true.append(int(s["l_risk"]))
- a_pred.append(action.item())
+ a_pred.append(action_val)
a_recommend.append(ACTION_NAME_TO_ID.get(s.get("a_recommend", "PASS"), 0))
c_primary_idx = _collect_c_primary_idx(processed_samples)
@@ -431,6 +443,14 @@ def main():
"public=10个通用标签(R1/R2/R5-R9人工子集可用), "
"companion=4个companion专属标签(R3/R4/R10)"
))
parser.add_argument("--ablation-mode", default="full",
choices=["full", "response_only", "history_r"],
help=(
"Module B 消融模式: "
"full=全输入流(默认), "
"history_r=无PersonaHistory+Response, "
"response_only=仅Response"
))
args = parser.parse_args()
with open(args.config) as f:
@@ -523,6 +543,7 @@ def main():
neural_m = run_neural_detection(
detector, tokenizer, samples, cfg, device,
label_filter=args.label_filter,
ablation_mode=args.ablation_mode,
)
print_metrics("Ours: CompanionRiskDetector", neural_m)
all_results["ours_detection"] = neural_m