feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates
- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -146,6 +146,7 @@ def run_neural_detection(
|
||||
cfg: Dict,
|
||||
device: str,
|
||||
label_filter: str = "all",
|
||||
ablation_mode: str = "full",
|
||||
) -> Dict:
|
||||
"""Run the neural detector on test samples, compute binary + level + category metrics."""
|
||||
model.eval()
|
||||
@@ -166,6 +167,7 @@ def run_neural_detection(
|
||||
sample["persona"], sample["history"],
|
||||
sample["user_input"], sample["ai_response"],
|
||||
max_history_turns=data_cfg.get("max_history_turns", 5),
|
||||
ablation_mode=ablation_mode,
|
||||
)
|
||||
|
||||
def enc(text, max_len):
|
||||
@@ -301,6 +303,9 @@ def _collect_c_primary_idx(processed_samples: List[Dict]) -> List[int]:
|
||||
return result
|
||||
|
||||
|
||||
_ACTION_WARN = 1
|
||||
_ACTION_REWRITE = 2
|
||||
|
||||
def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Dict:
|
||||
agent.eval()
|
||||
y_risk_true, l_risk_true, a_pred, a_recommend = [], [], [], []
|
||||
@@ -310,9 +315,16 @@ def run_rl_intervention(agent, processed_samples: List[Dict], device: str) -> Di
|
||||
with torch.no_grad():
|
||||
action, _, _, _ = agent.get_action(obs, deterministic=True)
|
||||
|
||||
+ action_val = action.item()
|
||||
+ # Safety floor: WARN is not a real intervention on high-risk content.
|
||||
+ # Use det_l_risk (detector prediction) consistent with deployment.
|
||||
+ det_l = int(s.get("det_l_risk", s["l_risk"]))
|
||||
+ if action_val == _ACTION_WARN and det_l >= 3:
|
||||
+ action_val = _ACTION_REWRITE
|
||||
|
||||
y_risk_true.append(int(s["y_risk"]))
|
||||
l_risk_true.append(int(s["l_risk"]))
|
||||
- a_pred.append(action.item())
|
||||
+ a_pred.append(action_val)
|
||||
a_recommend.append(ACTION_NAME_TO_ID.get(s.get("a_recommend", "PASS"), 0))
|
||||
|
||||
c_primary_idx = _collect_c_primary_idx(processed_samples)
|
||||
@@ -431,6 +443,14 @@ def main():
|
||||
"public=10个通用标签(R1/R2/R5-R9,人工子集可用), "
|
||||
"companion=4个companion专属标签(R3/R4/R10)"
|
||||
))
|
||||
parser.add_argument("--ablation-mode", default="full",
|
||||
choices=["full", "response_only", "history_r"],
|
||||
help=(
|
||||
"Module B 消融模式: "
|
||||
"full=全输入流(默认), "
|
||||
"history_r=无Persona(History+Response), "
|
||||
"response_only=仅Response"
|
||||
))
|
||||
args = parser.parse_args()
|
||||
|
||||
with open(args.config) as f:
|
||||
@@ -523,6 +543,7 @@ def main():
|
||||
neural_m = run_neural_detection(
|
||||
detector, tokenizer, samples, cfg, device,
|
||||
label_filter=args.label_filter,
|
||||
ablation_mode=args.ablation_mode,
|
||||
)
|
||||
print_metrics("Ours: CompanionRiskDetector", neural_m)
|
||||
all_results["ours_detection"] = neural_m
|
||||
|
||||
Reference in New Issue
Block a user