feat: Module C v5/v6 training complete, ablations, SOTA baselines, paper updates

- Module C: BC+PPO training v5/v6 done; eval results in experiments/eval_intervention_v{5,6}.json
- Reward: v5 label-aligned constrained reward (code/src/rl/reward.py)
- Ablations: Module B (history_r, response_only, full) + Module C (wo_category_reward)
- SOTA baselines: WildGuard and ShieldGemma2b eval scripts and results
- Paper: update sections 05–08 (Module B/C description, experiments table, discussion)
- Docs: add record.md (change log), update state.md and exp.md; retire change.md
- Tools: add html-to-ppt utilities and run_shieldgemma2b.sh
- Configs: add ablation YAML configs for Module B and C
- Cleanup: remove stale reference/ PNG screenshots

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-20 14:24:09 +08:00
parent 6d61a950f1
commit 52ba43f08d
55 changed files with 8239 additions and 1244 deletions
--- a/code/configs/detector_config_abl_history_r.yaml
+++ b/code/configs/detector_config_abl_history_r.yaml
@@ -0,0 +1,51 @@
+model:  # Detector ablation config; the ablation variant is chosen by data.ablation_mode below
+  name: "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large"  # absolute, host-specific path
+  hidden_size: 1024
+  num_heads: 8
+  dropout: 0.1
+  use_lora: false
+
+data:
+  train_path: "data/processed/CompanionRisk-Bench/train.jsonl"
+  val_path:   "data/processed/CompanionRisk-Bench/dev.jsonl"
+  test_path:  "data/processed/CompanionRisk-Bench/test.jsonl"
+  max_persona_len:   128
+  max_context_len:   512
+  max_response_len:  256
+  max_history_turns: 5
+  num_workers: 4
+  ablation_mode: "history_r"   # Ablation: History+Response; persona is left empty
+
+training:
+  epochs: 10
+  per_gpu_batch_size: 16
+  gradient_accumulation_steps: 2
+  lr: 2.0e-5  # keep the decimal point: YAML 1.1 loaders (e.g. PyYAML) read a bare "2e-5" as a string
+  warmup_steps: 100
+  weight_decay: 0.01
+  gradient_clip: 1.0
+  eval_steps: 100
+  mixed_precision: "bf16"
+  seed: 42
+
+loss_weights:  # the "fine" term is weighted 2x relative to the other three terms
+  binary:  1.0
+  level:   1.0
+  primary: 1.0
+  fine:    2.0
+
+fine_training:
+  use_pos_weight: true
+  risky_only:     true
+
+evaluation:
+  binary_threshold: 0.5
+  fine_threshold:   0.4
+
+logging:
+  project:   "CompanionGuard-RL"
+  run_name:  "detector-abl-history-r"
+  use_wandb: false
+
+output:
+  checkpoint_dir: "checkpoints/detector_abl_history_r"
--- a/code/configs/detector_config_abl_response_only.yaml
+++ b/code/configs/detector_config_abl_response_only.yaml
@@ -0,0 +1,51 @@
+model:  # Detector ablation config; the ablation variant is chosen by data.ablation_mode below
+  name: "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large"  # absolute, host-specific path
+  hidden_size: 1024
+  num_heads: 8
+  dropout: 0.1
+  use_lora: false
+
+data:
+  train_path: "data/processed/CompanionRisk-Bench/train.jsonl"
+  val_path:   "data/processed/CompanionRisk-Bench/dev.jsonl"
+  test_path:  "data/processed/CompanionRisk-Bench/test.jsonl"
+  max_persona_len:   128
+  max_context_len:   512
+  max_response_len:  256
+  max_history_turns: 5
+  num_workers: 4
+  ablation_mode: "response_only"   # Ablation: Response stream only; persona and context are both left empty
+
+training:
+  epochs: 10
+  per_gpu_batch_size: 16
+  gradient_accumulation_steps: 2
+  lr: 2.0e-5  # keep the decimal point: YAML 1.1 loaders (e.g. PyYAML) read a bare "2e-5" as a string
+  warmup_steps: 100
+  weight_decay: 0.01
+  gradient_clip: 1.0
+  eval_steps: 100
+  mixed_precision: "bf16"
+  seed: 42
+
+loss_weights:  # the "fine" term is weighted 2x relative to the other three terms
+  binary:  1.0
+  level:   1.0
+  primary: 1.0
+  fine:    2.0
+
+fine_training:
+  use_pos_weight: true
+  risky_only:     true
+
+evaluation:
+  binary_threshold: 0.5
+  fine_threshold:   0.4
+
+logging:
+  project:   "CompanionGuard-RL"
+  run_name:  "detector-abl-response-only"
+  use_wandb: false
+
+output:
+  checkpoint_dir: "checkpoints/detector_abl_response_only"
--- a/code/configs/detector_config_server.yaml
+++ b/code/configs/detector_config_server.yaml
@@ -1,5 +1,5 @@
 model:
-  name: "/root/siton-data-740d234e02d749f08fe5347b0c74c49f/zsy/macbert-large"
+  name: "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large"
  hidden_size: 1024
  num_heads: 8
  dropout: 0.1
--- a/code/configs/intervention_config.yaml
+++ b/code/configs/intervention_config.yaml
@@ -1,7 +1,6 @@
 detector:
  checkpoint: "checkpoints/detector/best.pt"
-  # Server 2 path — update this when running on server 2
-  model_name: "/root/siton-data-740d234e02d749f08fe5347b0c74c49f/zsy/macbert-large"
+  model_name: "/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large"
  hidden_size: 1024

 agent:
--- a/code/configs/intervention_config_abl_wo_category.yaml
+++ b/code/configs/intervention_config_abl_wo_category.yaml
@@ -0,0 +1,56 @@
+detector:  # Intervention ablation config: category-specific reward switched off (see reward section)
+  checkpoint: 'checkpoints/detector/best.pt'
+  model_name: '/root/siton-data-2849d4ce327c4ccfb233ce33868fe7fe/zsy/macbert-large'
+  hidden_size: 1024
+
+agent:
+  state_hidden: 256
+  dropout: 0.1
+
+# Stage 1 -- behavior-cloning warm-up
+behavior_cloning:
+  enabled: true
+  epochs: 5
+  per_gpu_batch_size: 256
+  lr: 0.001
+  mixed_precision: 'bf16'
+
+# Stage 2 -- PPO, executed on GPU-0 only
+ppo:
+  total_timesteps: 200000
+  n_rollout_steps: 2048
+  n_epochs: 4
+  batch_size: 256
+  lr: 0.0003
+  clip_eps: 0.2
+  entropy_coef: 0.01
+  value_coef: 0.5
+  max_grad_norm: 0.5
+  gamma: 0.99
+  gae_lambda: 0.95
+
+environment:
+  max_turns: 20
+
+reward:
+  enable_category_reward: false  # Ablation: disable the category-specific rewards (CRISIS_R1/REJECT_R6R7/REWRITE_companion)
+  w1: 2.0
+  w2: 3.0
+  w3: 4.0
+  w4: 1.5
+  w5: 0.5
+
+evaluation:
+  binary_threshold: 0.5
+
+preprocessing:
+  per_gpu_batch_size: 64
+
+logging:
+  project: 'CompanionGuard-RL'
+  run_name: 'intervention-abl-wo-category'
+  use_wandb: false
+
+output:
+  checkpoint_dir: 'checkpoints/intervention_abl_wo_category'
+  save_interval: 10000