feat: initial CompanionGuard-RL framework

Two-module pipeline for AI companion safety:
- Module B: context-aware risk detector with CrossAttention fusion
- Module C: PPO-based adaptive intervention policy

Includes CompanionRisk Taxonomy (10 primary + 14 fine-grained labels), dataset generation/annotation pipeline, training scripts, and eval suite.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 17:21:11 +08:00
commit 7d4345c29d
29 changed files with 3317 additions and 0 deletions
--- a/configs/intervention_config.yaml
+++ b/configs/intervention_config.yaml
@@ -0,0 +1,46 @@
+detector:  # settings for the risk-detector model
+  checkpoint: 'checkpoints/detector/best.pt'  # path to saved detector weights
+  model_name: 'hfl/chinese-macbert-large'  # encoder identifier
+  hidden_size: 1024  # presumably the encoder's hidden width; confirm it matches model_name
+
+agent:  # settings for the intervention agent network
+  state_hidden: 256  # presumably the hidden width of the state encoder -- TODO confirm
+  dropout: 0.1  # presumably a dropout probability -- TODO confirm
+
+reward:  # presumably per-term reward weights; how they combine is not shown in this file
+  w1: 2.0   # safety gain for correct intervention
+  w2: 3.0   # false negative penalty
+  w3: 4.0   # crisis bonus for R1 (R1: presumably a risk-taxonomy label -- confirm)
+  w4: 1.5   # over-refusal penalty
+  w5: 0.5   # UX cost
+
+behavior_cloning:  # presumably a warm-start stage run before PPO -- confirm
+  enabled: true
+  epochs: 5
+  lr: 1.0e-3  # keep the mantissa dot: YAML 1.1 loaders (PyYAML) read `1e-3` as a string
+
+ppo:  # PPO hyperparameters
+  total_timesteps: 200000  # NOTE(review): not a multiple of n_rollout_steps (~97.66 rollouts)
+  n_rollout_steps: 2048
+  n_epochs: 4
+  batch_size: 64  # divides n_rollout_steps evenly (2048 / 64 = 32)
+  lr: 3.0e-4  # keep the mantissa dot: YAML 1.1 loaders (PyYAML) read `3e-4` as a string
+  clip_eps: 0.2
+  entropy_coef: 0.01
+  value_coef: 0.5
+  max_grad_norm: 0.5
+  gamma: 0.99
+  gae_lambda: 0.95
+
+environment:  # simulation environment settings
+  n_envs: 1  # presumably the number of parallel environments -- TODO confirm
+  max_turns: 20  # presumably the per-episode dialogue turn cap -- TODO confirm
+
+logging:  # experiment-tracking settings
+  project: 'CompanionGuard-RL'
+  run_name: 'intervention-ppo'
+  use_wandb: true  # presumably toggles Weights & Biases logging -- confirm against the trainer
+
+output:  # where and how often intervention-policy checkpoints are written
+  checkpoint_dir: 'checkpoints/intervention'
+  save_interval: 10000  # NOTE(review): not a multiple of the 2048-step rollout; confirm save check