---
# Training configuration: detector settings, agent/reward hyperparameters,
# a behavior-cloning warm-up stage, a PPO stage, and logging/output options.
#
# NOTE(review): this document was reconstructed from a single collapsed line.
# The nesting below (every section at top level) is inferred from key names
# and comment placement; confirm against the code that loads this file.

detector:
  checkpoint: "checkpoints/detector/best.pt"
  model_name: "hfl/chinese-macbert-large"
  hidden_size: 1024

agent:
  state_hidden: 256
  dropout: 0.1

# Reward-shaping weights.
reward:
  # safety gain for correct intervention (REWRITE/REJECT/CRISIS on risky)
  w1: 2.0
  # false negative penalty (PASS on high-risk)
  w2: 3.0
  # crisis bonus for R1 (self-harm/suicide)
  w3: 4.0
  # over-refusal penalty (intervention on safe content)
  w4: 1.5
  # UX cost per REJECT/CRISIS action
  w5: 0.5

# Stage 1: Behavior cloning warm-up runs on all 4 GPUs
behavior_cloning:
  enabled: true
  epochs: 5
  per_gpu_batch_size: 256  # BC is lightweight MLP training; large batch is fine
  # Written with an explicit decimal point so YAML 1.1 loaders (e.g. PyYAML)
  # resolve it as a float; a bare `1e-3` is resolved as a string there.
  lr: 1.0e-3
  mixed_precision: "bf16"

# Stage 2: PPO runs on GPU-0 only (inherently sequential env-agent loop)
ppo:
  total_timesteps: 200000
  n_rollout_steps: 2048
  n_epochs: 4
  batch_size: 256  # PPO mini-batch; large since obs vectors are small
  # Explicit decimal point for the same float-resolution reason as the
  # behavior-cloning learning rate.
  lr: 3.0e-4
  clip_eps: 0.2
  entropy_coef: 0.01
  value_coef: 0.5
  max_grad_norm: 0.5
  gamma: 0.99
  gae_lambda: 0.95

environment:
  max_turns: 20

# Preprocessing: detector inference distributed across 4 GPUs
preprocessing:
  per_gpu_batch_size: 64  # inference batch for converting dataset → RL states

logging:
  project: "CompanionGuard-RL"
  run_name: "intervention-ppo-4gpu"
  use_wandb: true

output:
  checkpoint_dir: "checkpoints/intervention"
  save_interval: 10000