---
# Training configuration: detector, agent, reward weights, behavior cloning,
# PPO, environment, logging and output settings.
#
# NOTE(review): the section nesting below was reconstructed from key grouping
# (the previous layout carried no indentation). Every section is assumed to be
# a top-level mapping -- confirm against the code that loads this file.

# Detector model settings.
detector:
  checkpoint: "checkpoints/detector/best.pt"
  model_name: "hfl/chinese-macbert-large"
  hidden_size: 1024

# Agent network settings.
agent:
  state_hidden: 256
  dropout: 0.1

# Reward weights.
reward:
  w1: 2.0  # safety gain for correct intervention
  w2: 3.0  # false negative penalty
  w3: 4.0  # crisis bonus for R1
  w4: 1.5  # over-refusal penalty
  w5: 0.5  # UX cost

# Behavior-cloning settings.
behavior_cloning:
  enabled: true
  epochs: 5
  # Written with an explicit decimal point: YAML 1.1 loaders (e.g. PyYAML)
  # resolve a bare `1e-3` as a string, not a float.
  lr: 1.0e-3

# PPO hyperparameters.
ppo:
  total_timesteps: 200000
  n_rollout_steps: 2048
  n_epochs: 4
  batch_size: 64
  # Explicit decimal point for the same reason as behavior_cloning.lr:
  # a bare `3e-4` is loaded as a string by YAML 1.1 parsers.
  lr: 3.0e-4
  clip_eps: 0.2
  entropy_coef: 0.01
  value_coef: 0.5
  max_grad_norm: 0.5
  gamma: 0.99
  gae_lambda: 0.95

# Environment settings.
environment:
  n_envs: 1
  max_turns: 20

# Experiment logging settings.
logging:
  project: "CompanionGuard-RL"
  run_name: "intervention-ppo"
  use_wandb: true

# Checkpoint output settings.
output:
  checkpoint_dir: "checkpoints/intervention"
  save_interval: 10000