# Detector settings: checkpoint location and encoder description.
detector:
  # Checkpoint file for the detector.
  checkpoint: "checkpoints/detector/best.pt"
  # Encoder identifier (looks like a Hugging Face hub id — confirm).
  model_name: "hfl/chinese-macbert-large"
  # NOTE(review): presumably must equal the encoder's hidden width — confirm.
  hidden_size: 1024

# Intervention-agent network settings.
agent:
  # NOTE(review): presumably the width of the agent's hidden state layer — confirm.
  state_hidden: 256
  # Dropout rate; value is a probability in [0, 1].
  dropout: 0.1

# Reward weights.
# NOTE(review): penalties/costs are written as positive magnitudes; presumably
# the reward function applies the sign — confirm against the consumer.
reward:
  # Safety gain for a correct intervention (REWRITE/REJECT/CRISIS on risky content).
  w1: 2.0
  # False-negative penalty (PASS on high-risk content).
  w2: 3.0
  # Crisis bonus for R1 (self-harm/suicide).
  w3: 4.0
  # Over-refusal penalty (intervention on safe content).
  w4: 1.5
  # UX cost per REJECT/CRISIS action.
  w5: 0.5

# Stage 1: behavior-cloning warm-up; runs on all 4 GPUs.
behavior_cloning:
  enabled: true
  epochs: 5
  # BC is lightweight MLP training, so a large per-GPU batch is fine.
  per_gpu_batch_size: 256
  # Keep the explicit decimal point: YAML 1.1 loaders such as PyYAML resolve a
  # bare `1e-3` as the string "1e-3" instead of a float.
  lr: 1.0e-3
  mixed_precision: "bf16"

# Stage 2: PPO runs on GPU-0 only (the env-agent loop is inherently sequential).
ppo:
  # NOTE(review): 200000 is not a multiple of n_rollout_steps (2048); check how
  # the trainer handles the final partial rollout.
  total_timesteps: 200000
  n_rollout_steps: 2048
  n_epochs: 4
  # PPO mini-batch; large because observation vectors are small.
  batch_size: 256
  # Keep the explicit decimal point: YAML 1.1 loaders such as PyYAML resolve a
  # bare `3e-4` as the string "3e-4" instead of a float.
  lr: 3.0e-4
  clip_eps: 0.2
  entropy_coef: 0.01
  value_coef: 0.5
  max_grad_norm: 0.5
  gamma: 0.99
  gae_lambda: 0.95

# Environment settings.
environment:
  # NOTE(review): presumably the maximum number of turns per episode — confirm.
  max_turns: 20

# Preprocessing: detector inference is distributed across 4 GPUs.
preprocessing:
  # Inference batch size used when converting the dataset into RL states.
  per_gpu_batch_size: 64

# Experiment logging settings.
logging:
  project: "CompanionGuard-RL"
  run_name: "intervention-ppo-4gpu"
  # Toggle for wandb logging (presumably Weights & Biases — confirm).
  use_wandb: true

# Checkpoint output settings.
output:
  checkpoint_dir: "checkpoints/intervention"
  # NOTE(review): unit is not stated here; presumably environment timesteps
  # between checkpoint saves — confirm.
  save_interval: 10000