# Detector settings: where its checkpoint lives and which encoder it uses.
detector:
  # Relative path to the detector checkpoint file.
  checkpoint: "checkpoints/detector/best.pt"
  # Machine-specific absolute path to a local macbert-large directory.
  # Original note: "Server 2 path — update this when running on server 2".
  # NOTE(review): it is unclear whether the value below is already the
  # Server 2 path — confirm, and adjust it for the machine you run on.
  model_name: "/root/siton-data-740d234e02d749f08fe5347b0c74c49f/zsy/macbert-large"
  # Presumably the encoder's hidden dimension; should agree with the model
  # at model_name — TODO confirm.
  hidden_size: 1024

# Agent network hyperparameters.
# NOTE(review): meanings below are inferred from the key names — confirm
# against the code that reads this section.
agent:
  # Presumably the width of the agent's hidden/state layer.
  state_hidden: 256
  # Presumably the dropout rate applied inside the agent network.
  dropout: 0.1

# Stage 1: Behavior cloning warm-up (Stage 2 is configured under `ppo`).
behavior_cloning:
  # Set to false to turn this stage off (presumably skips straight to
  # Stage 2 — confirm in the training script).
  enabled: true
  epochs: 5
  # Batch size per GPU, as the key name states; how many GPUs are used is
  # not specified in this file.
  per_gpu_batch_size: 256
  # Learning rate for this stage.
  lr: 0.001
  # Quoted string; presumably selects bfloat16 mixed precision — confirm
  # which values the consumer accepts.
  mixed_precision: "bf16"

# Stage 2: PPO runs on GPU-0 only
# NOTE(review): the per-key descriptions below are inferred from the key
# names and common PPO terminology — confirm against the training code.
ppo:
  # Presumably the total number of environment steps for the PPO stage.
  total_timesteps: 200000
  # Presumably the number of steps collected per rollout before an update.
  n_rollout_steps: 2048
  # Presumably the number of optimization passes over each rollout.
  n_epochs: 4
  # Presumably the minibatch size used within each optimization pass.
  batch_size: 256
  # Learning rate for this stage.
  lr: 0.0003
  # Presumably the clipping range of the PPO surrogate objective.
  clip_eps: 0.2
  # Presumably the weight of the entropy term in the loss.
  entropy_coef: 0.01
  # Presumably the weight of the value-function term in the loss.
  value_coef: 0.5
  # Presumably the gradient-norm clipping threshold.
  max_grad_norm: 0.5
  # Presumably the reward discount factor.
  gamma: 0.99
  # Presumably the lambda parameter of generalized advantage estimation.
  gae_lambda: 0.95

# Environment settings.
environment:
  # Presumably the maximum number of turns per episode/conversation —
  # TODO confirm how the environment applies this limit.
  max_turns: 20

# Reward weights — REFERENCE ONLY.
# NOTE: v5 reward.py uses hardcoded constants (EXACT_ALIGN_BONUS,
# PASS_HIGH_PENALTY, etc.); the `weights` parameter of compute_reward() is
# unused, so changing the values below has no effect on training.
# They are kept here for reference only (from wangyu's earlier config).
# DEFAULT_WEIGHTS alias in reward.py: {"w1":2.0, "w2":3.0, "w3":4.0, "w4":1.5, "w5":0.5}
reward:
  w1: 2.0   # safety_recall (reference only — not read by reward.py v5)
  w2: 3.0   # crisis_precision (reference only)
  w3: 4.0   # action_accuracy (reference only)
  w4: 1.5   # over_refusal penalty (reference only)
  w5: 0.5   # fluency (reference only)

# Evaluation settings.
evaluation:
  # Presumably the score cutoff used to turn a probability into a binary
  # decision — TODO confirm which score it is applied to.
  binary_threshold: 0.5

# Preprocessing settings.
preprocessing:
  # Batch size per GPU during preprocessing, as the key name states; what
  # the preprocessing step computes is not shown in this file.
  per_gpu_batch_size: 64

# Experiment logging settings.
logging:
  project: "CompanionGuard-RL"
  run_name: "intervention-v5-1gpu"
  # Real boolean; false presumably disables Weights & Biases logging —
  # confirm against the training script.
  use_wandb: false

# Output settings: where checkpoints are written and how often.
output:
  # Relative directory for saved checkpoints.
  checkpoint_dir: "checkpoints/intervention"
  # Presumably counted in the same units as ppo.total_timesteps (200000) —
  # TODO confirm the unit.
  save_interval: 10000