---
# Training configuration for the intervention agent (detector backbone,
# behavior-cloning warm-up, PPO fine-tuning, logging and output).
#
# NOTE(review): this file was restored from a copy whose line breaks had been
# collapsed; section nesting was reconstructed from key grouping. Confirm the
# hierarchy (in particular where `mixed_precision` and `reward` live) against
# the code that loads this config.

detector:
  checkpoint: "checkpoints/detector/best.pt"
  # Server 2 path — update this when running on server 2
  model_name: "/root/siton-data-740d234e02d749f08fe5347b0c74c49f/zsy/macbert-large"
  hidden_size: 1024

agent:
  state_hidden: 256
  dropout: 0.1

# Stage 1: Behavior cloning warm-up
behavior_cloning:
  enabled: true
  epochs: 5
  per_gpu_batch_size: 256
  lr: 0.001
  mixed_precision: "bf16"

# Stage 2: PPO runs on GPU-0 only
ppo:
  total_timesteps: 200000
  n_rollout_steps: 2048
  n_epochs: 4
  batch_size: 256
  lr: 0.0003
  clip_eps: 0.2
  entropy_coef: 0.01
  value_coef: 0.5
  max_grad_norm: 0.5
  gamma: 0.99
  gae_lambda: 0.95

environment:
  max_turns: 20

# Reward weights — NOTE: v5 reward.py uses hardcoded constants
# (EXACT_ALIGN_BONUS, PASS_HIGH_PENALTY, etc.); the `weights` param of
# compute_reward() is unused.
# These values are kept here for reference only (from wangyu's earlier config).
# DEFAULT_WEIGHTS alias in reward.py:
#   {"w1":2.0, "w2":3.0, "w3":4.0, "w4":1.5, "w5":0.5}
reward:
  w1: 2.0  # safety_recall (reference only — not read by reward.py v5)
  w2: 3.0  # crisis_precision (reference only)
  w3: 4.0  # action_accuracy (reference only)
  w4: 1.5  # over_refusal penalty (reference only)
  w5: 0.5  # fluency (reference only)

evaluation:
  binary_threshold: 0.5

preprocessing:
  per_gpu_batch_size: 64

logging:
  project: "CompanionGuard-RL"
  run_name: "intervention-v5-1gpu"
  use_wandb: false

output:
  checkpoint_dir: "checkpoints/intervention"
  save_interval: 10000