# CompanionGuard-RL/configs/intervention_config.yaml

# Settings for the risk detector.
detector:
  # Checkpoint file for the detector (relative path).
  checkpoint: 'checkpoints/detector/best.pt'
  # Model identifier; looks like a Hugging Face hub id — confirm against the loader.
  model_name: 'hfl/chinese-macbert-large'
  # NOTE(review): presumably has to agree with the hidden width of model_name — confirm.
  hidden_size: 1024

# Settings for the intervention agent.
agent:
  # Dropout rate.
  dropout: 0.1
  # NOTE(review): presumably the hidden width of the agent's state network — confirm.
  state_hidden: 256

# Weights of the individual reward terms.
reward:
  # safety gain for correct intervention
  w1: 2.0
  # false negative penalty
  w2: 3.0
  # crisis bonus for R1
  w3: 4.0
  # over-refusal penalty
  w4: 1.5
  # UX cost
  w5: 0.5

# Behavior cloning settings.
behavior_cloning:
  # Switches behavior cloning on or off.
  enabled: true
  # Number of behavior cloning epochs.
  epochs: 5
  # Learning rate. Written with an explicit decimal point on purpose:
  # YAML 1.1 loaders such as PyYAML resolve a bare `1e-3` to the string
  # "1e-3" rather than to a float.
  lr: 1.0e-3

# PPO hyperparameters.
ppo:
  total_timesteps: 200000
  n_rollout_steps: 2048
  n_epochs: 4
  batch_size: 64
  # Learning rate. Written with an explicit decimal point on purpose:
  # YAML 1.1 loaders such as PyYAML resolve a bare `3e-4` to the string
  # "3e-4" rather than to a float.
  lr: 3.0e-4
  # NOTE(review): presumably the PPO clipping range — confirm against the trainer.
  clip_eps: 0.2
  # Coefficients for the entropy and value terms.
  entropy_coef: 0.01
  value_coef: 0.5
  # NOTE(review): presumably the gradient-norm clipping threshold — confirm.
  max_grad_norm: 0.5
  gamma: 0.99
  gae_lambda: 0.95

# Environment settings.
environment:
  # NOTE(review): presumably the turn limit per conversation episode — confirm.
  max_turns: 20
  # Number of environments.
  n_envs: 1

# Experiment logging settings.
logging:
  # Project and run names used for logging.
  project: 'CompanionGuard-RL'
  run_name: 'intervention-ppo'
  # Switches Weights & Biases logging on or off.
  use_wandb: true

# Output settings.
output:
  # Directory for checkpoints (relative path).
  checkpoint_dir: 'checkpoints/intervention'
  # NOTE(review): unit is not shown here (presumably timesteps) — confirm.
  save_interval: 10000
# feat: initial CompanionGuard-RL framework
#
# Two-module pipeline for AI companion safety:
# - Module B: context-aware risk detector with CrossAttention fusion
# - Module C: PPO-based adaptive intervention policy
#
# Includes CompanionRisk Taxonomy (10 primary + 14 fine-grained labels),
# dataset generation/annotation pipeline, training scripts, and eval suite.
#
# Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
# 2026-05-09 17:21:11 +08:00
			`detector:`
			`checkpoint: "checkpoints/detector/best.pt"`
			`model_name: "hfl/chinese-macbert-large"`
			`hidden_size: 1024`

			`agent:`
			`state_hidden: 256`
			`dropout: 0.1`

			`reward:`
			`w1: 2.0 # safety gain for correct intervention`
			`w2: 3.0 # false negative penalty`
			`w3: 4.0 # crisis bonus for R1`
			`w4: 1.5 # over-refusal penalty`
			`w5: 0.5 # UX cost`

			`behavior_cloning:`
			`enabled: true`
			`epochs: 5`
			`lr: 1e-3`

			`ppo:`
			`total_timesteps: 200000`
			`n_rollout_steps: 2048`
			`n_epochs: 4`
			`batch_size: 64`
			`lr: 3e-4`
			`clip_eps: 0.2`
			`entropy_coef: 0.01`
			`value_coef: 0.5`
			`max_grad_norm: 0.5`
			`gamma: 0.99`
			`gae_lambda: 0.95`

			`environment:`
			`n_envs: 1`
			`max_turns: 20`

			`logging:`
			`project: "CompanionGuard-RL"`
			`run_name: "intervention-ppo"`
			`use_wandb: true`

			`output:`
			`checkpoint_dir: "checkpoints/intervention"`
			`save_interval: 10000`