feat: initial CompanionGuard-RL framework

Two-module pipeline for AI companion safety:
- Module B: context-aware risk detector with CrossAttention fusion
- Module C: PPO-based adaptive intervention policy

Includes CompanionRisk Taxonomy (10 primary + 14 fine-grained labels), dataset generation/annotation pipeline, training scripts, and eval suite.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 17:21:11 +08:00
commit 7d4345c29d
29 changed files with 3317 additions and 0 deletions
--- a/configs/data_generation.yaml
+++ b/configs/data_generation.yaml
@@ -0,0 +1,22 @@
+api:  # provider and model; presumably the target of the generation API calls — confirm
+  type: 'qwen'         # accepted values: "qwen" or "openai"
+  model: 'qwen-max'
+
+generation:
+  total_samples: 3000  # 3000 = 300 × 10; presumably 10 primary categories — confirm
+  samples_per_category: 300
+  delay: 0.5           # seconds between API calls
+
+output:
+  raw_dir: 'data/raw'
+  output_file: 'data/raw/generated.jsonl'  # path sits inside raw_dir
+
+annotation:
+  judge_model: 'qwen-max'  # same model string as api.model in this file
+  output_file: 'data/processed/annotated.jsonl'
+
+split:  # train + val + test fractions sum to 1.0
+  train: 0.8
+  val: 0.1
+  test: 0.1
+  seed: 42  # presumably the RNG seed for the split — confirm
--- a/configs/detector_config.yaml
+++ b/configs/detector_config.yaml
@@ -0,0 +1,43 @@
+model:
+  name: 'hfl/chinese-macbert-large'  # presumably a Hugging Face model id — confirm
+  hidden_size: 1024  # repeated in intervention_config.yaml (detector.hidden_size)
+  num_heads: 8       # presumably heads of the CrossAttention fusion — confirm
+  dropout: 0.1
+  use_lora: false
+
+data:
+  train_path: 'data/processed/train.jsonl'
+  val_path: 'data/processed/val.jsonl'
+  test_path: 'data/processed/test.jsonl'
+  max_persona_len: 128   # unit (tokens vs. characters) not visible here — confirm
+  max_context_len: 512
+  max_response_len: 256
+  max_history_turns: 5
+
+training:  # optimisation settings for the detector run
+  epochs: 10
+  batch_size: 16
+  lr: 2.0e-5  # keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read a bare 2e-5 as a string
+  warmup_steps: 200
+  weight_decay: 0.01
+  gradient_clip: 1.0  # presumably a max gradient norm — confirm
+  eval_steps: 200     # same value as warmup_steps
+  save_steps: 500
+
+loss_weights:  # one weight per key; all are currently equal (1.0)
+  binary: 1.0
+  level: 1.0
+  primary: 1.0  # presumably the primary taxonomy labels — confirm
+  fine: 1.0     # presumably the fine-grained taxonomy labels — confirm
+
+evaluation:
+  binary_threshold: 0.5
+  fine_threshold: 0.4  # lower than binary_threshold; presumably per fine-grained label — confirm
+
+logging:
+  project: 'CompanionGuard-RL'  # same project value as in intervention_config.yaml
+  run_name: 'detector-macbert'
+  use_wandb: true
+
+output:
+  checkpoint_dir: 'checkpoints/detector'  # intervention_config.yaml's detector.checkpoint points into this directory
--- a/configs/intervention_config.yaml
+++ b/configs/intervention_config.yaml
@@ -0,0 +1,46 @@
+detector:
+  checkpoint: 'checkpoints/detector/best.pt'  # under output.checkpoint_dir of detector_config.yaml
+  model_name: 'hfl/chinese-macbert-large'     # same string as model.name in detector_config.yaml
+  hidden_size: 1024                           # same value as model.hidden_size in detector_config.yaml
+
+agent:
+  state_hidden: 256  # presumably the width of the agent's hidden layer — confirm
+  dropout: 0.1
+
+reward:  # per-term reward weights w1–w5; the formula combining them is not visible in this config
+  w1: 2.0   # safety gain for correct intervention
+  w2: 3.0   # false negative penalty
+  w3: 4.0   # crisis bonus for R1 (presumably a taxonomy risk label — confirm)
+  w4: 1.5   # over-refusal penalty
+  w5: 0.5   # UX cost
+
+behavior_cloning:  # presumably a warm-start phase before PPO — confirm
+  enabled: true
+  epochs: 5
+  lr: 1.0e-3  # keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read a bare 1e-3 as a string
+
+ppo:  # PPO hyperparameters (the commit message describes Module C as PPO-based)
+  total_timesteps: 200000
+  n_rollout_steps: 2048
+  n_epochs: 4
+  batch_size: 64  # 2048 / 64 = 32 minibatches per rollout if this is the minibatch size — confirm
+  lr: 3.0e-4      # keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read a bare 3e-4 as a string
+  clip_eps: 0.2
+  entropy_coef: 0.01
+  value_coef: 0.5
+  max_grad_norm: 0.5
+  gamma: 0.99
+  gae_lambda: 0.95
+
+environment:
+  n_envs: 1
+  max_turns: 20  # presumably caps dialogue turns per episode — confirm
+
+logging:
+  project: 'CompanionGuard-RL'  # same project value as in detector_config.yaml
+  run_name: 'intervention-ppo'
+  use_wandb: true
+
+output:
+  checkpoint_dir: 'checkpoints/intervention'
+  save_interval: 10000  # unit not visible here; presumably timesteps — confirm