feat: initial CompanionGuard-RL framework

Two-module pipeline for AI companion safety:
- Module B: context-aware risk detector with CrossAttention fusion
- Module C: PPO-based adaptive intervention policy

Includes CompanionRisk Taxonomy (10 primary + 14 fine-grained labels),
dataset generation/annotation pipeline, training scripts, and eval suite.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-09 17:21:11 +08:00
commit 7d4345c29d
29 changed files with 3317 additions and 0 deletions
--- a/configs/detector_config.yaml
+++ b/configs/detector_config.yaml
@@ -0,0 +1,43 @@
+model:  # settings for the detector's model
+  name: "hfl/chinese-macbert-large"  # presumably a Hugging Face Hub model id — confirm
+  hidden_size: 1024  # NOTE(review): presumably must equal the encoder's hidden width — verify
+  num_heads: 8  # presumably heads of the cross-attention fusion layer — TODO confirm
+  dropout: 0.1
+  use_lora: false  # LoRA is switched off in this config
+
+data:  # dataset file paths and maximum-length settings
+  train_path: "data/processed/train.jsonl"
+  val_path: "data/processed/val.jsonl"
+  test_path: "data/processed/test.jsonl"
+  max_persona_len: 128  # presumably counted in tokens, like the other *_len keys — TODO confirm
+  max_context_len: 512  # NOTE(review): verify each limit fits the encoder's maximum input length
+  max_response_len: 256
+  max_history_turns: 5  # presumably the number of prior dialogue turns kept — confirm
+
+training:  # optimisation schedule for the detector
+  epochs: 10
+  batch_size: 16
+  lr: 2.0e-5  # keep the ".0": YAML 1.1 loaders (e.g. PyYAML) read a bare 2e-5 as the string "2e-5"
+  warmup_steps: 200
+  weight_decay: 0.01
+  gradient_clip: 1.0  # presumably the max gradient norm — confirm against the trainer
+  eval_steps: 200  # presumably "evaluate every N steps" — confirm
+  save_steps: 500  # presumably "checkpoint every N steps" — confirm
+
+loss_weights:  # one weight per key; all four are equal (1.0) here
+  binary: 1.0  # presumably multipliers in a combined multi-task loss — TODO confirm
+  level: 1.0
+  primary: 1.0  # presumably the primary-label head of the risk taxonomy — confirm
+  fine: 1.0  # presumably the fine-grained-label head of the risk taxonomy — confirm
+
+evaluation:  # decision thresholds used at evaluation time
+  binary_threshold: 0.5  # presumably the cut-off on the binary risk score — confirm
+  fine_threshold: 0.4  # lower than binary_threshold; presumably applied per fine-grained label — confirm
+
+logging:  # experiment-tracking settings
+  project: "CompanionGuard-RL"
+  run_name: "detector-macbert"
+  use_wandb: true  # NOTE(review): presumably needs Weights & Biases credentials at run time — confirm
+
+output:  # where training artefacts are written
+  checkpoint_dir: "checkpoints/detector"  # relative path; presumably resolved from the working directory — confirm