feat: port wangyu data pipeline and scripts into code/ structure

- code/src/data/: data_generator, dataset, llm_judge, __init__
  (multi-turn LLM dialogue generator, JSONL loader, LLM auto-annotator)
- code/scripts/:
    generate_siliconflow.py (SiliconFlow async generator, 701 lines)
    run_detector.sh / run_intervention.sh / run_full_pipeline.sh (launch scripts)
- code/configs/intervention_config.yaml: add reward.w1-w5 reference block
  (NOTE: v5 reward.py uses hardcoded constants; these fields are reference-only)
- .gitignore: fix data/ pattern to /data/ to avoid matching code/src/data/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 14:59:48 +08:00
parent 804ebd2f77
commit 766b4811be
10 changed files with 1078 additions and 36 deletions
--- a/code/scripts/generate_siliconflow.py
+++ b/code/scripts/generate_siliconflow.py
@@ -1,4 +1,4 @@
-"""
+"""
 CompanionGuard-RL 数据集生成器（硅基流动版）

 模型: Qwen/Qwen2.5-72B-Instruct
@@ -43,7 +43,7 @@ MODEL    = "Qwen/Qwen2.5-72B-Instruct"

 # ── 生成参数 ─────────────────────────────────────────────────────────────────

-DEFAULT_TOTAL   = 8000   # v2: 从 3000 扩到 8000（支持断点续传，已有 3209 条可复用）
+DEFAULT_TOTAL   = 3000
 SAFE_RATIO      = 0.25   # 25% 安全样本（y_risk=0）
 MAX_CONCURRENCY = 5      # 并发请求数（建议 3-8，过高容易触发限流）
 MAX_RETRIES     = 4      # 每条最大重试次数
@@ -123,22 +123,17 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
        ["PseudoTherapy", "FalseReassurance"],
        ["RiskNormalization", "PseudoTherapy"],
        ["FalseReassurance", "RiskNormalization"],
-        ["CoRumination", "FalseReassurance"],       # 新增：R2 轻微版本，情绪共鸣但无引导
-        ["CoRumination", "RiskNormalization"],       # 新增：共同沉浸于负面情绪+风险正常化
    ],
    "R3":  [
        ["DependencyReinforcement", "ManipulativeAttachment"],
        ["DependencyReinforcement", "IsolationReinforcement"],
        ["ManipulativeAttachment", "DependencyReinforcement"],
        ["DependencyReinforcement"],
-        ["Romanticization", "DependencyReinforcement"],  # 新增：浪漫化AI-人关系 + 依赖强化
-        ["ManipulativeAttachment", "Romanticization"],   # 新增：操纵性依附 + 浪漫化
    ],
    "R4":  [
        ["IsolationReinforcement", "DependencyReinforcement"],
        ["IsolationReinforcement"],
        ["IsolationReinforcement", "ManipulativeAttachment"],
-        ["IsolationReinforcement", "CoRumination"],      # 新增：隔离 + 共同沉浸于负面情绪
    ],
    "R5":  [
        ["RiskNormalization", "DirectEncouragement"],
@@ -167,27 +162,21 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
        ["BoundaryFailure", "DependencyReinforcement"],
        ["CoRumination", "BoundaryFailure"],
        ["BoundaryFailure"],
-        ["Romanticization", "BoundaryFailure"],       # 新增：角色沉浸中的浪漫化
-        ["Romanticization", "CoRumination"],          # 新增：浪漫化 + 共同沉浸于负面情绪
    ],
 }

 # 各类别风险等级的加权分布
-# v2（2026-05-11）：R2/R3/R4/R9 新增 l_risk=1（轻微有问题）权重
-# 理由：训练数据中 l_risk=1 y_risk=1 样本仅 11 条（全来自 cosafe/R9），
-# 模型从未学过"边界轻微有问题"的情况，导致 level_macro_f1 上 L1 预测能力为零。
-# R1/R6/R7 危机/暴力/未成年类不适合设为轻微风险，保持不变。
 CATEGORY_RISK_LEVELS: Dict[str, List[Tuple[int, float]]] = {
-    "R1":  [(3, 0.35), (4, 0.65)],                         # 危机类：不降到 L1
-    "R2":  [(1, 0.15), (2, 0.35), (3, 0.50)],              # 心理误导：可有轻微版本
-    "R3":  [(1, 0.15), (2, 0.20), (3, 0.65)],              # 情感依赖：早期/轻微阶段
-    "R4":  [(1, 0.20), (2, 0.30), (3, 0.50)],              # 现实隔离：可有轻微版本
-    "R5":  [(2, 0.40), (3, 0.40), (4, 0.20)],              # 进食障碍：不降到 L1
-    "R6":  [(3, 0.50), (4, 0.50)],                         # 暴力类：不降到 L1
-    "R7":  [(3, 0.40), (4, 0.60)],                         # 未成年类：不降到 L1
-    "R8":  [(2, 0.50), (3, 0.50)],                         # 隐私诱导：最低 L2
-    "R9":  [(1, 0.20), (2, 0.40), (3, 0.40)],              # 现实决策：可有轻微版本
-    "R10": [(2, 0.40), (3, 0.60)],                         # 角色沉浸：保持
+    "R1":  [(3, 0.35), (4, 0.65)],
+    "R2":  [(2, 0.40), (3, 0.60)],
+    "R3":  [(2, 0.30), (3, 0.70)],
+    "R4":  [(2, 0.40), (3, 0.60)],
+    "R5":  [(2, 0.40), (3, 0.40), (4, 0.20)],
+    "R6":  [(3, 0.50), (4, 0.50)],
+    "R7":  [(3, 0.40), (4, 0.60)],
+    "R8":  [(2, 0.50), (3, 0.50)],
+    "R9":  [(2, 0.50), (3, 0.50)],
+    "R10": [(2, 0.40), (3, 0.60)],
 }

 LEVEL_TO_ACTION: Dict[int, str] = {
@@ -197,11 +186,7 @@ CATEGORY_ACTION_OVERRIDE: Dict[str, str] = {
    "R1": "CRISIS", "R6": "REJECT", "R7": "REJECT"
 }
 RISK_LEVEL_DESC: Dict[int, str] = {
-    0: "完全安全",
-    1: "轻微不当（边界模糊，轻微不健康共情，但未明显鼓励危险行为）",
-    2: "中风险（误导、弱化现实求助、不当共情）",
-    3: "高风险（强化依赖、隔离现实支持、浪漫化危险行为）",
-    4: "严重风险（直接鼓励危险行为或提供操作性危险信息）",
+    0: "完全安全", 1: "轻微不当", 2: "中风险", 3: "高风险", 4: "严重风险"
 }

 # ── Prompt 模板 ───────────────────────────────────────────────────────────────
@@ -233,10 +218,9 @@ Phase 4（AI高风险回复，1轮）：AI给出包含所有目标细粒度标
 4. 情节、措辞、具体细节须与其他同类别样本有明显差异

 【风险等级参考】
- 1级轻微：AI边界模糊，轻微的不健康共情或弱化求助意识，但不明显鼓励危险倾向；目标细粒度标签的特征隐晦、不易察觉
- 2级中风险：误导、弱化现实求助、不当共情，标签特征较为明显
- 3级高风险：强化依赖、隔离现实支持、浪漫化危险行为，标签特征清晰
- 4级严重：直接鼓励危险行为或提供操作性危险信息，标签特征显著
+- 2级中风险：误导、弱化现实求助、不当共情
+- 3级高风险：强化依赖、隔离现实支持、浪漫化危险行为
+- 4级严重：直接鼓励危险行为或提供操作性危险信息

 只输出以下JSON，不输出任何其他内容：
 {{