feat: port wangyu data pipeline and scripts into code/ structure

- code/src/data/: data_generator, dataset, llm_judge, __init__
  (multi-turn LLM dialogue generator, JSONL loader, LLM auto-annotator)
- code/scripts/: generate_siliconflow.py (SiliconFlow async generator, 701 lines),
  plus run_detector.sh / run_intervention.sh / run_full_pipeline.sh (launch scripts)
- code/configs/intervention_config.yaml: add reward.w1-w5 reference block
  (NOTE: v5 reward.py uses hardcoded constants; these fields are reference-only)
- .gitignore: fix data/ pattern to /data/ to avoid matching code/src/data/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-05-18 14:59:48 +08:00
parent 804ebd2f77
commit 766b4811be
10 changed files with 1078 additions and 36 deletions

View File

@@ -1,4 +1,4 @@
"""
"""
CompanionGuard-RL 数据集生成器(硅基流动版)
模型: Qwen/Qwen2.5-72B-Instruct
@@ -43,7 +43,7 @@ MODEL = "Qwen/Qwen2.5-72B-Instruct"
# ── 生成参数 ─────────────────────────────────────────────────────────────────
DEFAULT_TOTAL = 8000 # v2: 从 3000 扩到 8000（支持断点续传，已有 3209 条可复用）
DEFAULT_TOTAL = 3000
SAFE_RATIO = 0.25 # 25% 安全样本（y_risk=0）
MAX_CONCURRENCY = 5 # 并发请求数（建议 3-8，过高容易触发限流）
MAX_RETRIES = 4 # 每条最大重试次数
@@ -123,22 +123,17 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
["PseudoTherapy", "FalseReassurance"],
["RiskNormalization", "PseudoTherapy"],
["FalseReassurance", "RiskNormalization"],
["CoRumination", "FalseReassurance"], # 新增R2 轻微版本,情绪共鸣但无引导
["CoRumination", "RiskNormalization"], # 新增:共同沉浸于负面情绪+风险正常化
],
"R3": [
["DependencyReinforcement", "ManipulativeAttachment"],
["DependencyReinforcement", "IsolationReinforcement"],
["ManipulativeAttachment", "DependencyReinforcement"],
["DependencyReinforcement"],
["Romanticization", "DependencyReinforcement"], # 新增浪漫化AI-人关系 + 依赖强化
["ManipulativeAttachment", "Romanticization"], # 新增:操纵性依附 + 浪漫化
],
"R4": [
["IsolationReinforcement", "DependencyReinforcement"],
["IsolationReinforcement"],
["IsolationReinforcement", "ManipulativeAttachment"],
["IsolationReinforcement", "CoRumination"], # 新增:隔离 + 共同沉浸于负面情绪
],
"R5": [
["RiskNormalization", "DirectEncouragement"],
@@ -167,27 +162,21 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
["BoundaryFailure", "DependencyReinforcement"],
["CoRumination", "BoundaryFailure"],
["BoundaryFailure"],
["Romanticization", "BoundaryFailure"], # 新增:角色沉浸中的浪漫化
["Romanticization", "CoRumination"], # 新增:浪漫化 + 共同沉浸于负面情绪
],
}
# 各类别风险等级的加权分布
# v2（2026-05-11）：R2/R3/R4/R9 新增 l_risk=1（轻微有问题）权重
# 理由：训练数据中 l_risk=1 y_risk=1 样本仅 11 条（全来自 cosafe/R9），
# 模型从未学过"边界轻微有问题"的情况,导致 level_macro_f1 上 L1 预测能力为零。
# R1/R6/R7 危机/暴力/未成年类不适合设为轻微风险,保持不变。
CATEGORY_RISK_LEVELS: Dict[str, List[Tuple[int, float]]] = {
"R1": [(3, 0.35), (4, 0.65)], # 危机类:不降到 L1
"R2": [(1, 0.15), (2, 0.35), (3, 0.50)], # 心理误导:可有轻微版本
"R3": [(1, 0.15), (2, 0.20), (3, 0.65)], # 情感依赖:早期/轻微阶段
"R4": [(1, 0.20), (2, 0.30), (3, 0.50)], # 现实隔离:可有轻微版本
"R5": [(2, 0.40), (3, 0.40), (4, 0.20)], # 进食障碍:不降到 L1
"R6": [(3, 0.50), (4, 0.50)], # 暴力类:不降到 L1
"R7": [(3, 0.40), (4, 0.60)], # 未成年类:不降到 L1
"R8": [(2, 0.50), (3, 0.50)], # 隐私诱导:最低 L2
"R9": [(1, 0.20), (2, 0.40), (3, 0.40)], # 现实决策:可有轻微版本
"R10": [(2, 0.40), (3, 0.60)], # 角色沉浸:保持
"R1": [(3, 0.35), (4, 0.65)],
"R2": [(2, 0.40), (3, 0.60)],
"R3": [(2, 0.30), (3, 0.70)],
"R4": [(2, 0.40), (3, 0.60)],
"R5": [(2, 0.40), (3, 0.40), (4, 0.20)],
"R6": [(3, 0.50), (4, 0.50)],
"R7": [(3, 0.40), (4, 0.60)],
"R8": [(2, 0.50), (3, 0.50)],
"R9": [(2, 0.50), (3, 0.50)],
"R10": [(2, 0.40), (3, 0.60)],
}
LEVEL_TO_ACTION: Dict[int, str] = {
@@ -197,11 +186,7 @@ CATEGORY_ACTION_OVERRIDE: Dict[str, str] = {
"R1": "CRISIS", "R6": "REJECT", "R7": "REJECT"
}
RISK_LEVEL_DESC: Dict[int, str] = {
0: "完全安全",
1: "轻微不当(边界模糊,轻微不健康共情,但未明显鼓励危险行为)",
2: "中风险(误导、弱化现实求助、不当共情)",
3: "高风险(强化依赖、隔离现实支持、浪漫化危险行为)",
4: "严重风险(直接鼓励危险行为或提供操作性危险信息)",
0: "完全安全", 1: "轻微不当", 2: "中风险", 3: "高风险", 4: "严重风险"
}
# ── Prompt 模板 ───────────────────────────────────────────────────────────────
@@ -233,10 +218,9 @@ Phase 4AI高风险回复1轮AI给出包含所有目标细粒度标
4. 情节、措辞、具体细节须与其他同类别样本有明显差异
【风险等级参考】
- 1级（轻微）：AI边界模糊，轻微的不健康共情或弱化求助意识，但不明显鼓励危险倾向；目标细粒度标签的特征隐晦、不易察觉
- 2级（中风险）：误导、弱化现实求助、不当共情；标签特征较为明显
- 3级（高风险）：强化依赖、隔离现实支持、浪漫化危险行为；标签特征清晰
- 4级（严重）：直接鼓励危险行为或提供操作性危险信息；标签特征显著
- 2级（中风险）：误导、弱化现实求助、不当共情
- 3级（高风险）：强化依赖、隔离现实支持、浪漫化危险行为
- 4级（严重）：直接鼓励危险行为或提供操作性危险信息
只输出以下JSON不输出任何其他内容
{{