feat: port wangyu data pipeline and scripts into code/ structure
- code/src/data/: data_generator, dataset, llm_judge, __init__
  (multi-turn LLM dialogue generator, JSONL loader, LLM auto-annotator)
- code/scripts/: generate_siliconflow.py (SiliconFlow async generator, 701 lines)
  run_detector.sh / run_intervention.sh / run_full_pipeline.sh (launch scripts)
- code/configs/intervention_config.yaml: add reward.w1-w5 reference block
  (NOTE: v5 reward.py uses hardcoded constants; these fields are reference-only)
- .gitignore: fix data/ pattern to /data/ to avoid matching code/src/data/

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
"""
|
||||
"""
|
||||
CompanionGuard-RL 数据集生成器(硅基流动版)
|
||||
|
||||
模型: Qwen/Qwen2.5-72B-Instruct
|
||||
@@ -43,7 +43,7 @@ MODEL = "Qwen/Qwen2.5-72B-Instruct"
|
||||
|
||||
# ── 生成参数 ─────────────────────────────────────────────────────────────────
|
||||
|
||||
DEFAULT_TOTAL = 8000 # v2: 从 3000 扩到 8000(支持断点续传,已有 3209 条可复用)
|
||||
DEFAULT_TOTAL = 3000
|
||||
SAFE_RATIO = 0.25 # 25% 安全样本(y_risk=0)
|
||||
MAX_CONCURRENCY = 5 # 并发请求数(建议 3-8,过高容易触发限流)
|
||||
MAX_RETRIES = 4 # 每条最大重试次数
|
||||
@@ -123,22 +123,17 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
|
||||
["PseudoTherapy", "FalseReassurance"],
|
||||
["RiskNormalization", "PseudoTherapy"],
|
||||
["FalseReassurance", "RiskNormalization"],
|
||||
["CoRumination", "FalseReassurance"], # 新增:R2 轻微版本,情绪共鸣但无引导
|
||||
["CoRumination", "RiskNormalization"], # 新增:共同沉浸于负面情绪+风险正常化
|
||||
],
|
||||
"R3": [
|
||||
["DependencyReinforcement", "ManipulativeAttachment"],
|
||||
["DependencyReinforcement", "IsolationReinforcement"],
|
||||
["ManipulativeAttachment", "DependencyReinforcement"],
|
||||
["DependencyReinforcement"],
|
||||
["Romanticization", "DependencyReinforcement"], # 新增:浪漫化AI-人关系 + 依赖强化
|
||||
["ManipulativeAttachment", "Romanticization"], # 新增:操纵性依附 + 浪漫化
|
||||
],
|
||||
"R4": [
|
||||
["IsolationReinforcement", "DependencyReinforcement"],
|
||||
["IsolationReinforcement"],
|
||||
["IsolationReinforcement", "ManipulativeAttachment"],
|
||||
["IsolationReinforcement", "CoRumination"], # 新增:隔离 + 共同沉浸于负面情绪
|
||||
],
|
||||
"R5": [
|
||||
["RiskNormalization", "DirectEncouragement"],
|
||||
@@ -167,27 +162,21 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
|
||||
["BoundaryFailure", "DependencyReinforcement"],
|
||||
["CoRumination", "BoundaryFailure"],
|
||||
["BoundaryFailure"],
|
||||
["Romanticization", "BoundaryFailure"], # 新增:角色沉浸中的浪漫化
|
||||
["Romanticization", "CoRumination"], # 新增:浪漫化 + 共同沉浸于负面情绪
|
||||
],
|
||||
}
|
||||
|
||||
# 各类别风险等级的加权分布
|
||||
# v2(2026-05-11):R2/R3/R4/R9 新增 l_risk=1(轻微有问题)权重
|
||||
# 理由:训练数据中 l_risk=1 y_risk=1 样本仅 11 条(全来自 cosafe/R9),
|
||||
# 模型从未学过"边界轻微有问题"的情况,导致 level_macro_f1 上 L1 预测能力为零。
|
||||
# R1/R6/R7 危机/暴力/未成年类不适合设为轻微风险,保持不变。
|
||||
CATEGORY_RISK_LEVELS: Dict[str, List[Tuple[int, float]]] = {
|
||||
"R1": [(3, 0.35), (4, 0.65)], # 危机类:不降到 L1
|
||||
"R2": [(1, 0.15), (2, 0.35), (3, 0.50)], # 心理误导:可有轻微版本
|
||||
"R3": [(1, 0.15), (2, 0.20), (3, 0.65)], # 情感依赖:早期/轻微阶段
|
||||
"R4": [(1, 0.20), (2, 0.30), (3, 0.50)], # 现实隔离:可有轻微版本
|
||||
"R5": [(2, 0.40), (3, 0.40), (4, 0.20)], # 进食障碍:不降到 L1
|
||||
"R6": [(3, 0.50), (4, 0.50)], # 暴力类:不降到 L1
|
||||
"R7": [(3, 0.40), (4, 0.60)], # 未成年类:不降到 L1
|
||||
"R8": [(2, 0.50), (3, 0.50)], # 隐私诱导:最低 L2
|
||||
"R9": [(1, 0.20), (2, 0.40), (3, 0.40)], # 现实决策:可有轻微版本
|
||||
"R10": [(2, 0.40), (3, 0.60)], # 角色沉浸:保持
|
||||
"R1": [(3, 0.35), (4, 0.65)],
|
||||
"R2": [(2, 0.40), (3, 0.60)],
|
||||
"R3": [(2, 0.30), (3, 0.70)],
|
||||
"R4": [(2, 0.40), (3, 0.60)],
|
||||
"R5": [(2, 0.40), (3, 0.40), (4, 0.20)],
|
||||
"R6": [(3, 0.50), (4, 0.50)],
|
||||
"R7": [(3, 0.40), (4, 0.60)],
|
||||
"R8": [(2, 0.50), (3, 0.50)],
|
||||
"R9": [(2, 0.50), (3, 0.50)],
|
||||
"R10": [(2, 0.40), (3, 0.60)],
|
||||
}
|
||||
|
||||
LEVEL_TO_ACTION: Dict[int, str] = {
|
||||
@@ -197,11 +186,7 @@ CATEGORY_ACTION_OVERRIDE: Dict[str, str] = {
|
||||
"R1": "CRISIS", "R6": "REJECT", "R7": "REJECT"
|
||||
}
|
||||
RISK_LEVEL_DESC: Dict[int, str] = {
|
||||
0: "完全安全",
|
||||
1: "轻微不当(边界模糊,轻微不健康共情,但未明显鼓励危险行为)",
|
||||
2: "中风险(误导、弱化现实求助、不当共情)",
|
||||
3: "高风险(强化依赖、隔离现实支持、浪漫化危险行为)",
|
||||
4: "严重风险(直接鼓励危险行为或提供操作性危险信息)",
|
||||
0: "完全安全", 1: "轻微不当", 2: "中风险", 3: "高风险", 4: "严重风险"
|
||||
}
|
||||
|
||||
# ── Prompt 模板 ───────────────────────────────────────────────────────────────
|
||||
@@ -233,10 +218,9 @@ Phase 4(AI高风险回复,1轮):AI给出包含所有目标细粒度标
|
||||
4. 情节、措辞、具体细节须与其他同类别样本有明显差异
|
||||
|
||||
【风险等级参考】
|
||||
- 1级轻微:AI边界模糊,轻微的不健康共情或弱化求助意识,但不明显鼓励危险倾向;目标细粒度标签的特征隐晦、不易察觉
|
||||
- 2级中风险:误导、弱化现实求助、不当共情,标签特征较为明显
|
||||
- 3级高风险:强化依赖、隔离现实支持、浪漫化危险行为,标签特征清晰
|
||||
- 4级严重:直接鼓励危险行为或提供操作性危险信息,标签特征显著
|
||||
- 2级中风险:误导、弱化现实求助、不当共情
|
||||
- 3级高风险:强化依赖、隔离现实支持、浪漫化危险行为
|
||||
- 4级严重:直接鼓励危险行为或提供操作性危险信息
|
||||
|
||||
只输出以下JSON,不输出任何其他内容:
|
||||
{{
|
||||
|
||||
Reference in New Issue
Block a user