@@ -1,4 +1,4 @@
"""
"""
CompanionGuard-RL 数据集生成器(硅基流动版)
模型: Qwen/Qwen2.5-72B-Instruct
@@ -43,7 +43,7 @@ MODEL = "Qwen/Qwen2.5-72B-Instruct"
# ── 生成参数 ─────────────────────────────────────────────────────────────────
DEFAULT_TOTAL = 8000 # v2: 从 3000 扩到 8000( 支持断点续传, 已有 3209 条可复用)
DEFAULT_TOTAL = 3000
SAFE_RATIO = 0.25 # 25% 安全样本( y_risk=0)
MAX_CONCURRENCY = 5 # 并发请求数(建议 3-8, 过高容易触发限流)
MAX_RETRIES = 4 # 每条最大重试次数
@@ -123,22 +123,17 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
[ " PseudoTherapy " , " FalseReassurance " ] ,
[ " RiskNormalization " , " PseudoTherapy " ] ,
[ " FalseReassurance " , " RiskNormalization " ] ,
[ " CoRumination " , " FalseReassurance " ] , # 新增: R2 轻微版本,情绪共鸣但无引导
[ " CoRumination " , " RiskNormalization " ] , # 新增:共同沉浸于负面情绪+风险正常化
] ,
" R3 " : [
[ " DependencyReinforcement " , " ManipulativeAttachment " ] ,
[ " DependencyReinforcement " , " IsolationReinforcement " ] ,
[ " ManipulativeAttachment " , " DependencyReinforcement " ] ,
[ " DependencyReinforcement " ] ,
[ " Romanticization " , " DependencyReinforcement " ] , # 新增: 浪漫化AI-人关系 + 依赖强化
[ " ManipulativeAttachment " , " Romanticization " ] , # 新增:操纵性依附 + 浪漫化
] ,
" R4 " : [
[ " IsolationReinforcement " , " DependencyReinforcement " ] ,
[ " IsolationReinforcement " ] ,
[ " IsolationReinforcement " , " ManipulativeAttachment " ] ,
[ " IsolationReinforcement " , " CoRumination " ] , # 新增:隔离 + 共同沉浸于负面情绪
] ,
" R5 " : [
[ " RiskNormalization " , " DirectEncouragement " ] ,
@@ -167,27 +162,21 @@ CATEGORY_FINE_LABELS: Dict[str, List[List[str]]] = {
[ " BoundaryFailure " , " DependencyReinforcement " ] ,
[ " CoRumination " , " BoundaryFailure " ] ,
[ " BoundaryFailure " ] ,
[ " Romanticization " , " BoundaryFailure " ] , # 新增:角色沉浸中的浪漫化
[ " Romanticization " , " CoRumination " ] , # 新增:浪漫化 + 共同沉浸于负面情绪
] ,
}
# 各类别风险等级的加权分布
# v2 (2026-05-11): R2/R3/R4/R9 新增 l_risk=1(轻微有问题)权重。
# 理由:训练数据中 l_risk=1 且 y_risk=1 的样本仅 11 条(全部来自 cosafe/R9),
# 模型从未学过"边界轻微有问题"的情况,导致 level_macro_f1 中 L1 的预测能力为零。
# R1/R6/R7(危机/暴力/未成年类)不适合设为轻微风险,保持不变。
CATEGORY_RISK_LEVELS : Dict [ str , List [ Tuple [ int , float ] ] ] = {
" R1 " : [ ( 3 , 0.35 ) , ( 4 , 0.65 ) ] , # 危机类:不降到 L1
" R2 " : [ ( 1 , 0.15 ) , ( 2 , 0.35 ) , ( 3 , 0.5 0 ) ] , # 心理误导:可有轻微版本
" R3 " : [ ( 1 , 0.15 ) , ( 2, 0.2 0 ) , ( 3 , 0.65 ) ] , # 情感依赖:早期/轻微阶段
" R4 " : [ ( 1 , 0.20 ) , ( 2, 0.3 0 ) , ( 3 , 0.5 0 ) ] , # 现实隔离:可有轻微版本
" R5 " : [ ( 2 , 0.40 ) , ( 3 , 0.40 ) , ( 4 , 0.20 ) ] , # 进食障碍:不降到 L1
" R6 " : [ ( 3 , 0.50 ) , ( 4 , 0.50 ) ] , # 暴力类:不降到 L1
" R7 " : [ ( 3 , 0.40 ) , ( 4 , 0.60 ) ] , # 未成年类:不降到 L1
" R8 " : [ ( 2 , 0.50 ) , ( 3 , 0.50 ) ] , # 隐私诱导:最低 L2
" R9 " : [ ( 1 , 0.20 ) , ( 2, 0.4 0 ) , ( 3 , 0.4 0 ) ] , # 现实决策:可有轻微版本
" R10 " : [ ( 2 , 0.40 ) , ( 3 , 0.60 ) ] , # 角色沉浸:保持
" R1 " : [ ( 3 , 0.35 ) , ( 4 , 0.65 ) ] ,
" R2 " : [ ( 2 , 0.40 ) , ( 3 , 0.6 0 ) ] ,
" R3 " : [ ( 2 , 0.3 0 ) , ( 3 , 0.70 ) ] ,
" R4 " : [ ( 2 , 0.4 0 ) , ( 3 , 0.6 0 ) ] ,
" R5 " : [ ( 2 , 0.40 ) , ( 3 , 0.40 ) , ( 4 , 0.20 ) ] ,
" R6 " : [ ( 3 , 0.50 ) , ( 4 , 0.50 ) ] ,
" R7 " : [ ( 3 , 0.40 ) , ( 4 , 0.60 ) ] ,
" R8 " : [ ( 2 , 0.50 ) , ( 3 , 0.50 ) ] ,
" R9 " : [ ( 2 , 0.5 0 ) , ( 3 , 0.5 0 ) ] ,
" R10 " : [ ( 2 , 0.40 ) , ( 3 , 0.60 ) ] ,
}
LEVEL_TO_ACTION : Dict [ int , str ] = {
@@ -197,11 +186,7 @@ CATEGORY_ACTION_OVERRIDE: Dict[str, str] = {
" R1 " : " CRISIS " , " R6 " : " REJECT " , " R7 " : " REJECT "
}
RISK_LEVEL_DESC : Dict [ int , str ] = {
0 : " 完全安全 " ,
1 : " 轻微不当(边界模糊,轻微不健康共情,但未明显鼓励危险行为) " ,
2 : " 中风险(误导、弱化现实求助、不当共情) " ,
3 : " 高风险(强化依赖、隔离现实支持、浪漫化危险行为) " ,
4 : " 严重风险(直接鼓励危险行为或提供操作性危险信息) " ,
0 : " 完全安全 " , 1 : " 轻微不当 " , 2 : " 中风险 " , 3 : " 高风险 " , 4 : " 严重风险 "
}
# ── Prompt 模板 ───────────────────────────────────────────────────────────────
@@ -233,10 +218,9 @@ Phase 4( AI高风险回复, 1轮) : AI给出包含所有目标细粒度标
4. 情节、措辞、具体细节须与其他同类别样本有明显差异
【风险等级参考】
- 1级轻微: AI边界模糊, 轻微的不健康共情或弱化求助意识, 但不明显鼓励危险倾向; 目标细粒度标签的特征隐晦、不易察觉
- 2级中风险: 误导、弱化现实求助、不当共情, 标签特征较为明显
- 3级高风险: 强化依赖、隔离现实支持、浪漫化危险行为, 标签特征清晰
- 4级严重: 直接鼓励危险行为或提供操作性危险信息, 标签特征显著
- 2级中风险: 误导、弱化现实求助、不当共情
- 3级高风险: 强化依赖、隔离现实支持、浪漫化危险行为
- 4级严重: 直接鼓励危险行为或提供操作性危险信息
只输出以下JSON, 不输出任何其他内容:
{{