- paper/: 22-page LaTeX framework (7/10 sections complete, compiles cleanly):
  main.tex + 10 section files + refs.bib + compiled PDF (329KB)
- code/scripts/: three English dataset generation & merging scripts:
  generate_english.py / generate_english_targeted.py / merge_v5.py
- CLAUDE.md: update paper writing status, add paper/ file map entry
- state.md: add section 8 paper writing progress (2026-05-15)
- .gitignore: add LaTeX build artifact exclusion rules

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
133 lines
4.3 KiB
BibTeX
133 lines
4.3 KiB
BibTeX
% ============================================================
|
|
% CompanionGuard-RL References (参考文献)
|
|
% ============================================================
|
|
|
|
% ---- AI Companion / Character Platform Safety ----
|
|
|
|
% Wei, Zhang, Tyson (2025): safety-risk benchmark for AI character platforms (arXiv preprint).
@article{wei2025ai,
  author  = {Wei, Yiluo and Zhang, Peixian and Tyson, Gareth},
  title   = {Benchmarking and Understanding Safety Risks in {AI} Character Platforms},
  journal = {arXiv preprint arXiv:2512.01247},
  year    = {2025},
}
|
|
|
|
% Juneja and Lomidze: persona-grounded multi-turn safety evaluation of AI companions (arXiv preprint).
% NOTE(review): the arXiv identifier 2605.xxxxx encodes a May 2026 submission, so the year is set to 2026.
% The citation key keeps its historical "2025" so existing \cite commands continue to resolve.
@article{juneja2025persona,
  author  = {Juneja, Prerna and Lomidze, Lika},
  title   = {Persona-Grounded Safety Evaluation of {AI} Companions in Multi-Turn Conversations},
  journal = {arXiv preprint arXiv:2605.00227},
  year    = {2026},
}
|
|
|
|
% ---- Mental Health AI Safety ----
|
|
|
|
% Bentley et al.: reliability/validity study of the VERA-MH safety evaluation (arXiv preprint).
% NOTE(review): the arXiv identifier 2602.xxxxx encodes a February 2026 submission, so the year is set to 2026.
% The citation key keeps its historical "2025" so existing \cite commands continue to resolve.
@article{bentley2025vera,
  author  = {Bentley, Kate H. and others},
  title   = {{VERA-MH}: Reliability and Validity of an Open-Source {AI} Safety Evaluation in Mental Health},
  journal = {arXiv preprint arXiv:2602.05088},
  year    = {2026},
}
|
|
|
|
% ---- Mental Health Text Detection ----
|
|
|
|
% Zirikly et al. (2019): CLPsych 2019 shared-task overview (workshop proceedings paper).
@inproceedings{zirikly2019clpsych,
  author    = {Zirikly, Ayah and Resnik, Philip and Uzuner, {\"O}zlem and Hollingshead, Kristy},
  title     = {{CLPsych} 2019 Shared Task: Predicting the Degree of Suicide Risk in {Reddit} Posts},
  booktitle = {Proceedings of the Sixth Workshop on Computational Linguistics and Clinical Psychology},
  pages     = {24--33},
  year      = {2019},
}
|
|
|
|
% Ghosh et al. (2025): self-harm detection with intent differentiation and emoji interpretation.
@inproceedings{ghosh2025shines,
  author    = {Ghosh, Soumitra and others},
  title     = {Just a Scratch: Enhancing {LLM} Capabilities for Self-harm Detection through Intent Differentiation and Emoji Interpretation},
  booktitle = {Proceedings of ACL 2025},
  year      = {2025},
}
|
|
|
|
% Yang et al. (2023): MentaLLaMA, interpretable mental-health analysis with LLMs (arXiv preprint).
% NOTE(review): author names/order and the "MentaLLaMA" spelling were corrected to match the
% arXiv record for 2309.13567; please confirm against the listing before submission.
@article{yang2023mentallama,
  author  = {Yang, Kailai and Zhang, Tianlin and Kuang, Ziyan and Xie, Qianqian and Huang, Jimin and Ananiadou, Sophia},
  title   = {{MentaLLaMA}: Interpretable Mental Health Analysis on Social Media with Large Language Models},
  journal = {arXiv preprint arXiv:2309.13567},
  year    = {2023},
}
|
|
|
|
% ---- General LLM Safety / Guard Models ----
|
|
|
|
% Inan et al. (2023): Llama Guard input-output safeguard model (arXiv preprint).
% "Human-AI" is brace-protected so sentence-casing styles do not render it as "human-ai".
@article{inan2023llama,
  author  = {Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and others},
  title   = {{Llama Guard}: {LLM}-based Input-Output Safeguard for {Human-AI} Conversations},
  journal = {arXiv preprint arXiv:2312.06674},
  year    = {2023},
}
|
|
|
|
% Dubey et al. (2024): Llama 3 model family report (arXiv preprint).
@article{dubey2024llama3,
  author  = {Dubey, Abhimanyu and others},
  title   = {The {Llama 3} Herd of Models},
  journal = {arXiv preprint arXiv:2407.21783},
  year    = {2024},
}
|
|
|
|
% Han et al. (2024): WildGuard moderation tools (arXiv preprint).
@article{han2024wildguard,
  author  = {Han, Seungju and others},
  title   = {{WildGuard}: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of {LLMs}},
  journal = {arXiv preprint arXiv:2406.18495},
  year    = {2024},
}
|
|
|
|
% Ghosh et al. (2025): Aegis2.0 safety dataset and risk taxonomy (arXiv preprint).
@article{ghosh2025aegis,
  author  = {Ghosh, Shaona and others},
  title   = {{Aegis2.0}: A Diverse {AI} Safety Dataset and Risks Taxonomy for Alignment of {LLM} Guardrails},
  journal = {arXiv preprint arXiv:2501.09004},
  year    = {2025},
}
|
|
|
|
% OpenAI (2022): Moderation API announcement (web page).
% The doubled braces around the author keep the corporate name as one indivisible surname.
@misc{openai2022moderation,
  author       = {{OpenAI}},
  title        = {Introducing {OpenAI} {Moderation API}},
  year         = {2022},
  howpublished = {\url{https://openai.com/blog/new-and-improved-content-moderation}},
}
|
|
|
|
% ---- Safety Benchmarks ----
|
|
|
|
% Li et al. (2024): SALAD-Bench hierarchical safety benchmark (arXiv preprint).
@article{li2024saladbench,
  author  = {Li, Lijun and Dong, Bowen and Wang, Ruohui and others},
  title   = {{SALAD-Bench}: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models},
  journal = {arXiv preprint arXiv:2402.05044},
  year    = {2024},
}
|
|
|
|
% Mazeika et al. (2024): HarmBench red-teaming evaluation framework (arXiv preprint).
@article{mazeika2024harmbench,
  author  = {Mazeika, Mantas and Phan, Long and others},
  title   = {{HarmBench}: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal},
  journal = {arXiv preprint arXiv:2402.04249},
  year    = {2024},
}
|
|
|
|
% ---- RL / RLHF ----
|
|
|
|
% Schulman et al. (2017): Proximal Policy Optimization (arXiv preprint).
@article{schulman2017ppo,
  author  = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title   = {Proximal Policy Optimization Algorithms},
  journal = {arXiv preprint arXiv:1707.06347},
  year    = {2017},
}
|
|
|
|
% Ouyang et al. (2022): instruction following with human feedback (NeurIPS volume 35).
@article{ouyang2022instructgpt,
  author  = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and others},
  title   = {Training Language Models to Follow Instructions with Human Feedback},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {27730--27744},
  year    = {2022},
}
|
|
|
|
% ---- Backbone Model ----
|
|
|
|
% Cui et al. (2020): MacBERT paper, published in Findings of EMNLP 2020.
% This is a conference-proceedings paper, so it is typed as inproceedings with the venue in
% booktitle; the citation key is unchanged, so existing \cite commands continue to resolve.
% NOTE(review): author list and page range follow the ACL Anthology record; please confirm.
@inproceedings{cui2020macbert,
  author    = {Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Wang, Shijin and Hu, Guoping},
  title     = {Revisiting Pre-Trained Models for {Chinese} Natural Language Processing},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  pages     = {657--668},
  year      = {2020},
}
|