% ============================================================
% CompanionGuard-RL References
% ============================================================
%
% Conventions used in this file:
%   * One field per line, brace-delimited values, lowercase entry types.
%   * arXiv preprints are stored as misc entries with the identifier in
%     eprint / archiveprefix (plus a url), not inside the journal field.
%   * Citation keys are stable; do not rename them, documents cite them.
%   * Lines starting with a percent sign must stay outside entries and must
%     not contain an at-sign, otherwise BibTeX tries to parse them.

% ---- AI Companion / Character Platform Safety ----

@misc{wei2025ai,
  author        = {Wei, Yiluo and Zhang, Peixian and Tyson, Gareth},
  title         = {Benchmarking and Understanding Safety Risks in {AI} Character Platforms},
  year          = {2025},
  eprint        = {2512.01247},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2512.01247},
}

% NOTE(review): the arXiv identifier below has a YYMM prefix of 26/05, which
% does not agree with year 2025 -- verify the identifier or the year.
@misc{juneja2025persona,
  author        = {Juneja, Prerna and Lomidze, Lika},
  title         = {Persona-Grounded Safety Evaluation of {AI} Companions in Multi-Turn Conversations},
  year          = {2025},
  eprint        = {2605.00227},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2605.00227},
}

% ---- Mental Health AI Safety ----

% NOTE(review): the arXiv identifier below has a YYMM prefix of 26/02, which
% does not agree with year 2025 -- verify the identifier or the year.
@misc{bentley2025vera,
  author        = {Bentley, Kate H. and others},
  title         = {{VERA-MH}: Reliability and Validity of an Open-Source {AI} Safety Evaluation in Mental Health},
  year          = {2025},
  eprint        = {2602.05088},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2602.05088},
}

% ---- Mental Health Text Detection ----

@inproceedings{zirikly2019clpsych,
  author    = {Zirikly, Ayah and Resnik, Philip and Uzuner, {\"O}zlem and Hollingshead, Kristy},
  title     = {{CLPsych} 2019 Shared Task: Predicting the Degree of Suicide Risk in {Reddit} Posts},
  booktitle = {Proceedings of the Sixth Workshop on Computational Linguistics and Clinical Psychology},
  pages     = {24--33},
  year      = {2019},
}

@inproceedings{ghosh2025shines,
  author    = {Ghosh, Soumitra and others},
  title     = {Just a Scratch: Enhancing {LLM} Capabilities for Self-harm Detection through Intent Differentiation and Emoji Interpretation},
  booktitle = {Proceedings of {ACL} 2025},
  year      = {2025},
}

% Model name is spelled MentaLLaMA (single "l" before LLaMA); the citation
% key keeps its historical spelling so existing citations still resolve.
@misc{yang2023mentallama,
  author        = {Yang, Kailai and Zhang, Tianlin and Kuang, Ziyan and Xie, Qianqian and Huang, Jimin and Ananiadou, Sophia},
  title         = {{MentaLLaMA}: Interpretable Mental Health Analysis on Social Media with Large Language Models},
  year          = {2023},
  eprint        = {2309.13567},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2309.13567},
}

% ---- General LLM Safety / Guard Models ----

@misc{inan2023llama,
  author        = {Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and others},
  title         = {{Llama Guard}: {LLM}-based Input-Output Safeguard for {Human-AI} Conversations},
  year          = {2023},
  eprint        = {2312.06674},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2312.06674},
}

@misc{dubey2024llama3,
  author        = {Dubey, Abhimanyu and others},
  title         = {The {Llama 3} Herd of Models},
  year          = {2024},
  eprint        = {2407.21783},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2407.21783},
}

@misc{han2024wildguard,
  author        = {Han, Seungju and others},
  title         = {{WildGuard}: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of {LLMs}},
  year          = {2024},
  eprint        = {2406.18495},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2406.18495},
}

@misc{ghosh2025aegis,
  author        = {Ghosh, Shaona and others},
  title         = {{Aegis2.0}: A Diverse {AI} Safety Dataset and Risks Taxonomy for Alignment of {LLM} Guardrails},
  year          = {2025},
  eprint        = {2501.09004},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2501.09004},
}

% Corporate author is double-braced so it is treated as a single name.
% NOTE(review): the URL slug ("new-and-improved-content-moderation") does not
% resemble the recorded title -- confirm both against the live page and add
% an access date once verified.
@misc{openai2022moderation,
  author       = {{OpenAI}},
  title        = {Introducing {OpenAI} {Moderation API}},
  year         = {2022},
  howpublished = {OpenAI Blog},
  url          = {https://openai.com/blog/new-and-improved-content-moderation},
}

% ---- Safety Benchmarks ----

@misc{li2024saladbench,
  author        = {Li, Lijun and Dong, Bowen and Wang, Ruohui and others},
  title         = {{SALAD-Bench}: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models},
  year          = {2024},
  eprint        = {2402.05044},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2402.05044},
}

@misc{mazeika2024harmbench,
  author        = {Mazeika, Mantas and Phan, Long and others},
  title         = {{HarmBench}: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal},
  year          = {2024},
  eprint        = {2402.04249},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2402.04249},
}

% ---- RL / RLHF ----

@misc{schulman2017ppo,
  author        = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title         = {Proximal Policy Optimization Algorithms},
  year          = {2017},
  eprint        = {1707.06347},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/1707.06347},
}

% Published in conference proceedings, hence inproceedings with booktitle.
@inproceedings{ouyang2022instructgpt,
  author    = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and others},
  title     = {Training Language Models to Follow Instructions with Human Feedback},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {35},
  pages     = {27730--27744},
  year      = {2022},
}

% ---- Backbone Model ----
% Conference paper from the Findings of EMNLP 2020 volume, so it is recorded
% as inproceedings with the venue in booktitle rather than as a journal article.
@inproceedings{cui2020macbert,
  author    = {Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Wang, Shijin and Hu, Guoping},
  title     = {Revisiting Pre-Trained Models for {Chinese} Natural Language Processing},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  pages     = {657--668},
  publisher = {Association for Computational Linguistics},
  year      = {2020},
  doi       = {10.18653/v1/2020.findings-emnlp.58},
}