% File: CompanionGuard-RL/paper/refs.bib

% ============================================================
%  CompanionGuard-RL References
% ============================================================

% ---- AI Companion / Character Platform Safety ----

% Safety-risk benchmark for AI character platforms; cited as an arXiv preprint.
@article{wei2025ai,
  author  = {Wei, Yiluo and Zhang, Peixian and Tyson, Gareth},
  title   = {Benchmarking and Understanding Safety Risks in {AI} Character Platforms},
  journal = {arXiv preprint arXiv:2512.01247},
  year    = {2025},
}

% Persona-grounded, multi-turn safety evaluation of AI companions; arXiv preprint.
% NOTE(review): arXiv identifiers begin with YYMM, so 2605.* denotes May 2026,
% while this entry previously gave the year as 2025. The year now follows the
% identifier; confirm both against the arXiv listing. The citation key is left
% unchanged so existing citations keep resolving.
@article{juneja2025persona,
  author  = {Juneja, Prerna and Lomidze, Lika},
  title   = {Persona-Grounded Safety Evaluation of {AI} Companions in Multi-Turn Conversations},
  journal = {arXiv preprint arXiv:2605.00227},
  year    = {2026},
}

% ---- Mental Health AI Safety ----

% VERA-MH: reliability/validity study of an open-source mental-health AI safety
% evaluation; arXiv preprint.
% NOTE(review): arXiv identifiers begin with YYMM, so 2602.* denotes February
% 2026, while this entry previously gave the year as 2025. The year now follows
% the identifier; confirm both against the arXiv listing. The citation key is
% left unchanged so existing citations keep resolving.
@article{bentley2025vera,
  author  = {Bentley, Kate H. and others},
  title   = {{VERA-MH}: Reliability and Validity of an Open-Source {AI} Safety Evaluation in Mental Health},
  journal = {arXiv preprint arXiv:2602.05088},
  year    = {2026},
}

% ---- Mental Health Text Detection ----

% CLPsych 2019 shared task overview (suicide-risk degree prediction on Reddit posts).
@inproceedings{zirikly2019clpsych,
  author    = {Zirikly, Ayah and Resnik, Philip and Uzuner, {\"O}zlem and Hollingshead, Kristy},
  title     = {{CLPsych} 2019 Shared Task: Predicting the Degree of Suicide Risk in {Reddit} Posts},
  booktitle = {Proceedings of the Sixth Workshop on Computational Linguistics and Clinical Psychology},
  year      = {2019},
  pages     = {24--33},
}

% Self-harm detection with LLMs via intent differentiation and emoji interpretation.
@inproceedings{ghosh2025shines,
  author    = {Ghosh, Soumitra and others},
  title     = {Just a Scratch: Enhancing {LLM} Capabilities for Self-harm Detection through Intent Differentiation and Emoji Interpretation},
  booktitle = {Proceedings of ACL 2025},
  year      = {2025},
}

% MentaLLaMA: interpretable mental-health analysis on social media with LLMs;
% arXiv preprint. The model name is spelled "MentaLLaMA" (one "l" before "LLaMA"),
% and the author list is given in full so the style can abbreviate it as needed.
% NOTE(review): author names corrected from the arXiv record for 2309.13567 --
% confirm against the listing.
@article{yang2023mentallama,
  author  = {Yang, Kailai and Zhang, Tianlin and Kuang, Ziyan and Xie, Qianqian and Huang, Jimin and Ananiadou, Sophia},
  title   = {{MentaLLaMA}: Interpretable Mental Health Analysis on Social Media with Large Language Models},
  journal = {arXiv preprint arXiv:2309.13567},
  year    = {2023},
}

% ---- General LLM Safety / Guard Models ----

% Llama Guard: LLM-based input/output safeguard model; arXiv preprint.
% "AI" in "Human-AI" is brace-protected so sentence-casing styles do not
% lowercase the acronym.
@article{inan2023llama,
  author  = {Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and others},
  title   = {{Llama Guard}: {LLM}-based Input-Output Safeguard for Human-{AI} Conversations},
  journal = {arXiv preprint arXiv:2312.06674},
  year    = {2023},
}

% Llama 3 model-family report; cited as an arXiv preprint.
@article{dubey2024llama3,
  author  = {Dubey, Abhimanyu and others},
  title   = {The {Llama 3} Herd of Models},
  journal = {arXiv preprint arXiv:2407.21783},
  year    = {2024},
}

% WildGuard moderation tools (safety risks, jailbreaks, refusals); arXiv preprint.
@article{han2024wildguard,
  author  = {Han, Seungju and others},
  title   = {{WildGuard}: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of {LLMs}},
  journal = {arXiv preprint arXiv:2406.18495},
  year    = {2024},
}

% Aegis2.0 safety dataset and risk taxonomy for LLM guardrails; arXiv preprint.
@article{ghosh2025aegis,
  author  = {Ghosh, Shaona and others},
  title   = {{Aegis2.0}: A Diverse {AI} Safety Dataset and Risks Taxonomy for Alignment of {LLM} Guardrails},
  journal = {arXiv preprint arXiv:2501.09004},
  year    = {2025},
}

% OpenAI Moderation API announcement (web source, corporate author).
% NOTE(review): web-only reference -- confirm that the URL still resolves and
% that the title matches the page heading.
@misc{openai2022moderation,
  author       = {{OpenAI}},
  title        = {Introducing {OpenAI} {Moderation API}},
  howpublished = {\url{https://openai.com/blog/new-and-improved-content-moderation}},
  year         = {2022},
}

% ---- Safety Benchmarks ----

% SALAD-Bench hierarchical safety benchmark for LLMs; arXiv preprint.
@article{li2024saladbench,
  author  = {Li, Lijun and Dong, Bowen and Wang, Ruohui and others},
  title   = {{SALAD-Bench}: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models},
  journal = {arXiv preprint arXiv:2402.05044},
  year    = {2024},
}

% HarmBench evaluation framework for automated red teaming; arXiv preprint.
@article{mazeika2024harmbench,
  author  = {Mazeika, Mantas and Phan, Long and others},
  title   = {{HarmBench}: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal},
  journal = {arXiv preprint arXiv:2402.04249},
  year    = {2024},
}

% ---- RL / RLHF ----

% Proximal Policy Optimization (PPO); cited as an arXiv preprint.
@article{schulman2017ppo,
  author  = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title   = {Proximal Policy Optimization Algorithms},
  journal = {arXiv preprint arXiv:1707.06347},
  year    = {2017},
}

% InstructGPT: instruction following via human feedback.
% "Advances in Neural Information Processing Systems" is a conference
% proceedings series, so the entry is typed as @inproceedings and the
% proceedings title lives in booktitle rather than journal.
@inproceedings{ouyang2022instructgpt,
  author    = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and others},
  title     = {Training Language Models to Follow Instructions with Human Feedback},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {35},
  pages     = {27730--27744},
  year      = {2022},
}

% ---- Backbone Model ----

% Backbone model reference: revisiting pre-trained models for Chinese NLP.
% Findings of EMNLP is a proceedings publication, so the entry is typed as
% @inproceedings with the full proceedings title in booktitle.
% NOTE(review): pages, publisher and DOI were added from the ACL Anthology
% record -- confirm against the Anthology entry.
@inproceedings{cui2020macbert,
  author    = {Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Yang, Ziqing},
  title     = {Revisiting Pre-Trained Models for {Chinese} Natural Language Processing},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  pages     = {657--668},
  publisher = {Association for Computational Linguistics},
  year      = {2020},
  doi       = {10.18653/v1/2020.findings-emnlp.58},
}