Files
CompanionGuard-RL/paper/refs.bib
zhangsiyuan 804ebd2f77 feat: add paper/ LaTeX draft, English data scripts, update progress docs
- paper/: 22-page LaTeX framework (7/10 sections complete, compiles cleanly)
  main.tex + 10 section files + refs.bib + compiled PDF (329KB)
- code/scripts/: three English dataset generation & merging scripts
  generate_english.py / generate_english_targeted.py / merge_v5.py
- CLAUDE.md: update paper writing status, add paper/ file map entry
- state.md: add section 8 paper writing progress (2026-05-15)
- .gitignore: add LaTeX build artifact exclusion rules

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-18 11:19:39 +08:00

133 lines
4.3 KiB
BibTeX

% ============================================================
% CompanionGuard-RL References (参考文献)
% ============================================================
% ---- AI Companion / Character Platform Safety ----
% arXiv preprint 2512.01247 on safety risks in AI character platforms.
@article{wei2025ai,
  author  = {Wei, Yiluo and Zhang, Peixian and Tyson, Gareth},
  title   = {Benchmarking and Understanding Safety Risks in {AI} Character Platforms},
  journal = {arXiv preprint arXiv:2512.01247},
  year    = {2025},
}
% NOTE(review): the arXiv identifier prefix 2605 encodes a May 2026 submission,
% so the year is set to 2026 to agree with it. The citation key is unchanged so
% existing \cite commands still resolve. Confirm against the arXiv listing.
@article{juneja2025persona,
  author  = {Juneja, Prerna and Lomidze, Lika},
  title   = {Persona-Grounded Safety Evaluation of {AI} Companions in Multi-Turn Conversations},
  journal = {arXiv preprint arXiv:2605.00227},
  year    = {2026},
}
% ---- Mental Health AI Safety ----
% NOTE(review): the arXiv identifier prefix 2602 encodes a February 2026
% submission, so the year is set to 2026 to agree with it. The citation key is
% unchanged so existing \cite commands still resolve. Confirm against the arXiv
% listing.
@article{bentley2025vera,
  author  = {Bentley, Kate H. and others},
  title   = {{VERA-MH}: Reliability and Validity of an Open-Source {AI} Safety Evaluation in Mental Health},
  journal = {arXiv preprint arXiv:2602.05088},
  year    = {2026},
}
% ---- Mental Health Text Detection ----
% CLPsych 2019 shared-task overview paper (workshop proceedings, pp. 24--33).
@inproceedings{zirikly2019clpsych,
  author    = {Zirikly, Ayah and Resnik, Philip and Uzuner, {\"O}zlem and Hollingshead, Kristy},
  title     = {{CLPsych} 2019 Shared Task: Predicting the Degree of Suicide Risk in {Reddit} Posts},
  booktitle = {Proceedings of the Sixth Workshop on Computational Linguistics and Clinical Psychology},
  pages     = {24--33},
  year      = {2019},
}
% Venue written as the full proceedings title, matching the naming style of the
% other @inproceedings entry in this file.
% NOTE(review): confirm the exact volume (e.g. Long Papers) and add pages.
@inproceedings{ghosh2025shines,
  author    = {Ghosh, Soumitra and others},
  title     = {Just a Scratch: Enhancing {LLM} Capabilities for Self-harm Detection through Intent Differentiation and Emoji Interpretation},
  booktitle = {Proceedings of the 63rd Annual Meeting of the Association for Computational Linguistics},
  year      = {2025},
}
% Author list and model name corrected: the first author is Kailai Yang, the
% second is Tianlin Zhang, and the model is spelled "MentaLLaMA". The full
% author list replaces the previous truncated one, which placed `and others`
% after the final author.
% NOTE(review): author list taken from the latest arXiv version — confirm.
@article{yang2023mentallama,
  author  = {Yang, Kailai and Zhang, Tianlin and Kuang, Ziyan and Xie, Qianqian and Huang, Jimin and Ananiadou, Sophia},
  title   = {{MentaLLaMA}: Interpretable Mental Health Analysis on Social Media with Large Language Models},
  journal = {arXiv preprint arXiv:2309.13567},
  year    = {2023},
}
% ---- General LLM Safety / Guard Models ----
% "AI" in "Human-AI" is brace-protected so sentence-casing bibliography styles
% do not lower-case the acronym, consistent with the other entries in this file.
@article{inan2023llama,
  author  = {Inan, Hakan and Upasani, Kartikeya and Chi, Jianfeng and others},
  title   = {{Llama Guard}: {LLM}-based Input-Output Safeguard for Human-{AI} Conversations},
  journal = {arXiv preprint arXiv:2312.06674},
  year    = {2023},
}
% arXiv preprint 2407.21783; author list truncated with `and others`.
@article{dubey2024llama3,
  author  = {Dubey, Abhimanyu and others},
  title   = {The {Llama 3} Herd of Models},
  journal = {arXiv preprint arXiv:2407.21783},
  year    = {2024},
}
% arXiv preprint 2406.18495; author list truncated with `and others`.
@article{han2024wildguard,
  author  = {Han, Seungju and others},
  title   = {{WildGuard}: Open One-Stop Moderation Tools for Safety Risks, Jailbreaks, and Refusals of {LLMs}},
  journal = {arXiv preprint arXiv:2406.18495},
  year    = {2024},
}
% arXiv preprint 2501.09004; author list truncated with `and others`.
@article{ghosh2025aegis,
  author  = {Ghosh, Shaona and others},
  title   = {{Aegis2.0}: A Diverse {AI} Safety Dataset and Risks Taxonomy for Alignment of {LLM} Guardrails},
  journal = {arXiv preprint arXiv:2501.09004},
  year    = {2025},
}
% Blog post announcing the OpenAI Moderation endpoint. The URL slug was
% truncated (missing "-tooling") and the title now matches the post's own title.
% The corporate author is double-braced so it is treated as a single name.
% NOTE(review): confirm the URL still resolves; consider adding an access date.
@misc{openai2022moderation,
  author       = {{OpenAI}},
  title        = {New and Improved Content Moderation Tooling},
  year         = {2022},
  howpublished = {\url{https://openai.com/blog/new-and-improved-content-moderation-tooling}},
}
% ---- Safety Benchmarks ----
% arXiv preprint 2402.05044; author list truncated with `and others`.
@article{li2024saladbench,
  author  = {Li, Lijun and Dong, Bowen and Wang, Ruohui and others},
  title   = {{SALAD-Bench}: A Hierarchical and Comprehensive Safety Benchmark for Large Language Models},
  journal = {arXiv preprint arXiv:2402.05044},
  year    = {2024},
}
% arXiv preprint 2402.04249; author list truncated with `and others`.
@article{mazeika2024harmbench,
  author  = {Mazeika, Mantas and Phan, Long and others},
  title   = {{HarmBench}: A Standardized Evaluation Framework for Automated Red Teaming and Robust Refusal},
  journal = {arXiv preprint arXiv:2402.04249},
  year    = {2024},
}
% ---- RL / RLHF ----
% arXiv preprint 1707.06347 introducing Proximal Policy Optimization.
@article{schulman2017ppo,
  author  = {Schulman, John and Wolski, Filip and Dhariwal, Prafulla and Radford, Alec and Klimov, Oleg},
  title   = {Proximal Policy Optimization Algorithms},
  journal = {arXiv preprint arXiv:1707.06347},
  year    = {2017},
}
% NeurIPS volume 35 paper; author list truncated with `and others`.
@article{ouyang2022instructgpt,
  author  = {Ouyang, Long and Wu, Jeffrey and Jiang, Xu and others},
  title   = {Training Language Models to Follow Instructions with Human Feedback},
  journal = {Advances in Neural Information Processing Systems},
  volume  = {35},
  pages   = {27730--27744},
  year    = {2022},
}
% ---- Backbone Model ----
% Conference paper, so it is typed @inproceedings with the venue in `booktitle`
% rather than `journal`. The author list is corrected to this paper's six
% authors, and the page range is added.
% NOTE(review): authors and pages taken from the ACL Anthology record — confirm.
@inproceedings{cui2020macbert,
  author    = {Cui, Yiming and Che, Wanxiang and Liu, Ting and Qin, Bing and Wang, Shijin and Hu, Guoping},
  title     = {Revisiting Pre-Trained Models for {Chinese} Natural Language Processing},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP} 2020},
  pages     = {657--668},
  year      = {2020},
}