Deliberative Alignment: Reasoning Enables Safer Language Models. Guan, M. Y., Joglekar, M., Wallace, E., Jain, S., Barak, B., Helyar, A., Dias, R., Vallone, A., Ren, H., Wei, J., Chung, H. W., Toyer, S., Heidecke, J., Beutel, A., & Glaese, A. 2025.
Deliberative Alignment: Reasoning Enables Safer Language Models. Paper: https://arxiv.org/abs/2412.16339 (bibtex below; 1 download)
@comment{
  arXiv preprint entry: eprint 2412.16339, primary class cs.CL.
  `selected` is not a standard BibTeX field, so standard styles ignore it.
  NOTE(review): presumably read by the publication-list tool that hosts
  this entry - confirm before removing or renaming it.
}
@misc{guan2025deliberativealignmentreasoningenables,
  author        = {Melody Y. Guan and Manas Joglekar and Eric Wallace and Saachi Jain and Boaz Barak and Alec Helyar and Rachel Dias and Andrea Vallone and Hongyu Ren and Jason Wei and Hyung Won Chung and Sam Toyer and Johannes Heidecke and Alex Beutel and Amelia Glaese},
  title         = {Deliberative Alignment: Reasoning Enables Safer Language Models},
  year          = {2025},
  eprint        = {2412.16339},
  archivePrefix = {arXiv},
  primaryClass  = {cs.CL},
  url           = {https://arxiv.org/abs/2412.16339},
  selected      = {yes},
}

Downloads: 1