Deliberative Alignment: Reasoning Enables Safer Language Models. Guan, M. Y., Joglekar, M., Wallace, E., Jain, S., Barak, B., Helyar, A., Dias, R., Vallone, A., Ren, H., Wei, J., Chung, H. W., Toyer, S., Heidecke, J., Beutel, A., & Glaese, A. CoRR, 2024.
Paper doi bibtex
@article{DBLP:journals/corr/abs-2412-16339,
author = {Melody Y. Guan and
Manas Joglekar and
Eric Wallace and
Saachi Jain and
Boaz Barak and
Alec Helyar and
Rachel Dias and
Andrea Vallone and
Hongyu Ren and
Jason Wei and
Hyung Won Chung and
Sam Toyer and
Johannes Heidecke and
Alex Beutel and
Amelia Glaese},
title = {Deliberative Alignment: Reasoning Enables Safer Language Models},
journal = {CoRR},
volume = {abs/2412.16339},
year = {2024},
url = {https://doi.org/10.48550/arXiv.2412.16339},
doi = {10.48550/ARXIV.2412.16339},
eprinttype = {arXiv},
eprint = {2412.16339},
timestamp = {Thu, 23 Jan 2025 00:00:00 +0100},
biburl = {https://dblp.org/rec/journals/corr/abs-2412-16339.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Downloads: 0
{"_id":"CAHoG6Eb6SoBGPsfq","bibbaseid":"guan-joglekar-wallace-jain-barak-helyar-dias-vallone-etal-deliberativealignmentreasoningenablessaferlanguagemodels-2024","author_short":["Guan, M. Y.","Joglekar, M.","Wallace, E.","Jain, S.","Barak, B.","Helyar, A.","Dias, R.","Vallone, A.","Ren, H.","Wei, J.","Chung, H. W.","Toyer, S.","Heidecke, J.","Beutel, A.","Glaese, A."],"bibdata":{"bibtype":"article","type":"article","author":[{"firstnames":["Melody","Y."],"propositions":[],"lastnames":["Guan"],"suffixes":[]},{"firstnames":["Manas"],"propositions":[],"lastnames":["Joglekar"],"suffixes":[]},{"firstnames":["Eric"],"propositions":[],"lastnames":["Wallace"],"suffixes":[]},{"firstnames":["Saachi"],"propositions":[],"lastnames":["Jain"],"suffixes":[]},{"firstnames":["Boaz"],"propositions":[],"lastnames":["Barak"],"suffixes":[]},{"firstnames":["Alec"],"propositions":[],"lastnames":["Helyar"],"suffixes":[]},{"firstnames":["Rachel"],"propositions":[],"lastnames":["Dias"],"suffixes":[]},{"firstnames":["Andrea"],"propositions":[],"lastnames":["Vallone"],"suffixes":[]},{"firstnames":["Hongyu"],"propositions":[],"lastnames":["Ren"],"suffixes":[]},{"firstnames":["Jason"],"propositions":[],"lastnames":["Wei"],"suffixes":[]},{"firstnames":["Hyung","Won"],"propositions":[],"lastnames":["Chung"],"suffixes":[]},{"firstnames":["Sam"],"propositions":[],"lastnames":["Toyer"],"suffixes":[]},{"firstnames":["Johannes"],"propositions":[],"lastnames":["Heidecke"],"suffixes":[]},{"firstnames":["Alex"],"propositions":[],"lastnames":["Beutel"],"suffixes":[]},{"firstnames":["Amelia"],"propositions":[],"lastnames":["Glaese"],"suffixes":[]}],"title":"Deliberative Alignment: Reasoning Enables Safer Language Models","journal":"CoRR","volume":"abs/2412.16339","year":"2024","url":"https://doi.org/10.48550/arXiv.2412.16339","doi":"10.48550/ARXIV.2412.16339","eprinttype":"arXiv","eprint":"2412.16339","timestamp":"Thu, 23 Jan 2025 00:00:00 +0100","biburl":"https://dblp.org/rec/journals/corr/abs-2412-16339.bib","bibsource":"dblp computer science bibliography, https://dblp.org","bibtex":"@article{DBLP:journals/corr/abs-2412-16339,\n author = {Melody Y. Guan and\n Manas Joglekar and\n Eric Wallace and\n Saachi Jain and\n Boaz Barak and\n Alec Helyar and\n Rachel Dias and\n Andrea Vallone and\n Hongyu Ren and\n Jason Wei and\n Hyung Won Chung and\n Sam Toyer and\n Johannes Heidecke and\n Alex Beutel and\n Amelia Glaese},\n title = {Deliberative Alignment: Reasoning Enables Safer Language Models},\n journal = {CoRR},\n volume = {abs/2412.16339},\n year = {2024},\n url = {https://doi.org/10.48550/arXiv.2412.16339},\n doi = {10.48550/ARXIV.2412.16339},\n eprinttype = {arXiv},\n eprint = {2412.16339},\n timestamp = {Thu, 23 Jan 2025 00:00:00 +0100},\n biburl = {https://dblp.org/rec/journals/corr/abs-2412-16339.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n\n","author_short":["Guan, M. Y.","Joglekar, M.","Wallace, E.","Jain, S.","Barak, B.","Helyar, A.","Dias, R.","Vallone, A.","Ren, H.","Wei, J.","Chung, H. W.","Toyer, S.","Heidecke, J.","Beutel, A.","Glaese, A."],"key":"DBLP:journals/corr/abs-2412-16339","id":"DBLP:journals/corr/abs-2412-16339","bibbaseid":"guan-joglekar-wallace-jain-barak-helyar-dias-vallone-etal-deliberativealignmentreasoningenablessaferlanguagemodels-2024","role":"author","urls":{"Paper":"https://doi.org/10.48550/arXiv.2412.16339"},"metadata":{"authorlinks":{}}},"bibtype":"article","biburl":"https://dblp.org/pid/b/BBarak.bib","dataSources":["miMWJWjn4sT6Q3GDs","763eChmCTEH5onHpy"],"keywords":[],"search_terms":["deliberative","alignment","reasoning","enables","safer","language","models","guan","joglekar","wallace","jain","barak","helyar","dias","vallone","ren","wei","chung","toyer","heidecke","beutel","glaese"],"title":"Deliberative Alignment: Reasoning Enables Safer Language Models","year":2024}