Direct preference optimization: Your language model is secretly a reward model. Rafailov, R., Sharma, A., Mitchell, E., Manning, C. D., Ermon, S., & Finn, C. Advances in Neural Information Processing Systems, 36:53728–53741, 2023.
bibtex   
% NeurIPS paper: a conference-proceedings contribution, so the venue goes in booktitle.
@inproceedings{rafailov2023direct,
  author    = {Rafailov, Rafael and Sharma, Archit and Mitchell, Eric and Manning, Christopher D. and Ermon, Stefano and Finn, Chelsea},
  title     = {Direct Preference Optimization: Your Language Model is Secretly a Reward Model},
  booktitle = {Advances in Neural Information Processing Systems},
  volume    = {36},
  pages     = {53728--53741},
  year      = {2023},
}

Downloads: 0