LLMs Encode Harmfulness and Refusal Separately. Zhao, J., Huang, J., Wu, Z., Bau, D., & Shi, W. arXiv preprint arXiv:2507.11878, 2025.
bibtex   
@misc{zhao2025harmfulness,
  author        = {Zhao, Jiachen and Huang, Jing and Wu, Zhengxuan and Bau, David and Shi, Weiyan},
  title         = {{LLMs} Encode Harmfulness and Refusal Separately},
  year          = {2025},
  eprint        = {2507.11878},
  archiveprefix = {arXiv},
  url           = {https://arxiv.org/abs/2507.11878},
}

Downloads: 0