Towards Understanding Jailbreak Attacks in LLMs: A Representation Space Analysis. Lin, Y., He, P., Xu, H., Xing, Y., Yamada, M., Liu, H., & Tang, J. In EMNLP, 2024.
bibtex   
@inproceedings{yuping-emnlp-2024-1,
  author    = {Lin, Yuping and He, Pengfei and Xu, Han and Xing, Yue and Yamada, Makoto and Liu, Hui and Tang, Jiliang},
  title     = {Towards Understanding Jailbreak Attacks in {LLMs}: A Representation Space Analysis},
  booktitle = {EMNLP},
  year      = {2024},
}

Downloads: 0