Just Enough Shifts: Mitigating Over-Refusal in Aligned Language Models with Targeted Representation Fine-Tuning. Dabas, M., Chen, S., Fleming, C., Jin, M., & Jia, R. In The International Conference on Machine Learning (ICML), 2025. bibtex @inproceedings{2025_4C_LLMOverRefuse,
title={Just Enough Shifts: Mitigating Over-Refusal in Aligned Language Models with Targeted Representation Fine-Tuning},
author={Dabas, Mahavir and Chen, Si and Fleming, Charles and Jin, Ming and Jia, Ruoxi },
booktitle={The International Conference on Machine Learning (ICML)},
year={2025}
}
Downloads: 0
{"_id":"ZJyn3jdojwdW7QntX","bibbaseid":"dabas-chen-fleming-jin-jia-justenoughshiftsmitigatingoverrefusalinalignedlanguagemodelswithtargetedrepresentationfinetuning-2025","author_short":["Dabas, M.","Chen, S.","Fleming, C.","Jin, M.","Jia, R."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","title":"Just Enough Shifts: Mitigating Over-Refusal in Aligned Language Models with Targeted Representation Fine-Tuning","author":[{"propositions":[],"lastnames":["Dabas"],"firstnames":["Mahavir"],"suffixes":[]},{"propositions":[],"lastnames":["Chen"],"firstnames":["Si"],"suffixes":[]},{"propositions":[],"lastnames":["Fleming"],"firstnames":["Charles"],"suffixes":[]},{"propositions":[],"lastnames":["Jin"],"firstnames":["Ming"],"suffixes":[]},{"propositions":[],"lastnames":["Jia"],"firstnames":["Ruoxi"],"suffixes":[]}],"booktitle":"The International Conference on Machine Learning (ICML)","year":"2025","bibtex":"@inproceedings{2025_4C_LLMOverRefuse,\n title={Just Enough Shifts: Mitigating Over-Refusal in Aligned Language Models with Targeted Representation Fine-Tuning},\n author={Dabas, Mahavir and Chen, Si and Fleming, Charles and Jin, Ming and Jia, Ruoxi },\n booktitle={The International Conference on Machine Learning (ICML)},\n year={2025}\n}\n","author_short":["Dabas, M.","Chen, S.","Fleming, C.","Jin, M.","Jia, R."],"key":"2025_4C_LLMOverRefuse","id":"2025_4C_LLMOverRefuse","bibbaseid":"dabas-chen-fleming-jin-jia-justenoughshiftsmitigatingoverrefusalinalignedlanguagemodelswithtargetedrepresentationfinetuning-2025","role":"author","urls":{},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"http://www.jinming.tech/papers/myref.bib","dataSources":["sTzDHHaipTZWjp8oe"],"keywords":[],"search_terms":["enough","shifts","mitigating","over","refusal","aligned","language","models","targeted","representation","fine","tuning","dabas","chen","fleming","jin","jia"],"title":"Just Enough Shifts: Mitigating Over-Refusal in Aligned Language Models with Targeted Representation Fine-Tuning","year":2025}