Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference. McCoy, T., Pavlick, E., & Linzen, T. In Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics, pages 3428–3448, Stroudsburg, PA, USA, 2019. Association for Computational Linguistics.
Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference [link]Paper  doi  abstract   bibtex   
Machine learning systems can often achieve high performance on a test set by relying on heuristics that are effective for frequent example types but break down in more challenging cases. We study this issue within natural language inference (NLI), the task of determining whether one sentence entails another. Based on an analysis of the task, we hypothesize three fallible syntactic heuristics that NLI models are likely to adopt: the lexical overlap heuristic, the subsequence heuristic, and the constituent heuristic. To determine whether models have adopted these heuristics, we introduce a controlled evaluation set called HANS (Heuristic Analysis for NLI Systems), which contains many examples where the heuristics fail. We find that models trained on MNLI, including the state-of-the-art model BERT, perform very poorly on HANS, suggesting that they have indeed adopted these heuristics. We conclude that there is substantial room for improvement in NLI systems, and that the HANS dataset can motivate and measure progress in this area.
@comment{McCoy2019: conference paper introducing the HANS evaluation set for NLI.
  The title is braced once only, so bibliography styles can apply their own casing.
  arxivId, file and keywords are reference-manager fields that standard styles ignore.}
@inproceedings{McCoy2019,
  abstract      = {Machine learning systems can often achieve high performance on a test set by relying on heuristics that are effective for frequent example types but break down in more challenging cases. We study this issue within natural language inference (NLI), the task of determining whether one sentence entails another. Based on an analysis of the task, we hypothesize three fallible syntactic heuristics that NLI models are likely to adopt: the lexical overlap heuristic, the subsequence heuristic, and the constituent heuristic. To determine whether models have adopted these heuristics, we introduce a controlled evaluation set called HANS (Heuristic Analysis for NLI Systems), which contains many examples where the heuristics fail. We find that models trained on MNLI, including the state-of-the-art model BERT, perform very poorly on HANS, suggesting that they have indeed adopted these heuristics. We conclude that there is substantial room for improvement in NLI systems, and that the HANS dataset can motivate and measure progress in this area.},
  address       = {Stroudsburg, PA, USA},
  archivePrefix = {arXiv},
  arxivId       = {1902.01007},
  author        = {McCoy, Tom and Pavlick, Ellie and Linzen, Tal},
  booktitle     = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics},
  doi           = {10.18653/v1/P19-1334},
  eprint        = {1902.01007},
  file          = {:Users/shanest/Documents/Library/McCoy, Pavlick, Linzen/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/McCoy, Pavlick, Linzen - 2019 - Right for the Wrong Reasons Diagnosing Syntactic Heuristics in Natural Language Inference.pdf:pdf},
  keywords      = {method: adversarial data,phenomenon: inference},
  pages         = {3428--3448},
  publisher     = {Association for Computational Linguistics},
  title         = {Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference},
  url           = {https://www.aclweb.org/anthology/P19-1334},
  year          = {2019},
}

Downloads: 0