Stress Test Evaluation for Natural Language Inference

Stress Test Evaluation for Natural Language Inference. Naik, A., Ravichander, A., Sadeh, N., Rose, C., & Neubig, G. In Proceedings of the 27th International Conference on Computational Linguistics (COLING), pages 2340–2353, 2018.

Paper abstract bibtex

Natural language inference (NLI) is the task of determining if a natural language hypothesis can be inferred from a given premise in a justifiable manner. NLI was proposed as a benchmark task for natural language understanding. Existing models perform well at standard datasets for NLI, achieving impressive results across different genres of text. However, the extent to which these models understand the semantic content of sentences is unclear. In this work, we propose an evaluation methodology consisting of automatically constructed "stress tests" that allow us to examine whether systems have the ability to make real inferential decisions. Our evaluation of six sentence-encoder models on these stress tests reveals strengths and weaknesses of these models with respect to challenging linguistic phenomena, and suggests important directions for future work in this area.

@comment{Naik2018: COLING 2018 conference paper; the archivePrefix/arxivId/eprint fields identify the arXiv copy (1806.00692).}
@inproceedings{Naik2018,
  abstract      = {Natural language inference (NLI) is the task of determining if a natural language hypothesis can be inferred from a given premise in a justifiable manner. NLI was proposed as a benchmark task for natural language understanding. Existing models perform well at standard datasets for NLI, achieving impressive results across different genres of text. However, the extent to which these models understand the semantic content of sentences is unclear. In this work, we propose an evaluation methodology consisting of automatically constructed "stress tests" that allow us to examine whether systems have the ability to make real inferential decisions. Our evaluation of six sentence-encoder models on these stress tests reveals strengths and weaknesses of these models with respect to challenging linguistic phenomena, and suggests important directions for future work in this area.},
  archivePrefix = {arXiv},
  arxivId       = {1806.00692},
  author        = {Naik, Aakanksha and Ravichander, Abhilasha and Sadeh, Norman and Rose, Carolyn and Neubig, Graham},
  booktitle     = {Proceedings of the 27th International Conference on Computational Linguistics ({COLING})},
  eprint        = {1806.00692},
  file          = {:Users/shanest/Documents/Library/Naik et al/Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)/Naik et al. - 2018 - Stress Test Evaluation for Natural Language Inference.pdf:pdf},
  keywords      = {method: adversarial data},
  pages         = {2340--2353},
  title         = {Stress Test Evaluation for Natural Language Inference},
  url           = {https://www.aclweb.org/anthology/C18-1198},
  year          = {2018},
}

Downloads: 0

{"_id":"GJqHhS9xXBwgeHPcR","bibbaseid":"naik-ravichander-sadeh-rose-neubig-stresstestevaluationfornaturallanguageinference-2018","authorIDs":[],"author_short":["Naik, A.","Ravichander, A.","Sadeh, N.","Rose, C.","Neubig, G."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"Natural language inference (NLI) is the task of determining if a natural language hypothesis can be inferred from a given premise in a justifiable manner. NLI was proposed as a benchmark task for natural language understanding. Existing models perform well at standard datasets for NLI, achieving impressive results across different genres of text. However, the extent to which these models understand the semantic content of sentences is unclear. In this work, we propose an evaluation methodology consisting of automatically constructed \"stress tests\" that allow us to examine whether systems have the ability to make real inferential decisions. Our evaluation of six sentence-encoder models on these stress tests reveals strengths and weaknesses of these models with respect to challenging linguistic phenomena, and suggests important directions for future work in this area.","archiveprefix":"arXiv","arxivid":"1806.00692","author":[{"propositions":[],"lastnames":["Naik"],"firstnames":["Aakanksha"],"suffixes":[]},{"propositions":[],"lastnames":["Ravichander"],"firstnames":["Abhilasha"],"suffixes":[]},{"propositions":[],"lastnames":["Sadeh"],"firstnames":["Norman"],"suffixes":[]},{"propositions":[],"lastnames":["Rose"],"firstnames":["Carolyn"],"suffixes":[]},{"propositions":[],"lastnames":["Neubig"],"firstnames":["Graham"],"suffixes":[]}],"booktitle":"Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)","eprint":"1806.00692","file":":Users/shanest/Documents/Library/Naik et al/Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)/Naik et al. 
- 2018 - Stress Test Evaluation for Natural Language Inference.pdf:pdf","keywords":"method: adversarial data","pages":"2340–2353","title":"Stress Test Evaluation for Natural Language Inference","url":"https://www.aclweb.org/anthology/C18-1198","year":"2018","bibtex":"@inproceedings{Naik2018,\nabstract = {Natural language inference (NLI) is the task of determining if a natural language hypothesis can be inferred from a given premise in a justifiable manner. NLI was proposed as a benchmark task for natural language understanding. Existing models perform well at standard datasets for NLI, achieving impressive results across different genres of text. However, the extent to which these models understand the semantic content of sentences is unclear. In this work, we propose an evaluation methodology consisting of automatically constructed \"stress tests\" that allow us to examine whether systems have the ability to make real inferential decisions. Our evaluation of six sentence-encoder models on these stress tests reveals strengths and weaknesses of these models with respect to challenging linguistic phenomena, and suggests important directions for future work in this area.},\narchivePrefix = {arXiv},\narxivId = {1806.00692},\nauthor = {Naik, Aakanksha and Ravichander, Abhilasha and Sadeh, Norman and Rose, Carolyn and Neubig, Graham},\nbooktitle = {Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)},\neprint = {1806.00692},\nfile = {:Users/shanest/Documents/Library/Naik et al/Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)/Naik et al. 
- 2018 - Stress Test Evaluation for Natural Language Inference.pdf:pdf},\nkeywords = {method: adversarial data},\npages = {2340--2353},\ntitle = {{Stress Test Evaluation for Natural Language Inference}},\nurl = {https://www.aclweb.org/anthology/C18-1198},\nyear = {2018}\n}\n","author_short":["Naik, A.","Ravichander, A.","Sadeh, N.","Rose, C.","Neubig, G."],"key":"Naik2018","id":"Naik2018","bibbaseid":"naik-ravichander-sadeh-rose-neubig-stresstestevaluationfornaturallanguageinference-2018","role":"author","urls":{"Paper":"https://www.aclweb.org/anthology/C18-1198"},"keyword":["method: adversarial data"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"inproceedings","biburl":"https://www.shane.st/teaching/575/win20/MachineLearning-interpretability.bib","creationDate":"2020-01-06T00:27:01.040Z","downloads":0,"keywords":["method: adversarial data"],"search_terms":["stress","test","evaluation","natural","language","inference","naik","ravichander","sadeh","rose","neubig"],"title":"Stress Test Evaluation for Natural Language Inference","year":2018,"dataSources":["okYcdTpf4JJ2zkj7A","znj7izS5PeehdLR3G","aGtG992oMsrqA3Aas"]}