Targeted Syntactic Evaluation of Language Models. Marvin, R. & Linzen, T. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 1192–1202, Stroudsburg, PA, USA, 2018. Association for Computational Linguistics.

Abstract: We present a dataset for evaluating the grammaticality of the predictions of a language model. We automatically construct a large number of minimally different pairs of English sentences, each consisting of a grammatical and an ungrammatical sentence. The sentence pairs represent different variations of structure-sensitive phenomena: subject-verb agreement, reflexive anaphora, and negative polarity items. We expect a language model to assign a higher probability to the grammatical sentence than to the ungrammatical one. In an experiment using this dataset, an LSTM language model performed poorly on many of the constructions. Multi-task training with a syntactic objective (CCG supertagging) improved the LSTM's accuracy, but a large gap remained between its performance and the accuracy of human participants recruited online. This suggests that there is considerable room for improvement over LSTMs in capturing syntax in a language model.
@inproceedings{Marvin2018,
abstract = {We present a dataset for evaluating the grammaticality of the predictions of a language model. We automatically construct a large number of minimally different pairs of English sentences, each consisting of a grammatical and an ungrammatical sentence. The sentence pairs represent different variations of structure-sensitive phenomena: subject-verb agreement, reflexive anaphora and negative polarity items. We expect a language model to assign a higher probability to the grammatical sentence than the ungrammatical one. In an experiment using this data set, an LSTM language model performed poorly on many of the constructions. Multi-task training with a syntactic objective (CCG supertagging) improved the LSTM's accuracy, but a large gap remained between its performance and the accuracy of human participants recruited online. This suggests that there is considerable room for improvement over LSTMs in capturing syntax in a language model.},
address = {Stroudsburg, PA, USA},
author = {Marvin, Rebecca and Linzen, Tal},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
doi = {10.18653/v1/D18-1151},
keywords = {method: acceptability judgment,method: psycholinguistic,phenomenon: NPIs,phenomenon: anaphora,phenomenon: number agreement},
pages = {1192--1202},
publisher = {Association for Computational Linguistics},
title = {{Targeted Syntactic Evaluation of Language Models}},
url = {https://aclanthology.org/D18-1151/},
year = {2018}
}
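
The evaluation protocol described in the abstract is simple: for each minimal pair, the language model counts as correct when it assigns a higher probability to the grammatical sentence than to the ungrammatical one, and accuracy is the fraction of pairs scored correctly. Below is a minimal sketch of that scoring comparison. It assumes a generic pretrained causal LM (GPT-2 via the Hugging Face transformers library) rather than the paper's LSTM, and the example pair is merely in the style of the dataset's agreement items, not a verbatim item from it.

# Minimal-pair scoring in the spirit of Marvin & Linzen (2018):
# compare total sentence log-probabilities under a causal LM.
# GPT-2 is an illustrative stand-in for the paper's LSTM LM.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")
model.eval()

def sentence_logprob(sentence: str) -> float:
    """Total log-probability of a sentence under the causal LM."""
    ids = tokenizer(sentence, return_tensors="pt").input_ids
    with torch.no_grad():
        # With labels == input_ids, the model returns the mean
        # cross-entropy over the predicted tokens; multiplying by
        # the number of scored tokens recovers the total log-prob.
        loss = model(input_ids=ids, labels=ids).loss
    return -loss.item() * (ids.size(1) - 1)

# Hypothetical minimal pair (subject-verb agreement across a
# relative clause), in the style of the dataset:
grammatical = "The author that the guards like laughs."
ungrammatical = "The author that the guards like laugh."

correct = sentence_logprob(grammatical) > sentence_logprob(ungrammatical)
print("LM prefers grammatical variant:", correct)

Run over the full dataset, per-construction accuracy is just the mean of these binary comparisons within each phenomenon, which is how the paper reports the gap between the LSTM and human participants.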