<script src="https://bibbase.org/show?bib=https%3A%2F%2Fwww.shane.st%2Fteaching%2F575%2Fwin20%2FMachineLearning-interpretability.bib&jsonp=1"></script>
<?php
$contents = file_get_contents("https://bibbase.org/show?bib=https%3A%2F%2Fwww.shane.st%2Fteaching%2F575%2Fwin20%2FMachineLearning-interpretability.bib");
print_r($contents);
?>
<iframe src="https://bibbase.org/show?bib=https%3A%2F%2Fwww.shane.st%2Fteaching%2F575%2Fwin20%2FMachineLearning-interpretability.bib"></iframe>
For more details see the documention.
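Beyond embedding the rendered page, the underlying BibTeX file can also be fetched and parsed directly. Here is a minimal Python sketch (an illustration, not part of BibBase's documentation); it assumes the decoded source URL from the snippets above is still reachable and that titles are double-braced, as in the entries below.

import re
import urllib.request

# Decoded from the bib= parameter in the embed snippets above.
BIB_URL = "https://www.shane.st/teaching/575/win20/MachineLearning-interpretability.bib"

with urllib.request.urlopen(BIB_URL) as resp:
    bib = resp.read().decode("utf-8")

# Each entry opens with "@type{key,"; titles in this file are double-braced.
keys = re.findall(r"@\w+\{([^,\s]+),", bib)
titles = re.findall(r"title\s*=\s*\{\{(.+?)\}\}", bib)

for key, title in zip(keys, titles):
    print(f"{key}: {title}")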
@inproceedings{Lan2020, author = {Lan, Zhenzhong and Chen, Mingda and Goodman, Sebastian and Gimpel, Kevin and Sharma, Piyush and Soricut, Radu}, booktitle = {International Conference on Learning Representations}, file = {:Users/shanest/Documents/Library/Lan et al/International Conference of Learning Representations/Lan et al. - 2020 - ALBERT A Lite BERT for Self-supervised Learning of Language Representations.pdf:pdf}, keywords = {model}, title = {{ALBERT: A Lite BERT for Self-supervised Learning of Language Representations}}, url = {https://openreview.net/forum?id=H1eA7AEtvS}, year = {2020} }
@inproceedings{Kim2020, author = {Kim, Taeuk and Choi, Jihun and Lee, Sang-goo and Edmiston, Daniel}, booktitle = {International Conference on Learning Representations}, file = {:Users/shanest/Documents/Library/Kim et al/International Conference of Learning Representations/Kim et al. - 2020 - Are Pre-trained Language Models Aware of Phrases Simple but Strong Baselines for Grammar Inductio.pdf:pdf}, keywords = {method: syntactic distance}, title = {{Are Pre-trained Language Models Aware of Phrases? Simple but Strong Baselines for Grammar Induction}}, year = {2020} }
@inproceedings{Keysers2020, author = {Keysers, Daniel and Sch\"arli, Nathanael and Scales, Nathan and Buisman, Hylke and Furrer, Daniel and Kashubin, Sergii and Momchev, Nikola and Sinopalnikov, Danila and Stafiniak, Lukasz and Tihon, Tibor and Tsarkov, Dmitry and Wang, Xiao and van Zee, Marc and Bousquet, Olivier}, booktitle = {International Conference on Learning Representations}, file = {:Users/shanest/Documents/Library/Keysers et al/International Conference of Learning Representations/Keysers et al. - 2020 - Measuring Compositional Generalization A Comprehensive Method on Realistic Data.pdf:pdf}, keywords = {method: new data,phenomenon: compositionality}, title = {{Measuring Compositional Generalization: A Comprehensive Method on Realistic Data}}, url = {https://openreview.net/pdf?id=SygcCnNKwr}, year = {2020} }
@article{Baroni2020, abstract = {In the last decade, deep artificial neural networks have achieved astounding performance in many natural language-processing tasks. Given the high productivity of language, these models must possess effective generalization abilities. It is widely assumed that humans handle linguistic productivity by means of algebraic compositional rules: are deep networks similarly compositional? After reviewing the main innovations characterizing current deep language-processing networks, I discuss a set of studies suggesting that deep networks are capable of subtle grammar-dependent generalizations, but also that they do not rely on systematic compositional rules. I argue that the intriguing behaviour of these devices (still awaiting a full understanding) should be of interest to linguists and cognitive scientists, as it offers a new perspective on possible computational strategies to deal with linguistic productivity beyond rule-based compositionality, and it might lead to new insights into the less systematic generalization patterns that also appear in natural language.}, archivePrefix = {arXiv}, arxivId = {1904.00157}, author = {Baroni, Marco}, doi = {10.1098/rstb.2019.0307}, eprint = {1904.00157}, file = {:Users/shanest/Documents/Library/Baroni/Philosophical Transactions of the Royal Society B Biological Sciences/Baroni - 2020 - Linguistic generalization and compositionality in modern artificial neural networks.pdf:pdf}, issn = {0962-8436}, journal = {Philosophical Transactions of the Royal Society B: Biological Sciences}, keywords = {phenomenon: compositionality,survey}, number = {1791}, title = {{Linguistic generalization and compositionality in modern artificial neural networks}}, url = {https://royalsocietypublishing.org/doi/10.1098/rstb.2019.0307}, volume = {375}, year = {2020} }
@inproceedings{Zhou2020, abstract = {Contextualized representations trained over large raw text data have given remarkable improvements for NLP tasks including question answering and reading comprehension. There have been works showing that syntactic, semantic and word sense knowledge are contained in such representations, which explains why they benefit such tasks. However, relatively little work has been done investigating commonsense knowledge contained in contextualized representations, which is crucial for human question answering and reading comprehension. We study the commonsense ability of GPT, BERT, XLNet, and RoBERTa by testing them on seven challenging benchmarks, finding that language modeling and its variants are effective objectives for promoting models' commonsense ability while bi-directional context and larger training set are bonuses. We additionally find that current models do poorly on tasks requiring more necessary inference steps. Finally, we test the robustness of models by making dual test cases, which are correlated so that the correct prediction of one sample should lead to correct prediction of the other. Interestingly, the models show confusion on these test cases, which suggests that they learn commonsense at the surface rather than the deep level. We release a test set, named CATs publicly, for future research.}, archivePrefix = {arXiv}, arxivId = {1911.11931}, author = {Zhou, Xuhui and Zhang, Yue and Cui, Leyang and Huang, Dandan}, booktitle = {Association for the Advancement of Artificial Intelligence (AAAI)}, eprint = {1911.11931}, file = {:Users/shanest/Documents/Library/Zhou et al/Association for the Advancement of Artificial Intelligence (AAAI)/Zhou et al. - 2020 - Evaluating Commonsense in Pre-trained Language Models.pdf:pdf}, keywords = {dataset,method: new data,phenomenon: commonsense}, title = {{Evaluating Commonsense in Pre-trained Language Models}}, url = {http://arxiv.org/abs/1911.11931}, year = {2020} }
@inproceedings{Voita2019a, abstract = {Multi-head self-attention is a key component of the Transformer, a state-of-the-art architecture for neural machine translation. In this work we evaluate the contribution made by individual attention heads in the encoder to the overall performance of the model and analyze the roles played by them. We find that the most important and confident heads play consistent and often linguistically-interpretable roles. When pruning heads using a method based on stochastic gates and a differentiable relaxation of the L0 penalty, we observe that specialized heads are last to be pruned. Our novel pruning method removes the vast majority of heads without seriously affecting performance. For example, on the English-Russian WMT dataset, pruning 38 out of 48 encoder heads results in a drop of only 0.15 BLEU.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1905.09418}, author = {Voita, Elena and Talbot, David and Moiseev, Fedor and Sennrich, Rico and Titov, Ivan}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1580}, eprint = {1905.09418}, file = {:Users/shanest/Documents/Library/Voita et al/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Voita et al. - 2019 - Analyzing Multi-Head Self-Attention Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned.pdf:pdf}, keywords = {method: attention,method: pruning}, pages = {5797--5808}, publisher = {Association for Computational Linguistics}, title = {{Analyzing Multi-Head Self-Attention: Specialized Heads Do the Heavy Lifting, the Rest Can Be Pruned}}, url = {https://www.aclweb.org/anthology/P19-1580}, year = {2019} }
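The pruning result in the entry above relies on stochastic gates trained with a differentiable relaxation of the L0 penalty. A rough sketch of that ingredient, the hard-concrete gate of Louizos et al. (2018) that the paper builds on, follows; the constants are the standard ones, and the wiring is an illustrative assumption rather than the paper's exact setup.

import math
import torch

BETA, GAMMA, ZETA = 2.0 / 3.0, -0.1, 1.1  # standard hard-concrete constants

def sample_gates(log_alpha):
    # One gate in [0, 1] per attention head, differentiable w.r.t. log_alpha.
    u = torch.rand_like(log_alpha).clamp(1e-6, 1 - 1e-6)
    s = torch.sigmoid((torch.log(u) - torch.log(1 - u) + log_alpha) / BETA)
    return torch.clamp(s * (ZETA - GAMMA) + GAMMA, 0.0, 1.0)

def expected_l0(log_alpha):
    # Expected number of open gates: the differentiable L0 penalty.
    return torch.sigmoid(log_alpha - BETA * math.log(-GAMMA / ZETA)).sum()

n_heads = 48  # matching the 48 encoder heads mentioned in the abstract above
log_alpha = torch.zeros(n_heads, requires_grad=True)
gates = sample_gates(log_alpha)   # multiply each head's output by its gate
penalty = expected_l0(log_alpha)  # add lambda * penalty to the task loss
print(gates.shape, float(penalty))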
@inproceedings{Jawahar2019, abstract = {BERT is a recent language representation model that has surprisingly performed well in diverse language understanding benchmarks. This result indicates the possibility that BERT networks capture structural information about language. In this work, we provide novel support for this claim by performing a series of experiments to unpack the elements of English language structure learned by BERT. We first show that BERT's phrasal representation captures phrase-level information in the lower layers. We also show that BERT's intermediate layers encode a rich hierarchy of linguistic information, with surface features at the bottom, syntactic features in the middle and semantic features at the top. BERT turns out to require deeper layers when long-distance dependency information is required, e.g. to track subject-verb agreement. Finally, we show that BERT representations capture linguistic information in a compositional way that mimics classical, tree-like structures.}, address = {Stroudsburg, PA, USA}, author = {Jawahar, Ganesh and Sagot, Beno{\^{i}}t and Seddah, Djam{\'{e}}}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1356}, file = {:Users/shanest/Documents/Library/Jawahar, Sagot, Seddah/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Jawahar, Sagot, Seddah - 2019 - What Does BERT Learn about the Structure of Language.pdf:pdf}, keywords = {method: diagnostic classifier,phenomenon: compositionality,phenomenon: various}, pages = {3651--3657}, publisher = {Association for Computational Linguistics}, title = {{What Does BERT Learn about the Structure of Language?}}, url = {https://www.aclweb.org/anthology/P19-1356}, year = {2019} }
@inproceedings{Kovaleva2019, abstract = {BERT-based architectures currently give state-of-the-art performance on many NLP tasks, but little is known about the exact mechanisms that contribute to its success. In the current work, we focus on the interpretation of self-attention, which is one of the fundamental underlying components of BERT. Using a subset of GLUE tasks and a set of handcrafted features-of-interest, we propose the methodology and carry out a qualitative and quantitative analysis of the information encoded by the individual BERT's heads. Our findings suggest that there is a limited set of attention patterns that are repeated across different heads, indicating the overall model overparametrization. While different heads consistently use the same attention patterns, they have varying impact on performance across different tasks. We show that manually disabling attention in certain heads leads to a performance improvement over the regular fine-tuned BERT models.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1908.08593}, author = {Kovaleva, Olga and Romanov, Alexey and Rogers, Anna and Rumshisky, Anna}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, doi = {10.18653/v1/D19-1445}, eprint = {1908.08593}, file = {:Users/shanest/Documents/Library/Kovaleva et al/Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural./Kovaleva et al. - 2019 - Revealing the Dark Secrets of BERT.pdf:pdf}, keywords = {method: attention,method: pruning}, pages = {4364--4373}, publisher = {Association for Computational Linguistics}, title = {{Revealing the Dark Secrets of BERT}}, url = {https://www.aclweb.org/anthology/D19-1445}, year = {2019} }
@inproceedings{Lakretz2019, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1903.07435v1}, author = {Lakretz, Yair and Kruszewski, German and Desbordes, Theo and Hupkes, Dieuwke and Dehaene, Stanislas and Baroni, Marco}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1002}, eprint = {arXiv:1903.07435v1}, file = {:Users/shanest/Documents/Library/Lakretz et al/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Lakretz et al. - 2019 - The emergence of number and syntax units in LSTM language models.pdf:pdf}, keywords = {method: individual neurons,phenomenon: number agreement}, pages = {11--20}, publisher = {Association for Computational Linguistics}, title = {{The emergence of number and syntax units in LSTM language models}}, url = {http://aclweb.org/anthology/N19-1002}, year = {2019} }
@article{Raffel2019, abstract = {Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.}, archivePrefix = {arXiv}, arxivId = {1910.10683}, author = {Raffel, Colin and Shazeer, Noam and Roberts, Adam and Lee, Katherine and Narang, Sharan and Matena, Michael and Zhou, Yanqi and Li, Wei and Liu, Peter J}, eprint = {1910.10683}, file = {:Users/shanest/Documents/Library/Raffel et al/Unknown/Raffel et al. - 2019 - Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer.pdf:pdf}, keywords = {model}, month = {oct}, pages = {1--53}, title = {{Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer}}, url = {http://arxiv.org/abs/1910.10683}, year = {2019} }
@article{Lewis2019, abstract = {We present BART, a denoising autoencoder for pretraining sequence-to-sequence models. BART is trained by (1) corrupting text with an arbitrary noising function, and (2) learning a model to reconstruct the original text. It uses a standard Transformer-based neural machine translation architecture which, despite its simplicity, can be seen as generalizing BERT (due to the bidirectional encoder), GPT (with the left-to-right decoder), and many other more recent pretraining schemes. We evaluate a number of noising approaches, finding the best performance by both randomly shuffling the order of the original sentences and using a novel in-filling scheme, where spans of text are replaced with a single mask token. BART is particularly effective when fine-tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE. BART also provides a 1.1 BLEU increase over a back-translation system for machine translation, with only target language pretraining. We also report ablation experiments that replicate other pretraining schemes within the BART framework, to better measure which factors most influence end-task performance.}, archivePrefix = {arXiv}, arxivId = {1910.13461}, author = {Lewis, Mike and Liu, Yinhan and Goyal, Naman and Ghazvininejad, Marjan and Mohamed, Abdelrahman and Levy, Omer and Stoyanov, Ves and Zettlemoyer, Luke}, eprint = {1910.13461}, file = {:Users/shanest/Documents/Library/Lewis et al/Unknown/Lewis et al. - 2019 - BART Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension.pdf:pdf}, keywords = {model}, month = {oct}, title = {{BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension}}, url = {http://arxiv.org/abs/1910.13461}, year = {2019} }
@inproceedings{Ravfogel2019, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1903.06400v1}, author = {Ravfogel, Shauli and Goldberg, Yoav and Linzen, Tal}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1356}, eprint = {arXiv:1903.06400v1}, file = {:Users/shanest/Documents/Library/Ravfogel, Goldberg, Linzen/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Ravfogel, Goldberg, Linzen - 2019 - Studying the Inductive Biases of RNNs with Synthetic Variations of Natural Languages.pdf:pdf}, keywords = {method: cross-linguistic,method: synthetic languages,phenomenon: number agreement}, number = {2016}, pages = {3532--3542}, publisher = {Association for Computational Linguistics}, title = {{Studying the Inductive Biases of RNNs with Synthetic Variations of Natural Languages}}, url = {http://aclweb.org/anthology/N19-1356}, year = {2019} }
@article{Wolf2019, abstract = {Recent advances in modern Natural Language Processing (NLP) research have been dominated by the combination of Transfer Learning methods with large-scale language models, in particular based on the Transformer architecture. With them came a paradigm shift in NLP with the starting point for training a model on a downstream task moving from a blank specific model to a general-purpose pretrained architecture. Still, creating these general-purpose models remains an expensive and time-consuming process restricting the use of these methods to a small sub-set of the wider NLP community. In this paper, we present HuggingFace's Transformers library, a library for state-of-the-art NLP, making these developments available to the community by gathering state-of-the-art general-purpose pretrained models under a unified API together with an ecosystem of libraries, examples, tutorials and scripts targeting many downstream NLP tasks. HuggingFace's Transformers library features carefully crafted model implementations and high-performance pretrained weights for two main deep learning frameworks, PyTorch and TensorFlow, while supporting all the necessary tools to analyze, evaluate and use these models in downstream tasks such as text/token classification, questions answering and language generation among others. The library has gained significant organic traction and adoption among both the researcher and practitioner communities. We are committed at HuggingFace to pursue the efforts to develop this toolkit with the ambition of creating the standard library for building NLP systems.}, archivePrefix = {arXiv}, arxivId = {1910.03771}, author = {Wolf, Thomas and Debut, Lysandre and Sanh, Victor and Chaumond, Julien and Delangue, Clement and Moi, Anthony and Cistac, Pierric and Rault, Tim and Louf, R{\'{e}}mi and Funtowicz, Morgan and Brew, Jamie}, eprint = {1910.03771}, file = {:Users/shanest/Documents/Library/Wolf et al/Unknown/Wolf et al. - 2019 - HuggingFace's Transformers State-of-the-art Natural Language Processing.pdf:pdf}, keywords = {dataset,model}, month = {oct}, title = {{HuggingFace's Transformers: State-of-the-art Natural Language Processing}}, url = {http://arxiv.org/abs/1910.03771}, year = {2019} }
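Since many of the analysis papers in this list operate on models this library exposes, a minimal usage sketch may help; the checkpoint name and example sentence are arbitrary choices, and output_attentions=True is what surfaces the per-layer attention maps used by the attention analyses cited elsewhere here.

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)

inputs = tokenizer("The keys to the cabinet are on the table.", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# last_hidden_state: (batch, tokens, hidden); attentions: one
# (batch, heads, tokens, tokens) tensor per layer.
print(outputs.last_hidden_state.shape, len(outputs.attentions))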
@inproceedings{Yang2019, archivePrefix = {arXiv}, arxivId = {arXiv:1906.08237v1}, author = {Yang, Zhilin and Dai, Zihang and Yang, Yiming and Carbonell, Jaime and Salakhutdinov, Ruslan and Le, Quoc V}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS 2019)}, eprint = {arXiv:1906.08237v1}, file = {:Users/shanest/Documents/Library/Yang et al/Advances in Neural Information Processing Systems 32 (NeurIPS 2019)/Yang et al. - 2019 - XLNet Generalized Autoregressive Pretraining for Language Understanding.pdf:pdf}, keywords = {model}, title = {{XLNet: Generalized Autoregressive Pretraining for Language Understanding}}, url = {https://papers.nips.cc/paper/8812-xlnet-generalized-autoregressive-pretraining-for-language-understanding}, year = {2019} }
@inproceedings{Dalvi2019, abstract = {Despite the remarkable evolution of deep neural networks in natural language processing (NLP), their interpretability remains a challenge. Previous work largely focused on what these models learn at the representation level. We break this analysis down further and study individual dimensions (neurons) in the vector representation learned by end-to-end neural models in NLP tasks. We propose two methods: Linguistic Correlation Analysis, based on a supervised method to extract the most relevant neurons with respect to an extrinsic task, and Cross-model Correlation Analysis, an unsupervised method to extract salient neurons w.r.t. the model itself. We evaluate the effectiveness of our techniques by ablating the identified neurons and reevaluating the network's performance for two tasks: neural machine translation (NMT) and neural language modeling (NLM). We further present a comprehensive analysis of neurons with the aim to address the following questions: i) how localized or distributed are different linguistic properties in the models? ii) are certain neurons exclusive to some properties and not others? iii) is the information more or less distributed in NMT vs. NLM? and iv) how important are the neurons identified through the linguistic correlation method to the overall task? Our code is publicly available as part of the NeuroX toolkit (Dalvi et al. 2019).}, archivePrefix = {arXiv}, arxivId = {1812.09355}, author = {Dalvi, Fahim and Durrani, Nadir and Sajjad, Hassan and Belinkov, Yonatan and Bau, Anthony and Glass, James}, booktitle = {Association for the Advancement of Artificial Intelligence (AAAI)}, eprint = {1812.09355}, file = {:Users/shanest/Documents/Library/Dalvi et al/Association for the Advancement of Artificial Intelligence (AAAI)/Dalvi et al. - 2019 - What Is One Grain of Sand in the Desert Analyzing Individual Neurons in Deep NLP Models.pdf:pdf}, keywords = {method: cross-model correlation analysis,method: individual neurons,method: linguistic correlation analysis}, month = {dec}, title = {{What Is One Grain of Sand in the Desert? Analyzing Individual Neurons in Deep NLP Models}}, url = {http://arxiv.org/abs/1812.09355}, year = {2019} }
@inproceedings{Sanh2019, abstract = {As Transfer Learning from large-scale pre-trained models becomes more prevalent in Natural Language Processing (NLP), operating these large models in on-the-edge and/or under constrained computational training or inference budgets remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device study.}, archivePrefix = {arXiv}, arxivId = {1910.01108}, author = {Sanh, Victor and Debut, Lysandre and Chaumond, Julien and Wolf, Thomas}, booktitle = {5th Workshop on Energy Efficient Machine Learning and Cognitive Computing @ NeurIPS 2019}, eprint = {1910.01108}, file = {:Users/shanest/Documents/Library/Sanh et al/5th Workshop on Energy Efficient Machine Learning and Cognitive Computing @ NeurIPS 2019/Sanh et al. - 2019 - DistilBERT, a distilled version of BERT smaller, faster, cheaper and lighter.pdf:pdf}, keywords = {model}, title = {{DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter}}, url = {http://arxiv.org/abs/1910.01108}, year = {2019} }
@inproceedings{Lample2019, abstract = {Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding. In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.}, archivePrefix = {arXiv}, arxivId = {1901.07291}, author = {Lample, Guillaume and Conneau, Alexis}, booktitle = {Advances in Neural Information Processing Systems 32 (NeurIPS 2019)}, eprint = {1901.07291}, file = {:Users/shanest/Documents/Library/Lample, Conneau/33rd Conference on Neural Information Processing Systems (NeurIPS 2019)/Lample, Conneau - 2019 - Cross-lingual Language Model Pretraining.pdf:pdf}, keywords = {method: cross-linguistic,model}, title = {{Cross-lingual Language Model Pretraining}}, url = {http://arxiv.org/abs/1901.07291}, year = {2019} }
@article{Conneau2019, abstract = {This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.}, archivePrefix = {arXiv}, arxivId = {1911.02116}, author = {Conneau, Alexis and Khandelwal, Kartikay and Goyal, Naman and Chaudhary, Vishrav and Wenzek, Guillaume and Guzm{\'{a}}n, Francisco and Grave, Edouard and Ott, Myle and Zettlemoyer, Luke and Stoyanov, Veselin}, eprint = {1911.02116}, file = {:Users/shanest/Documents/Library/Conneau et al/Unknown/Conneau et al. - 2019 - Unsupervised Cross-lingual Representation Learning at Scale.pdf:pdf}, keywords = {method: cross-linguistic,model}, month = {nov}, title = {{Unsupervised Cross-lingual Representation Learning at Scale}}, url = {http://arxiv.org/abs/1911.02116}, year = {2019} }
@article{Liu2019a, abstract = {Language model pretraining has led to significant performance gains but careful comparison between different approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes, and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results highlight the importance of previously overlooked design choices, and raise questions about the source of recently reported improvements. We release our models and code.}, archivePrefix = {arXiv}, arxivId = {1907.11692}, author = {Liu, Yinhan and Ott, Myle and Goyal, Naman and Du, Jingfei and Joshi, Mandar and Chen, Danqi and Levy, Omer and Lewis, Mike and Zettlemoyer, Luke and Stoyanov, Veselin}, eprint = {1907.11692}, file = {:Users/shanest/Documents/Library/Liu et al/Unknown/Liu et al. - 2019 - RoBERTa A Robustly Optimized BERT Pretraining Approach.pdf:pdf}, keywords = {model}, title = {{RoBERTa: A Robustly Optimized BERT Pretraining Approach}}, url = {http://arxiv.org/abs/1907.11692}, year = {2019} }
@inproceedings{Serrano2019, abstract = {Attention mechanisms have recently boosted performance on a range of NLP tasks. Because attention layers explicitly weight input components' representations, it is also often assumed that attention can be used to identify information that models found important (e.g., specific contextualized word tokens). We test whether that assumption holds by manipulating attention weights in already-trained text classification models and analyzing the resulting differences in their predictions. While we observe some ways in which higher attention weights correlate with greater impact on model predictions, we also find many ways in which this does not hold, i.e., where gradient-based rankings of attention weights better predict their effects than their magnitudes. We conclude that while attention noisily predicts input components' overall importance to a model, it is by no means a fail-safe indicator.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1906.03731}, author = {Serrano, Sofia and Smith, Noah A.}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1282}, eprint = {1906.03731}, file = {:Users/shanest/Documents/Library/Serrano, Smith/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Serrano, Smith - 2019 - Is Attention Interpretable.pdf:pdf}, keywords = {method: attention}, pages = {2931--2951}, publisher = {Association for Computational Linguistics}, title = {{Is Attention Interpretable?}}, url = {https://www.aclweb.org/anthology/P19-1282}, year = {2019} }
@article{Warstadt2019a, abstract = {Recent pretrained sentence encoders achieve state of the art results on language understanding tasks, but does this mean they have implicit knowledge of syntactic structures? We introduce a grammatically annotated development set for the Corpus of Linguistic Acceptability (CoLA; Warstadt et al., 2018), which we use to investigate the grammatical knowledge of three pretrained encoders, including the popular OpenAI Transformer (Radford et al., 2018) and BERT (Devlin et al., 2018). We fine-tune these encoders to do acceptability classification over CoLA and compare the models' performance on the annotated analysis set. Some phenomena, e.g. modification by adjuncts, are easy to learn for all models, while others, e.g. long-distance movement, are learned effectively only by models with strong overall performance, and others still, e.g. morphological agreement, are hardly learned by any model.}, archivePrefix = {arXiv}, arxivId = {1901.03438}, author = {Warstadt, Alex and Bowman, Samuel R.}, eprint = {1901.03438}, file = {:Users/shanest/Documents/Library/Warstadt, Bowman/Unknown/Warstadt, Bowman - 2019 - Grammatical Analysis of Pretrained Sentence Encoders with Acceptability Judgments.pdf:pdf}, keywords = {dataset,method: acceptability judgment,method: model comparison}, title = {{Grammatical Analysis of Pretrained Sentence Encoders with Acceptability Judgments}}, url = {http://arxiv.org/abs/1901.03438}, year = {2019} }
@inproceedings{Clark2019, abstract = {Large pre-trained neural networks such as BERT have had great recent success in NLP, motivating a growing body of research investigating what aspects of language they are able to learn from unlabeled data. Most recent analysis has focused on model outputs (e.g., language model surprisal) or internal vector representations (e.g., probing classifiers). Complementary to these works, we propose methods for analyzing the attention mechanisms of pre-trained models and apply them to BERT. BERT's attention heads exhibit patterns such as attending to delimiter tokens, specific positional offsets, or broadly attending over the whole sentence, with heads in the same layer often exhibiting similar behaviors. We further show that certain attention heads correspond well to linguistic notions of syntax and coreference. For example, we find heads that attend to the direct objects of verbs, determiners of nouns, objects of prepositions, and coreferent mentions with remarkably high accuracy. Lastly, we propose an attention-based probing classifier and use it to further demonstrate that substantial syntactic information is captured in BERT's attention.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1906.04341}, author = {Clark, Kevin and Khandelwal, Urvashi and Levy, Omer and Manning, Christopher D.}, booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W19-4828}, eprint = {1906.04341}, file = {:Users/shanest/Documents/Library/Clark et al/Proceedings of the 2019 ACL Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Clark et al. - 2019 - What Does BERT Look at An Analysis of BERT's Attention.pdf:pdf}, keywords = {method: attention}, pages = {276--286}, publisher = {Association for Computational Linguistics}, title = {{What Does BERT Look at? An Analysis of BERT's Attention}}, url = {https://www.aclweb.org/anthology/W19-4828}, year = {2019} }
@article{Radford2019, abstract = {Natural language processing tasks, such as question answering, machine translation, reading comprehension, and summarization, are typically approached with supervised learning on task-specific datasets. We demonstrate that language models begin to learn these tasks without any explicit supervision when trained on a new dataset of millions of webpages called WebText. When conditioned on a document plus questions, the answers generated by the language model reach 55 F1 on the CoQA dataset, matching or exceeding the performance of 3 out of 4 baseline systems without using the 127,000+ training examples. The capacity of the language model is essential to the success of zero-shot task transfer and increasing it improves performance in a log-linear fashion across tasks. Our largest model, GPT-2, is a 1.5B parameter Transformer that achieves state of the art results on 7 out of 8 tested language modeling datasets in a zero-shot setting but still underfits WebText. Samples from the model reflect these improvements and contain coherent paragraphs of text. These findings suggest a promising path towards building language processing systems which learn to perform tasks from their naturally occurring demonstrations.}, author = {Radford, Alec and Wu, Jeffrey and Child, Rewon and Luan, David and Amodei, Dario and Sutskever, Ilya}, file = {:Users/shanest/Documents/Library/Radford et al/Unknown/Radford et al. - 2019 - Language Models are Unsupervised Multitask Learners.pdf:pdf}, keywords = {model}, title = {{Language Models are Unsupervised Multitask Learners}}, url = {https://openai.com/blog/better-language-models/}, year = {2019} }
@inproceedings{Wiegreffe2019, abstract = {Attention mechanisms play a central role in NLP systems, especially within recurrent neural network (RNN) models. Recently, there has been increasing interest in whether or not the intermediate representations offered by these modules may be used to explain the reasoning for a model's prediction, and consequently reach insights regarding the model's decision-making process. A recent paper claims that `Attention is not Explanation' (Jain and Wallace, 2019). We challenge many of the assumptions underlying this work, arguing that such a claim depends on one's definition of explanation, and that testing it needs to take into account all elements of the model, using a rigorous experimental design. We propose four alternative tests to determine when/whether attention can be used as explanation: a simple uniform-weights baseline; a variance calibration based on multiple random seed runs; a diagnostic framework using frozen weights from pretrained models; and an end-to-end adversarial attention training protocol. Each allows for meaningful interpretation of attention mechanisms in RNN models. We show that even when reliable adversarial distributions can be found, they don't perform well on the simple diagnostic, indicating that prior work does not disprove the usefulness of attention mechanisms for explainability.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1908.04626}, author = {Wiegreffe, Sarah and Pinter, Yuval}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, doi = {10.18653/v1/D19-1002}, eprint = {1908.04626}, file = {:Users/shanest/Documents/Library/Wiegreffe, Pinter/Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural./Wiegreffe, Pinter - 2019 - Attention is not not Explanation.pdf:pdf}, keywords = {method: attention}, pages = {11--20}, publisher = {Association for Computational Linguistics}, title = {{Attention is not not Explanation}}, url = {https://www.aclweb.org/anthology/D19-1002}, year = {2019} }
@inproceedings{Jain2019, address = {Stroudsburg, PA, USA}, author = {Jain, Sarthak and Wallace, Byron C.}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1357}, file = {:Users/shanest/Documents/Library/Jain, Wallace/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Jain, Wallace - 2019 - Attention is not Explanation.pdf:pdf}, keywords = {method: attention}, pages = {3543--3556}, publisher = {Association for Computational Linguistics}, title = {{Attention is not Explanation}}, url = {http://aclweb.org/anthology/N19-1357}, year = {2019} }
@inproceedings{Devlin2018, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1810.04805v1}, author = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1423}, eprint = {1810.04805v1}, file = {:Users/shanest/Documents/Library/Devlin et al/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Devlin et al. - 2019 - BERT Pre-training of Deep Bidirectional Transformers for Language Understanding.pdf:pdf}, keywords = {model}, pages = {4171--4186}, publisher = {Association for Computational Linguistics}, title = {{BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding}}, url = {http://aclweb.org/anthology/N19-1423}, year = {2019} }
@inproceedings{Lin2019, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1906.01698v1}, author = {Lin, Yongjie and Tan, Yi Chern and Frank, Robert}, booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W19-4825}, eprint = {arXiv:1906.01698v1}, file = {:Users/shanest/Documents/Library/Lin, Tan, Frank/Proceedings of the 2019 ACL Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Lin, Tan, Frank - 2019 - Open Sesame Getting inside BERT's Linguistic Knowledge.pdf:pdf}, keywords = {method: attention,method: diagnostic classifier,phenomenon: anaphora,phenomenon: number agreement}, pages = {241--253}, publisher = {Association for Computational Linguistics}, title = {{Open Sesame: Getting inside BERT's Linguistic Knowledge}}, url = {https://www.aclweb.org/anthology/W19-4825}, year = {2019} }
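Several entries here carry the keyword "method: diagnostic classifier". Schematically, the recipe is: freeze the network, then train a small classifier to predict a linguistic label from its representations; above-chance accuracy is read as evidence that the property is encoded. The sketch below uses random placeholder arrays where a real probe would use layer activations and linguistic annotations (e.g., singular vs. plural).

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(1000, 768))   # stand-in for per-token hidden states
y = rng.integers(0, 2, size=1000)  # stand-in for binary linguistic labels

X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)
probe = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("probe accuracy:", probe.score(X_te, y_te))  # ~0.5 here, by construction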
@inproceedings{Hewitt2019, abstract = {Recent work has improved our ability to detect linguistic knowledge in word representations. However, current methods for detecting syntactic knowledge do not test whether syntax trees are represented in their entirety. In this work, we propose a structural probe, which evaluates whether syntax trees are embedded in a linear transformation of a neural network's word representation space. The probe identifies a linear transformation under which squared L2 distance encodes the distance between words in the parse tree, and one in which squared L2 norm encodes depth in the parse tree. Using our probe, we show that such transformations exist for both ELMo and BERT but not in baselines, providing evidence that entire syntax trees are embedded implicitly in deep models' vector geometry.}, address = {Stroudsburg, PA, USA}, author = {Hewitt, John and Manning, Christopher D.}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1419}, file = {:Users/shanest/Documents/Library/Hewitt, Manning/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Hewitt, Manning - 2019 - A Structural Probe for Finding Syntax in Word Representations.pdf:pdf}, keywords = {method: geometry,phenomenon: dependency parsing}, pages = {4129--4138}, publisher = {Association for Computational Linguistics}, title = {{A Structural Probe for Finding Syntax in Word Representations}}, url = {http://aclweb.org/anthology/N19-1419}, year = {2019} }
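The structural probe in the entry above has a compact core: learn a matrix B such that squared L2 distance between transformed word vectors matches parse-tree distance. A toy sketch of that objective follows; the shapes and data are illustrative placeholders, and the optimization loop over a parsed corpus is omitted.

import torch

hidden, rank, n_words = 768, 64, 10
B = torch.randn(rank, hidden, requires_grad=True)
H = torch.randn(n_words, hidden)  # word representations for one sentence
tree_dist = torch.randint(1, 6, (n_words, n_words)).float()  # placeholder gold distances
tree_dist.fill_diagonal_(0.0)

def probe_dist(H, B):
    TH = H @ B.T                   # project into the probe space
    diff = TH.unsqueeze(0) - TH.unsqueeze(1)
    return (diff ** 2).sum(-1)     # squared L2 distance for all word pairs

loss = (probe_dist(H, B) - tree_dist).abs().mean()  # L1 loss against tree distances
loss.backward()  # in practice: step an optimizer on B over many sentences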
@inproceedings{May2019, abstract = {The Word Embedding Association Test shows that GloVe and word2vec word embeddings exhibit human-like implicit biases based on gender, race, and other social constructs (Caliskan et al., 2017). Meanwhile, research on learning reusable text representations has begun to explore sentence-level texts, with some sentence encoders seeing enthusiastic adoption. Accordingly, we extend the Word Embedding Association Test to measure bias in sentence encoders. We then test several sentence encoders, including state-of-the-art methods such as ELMo and BERT, for the social biases studied in prior work and two important biases that are difficult or impossible to test at the word level. We observe mixed results including suspicious patterns of sensitivity that suggest the test's assumptions may not hold in general. We conclude by proposing directions for future work on measuring bias in sentence encoders.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1903.10561}, author = {May, Chandler and Wang, Alex and Bordia, Shikha and Bowman, Samuel R. and Rudinger, Rachel}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1063}, eprint = {1903.10561}, file = {:Users/shanest/Documents/Library/May et al/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./May et al. - 2019 - On Measuring Social Biases in Sentence Encoders.pdf:pdf}, keywords = {method: association test,phenomenon: social bias}, pages = {622--628}, publisher = {Association for Computational Linguistics}, title = {{On Measuring Social Biases in Sentence Encoders}}, url = {http://aclweb.org/anthology/N19-1063}, year = {2019} }
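The entry above extends the Word Embedding Association Test (WEAT) to sentence encoders. For reference, here is a sketch of the WEAT-style effect size it builds on (after Caliskan et al., 2017), with random placeholder vectors standing in for encoder outputs:

import numpy as np

rng = np.random.default_rng(0)
def fake_embeddings(n, dim=300):  # placeholders for encoder outputs
    return rng.normal(size=(n, dim))

X, Y = fake_embeddings(8), fake_embeddings(8)  # two target sets
A, B = fake_embeddings(8), fake_embeddings(8)  # two attribute sets

def cos(u, V):
    return (V @ u) / (np.linalg.norm(V, axis=1) * np.linalg.norm(u))

def assoc(w, A, B):  # differential association of one vector with A vs. B
    return cos(w, A).mean() - cos(w, B).mean()

ax = np.array([assoc(x, A, B) for x in X])
ay = np.array([assoc(y, A, B) for y in Y])
effect = (ax.mean() - ay.mean()) / np.concatenate([ax, ay]).std()
print("effect size d:", effect)  # near 0 for random vectors, as here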
@inproceedings{Futrell2019, abstract = {We deploy the methods of controlled psycholinguistic experimentation to shed light on the extent to which the behavior of neural network language models reflects incremental representations of syntactic state. To do so, we examine model behavior on artificial sentences containing a variety of syntactically complex structures. We test four models: two publicly available LSTM sequence models of English (Jozefowicz et al., 2016; Gulordava et al., 2018) trained on large datasets; an RNNG (Dyer et al., 2016) trained on a small, parsed dataset; and an LSTM trained on the same small corpus as the RNNG. We find evidence that the LSTMs trained on large datasets represent syntactic state over large spans of text in a way that is comparable to the RNNG, while the LSTM trained on the small dataset does not or does so only weakly.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1903.03260}, author = {Futrell, Richard and Wilcox, Ethan and Morita, Takashi and Qian, Peng and Ballesteros, Miguel and Levy, Roger}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1004}, eprint = {1903.03260}, file = {:Users/shanest/Documents/Library/Futrell et al/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Futrell et al. - 2019 - Neural language models as psycholinguistic subjects Representations of syntactic state.pdf:pdf}, keywords = {method: psycholinguistic,phenomenon: incremental syntax}, pages = {32--42}, publisher = {Association for Computational Linguistics}, title = {{Neural language models as psycholinguistic subjects: Representations of syntactic state}}, url = {http://aclweb.org/anthology/N19-1004}, year = {2019} }
@inproceedings{McCoy2019, abstract = {Machine learning systems can often achieve high performance on a test set by relying on heuristics that are effective for frequent example types but break down in more challenging cases. We study this issue within natural language inference (NLI), the task of determining whether one sentence entails another. Based on an analysis of the task, we hypothesize three fallible syntactic heuristics that NLI models are likely to adopt: the lexical overlap heuristic, the subsequence heuristic, and the constituent heuristic. To determine whether models have adopted these heuristics, we introduce a controlled evaluation set called HANS (Heuristic Analysis for NLI Systems), which contains many examples where the heuristics fail. We find that models trained on MNLI, including the state-of-the-art model BERT, perform very poorly on HANS, suggesting that they have indeed adopted these heuristics. We conclude that there is substantial room for improvement in NLI systems, and that the HANS dataset can motivate and measure progress in this area.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1902.01007}, author = {McCoy, Tom and Pavlick, Ellie and Linzen, Tal}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1334}, eprint = {1902.01007}, file = {:Users/shanest/Documents/Library/McCoy, Pavlick, Linzen/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/McCoy, Pavlick, Linzen - 2019 - Right for the Wrong Reasons Diagnosing Syntactic Heuristics in Natural Language Inference.pdf:pdf}, keywords = {method: adversarial data,phenomenon: inference}, pages = {3428--3448}, publisher = {Association for Computational Linguistics}, title = {{Right for the Wrong Reasons: Diagnosing Syntactic Heuristics in Natural Language Inference}}, url = {https://www.aclweb.org/anthology/P19-1334}, year = {2019} }
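To make the heuristics in the HANS entry concrete, here are invented pairs in the style of its lexical-overlap cases (not drawn from the actual dataset); an NLI model relying on word overlap alone would wrongly call the first pair entailment.

# Invented HANS-style pairs: full lexical overlap, different gold labels.
examples = [
    ("The doctor visited the lawyer.",
     "The lawyer visited the doctor.", "non-entailment"),
    ("The doctor near the actor visited the lawyer.",
     "The doctor visited the lawyer.", "entailment"),
]
for premise, hypothesis, gold in examples:
    print(f"{premise}  =>  {hypothesis}  [{gold}]")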
@inproceedings{Abnar2019, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1906.01539v2}, author = {Abnar, Samira and Beinborn, Lisa and Choenni, Rochelle and Zuidema, Willem}, booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W19-4820}, eprint = {arXiv:1906.01539v2}, file = {:Users/shanest/Documents/Library/Abnar et al/Proceedings of the 2019 ACL Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Abnar et al. - 2019 - Blackbox Meets Blackbox Representational Similarity & Stability Analysis of Neural Language Models and Brains.pdf:pdf}, keywords = {method: representational similarity analysis}, pages = {191--203}, publisher = {Association for Computational Linguistics}, title = {{Blackbox Meets Blackbox: Representational Similarity & Stability Analysis of Neural Language Models and Brains}}, url = {https://www.aclweb.org/anthology/W19-4820}, year = {2019} }
@article{Hupkes2019, abstract = {Despite a multitude of empirical studies, little consensus exists on whether neural networks are able to generalise compositionally, a controversy that, in part, stems from a lack of agreement about what it means for a neural model to be compositional. As a response to this controversy, we present a set of tests that provide a bridge between, on the one hand, the vast amount of linguistic and philosophical theory about compositionality and, on the other, the successful neural models of language. We collect different interpretations of compositionality and translate them into five theoretically grounded tests that are formulated on a task-independent level. In particular, we provide tests to investigate (i) if models systematically recombine known parts and rules, (ii) if models can extend their predictions beyond the length they have seen in the training data, (iii) if models' composition operations are local or global, (iv) if models' predictions are robust to synonym substitutions, and (v) if models favour rules or exceptions during training. To demonstrate the usefulness of this evaluation paradigm, we instantiate these five tests on a highly compositional data set which we dub PCFG SET and apply the resulting tests to three popular sequence-to-sequence models: a recurrent, a convolution-based and a transformer model. We provide an in-depth analysis of the results that uncovers the strengths and weaknesses of these three architectures and points to potential areas of improvement.}, archivePrefix = {arXiv}, arxivId = {1908.08351}, author = {Hupkes, Dieuwke and Dankers, Verna and Mul, Mathijs and Bruni, Elia}, eprint = {1908.08351}, file = {:Users/shanest/Documents/Library/Hupkes et al/Unknown/Hupkes et al. - 2019 - The compositionality of neural networks integrating symbolism and connectionism.pdf:pdf}, keywords = {method: various,phenomenon: compositionality}, pages = {1--40}, title = {{The compositionality of neural networks: integrating symbolism and connectionism}}, url = {http://arxiv.org/abs/1908.08351}, year = {2019} }
@inproceedings{Chrupaa2019, abstract = {Analysis methods which enable us to better understand the representations and functioning of neural models of language are increasingly needed as deep learning becomes the dominant approach in NLP. Here we present two methods based on Representational Similarity Analysis (RSA) and Tree Kernels (TK) which allow us to directly quantify how strongly the information encoded in neural activation patterns corresponds to information represented by symbolic structures such as syntax trees. We first validate our methods on the case of a simple synthetic language for arithmetic expressions with clearly defined syntax and semantics, and show that they exhibit the expected pattern of results. We then apply our methods to correlate neural representations of English sentences with their constituency parse trees.}, address = {Stroudsburg, PA, USA}, author = {Chrupa{\l}a, Grzegorz and Alishahi, Afra}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1283}, file = {:Users/shanest/Documents/Library/Chrupa{\l}a, Alishahi/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Chrupa{\l}a, Alishahi - 2019 - Correlating Neural and Symbolic Representations of Language.pdf:pdf}, keywords = {method: representational similarity analysis,method: tree kernel,phenomenon: constituency parsing}, pages = {2952--2962}, publisher = {Association for Computational Linguistics}, title = {{Correlating Neural and Symbolic Representations of Language}}, url = {https://www.aclweb.org/anthology/P19-1283}, year = {2019} }
@article{Hahn2019, abstract = {Recurrent neural networks (RNNs) have reached striking performance in many natural language processing tasks. This has renewed interest in whether these generic sequence processing devices are inducing genuine linguistic knowledge. Nearly all current analytical studies, however, initialize the RNNs with a vocabulary of known words, and feed them tokenized input during training. We present a multi-lingual study of the linguistic knowledge encoded in RNNs trained as character-level language models, on input data with word boundaries removed. These networks face a tougher and more cognitively realistic task, having to discover any useful linguistic unit from scratch based on input statistics. The results show that our “near tabula rasa” RNNs are mostly able to solve morphological, syntactic and semantic tasks that intuitively presuppose word-level knowledge, and indeed they learned, to some extent, to track word boundaries. Our study opens the door to speculations about the necessity of an explicit, rigid word lexicon in language learning and usage.}, archivePrefix = {arXiv}, arxivId = {arXiv:1906.07285v1}, author = {Hahn, Michael and Baroni, Marco}, doi = {10.1162/tacl_a_00283}, eprint = {arXiv:1906.07285v1}, file = {:Users/shanest/Documents/Library/Hahn, Baroni/Transactions of the Association for Computational Linguistics/Hahn, Baroni - 2019 - Tabula Nearly Rasa Probing the Linguistic Knowledge of Character-level Neural Language Models Trained on Unsegment.pdf:pdf}, journal = {Transactions of the Association for Computational Linguistics}, keywords = {method: character-level,method: cross-linguistic,method: diagnostic classifier}, pages = {467--484}, title = {{Tabula Nearly Rasa: Probing the Linguistic Knowledge of Character-level Neural Language Models Trained on Unsegmented Text}}, url = {http://doi.org/10.1162/tacl_a_00283}, volume = {7}, year = {2019} }
@inproceedings{Richardson2019, archivePrefix = {arXiv}, arxivId = {1909.07521v2}, author = {Richardson, Kyle and Hu, Hai and Moss, Lawrence S and Sabharwal, Ashish}, booktitle = {Association for the Advancement of Artificial Intelligence (AAAI)}, eprint = {1909.07521v2}, file = {:Users/shanest/Documents/Library/Richardson et al/Association for the Advancement of Artificial Intelligence (AAAI)/Richardson et al. - 2019 - Probing Natural Language Inference Models through Semantic Fragments.pdf:pdf}, keywords = {method: adversarial data,method: psycholinguistic,phenomenon: inference}, title = {{Probing Natural Language Inference Models through Semantic Fragments}}, url = {https://arxiv.org/abs/1909.07521}, year = {2019} }
@inproceedings{Wilcox2019a, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1906.04068v1}, author = {Wilcox, Ethan and Levy, Roger and Futrell, Richard}, booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W19-4819}, eprint = {arXiv:1906.04068v1}, file = {:Users/shanest/Documents/Library/Wilcox, Levy, Futrell/Proceedings of the 2019 ACL Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Wilcox, Levy, Futrell - 2019 - Hierarchical Representation in Neural Language Models Suppression and Recovery of Expectations.pdf:pdf}, keywords = {method: psycholinguistic,phenomenon: center embedding,phenomenon: incremental syntax,phenomenon: island constraints}, pages = {181--190}, publisher = {Association for Computational Linguistics}, title = {{Hierarchical Representation in Neural Language Models: Suppression and Recovery of Expectations}}, url = {https://www.aclweb.org/anthology/W19-4819}, year = {2019} }
@article{Belinkov2019, abstract = {The field of natural language processing has seen impressive progress in recent years, with neural network models replacing many of the traditional systems. A plethora of new models have been proposed, many of which are thought to be opaque compared to their feature-rich counterparts. This has led researchers to analyze, interpret, and evaluate neural networks in novel and more fine-grained ways. In this survey paper, we review analysis methods in neural language processing, categorize them according to prominent research trends, highlight existing limitations, and point to potential directions for future work.}, author = {Belinkov, Yonatan and Glass, James}, doi = {10.1162/tacl_a_00254}, file = {:Users/shanest/Documents/Library/Belinkov, Glass/Transactions of the Association for Computational Linguistics/Belinkov, Glass - 2019 - Analysis Methods in Neural Language Processing A Survey.pdf:pdf}, journal = {Transactions of the Association for Computational Linguistics}, keywords = {survey}, pages = {49--72}, title = {{Analysis Methods in Neural Language Processing: A Survey}}, volume = {7}, year = {2019} }
@inproceedings{Yanaka2019, abstract = {Monotonicity reasoning is one of the important reasoning skills for any intelligent natural language inference (NLI) model in that it requires the ability to capture the interaction between lexical and syntactic structures. Since no test set has been developed for monotonicity reasoning with wide coverage, it is still unclear whether neural models can perform monotonicity reasoning in a proper way. To investigate this issue, we introduce the Monotonicity Entailment Dataset (MED). Performance by state-of-the-art NLI models on the new test set is substantially worse, under 55%, especially on downward reasoning. In addition, analysis using a monotonicity-driven data augmentation method showed that these models might be limited in their generalization ability in upward and downward reasoning.}, address = {Stroudsburg, PA, USA}, author = {Yanaka, Hitomi and Mineshima, Koji and Bekki, Daisuke and Inui, Kentaro and Sekine, Satoshi and Abzianidze, Lasha and Bos, Johan}, booktitle = {Proceedings of the 2019 ACL Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W19-4804}, file = {:Users/shanest/Documents/Library/Yanaka et al/Proceedings of the 2019 ACL Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Yanaka et al. - 2019 - Can Neural Networks Understand Monotonicity Reasoning.pdf:pdf}, keywords = {dataset,method: new data}, pages = {31--40}, publisher = {Association for Computational Linguistics}, title = {{Can Neural Networks Understand Monotonicity Reasoning?}}, url = {https://www.aclweb.org/anthology/W19-4804}, year = {2019} }
@inproceedings{Coenen2019, archivePrefix = {arXiv}, arxivId = {1906.02715v2}, author = {Coenen, Andy and Yuan, Ann and Kim, Been and Pearce, Adam and Vi{\'{e}}gas, Fernanda and Wattenberg, Martin}, booktitle = {Proceedings of the 33rd Conference on Neural Information Processing Systems (NeurIPS 2019)}, eprint = {1906.02715v2}, file = {:Users/shanest/Documents/Library/Coenen et al/Proceedings of the 33rd Conference on Neural Information Processing Systems (NeurIPS 2019)/Coenen et al. - 2019 - Visualizing and Measuring the Geometry of BERT.pdf:pdf}, keywords = {method: attention,method: diagnostic classifier,method: geometry,phenomenon: dependency parsing}, title = {{Visualizing and Measuring the Geometry of BERT}}, url = {https://papers.nips.cc/paper/9065-visualizing-and-measuring-the-geometry-of-bert}, year = {2019} }
@inproceedings{Wilcox2019, abstract = {State-of-the-art LSTM language models trained on large corpora learn sequential contingencies in impressive detail and have been shown to acquire a number of non-local grammatical dependencies with some success. Here we investigate whether supervision with hierarchical structure enhances learning of a range of grammatical dependencies, a question that has previously been addressed only for subject-verb agreement. Using controlled experimental methods from psycholinguistics, we compare the performance of word-based LSTM models versus two models that represent hierarchical structure and deploy it in left-to-right processing: Recurrent Neural Network Grammars (RNNGs) (Dyer et al., 2016) and an incrementalized version of the Parsing-as-Language-Modeling configuration from Charniak et al. (2016). Models are tested on a diverse range of configurations for two classes of non-local grammatical dependencies in English---Negative Polarity licensing and Filler--Gap Dependencies. Using the same training data across models, we find that structurally-supervised models outperform the LSTM, with the RNNG demonstrating best results on both types of grammatical dependencies and even learning many of the Island Constraints on the filler--gap dependency. Structural supervision thus provides data efficiency advantages over purely string-based training of neural language models in acquiring human-like generalizations about non-local grammatical dependencies.}, archivePrefix = {arXiv}, arxivId = {1903.00943}, author = {Wilcox, Ethan and Qian, Peng and Futrell, Richard and Ballesteros, Miguel and Levy, Roger}, booktitle = {Proceedings of North American Association for Computational Linguistics (NAACL)}, eprint = {1903.00943}, file = {:Users/shanest/Documents/Library/Wilcox et al/Proceedings of North American Association for Computational Linguistics (NAACL)/Wilcox et al. - 2019 - Structural Supervision Improves Learning of Non-Local Grammatical Dependencies.pdf:pdf}, keywords = {method: psycholinguistic,method: structural supervision,phenomenon: NPIs,phenomenon: island constraints}, pages = {3302--3312}, title = {{Structural Supervision Improves Learning of Non-Local Grammatical Dependencies}}, url = {http://arxiv.org/abs/1903.00943}, year = {2019} }
@inproceedings{Hewitt2019a, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1909.03368v1}, author = {Hewitt, John and Liang, Percy}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, doi = {10.18653/v1/D19-1275}, eprint = {arXiv:1909.03368v1}, file = {:Users/shanest/Documents/Library/Hewitt, Liang/Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural./Hewitt, Liang - 2019 - Designing and Interpreting Probes with Control Tasks.pdf:pdf}, keywords = {method: control task,method: diagnostic classifier}, pages = {2733--2743}, publisher = {Association for Computational Linguistics}, title = {{Designing and Interpreting Probes with Control Tasks}}, url = {https://www.aclweb.org/anthology/D19-1275}, year = {2019} }
@article{Ettinger2019, archivePrefix = {arXiv}, arxivId = {1907.13528v1}, author = {Ettinger, Allyson}, eprint = {1907.13528v1}, file = {:Users/shanest/Documents/Library/Ettinger/Unknown/Ettinger - 2019 - What BERT is not Lessons from a new suite of psycholinguistic diagnostics for language models.pdf:pdf}, keywords = {method: psycholinguistic,phenomenon: negation}, title = {{What BERT is not: Lessons from a new suite of psycholinguistic diagnostics for language models}}, url = {https://arxiv.org/abs/1907.13528}, year = {2019} }
@inproceedings{Ulmer2019, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1906.03293v1}, author = {Ulmer, Dennis and Hupkes, Dieuwke and Bruni, Elia}, booktitle = {Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)}, doi = {10.18653/v1/W19-4324}, eprint = {arXiv:1906.03293v1}, file = {:Users/shanest/Documents/Library/Ulmer, Hupkes, Bruni/Proceedings of the 4th Workshop on Representation Learning for NLP (RepL4NLP-2019)/Ulmer, Hupkes, Bruni - 2019 - Assessing Incrementality in Sequence-to-Sequence Models.pdf:pdf}, keywords = {method: diagnostic classifier,method: integration ratio,method: representational similarity analysis,phenomenon: incremental syntax}, pages = {209--217}, publisher = {Association for Computational Linguistics}, title = {{Assessing Incrementality in Sequence-to-Sequence Models}}, url = {https://www.aclweb.org/anthology/W19-4324}, year = {2019} }
@article{Warstadt2018, abstract = {In this work, we explore the ability of artificial neural networks to judge the grammatical acceptability of a sentence. Machine learning research of this kind is well placed to answer important open questions about the role of prior linguistic bias in language acquisition by providing a test for the Poverty of the Stimulus Argument. In service of this goal, we introduce the Corpus of Linguistic Acceptability (CoLA), a set of 10,657 English sentences labeled as grammatical or ungrammatical by expert linguists. We train several recurrent neural networks to do binary acceptability classification. These models set a baseline for the task. Error-analysis testing the models on specific grammatical phenomena reveals that they learn some systematic grammatical generalizations like subject-verb-object word order without any grammatical supervision. We find that neural sequence models show promise on the acceptability classification task. However, human-like performance across a wide range of grammatical constructions remains far off.}, archivePrefix = {arXiv}, arxivId = {1805.12471}, author = {Warstadt, Alex and Singh, Amanpreet and Bowman, Samuel R.}, eprint = {1805.12471}, file = {:Users/shanest/Documents/Library/Warstadt, Singh, Bowman/Transactions of the Association for Computational Linguistics/Warstadt, Singh, Bowman - 2019 - Neural Network Acceptability Judgments.pdf:pdf}, journal = {Transactions of the Association for Computational Linguistics}, keywords = {dataset,method: acceptability judgment,method: model comparison,method: new data}, title = {{Neural Network Acceptability Judgments}}, url = {http://arxiv.org/abs/1805.12471}, volume = {7}, year = {2019} }
@inproceedings{Tenney2019, abstract = {Contextualized representation models such as ELMo (Peters et al., 2018a) and BERT (Devlin et al., 2018) have recently achieved state-of-the-art results on a diverse array of downstream NLP tasks. Building on recent token-level probing work, we introduce a novel edge probing task design and construct a broad suite of sub-sentence tasks derived from the traditional structured NLP pipeline. We probe word-level contextual representations from four recent models and investigate how they encode sentence structure across a range of syntactic, semantic, local, and long-range phenomena. We find that existing models trained on language modeling and translation produce strong representations for syntactic phenomena, but only offer comparably small improvements on semantic tasks over a non-contextual baseline.}, archivePrefix = {arXiv}, arxivId = {1905.06316}, author = {Tenney, Ian and Xia, Patrick and Chen, Berlin and Wang, Alex and Poliak, Adam and McCoy, R Thomas and Kim, Najoung and {Van Durme}, Benjamin and Bowman, Samuel R and Das, Dipanjan and Pavlick, Ellie}, booktitle = {International Conference of Learning Representations (ICLR 2019)}, eprint = {1905.06316}, file = {:Users/shanest/Documents/Library/Tenney et al/International Conference of Learning Representations (ICLR 2019)/Tenney et al. - 2019 - What do you learn from context Probing for sentence structure in contextualized word representations.pdf:pdf}, keywords = {method: diagnostic classifier,method: model comparison,phenomenon: various}, pages = {1--17}, title = {{What do you learn from context? Probing for sentence structure in contextualized word representations}}, url = {http://arxiv.org/abs/1905.06316}, year = {2019} }
@inproceedings{McCoy2018, abstract = {Recurrent neural networks (RNNs) can learn continuous vector representations of symbolic structures such as sequences and sentences; these representations often exhibit linear regularities (analogies). Such regularities motivate our hypothesis that RNNs implicitly compile symbolic structures into tensor product representations (TPRs; Smolensky, 1990), which additively combine tensor products of vectors representing roles (e.g., sequence positions) and vectors representing fillers (e.g., particular words). To test this hypothesis, we introduce Tensor Product Decomposition Networks (TPDNs), which use TPRs to approximate existing vector representations. We demonstrate using synthetic data that TPDNs can successfully approximate linear and tree-based RNN autoencoder representations, suggesting that these representations exhibit interpretable compositional structure; we explore the settings that lead RNNs to induce such structure-sensitive representations. By contrast, further TPDN experiments show that the representations of four models trained to encode naturally-occurring sentences can be largely approximated with a bag-of-words, with only marginal improvements from more sophisticated structures. We conclude that TPDNs provide a powerful method for interpreting vector representations, and that standard RNNs can induce compositional sequence representations that are remarkably well approximated by TPRs; at the same time, existing training tasks for sentence representation learning may not be sufficient for inducing robust structural representations.}, archivePrefix = {arXiv}, arxivId = {1812.08718}, author = {McCoy, R. Thomas and Linzen, Tal and Dunbar, Ewan and Smolensky, Paul}, booktitle = {International Conference of Learning Representations (ICLR)}, eprint = {1812.08718}, file = {:Users/shanest/Documents/Library/McCoy et al/International Conference of Learning Representations (ICLR)/McCoy et al. - 2019 - RNNs Implicitly Implement Tensor Product Representations.pdf:pdf}, keywords = {method: model approximation,phenomenon: compositionality}, title = {{RNNs Implicitly Implement Tensor Product Representations}}, url = {http://arxiv.org/abs/1812.08718}, year = {2019} }
@inproceedings{Tenney2019a, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1905.05950v1}, author = {Tenney, Ian and Das, Dipanjan and Pavlick, Ellie}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1452}, eprint = {arXiv:1905.05950v1}, file = {:Users/shanest/Documents/Library/Tenney, Das, Pavlick/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Tenney, Das, Pavlick - 2019 - BERT Rediscovers the Classical NLP Pipeline.pdf:pdf}, keywords = {method: diagnostic classifier,phenomenon: various}, pages = {4593--4601}, publisher = {Association for Computational Linguistics}, title = {{BERT Rediscovers the Classical NLP Pipeline}}, url = {https://www.aclweb.org/anthology/P19-1452}, year = {2019} }
@inproceedings{Voita2019, abstract = {We seek to understand how the representations of individual tokens and the structure of the learned feature space evolve between layers in deep neural networks under different learning objectives. We focus on the Transformers for our analysis as they have been shown effective on various tasks, including machine translation (MT), standard left-to-right language models (LM) and masked language modeling (MLM). Previous work used black-box probing tasks to show that the representations learned by the Transformer differ significantly depending on the objective. In this work, we use canonical correlation analysis and mutual information estimators to study how information flows across Transformer layers and how this process depends on the choice of learning objective. For example, as you go from bottom to top layers, information about the past in left-to-right language models vanishes and predictions about the future are formed. In contrast, for MLM, representations initially acquire information about the context around the token, partially forgetting the token identity and producing a more generalized token representation. The token identity then gets recreated at the top MLM layers.}, archivePrefix = {arXiv}, arxivId = {1909.01380}, author = {Voita, Elena and Sennrich, Rico and Titov, Ivan}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, eprint = {1909.01380}, file = {:Users/shanest/Documents/Library/Voita, Sennrich, Titov/Empirical Methods in Natural Language Processing (EMNLP)/Voita, Sennrich, Titov - 2019 - The Bottom-up Evolution of Representations in the Transformer A Study with Machine Translation and Langu.pdf:pdf}, keywords = {method: canonical correlation analysis,method: information bottleneck,method: layer-wise analysis}, title = {{The Bottom-up Evolution of Representations in the Transformer: A Study with Machine Translation and Language Modeling Objectives}}, url = {http://arxiv.org/abs/1909.01380}, year = {2019} }
@inproceedings{Liu2019, abstract = {Contextual word representations derived from large-scale neural language models are successful across a diverse set of NLP tasks, suggesting that they encode useful and transferable features of language. To shed light on the linguistic knowledge they capture, we study the representations produced by several recent pretrained contextualizers (variants of ELMo, the OpenAI transformer language model, and BERT) with a suite of seventeen diverse probing tasks. We find that linear models trained on top of frozen contextual representations are competitive with state-of-the-art task-specific models in many cases, but fail on tasks requiring fine-grained linguistic knowledge (e.g., conjunct identification). To investigate the transferability of contextual word representations, we quantify differences in the transferability of individual layers within contextualizers, especially between recurrent neural networks (RNNs) and transformers. For instance, higher layers of RNNs are more task-specific, while transformer layers do not exhibit the same monotonic trend. In addition, to better understand what makes contextual word representations transferable, we compare language model pretraining with eleven supervised pretraining tasks. For any given task, pretraining on a closely related task yields better performance than language model pretraining (which is better on average) when the pretraining dataset is fixed. However, language model pretraining on more data gives the best results.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1903.08855}, author = {Liu, Nelson F. and Gardner, Matt and Belinkov, Yonatan and Peters, Matthew E. and Smith, Noah A.}, booktitle = {Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, doi = {10.18653/v1/N19-1112}, eprint = {1903.08855}, file = {:Users/shanest/Documents/Library/Liu et al/Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Liu et al. - 2019 - Linguistic Knowledge and Transferability of Contextual Representations.pdf:pdf}, keywords = {method: diagnostic classifier,method: layer-wise analysis,method: model comparison,phenomenon: various}, pages = {1073--1094}, publisher = {Association for Computational Linguistics}, title = {{Linguistic Knowledge and Transferability of Contextual Representations}}, url = {http://aclweb.org/anthology/N19-1112}, year = {2019} }
@article{Linzen2019, author = {Linzen, Tal}, doi = {10.1353/lan.2019.0001}, file = {:Users/shanest/Documents/Library/Linzen/Language/Linzen - 2019 - What can linguistics and deep learning contribute to each other Response to Pater.pdf:pdf}, issn = {1535-0665}, journal = {Language}, keywords = {position}, number = {1}, title = {{What can linguistics and deep learning contribute to each other? Response to Pater}}, url = {https://muse.jhu.edu/article/718440}, volume = {95}, year = {2019} }
@inproceedings{Andreas2019, abstract = {Many machine learning algorithms represent input data with vector embeddings or discrete codes. When inputs exhibit compositional structure (e.g. objects built from parts or procedures from subroutines), it is natural to ask whether this compositional structure is reflected in the inputs' learned representations. While the assessment of compositionality in languages has received significant attention in linguistics and adjacent fields, the machine learning literature lacks general-purpose tools for producing graded measurements of compositional structure in more general (e.g. vector-valued) representation spaces. We describe a procedure for evaluating compositionality by measuring how well the true representation-producing model can be approximated by a model that explicitly composes a collection of inferred representational primitives. We use the procedure to provide formal and empirical characterizations of compositional structure in a variety of settings, exploring the relationship between compositionality and learning dynamics, human judgments, representational similarity, and generalization.}, archivePrefix = {arXiv}, arxivId = {1902.07181}, author = {Andreas, Jacob}, booktitle = {International Conference of Learning Representations}, eprint = {1902.07181}, file = {:Users/shanest/Documents/Library/Andreas/International Conference of Learning Representations/Andreas - 2019 - Measuring Compositionality in Representation Learning.pdf:pdf}, keywords = {method: tree reconstruction error,phenomenon: compositionality}, title = {{Measuring Compositionality in Representation Learning}}, url = {http://arxiv.org/abs/1902.07181}, year = {2019} }
@article{Potts2018, archivePrefix = {arXiv}, arxivId = {arXiv:1809.03068v1}, author = {Potts, Christopher}, doi = {10.1353/lan.2019.0003}, eprint = {arXiv:1809.03068v1}, file = {:Users/shanest/Documents/Library/Potts/Language/Potts - 2019 - A case for deep learning in semantics Response to Pater.pdf:pdf}, issn = {1535-0665}, journal = {Language}, keywords = {position,survey}, number = {1}, title = {{A case for deep learning in semantics: Response to Pater}}, url = {https://muse.jhu.edu/article/718442}, volume = {95}, year = {2019} }
@inproceedings{Niven2019, abstract = {We are surprised to find that BERT's peak performance of 77% on the Argument Reasoning Comprehension Task reaches just three points below the average untrained human baseline. However, we show that this result is entirely accounted for by exploitation of spurious statistical cues in the dataset. We analyze the nature of these cues and demonstrate that a range of models all exploit them. This analysis informs the construction of an adversarial dataset on which all models achieve random accuracy. Our adversarial dataset provides a more robust assessment of argument comprehension and should be adopted as the standard in future work.}, address = {Stroudsburg, PA, USA}, author = {Niven, Timothy and Kao, Hung-Yu}, booktitle = {Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics}, doi = {10.18653/v1/P19-1459}, file = {:Users/shanest/Documents/Library/Niven, Kao/Proceedings of the 57th Annual Meeting of the Association for Computational Linguistics/Niven, Kao - 2019 - Probing Neural Network Comprehension of Natural Language Arguments.pdf:pdf}, keywords = {method: adversarial data,phenomenon: argumentation}, pages = {4658--4664}, publisher = {Association for Computational Linguistics}, title = {{Probing Neural Network Comprehension of Natural Language Arguments}}, url = {https://www.aclweb.org/anthology/P19-1459}, year = {2019} }
@article{Goldberg2019, abstract = {I assess the extent to which the recently introduced BERT model captures English syntactic phenomena, using (1) naturally-occurring subject-verb agreement stimuli; (2) "colorless green ideas" subject-verb agreement stimuli, in which content words in natural sentences are randomly replaced with words sharing the same part-of-speech and inflection; and (3) manually crafted stimuli for subject-verb agreement and reflexive anaphora phenomena. The BERT model performs remarkably well on all cases.}, archivePrefix = {arXiv}, arxivId = {1901.05287}, author = {Goldberg, Yoav}, eprint = {1901.05287}, file = {:Users/shanest/Documents/Library/Goldberg/Unknown/Goldberg - 2019 - Assessing BERT's Syntactic Abilities.pdf:pdf}, keywords = {method: psycholinguistic,phenomenon: number agreement}, title = {{Assessing BERT's Syntactic Abilities}}, url = {http://arxiv.org/abs/1901.05287}, year = {2019} }
@inproceedings{An2019a, abstract = {Neural language models have achieved state-of-the-art performances on many NLP tasks, and recently have been shown to learn a number of hierarchically-sensitive syntactic dependencies between individual words. However, equally important for language processing is the ability to combine words into phrasal constituents, and use constituent-level features to drive downstream expectations. Here we investigate neural models' ability to represent constituent-level features, using coordinated noun phrases as a case study. We assess whether different neural language models trained on English and French represent phrase-level number and gender features, and use those features to drive downstream expectations. Our results suggest that models use a linear combination of NP constituent number to drive CoordNP/verb number agreement. This behavior is highly regular and even sensitive to local syntactic context, however it differs crucially from observed human behavior. Models have less success with gender agreement. Models trained on large corpora perform best, and there is no obvious advantage for models trained using explicit syntactic supervision.}, archivePrefix = {arXiv}, arxivId = {1909.04625}, author = {An, Aixiu and Qian, Peng and Wilcox, Ethan and Levy, Roger}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, eprint = {1909.04625}, file = {:Users/shanest/Documents/Library/An et al/Empirical Methods in Natural Language Processing (EMNLP)/An et al. - 2019 - Representation of Constituents in Neural Language Models Coordination Phrase as a Case Study.pdf:pdf}, keywords = {method: psycholinguistic,phenomenon: coordinated NPs}, title = {{Representation of Constituents in Neural Language Models: Coordination Phrase as a Case Study}}, url = {http://arxiv.org/abs/1909.04625}, year = {2019} }
@inproceedings{Jumelet2019, abstract = {Extensive research has recently shown that recurrent neural language models are able to process a wide range of grammatical phenomena. How these models are able to perform these remarkable feats so well, however, is still an open question. To gain more insight into what information LSTMs base their decisions on, we propose a generalisation of Contextual Decomposition (GCD). In particular, this setup enables us to accurately distil which part of a prediction stems from semantic heuristics, which part truly emanates from syntactic cues and which part arises from the model biases themselves. We investigate this technique on tasks pertaining to syntactic agreement and co-reference resolution and discover that the model strongly relies on a default reasoning effect to perform these tasks.}, archivePrefix = {arXiv}, arxivId = {1909.08975}, author = {Jumelet, Jaap and Zuidema, Willem and Hupkes, Dieuwke}, booktitle = {Proceedings of the Conference on Computational Natural Language Learning (CoNLL)}, eprint = {1909.08975}, file = {:Users/shanest/Documents/Library/Jumelet, Zuidema, Hupkes/Proceedings of the Conference on Computational Natural Language Learning (CoNLL)/Jumelet, Zuidema, Hupkes - 2019 - Analysing Neural Language Models Contextual Decomposition Reveals Default Reasoning in Number and Gend.pdf:pdf}, keywords = {method: contextual decomposition,phenomenon: gender agreement,phenomenon: number agreement}, title = {{Analysing Neural Language Models: Contextual Decomposition Reveals Default Reasoning in Number and Gender Assignment}}, url = {http://arxiv.org/abs/1909.08975}, year = {2019} }
@inproceedings{Warstadt2019, abstract = {Though state-of-the-art sentence representation models can perform tasks requiring significant knowledge of grammar, it is an open question how best to evaluate their grammatical knowledge. We explore five experimental methods inspired by prior work evaluating pretrained sentence representation models. We use a single linguistic phenomenon, negative polarity item (NPI) licensing in English, as a case study for our experiments. NPIs like "any" are grammatical only if they appear in a licensing environment like negation ("Sue doesn't have any cats" vs. "Sue has any cats"). This phenomenon is challenging because of the variety of NPI licensing environments that exist. We introduce an artificially generated dataset that manipulates key features of NPI licensing for the experiments. We find that BERT has significant knowledge of these features, but its success varies widely across different experimental methods. We conclude that a variety of methods is necessary to reveal all relevant aspects of a model's grammatical knowledge in a given domain.}, archivePrefix = {arXiv}, arxivId = {1909.02597}, author = {Warstadt, Alex and Cao, Yu and Grosu, Ioana and Peng, Wei and Blix, Hagen and Nie, Yining and Alsop, Anna and Bordia, Shikha and Liu, Haokun and Parrish, Alicia and Wang, Sheng-Fu and Phang, Jason and Mohananey, Anhad and Htut, Phu Mon and Jereti{\v{c}}, Paloma and Bowman, Samuel R.}, booktitle = {Empirical Methods in Natural Language Processing (EMNLP)}, eprint = {1909.02597}, file = {:Users/shanest/Documents/Library/Warstadt et al/Empirical Methods in Natural Language Processing (EMNLP)/Warstadt et al. - 2019 - Investigating BERT's Knowledge of Language Five Analysis Methods with NPIs.pdf:pdf}, keywords = {method: method comparison,phenomenon: NPIs}, title = {{Investigating BERT's Knowledge of Language: Five Analysis Methods with NPIs}}, url = {http://arxiv.org/abs/1909.02597}, year = {2019} }
@article{Pater2019, abstract = {The birthdate of both generative linguistics and neural networks can be taken as 1957, the year of the publication of foundational work by both Noam Chomsky and Frank Rosenblatt. This paper traces the development of these two approaches to cognitive science, from their largely autonomous early development in their first thirty years, through their collision in the 1980s around the past tense debate (Rumelhart and McClelland 1986, Pinker and Prince 1988), and their integration in much subsequent work up to the present, 2017. Although this integration has produced a considerable body of results, the continued general gulf between these two lines of research is likely impeding progress in both: on learning in generative linguistics, and on the representation of language in neural modeling. The paper concludes with a brief argument that generative linguistics is unlikely to fulfill its promise of accounting for language learning if it continues to maintain its distance from neural and other statistical approaches to learning.}, author = {Pater, Joe}, doi = {10.1353/lan.2019.0005}, file = {:Users/shanest/Documents/Library/Pater/Language/Pater - 2019 - Generative linguistics and neural networks at 60 Foundation, friction, and fusion.pdf:pdf}, issn = {1535-0665}, journal = {Language}, keywords = {position,survey}, number = {1}, title = {{Generative linguistics and neural networks at 60: Foundation, friction, and fusion}}, url = {https://muse.jhu.edu/article/718444}, volume = {95}, year = {2019} }
@inproceedings{Ethayarajh2019, abstract = {Replacing static word embeddings with contextualized word representations has yielded significant improvements on many NLP tasks. However, just how contextual are the contextualized representations produced by models such as ELMo and BERT? Are there infinitely many context-specific representations for each word, or are words essentially assigned one of a finite number of word-sense representations? For one, we find that the contextualized representations of all words are not isotropic in any layer of the contextualizing model. While representations of the same word in different contexts still have a greater cosine similarity than those of two different words, this self-similarity is much lower in upper layers. This suggests that upper layers of contextualizing models produce more context-specific representations, much like how upper layers of LSTMs produce more task-specific representations. In all layers of ELMo, BERT, and GPT-2, on average, less than 5% of the variance in a word's contextualized representations can be explained by a static embedding for that word, providing some justification for the success of contextualized representations.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1909.00512}, author = {Ethayarajh, Kawin}, booktitle = {Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP)}, doi = {10.18653/v1/D19-1006}, eprint = {1909.00512}, file = {:Users/shanest/Documents/Library/Ethayarajh/Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural./Ethayarajh - 2019 - How Contextual are Contextualized Word Representations Comparing the Geometry of BERT, ELMo, and GPT-2 Embeddings.pdf:pdf}, keywords = {method: geometry,method: model comparison}, pages = {55--65}, publisher = {Association for Computational Linguistics}, title = {{How Contextual are Contextualized Word Representations? Comparing the Geometry of BERT, ELMo, and GPT-2 Embeddings}}, url = {https://www.aclweb.org/anthology/D19-1006}, year = {2019} }
@inproceedings{Linzen2018, author = {Linzen, Tal and Leonard, Brian}, booktitle = {Proceedings of the 40th Annual Conference of the Cognitive Science Society}, file = {:Users/shanest/Documents/Library/Linzen, Leonard/Proceedings of the 40th Annual Conference of the Cognitive Science Society/Linzen, Leonard - 2018 - Distinct patterns of syntactic agreement errors in recurrent networks and humans.pdf:pdf}, keywords = {method: human comparison,method: psycholinguistic,phenomenon: number agreement}, title = {{Distinct patterns of syntactic agreement errors in recurrent networks and humans}}, url = {https://arxiv.org/abs/1807.06882}, year = {2018} }
@inproceedings{Poliak2018, address = {Stroudsburg, PA, USA}, author = {Poliak, Adam and Naradowsky, Jason and Haldar, Aparajita and Rudinger, Rachel and {Van Durme}, Benjamin}, booktitle = {Proceedings of the Seventh Joint Conference on Lexical and Computational Semantics}, doi = {10.18653/v1/S18-2023}, file = {:Users/shanest/Documents/Library/Poliak et al/Proceedings of the Seventh Joint Conference on Lexical and Computational Semantics/Poliak et al. - 2018 - Hypothesis Only Baselines in Natural Language Inference.pdf:pdf}, keywords = {method: dataset analysis,phenomenon: inference}, pages = {180--191}, publisher = {Association for Computational Linguistics}, title = {{Hypothesis Only Baselines in Natural Language Inference}}, url = {http://aclweb.org/anthology/S18-2023}, year = {2018} }
@inproceedings{Evans2018, archivePrefix = {arXiv}, arxivId = {arXiv:1802.08535v1}, author = {Evans, Richard and Saxton, David and Amos, David and Kohli, Pushmeet and Grefenstette, Edward}, booktitle = {International Conference of Learning Representations}, eprint = {arXiv:1802.08535v1}, file = {:Users/shanest/Documents/Library/Evans et al/International Conference of Learning Representations/Evans et al. - 2018 - Can Neural Networks Understand Logical Entailment.pdf:pdf}, keywords = {method: new data,phenomenon: inference}, title = {{Can Neural Networks Understand Logical Entailment?}}, url = {https://arxiv.org/abs/1802.08535}, year = {2018} }
@inproceedings{Naik2018, abstract = {Natural language inference (NLI) is the task of determining if a natural language hypothesis can be inferred from a given premise in a justifiable manner. NLI was proposed as a benchmark task for natural language understanding. Existing models perform well at standard datasets for NLI, achieving impressive results across different genres of text. However, the extent to which these models understand the semantic content of sentences is unclear. In this work, we propose an evaluation methodology consisting of automatically constructed "stress tests" that allow us to examine whether systems have the ability to make real inferential decisions. Our evaluation of six sentence-encoder models on these stress tests reveals strengths and weaknesses of these models with respect to challenging linguistic phenomena, and suggests important directions for future work in this area.}, archivePrefix = {arXiv}, arxivId = {1806.00692}, author = {Naik, Aakanksha and Ravichander, Abhilasha and Sadeh, Norman and Rose, Carolyn and Neubig, Graham}, booktitle = {Proceedings of the 27th International Conference on Computational Linguistics (COLING)}, eprint = {1806.00692}, file = {:Users/shanest/Documents/Library/Naik et al/Proceedings ofthe 27th International Conference on Computational Linguistics (COLING)/Naik et al. - 2018 - Stress Test Evaluation for Natural Language Inference.pdf:pdf}, keywords = {method: adversarial data}, pages = {2340--2353}, title = {{Stress Test Evaluation for Natural Language Inference}}, url = {https://www.aclweb.org/anthology/C18-1198}, year = {2018} }
@inproceedings{Peters2018, abstract = {We introduce a new type of deep contextualized word representation that models both (1) complex characteristics of word use (e.g., syntax and semantics), and (2) how these uses vary across linguistic contexts (i.e., to model polysemy). Our word vectors are learned functions of the internal states of a deep bidirectional language model (biLM), which is pre-trained on a large text corpus. We show that these representations can be easily added to existing models and significantly improve the state of the art across six challenging NLP problems, including question answering, textual entailment and sentiment analysis. We also present an analysis showing that exposing the deep internals of the pre-trained network is crucial, allowing downstream models to mix different types of semi-supervision signals.}, archivePrefix = {arXiv}, arxivId = {1802.05365}, author = {Peters, Matthew E. and Neumann, Mark and Iyyer, Mohit and Gardner, Matt and Clark, Christopher and Lee, Kenton and Zettlemoyer, Luke}, booktitle = {Proceedings of North American Association for Computational Linguistics (NAACL)}, eprint = {1802.05365}, file = {:Users/shanest/Documents/Library/Peters et al/Proceedings of North American Association for Computational Linguistics (NAACL)/Peters et al. - 2018 - Deep contextualized word representations.pdf:pdf}, keywords = {model}, month = {feb}, title = {{Deep contextualized word representations}}, url = {http://arxiv.org/abs/1802.05365}, year = {2018} }
@inproceedings{Conneau2018, abstract = {Although much effort has recently been devoted to training high-quality sentence embeddings, we still have a poor understanding of what they are capturing. "Downstream" tasks, often based on sentence classification, are commonly used to evaluate the quality of sentence representations. The complexity of the tasks makes it however difficult to infer what kind of information is present in the representations. We introduce here 10 probing tasks designed to capture simple linguistic features of sentences, and we use them to study embeddings generated by three different encoders trained in eight distinct ways, uncovering intriguing properties of both encoders and training methods.}, address = {Stroudsburg, PA, USA}, author = {Conneau, Alexis and Kruszewski, German and Lample, Guillaume and Barrault, Lo{\"{i}}c and Baroni, Marco}, booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)}, doi = {10.18653/v1/P18-1198}, file = {:Users/shanest/Documents/Library/Conneau et al/Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 1 Long Papers)/Conneau et al. - 2018 - What you can cram into a single $&!# vector Probing sentence embeddings for linguistic properties.pdf:pdf}, isbn = {9781948087322}, keywords = {method: diagnostic classifier,method: pre-training task comparison,method: sentence-level,phenomenon: various}, pages = {2126--2136}, publisher = {Association for Computational Linguistics}, title = {{What you can cram into a single \$\&!\#* vector: Probing sentence embeddings for linguistic properties}}, url = {http://aclweb.org/anthology/P18-1198}, volume = {1}, year = {2018} }
@inproceedings{Lake2018, archivePrefix = {arXiv}, arxivId = {1711.00350v3}, author = {Lake, Brenden and Baroni, Marco}, booktitle = {International Conference of Machine Learning (ICML 2018)}, eprint = {1711.00350v3}, file = {:Users/shanest/Documents/Library/Lake, Baroni/International Conference of Machine Learning (ICML 2018)/Lake, Baroni - 2018 - Generalization without Systematicity On the Compositional Skills of Sequence-to-Sequence Recurrent Networks.pdf:pdf}, keywords = {method: new data,phenomenon: compositionality}, title = {{Generalization without Systematicity: On the Compositional Skills of Sequence-to-Sequence Recurrent Networks}}, url = {https://arxiv.org/abs/1711.00350}, year = {2018} }
@inproceedings{Giulianelli2018, abstract = {How do neural language models keep track of number agreement between subject and verb? We show that `diagnostic classifiers', trained to predict number from the internal states of a language model, provide a detailed understanding of how, when, and where this information is represented. Moreover, they give us insight into when and where number information is corrupted in cases where the language model ends up making agreement errors. To demonstrate the causal role played by the representations we find, we then use agreement information to influence the course of the LSTM during the processing of difficult sentences. Results from such an intervention reveal a large increase in the language model's accuracy. Together, these results show that diagnostic classifiers give us an unrivalled detailed look into the representation of linguistic information in neural models, and demonstrate that this knowledge can be used to improve their performance.}, archivePrefix = {arXiv}, arxivId = {1808.08079}, author = {Giulianelli, Mario and Harding, Jack and Mohnert, Florian and Hupkes, Dieuwke and Zuidema, Willem}, booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP}, eprint = {1808.08079}, file = {:Users/shanest/Documents/Library/Giulianelli et al/Proceedings of the 2018 EMNLP Workshop BlackboxNLP/Giulianelli et al. - 2018 - Under the Hood Using Diagnostic Classifiers to Investigate and Improve how Language Models Track Agreement I.pdf:pdf}, keywords = {method: diagnostic classifier,method: intervention,phenomenon: number agreement}, pages = {240--248}, title = {{Under the Hood: Using Diagnostic Classifiers to Investigate and Improve how Language Models Track Agreement Information}}, url = {http://arxiv.org/abs/1808.08079}, year = {2018} }
@article{Kirov2018, abstract = {Can advances in NLP help advance cognitive modeling? We examine the role of artificial neural networks, the current state of the art in many common NLP tasks, by returning to a classic case study. In 1986, Rumelhart and McClelland famously introduced a neural architecture that learned to transduce English verb stems to their past tense forms. Shortly thereafter, Pinker & Prince (1988) presented a comprehensive rebuttal of many of Rumelhart and McClelland's claims. Much of the force of their attack centered on the empirical inadequacy of the Rumelhart and McClelland (1986) model. Today, however, that model is severely outmoded. We show that the Encoder-Decoder network architectures used in modern NLP systems obviate most of Pinker and Prince's criticisms without requiring any simplification of the past tense mapping problem. We suggest that the empirical performance of modern networks warrants a re-examination of their utility in linguistic and cognitive modeling.}, archivePrefix = {arXiv}, arxivId = {1807.04783}, author = {Kirov, Christo and Cotterell, Ryan}, eprint = {1807.04783}, file = {:Users/shanest/Documents/Library/Kirov, Cotterell/Transactions of the Association of Computational Linguistics/Kirov, Cotterell - 2018 - Recurrent Neural Networks in Linguistic Theory Revisiting Pinker and Prince (1988) and the Past Tense Debate.pdf:pdf}, journal = {Transactions of the Association for Computational Linguistics}, keywords = {method: acquisition,phenomenon: tense,position}, pages = {651--665}, title = {{Recurrent Neural Networks in Linguistic Theory: Revisiting Pinker and Prince (1988) and the Past Tense Debate}}, url = {http://arxiv.org/abs/1807.04783}, volume = {6}, year = {2018} }
@inproceedings{Peters2019, abstract = {Contextual word representations derived from pre-trained bidirectional language models (biLMs) have recently been shown to provide significant improvements to the state of the art for a wide range of NLP tasks. However, many questions remain as to how and why these models are so effective. In this paper, we present a detailed empirical study of how the choice of neural architecture (e.g. LSTM, CNN, or self attention) influences both end task accuracy and qualitative properties of the representations that are learned. We show there is a tradeoff between speed and accuracy, but all architectures learn high quality contextual representations that outperform word embeddings for four challenging NLP tasks. Additionally, all architectures learn representations that vary with network depth, from exclusively morphological based at the word embedding layer through local syntax based in the lower contextual layers to longer range semantics such as coreference at the upper layers. Together, these results suggest that unsupervised biLMs, independent of architecture, are learning much more about the structure of language than previously appreciated.}, address = {Stroudsburg, PA, USA}, author = {Peters, Matthew and Neumann, Mark and Zettlemoyer, Luke and Yih, Wen-tau}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, doi = {10.18653/v1/D18-1179}, file = {:Users/shanest/Documents/Library/Peters et al/Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing/Peters et al. - 2018 - Dissecting Contextual Word Embeddings Architecture and Representation.pdf:pdf}, keywords = {method: diagnostic classifier,method: layer-wise analysis,method: model comparison}, pages = {1499--1509}, publisher = {Association for Computational Linguistics}, title = {{Dissecting Contextual Word Embeddings: Architecture and Representation}}, url = {https://aclweb.org/anthology/D18-1179}, year = {2018} }
@inproceedings{Gururangan2018, address = {Stroudsburg, PA, USA}, author = {Gururangan, Suchin and Swayamdipta, Swabha and Levy, Omer and Schwartz, Roy and Bowman, Samuel and Smith, Noah A}, booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 2 (Short Papers)}, doi = {10.18653/v1/N18-2017}, file = {:Users/shanest/Documents/Library/Gururangan et al/Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Gururangan et al. - 2018 - Annotation Artifacts in Natural Language Inference Data.pdf:pdf}, keywords = {method: dataset analysis,phenomenon: inference}, pages = {107--112}, publisher = {Association for Computational Linguistics}, title = {{Annotation Artifacts in Natural Language Inference Data}}, url = {http://aclweb.org/anthology/N18-2017}, year = {2018} }
@inproceedings{Sennhauser2018, abstract = {While long short-term memory (LSTM) neural net architectures are designed to capture sequence information, human language is generally composed of hierarchical structures. This raises the question as to whether LSTMs can learn hierarchical structures. We explore this question with a well-formed bracket prediction task using two types of brackets modeled by an LSTM. Demonstrating that such a system is learnable by an LSTM is the first step in demonstrating that the entire class of CFLs is also learnable. We observe that the model requires exponential memory in terms of the number of characters and embedded depth, where a sub-linear memory should suffice. Still, the model does more than memorize the training input. It learns how to distinguish between relevant and irrelevant information. On the other hand, we also observe that the model does not generalize well. We conclude that LSTMs do not learn the relevant underlying context-free rules, suggesting the good overall performance is attained rather by an efficient way of evaluating nuisance variables. LSTMs are a way to quickly reach good results for many natural language tasks, but to understand and generate natural language one has to investigate other concepts that can make more direct use of natural language's structural nature.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1811.02611}, author = {Sennhauser, Luzi and Berwick, Robert}, booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W18-5414}, eprint = {1811.02611}, file = {:Users/shanest/Documents/Library/Sennhauser, Berwick/Proceedings of the 2018 EMNLP Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Sennhauser, Berwick - 2018 - Evaluating the Ability of LSTMs to Learn Context-Free Grammars.pdf:pdf}, keywords = {method: formal languages}, pages = {115--124}, publisher = {Association for Computational Linguistics}, title = {{Evaluating the Ability of LSTMs to Learn Context-Free Grammars}}, url = {http://aclweb.org/anthology/W18-5414}, year = {2018} }
@article{Hupkes2018, abstract = {We investigate how neural networks can learn and process languages with hierarchical, compositional semantics. To this end, we define the artificial task of processing nested arithmetic expressions, and study whether different types of neural networks can learn to compute their meaning. We find that recursive neural networks can find a generalising solution to this problem, and we visualise this solution by breaking it up into three steps: project, sum and squash. As a next step, we investigate recurrent neural networks, and show that a gated recurrent unit, that processes its input incrementally, also performs very well on this task. To develop an understanding of what the recurrent network encodes, visualisation techniques alone do not suffice. Therefore, we develop an approach where we formulate and test multiple hypotheses on the information encoded and processed by the network. For each hypothesis, we derive predictions about features of the hidden state representations at each time step, and train 'diagnostic classifiers' to test those predictions. Our results indicate that the networks follow a strategy similar to our hypothesised 'cumulative strategy', which explains the high accuracy of the network on novel expressions, the generalisation to longer expressions than seen in training, and the mild deterioration with increasing length. This in turn shows that diagnostic classifiers can be a useful technique for opening up the black box of neural networks. We argue that diagnostic classification, unlike most visualisation techniques, does scale up from small networks in a toy domain, to larger and deeper recurrent networks dealing with real-life data, and may therefore contribute to a better understanding of the internal dynamics of current state-of-the-art models in natural language processing.}, archivePrefix = {arXiv}, arxivId = {arXiv:1711.10203v2}, author = {Hupkes, Dieuwke and Veldhoen, Sara and Zuidema, Willem}, doi = {10.1613/jair.1.11196}, eprint = {arXiv:1711.10203v2}, file = {:Users/shanest/Documents/Library/Hupkes, Veldhoen, Zuidema/Journal of Artificial Intelligence Research/Hupkes, Veldhoen, Zuidema - 2018 - Visualisation and `Diagnostic Classifiers' Reveal how Recurrent and Recursive Neural Networks Process.pdf:pdf}, journal = {Journal of Artificial Intelligence Research}, keywords = {method: diagnostic classifier}, pages = {907--926}, title = {{Visualisation and `Diagnostic Classifiers' Reveal how Recurrent and Recursive Neural Networks Process Hierarchical Structure}}, url = {https://doi.org/10.1613/jair.1.11196}, volume = {61}, year = {2018} }
@inproceedings{Jumelet2018, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {arXiv:1808.10627v1}, author = {Jumelet, Jaap and Hupkes, Dieuwke}, booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W18-5424}, eprint = {arXiv:1808.10627v1}, file = {:Users/shanest/Documents/Library/Jumelet, Hupkes/Proceedings of the 2018 EMNLP Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Jumelet, Hupkes - 2018 - Do Language Models Understand emph{Anything} On the Ability of LSTMs to Understand Negative Polarity Items.pdf:pdf}, keywords = {method: diagnostic classifier,method: psycholinguistic,phenomenon: NPIs}, pages = {222--231}, publisher = {Association for Computational Linguistics}, title = {{Do Language Models Understand \emph{Anything}? On the Ability of LSTMs to Understand Negative Polarity Items}}, url = {http://aclweb.org/anthology/W18-5424}, year = {2018} }
@inproceedings{Marvin2019, abstract = {We present a dataset for evaluating the grammaticality of the predictions of a language model. We automatically construct a large number of minimally different pairs of English sentences, each consisting of a grammatical and an ungrammatical sentence. The sentence pairs represent different variations of structure-sensitive phenomena: subject-verb agreement, reflexive anaphora and negative polarity items. We expect a language model to assign a higher probability to the grammatical sentence than the ungrammatical one. In an experiment using this data set, an LSTM language model performed poorly on many of the constructions. Multi-task training with a syntactic objective (CCG supertagging) improved the LSTM's accuracy, but a large gap remained between its performance and the accuracy of human participants recruited online. This suggests that there is considerable room for improvement over LSTMs in capturing syntax in a language model.}, address = {Stroudsburg, PA, USA}, author = {Marvin, Rebecca and Linzen, Tal}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, doi = {10.18653/v1/D18-1151}, file = {:Users/shanest/Documents/Library/Marvin, Linzen/Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing/Marvin, Linzen - 2018 - Targeted Syntactic Evaluation of Language Models.pdf:pdf}, keywords = {method: acceptability judgment,method: psycholinguistic,phenomenon: NPIs,phenomenon: anaphora,phenomenon: number agreement}, pages = {1192--1202}, publisher = {Association for Computational Linguistics}, title = {{Targeted Syntactic Evaluation of Language Models}}, url = {http://aclweb.org/anthology/D18-1151}, year = {2018} }
@inproceedings{Zhang2018b, abstract = {Recent work using auxiliary prediction task classifiers to investigate the properties of LSTM representations has begun to shed light on why pretrained representations, like ELMo (Peters et al., 2018) and CoVe (McCann et al., 2017), are so beneficial for neural language understanding models. We still, though, do not yet have a clear understanding of how the choice of pretraining objective affects the type of linguistic information that models learn. With this in mind, we compare four objectives---language modeling, translation, skip-thought, and autoencoding---on their ability to induce syntactic and part-of-speech information. We make a fair comparison between the tasks by holding constant the quantity and genre of the training data, as well as the LSTM architecture. We find that representations from language models consistently perform best on our syntactic auxiliary prediction tasks, even when trained on relatively small amounts of data. These results suggest that language modeling may be the best data-rich pretraining task for transfer learning applications requiring syntactic information. We also find that the representations from randomly-initialized, frozen LSTMs perform strikingly well on our syntactic auxiliary tasks, but this effect disappears when the amount of training data for the auxiliary tasks is reduced.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1809.10040}, author = {Zhang, Kelly and Bowman, Samuel}, booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W18-5448}, eprint = {1809.10040}, file = {:Users/shanest/Documents/Library/Zhang, Bowman/Proceedings of the 2018 EMNLP Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Zhang, Bowman - 2018 - Language Modeling Teaches You More than Translation Does Lessons Learned Through Auxiliary Syntactic Task Analysi.pdf:pdf}, keywords = {method: diagnostic classifier,method: layer-wise analysis,method: pre-training task comparison}, pages = {359--361}, publisher = {Association for Computational Linguistics}, title = {{Language Modeling Teaches You More than Translation Does: Lessons Learned Through Auxiliary Syntactic Task Analysis}}, url = {http://aclweb.org/anthology/W18-5448}, year = {2018} }
@inproceedings{Blevins2018, abstract = {We present a set of experiments to demonstrate that deep recurrent neural networks (RNNs) learn internal representations that capture soft hierarchical notions of syntax from highly varied supervision. We consider four syntax tasks at different depths of the parse tree; for each word, we predict its part of speech as well as the first (parent), second (grandparent) and third level (great-grandparent) constituent labels that appear above it. These predictions are made from representations produced at different depths in networks that are pretrained with one of four objectives: dependency parsing, semantic role labeling, machine translation, or language modeling. In every case, we find a correspondence between network depth and syntactic depth, suggesting that a soft syntactic hierarchy emerges. This effect is robust across all conditions, indicating that the models encode significant amounts of syntax even in the absence of an explicit syntactic training objective.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1805.04218}, author = {Blevins, Terra and Levy, Omer and Zettlemoyer, Luke}, booktitle = {Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2: Short Papers)}, doi = {10.18653/v1/P18-2003}, eprint = {1805.04218}, file = {:Users/shanest/Documents/Library/Blevins, Levy, Zettlemoyer/Proceedings of the 56th Annual Meeting of the Association for Computational Linguistics (Volume 2 Short Papers)/Blevins, Levy, Zettlemoyer - 2018 - Deep RNNs Encode Soft Hierarchical Syntax.pdf:pdf}, isbn = {9781948087346}, keywords = {method: diagnostic classifier,method: pre-training task comparison,phenomenon: hierarchical syntax}, pages = {14--19}, publisher = {Association for Computational Linguistics}, title = {{Deep RNNs Encode Soft Hierarchical Syntax}}, url = {http://aclweb.org/anthology/P18-2003}, volume = {2}, year = {2018} }
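The layer-wise design (one probe per network layer per syntactic depth) reduces to a simple loop. Below is a toy harness assuming PyTorch and scikit-learn: bracket depth in a synthetic bracket language stands in for the POS/parent/grandparent labels, and frozen random LSTM layers stand in for the pretrained encoders, so the printed numbers only exercise the plumbing, not the paper's finding.

import numpy as np, torch, torch.nn as nn
from sklearn.linear_model import LogisticRegression

torch.manual_seed(0); rng = np.random.default_rng(0)

def sample_brackets(n=40):
    toks, depth, depths = [], 0, []
    for _ in range(n):
        t = rng.choice(["(", ")", "a"]) if depth > 0 else rng.choice(["(", "a"])
        depth += (t == "(") - (t == ")")
        toks.append("()a".index(t)); depths.append(depth)
    return toks, depths

emb = nn.Embedding(3, 16)
layers = nn.ModuleList([nn.LSTM(16 if i == 0 else 32, 32, batch_first=True)
                        for i in range(3)])

def layerwise_states(tok_ids):
    # run the stack layer by layer, keeping every layer's hidden states
    h = emb(torch.tensor([tok_ids]))
    states = []
    with torch.no_grad():
        for lstm in layers:
            h, _ = lstm(h)
            states.append(h[0].numpy())
    return states

X = [[] for _ in layers]; y = []
for _ in range(100):
    toks, depths = sample_brackets()
    for i, s in enumerate(layerwise_states(toks)):
        X[i].append(s)
    y.extend(min(d, 3) for d in depths)     # clip depths into a few classes
y = np.array(y)
for i, feats in enumerate(X):
    feats = np.concatenate(feats)
    # fit and score on the same data: this is only a harness check
    acc = LogisticRegression(max_iter=2000).fit(feats, y).score(feats, y)
    print(f"layer {i}: bracket-depth probe accuracy {acc:.3f}")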
@inproceedings{White2018, abstract = {We investigate neural models' ability to capture lexicosyntactic inferences: inferences triggered by the interaction of lexical and syntactic information. We take the task of event factuality prediction as a case study and build a factuality judgment dataset for all English clause-embedding verbs in various syntactic contexts. We use this dataset, which we make publicly available, to probe the behavior of current state-of-the-art neural systems, showing that these systems make certain systematic errors that are clearly visible through the lens of factuality prediction.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1808.06232}, author = {White, Aaron Steven and Rudinger, Rachel and Rawlins, Kyle and {Van Durme}, Benjamin}, booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing}, doi = {10.18653/v1/D18-1501}, eprint = {1808.06232}, file = {:Users/shanest/Documents/Library/White et al/Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing/White et al. - 2018 - Lexicosyntactic Inference in Neural Models.pdf:pdf}, keywords = {dataset,phenomenon: factuality}, pages = {4717--4724}, publisher = {Association for Computational Linguistics}, title = {{Lexicosyntactic Inference in Neural Models}}, url = {https://www.aclweb.org/anthology/D18-1501/}, year = {2018} }
@inproceedings{Gulordava2018, abstract = {Recurrent neural networks (RNNs) have achieved impressive results in a variety of linguistic processing tasks, suggesting that they can induce non-trivial properties of language. We investigate here to what extent RNNs learn to track abstract hierarchical syntactic structure. We test whether RNNs trained with a generic language modeling objective in four languages (Italian, English, Hebrew, Russian) can predict long-distance number agreement in various constructions. We include in our evaluation nonsensical sentences where RNNs cannot rely on semantic or lexical cues ("The colorless green ideas I ate with the chair sleep furiously"), and, for Italian, we compare model performance to human intuitions. Our language-model-trained RNNs make reliable predictions about long-distance agreement, and do not lag much behind human performance. We thus bring support to the hypothesis that RNNs are not just shallow-pattern extractors, but they also acquire deeper grammatical competence.}, address = {Stroudsburg, PA, USA}, archivePrefix = {arXiv}, arxivId = {1803.11138}, author = {Gulordava, Kristina and Bojanowski, Piotr and Grave, Edouard and Linzen, Tal and Baroni, Marco}, booktitle = {Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long Papers)}, doi = {10.18653/v1/N18-1108}, eprint = {1803.11138}, file = {:Users/shanest/Documents/Library/Gulordava et al/Proceedings of the 2018 Conference of the North American Chapter of the Association for Computational Linguistics Human Language Technologie./Gulordava et al. - 2018 - Colorless Green Recurrent Networks Dream Hierarchically.pdf:pdf}, keywords = {method: cross-linguistic,method: nonsense,model,phenomenon: number agreement}, pages = {1195--1205}, publisher = {Association for Computational Linguistics}, title = {{Colorless Green Recurrent Networks Dream Hierarchically}}, url = {http://aclweb.org/anthology/N18-1108}, year = {2018} }
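The agreement test can equivalently be run on the verb's conditional probability given the (possibly nonsensical) prefix. A sketch, again assuming PyTorch plus HuggingFace transformers with GPT-2 as a stand-in for the paper's RNN language models; the prefix is the paper's own nonce sentence.

import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2").eval()

def logp_continuation(prefix, cont):
    # log P(cont | prefix); cont starts with a space so GPT-2's BPE
    # tokenisation of the prefix is unchanged by the concatenation
    full = tok(prefix + cont, return_tensors="pt").input_ids
    plen = tok(prefix, return_tensors="pt").input_ids.size(1)
    with torch.no_grad():
        logprobs = model(full).logits.log_softmax(-1)
    # sum log-probs of each continuation token given everything before it
    return sum(logprobs[0, i - 1, full[0, i]].item()
               for i in range(plen, full.size(1)))

nonce = "The colorless green ideas I ate with the chair"
print(logp_continuation(nonce, " sleep") > logp_continuation(nonce, " sleeps"))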
@inproceedings{Bacon2018, address = {Stroudsburg, PA, USA}, author = {Bacon, Geoff and Regier, Terry}, booktitle = {Proceedings of the 2018 EMNLP Workshop BlackboxNLP: Analyzing and Interpreting Neural Networks for NLP}, doi = {10.18653/v1/W18-5440}, file = {:Users/shanest/Documents/Library/Bacon, Regier/Proceedings of the 2018 EMNLP Workshop BlackboxNLP Analyzing and Interpreting Neural Networks for NLP/Bacon, Regier - 2018 - Probing sentence embeddings for structure-dependent tense.pdf:pdf}, keywords = {method: diagnostic classifier,phenomenon: tense}, pages = {334--336}, publisher = {Association for Computational Linguistics}, title = {{Probing sentence embeddings for structure-dependent tense}}, url = {http://aclweb.org/anthology/W18-5440}, year = {2018} }
@unpublished{Alain2017, abstract = {Neural network models have a reputation for being black boxes. We propose to monitor the features at every layer of a model and measure how suitable they are for classification. We use linear classifiers, which we refer to as "probes", trained entirely independently of the model itself. This helps us better understand the roles and dynamics of the intermediate layers. We demonstrate how this can be used to develop a better intuition about models and to diagnose potential problems. We apply this technique to the popular models Inception v3 and Resnet-50. Among other things, we observe experimentally that the linear separability of features increases monotonically along the depth of the model.}, archivePrefix = {arXiv}, arxivId = {1610.01644}, author = {Alain, Guillaume and Bengio, Yoshua}, eprint = {1610.01644}, file = {:Users/shanest/Documents/Library/Alain, Bengio/Unknown/Alain, Bengio - 2017 - Understanding intermediate layers using linear classifier probes.pdf:pdf}, keywords = {method: diagnostic classifier}, title = {{Understanding intermediate layers using linear classifier probes}}, url = {http://arxiv.org/abs/1610.01644}, year = {2017} }
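The probe recipe itself is a few lines. A minimal sketch assuming PyTorch and scikit-learn, with a small MLP on the scikit-learn digits data standing in for Inception v3/ResNet-50 on images: train the network, freeze it, then fit an independent logistic-regression probe on each layer's activations.

import torch, torch.nn as nn
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_digits(return_X_y=True)
Xtr, Xte, ytr, yte = train_test_split(X, y, random_state=0)
Xtr_t = torch.tensor(Xtr, dtype=torch.float32)
ytr_t = torch.tensor(ytr)

layers = nn.ModuleList([nn.Linear(64, 64) for _ in range(3)])
head = nn.Linear(64, 10)
opt = torch.optim.Adam(list(layers.parameters()) + list(head.parameters()), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()

def forward_all(x):
    # return every layer's activations plus the final logits
    acts, h = [], x
    for layer in layers:
        h = torch.relu(layer(h))
        acts.append(h)
    return acts, head(h)

for _ in range(200):                      # brief training of the "black box"
    _, logits = forward_all(Xtr_t)
    loss = loss_fn(logits, ytr_t)
    opt.zero_grad(); loss.backward(); opt.step()

# probes: independent linear classifiers on each layer's frozen features
with torch.no_grad():
    tr_acts, _ = forward_all(Xtr_t)
    te_acts, _ = forward_all(torch.tensor(Xte, dtype=torch.float32))
for i, (a_tr, a_te) in enumerate(zip(tr_acts, te_acts)):
    probe = LogisticRegression(max_iter=2000).fit(a_tr.numpy(), ytr)
    print(f"layer {i}: probe accuracy = {probe.score(a_te.numpy(), yte):.3f}")

On this toy setup the probes often mirror the paper's observation that linear separability grows with depth, though that is not guaranteed for every seed.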
@inproceedings{Adi2017, abstract = {There is a lot of research interest in encoding variable length sentences into fixed length vectors, in a way that preserves the sentence meanings. Two common methods include representations based on averaging word vectors, and representations based on the hidden states of recurrent neural networks such as LSTMs. The sentence vectors are used as features for subsequent machine learning tasks or for pre-training in the context of deep learning. However, not much is known about the properties that are encoded in these sentence representations and about the language information they capture. We propose a framework that facilitates better understanding of the encoded representations. We define prediction tasks around isolated aspects of sentence structure (namely sentence length, word content, and word order), and score representations by the ability to train a classifier to solve each prediction task when using the representation as input. We demonstrate the potential contribution of the approach by analyzing different sentence representation mechanisms. The analysis sheds light on the relative strengths of different sentence embedding methods with respect to these low level prediction tasks, and on the effect of the encoded vector's dimensionality on the resulting representations.}, archivePrefix = {arXiv}, arxivId = {1608.04207}, author = {Adi, Yossi and Kermany, Einat and Belinkov, Yonatan and Lavi, Ofer and Goldberg, Yoav}, booktitle = {International Conference on Learning Representations}, eprint = {1608.04207}, file = {:Users/shanest/Documents/Library/Adi et al/International Conference on Learning Representations/Adi et al. - 2017 - Fine-grained Analysis of Sentence Embeddings Using Auxiliary Prediction Tasks.pdf:pdf}, keywords = {method: diagnostic classifier}, title = {{Fine-grained Analysis of Sentence Embeddings Using Auxiliary Prediction Tasks}}, url = {http://arxiv.org/abs/1608.04207}, year = {2017} }
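For instance, the sentence-length task needs nothing more than an averaging encoder over random word vectors. A self-contained sketch, assuming NumPy and scikit-learn; the corpus, the length binning, and the MLP probe are illustrative choices, not the paper's exact setup. (Averaged vectors leak length mainly through their norm, which is why a nonlinear probe is used here.)

import numpy as np
from sklearn.neural_network import MLPClassifier

rng = np.random.default_rng(0)
vocab = rng.normal(size=(1000, 50))          # random word embeddings

def embed(sent_ids):
    return vocab[sent_ids].mean(axis=0)      # the averaging "encoder"

# synthetic corpus: sentences of length 3-19; label = coarse length bin
sents = [rng.integers(0, 1000, size=rng.integers(3, 20)) for _ in range(4000)]
X = np.stack([embed(s) for s in sents])
y = np.array([len(s) // 5 for s in sents])

probe = MLPClassifier(hidden_layer_sizes=(64,), max_iter=500, random_state=0)
probe.fit(X[:3000], y[:3000])
print("length-probe accuracy:", probe.score(X[3000:], y[3000:]))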
@article{Linzen2016, abstract = {The success of long short-term memory (LSTM) neural networks in language processing is typically attributed to their ability to capture long-distance statistical regularities. Linguistic regularities are often sensitive to syntactic structure; can such dependencies be captured by LSTMs, which do not have explicit structural representations? We begin addressing this question using number agreement in English subject-verb dependencies. We probe the architecture's grammatical competence both using training objectives with an explicit grammatical target (number prediction, grammaticality judgments) and using language models. In the strongly supervised settings, the LSTM achieved very high overall accuracy (less than 1% errors), but errors increased when sequential and structural information conflicted. The frequency of such errors rose sharply in the language-modeling setting. We conclude that LSTMs can capture a non-trivial amount of grammatical structure given targeted supervision, but stronger architectures may be required to further reduce errors; furthermore, the language modeling signal is insufficient for capturing syntax-sensitive dependencies, and should be supplemented with more direct supervision if such dependencies need to be captured.}, archivePrefix = {arXiv}, arxivId = {1611.01368}, author = {Linzen, Tal and Dupoux, Emmanuel and Goldberg, Yoav}, eprint = {1611.01368}, file = {:Users/shanest/Documents/Library/Linzen, Dupoux, Goldberg/Transactions of the Association for Computational Linguistics/Linzen, Dupoux, Goldberg - 2016 - Assessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies.pdf:pdf}, journal = {Transactions of the Association for Computational Linguistics}, keywords = {method: psycholinguistic,phenomenon: number agreement}, pages = {521--535}, title = {{Assessing the Ability of LSTMs to Learn Syntax-Sensitive Dependencies}}, url = {http://arxiv.org/abs/1611.01368}, volume = {4}, year = {2016} }
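The supervised number-prediction objective is easy to mock up: predict the verb's number from the sentence prefix, including "attractor" cases where an intervening noun has the opposite number from the head. A toy sketch assuming PyTorch; the vocabulary and the single template are invented, and the task is far easier than the paper's corpus-based version.

import torch, torch.nn as nn
rng = torch.Generator().manual_seed(0)

sing = ["dog", "cat", "pilot"]
plur = ["dogs", "cats", "pilots"]
vocab = {w: i for i, w in enumerate(["the", "near"] + sing + plur)}

def make_example():
    head_plural = torch.randint(2, (1,), generator=rng).item()
    attr_plural = torch.randint(2, (1,), generator=rng).item()
    head = (plur if head_plural else sing)[torch.randint(3, (1,), generator=rng).item()]
    attr = (plur if attr_plural else sing)[torch.randint(3, (1,), generator=rng).item()]
    words = ["the", head, "near", "the", attr]   # prefix just before the verb
    return torch.tensor([vocab[w] for w in words]), head_plural

class NumberPredictor(nn.Module):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(len(vocab), 16)
        self.lstm = nn.LSTM(16, 32, batch_first=True)
        self.out = nn.Linear(32, 2)
    def forward(self, ids):
        h, _ = self.lstm(self.emb(ids))
        return self.out(h[:, -1])        # predict from the state at the verb site

model = NumberPredictor()
opt = torch.optim.Adam(model.parameters(), lr=1e-2)
for _ in range(500):
    ids, label = make_example()
    loss = nn.functional.cross_entropy(model(ids[None]), torch.tensor([label]))
    opt.zero_grad(); loss.backward(); opt.step()

correct = sum(model(make_example()[0][None]).argmax().item() == make_example()[1]
              for _ in range(0))  # placeholder removed below
correct = 0
for _ in range(200):
    ids, label = make_example()
    correct += (model(ids[None]).argmax().item() == label)
print("accuracy with attractors:", correct / 200)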
@inproceedings{Ettinger2016, abstract = {We propose a diagnostic method for probing specific information captured in vector representations of sentence meaning, via simple classification tasks with strategically constructed sentence sets. We identify some key types of semantic information that we might expect to be captured in sentence composition, and illustrate example classification tasks for targeting this information.}, address = {Stroudsburg, PA, USA}, author = {Ettinger, Allyson and Elgohary, Ahmed and Resnik, Philip}, booktitle = {Proceedings of the 1st Workshop on Evaluating Vector-Space Representations for NLP}, doi = {10.18653/v1/W16-2524}, file = {:Users/shanest/Documents/Library/Ettinger, Elgohary, Resnik/Proceedings of the 1st Workshop on Evaluating Vector-Space Representations for NLP/Ettinger, Elgohary, Resnik - 2016 - Probing for semantic evidence of composition by means of simple classification tasks.pdf:pdf}, keywords = {method: diagnostic classifier,phenomenon: compositionality}, pages = {134--139}, publisher = {Association for Computational Linguistics}, title = {{Probing for semantic evidence of composition by means of simple classification tasks}}, url = {http://aclweb.org/anthology/W16-2524}, year = {2016} }
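The central idea, choosing sentence sets so that surface statistics cannot answer the probe, can be seen in a few lines: role-reversed transitive sentences share a bag of words, so an averaging encoder cannot support an "is this noun the agent?" classifier. A sketch assuming NumPy and scikit-learn; the sentences and vectors are invented, and train accuracy hovers near chance by construction.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
nouns = ["dog", "cat", "boy", "girl", "bird", "fox"]
vecs = {w: rng.normal(size=32) for w in nouns + ["the", "chased"]}

def avg_embed(words):
    return np.mean([vecs[w] for w in words], axis=0)   # order-insensitive encoder

X, y = [], []
for a in nouns:
    for p in nouns:
        if a == p:
            continue
        sent = ["the", a, "chased", "the", p]
        # feature = sentence embedding + the queried noun's vector
        X.append(np.concatenate([avg_embed(sent), vecs[a]])); y.append(1)  # agent
        X.append(np.concatenate([avg_embed(sent), vecs[p]])); y.append(0)  # patient

# the role-reversed sentence yields identical features with flipped labels,
# so even train accuracy cannot rise much above 0.5
probe = LogisticRegression(max_iter=2000).fit(X, y)
print("agent-probe train accuracy with averaged embeddings:", probe.score(X, y))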
@unpublished{Jozefowicz2016, abstract = {In this work we explore recent advances in Recurrent Neural Networks for large scale Language Modeling, a task central to language understanding. We extend current models to deal with two key challenges present in this task: corpora and vocabulary sizes, and complex, long term structure of language. We perform an exhaustive study on techniques such as character Convolutional Neural Networks or Long-Short Term Memory, on the One Billion Word Benchmark. Our best single model significantly improves state-of-the-art perplexity from 51.3 down to 30.0 (whilst reducing the number of parameters by a factor of 20), while an ensemble of models sets a new record by improving perplexity from 41.0 down to 23.7. We also release these models for the NLP and ML community to study and improve upon.}, archivePrefix = {arXiv}, arxivId = {1602.02410}, author = {Jozefowicz, Rafal and Vinyals, Oriol and Schuster, Mike and Shazeer, Noam and Wu, Yonghui}, eprint = {1602.02410}, file = {:Users/shanest/Documents/Library/Jozefowicz et al/Unknown/Jozefowicz et al. - 2016 - Exploring the Limits of Language Modeling.pdf:pdf}, keywords = {model}, title = {{Exploring the Limits of Language Modeling}}, url = {https://arxiv.org/abs/1602.02410}, year = {2016} }
@article{Veldhoen2016, abstract = {We investigate how neural networks can be used for hierarchical, compositional semantics. To this end, we define the simple but nontrivial artificial task of processing nested arithmetic expressions and study whether different types of neural networks can learn to add and subtract. We find that recursive neural networks can implement a generalising solution, and we visualise the intermediate steps: projection, summation and squashing. We also show that gated recurrent neural networks, which process the expressions incrementally, perform surprisingly well on this task: they learn to predict the outcome of the arithmetic expressions with reasonable accuracy, although performance deteriorates with increasing length. To analyse what strategy the recurrent network applies, visualisation techniques are less insightful. Therefore, we develop an approach where we formulate and test hypotheses on what strategies these networks might be following. For each hypothesis, we derive predictions about features of the hidden state representations at each time step, and train 'diagnostic classifiers' to test those predictions. Our results indicate the networks follow a strategy similar to our hypothesised 'incremental strategy'.}, author = {Veldhoen, Sara and Hupkes, Dieuwke and Zuidema, Willem}, file = {:Users/shanest/Documents/Library/Veldhoen, Hupkes, Zuidema/CEUR Workshop Proceedings/Veldhoen, Hupkes, Zuidema - 2016 - Diagnostic classifiers Revealing how neural networks process hierarchical structure.pdf:pdf}, issn = {16130073}, journal = {CEUR Workshop Proceedings}, keywords = {method: diagnostic classifier}, title = {{Diagnostic classifiers: Revealing how neural networks process hierarchical structure}}, volume = {1773}, year = {2016} }
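The diagnostic-classifier pipeline (hypothesise a strategy, derive per-timestep target values, regress hidden states onto them) fits in a short script. A sketch assuming PyTorch and scikit-learn; toy sizes throughout, and the GRU is only briefly trained, so the probe's fit is illustrative rather than a replication.

import random
import torch, torch.nn as nn
from sklearn.linear_model import Ridge

random.seed(0); torch.manual_seed(0)
TOKENS = ["(", ")", "+", "-"] + [str(d) for d in range(10)]
tok2id = {t: i for i, t in enumerate(TOKENS)}

def gen_expr(depth=2):
    if depth == 0:
        return [str(random.randint(0, 9))]
    op = random.choice(["+", "-"])
    return ["("] + gen_expr(depth - 1) + [op] + gen_expr(depth - 1) + [")"]

def incremental_targets(tokens):
    # running value of the expression under a left-to-right strategy
    acc, sign, stack = 0.0, 1, [1]
    out = []
    for t in tokens:
        if t == "(":   stack.append(sign)
        elif t == ")": stack.pop()
        elif t == "+": sign = stack[-1]
        elif t == "-": sign = -stack[-1]
        else:          acc += sign * int(t)
        out.append(acc)
    return out

emb = nn.Embedding(len(TOKENS), 16)
gru = nn.GRU(16, 32, batch_first=True)
head = nn.Linear(32, 1)
opt = torch.optim.Adam([*emb.parameters(), *gru.parameters(), *head.parameters()], lr=1e-3)

def run(tokens):
    ids = torch.tensor([[tok2id[t] for t in tokens]])
    h, _ = gru(emb(ids))
    return h[0], head(h[0, -1])

for _ in range(300):                  # brief end-to-end training on the outcome
    toks = gen_expr()
    _, pred = run(toks)
    loss = (pred - incremental_targets(toks)[-1]) ** 2
    opt.zero_grad(); loss.mean().backward(); opt.step()

# diagnostic classifier: predict the hypothesised intermediate values
# from the hidden state at every time step
H, Y = [], []
with torch.no_grad():
    for _ in range(200):
        toks = gen_expr()
        states, _ = run(toks)
        H.append(states); Y.extend(incremental_targets(toks))
probe = Ridge().fit(torch.cat(H).numpy(), Y)
print("diagnostic R^2 for the incremental strategy:",
      probe.score(torch.cat(H).numpy(), Y))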
@unpublished{Karpathy2015, abstract = {Recurrent Neural Networks (RNNs), and specifically a variant with Long Short-Term Memory (LSTM), are enjoying renewed interest as a result of successful applications in a wide range of machine learning problems that involve sequential data. However, while LSTMs provide exceptional results in practice, the source of their performance and their limitations remain rather poorly understood. Using character-level language models as an interpretable testbed, we aim to bridge this gap by providing an analysis of their representations, predictions and error types. In particular, our experiments reveal the existence of interpretable cells that keep track of long-range dependencies such as line lengths, quotes and brackets. Moreover, our comparative analysis with finite horizon n-gram models traces the source of the LSTM improvements to long-range structural dependencies. Finally, we provide analysis of the remaining errors and suggest areas for further study.}, archivePrefix = {arXiv}, arxivId = {1506.02078}, author = {Karpathy, Andrej and Johnson, Justin and Fei-Fei, Li}, eprint = {1506.02078}, file = {:Users/shanest/Documents/Library/Karpathy, Johnson, Fei-Fei/Unknown/Karpathy, Johnson, Fei-Fei - 2015 - Visualizing and Understanding Recurrent Networks.pdf:pdf}, keywords = {method: individual neurons,method: visualization}, pages = {1--12}, title = {{Visualizing and Understanding Recurrent Networks}}, url = {http://arxiv.org/abs/1506.02078}, year = {2015} }
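The cell inspection itself needs no special tooling: run a character-level LSTM over a string and print one hidden unit's activation per character. A sketch assuming PyTorch; the weights below are untrained, so no cell will actually track quotes or brackets here; the snippet only shows the harness one would point at a trained model.

import torch, torch.nn as nn

text = 'He said "hello (world)" and left.'
chars = sorted(set(text))
ids = torch.tensor([[chars.index(c) for c in text]])

emb = nn.Embedding(len(chars), 8)
lstm = nn.LSTM(8, 16, batch_first=True)
with torch.no_grad():
    h, _ = lstm(emb(ids))            # h: (1, len(text), 16)

cell = 0                             # pick one hidden unit to inspect
for c, a in zip(text, h[0, :, cell]):
    # hidden outputs lie in [-1, 1]; render each as a crude bar
    bar = "#" * int(10 * (a.item() + 1) / 2)
    print(f"{c!r} {a.item():+.2f} {bar}")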