Dissecting Contextual Word Embeddings: Architecture and Representation. Peters, M., Neumann, M., Zettlemoyer, L., & Yih, W. In Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing, pages 1499–1509, Stroudsburg, PA, USA, 2018. Association for Computational Linguistics.

Abstract: Contextual word representations derived from pre-trained bidirectional language models (biLMs) have recently been shown to provide significant improvements to the state of the art for a wide range of NLP tasks. However, many questions remain as to how and why these models are so effective. In this paper, we present a detailed empirical study of how the choice of neural architecture (e.g. LSTM, CNN, or self attention) influences both end task accuracy and qualitative properties of the representations that are learned. We show there is a tradeoff between speed and accuracy, but all architectures learn high quality contextual representations that outperform word embeddings for four challenging NLP tasks. Additionally, all architectures learn representations that vary with network depth, from exclusively morphological based at the word embedding layer through local syntax based in the lower contextual layers to longer range semantics such as coreference at the upper layers. Together, these results suggest that unsupervised biLMs, independent of architecture, are learning much more about the structure of language than previously appreciated.
@inproceedings{Peters2018,
abstract = {Contextual word representations derived from pre-trained bidirectional language models (biLMs) have recently been shown to provide significant improvements to the state of the art for a wide range of NLP tasks. However, many questions remain as to how and why these models are so effective. In this paper, we present a detailed empirical study of how the choice of neural architecture (e.g. LSTM, CNN, or self attention) influences both end task accuracy and qualitative properties of the representations that are learned. We show there is a tradeoff between speed and accuracy, but all architectures learn high quality contextual representations that outperform word embeddings for four challenging NLP tasks. Additionally, all architectures learn representations that vary with network depth, from exclusively morphological based at the word embedding layer through local syntax based in the lower contextual layers to longer range semantics such as coreference at the upper layers. Together, these results suggest that unsupervised biLMs, independent of architecture, are learning much more about the structure of language than previously appreciated.},
address = {Stroudsburg, PA, USA},
author = {Peters, Matthew and Neumann, Mark and Zettlemoyer, Luke and Yih, Wen-tau},
booktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},
doi = {10.18653/v1/D18-1179},
keywords = {method: diagnostic classifier,method: layer-wise analysis,method: model comparison},
pages = {1499--1509},
publisher = {Association for Computational Linguistics},
title = {{Dissecting Contextual Word Embeddings: Architecture and Representation}},
url = {https://aclweb.org/anthology/D18-1179},
year = {2018}
}
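The keywords above name the paper's analysis methods: layer-wise analysis using diagnostic classifiers, i.e. simple probes trained on frozen per-layer representations. Below is a minimal illustrative sketch of that idea in Python, not the authors' code: it assumes scikit-learn is available and uses random arrays as stand-ins for per-layer biLM activations and gold tags (the shapes, tag inventory, and data are placeholder assumptions).

# Illustrative sketch of a layer-wise diagnostic classifier (linear probe).
# Placeholder data only; in practice the features would be frozen activations
# from each layer of a pre-trained biLM, and the labels a real tagging task.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
n_tokens, dim, n_layers, n_tags = 2000, 128, 3, 10

# Placeholder: one feature matrix per biLM layer, plus synthetic tag labels.
layer_reprs = [rng.normal(size=(n_tokens, dim)) for _ in range(n_layers)]
tags = rng.integers(0, n_tags, size=n_tokens)

for layer_idx, feats in enumerate(layer_reprs):
    X_train, X_test, y_train, y_test = train_test_split(
        feats, tags, test_size=0.2, random_state=0
    )
    # Train a linear probe on this layer's frozen representations.
    probe = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    print(f"layer {layer_idx}: probe accuracy = {probe.score(X_test, y_test):.3f}")

Comparing probe accuracy across layers in this way is what supports the abstract's claim that lower contextual layers capture local syntax while upper layers capture longer-range semantics such as coreference.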
{"_id":"LxhqQxw5tBYezgyEb","bibbaseid":"peters-neumann-zettlemoyer-yih-dissectingcontextualwordembeddingsarchitectureandrepresentation-2018","authorIDs":[],"author_short":["Peters, M.","Neumann, M.","Zettlemoyer, L.","Yih, W."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"Contextual word representations derived from pre-trained bidirectional language models (biLMs) have recently been shown to provide significant improvements to the state of the art for a wide range of NLP tasks. However, many questions remain as to how and why these models are so effective. In this paper, we present a detailed empirical study of how the choice of neural architecture (e.g. LSTM, CNN, or self attention) influences both end task accuracy and qualitative properties of the representations that are learned. We show there is a tradeoff between speed and accuracy, but all architectures learn high quality contextual representations that outperform word embeddings for four challenging NLP tasks. Additionally, all architectures learn representations that vary with network depth, from exclusively morphological based at the word embedding layer through local syntax based in the lower contextual layers to longer range semantics such coreference at the upper layers. Together, these results suggest that unsupervised biLMs, independent of architecture, are learning much more about the structure of language than previously appreciated.","address":"Stroudsburg, PA, USA","author":[{"propositions":[],"lastnames":["Peters"],"firstnames":["Matthew"],"suffixes":[]},{"propositions":[],"lastnames":["Neumann"],"firstnames":["Mark"],"suffixes":[]},{"propositions":[],"lastnames":["Zettlemoyer"],"firstnames":["Luke"],"suffixes":[]},{"propositions":[],"lastnames":["Yih"],"firstnames":["Wen-tau"],"suffixes":[]}],"booktitle":"Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing","doi":"10.18653/v1/D18-1179","file":":Users/shanest/Documents/Library/Peters et al/Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing/Peters et al. - 2018 - Dissecting Contextual Word Embeddings Architecture and Representation.pdf:pdf","keywords":"method: diagnostic classifier,method: layer-wise analysis,method: model comparison","pages":"1499–1509","publisher":"Association for Computational Linguistics","title":"Dissecting Contextual Word Embeddings: Architecture and Representation","url":"https://aclweb.org/anthology/D18-1179","year":"2018","bibtex":"@inproceedings{Peters2019,\nabstract = {Contextual word representations derived from pre-trained bidirectional language models (biLMs) have recently been shown to provide significant improvements to the state of the art for a wide range of NLP tasks. However, many questions remain as to how and why these models are so effective. In this paper, we present a detailed empirical study of how the choice of neural architecture (e.g. LSTM, CNN, or self attention) influences both end task accuracy and qualitative properties of the representations that are learned. We show there is a tradeoff between speed and accuracy, but all architectures learn high quality contextual representations that outperform word embeddings for four challenging NLP tasks. Additionally, all architectures learn representations that vary with network depth, from exclusively morphological based at the word embedding layer through local syntax based in the lower contextual layers to longer range semantics such coreference at the upper layers. 
Together, these results suggest that unsupervised biLMs, independent of architecture, are learning much more about the structure of language than previously appreciated.},\naddress = {Stroudsburg, PA, USA},\nauthor = {Peters, Matthew and Neumann, Mark and Zettlemoyer, Luke and Yih, Wen-tau},\nbooktitle = {Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing},\ndoi = {10.18653/v1/D18-1179},\nfile = {:Users/shanest/Documents/Library/Peters et al/Proceedings of the 2018 Conference on Empirical Methods in Natural Language Processing/Peters et al. - 2018 - Dissecting Contextual Word Embeddings Architecture and Representation.pdf:pdf},\nkeywords = {method: diagnostic classifier,method: layer-wise analysis,method: model comparison},\npages = {1499--1509},\npublisher = {Association for Computational Linguistics},\ntitle = {{Dissecting Contextual Word Embeddings: Architecture and Representation}},\nurl = {https://aclweb.org/anthology/D18-1179},\nyear = {2018}\n}\n","author_short":["Peters, M.","Neumann, M.","Zettlemoyer, L.","Yih, W."],"key":"Peters2019","id":"Peters2019","bibbaseid":"peters-neumann-zettlemoyer-yih-dissectingcontextualwordembeddingsarchitectureandrepresentation-2018","role":"author","urls":{"Paper":"https://aclweb.org/anthology/D18-1179"},"keyword":["method: diagnostic classifier","method: layer-wise analysis","method: model comparison"],"metadata":{"authorlinks":{}},"downloads":0},"bibtype":"inproceedings","biburl":"https://www.shane.st/teaching/575/win20/MachineLearning-interpretability.bib","creationDate":"2020-01-05T04:04:02.869Z","downloads":0,"keywords":["method: diagnostic classifier","method: layer-wise analysis","method: model comparison"],"search_terms":["dissecting","contextual","word","embeddings","architecture","representation","peters","neumann","zettlemoyer","yih"],"title":"Dissecting Contextual Word Embeddings: Architecture and Representation","year":2018,"dataSources":["okYcdTpf4JJ2zkj7A","FoqTuDx9Cxduvwxiv","znj7izS5PeehdLR3G"]}