% BERT preprint on arXiv (Devlin, Chang, Lee, Toutanova; 2018).
% Uses biblatex field names (date, urldate, eprinttype, eprintclass); only the
% acronym in the title is brace-protected so styles remain free to recase the rest.
@article{devlinBERTPretrainingDeep2018,
  author      = {Devlin, Jacob and Chang, Ming-Wei and Lee, Kenton and Toutanova, Kristina},
  title       = {{BERT}: Pre-Training of Deep Bidirectional Transformers for Language Understanding},
  shorttitle  = {{BERT}},
  date        = {2018-10-10},
  eprinttype  = {arXiv},
  eprint      = {1810.04805},
  eprintclass = {cs.CL},
  url         = {https://arxiv.org/abs/1810.04805},
  urldate     = {2019-02-21},
  keywords    = {Computer Science - Computation and Language},
  abstract    = {We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models, BERT is designed to pre-train deep bidirectional representations by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT representations can be fine-tuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial task-specific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE benchmark to 80.4\% (7.6\% absolute improvement), MultiNLI accuracy to 86.7 (5.6\% absolute improvement) and the SQuAD v1.1 question answering Test F1 to 93.2 (1.5\% absolute improvement), outperforming human performance by 2.0\%.},
  file        = {/home/dimitri/Nextcloud/Zotero/storage/3M8VB4UN/Devlin et al. - 2018 - BERT Pre-training of Deep Bidirectional Transform.pdf;/home/dimitri/Nextcloud/Zotero/storage/3P9Z7WWU/1810.html},
}