Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding. Liu, X., He, P., Chen, W., & Gao, J. arXiv:1904.09482, April 2019.
This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to distill knowledge from these ensemble teachers. We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7% (a 1.5% absolute improvement, based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.
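
The distillation recipe described in the abstract (per-task teacher ensembles whose averaged class probabilities serve as soft targets for a single multi-task student) can be sketched roughly as below. This is a minimal illustration in PyTorch assuming a Hinton-style mix of soft and hard cross-entropy; the function names, the weighting alpha, and the temperature T are illustrative assumptions, not the authors' MT-DNN training code.

# Sketch of ensemble knowledge distillation in the style of Hinton et al. (2015):
# teacher soft targets are the averaged class probabilities of an ensemble,
# and the student is trained on a weighted mix of soft-target and hard-label losses.
# Names, alpha, and T are illustrative assumptions, not the paper's exact setup.
import torch
import torch.nn.functional as F


def ensemble_soft_targets(teacher_logits: list) -> torch.Tensor:
    """Average the class probabilities of several teacher models for one task."""
    probs = [F.softmax(logits, dim=-1) for logits in teacher_logits]
    return torch.stack(probs, dim=0).mean(dim=0)


def distillation_loss(student_logits: torch.Tensor,
                      soft_targets: torch.Tensor,
                      hard_labels: torch.Tensor,
                      alpha: float = 0.5,
                      T: float = 1.0) -> torch.Tensor:
    """Weighted sum of soft cross-entropy (vs. teacher) and hard cross-entropy (vs. labels)."""
    log_p_student = F.log_softmax(student_logits / T, dim=-1)
    soft_loss = -(soft_targets * log_p_student).sum(dim=-1).mean()
    hard_loss = F.cross_entropy(student_logits, hard_labels)
    return alpha * soft_loss + (1.0 - alpha) * hard_loss


if __name__ == "__main__":
    batch, num_classes, num_teachers = 8, 3, 4
    teacher_logits = [torch.randn(batch, num_classes) for _ in range(num_teachers)]
    student_logits = torch.randn(batch, num_classes, requires_grad=True)
    labels = torch.randint(0, num_classes, (batch,))

    soft = ensemble_soft_targets(teacher_logits)
    loss = distillation_loss(student_logits, soft, labels)
    loss.backward()  # gradients flow only to the student logits
    print(f"distillation loss: {loss.item():.4f}")

In the multi-task setting, a loss of this form would be computed per task (using that task's teacher ensemble) and summed into the student's multi-task objective.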
@article{liu2019improving,
 title = {Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding},
 type = {article},
 year = {2019},
 websites = {https://arxiv.org/abs/1904.09482v1},
 month = {4},
 day = {20},
 abstract = {This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to \emph{distill} knowledge from these ensemble teachers. We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7\% (1.5\% absolute improvement\footnote{Based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019.}). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.},
 bibtype = {article},
 author = {Liu, Xiaodong and He, Pengcheng and Chen, Weizhu and Gao, Jianfeng}
}
