Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding. Liu, X., He, P., Chen, W., & Gao, J. April 2019. arXiv: https://arxiv.org/abs/1904.09482v1

Abstract: This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to distill knowledge from these ensemble teachers. We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7% (a 1.5% absolute improvement, based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.
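For reference, a minimal sketch of the ensemble-distillation objective the abstract describes, assuming PyTorch. The temperature T, the loss weight alpha, and the function names are illustrative choices, not the authors' implementation (their released code is at https://github.com/namisan/mt-dnn).

import torch
import torch.nn.functional as F

def ensemble_soft_targets(teacher_logits, T=2.0):
    # Average the temperature-scaled class probabilities of the teacher ensemble.
    # teacher_logits: list of tensors, each of shape (batch, num_classes).
    probs = [F.softmax(logits / T, dim=-1) for logits in teacher_logits]
    return torch.stack(probs, dim=0).mean(dim=0)  # (batch, num_classes)

def distillation_loss(student_logits, teacher_logits, hard_labels, T=2.0, alpha=0.5):
    # Soft cross-entropy against the averaged teacher targets, mixed with the
    # usual hard-label cross-entropy for the same task. The 50/50 mix is an
    # illustrative assumption, not a value taken from the paper.
    soft_targets = ensemble_soft_targets(teacher_logits, T)
    log_probs = F.log_softmax(student_logits / T, dim=-1)
    soft_loss = -(soft_targets * log_probs).sum(dim=-1).mean()
    hard_loss = F.cross_entropy(student_logits, hard_labels)
    return alpha * soft_loss + (1.0 - alpha) * hard_loss

In the paper's multi-task setup, tasks that have no teacher ensemble simply keep the standard hard-label loss during the student's multi-task training.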
@article{liu2019improving,
title = {Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding},
type = {article},
year = {2019},
websites = {https://arxiv.org/abs/1904.09482v1},
month = {4},
day = {20},
abstract = {This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to \emph{distill} knowledge from these ensemble teachers. We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7\% (1.5\% absolute improvement\footnote{Based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019.}). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.},
author = {Liu, Xiaodong and He, Pengcheng and Chen, Weizhu and Gao, Jianfeng}
}
{"_id":"xWuWH42pFyboMR6gc","bibbaseid":"liu-he-chen-gao-improvingmultitaskdeepneuralnetworksviaknowledgedistillationfornaturallanguageunderstanding-2019","author_short":["Liu, X.","He, P.","Chen, W.","Gao, J."],"bibdata":{"title":"Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding","type":"article","year":"2019","websites":"https://arxiv.org/abs/1904.09482v1","month":"4","day":"20","id":"60b4963c-2a8d-3d8b-886e-e4485fe67b26","created":"2023-12-13T07:40:06.816Z","accessed":"2023-12-13","file_attached":"true","profile_id":"f1f70cad-e32d-3de2-a3c0-be1736cb88be","group_id":"5ec9cc91-a5d6-3de5-82f3-3ef3d98a89c1","last_modified":"2023-12-13T07:40:09.027Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"folder_uuids":"d25a2be2-b54f-400b-918b-b254e8044e39","private_publication":false,"abstract":"This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to \\emphdistill knowledge from these ensemble teachers. We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7\\% (1.5\\% absolute improvement\\footnote Based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019.). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.","bibtype":"article","author":"Liu, Xiaodong and He, Pengcheng and Chen, Weizhu and Gao, Jianfeng","bibtex":"@article{\n title = {Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding},\n type = {article},\n year = {2019},\n websites = {https://arxiv.org/abs/1904.09482v1},\n month = {4},\n day = {20},\n id = {60b4963c-2a8d-3d8b-886e-e4485fe67b26},\n created = {2023-12-13T07:40:06.816Z},\n accessed = {2023-12-13},\n file_attached = {true},\n profile_id = {f1f70cad-e32d-3de2-a3c0-be1736cb88be},\n group_id = {5ec9cc91-a5d6-3de5-82f3-3ef3d98a89c1},\n last_modified = {2023-12-13T07:40:09.027Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n folder_uuids = {d25a2be2-b54f-400b-918b-b254e8044e39},\n private_publication = {false},\n abstract = {This paper explores the use of knowledge distillation to improve a Multi-Task Deep Neural Network (MT-DNN) (Liu et al., 2019) for learning text representations across multiple natural language understanding tasks. Although ensemble learning can improve model performance, serving an ensemble of large DNNs such as MT-DNN can be prohibitively expensive. Here we apply the knowledge distillation method (Hinton et al., 2015) in the multi-task learning setting. For each task, we train an ensemble of different MT-DNNs (teacher) that outperforms any single model, and then train a single MT-DNN (student) via multi-task learning to \\emphdistill knowledge from these ensemble teachers. 
We show that the distilled MT-DNN significantly outperforms the original MT-DNN on 7 out of 9 GLUE tasks, pushing the GLUE benchmark (single model) to 83.7\\% (1.5\\% absolute improvement\\footnote Based on the GLUE leaderboard at https://gluebenchmark.com/leaderboard as of April 1, 2019.). The code and pre-trained models will be made publicly available at https://github.com/namisan/mt-dnn.},\n bibtype = {article},\n author = {Liu, Xiaodong and He, Pengcheng and Chen, Weizhu and Gao, Jianfeng}\n}","author_short":["Liu, X.","He, P.","Chen, W.","Gao, J."],"urls":{"Paper":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/4096fa6c-ffd3-e7bb-5b3c-731603d9f3e2/full_text.pdf.pdf","Website":"https://arxiv.org/abs/1904.09482v1"},"biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","bibbaseid":"liu-he-chen-gao-improvingmultitaskdeepneuralnetworksviaknowledgedistillationfornaturallanguageunderstanding-2019","role":"author","metadata":{"authorlinks":{}}},"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","dataSources":["7bvyz9GrWb3E3F8y3","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"],"keywords":[],"search_terms":["improving","multi","task","deep","neural","networks","via","knowledge","distillation","natural","language","understanding","liu","he","chen","gao"],"title":"Improving Multi-Task Deep Neural Networks via Knowledge Distillation for Natural Language Understanding","year":2019}