Towards Robust Speech Representation Learning for Thousands of Languages. Chen, W., Zhang, W., Peng, Y., Li, X., Tian, J., Shi, J., Chang, X., Maiti, S., Livescu, K., & Watanabe, S. In Al-Onaizan, Y., Bansal, M., & Chen, Y., editors, Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 10205–10224, Miami, Florida, USA, November, 2024. Association for Computational Linguistics.
Paper doi abstract bibtex Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.
@inproceedings{chen_towards_2024,
  address   = {Miami, Florida, USA},
  title     = {Towards Robust Speech Representation Learning for Thousands of Languages},
  url       = {https://aclanthology.org/2024.emnlp-main.570/},
  doi       = {10.18653/v1/2024.emnlp-main.570},
  abstract  = {Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8\% and 4.4\% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.},
  urldate   = {2025-01-26},
  booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
  publisher = {Association for Computational Linguistics},
  author    = {Chen, William and Zhang, Wangyou and Peng, Yifan and Li, Xinjian and Tian, Jinchuan and Shi, Jiatong and Chang, Xuankai and Maiti, Soumi and Livescu, Karen and Watanabe, Shinji},
  editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
  month     = nov,
  year      = {2024},
  pages     = {10205--10224},
}
Downloads: 0
{"_id":"uCXzqxapSHk5xa5ta","bibbaseid":"chen-zhang-peng-li-tian-shi-chang-maiti-etal-towardsrobustspeechrepresentationlearningforthousandsoflanguages-2024","author_short":["Chen, W.","Zhang, W.","Peng, Y.","Li, X.","Tian, J.","Shi, J.","Chang, X.","Maiti, S.","Livescu, K.","Watanabe, S."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","address":"Miami, Florida, USA","title":"Towards Robust Speech Representation Learning for Thousands of Languages","url":"https://aclanthology.org/2024.emnlp-main.570/","doi":"10.18653/v1/2024.emnlp-main.570","abstract":"Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world`s 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or pre-training data. 
Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.","urldate":"2025-01-26","booktitle":"Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing","publisher":"Association for Computational Linguistics","author":[{"propositions":[],"lastnames":["Chen"],"firstnames":["William"],"suffixes":[]},{"propositions":[],"lastnames":["Zhang"],"firstnames":["Wangyou"],"suffixes":[]},{"propositions":[],"lastnames":["Peng"],"firstnames":["Yifan"],"suffixes":[]},{"propositions":[],"lastnames":["Li"],"firstnames":["Xinjian"],"suffixes":[]},{"propositions":[],"lastnames":["Tian"],"firstnames":["Jinchuan"],"suffixes":[]},{"propositions":[],"lastnames":["Shi"],"firstnames":["Jiatong"],"suffixes":[]},{"propositions":[],"lastnames":["Chang"],"firstnames":["Xuankai"],"suffixes":[]},{"propositions":[],"lastnames":["Maiti"],"firstnames":["Soumi"],"suffixes":[]},{"propositions":[],"lastnames":["Livescu"],"firstnames":["Karen"],"suffixes":[]},{"propositions":[],"lastnames":["Watanabe"],"firstnames":["Shinji"],"suffixes":[]}],"editor":[{"propositions":[],"lastnames":["Al-Onaizan"],"firstnames":["Yaser"],"suffixes":[]},{"propositions":[],"lastnames":["Bansal"],"firstnames":["Mohit"],"suffixes":[]},{"propositions":[],"lastnames":["Chen"],"firstnames":["Yun-Nung"],"suffixes":[]}],"month":"November","year":"2024","pages":"10205–10224","bibtex":"@inproceedings{chen_towards_2024,\n\taddress = {Miami, Florida, USA},\n\ttitle = {Towards {Robust} {Speech} {Representation} {Learning} for {Thousands} of {Languages}},\n\turl = {https://aclanthology.org/2024.emnlp-main.570/},\n\tdoi = {10.18653/v1/2024.emnlp-main.570},\n\tabstract = {Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world`s 7000+ languages. 
We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8\\% and 4.4\\% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.},\n\turldate = {2025-01-26},\n\tbooktitle = {Proceedings of the 2024 {Conference} on {Empirical} {Methods} in {Natural} {Language} {Processing}},\n\tpublisher = {Association for Computational Linguistics},\n\tauthor = {Chen, William and Zhang, Wangyou and Peng, Yifan and Li, Xinjian and Tian, Jinchuan and Shi, Jiatong and Chang, Xuankai and Maiti, Soumi and Livescu, Karen and Watanabe, Shinji},\n\teditor = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},\n\tmonth = nov,\n\tyear = {2024},\n\tpages = {10205--10224},\n}\n\n\n\n","author_short":["Chen, W.","Zhang, W.","Peng, Y.","Li, X.","Tian, J.","Shi, J.","Chang, X.","Maiti, S.","Livescu, K.","Watanabe, S."],"editor_short":["Al-Onaizan, Y.","Bansal, M.","Chen, 
Y."],"key":"chen_towards_2024","id":"chen_towards_2024","bibbaseid":"chen-zhang-peng-li-tian-shi-chang-maiti-etal-towardsrobustspeechrepresentationlearningforthousandsoflanguages-2024","role":"author","urls":{"Paper":"https://aclanthology.org/2024.emnlp-main.570/"},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/zotero/abhishek-p","dataSources":["h7kKWXpJh2iaX92T5"],"keywords":[],"search_terms":["towards","robust","speech","representation","learning","thousands","languages","chen","zhang","peng","li","tian","shi","chang","maiti","livescu","watanabe"],"title":"Towards Robust Speech Representation Learning for Thousands of Languages","year":2024}