Towards Robust Speech Representation Learning for Thousands of Languages. Chen, W., Zhang, W., Peng, Y., Li, X., Tian, J., Shi, J., Chang, X., Maiti, S., Livescu, K., & Watanabe, S. In Al-Onaizan, Y., Bansal, M., & Chen, Y., editors, Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing, pages 10205–10224, Miami, Florida, USA, November, 2024. Association for Computational Linguistics.
Towards Robust Speech Representation Learning for Thousands of Languages [link]Paper  doi  abstract   bibtex   
Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8% and 4.4% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.
@inproceedings{chen_towards_2024,
	author    = {Chen, William and Zhang, Wangyou and Peng, Yifan and Li, Xinjian and Tian, Jinchuan and Shi, Jiatong and Chang, Xuankai and Maiti, Soumi and Livescu, Karen and Watanabe, Shinji},
	editor    = {Al-Onaizan, Yaser and Bansal, Mohit and Chen, Yun-Nung},
	title     = {Towards Robust Speech Representation Learning for Thousands of Languages},
	booktitle = {Proceedings of the 2024 Conference on Empirical Methods in Natural Language Processing},
	publisher = {Association for Computational Linguistics},
	address   = {Miami, Florida, USA},
	month     = nov,
	year      = {2024},
	pages     = {10205--10224},
	doi       = {10.18653/v1/2024.emnlp-main.570},
	url       = {https://aclanthology.org/2024.emnlp-main.570/},
	urldate   = {2025-01-26},
	abstract  = {Self-supervised learning (SSL) has helped extend speech technologies to more languages by reducing the need for labeled data. However, models are still far from supporting the world's 7000+ languages. We propose XEUS, a Cross-lingual Encoder for Universal Speech, trained on over 1 million hours of data across 4057 languages, extending the language coverage of SSL models 4-fold. We combine 1 million hours of speech from existing publicly accessible corpora with a newly created corpus of 7400+ hours from 4057 languages, which will be publicly released. To handle the diverse conditions of multilingual speech data, we augment the typical SSL masked prediction approach with a novel dereverberation objective, increasing robustness. We evaluate XEUS on several benchmarks, and show that it consistently outperforms or achieves comparable results to state-of-the-art (SOTA) SSL models across a variety of tasks. XEUS sets a new SOTA on the ML-SUPERB benchmark: it outperforms MMS 1B and w2v-BERT 2.0 v2 by 0.8\% and 4.4\% respectively, despite having less parameters or pre-training data. Checkpoints, code, and data are found in https://www.wavlab.org/activities/2024/xeus/.},
}

Downloads: 0