The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset. Laurençon, H., Saulnier, L., Wang, T., Akiki, C., Villanova del Moral, A., Le Scao, T., von Werra, L., Mou, C., González Ponferrada, E., Nguyen, H., Frohberg, J., Sasko, M., Lhoest, Q., McMillan-Major, A., Dupont, G., Biderman, S., Rogers, A., Ben Allal, L., De Toni, F., Pistilli, G., Nguyen, O., Nikpoor, S., Masoud, M., Colombo, P., de la Rosa, J., Villegas, P., Thrush, T., Longpre, S., Nagel, S., Weber, L., Muñoz, M., Zhu, J., van Strien, D., Alyafeai, Z., Almubarak, K., Vu, M. C., Gonzalez-Dios, I., Soroa, A., Lo, K., Dey, M., Ortiz Suarez, P., Gokaslan, A., Bose, S., Adelani, D. I., Phan, L., Tran, H., Yu, I., Pai, S., Chim, J., Lepercq, V., Ilic, S., Mitchell, M., Luccioni, S., & Jernite, Y. CoRR, abs/2303.03915, 2023.
Paper doi bibtex
@comment{
  arXiv preprint 2303.03915, record taken from the dblp bibliography
  (see the bibsource and biburl fields).
  Author names are written surname-first so that multi-word surnames
  (Le Scao, Ben Allal, De Toni, Gonzalez Ponferrada, Ortiz Suarez) are
  parsed as a single surname; Villanova del Moral is additionally braced
  because the lowercase particle would otherwise be read as a von part.
  Braced words in the title keep their capitalisation under styles that
  apply sentence casing.
}
@article{DBLP:journals/corr/abs-2303-03915,
  author     = {Lauren{\c{c}}on, Hugo and
                Saulnier, Lucile and
                Wang, Thomas and
                Akiki, Christopher and
                {Villanova del Moral}, Albert and
                Le Scao, Teven and
                von Werra, Leandro and
                Mou, Chenghao and
                Gonz{\'{a}}lez Ponferrada, Eduardo and
                Nguyen, Huu and
                Frohberg, J{\"{o}}rg and
                Sasko, Mario and
                Lhoest, Quentin and
                McMillan{-}Major, Angelina and
                Dupont, G{\'{e}}rard and
                Biderman, Stella and
                Rogers, Anna and
                Ben Allal, Loubna and
                De Toni, Francesco and
                Pistilli, Giada and
                Nguyen, Olivier and
                Nikpoor, Somaieh and
                Masoud, Maraim and
                Colombo, Pierre and
                de la Rosa, Javier and
                Villegas, Paulo and
                Thrush, Tristan and
                Longpre, Shayne and
                Nagel, Sebastian and
                Weber, Leon and
                Mu{\~{n}}oz, Manuel and
                Zhu, Jian and
                van Strien, Daniel and
                Alyafeai, Zaid and
                Almubarak, Khalid and
                Vu, Minh Chien and
                Gonzalez{-}Dios, Itziar and
                Soroa, Aitor and
                Lo, Kyle and
                Dey, Manan and
                Ortiz Suarez, Pedro and
                Gokaslan, Aaron and
                Bose, Shamik and
                Adelani, David Ifeoluwa and
                Phan, Long and
                Tran, Hieu and
                Yu, Ian and
                Pai, Suhas and
                Chim, Jenny and
                Lepercq, Violette and
                Ilic, Suzana and
                Mitchell, Margaret and
                Luccioni, Sasha and
                Jernite, Yacine},
  title      = {The {BigScience} {ROOTS} Corpus: {A} {1.6TB} Composite Multilingual Dataset},
  journal    = {CoRR},
  volume     = {abs/2303.03915},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2303.03915},
  doi        = {10.48550/arXiv.2303.03915},
  eprinttype = {arXiv},
  eprint     = {2303.03915},
  timestamp  = {Sat, 13 Jul 2024 01:00:00 +0200},
  biburl     = {https://dblp.org/rec/journals/corr/abs-2303-03915.bib},
  bibsource  = {dblp computer science bibliography, https://dblp.org}
}