The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset. Laurençon, H., Saulnier, L., Wang, T., Akiki, C., del Moral, A. V., Scao, T. L., von Werra, L., Mou, C., Ponferrada, E. G., Nguyen, H., Frohberg, J., Sasko, M., Lhoest, Q., McMillan-Major, A., Dupont, G., Biderman, S., Rogers, A., Allal, L. B., Toni, F. D., Pistilli, G., Nguyen, O., Nikpoor, S., Masoud, M., Colombo, P., de la Rosa, J., Villegas, P., Thrush, T., Longpre, S., Nagel, S., Weber, L., Muñoz, M., Zhu, J., van Strien, D., Alyafeai, Z., Almubarak, K., Vu, M. C., Gonzalez-Dios, I., Soroa, A., Lo, K., Dey, M., Suarez, P. O., Gokaslan, A., Bose, S., Adelani, D. I., Phan, L., Tran, H., Yu, I., Pai, S., Chim, J., Lepercq, V., Ilic, S., Mitchell, M., Luccioni, A. S., & Jernite, Y. In Koyejo, S., Mohamed, S., Agarwal, A., Belgrave, D., Cho, K., & Oh, A., editors, Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022, 2022.
Paper bibtex @inproceedings{DBLP:conf/nips/LaurenconSWAMSW22,
author = {Hugo Lauren{\c{c}}on and
Lucile Saulnier and
Thomas Wang and
Christopher Akiki and
Albert Villanova del Moral and
Teven Le Scao and
Leandro von Werra and
Chenghao Mou and
Eduardo Gonz{\'{a}}lez Ponferrada and
Huu Nguyen and
J{\"{o}}rg Frohberg and
Mario Sasko and
Quentin Lhoest and
Angelina McMillan{-}Major and
G{\'{e}}rard Dupont and
Stella Biderman and
Anna Rogers and
Loubna Ben Allal and
Francesco De Toni and
Giada Pistilli and
Olivier Nguyen and
Somaieh Nikpoor and
Maraim Masoud and
Pierre Colombo and
Javier de la Rosa and
Paulo Villegas and
Tristan Thrush and
Shayne Longpre and
Sebastian Nagel and
Leon Weber and
Manuel Mu{\~{n}}oz and
Jian Zhu and
Daniel van Strien and
Zaid Alyafeai and
Khalid Almubarak and
Minh Chien Vu and
Itziar Gonzalez{-}Dios and
Aitor Soroa and
Kyle Lo and
Manan Dey and
Pedro Ortiz Suarez and
Aaron Gokaslan and
Shamik Bose and
David Ifeoluwa Adelani and
Long Phan and
Hieu Tran and
Ian Yu and
Suhas Pai and
Jenny Chim and
Violette Lepercq and
Suzana Ilic and
Margaret Mitchell and
Alexandra Sasha Luccioni and
Yacine Jernite},
editor = {Sanmi Koyejo and
S. Mohamed and
A. Agarwal and
Danielle Belgrave and
K. Cho and
A. Oh},
title = {The BigScience {ROOTS} Corpus: {A} 1.6TB Composite Multilingual Dataset},
booktitle = {Advances in Neural Information Processing Systems 35: Annual Conference
on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,
LA, USA, November 28 - December 9, 2022},
year = {2022},
url = {http://papers.nips.cc/paper\_files/paper/2022/hash/ce9e92e3de2372a4b93353eb7f3dc0bd-Abstract-Datasets\_and\_Benchmarks.html},
timestamp = {Sat, 13 Jul 2024 01:00:00 +0200},
biburl = {https://dblp.org/rec/conf/nips/LaurenconSWAMSW22.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Downloads: 0
{"_id":"ZuZZtpqcmdTYWsKe4","bibbaseid":"laurenon-saulnier-wang-akiki-delmoral-scao-vonwerra-mou-etal-thebigsciencerootscorpusa16tbcompositemultilingualdataset-2022","author_short":["Laurençon, H.","Saulnier, L.","Wang, T.","Akiki, C.","del Moral, A. V.","Scao, T. L.","von Werra, L.","Mou, C.","Ponferrada, E. G.","Nguyen, H.","Frohberg, J.","Sasko, M.","Lhoest, Q.","McMillan-Major, A.","Dupont, G.","Biderman, S.","Rogers, A.","Allal, L. B.","Toni, F. D.","Pistilli, G.","Nguyen, O.","Nikpoor, S.","Masoud, M.","Colombo, P.","de la Rosa, J.","Villegas, P.","Thrush, T.","Longpre, S.","Nagel, S.","Weber, L.","Muñoz, M.","Zhu, J.","van Strien, D.","Alyafeai, Z.","Almubarak, K.","Vu, M. C.","Gonzalez-Dios, I.","Soroa, A.","Lo, K.","Dey, M.","Suarez, P. O.","Gokaslan, A.","Bose, S.","Adelani, D. I.","Phan, L.","Tran, H.","Yu, I.","Pai, S.","Chim, J.","Lepercq, V.","Ilic, S.","Mitchell, M.","Luccioni, A. S.","Jernite, Y."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"firstnames":["Hugo"],"propositions":[],"lastnames":["Laurençon"],"suffixes":[]},{"firstnames":["Lucile"],"propositions":[],"lastnames":["Saulnier"],"suffixes":[]},{"firstnames":["Thomas"],"propositions":[],"lastnames":["Wang"],"suffixes":[]},{"firstnames":["Christopher"],"propositions":[],"lastnames":["Akiki"],"suffixes":[]},{"firstnames":["Albert","Villanova"],"propositions":["del"],"lastnames":["Moral"],"suffixes":[]},{"firstnames":["Teven","Le"],"propositions":[],"lastnames":["Scao"],"suffixes":[]},{"firstnames":["Leandro"],"propositions":["von"],"lastnames":["Werra"],"suffixes":[]},{"firstnames":["Chenghao"],"propositions":[],"lastnames":["Mou"],"suffixes":[]},{"firstnames":["Eduardo","González"],"propositions":[],"lastnames":["Ponferrada"],"suffixes":[]},{"firstnames":["Huu"],"propositions":[],"lastnames":["Nguyen"],"suffixes":[]},{"firstnames":["Jörg"],"propositions":[],"lastnames":["Frohberg"],"suffixes":[]},{"firstnames":["Mario"],"propositions":[],"lastnames":["Sasko"],"suffixes":[]},{"firstnames":["Quentin"],"propositions":[],"lastnames":["Lhoest"],"suffixes":[]},{"firstnames":["Angelina"],"propositions":[],"lastnames":["McMillan-Major"],"suffixes":[]},{"firstnames":["Gérard"],"propositions":[],"lastnames":["Dupont"],"suffixes":[]},{"firstnames":["Stella"],"propositions":[],"lastnames":["Biderman"],"suffixes":[]},{"firstnames":["Anna"],"propositions":[],"lastnames":["Rogers"],"suffixes":[]},{"firstnames":["Loubna","Ben"],"propositions":[],"lastnames":["Allal"],"suffixes":[]},{"firstnames":["Francesco","De"],"propositions":[],"lastnames":["Toni"],"suffixes":[]},{"firstnames":["Giada"],"propositions":[],"lastnames":["Pistilli"],"suffixes":[]},{"firstnames":["Olivier"],"propositions":[],"lastnames":["Nguyen"],"suffixes":[]},{"firstnames":["Somaieh"],"propositions":[],"lastnames":["Nikpoor"],"suffixes":[]},{"firstnames":["Maraim"],"propositions":[],"lastnames":["Masoud"],"suffixes":[]},{"firstnames":["Pierre"],"propositions":[],"lastnames":["Colombo"],"suffixes":[]},{"firstnames":["Javier"],"propositions":["de","la"],"lastnames":["Rosa"],"suffixes":[]},{"firstnames":["Paulo"],"propositions":[],"lastnames":["Villegas"],"suffixes":[]},{"firstnames":["Tristan"],"propositions":[],"lastnames":["Thrush"],"suffixes":[]},{"firstnames":["Shayne"],"propositions":[],"lastnames":["Longpre"],"suffixes":[]},{"firstnames":["Sebastian"],"propositions":[],"lastnames":["Nagel"],"suffixes":[]},{"firstnames":["Leon"],"propositions":[],"lastnames":["Weber"],"suffixes":[]},{"firstnames":["Manuel"],"propositions":[],"lastnames":["Muñoz"],"suffixes":[]},{"firstnames":["Jian"],"propositions":[],"lastnames":["Zhu"],"suffixes":[]},{"firstnames":["Daniel"],"propositions":["van"],"lastnames":["Strien"],"suffixes":[]},{"firstnames":["Zaid"],"propositions":[],"lastnames":["Alyafeai"],"suffixes":[]},{"firstnames":["Khalid"],"propositions":[],"lastnames":["Almubarak"],"suffixes":[]},{"firstnames":["Minh","Chien"],"propositions":[],"lastnames":["Vu"],"suffixes":[]},{"firstnames":["Itziar"],"propositions":[],"lastnames":["Gonzalez-Dios"],"suffixes":[]},{"firstnames":["Aitor"],"propositions":[],"lastnames":["Soroa"],"suffixes":[]},{"firstnames":["Kyle"],"propositions":[],"lastnames":["Lo"],"suffixes":[]},{"firstnames":["Manan"],"propositions":[],"lastnames":["Dey"],"suffixes":[]},{"firstnames":["Pedro","Ortiz"],"propositions":[],"lastnames":["Suarez"],"suffixes":[]},{"firstnames":["Aaron"],"propositions":[],"lastnames":["Gokaslan"],"suffixes":[]},{"firstnames":["Shamik"],"propositions":[],"lastnames":["Bose"],"suffixes":[]},{"firstnames":["David","Ifeoluwa"],"propositions":[],"lastnames":["Adelani"],"suffixes":[]},{"firstnames":["Long"],"propositions":[],"lastnames":["Phan"],"suffixes":[]},{"firstnames":["Hieu"],"propositions":[],"lastnames":["Tran"],"suffixes":[]},{"firstnames":["Ian"],"propositions":[],"lastnames":["Yu"],"suffixes":[]},{"firstnames":["Suhas"],"propositions":[],"lastnames":["Pai"],"suffixes":[]},{"firstnames":["Jenny"],"propositions":[],"lastnames":["Chim"],"suffixes":[]},{"firstnames":["Violette"],"propositions":[],"lastnames":["Lepercq"],"suffixes":[]},{"firstnames":["Suzana"],"propositions":[],"lastnames":["Ilic"],"suffixes":[]},{"firstnames":["Margaret"],"propositions":[],"lastnames":["Mitchell"],"suffixes":[]},{"firstnames":["Alexandra","Sasha"],"propositions":[],"lastnames":["Luccioni"],"suffixes":[]},{"firstnames":["Yacine"],"propositions":[],"lastnames":["Jernite"],"suffixes":[]}],"editor":[{"firstnames":["Sanmi"],"propositions":[],"lastnames":["Koyejo"],"suffixes":[]},{"firstnames":["S."],"propositions":[],"lastnames":["Mohamed"],"suffixes":[]},{"firstnames":["A."],"propositions":[],"lastnames":["Agarwal"],"suffixes":[]},{"firstnames":["Danielle"],"propositions":[],"lastnames":["Belgrave"],"suffixes":[]},{"firstnames":["K."],"propositions":[],"lastnames":["Cho"],"suffixes":[]},{"firstnames":["A."],"propositions":[],"lastnames":["Oh"],"suffixes":[]}],"title":"The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset","booktitle":"Advances in Neural Information Processing Systems 35: Annual Conference on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans, LA, USA, November 28 - December 9, 2022","year":"2022","url":"http://papers.nips.cc/paper\\_files/paper/2022/hash/ce9e92e3de2372a4b93353eb7f3dc0bd-Abstract-Datasets\\_and\\_Benchmarks.html","timestamp":"Sat, 13 Jul 2024 01:00:00 +0200","biburl":"https://dblp.org/rec/conf/nips/LaurenconSWAMSW22.bib","bibsource":"dblp computer science bibliography, https://dblp.org","bibtex":"@inproceedings{DBLP:conf/nips/LaurenconSWAMSW22,\n author = {Hugo Lauren{\\c{c}}on and\n Lucile Saulnier and\n Thomas Wang and\n Christopher Akiki and\n Albert Villanova del Moral and\n Teven Le Scao and\n Leandro von Werra and\n Chenghao Mou and\n Eduardo Gonz{\\'{a}}lez Ponferrada and\n Huu Nguyen and\n J{\\\"{o}}rg Frohberg and\n Mario Sasko and\n Quentin Lhoest and\n Angelina McMillan{-}Major and\n G{\\'{e}}rard Dupont and\n Stella Biderman and\n Anna Rogers and\n Loubna Ben Allal and\n Francesco De Toni and\n Giada Pistilli and\n Olivier Nguyen and\n Somaieh Nikpoor and\n Maraim Masoud and\n Pierre Colombo and\n Javier de la Rosa and\n Paulo Villegas and\n Tristan Thrush and\n Shayne Longpre and\n Sebastian Nagel and\n Leon Weber and\n Manuel Mu{\\~{n}}oz and\n Jian Zhu and\n Daniel van Strien and\n Zaid Alyafeai and\n Khalid Almubarak and\n Minh Chien Vu and\n Itziar Gonzalez{-}Dios and\n Aitor Soroa and\n Kyle Lo and\n Manan Dey and\n Pedro Ortiz Suarez and\n Aaron Gokaslan and\n Shamik Bose and\n David Ifeoluwa Adelani and\n Long Phan and\n Hieu Tran and\n Ian Yu and\n Suhas Pai and\n Jenny Chim and\n Violette Lepercq and\n Suzana Ilic and\n Margaret Mitchell and\n Alexandra Sasha Luccioni and\n Yacine Jernite},\n editor = {Sanmi Koyejo and\n S. Mohamed and\n A. Agarwal and\n Danielle Belgrave and\n K. Cho and\n A. Oh},\n title = {The BigScience {ROOTS} Corpus: {A} 1.6TB Composite Multilingual Dataset},\n booktitle = {Advances in Neural Information Processing Systems 35: Annual Conference\n on Neural Information Processing Systems 2022, NeurIPS 2022, New Orleans,\n LA, USA, November 28 - December 9, 2022},\n year = {2022},\n url = {http://papers.nips.cc/paper\\_files/paper/2022/hash/ce9e92e3de2372a4b93353eb7f3dc0bd-Abstract-Datasets\\_and\\_Benchmarks.html},\n timestamp = {Sat, 13 Jul 2024 01:00:00 +0200},\n biburl = {https://dblp.org/rec/conf/nips/LaurenconSWAMSW22.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n\n","author_short":["Laurençon, H.","Saulnier, L.","Wang, T.","Akiki, C.","del Moral, A. V.","Scao, T. L.","von Werra, L.","Mou, C.","Ponferrada, E. G.","Nguyen, H.","Frohberg, J.","Sasko, M.","Lhoest, Q.","McMillan-Major, A.","Dupont, G.","Biderman, S.","Rogers, A.","Allal, L. B.","Toni, F. D.","Pistilli, G.","Nguyen, O.","Nikpoor, S.","Masoud, M.","Colombo, P.","de la Rosa, J.","Villegas, P.","Thrush, T.","Longpre, S.","Nagel, S.","Weber, L.","Muñoz, M.","Zhu, J.","van Strien, D.","Alyafeai, Z.","Almubarak, K.","Vu, M. C.","Gonzalez-Dios, I.","Soroa, A.","Lo, K.","Dey, M.","Suarez, P. O.","Gokaslan, A.","Bose, S.","Adelani, D. I.","Phan, L.","Tran, H.","Yu, I.","Pai, S.","Chim, J.","Lepercq, V.","Ilic, S.","Mitchell, M.","Luccioni, A. S.","Jernite, Y."],"editor_short":["Koyejo, S.","Mohamed, S.","Agarwal, A.","Belgrave, D.","Cho, K.","Oh, A."],"key":"DBLP:conf/nips/LaurenconSWAMSW22","id":"DBLP:conf/nips/LaurenconSWAMSW22","bibbaseid":"laurenon-saulnier-wang-akiki-delmoral-scao-vonwerra-mou-etal-thebigsciencerootscorpusa16tbcompositemultilingualdataset-2022","role":"author","urls":{"Paper":"http://papers.nips.cc/paper\\_files/paper/2022/hash/ce9e92e3de2372a4b93353eb7f3dc0bd-Abstract-Datasets\\_and\\_Benchmarks.html"},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://dblp.org/pid/229/3167.bib","dataSources":["FD6yNzHMPC2zrsvYD","GXv5EbWSzjLH96rpE"],"keywords":[],"search_terms":["bigscience","roots","corpus","6tb","composite","multilingual","dataset","laurençon","saulnier","wang","akiki","del moral","scao","von werra","mou","ponferrada","nguyen","frohberg","sasko","lhoest","mcmillan-major","dupont","biderman","rogers","allal","toni","pistilli","nguyen","nikpoor","masoud","colombo","de la rosa","villegas","thrush","longpre","nagel","weber","muñoz","zhu","van strien","alyafeai","almubarak","vu","gonzalez-dios","soroa","lo","dey","suarez","gokaslan","bose","adelani","phan","tran","yu","pai","chim","lepercq","ilic","mitchell","luccioni","jernite"],"title":"The BigScience ROOTS Corpus: A 1.6TB Composite Multilingual Dataset","year":2022}