The FineWeb datasets: Decanting the web for the finest text data at scale. Penedo, G., Kydlíček, H., Lozhkov, A., Mitchell, M., Raffel, C. A., Von Werra, L., & Wolf, T. In Advances in Neural Information Processing Systems, volume 37, pages 30811–30849, 2024.
Paper doi bibtex
@inproceedings{penedo_fineweb_2024,
title = {The fineweb datasets: {Decanting} the web for the finest text data at scale},
volume = {37},
url = {https://proceedings.neurips.cc/paper_files/paper/2024/file/370df50ccfdf8bde18f8f9c2d9151bda-Paper-Datasets_and_Benchmarks_Track.pdf},
doi = {10.52202/079017-0970},
booktitle = {Advances in {Neural} {Information} {Processing} {Systems}},
author = {Penedo, Guilherme and Kydlíček, Hynek and Lozhkov, Anton and Mitchell, Margaret and Raffel, Colin A and Von Werra, Leandro and Wolf, Thomas},
year = {2024},
pages = {30811--30849},
}
Downloads: 0
{"_id":"3JKLBWjPzTTfaA5SW","bibbaseid":"penedo-kydlek-lozhkov-mitchell-raffel-vonwerra-wolf-thefinewebdatasetsdecantingthewebforthefinesttextdataatscale-2024","author_short":["Penedo, G.","Kydlíček, H.","Lozhkov, A.","Mitchell, M.","Raffel, C. A","Von Werra, L.","Wolf, T."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","title":"The fineweb datasets: Decanting the web for the finest text data at scale","volume":"37","url":"https://proceedings.neurips.cc/paper_files/paper/2024/file/370df50ccfdf8bde18f8f9c2d9151bda-Paper-Datasets_and_Benchmarks_Track.pdf","doi":"10.52202/079017-0970","booktitle":"Advances in Neural Information Processing Systems","author":[{"propositions":[],"lastnames":["Penedo"],"firstnames":["Guilherme"],"suffixes":[]},{"propositions":[],"lastnames":["Kydlíček"],"firstnames":["Hynek"],"suffixes":[]},{"propositions":[],"lastnames":["Lozhkov"],"firstnames":["Anton"],"suffixes":[]},{"propositions":[],"lastnames":["Mitchell"],"firstnames":["Margaret"],"suffixes":[]},{"propositions":[],"lastnames":["Raffel"],"firstnames":["Colin","A"],"suffixes":[]},{"propositions":[],"lastnames":["Von","Werra"],"firstnames":["Leandro"],"suffixes":[]},{"propositions":[],"lastnames":["Wolf"],"firstnames":["Thomas"],"suffixes":[]}],"year":"2024","pages":"30811–30849","bibtex":"@inproceedings{penedo_fineweb_2024,\n\ttitle = {The fineweb datasets: {Decanting} the web for the finest text data at scale},\n\tvolume = {37},\n\turl = {https://proceedings.neurips.cc/paper_files/paper/2024/file/370df50ccfdf8bde18f8f9c2d9151bda-Paper-Datasets_and_Benchmarks_Track.pdf},\n\tdoi = {10.52202/079017-0970},\n\tbooktitle = {Advances in {Neural} {Information} {Processing} {Systems}},\n\tauthor = {Penedo, Guilherme and Kydlíček, Hynek and Lozhkov, Anton and Mitchell, Margaret and Raffel, Colin A and Von Werra, Leandro and Wolf, Thomas},\n\tyear = {2024},\n\tpages = {30811--30849},\n}\n\n\n\n","author_short":["Penedo, G.","Kydlíček, H.","Lozhkov, A.","Mitchell, M.","Raffel, C. A","Von Werra, L.","Wolf, T."],"key":"penedo_fineweb_2024","id":"penedo_fineweb_2024","bibbaseid":"penedo-kydlek-lozhkov-mitchell-raffel-vonwerra-wolf-thefinewebdatasetsdecantingthewebforthefinesttextdataatscale-2024","role":"author","urls":{"Paper":"https://proceedings.neurips.cc/paper_files/paper/2024/file/370df50ccfdf8bde18f8f9c2d9151bda-Paper-Datasets_and_Benchmarks_Track.pdf"},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/zotero-group/schulzkx/5158478","dataSources":["JFDnASMkoQCjjGL8E"],"keywords":[],"search_terms":["fineweb","datasets","decanting","web","finest","text","data","scale","penedo","kydlíček","lozhkov","mitchell","raffel","von werra","wolf"],"title":"The fineweb datasets: Decanting the web for the finest text data at scale","year":2024}