DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems. Mancino, A. C. M., Bufi, S., Di Fazio, A., Ferrara, A., Malitesta, D., Pomo, C., & Di Noia, T. In Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval (SIGIR '25), pages 3478–3487, New York, NY, USA, July 2025. Association for Computing Machinery.
Paper doi abstract bibtex Recommender systems have demonstrated a significant impact across diverse domains, yet ensuring the reproducibility of experimental findings remains a persistent challenge. A primary obstacle lies in the fragmented and often opaque data management strategies employed during the preprocessing stage, where decisions about dataset selection, filtering, and splitting can substantially influence outcomes. To address these limitations, we introduce DataRec, an open-source Python-based library specifically designed to unify and streamline data handling in recommender system research. By providing reproducible routines for dataset preparation, data versioning, and seamless integration with other frameworks, DataRec promotes methodological standardization, interoperability, and comparability across different experimental setups. Our design is informed by an in-depth review of 55 state-of-the-art recommendation studies, ensuring that DataRec adopts best practices while addressing common pitfalls in data management. Ultimately, our contribution facilitates fair benchmarking, enhances reproducibility, and fosters greater trust in experimental results within the broader recommender systems community. The DataRec library, documentation, and examples are freely available at https://github.com/sisinflab/DataRec.
% SIGIR '25 conference paper introducing the DataRec library.
% Title braces protect only the case-sensitive words (DataRec, Python, and the
% "A" that opens the subtitle) so a bibliography style can apply its own casing
% to the remaining words. The address field holds the publisher's location.
@inproceedings{mancino_datarec_2025,
  author     = {Mancino, Alberto Carlo Maria and Bufi, Salvatore and Di Fazio, Angela and Ferrara, Antonio and Malitesta, Daniele and Pomo, Claudio and Di Noia, Tommaso},
  title      = {{DataRec}: {A} {Python} Library for Standardized and Reproducible Data Management in Recommender Systems},
  shorttitle = {{DataRec}},
  booktitle  = {Proceedings of the 48th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}},
  series     = {{SIGIR} '25},
  pages      = {3478--3487},
  publisher  = {Association for Computing Machinery},
  address    = {New York, NY, USA},
  month      = jul,
  year       = {2025},
  isbn       = {979-8-4007-1592-1},
  doi        = {10.1145/3726302.3730320},
  url        = {https://dl.acm.org/doi/10.1145/3726302.3730320},
  urldate    = {2025-07-18},
  abstract   = {Recommender systems have demonstrated a significant impact across diverse domains, yet ensuring the reproducibility of experimental findings remains a persistent challenge. A primary obstacle lies in the fragmented and often opaque data management strategies employed during the preprocessing stage, where decisions about dataset selection, filtering, and splitting can substantially influence outcomes. To address these limitations, we introduce DataRec, an open-source Python-based library specifically designed to unify and streamline data handling in recommender system research. By providing reproducible routines for dataset preparation, data versioning, and seamless integration with other frameworks, DataRec promotes methodological standardization, interoperability, and comparability across different experimental setups. Our design is informed by an in-depth review of 55 state-of-the-art recommendation studies, ensuring that DataRec adopts best practices while addressing common pitfalls in data management. Ultimately, our contribution facilitates fair benchmarking, enhances reproducibility, and fosters greater trust in experimental results within the broader recommender systems community. The DataRec library, documentation, and examples are freely available at https://github.com/sisinflab/DataRec.},
}
Downloads: 0
{"_id":"ytZ2P9TMuRqe45oXW","bibbaseid":"mancino-bufi-difazio-ferrara-malitesta-pomo-dinoia-datarecapythonlibraryforstandardizedandreproducibledatamanagementinrecommendersystems-2025","author_short":["Mancino, A. C. M.","Bufi, S.","Di Fazio, A.","Ferrara, A.","Malitesta, D.","Pomo, C.","Di Noia, T."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","address":"New York, NY, USA","series":"SIGIR '25","title":"DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems","isbn":"979-8-4007-1592-1","shorttitle":"DataRec","url":"https://dl.acm.org/doi/10.1145/3726302.3730320","doi":"10.1145/3726302.3730320","abstract":"Recommender systems have demonstrated a significant impact across diverse domains, yet ensuring the reproducibility of experimental findings remains a persistent challenge. A primary obstacle lies in the fragmented and often opaque data management strategies employed during the preprocessing stage, where decisions about dataset selection, filtering, and splitting can substantially influence outcomes. To address these limitations, we introduce DataRec, an open-source Python-based library specifically designed to unify and streamline data handling in recommender system research. By providing reproducible routines for dataset preparation, data versioning, and seamless integration with other frameworks, DataRec promotes methodological standardization, interoperability, and comparability across different experimental setups. Our design is informed by an in-depth review of 55 state-of-the-art recommendation studies, ensuring that DataRec adopts best practices while addressing common pitfalls in data management. Ultimately, our contribution facilitates fair benchmarking, enhances reproducibility, and fosters greater trust in experimental results within the broader recommender systems community. 
The DataRec library, documentation, and examples are freely available at https://github.com/sisinflab/DataRec.","urldate":"2025-07-18","booktitle":"Proceedings of the 48th International ACM SIGIR Conference on Research and Development in Information Retrieval","publisher":"Association for Computing Machinery","author":[{"propositions":[],"lastnames":["Mancino"],"firstnames":["Alberto","Carlo","Maria"],"suffixes":[]},{"propositions":[],"lastnames":["Bufi"],"firstnames":["Salvatore"],"suffixes":[]},{"propositions":[],"lastnames":["Di","Fazio"],"firstnames":["Angela"],"suffixes":[]},{"propositions":[],"lastnames":["Ferrara"],"firstnames":["Antonio"],"suffixes":[]},{"propositions":[],"lastnames":["Malitesta"],"firstnames":["Daniele"],"suffixes":[]},{"propositions":[],"lastnames":["Pomo"],"firstnames":["Claudio"],"suffixes":[]},{"propositions":[],"lastnames":["Di","Noia"],"firstnames":["Tommaso"],"suffixes":[]}],"month":"July","year":"2025","pages":"3478–3487","bibtex":"@inproceedings{mancino_datarec_2025,\n\taddress = {New York, NY, USA},\n\tseries = {{SIGIR} '25},\n\ttitle = {{DataRec}: {A} {Python} {Library} for {Standardized} and {Reproducible} {Data} {Management} in {Recommender} {Systems}},\n\tisbn = {979-8-4007-1592-1},\n\tshorttitle = {{DataRec}},\n\turl = {https://dl.acm.org/doi/10.1145/3726302.3730320},\n\tdoi = {10.1145/3726302.3730320},\n\tabstract = {Recommender systems have demonstrated a significant impact across diverse domains, yet ensuring the reproducibility of experimental findings remains a persistent challenge. A primary obstacle lies in the fragmented and often opaque data management strategies employed during the preprocessing stage, where decisions about dataset selection, filtering, and splitting can substantially influence outcomes. To address these limitations, we introduce DataRec, an open-source Python-based library specifically designed to unify and streamline data handling in recommender system research. 
By providing reproducible routines for dataset preparation, data versioning, and seamless integration with other frameworks, DataRec promotes methodological standardization, interoperability, and comparability across different experimental setups. Our design is informed by an in-depth review of 55 state-of-the-art recommendation studies, ensuring that DataRec adopts best practices while addressing common pitfalls in data management. Ultimately, our contribution facilitates fair benchmarking, enhances reproducibility, and fosters greater trust in experimental results within the broader recommender systems community. The DataRec library, documentation, and examples are freely available at https://github.com/sisinflab/DataRec.},\n\turldate = {2025-07-18},\n\tbooktitle = {Proceedings of the 48th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}},\n\tpublisher = {Association for Computing Machinery},\n\tauthor = {Mancino, Alberto Carlo Maria and Bufi, Salvatore and Di Fazio, Angela and Ferrara, Antonio and Malitesta, Daniele and Pomo, Claudio and Di Noia, Tommaso},\n\tmonth = jul,\n\tyear = {2025},\n\tpages = {3478--3487},\n}\n\n","author_short":["Mancino, A. C. 
M.","Bufi, S.","Di Fazio, A.","Ferrara, A.","Malitesta, D.","Pomo, C.","Di Noia, T."],"key":"mancino_datarec_2025","id":"mancino_datarec_2025","bibbaseid":"mancino-bufi-difazio-ferrara-malitesta-pomo-dinoia-datarecapythonlibraryforstandardizedandreproducibledatamanagementinrecommendersystems-2025","role":"author","urls":{"Paper":"https://dl.acm.org/doi/10.1145/3726302.3730320"},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://api.zotero.org/users/6655/collections/3TB3KT36/items?key=VFvZhZXIoHNBbzoLZ1IM2zgf&format=bibtex&limit=100","dataSources":["7KNAjxiv2tsagmbgY"],"keywords":[],"search_terms":["datarec","python","library","standardized","reproducible","data","management","recommender","systems","mancino","bufi","di fazio","ferrara","malitesta","pomo","di noia"],"title":"DataRec: A Python Library for Standardized and Reproducible Data Management in Recommender Systems","year":2025}