Evaluating Deep Learning Recommendation Model Training Scalability with the Dynamic Opera Network. Imes, C., Rittenbach, A., Xie, P., Kang, D. I. D., Walters, J. P., & Crago, S. P. In Proceedings of the 4th Workshop on Machine Learning and Systems (EuroMLSys '24), pages 169–175, New York, NY, USA, 2024. Association for Computing Machinery.

Abstract: Deep learning is commonly used to make personalized recommendations to users for a wide variety of activities. However, deep learning recommendation model (DLRM) training is increasingly dominated by all-to-all and many-to-many communication patterns. While there are a wide variety of algorithms to efficiently overlap communication and computation for many collective operations, these patterns are strictly limited by network bottlenecks. We propose co-designing DLRM model training with the recently proposed Opera network, which is designed to avoid multiple network hops using time-varying source-to-destination circuits. Using measurements from state-of-the-art NVIDIA A100 GPUs, we simulate DLRM model training on networks ranging from 16 to 1024 nodes and demonstrate up to 1.79× improvement using Opera compared with equivalent fat-tree networks. We identify important parameters affecting training time and demonstrate that careful co-design is needed to optimize training latency.
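The scaling pressure the abstract describes comes from the all-to-all exchange of embedding lookup results in model-parallel DLRM training. The Python sketch below gives a back-of-the-envelope estimate of that traffic; it is illustrative only and not from the paper, and the batch size, table count, embedding dimension, and fp16 precision are assumed values.

def all_to_all_bytes_per_node(nodes, batch_size=8192, tables_per_node=8,
                              embedding_dim=128, bytes_per_element=2):
    """Bytes each node sends (and receives) in one all-to-all exchange.

    Each node owns `tables_per_node` embedding tables and looks them up for
    the full global batch, then ships each sample's pooled embeddings to the
    node that processes that sample, so a (nodes - 1) / nodes fraction of
    its lookup results leaves the node. All sizing parameters here are
    assumed, for illustration only.
    """
    local_results = batch_size * tables_per_node * embedding_dim * bytes_per_element
    return local_results * (nodes - 1) / nodes

if __name__ == "__main__":
    for n in (16, 64, 256, 1024):
        mb = all_to_all_bytes_per_node(n) / 1e6
        print(f"{n:5d} nodes: ~{mb:.1f} MB sent per node per iteration")

Under these assumptions the per-node volume stays roughly flat as the cluster grows, but every byte participates in a dense N-to-N pattern each iteration, which is consistent with the abstract's point that these exchanges are limited by the network rather than by compute.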
@inproceedings{OperaDLRM,
author = {Imes, Connor and Rittenbach, Andrew and Xie, Peng and Kang, Dong In D. and Walters, John Paul and Crago, Stephen P.},
title = {Evaluating Deep Learning Recommendation Model Training Scalability with the Dynamic Opera Network},
year = {2024},
isbn = {9798400705410},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3642970.3655825},
doi = {10.1145/3642970.3655825},
abstract = {Deep learning is commonly used to make personalized recommendations to users for a wide variety of activities. However, deep learning recommendation model (DLRM) training is increasingly dominated by all-to-all and many-to-many communication patterns. While there are a wide variety of algorithms to efficiently overlap communication and computation for many collective operations, these patterns are strictly limited by network bottlenecks. We propose co-designing DLRM model training with the recently proposed Opera network, which is designed to avoid multiple network hops using time-varying source-to-destination circuits. Using measurements from state-of-the-art NVIDIA A100 GPUs, we simulate DLRM model training on networks ranging from 16 to 1024 nodes and demonstrate up to 1.79\texttimes{} improvement using Opera compared with equivalent fat-tree networks. We identify important parameters affecting training time and demonstrate that careful co-design is needed to optimize training latency.},
booktitle = {Proceedings of the 4th Workshop on Machine Learning and Systems},
pages = {169--175},
numpages = {7},
keywords = {deep learning, dynamic networks, machine learning, networks, recommendation models},
location = {Athens, Greece},
series = {EuroMLSys '24},
ISIArea = {ML, CAS, NET}
}
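For context on the mechanism the abstract calls "time-varying source-to-destination circuits": a circuit-switched network reconfigures direct one-hop paths on a fixed schedule instead of routing every packet through multiple switch hops. The sketch below is a deliberately simplified round-robin schedule in the spirit of rotor-style dynamic networks; it is an assumed toy model for illustration, not Opera's actual (more sophisticated) schedule.

def circuit_schedule(num_nodes):
    """Yield (timeslot, {source: destination}) one-hop circuit matchings.

    Toy rotor-style schedule (not Opera's real one): timeslot t connects
    node i directly to node (i + t) % num_nodes. Each slot is a permutation,
    so every source-destination pair gets a direct circuit exactly once per
    cycle of num_nodes - 1 slots.
    """
    for t in range(1, num_nodes):
        yield t, {src: (src + t) % num_nodes for src in range(num_nodes)}

if __name__ == "__main__":
    for t, matching in circuit_schedule(8):
        print(f"slot {t}: " + "  ".join(f"{s}->{d}" for s, d in sorted(matching.items())))

Under such a schedule an all-to-all transfer can be decomposed so that every chunk traverses exactly one hop, which is the single-hop property the abstract credits Opera with exploiting.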
{"_id":"dSjYuFSELenmq92nx","bibbaseid":"imes-rittenbach-xie-kang-walters-crago-evaluatingdeeplearningrecommendationmodeltrainingscalabilitywiththedynamicoperanetwork-2024","author_short":["Imes, C.","Rittenbach, A.","Xie, P.","Kang, D. I. D.","Walters, J. P.","Crago, S. P."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"propositions":[],"lastnames":["Imes"],"firstnames":["Connor"],"suffixes":[]},{"propositions":[],"lastnames":["Rittenbach"],"firstnames":["Andrew"],"suffixes":[]},{"propositions":[],"lastnames":["Xie"],"firstnames":["Peng"],"suffixes":[]},{"propositions":[],"lastnames":["Kang"],"firstnames":["Dong","In","D."],"suffixes":[]},{"propositions":[],"lastnames":["Walters"],"firstnames":["John","Paul"],"suffixes":[]},{"propositions":[],"lastnames":["Crago"],"firstnames":["Stephen","P."],"suffixes":[]}],"title":"Evaluating Deep Learning Recommendation Model Training Scalability with the Dynamic Opera Network","year":"2024","isbn":"9798400705410","publisher":"Association for Computing Machinery","address":"New York, NY, USA","url":"https://doi.org/10.1145/3642970.3655825","doi":"10.1145/3642970.3655825","abstract":"Deep learning is commonly used to make personalized recommendations to users for a wide variety of activities. However, deep learning recommendation model (DLRM) training is increasingly dominated by all-to-all and many-to-many communication patterns. While there are a wide variety of algorithms to efficiently overlap communication and computation for many collective operations, these patterns are strictly limited by network bottlenecks. We propose co-designing DLRM model training with the recently proposed Opera network, which is designed to avoid multiple network hops using time-varying source-to-destination circuits. Using measurements from state-of-the-art NVIDIA A100 GPUs, we simulate DLRM model training on networks ranging from 16 to 1024 nodes and demonstrate up to 1.79× improvement using Opera compared with equivalent fat-tree networks. We identify important parameters affecting training time and demonstrate that careful co-design is needed to optimize training latency.","booktitle":"Proceedings of the 4th Workshop on Machine Learning and Systems","pages":"169–175","numpages":"7","keywords":"deep learning, dynamic networks, machine learning, networks, recommendation models","location":", Athens, Greece, ","series":"EuroMLSys '24","isiarea":"ML, CAS, NET","bibtex":"@inproceedings{OperaDLRM,\nauthor = {Imes, Connor and Rittenbach, Andrew and Xie, Peng and Kang, Dong In D. and Walters, John Paul and Crago, Stephen P.},\ntitle = {Evaluating Deep Learning Recommendation Model Training Scalability with the Dynamic Opera Network},\nyear = {2024},\nisbn = {9798400705410},\npublisher = {Association for Computing Machinery},\naddress = {New York, NY, USA},\nurl = {https://doi.org/10.1145/3642970.3655825},\ndoi = {10.1145/3642970.3655825},\nabstract = {Deep learning is commonly used to make personalized recommendations to users for a wide variety of activities. However, deep learning recommendation model (DLRM) training is increasingly dominated by all-to-all and many-to-many communication patterns. While there are a wide variety of algorithms to efficiently overlap communication and computation for many collective operations, these patterns are strictly limited by network bottlenecks. 
We propose co-designing DLRM model training with the recently proposed Opera network, which is designed to avoid multiple network hops using time-varying source-to-destination circuits. Using measurements from state-of-the-art NVIDIA A100 GPUs, we simulate DLRM model training on networks ranging from 16 to 1024 nodes and demonstrate up to 1.79\\texttimes{} improvement using Opera compared with equivalent fat-tree networks. We identify important parameters affecting training time and demonstrate that careful co-design is needed to optimize training latency.},\nbooktitle = {Proceedings of the 4th Workshop on Machine Learning and Systems},\npages = {169–175},\nnumpages = {7},\nkeywords = {deep learning, dynamic networks, machine learning, networks, recommendation models},\nlocation = {, Athens, Greece, },\nseries = {EuroMLSys '24},\nISIArea = {ML, CAS, NET}\n}\n\n","author_short":["Imes, C.","Rittenbach, A.","Xie, P.","Kang, D. I. D.","Walters, J. P.","Crago, S. P."],"bibbaseid":"imes-rittenbach-xie-kang-walters-crago-evaluatingdeeplearningrecommendationmodeltrainingscalabilitywiththedynamicoperanetwork-2024","role":"author","urls":{"Paper":"https://doi.org/10.1145/3642970.3655825"},"keyword":["deep learning","dynamic networks","machine learning","networks","recommendation models"],"metadata":{"authorlinks":{}},"downloads":6},"bibtype":"inproceedings","biburl":"https://bibbase.org/f/nxnupMTqshJBmGaut/cimes-2024-09.bib","dataSources":["8wxz2MZJayyReHAbN","aKv9a6ZaATgxxHGR8","LPA7qi8rmQfuwnQuM"],"keywords":["deep learning","dynamic networks","machine learning","networks","recommendation models"],"search_terms":["evaluating","deep","learning","recommendation","model","training","scalability","dynamic","opera","network","imes","rittenbach","xie","kang","walters","crago"],"title":"Evaluating Deep Learning Recommendation Model Training Scalability with the Dynamic Opera Network","year":2024,"downloads":6}