Qsparse-local-SGD: Distributed SGD with quantization, sparsification and local computations. Basu, D., Data, D., Karakus, C., & Diggavi, S. In Advances in Neural Information Processing Systems, pages 14695–14706, 2019.
arXiv: https://arxiv.org/abs/1906.02367

Abstract: Communication bottleneck has been identified as a significant issue in distributed optimization of large-scale learning models. Recently, several approaches to mitigate this problem have been proposed, including different forms of gradient compression or computing local models and mixing them iteratively. In this paper, we propose Qsparse-local-SGD algorithm, which combines aggressive sparsification with quantization and local computation along with error compensation, by keeping track of the difference between the true and compressed gradients. We propose both synchronous and asynchronous implementations of Qsparse-local-SGD. We analyze convergence for Qsparse-local-SGD in the distributed setting for smooth non-convex and convex objective functions. We demonstrate that Qsparse-local-SGD converges at the same rate as vanilla distributed SGD for many important classes of sparsifiers and quantizers. We use Qsparse-local-SGD to train ResNet-50 on ImageNet and show that it results in significant savings over the state-of-the-art, in the number of bits transmitted to reach target accuracy.
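To make the abstract's description concrete, the following is a minimal sketch (not the authors' reference implementation) of one worker's round in an error-compensated, compressed local-SGD scheme of the kind described above: a few local SGD steps, then compression of the accumulated update plus the carried-over compression error via top-k sparsification composed with a simple quantizer. The names topk_quantize and worker_round, the numpy-based setup, and the specific quantizer are illustrative assumptions, not taken from the paper.

import numpy as np

def topk_quantize(v, k):
    """Keep the k largest-magnitude entries, then quantize them to sign * mean |value|.
    This is one simple composition of a sparsifier and a quantizer; the paper covers
    broad classes of both."""
    out = np.zeros_like(v)
    k = min(k, v.size)
    if k <= 0:
        return out
    idx = np.argpartition(np.abs(v), -k)[-k:]      # indices of the top-k entries by magnitude
    scale = np.mean(np.abs(v[idx]))                # one shared magnitude as a crude quantizer
    out[idx] = scale * np.sign(v[idx])
    return out

def worker_round(x, error, stochastic_grad, lr=0.1, local_steps=4, k=10):
    """One synchronization round for a single worker (illustrative sketch).

    x               : current global model copy (np.ndarray)
    error           : this worker's compression-error memory (np.ndarray, same shape as x)
    stochastic_grad : callable returning a stochastic gradient at a given point
    Returns the compressed message to send and the updated error memory.
    """
    x_local = x.copy()
    for _ in range(local_steps):                   # local SGD steps between communications
        x_local -= lr * stochastic_grad(x_local)

    update = x - x_local                           # net local progress this round
    compressed = topk_quantize(update + error, k)  # compress progress plus carried-over error
    error = (update + error) - compressed          # remember what compression dropped
    return compressed, error                       # a server would average `compressed` across workers

# Example usage (a toy quadratic gradient, purely for illustration):
# grad = lambda w: 2.0 * (w - np.ones_like(w))
# msg, err = worker_round(np.zeros(100), np.zeros(100), grad)

The error memory is the "keeping track of the difference between the true and compressed gradients" in the abstract: coordinates dropped by aggressive compression are re-injected into later rounds rather than lost, which is what allows convergence at the rate of vanilla distributed SGD.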
@inproceedings{basu2019qsparse,
abstract = {Communication bottleneck has been identified as a significant issue in distributed optimization of large-scale learning models. Recently, several approaches to mitigate this problem have been proposed, including different forms of gradient compression or computing local models and mixing them iteratively. In this paper, we propose \emph{Qsparse-local-SGD} algorithm, which combines aggressive sparsification with quantization and local computation along with error compensation, by keeping track of the difference between the true and compressed gradients. We propose both synchronous and asynchronous implementations of \emph{Qsparse-local-SGD}. We analyze convergence for \emph{Qsparse-local-SGD} in the \emph{distributed} setting for smooth non-convex and convex objective functions. We demonstrate that \emph{Qsparse-local-SGD} converges at the same rate as vanilla distributed SGD for many important classes of sparsifiers and quantizers. We use \emph{Qsparse-local-SGD} to train ResNet-50 on ImageNet and show that it results in significant savings over the state-of-the-art, in the number of bits transmitted to reach target accuracy.},
author = {Basu, Debraj and Data, Deepesh and Karakus, Can and Diggavi, Suhas},
booktitle = {Advances in Neural Information Processing Systems},
pages = {14695--14706},
tags = {conf,CEDL,DML},
title = {Qsparse-local-SGD: Distributed SGD with quantization, sparsification and local computations},
type = {4},
url_arxiv = {https://arxiv.org/abs/1906.02367},
year = {2019}
}