Qsparse-Local-SGD: Distributed SGD With Quantization, Sparsification, and Local Computations. Basu, D., Data, D., Karakus, C., & Diggavi, S. N. IEEE Journal on Selected Areas in Information Theory, 1(1):217-226, May, 2020.
The communication bottleneck has been identified as a significant issue in distributed optimization of large-scale learning models. Recently, several approaches to mitigate this problem have been proposed, including different forms of gradient compression and computing local models that are mixed iteratively. In this paper, we propose the Qsparse-local-SGD algorithm, which combines aggressive sparsification with quantization and local computation, together with error compensation that keeps track of the difference between the true and compressed gradients. We propose both synchronous and asynchronous implementations of Qsparse-local-SGD. We analyze convergence of Qsparse-local-SGD in the distributed setting for smooth non-convex and convex objective functions, and demonstrate that it converges at the same rate as vanilla distributed SGD for many important classes of sparsifiers and quantizers. We use Qsparse-local-SGD to train ResNet-50 on ImageNet and show that it yields significant savings over the state of the art in the number of bits transmitted to reach a target accuracy.
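To make the mechanism in the abstract concrete, the following is a minimal NumPy sketch of the synchronous idea: each worker takes a few local SGD steps, adds its error memory to the net local update, compresses the result with top-k sparsification composed with a quantizer, remembers what the compression dropped, and the server averages the compressed updates. The toy least-squares problem, the scaled-sign quantizer, the hyperparameters, and all names here are illustrative assumptions, not the authors' implementation.

```python
# Illustrative sketch of synchronous error-compensated, compressed local SGD
# (in the spirit of Qsparse-local-SGD); problem setup and constants are made up.
import numpy as np

rng = np.random.default_rng(0)


def topk_sparsify(v, k):
    """Keep the k largest-magnitude coordinates of v, zero out the rest."""
    out = np.zeros_like(v)
    idx = np.argpartition(np.abs(v), -k)[-k:]
    out[idx] = v[idx]
    return out


def sign_quantize(v):
    """A simple scaled-sign quantizer (one of many admissible quantizers)."""
    nnz = np.count_nonzero(v)
    if nnz == 0:
        return np.zeros_like(v)
    return (np.linalg.norm(v, ord=1) / nnz) * np.sign(v)


def compress(v, k):
    """Composed operator: sparsify first, then quantize the survivors."""
    return sign_quantize(topk_sparsify(v, k))


# Toy distributed least-squares problem: each worker holds its own data shard.
d, n_workers, n_samples = 50, 4, 200
A = [rng.normal(size=(n_samples, d)) for _ in range(n_workers)]
b = [Ai @ rng.normal(size=d) + 0.1 * rng.normal(size=n_samples) for Ai in A]


def stochastic_grad(x, w):
    i = rng.integers(len(b[w]))
    a_i = A[w][i]
    return (a_i @ x - b[w][i]) * a_i


x = np.zeros(d)                                  # global model, kept in sync
error = [np.zeros(d) for _ in range(n_workers)]  # per-worker error memory
lr, H, k, rounds = 0.01, 5, 10, 400

for r in range(rounds):
    compressed_updates = []
    for w in range(n_workers):
        x_local = x.copy()
        for _ in range(H):                       # H local SGD steps, no communication
            x_local -= lr * stochastic_grad(x_local, w)
        update = x - x_local                     # net local progress this round
        corrected = update + error[w]            # add back what was dropped earlier
        q = compress(corrected, k)               # sparsify + quantize
        error[w] = corrected - q                 # remember what compression lost
        compressed_updates.append(q)
    x -= np.mean(compressed_updates, axis=0)     # server averages and applies

print("final objective:",
      sum(0.5 * np.mean((A[w] @ x - b[w]) ** 2) for w in range(n_workers)))
```

The error memory is the compensation step described in the abstract: coordinates suppressed by the sparsifier or distorted by the quantizer are carried forward and eventually transmitted in later rounds.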
@article{9057579,
 abstract = {Communication bottleneck has been identified as a significant issue in distributed optimization of large-scale learning models. Recently, several approaches to mitigate this problem have been proposed, including different forms of gradient compression or computing local models and mixing them iteratively. In this paper, we propose Qsparse-local-SGD algorithm, which combines aggressive sparsification with quantization and local computation along with error compensation, by keeping track of the difference between the true and compressed gradients. We propose both synchronous and asynchronous implementations of Qsparse-local-SGD. We analyze convergence for Qsparse-local-SGD in the distributed setting for smooth non-convex and convex objective functions. We demonstrate that Qsparse-local-SGD converges at the same rate as vanilla distributed SGD for many important classes of sparsifiers and quantizers. We use Qsparse-local-SGD to train ResNet-50 on ImageNet and show that it results in significant savings over the state-of-the-art, in the number of bits transmitted to reach target accuracy.},
 author = {D. {Basu} and D. {Data} and C. {Karakus} and S. N. {Diggavi}},
 doi = {10.1109/JSAIT.2020.2985917},
 issn = {2641-8770},
 journal = {IEEE Journal on Selected Areas in Information Theory},
 keywords = {concave programming;convex programming;gradient methods;learning (artificial intelligence);optimisation;stochastic processes;stochastic gradient descent;convex objective functions;nonconvex objective functions;error compensation;sparsification;Qsparse-local-SGD;distributed setting;compressed gradients;true gradients;gradient compression;large-scale learning models;distributed optimization;quantization;vanilla distributed SGD;Quantization (signal);Convergence;Computational modeling;Stochastic processes;Training;Peer-to-peer computing;Optimization;Distributed optimization and learning;stochastic optimization;communication efficient training methods},
 month = {May},
 number = {1},
 pages = {217-226},
 tags = {journal,CEDL,DML},
 title = {Qsparse-Local-SGD: Distributed SGD With Quantization, Sparsification, and Local Computations},
 type = {2},
 url_arxiv = {https://arxiv.org/abs/1906.02367},
 volume = {1},
 year = {2020}
}