On Byzantine-Resilient High-Dimensional Stochastic Gradient Descent. Data, D. & Diggavi, S. In 2020 IEEE International Symposium on Information Theory (ISIT), pages 2628–2633, 2020. IEEE.
We study stochastic gradient descent (SGD) in the master-worker architecture under Byzantine attacks. Building upon recent advances in algorithmic high-dimensional robust statistics, in each SGD iteration the master employs a non-trivial decoding to estimate the true gradient from the unbiased stochastic gradients received from the workers, some of which may be corrupt. We provide convergence analyses for both strongly-convex and non-convex smooth objectives under standard SGD assumptions. In both settings, the approximation error of our solution can be controlled by the mini-batch size of the stochastic gradients, and can be made as small as desired provided that the workers use a sufficiently large mini-batch size. Our algorithm can tolerate less than a 1/3 fraction of Byzantine workers. It approximately finds the optimal parameters in the strongly-convex setting exponentially fast, and reaches an approximate stationary point in the non-convex setting at a rate of 1/T, thus matching the convergence rates of vanilla SGD in the Byzantine-free setting.
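The sketch below is a minimal illustration of one Byzantine-resilient SGD step in the master-worker setting described in the abstract: each worker returns a mini-batch stochastic gradient, and the master replaces plain averaging with a robust aggregate before taking the gradient step. A coordinate-wise trimmed mean is used here purely as a stand-in for the paper's high-dimensional robust gradient estimator; the names `trimmed_mean`, `byzantine_sgd_step`, `worker_grad_fns`, and `trim_frac` are hypothetical and not taken from the paper.

```python
# Illustrative sketch only: a single Byzantine-resilient SGD step in the
# master-worker setting. The robust aggregation is a coordinate-wise
# trimmed mean, a simple stand-in for the paper's high-dimensional robust
# gradient estimator (it is NOT the authors' decoder).
import numpy as np

def trimmed_mean(gradients, trim_frac):
    """Coordinate-wise trimmed mean over the workers' gradients.

    gradients: array of shape (num_workers, dim); trim_frac: fraction of
    extreme values discarded at each end of every coordinate.
    """
    g = np.sort(gradients, axis=0)                # sort each coordinate over workers
    k = int(trim_frac * g.shape[0])               # number of values trimmed per side
    return g[k:g.shape[0] - k].mean(axis=0)       # average the remaining middle values

def byzantine_sgd_step(params, worker_grad_fns, lr=0.1, trim_frac=0.25):
    """One master-worker SGD iteration with robust aggregation.

    worker_grad_fns: list of callables, each returning that worker's
    mini-batch stochastic gradient at `params`; Byzantine workers may
    return arbitrary vectors. Assumes fewer than a 1/3 fraction of the
    workers are Byzantine, matching the paper's stated tolerance.
    """
    grads = np.stack([f(params) for f in worker_grad_fns])
    robust_grad = trimmed_mean(grads, trim_frac)  # robust "decoding" of the true gradient
    return params - lr * robust_grad
```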
@inproceedings{data2020byzantine,
 abstract = {We study stochastic gradient descent (SGD) in the master-worker architecture under Byzantine attacks. Building upon recent advances in algorithmic high-dimensional robust statistics, in each SGD iteration the master employs a non-trivial decoding to estimate the true gradient from the unbiased stochastic gradients received from the workers, some of which may be corrupt. We provide convergence analyses for both strongly-convex and non-convex smooth objectives under standard SGD assumptions. In both settings, the approximation error of our solution can be controlled by the mini-batch size of the stochastic gradients, and can be made as small as desired provided that the workers use a sufficiently large mini-batch size. Our algorithm can tolerate less than a 1/3 fraction of Byzantine workers. It approximately finds the optimal parameters in the strongly-convex setting exponentially fast, and reaches an approximate stationary point in the non-convex setting at a rate of 1/T, thus matching the convergence rates of vanilla SGD in the Byzantine-free setting.},
 author = {Data, Deepesh and Diggavi, Suhas},
 booktitle = {2020 IEEE International Symposium on Information Theory (ISIT)},
 organization = {IEEE},
 pages = {2628--2633},
 tags = {conf,SDL,DML},
 title = {On Byzantine-Resilient High-Dimensional Stochastic Gradient Descent},
 type = {4},
 doi = {10.1109/ISIT44484.2020.9174363},
 year = {2020}
}
