abstract bibtex

We study finite sample expressivity, i.e., memorization power of ReLU networks. Recent results require \$N\$ hidden nodes to memorize/interpolate arbitrary \$N\$ data points. In contrast, by exploiting depth, we show that 3-layer ReLU networks with \$\Omega(\sqrt{N})\$ hidden nodes can perfectly memorize most datasets with \$N\$ points. We also prove that width \$\Theta(\sqrt{N})\$ is necessary and sufficient for memorizing \$N\$ data points, proving tight bounds on memorization capacity. The sufficiency result can be extended to deeper networks; we show that an \$L\$-layer network with \$W\$ parameters in the hidden layers can memorize \$N\$ data points if \$W = \Omega(N)\$. Combined with a recent upper bound \$O(WL\log W)\$ on VC dimension, our construction is nearly tight for any fixed \$L\$. Subsequently, we analyze memorization capacity of residual networks under a general position assumption; we prove results that substantially reduce the known requirement of \$N\$ hidden nodes. Finally, we study the dynamics of stochastic gradient descent (SGD), and show that when initialized near a memorizing global minimum of the empirical risk, SGD quickly finds a nearby point with much smaller empirical risk.

@Article{Yun2018, author = {Yun, Chulhee and Sra, Suvrit and Jadbabaie, Ali}, title = {Small ReLU networks are powerful memorizers: a tight analysis of memorization capacity}, journal = {arXiv}, volume = {}, number = {}, pages = {1810.07770v3}, year = {2018}, abstract = {We study finite sample expressivity, i.e., memorization power of ReLU networks. Recent results require \$N\$ hidden nodes to memorize/interpolate arbitrary \$N\$ data points. In contrast, by exploiting depth, we show that 3-layer ReLU networks with \$\Omega(\sqrt{N})\$ hidden nodes can perfectly memorize most datasets with \$N\$ points. We also prove that width \$\Theta(\sqrt{N})\$ is necessary and sufficient for memorizing \$N\$ data points, proving tight bounds on memorization capacity. The sufficiency result can be extended to deeper networks; we show that an \$L\$-layer network with \$W\$ parameters in the hidden layers can memorize \$N\$ data points if \$W = \Omega(N)\$. Combined with a recent upper bound \$O(WL\log W)\$ on VC dimension, our construction is nearly tight for any fixed \$L\$. Subsequently, we analyze memorization capacity of residual networks under a general position assumption; we prove results that substantially reduce the known requirement of \$N\$ hidden nodes. Finally, we study the dynamics of stochastic gradient descent (SGD), and show that when initialized near a memorizing global minimum of the empirical risk, SGD quickly finds a nearby point with much smaller empirical risk.}, location = {}, keywords = {}}

Downloads: 0