DeepNet: Scaling Transformers to 1,000 Layers. Wang, H., Ma, S., Dong, L., Huang, S., Zhang, D., & Wei, F. CoRR, 2022.
DeepNet: Scaling Transformers to 1,000 Layers [link]Paper  doi  bibtex   
@article{DBLP:journals/corr/abs-2203-00555,
  author       = {Wang, Hongyu and
                  Ma, Shuming and
                  Dong, Li and
                  Huang, Shaohan and
                  Zhang, Dongdong and
                  Wei, Furu},
  title        = {{DeepNet}: Scaling Transformers to 1,000 Layers},
  journal      = {CoRR},
  volume       = {abs/2203.00555},
  year         = {2022},
  url          = {https://doi.org/10.48550/arXiv.2203.00555},
  doi          = {10.48550/arXiv.2203.00555},
  eprinttype   = {arXiv},
  eprint       = {2203.00555},
  timestamp    = {Tue, 20 Dec 2022 00:00:00 +0100},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2203-00555.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

Downloads: 0