Generating Long Sequences with Sparse Transformers

Generating Long Sequences with Sparse Transformers. Child, R., Gray, S., Radford, A., & Sutskever, I. April 2019.

Generating Long Sequences with Sparse Transformers [link]

Transformers are powerful sequence models, but require time and memory that grows quadratically with the sequence length. In this paper we introduce sparse factorizations of the attention matrix which reduce this to $O(n \sqrt{n})$. We also introduce a) a variation on architecture and initialization to train deeper networks, b) the recomputation of attention matrices to save memory, and c) fast attention kernels for training. We call networks with these changes Sparse Transformers, and show they can model sequences tens of thousands of timesteps long using hundreds of layers. We use the same architecture to model images, audio, and text from raw bytes, setting a new state of the art for density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate unconditional samples that demonstrate global coherence and great diversity, and show it is possible in principle to use self-attention to model sequences of length one million or more.

@article{child2019generating,
 author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya},
 title = {Generating Long Sequences with {Sparse} {Transformers}},
 type = {article},
 year = {2019},
 month = apr,
 day = {23},
 eprint = {1904.10509},
 archiveprefix = {arXiv},
 url = {https://arxiv.org/abs/1904.10509v1},
 urldate = {2021-09-03},
 websites = {https://arxiv.org/abs/1904.10509v1},
 id = {454dd0b7-73d2-3c25-8e02-db7e7e07de48},
 created = {2021-09-03T07:12:51.573Z},
 accessed = {2021-09-03},
 file_attached = {true},
 profile_id = {48fc0258-023d-3602-860e-824092d62c56},
 group_id = {1ff583c0-be37-34fa-9c04-73c69437d354},
 last_modified = {2021-09-03T07:12:55.333Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {false},
 hidden = {false},
 folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4},
 private_publication = {false},
 abstract = {Transformers are powerful sequence models, but require time and memory that
grows quadratically with the sequence length. In this paper we introduce sparse
factorizations of the attention matrix which reduce this to $O(n \sqrt{n})$. We
also introduce a) a variation on architecture and initialization to train
deeper networks, b) the recomputation of attention matrices to save memory, and
c) fast attention kernels for training. We call networks with these changes
Sparse Transformers, and show they can model sequences tens of thousands of
timesteps long using hundreds of layers. We use the same architecture to model
images, audio, and text from raw bytes, setting a new state of the art for
density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate
unconditional samples that demonstrate global coherence and great diversity,
and show it is possible in principle to use self-attention to model sequences
of length one million or more.},
 bibtype = {article}
}

Downloads: 0

{"_id":"JG64Kr62NDP5J7q6Z","bibbaseid":"child-gray-radford-sutskever-generatinglongsequenceswithsparsetransformers-2019","authorIDs":[],"author_short":["Child, R.","Gray, S.","Radford, A.","Sutskever, I."],"bibdata":{"title":"Generating Long Sequences with Sparse Transformers","type":"article","year":"2019","websites":"https://arxiv.org/abs/1904.10509v1","month":"4","day":"23","id":"454dd0b7-73d2-3c25-8e02-db7e7e07de48","created":"2021-09-03T07:12:51.573Z","accessed":"2021-09-03","file_attached":"true","profile_id":"48fc0258-023d-3602-860e-824092d62c56","group_id":"1ff583c0-be37-34fa-9c04-73c69437d354","last_modified":"2021-09-03T07:12:55.333Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"folder_uuids":"8d050117-e419-4b32-ad70-c875c74fa2b4","private_publication":false,"abstract":"Transformers are powerful sequence models, but require time and memory that\ngrows quadratically with the sequence length. In this paper we introduce sparse\nfactorizations of the attention matrix which reduce this to $O(n \\sqrtn)$. We\nalso introduce a) a variation on architecture and initialization to train\ndeeper networks, b) the recomputation of attention matrices to save memory, and\nc) fast attention kernels for training. We call networks with these changes\nSparse Transformers, and show they can model sequences tens of thousands of\ntimesteps long using hundreds of layers. We use the same architecture to model\nimages, audio, and text from raw bytes, setting a new state of the art for\ndensity modeling of Enwik8, CIFAR-10, and ImageNet-64. 
We generate\nunconditional samples that demonstrate global coherence and great diversity,\nand show it is possible in principle to use self-attention to model sequences\nof length one million or more.","bibtype":"article","author":"Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya","bibtex":"@article{\n title = {Generating Long Sequences with Sparse Transformers},\n type = {article},\n year = {2019},\n websites = {https://arxiv.org/abs/1904.10509v1},\n month = {4},\n day = {23},\n id = {454dd0b7-73d2-3c25-8e02-db7e7e07de48},\n created = {2021-09-03T07:12:51.573Z},\n accessed = {2021-09-03},\n file_attached = {true},\n profile_id = {48fc0258-023d-3602-860e-824092d62c56},\n group_id = {1ff583c0-be37-34fa-9c04-73c69437d354},\n last_modified = {2021-09-03T07:12:55.333Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4},\n private_publication = {false},\n abstract = {Transformers are powerful sequence models, but require time and memory that\ngrows quadratically with the sequence length. In this paper we introduce sparse\nfactorizations of the attention matrix which reduce this to $O(n \\sqrtn)$. We\nalso introduce a) a variation on architecture and initialization to train\ndeeper networks, b) the recomputation of attention matrices to save memory, and\nc) fast attention kernels for training. We call networks with these changes\nSparse Transformers, and show they can model sequences tens of thousands of\ntimesteps long using hundreds of layers. We use the same architecture to model\nimages, audio, and text from raw bytes, setting a new state of the art for\ndensity modeling of Enwik8, CIFAR-10, and ImageNet-64. 
We generate\nunconditional samples that demonstrate global coherence and great diversity,\nand show it is possible in principle to use self-attention to model sequences\nof length one million or more.},\n bibtype = {article},\n author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya}\n}","author_short":["Child, R.","Gray, S.","Radford, A.","Sutskever, I."],"urls":{"Paper":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/4abf689f-6eb3-a5cf-3da9-1ec3a89bd59a/full_text.pdf.pdf","Website":"https://arxiv.org/abs/1904.10509v1"},"biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","bibbaseid":"child-gray-radford-sutskever-generatinglongsequenceswithsparsetransformers-2019","role":"author","metadata":{"authorlinks":{}},"downloads":0},"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","creationDate":"2021-02-12T21:37:01.624Z","downloads":0,"keywords":[],"search_terms":["generating","long","sequences","sparse","transformers","child","gray","radford","sutskever"],"title":"Generating Long Sequences with Sparse Transformers","year":2019,"dataSources":["qLJ7Ld8T2ZKybATHB","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}