Generating Long Sequences with Sparse Transformers. Child, R., Gray, S., Radford, A., & Sutskever, I. April 23, 2019.
Paper: https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/4abf689f-6eb3-a5cf-3da9-1ec3a89bd59a/full_text.pdf.pdf
Website: https://arxiv.org/abs/1904.10509v1

Abstract: Transformers are powerful sequence models, but require time and memory that
grows quadratically with the sequence length. In this paper we introduce sparse
factorizations of the attention matrix which reduce this to $O(n \sqrt{n})$. We
also introduce a) a variation on architecture and initialization to train
deeper networks, b) the recomputation of attention matrices to save memory, and
c) fast attention kernels for training. We call networks with these changes
Sparse Transformers, and show they can model sequences tens of thousands of
timesteps long using hundreds of layers. We use the same architecture to model
images, audio, and text from raw bytes, setting a new state of the art for
density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate
unconditional samples that demonstrate global coherence and great diversity,
and show it is possible in principle to use self-attention to model sequences
of length one million or more.
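
The factorized attention the abstract refers to replaces each dense causal attention row (up to n entries) with roughly 2*sqrt(n) entries: a local window of the previous l positions plus every l-th earlier position, with l on the order of sqrt(n). As a rough, hedged illustration only (this is not the authors' fused-kernel implementation; the function name, the dense boolean mask, and the parameter values below are assumptions for demonstration), the strided pattern can be sketched as:

    import numpy as np

    def strided_sparse_mask(n, stride):
        """Illustrative (hypothetical) causal mask for the strided sparse pattern:
        position i attends to the previous `stride` positions and to every
        `stride`-th earlier position. With stride ~ sqrt(n), each row has
        O(sqrt(n)) nonzeros, so total work is O(n * sqrt(n)) rather than O(n^2)."""
        mask = np.zeros((n, n), dtype=bool)
        for i in range(n):
            for j in range(i + 1):                 # causal: attend only to the past
                local = (i - j) < stride           # sliding local window
                summary = (i - j) % stride == 0    # strided "summary" positions
                mask[i, j] = local or summary
        return mask

    # Example: n = 64, stride = 8 (about sqrt(64)); compare against dense causal attention.
    mask = strided_sparse_mask(64, 8)
    print(mask.sum(), "attended pairs vs", 64 * 65 // 2, "for dense causal attention")

In the paper this sparsity is realized with custom attention kernels and split across attention heads rather than by materializing an n-by-n mask; the dense mask here only serves to make the pattern visible.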
@article{child2019sparse,
title = {Generating Long Sequences with Sparse Transformers},
type = {article},
year = {2019},
websites = {https://arxiv.org/abs/1904.10509v1},
month = {4},
day = {23},
abstract = {Transformers are powerful sequence models, but require time and memory that
grows quadratically with the sequence length. In this paper we introduce sparse
factorizations of the attention matrix which reduce this to $O(n \sqrt{n})$. We
also introduce a) a variation on architecture and initialization to train
deeper networks, b) the recomputation of attention matrices to save memory, and
c) fast attention kernels for training. We call networks with these changes
Sparse Transformers, and show they can model sequences tens of thousands of
timesteps long using hundreds of layers. We use the same architecture to model
images, audio, and text from raw bytes, setting a new state of the art for
density modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate
unconditional samples that demonstrate global coherence and great diversity,
and show it is possible in principle to use self-attention to model sequences
of length one million or more.},
author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya}
}
{"_id":"JG64Kr62NDP5J7q6Z","bibbaseid":"child-gray-radford-sutskever-generatinglongsequenceswithsparsetransformers-2019","authorIDs":[],"author_short":["Child, R.","Gray, S.","Radford, A.","Sutskever, I."],"bibdata":{"title":"Generating Long Sequences with Sparse Transformers","type":"article","year":"2019","websites":"https://arxiv.org/abs/1904.10509v1","month":"4","day":"23","id":"454dd0b7-73d2-3c25-8e02-db7e7e07de48","created":"2021-09-03T07:12:51.573Z","accessed":"2021-09-03","file_attached":"true","profile_id":"48fc0258-023d-3602-860e-824092d62c56","group_id":"1ff583c0-be37-34fa-9c04-73c69437d354","last_modified":"2021-09-03T07:12:55.333Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"folder_uuids":"8d050117-e419-4b32-ad70-c875c74fa2b4","private_publication":false,"abstract":"Transformers are powerful sequence models, but require time and memory that\ngrows quadratically with the sequence length. In this paper we introduce sparse\nfactorizations of the attention matrix which reduce this to $O(n \\sqrtn)$. We\nalso introduce a) a variation on architecture and initialization to train\ndeeper networks, b) the recomputation of attention matrices to save memory, and\nc) fast attention kernels for training. We call networks with these changes\nSparse Transformers, and show they can model sequences tens of thousands of\ntimesteps long using hundreds of layers. We use the same architecture to model\nimages, audio, and text from raw bytes, setting a new state of the art for\ndensity modeling of Enwik8, CIFAR-10, and ImageNet-64. We generate\nunconditional samples that demonstrate global coherence and great diversity,\nand show it is possible in principle to use self-attention to model sequences\nof length one million or more.","bibtype":"article","author":"Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya","bibtex":"@article{\n title = {Generating Long Sequences with Sparse Transformers},\n type = {article},\n year = {2019},\n websites = {https://arxiv.org/abs/1904.10509v1},\n month = {4},\n day = {23},\n id = {454dd0b7-73d2-3c25-8e02-db7e7e07de48},\n created = {2021-09-03T07:12:51.573Z},\n accessed = {2021-09-03},\n file_attached = {true},\n profile_id = {48fc0258-023d-3602-860e-824092d62c56},\n group_id = {1ff583c0-be37-34fa-9c04-73c69437d354},\n last_modified = {2021-09-03T07:12:55.333Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n folder_uuids = {8d050117-e419-4b32-ad70-c875c74fa2b4},\n private_publication = {false},\n abstract = {Transformers are powerful sequence models, but require time and memory that\ngrows quadratically with the sequence length. In this paper we introduce sparse\nfactorizations of the attention matrix which reduce this to $O(n \\sqrtn)$. We\nalso introduce a) a variation on architecture and initialization to train\ndeeper networks, b) the recomputation of attention matrices to save memory, and\nc) fast attention kernels for training. We call networks with these changes\nSparse Transformers, and show they can model sequences tens of thousands of\ntimesteps long using hundreds of layers. We use the same architecture to model\nimages, audio, and text from raw bytes, setting a new state of the art for\ndensity modeling of Enwik8, CIFAR-10, and ImageNet-64. 
We generate\nunconditional samples that demonstrate global coherence and great diversity,\nand show it is possible in principle to use self-attention to model sequences\nof length one million or more.},\n bibtype = {article},\n author = {Child, Rewon and Gray, Scott and Radford, Alec and Sutskever, Ilya}\n}","author_short":["Child, R.","Gray, S.","Radford, A.","Sutskever, I."],"urls":{"Paper":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c/file/4abf689f-6eb3-a5cf-3da9-1ec3a89bd59a/full_text.pdf.pdf","Website":"https://arxiv.org/abs/1904.10509v1"},"biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","bibbaseid":"child-gray-radford-sutskever-generatinglongsequenceswithsparsetransformers-2019","role":"author","metadata":{"authorlinks":{}},"downloads":0},"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/bfbbf840-4c42-3914-a463-19024f50b30c","creationDate":"2021-02-12T21:37:01.624Z","downloads":0,"keywords":[],"search_terms":["generating","long","sequences","sparse","transformers","child","gray","radford","sutskever"],"title":"Generating Long Sequences with Sparse Transformers","year":2019,"dataSources":["qLJ7Ld8T2ZKybATHB","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}