Towards end-to-end speech recognition with recurrent neural networks. Graves, A. & Jaitly, N. In Proceedings of the 31st International Conference on Machine Learning (ICML'14), Volume 32, pages II-1764–II-1772, Beijing, China, June 2014. JMLR.org.
This paper presents a speech recognition system that directly transcribes audio data with text, without requiring an intermediate phonetic representation. The system is based on a combination of the deep bidirectional LSTM recurrent neural network architecture and the Connectionist Temporal Classification objective function. A modification to the objective function is introduced that trains the network to minimise the expectation of an arbitrary transcription loss function. This allows a direct optimisation of the word error rate, even in the absence of a lexicon or language model. The system achieves a word error rate of 27.3% on the Wall Street Journal corpus with no prior linguistic information, 21.9% with only a lexicon of allowed words, and 8.2% with a trigram language model. Combining the network with a baseline system further reduces the error rate to 6.7%.
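For readers skimming the entry, the training setup the abstract describes (a deep bidirectional LSTM over acoustic frames, optimised with the Connectionist Temporal Classification objective) can be sketched in a few lines. The following is a minimal illustration using PyTorch's nn.CTCLoss, not the authors' implementation; the network sizes, feature dimension, and character inventory are assumptions, and the paper's expected-transcription-loss modification is not shown.

import torch
import torch.nn as nn

NUM_CHARS = 28   # assumed: 26 letters + space + apostrophe; class 0 is the CTC blank
FEAT_DIM = 40    # assumed filterbank feature dimension

class BiLSTMCTC(nn.Module):
    def __init__(self, hidden=128, layers=2):
        super().__init__()
        self.rnn = nn.LSTM(FEAT_DIM, hidden, num_layers=layers,
                           bidirectional=True, batch_first=True)
        self.proj = nn.Linear(2 * hidden, NUM_CHARS + 1)  # +1 for the blank label

    def forward(self, x):
        h, _ = self.rnn(x)                       # (batch, time, 2*hidden)
        return self.proj(h).log_softmax(dim=-1)  # per-frame log-probs over chars + blank

model = BiLSTMCTC()
ctc = nn.CTCLoss(blank=0)

x = torch.randn(4, 200, FEAT_DIM)                    # 4 utterances, 200 frames each
targets = torch.randint(1, NUM_CHARS + 1, (4, 30))   # character indices (0 reserved for blank)
input_lens = torch.full((4,), 200, dtype=torch.long)
target_lens = torch.full((4,), 30, dtype=torch.long)

log_probs = model(x).transpose(0, 1)  # CTCLoss expects (time, batch, classes)
loss = ctc(log_probs, targets, input_lens, target_lens)
loss.backward()

CTC marginalises over all frame-level alignments of the target character sequence, which is what lets the network transcribe audio directly to text without an intermediate phonetic representation.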
@inproceedings{graves_towards_2014,
	address = {Beijing, China},
	series = {{ICML}'14},
	title = {Towards end-to-end speech recognition with recurrent neural networks},
	abstract = {This paper presents a speech recognition system that directly transcribes audio data with text, without requiring an intermediate phonetic representation. The system is based on a combination of the deep bidirectional LSTM recurrent neural network architecture and the Connectionist Temporal Classification objective function. A modification to the objective function is introduced that trains the network to minimise the expectation of an arbitrary transcription loss function. This allows a direct optimisation of the word error rate, even in the absence of a lexicon or language model. The system achieves a word error rate of 27.3\% on the Wall Street Journal corpus with no prior linguistic information, 21.9\% with only a lexicon of allowed words, and 8.2\% with a trigram language model. Combining the network with a baseline system further reduces the error rate to 6.7\%.},
	language = {en},
	booktitle = {Proceedings of the 31st {International} {Conference} on {Machine} {Learning} ({ICML}'14) - {Volume} 32},
	publisher = {JMLR.org},
	author = {Graves, Alex and Jaitly, Navdeep},
	month = jun,
	year = {2014},
	pages = {II-1764--II-1772},
}
