Grapheme-to-Phoneme Conversion with Convolutional Neural Networks. Yolchuyeva, S., Németh, G., & Gyires-Tóth, B. Applied Sciences, 9(6):1143, January, 2019. Publisher: Multidisciplinary Digital Publishing Institute.
Grapheme-to-Phoneme Conversion with Convolutional Neural Networks [link]Paper  doi  abstract   bibtex   
Grapheme-to-phoneme (G2P) conversion is the process of generating pronunciation for words based on their written form. It has a highly essential role for natural language processing, text-to-speech synthesis and automatic speech recognition systems. In this paper, we investigate convolutional neural networks (CNN) for G2P conversion. We propose a novel CNN-based sequence-to-sequence (seq2seq) architecture for G2P conversion. Our approach includes an end-to-end CNN G2P conversion with residual connections and, furthermore, a model that utilizes a convolutional neural network (with and without residual connections) as encoder and Bi-LSTM as a decoder. We compare our approach with state-of-the-art methods, including Encoder-Decoder LSTM and Encoder-Decoder Bi-LSTM. Training and inference times, phoneme and word error rates were evaluated on the public CMUDict dataset for US English, and the best performing convolutional neural network-based architecture was also evaluated on the NetTalk dataset. Our method approaches the accuracy of previous state-of-the-art results in terms of phoneme error rate.
@comment{Journal article in Applied Sciences, volume 9, issue 6. The journal
identifies articles by article number rather than a page range, so `pages`
holds the article number 1143. Accented letters in the author names use
brace-wrapped LaTeX escapes so the entry also sorts correctly under classic
8-bit BibTeX.}
@article{yolchuyeva_grapheme--phoneme_2019,
	author = {Yolchuyeva, Sevinj and N{\'e}meth, G{\'e}za and Gyires-T{\'o}th, B{\'a}lint},
	title = {Grapheme-to-Phoneme Conversion with Convolutional Neural Networks},
	journal = {Applied Sciences},
	volume = {9},
	number = {6},
	pages = {1143},
	month = jan,
	year = {2019},
	publisher = {Multidisciplinary Digital Publishing Institute},
	issn = {2076-3417},
	doi = {10.3390/app9061143},
	url = {https://www.mdpi.com/2076-3417/9/6/1143},
	urldate = {2022-10-08},
	language = {en},
	copyright = {http://creativecommons.org/licenses/by/3.0/},
	keywords = {1D convolution, Bi-LSTM, LSTM, encoder-decoder, grapheme-to-phoneme (G2P), residual architecture},
	abstract = {Grapheme-to-phoneme (G2P) conversion is the process of generating pronunciation for words based on their written form. It has a highly essential role for natural language processing, text-to-speech synthesis and automatic speech recognition systems. In this paper, we investigate convolutional neural networks (CNN) for G2P conversion. We propose a novel CNN-based sequence-to-sequence (seq2seq) architecture for G2P conversion. Our approach includes an end-to-end CNN G2P conversion with residual connections and, furthermore, a model that utilizes a convolutional neural network (with and without residual connections) as encoder and Bi-LSTM as a decoder. We compare our approach with state-of-the-art methods, including Encoder-Decoder LSTM and Encoder-Decoder Bi-LSTM. Training and inference times, phoneme and word error rates were evaluated on the public CMUDict dataset for US English, and the best performing convolutional neural network-based architecture was also evaluated on the NetTalk dataset. Our method approaches the accuracy of previous state-of-the-art results in terms of phoneme error rate.},
}

Downloads: 0