Aleksic, P. S. & Katsaggelos, A. K. Speech-to-video synthesis using facial animation parameters. In Proceedings 2003 International Conference on Image Processing (Cat. No.03CH37429), volume 2, pages III-1–4. IEEE, 2003.
@inproceedings{Petar2003,
abstract = {The presence of visual information in addition to audio could improve speech understanding in noisy environments. This additional information could be especially useful for people with impaired hearing who are able to speechread. This paper focuses on the problem of synthesizing the Facial Animation Parameters (FAPs), supported by the MPEG-4 standard for the visual representation of speech, from a narrowband acoustic speech (telephone) signal. A correlation Hidden Markov Model (CHMM) system for performing visual speech synthesis is proposed. The CHMM system integrates an independently trained acoustic HMM (AHMM) system and a visual HMM (VHMM) system, in order to realize speech-to-video synthesis. Objective experiments are performed by analyzing the synthesized FAPs and computing the time alignment errors. Time alignment errors are reduced by 40.5% compared to the conventional temporal scaling method.},
author = {Aleksic, P.S. and Katsaggelos, A.K.},
booktitle = {Proceedings 2003 International Conference on Image Processing (Cat. No.03CH37429)},
doi = {10.1109/ICIP.2003.1247166},
isbn = {0-7803-7750-8},
pages = {III--1--4},
publisher = {IEEE},
title = {{Speech-to-video synthesis using facial animation parameters}},
url = {http://ieeexplore.ieee.org/document/1247166/},
volume = {2},
year = {2003}
}