Exploiting Visual Information in Automatic Speech Processing. Aleksic, P. S., Potamianos, G., & Katsaggelos, A. K. In Handbook of Image and Video Processing, pages 1263–XXXIX. Elsevier, 2005.
@incollection{Petar2005,
abstract = {This chapter focuses on how the joint processing of visual and audio signals, both generated by a talking person, can provide valuable speech information to benefit a number of audiovisual speech processing applications crucial to human-computer interactions. The analysis of visual signals has been done followed by a description of various possible ways of representing and extracting the speech information available in them. It has been shown in the chapter that the obtained visual features can complement features extracted from the acoustic signal and that the two modality representations can be fused together to allow joint audiovisual speech processing. The general bimodal integration framework is subsequently applied to three problems: automatic speech recognition, talking face synthesis, and speaker identification and authentication. In all three cases, issues specific to the particular application have been discussed, several relevant systems that have been reported in the literature have been reviewed, and the results using the implementations developed at IBM Research and Northwestern University have been presented. {\textcopyright} 2005 Elsevier Inc. All rights reserved.},
author = {Aleksic, Petar S. and Potamianos, Gerasimos and Katsaggelos, Aggelos K.},
booktitle = {Handbook of Image and Video Processing},
doi = {10.1016/B978-012119792-6/50134-0},
isbn = {9780121197926},
pages = {1263--XXXIX},
publisher = {Elsevier},
title = {{Exploiting Visual Information in Automatic Speech Processing}},
url = {https://linkinghub.elsevier.com/retrieve/pii/B9780121197926501340},
year = {2005}
}