Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition

Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition. Terry, L. H., Shiell, D. J., & Katsaggelos, A. K. In 2008 15th IEEE International Conference on Image Processing, pages 1316–1319, 2008. IEEE.

Paper doi abstract bibtex

Most current audio-visual automatic speech recognition (AV-ASR) systems use static weights to leverage between audio and visual information during information fusion. State of the art research has led to using audio reliability metrics for dynamically changing the fusion weights in order to successfully improve overall recognition results. So far, however, incorporating visual reliability metrics into these audio reliability metric based systems have not significantly improved performance. We introduce a new approach to this problem by inferring the "consistency" between the audio and visual information and leveraging the existing audio reliability metrics to create a video reliability metric. Our approach is formulated in the extracted feature space and, thus, does not rely on analyzing the actual video signal itself. The framework presented in this work competes with the audio-only reliability metric based systems and shows promise to consistently outperform. © 2008 IEEE.

% ICIP 2008 conference paper. The citation key is kept unchanged so existing \cite{Louis2008a} commands still resolve.
@inproceedings{Louis2008a,
  abstract  = {Most current audio-visual automatic speech recognition (AV-ASR) systems use static weights to leverage between audio and visual information during information fusion. State of the art research has led to using audio reliability metrics for dynamically changing the fusion weights in order to successfully improve overall recognition results. So far, however, incorporating visual reliability metrics into these audio reliability metric based systems have not significantly improved performance. We introduce a new approach to this problem by inferring the "consistency" between the audio and visual information and leveraging the existing audio reliability metrics to create a video reliability metric. Our approach is formulated in the extracted feature space and, thus, does not rely on analyzing the actual video signal itself. The framework presented in this work competes with the audio-only reliability metric based systems and shows promise to consistently outperform. {\textcopyright} 2008 IEEE.},
  author    = {Terry, Louis H. and Shiell, Derek J. and Katsaggelos, Aggelos K.},
  booktitle = {2008 15th IEEE International Conference on Image Processing},
  doi       = {10.1109/ICIP.2008.4712005},
  isbn      = {978-1-4244-1765-0},
  issn      = {1522-4880},
  keywords  = {Hidden Markov models,Speech recognition,Vector quantization},
  pages     = {1316--1319},
  publisher = {IEEE},
  title     = {Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition},
  url       = {http://ieeexplore.ieee.org/document/4712005/},
  year      = {2008},
}

Downloads: 0

{"_id":"f3LFHBsFyaXaZn8qy","bibbaseid":"terry-shiell-katsaggelos-featurespacevideostreamconsistencyestimationfordynamicstreamweightinginaudiovisualspeechrecognition-2008","author_short":["Terry, L. H.","Shiell, D. J.","Katsaggelos, A. K."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"Most current audio-visual automatic speech recognition (AV-ASR) systems use static weights to leverage between audio and visual information during information fusion. State of the art research has led to using audio reliability metrics for dynamically changing the fusion weights in order to successfully improve overall recognition results. So far, however, incorporating visual reliability metrics into these audio reliability metric based systems have not significantly improved performance. We introduce a new approach to this problem by inferring the \"consistency\" between the audio and visual information and leveraging the existing audio reliability metrics to create a video reliability metric. Our approach is formulated in the extractedfeature space and, thus, does not rely on analyzing the actual video signalitself. The framework presented in this work competes with the audio-onlyreliability metric based systems and shows promise to consistently outperform. 
© 2008 IEEE.","author":[{"propositions":[],"lastnames":["Terry"],"firstnames":["Louis","H."],"suffixes":[]},{"propositions":[],"lastnames":["Shiell"],"firstnames":["Derek","J."],"suffixes":[]},{"propositions":[],"lastnames":["Katsaggelos"],"firstnames":["Aggelos","K."],"suffixes":[]}],"booktitle":"2008 15th IEEE International Conference on Image Processing","doi":"10.1109/ICIP.2008.4712005","isbn":"978-1-4244-1765-0","issn":"15224880","keywords":"Hidden Markov models,Speech recognition,Vector quantization","pages":"1316–1319","publisher":"IEEE","title":"Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition","url":"http://ieeexplore.ieee.org/document/4712005/","year":"2008","bibtex":"@inproceedings{Louis2008a,\nabstract = {Most current audio-visual automatic speech recognition (AV-ASR) systems use static weights to leverage between audio and visual information during information fusion. State of the art research has led to using audio reliability metrics for dynamically changing the fusion weights in order to successfully improve overall recognition results. So far, however, incorporating visual reliability metrics into these audio reliability metric based systems have not significantly improved performance. We introduce a new approach to this problem by inferring the \"consistency\" between the audio and visual information and leveraging the existing audio reliability metrics to create a video reliability metric. Our approach is formulated in the extractedfeature space and, thus, does not rely on analyzing the actual video signalitself. The framework presented in this work competes with the audio-onlyreliability metric based systems and shows promise to consistently outperform. {\\textcopyright} 2008 IEEE.},\nauthor = {Terry, Louis H. and Shiell, Derek J. 
and Katsaggelos, Aggelos K.},\nbooktitle = {2008 15th IEEE International Conference on Image Processing},\ndoi = {10.1109/ICIP.2008.4712005},\nisbn = {978-1-4244-1765-0},\nissn = {15224880},\nkeywords = {Hidden Markov models,Speech recognition,Vector quantization},\npages = {1316--1319},\npublisher = {IEEE},\ntitle = {{Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition}},\nurl = {http://ieeexplore.ieee.org/document/4712005/},\nyear = {2008}\n}\n","author_short":["Terry, L. H.","Shiell, D. J.","Katsaggelos, A. K."],"key":"Louis2008a","id":"Louis2008a","bibbaseid":"terry-shiell-katsaggelos-featurespacevideostreamconsistencyestimationfordynamicstreamweightinginaudiovisualspeechrecognition-2008","role":"author","urls":{"Paper":"http://ieeexplore.ieee.org/document/4712005/"},"keyword":["Hidden Markov models","Speech recognition","Vector quantization"],"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://sites.northwestern.edu/ivpl/files/2023/06/IVPL_Updated_publications-1.bib","dataSources":["KTWAakbPXLGfYseXn","ePKPjG8C6yvpk4mEK","ya2CyA73rpZseyrZ8","qhF8zxmGcJfvtdeAg","fvDEHD49E2ZRwE3fb","H7crv8NWhZup4d4by","DHqokWsryttGh7pJE","vRJd4wNg9HpoZSMHD","sYxQ6pxFgA59JRhxi","w2WahSbYrbcCKBDsC","XasdXLL99y5rygCmq","3gkSihZQRfAD2KBo3","t5XMbyZbtPBo4wBGS","bEpHM2CtrwW2qE8FP","teJzFLHexaz5AQW5z"],"keywords":["hidden markov models","speech recognition","vector quantization"],"search_terms":["feature","space","video","stream","consistency","estimation","dynamic","stream","weighting","audio","visual","speech","recognition","terry","shiell","katsaggelos"],"title":"Feature space video stream consistency estimation for dynamic stream weighting in audio-visual speech recognition","year":2008}