A review of recent advances in visual speech decoding. Zhou, Z., Zhao, G., Hong, X., & Pietikäinen, M. Image and Vision Computing, 32(9):590–605, 2014.
abstract   bibtex   
Visual speech information plays an important role in automatic speech recognition (ASR) especially when audio is corrupted or even inaccessible. Despite the success of audio-based ASR, the problem of visual speech decoding remains widely open. This paper provides a detailed review of recent advances in this research area. In comparison with the previous survey which covers the whole ASR system that uses visual speech information, we focus on the important questions asked by researchers and summarize the recent studies that attempt to answer them. In particular, there are three questions related to the extraction of visual features, concerning the speaker dependency, pose variation and temporal information, respectively. Another question is about the audio-visual speech fusion, considering the dynamic changes of modality reliabilities encountered in practice. In addition, the state-of-the-art on facial landmark localization is briefly introduced in this paper. Those advanced techniques can be used to improve the region-of-interest detection, but have been largely ignored when building a visual-based ASR system. We also provide details of the audio-visual speech databases. Finally, we discuss the remaining challenges and offer our insights into the future research on visual speech decoding.
@article{mvg:1854,
 title = {A review of recent advances in visual speech decoding},
 type = {article},
 year = {2014},
 pages = {590--605},
 volume = {32},
 id = {9c397efd-dc37-3789-b012-8b3bd824f57e},
 created = {2019-11-19T13:01:14.903Z},
 file_attached = {false},
 profile_id = {bddcf02d-403b-3b06-9def-6d15cc293e20},
 group_id = {17585b85-df99-3a34-98c2-c73e593397d7},
 last_modified = {2019-11-19T13:49:10.258Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {mvg:1854},
 source_type = {article},
 private_publication = {false},
 abstract = {Visual speech information plays an important role in automatic speech recognition (ASR) especially when audio is corrupted or even inaccessible. Despite the success of audio-based ASR, the problem of visual speech decoding remains widely open. This paper provides a detailed review of recent advances in this research area. In comparison with the previous survey which covers the whole ASR system that uses visual speech information, we focus on the important questions asked by researchers and summarize the recent studies that attempt to answer them. In particular, there are three questions related to the extraction of visual features, concerning the speaker dependency, pose variation and temporal information, respectively. Another question is about the audio-visual speech fusion, considering the dynamic changes of modality reliabilities encountered in practice. In addition, the state-of-the-art on facial landmark localization is briefly introduced in this paper. Those advanced techniques can be used to improve the region-of-interest detection, but have been largely ignored when building a visual-based ASR system. We also provide details of the audio-visual speech databases. Finally, we discuss the remaining challenges and offer our insights into the future research on visual speech decoding.},
 bibtype = {article},
 author = {Zhou, Z. and Zhao, G. and Hong, X. and Pietik{\"a}inen, M.},
 journal = {Image and Vision Computing},
 number = {9}
}

Downloads: 0