\n
\n\n \n \n \n \n \n \n Improving phoneme recognition of throat microphone speech recordings using transfer learning.\n \n \n \n \n\n\n \n Turan, M. T.; and Erzin, E.\n\n\n \n\n\n\n
Speech Communication, 129: 25-32. 2021.\n
\n\n
\n\n
\n\n
\n\n \n \n Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{TURAN202125,
  author    = {Turan, Mehmet Ali Tu{\u{g}}tekin and Erzin, Engin},
  title     = {Improving Phoneme Recognition of Throat Microphone Speech Recordings Using Transfer Learning},
  journal   = {Speech Communication},
  volume    = {129},
  pages     = {25--32},
  year      = {2021},
  issn      = {0167-6393},
  doi       = {10.1016/j.specom.2021.02.004},
  url       = {https://www.sciencedirect.com/science/article/pii/S0167639321000200},
  keywords  = {Phoneme recognition, Feature augmentation, Transfer learning, Throat microphone, Denoising auto-encoder, MSP},
  abstract  = {Throat microphones (TM) are a type of skin-attached non-acoustic sensors, which are robust to environmental noise but carry a lower signal bandwidth characterization than the traditional close-talk microphones (CM). Attaining high-performance phoneme recognition is a challenging task when the training data from a degrading channel, such as TM, is limited. In this paper, we address this challenge for the TM speech recordings using a transfer learning approach based on the stacked denoising auto-encoders (SDA). The proposed transfer learning approach defines an SDA-based domain adaptation framework to map the source domain CM representations and the target domain TM representations into a common latent space, where the mismatch across TM and CM is eliminated to better train an acoustic model and to improve the TM phoneme recognition. For the phoneme recognition task, we use the convolutional neural network (CNN) and the hidden Markov model (HMM) based CNN/HMM hybrid system, which delivers better acoustic modeling performance compared to the conventional Gaussian mixture model (GMM) based models. In the experimental evaluations, we observed more than 12\% relative phoneme error rate (PER) improvement for the TM recordings with the proposed transfer learning approach compared to baseline performances.},
  publisher = {Elsevier B.V.},
}
\n
\n\n\n
\n Throat microphones (TM) are a type of skin-attached non-acoustic sensors, which are robust to environmental noise but carry a lower signal bandwidth characterization than the traditional close-talk microphones (CM). Attaining high-performance phoneme recognition is a challenging task when the training data from a degrading channel, such as TM, is limited. In this paper, we address this challenge for the TM speech recordings using a transfer learning approach based on the stacked denoising auto-encoders (SDA). The proposed transfer learning approach defines an SDA-based domain adaptation framework to map the source domain CM representations and the target domain TM representations into a common latent space, where the mismatch across TM and CM is eliminated to better train an acoustic model and to improve the TM phoneme recognition. For the phoneme recognition task, we use the convolutional neural network (CNN) and the hidden Markov model (HMM) based CNN/HMM hybrid system, which delivers better acoustic modeling performance compared to the conventional Gaussian mixture model (GMM) based models. In the experimental evaluations, we observed more than 12% relative phoneme error rate (PER) improvement for the TM recordings with the proposed transfer learning approach compared to baseline performances.\n
\n\n\n
\n
\n\n \n \n \n \n \n \n Domain Adaptation for Food Intake Classification with Teacher/Student Learning.\n \n \n \n \n\n\n \n Turan, M. A. T.; and Erzin, E.\n\n\n \n\n\n\n
IEEE Transactions on Multimedia, 23: 4220 - 4231. 2021.\n
\n\n
\n\n
\n\n
\n\n \n \n Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n\n\n\n
\n
@article{turan2020domain,
  author    = {Turan, Mehmet Ali Tu{\u{g}}tekin and Erzin, Engin},
  title     = {Domain Adaptation for Food Intake Classification with Teacher/Student Learning},
  journal   = {IEEE Transactions on Multimedia},
  volume    = {23},
  pages     = {4220--4231},
  year      = {2021},
  doi       = {10.1109/TMM.2020.3038315},
  url       = {https://ieeexplore.ieee.org/document/9261115},
  keywords  = {MSP},
  abstract  = {Automatic dietary monitoring (ADM) stands as a challenging application in wearable healthcare technologies. In this paper, we define an ADM to perform food intake classification (FIC) over throat microphone recordings. We investigate the use of transfer learning to design an improved FIC system. Although labeled data with acoustic close-talk microphones are abundant, throat data is scarce. Therefore, we propose a new adaptation framework based on teacher/student learning. The teacher network is trained over high-quality acoustic microphone recordings, whereas the student network distills deep feature extraction capacity of the teacher over a parallel dataset. Our approach allows us to transfer the representational capacity, adds robustness to the resulting model, and improves the FIC through throat microphone recordings. The classification problem is formulated as a spectra-temporal sequence recognition using the Convolutional LSTM (ConvLSTM) models. We evaluate the proposed approach using a large scale acoustic dataset collected from online recordings, an in-house food intake throat microphone dataset, and a parallel speech dataset. The bidirectional ConvLSTM network with the proposed domain adaptation approach consistently outperforms the SVM- and CNN-based baseline methods and attains 85.2\% accuracy for the classification of 10 different food intake items. This translates to 17.8\% accuracy improvement with the proposed domain adaptation.},
  publisher = {IEEE},
}
\n
\n\n\n
\n Automatic dietary monitoring (ADM) stands as a challenging application in wearable healthcare technologies. In this paper, we define an ADM to perform food intake classification (FIC) over throat microphone recordings. We investigate the use of transfer learning to design an improved FIC system. Although labeled data with acoustic close-talk microphones are abundant, throat data is scarce. Therefore, we propose a new adaptation framework based on teacher/student learning. The teacher network is trained over high-quality acoustic microphone recordings, whereas the student network distills deep feature extraction capacity of the teacher over a parallel dataset. Our approach allows us to transfer the representational capacity, adds robustness to the resulting model, and improves the FIC through throat microphone recordings. The classification problem is formulated as a spectra-temporal sequence recognition using the Convolutional LSTM (ConvLSTM) models. We evaluate the proposed approach using a large scale acoustic dataset collected from online recordings, an in-house food intake throat microphone dataset, and a parallel speech dataset. The bidirectional ConvLSTM network with the proposed domain adaptation approach consistently outperforms the SVM- and CNN-based baseline methods and attains 85.2% accuracy for the classification of 10 different food intake items. This translates to 17.8% accuracy improvement with the proposed domain adaptation.\n
\n\n\n