Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations

Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations. Singla, K., Chen, Z., Atkins, D., & Narayanan, S. In Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics, pages 3797–3803, Online, July 2020. Association for Computational Linguistics.
doi abstract bibtex

Spoken language understanding tasks usually rely on pipelines involving complex processing blocks such as voice activity detection, speaker diarization and Automatic speech recognition (ASR). We propose a novel framework for predicting utterance level labels directly from speech features, thus removing the dependency on first generating transcripts, and transcription free behavioral coding. Our classifier uses a pretrained Speech-2-Vector encoder as bottleneck to generate word-level representations from speech features. This pretrained encoder learns to encode speech features for a word using an objective similar to Word2Vec. Our proposed approach just uses speech features and word segmentation information for predicting spoken utterance-level target labels. We show that our model achieves competitive results to other state-of-the-art approaches which use transcribed text for the task of predicting psychotherapy-relevant behavior codes.

@comment{
  Conference paper from the ACL 2020 main proceedings (see doi below).
  Conventions used in this entry:
    - month is the predefined three-letter macro (unbraced) so that the
      bibliography style decides how it is printed;
    - pages uses a double hyphen, which styles typeset as an en dash;
    - the PDF location is stored in the standard url field; it was
      previously kept in a non-standard field named link, which
      bibliography styles silently ignore.
}
@inproceedings{Singla2020End2EndBehavioralCoding,
  author    = {Singla, Karan and Chen, Zhuohao and Atkins, David and Narayanan, Shrikanth},
  title     = {Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations},
  booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},
  year      = {2020},
  month     = jul,
  pages     = {3797--3803},
  address   = {Online},
  publisher = {Association for Computational Linguistics},
  doi       = {10.18653/v1/2020.acl-main.351},
  url       = {https://www.aclweb.org/anthology/2020.acl-main.351.pdf},
  abstract  = {Spoken language understanding tasks usually rely on pipelines involving complex processing blocks such as voice activity detection, speaker diarization and Automatic speech recognition (ASR). We propose a novel framework for predicting utterance level labels directly from speech features, thus removing the dependency on first generating transcripts, and transcription free behavioral coding. Our classifier uses a pretrained Speech-2-Vector encoder as bottleneck to generate word-level representations from speech features. This pretrained encoder learns to encode speech features for a word using an objective similar to Word2Vec. Our proposed approach just uses speech features and word segmentation information for predicting spoken utterance-level target labels. We show that our model achieves competitive results to other state-of-the-art approaches which use transcribed text for the task of predicting psychotherapy-relevant behavior codes.},
}

Downloads: 0

{"_id":"wNLy9NQQt7PatTnPf","bibbaseid":"singla-chen-atkins-narayanan-towardsend2endlearningforpredictingbehaviorcodesfromspokenutterancesinpsychotherapyconversations-2020","author_short":["Singla, K.","Chen, Z.","Atkins, D.","Narayanan, S."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"Spoken language understanding tasks usually rely on pipelines involving complex processing blocks such as voice activity detection, speaker diarization and Automatic speech recognition (ASR). We propose a novel framework for predicting utterance level labels directly from speech features, thus removing the dependency on first generating transcripts, and transcription free behavioral coding. Our classifier uses a pretrained Speech-2-Vector encoder as bottleneck to generate word-level representations from speech features. This pretrained encoder learns to encode speech features for a word using an objective similar to Word2Vec. Our proposed approach just uses speech features and word segmentation information for predicting spoken utterance-level target labels. 
We show that our model achieves competitive results to other state-of-the-art approaches which use transcribed text for the task of predicting psychotherapy-relevant behavior codes.","address":"Online","author":[{"propositions":[],"lastnames":["Singla"],"firstnames":["Karan"],"suffixes":[]},{"propositions":[],"lastnames":["Chen"],"firstnames":["Zhuohao"],"suffixes":[]},{"propositions":[],"lastnames":["Atkins"],"firstnames":["David"],"suffixes":[]},{"propositions":[],"lastnames":["Narayanan"],"firstnames":["Shrikanth"],"suffixes":[]}],"booktitle":"Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics","doi":"10.18653/v1/2020.acl-main.351","link":"https://www.aclweb.org/anthology/2020.acl-main.351.pdf","month":"Jul","pages":"3797-3803","publisher":"Association for Computational Linguistics","title":"Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations","year":"2020","bibtex":"@inproceedings{Singla2020End2EndBehavioralCoding,\n abstract = {Spoken language understanding tasks usually rely on pipelines involving complex processing blocks such as voice activity detection, speaker diarization and Automatic speech recognition (ASR). We propose a novel framework for predicting utterance level labels directly from speech features, thus removing the dependency on first generating transcripts, and transcription free behavioral coding. Our classifier uses a pretrained Speech-2-Vector encoder as bottleneck to generate word-level representations from speech features. This pretrained encoder learns to encode speech features for a word using an objective similar to Word2Vec. Our proposed approach just uses speech features and word segmentation information for predicting spoken utterance-level target labels. 
We show that our model achieves competitive results to other state-of-the-art approaches which use transcribed text for the task of predicting psychotherapy-relevant behavior codes.},\n address = {Online},\n author = {Singla, Karan and Chen, Zhuohao and Atkins, David and Narayanan, Shrikanth},\n booktitle = {Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics},\n doi = {10.18653/v1/2020.acl-main.351},\n link = {https://www.aclweb.org/anthology/2020.acl-main.351.pdf},\n month = {Jul},\n pages = {3797-3803},\n publisher = {Association for Computational Linguistics},\n title = {Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations},\n year = {2020}\n}\n\n","author_short":["Singla, K.","Chen, Z.","Atkins, D.","Narayanan, S."],"bibbaseid":"singla-chen-atkins-narayanan-towardsend2endlearningforpredictingbehaviorcodesfromspokenutterancesinpsychotherapyconversations-2020","role":"author","urls":{},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/f/nWhKb4SffvhfreEmj/shri-isi-edu.bib","dataSources":["P3nQrSLkFzGGSmKJQ","Reikhy6EiDXFTcuR9"],"keywords":[],"search_terms":["towards","end","end","learning","predicting","behavior","codes","spoken","utterances","psychotherapy","conversations","singla","chen","atkins","narayanan"],"title":"Towards end-2-end learning for predicting behavior codes from spoken utterances in psychotherapy conversations","year":2020}