<script src="https://bibbase.org/service/mendeley/fb8d345a-1d79-3791-a6c6-00233ea44521?jsonp=1"></script>
<?php
// Fetch the pre-rendered publication list from BibBase's Mendeley service
// and emit it into the page.
// NOTE(review): the response is echoed verbatim; BibBase returns HTML intended
// for embedding, but treat it as untrusted if this URL ever becomes dynamic.
$url = "https://bibbase.org/service/mendeley/fb8d345a-1d79-3791-a6c6-00233ea44521";
$contents = file_get_contents($url);
if ($contents === false) {
    // file_get_contents() returns false on failure (network error, HTTP error,
    // allow_url_fopen disabled); report it instead of silently printing nothing.
    echo "Failed to retrieve the publication list from BibBase.";
} else {
    // print_r() on a string is equivalent to echo; use echo directly.
    echo $contents;
}
?>
<iframe src="https://bibbase.org/service/mendeley/fb8d345a-1d79-3791-a6c6-00233ea44521"></iframe>
For more details see the documentation.
To the site owner:
Action required! Mendeley is changing its API. In order to keep using Mendeley with BibBase past April 14th, you need to:
@inproceedings{
title = {Gesticulator: A framework for semantically-aware speech-driven gesture generation},
type = {inproceedings},
year = {2020},
id = {84c5dc7f-f42a-373c-ab67-7c3404947c94},
created = {2020-02-19T11:04:42.636Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-21T13:30:48.195Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {kucherenko2020gesticulator},
source_type = {misc},
folder_uuids = {b9e8b2b5-59ea-4ffd-818c-22a0d98c2144},
private_publication = {false},
abstract = {During speech, people spontaneously gesticulate, which plays a key role in conveying information. Similarly, realistic co-speech gestures are crucial to enable natural and smooth interactions with social agents. Current data-driven co-speech gesture generation systems use a single modality for representing speech: either audio or text. These systems are therefore confined to producing either acoustically-linked beat gestures or semantically-linked gesticulation (e.g., raising a hand when saying ``high''): they cannot appropriately learn to generate both gesture types. We present a model designed to produce arbitrary beat and semantic gestures together. Our deep-learning based model takes both acoustic and semantic representations of speech as input, and generates gestures as a sequence of joint angle rotations as output. The resulting gestures can be applied to both virtual agents and humanoid robots. Subjective and objective evaluations confirm the success of our approach. The code is publicly available at https://github.com/Svito-zar/gesticulator.},
bibtype = {inproceedings},
author = {Kucherenko, Taras and Jonell, Patrik and van Waveren, Sanne and Henter, Gustav Eje and Alexanderson, Simon and Leite, Iolanda and Kjellström, Hedvig},
booktitle = {Proceedings of the 22nd ACM International Conference on Multimodal Interaction (ICMI'20)}
}
@inproceedings{
title = {Let's Face It: Probabilistic Multi-Modal Interlocutor-Aware Generation of Facial Gestures in Dyadic Settings},
type = {inproceedings},
year = {2020},
websites = {https://dl.acm.org/doi/10.1145/3383652.3423911},
month = {10},
publisher = {Association for Computing Machinery},
day = {20},
id = {126215d3-3019-37b7-ac2d-284c76a30605},
created = {2020-07-06T09:10:49.587Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-11-08T21:58:48.299Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {jonell2020lets},
private_publication = {false},
abstract = {To enable more natural face-to-face interactions, conversational agents need to adapt their behavior to their interlocutors. One key aspect of this is generation of appropriate non-verbal behavior for the agent, for example facial gestures, here defined as facial expressions and head movements. Most existing gesture-generating systems do not utilize multi-modal cues from the interlocutor when synthesizing non-verbal behavior. Those that do, typically use deterministic methods that risk producing repetitive and non-vivid motions. In this paper, we introduce a probabilistic method to synthesize interlocutor-aware facial gestures - represented by highly expressive FLAME parameters - in dyadic conversations. Our contributions are: a) a method for feature extraction from multi-party video and speech recordings, resulting in a representation that allows for independent control and manipulation of expression and speech articulation in a 3D avatar; b) an extension to MoGlow, a recent motion-synthesis method based on normalizing flows, to also take multi-modal signals from the interlocutor as input and subsequently output interlocutor-aware facial gestures; and c) subjective and objective experiments assessing the use and relative importance of the different modalities in the synthesized output. The results show that the model successfully leverages the input from the interlocutor to generate more appropriate behavior.},
bibtype = {inproceedings},
author = {Jonell, Patrik and Kucherenko, Taras and Henter, Gustav Eje and Beskow, Jonas},
doi = {10.1145/3383652.3423911},
booktitle = {Proceedings of the 20th ACM International Conference on Intelligent Virtual Agents (IVA'20)}
}
@inproceedings{
title = {Can we trust online crowdworkers?: Comparing online and offline participants in a preference test of virtual agents},
type = {inproceedings},
year = {2020},
websites = {https://dl.acm.org/doi/10.1145/3383652.3423860},
month = {10},
publisher = {Association for Computing Machinery},
day = {20},
id = {bb0c568d-814c-3ff7-b29c-ecd9ff337e6b},
created = {2020-09-25T10:32:02.444Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-20T13:07:59.308Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {jonell2020trust},
private_publication = {false},
abstract = {Conducting user studies is a crucial component in many scientific fields. While some studies require participants to be physically present, other studies can be conducted both physically (e.g. in-lab) and online (e.g. via crowdsourcing). Inviting participants to the lab can be a time-consuming and logistically difficult endeavor, not to mention that sometimes research groups might not be able to run in-lab experiments, because of, for example, a pandemic. Crowdsourcing platforms such as Amazon Mechanical Turk (AMT) or Prolific can therefore be a suitable alternative to run certain experiments, such as evaluating virtual agents. Although previous studies investigated the use of crowdsourcing platforms for running experiments, there is still uncertainty as to whether the results are reliable for perceptual studies. Here we replicate a previous experiment where participants evaluated a gesture generation model for virtual agents. The experiment is conducted across three participant pools - in-lab, Prolific, and AMT - having similar demographics across the in-lab participants and the Prolific platform. Our results show no difference between the three participant pools in regards to their evaluations of the gesture generation models and their reliability scores. The results indicate that online platforms can successfully be used for perceptual evaluations of this kind.},
bibtype = {inproceedings},
author = {Jonell, Patrik and Kucherenko, Taras and Torre, Ilaria and Beskow, Jonas},
doi = {10.1145/3383652.3423860},
booktitle = {Proceedings of the 20th ACM International Conference on Intelligent Virtual Agents (IVA'20)}
}
@inproceedings{
title = {Embodiment and gender interact in alignment to TTS voices},
type = {inproceedings},
year = {2020},
id = {4224d79d-7417-3fef-8d71-bb6c0fab0931},
created = {2020-09-25T10:34:05.827Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-21T13:29:36.855Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {cohnembodiment},
source_type = {article},
private_publication = {false},
abstract = {The current study tests subjects’ vocal alignment toward female and male text-to-speech (TTS) voices presented via three systems: Amazon Echo, Nao, and Furhat. These systems vary in their physical form, ranging from a cylindrical speaker (Echo), to a small robot (Nao), to a human-like robot bust (Furhat). We test whether this cline of personification (cylinder < mini robot < human-like robot bust) predicts patterns of gender-mediated vocal alignment. In addition to comparing multiple systems, this study addresses a confound in many prior vocal alignment studies by using identical voices across the systems. Results show evidence for a cline of personification toward female TTS voices by female shadowers (Echo < Nao < Furhat) and a more categorical effect of device personification for male TTS voices by male shadowers (Echo < Nao, Furhat). These findings are discussed in terms of their implications for models of device-human interaction and theories of computer personification.},
bibtype = {inproceedings},
author = {Cohn, Michelle and Jonell, Patrik and Kim, Taylor and Beskow, Jonas and Zellou, Georgia},
booktitle = {Proceedings of the 42nd Annual Meeting of the Cognitive Science Society, CogSci 2020}
}
@misc{
title = {The GENEA Challenge 2020: Benchmarking gesture-generation systems on common data},
type = {misc},
year = {2020},
websites = {https://doi.org/10.5281/zenodo.4094697},
month = {10},
publisher = {Zenodo},
id = {c7994777-877d-3fc9-91de-63c808f05644},
created = {2020-10-20T08:49:49.146Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-20T13:07:59.315Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {taras_kucherenko_2020_4094697},
source_type = {misc},
private_publication = {false},
bibtype = {misc},
author = {Kucherenko, Taras and Jonell, Patrik and Yoon, Youngwoo and Wolfert, Pieter and Henter, Gustav Eje},
doi = {10.5281/zenodo.4094697}
}
@misc{
title = {Let’s face it: Probabilistic multi-modal interlocutor-aware generation of facial gestures in dyadic settings},
type = {misc},
year = {2020},
source = {arXiv},
id = {dc7db9b1-2e57-3cbc-a8f4-f314c18d7a09},
created = {2020-11-10T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-11-13T12:25:48.475Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {false},
abstract = {Copyright © 2020, arXiv, All rights reserved. To enable more natural face-to-face interactions, conversational agents need to adapt their behavior to their interlocutors. One key aspect of this is generation of appropriate non-verbal behavior for the agent, for example facial gestures, here defined as facial expressions and head movements. Most existing gesture-generating systems do not utilize multi-modal cues from the interlocutor when synthesizing non-verbal behavior. Those that do, typically use deterministic methods that risk producing repetitive and non-vivid motions. In this paper, we introduce a probabilistic method to synthesize interlocutor-aware facial gestures – represented by highly expressive FLAME parameters – in dyadic conversations. Our contributions are: a) a method for feature extraction from multi-party video and speech recordings, resulting in a representation that allows for independent control and manipulation of expression and speech articulation in a 3D avatar; b) an extension to MoGlow, a recent motion-synthesis method based on normalizing flows, to also take multi-modal signals from the interlocutor as input and subsequently output interlocutor-aware facial gestures; and c) subjective and objective experiments assessing the use and relative importance of the different modalities in the synthesized output. The results show that the model successfully leverages the input from the interlocutor to generate more appropriate behavior.},
bibtype = {misc},
author = {Jonell, P. and Henter, G.E. and Kucherenko, T. and Beskow, J.}
}
@inproceedings{
title = {Crowdsourcing a Self-Evolving Dialog Graph},
type = {inproceedings},
year = {2019},
keywords = {crowdsourcing,datasets,dialog systems,human-computer interaction},
websites = {https://doi.org/10.1145/3342775.3342790},
publisher = {Association for Computing Machinery},
city = {New York, NY, USA},
series = {CUI ’19},
id = {3988d024-6a60-3c08-8e62-8bc506eb1c2a},
created = {2020-01-06T19:15:01.778Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:15:01.778Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {10.1145/3342775.3342790},
source_type = {inproceedings},
private_publication = {false},
bibtype = {inproceedings},
author = {Jonell, Patrik and Fallgren, Per and Doğan, Fethiye Irmak and Lopes, José and Wennberg, Ulme and Skantze, Gabriel},
doi = {10.1145/3342775.3342790},
booktitle = {Proceedings of the 1st International Conference on Conversational User Interfaces}
}
@inproceedings{
title = {Learning Non-verbal Behavior for a Social Robot from YouTube Videos},
type = {inproceedings},
year = {2019},
keywords = {Facial expressions,generative models,head movement,neural network,non-verbal behavior,social robotics},
websites = {https://nicolas-navarro-guerrero.gitlab.io/workshop-non-verbal-human-robot-interactions-icdl-epirob-2019/},
institution = {KTH, Speech, Music and Hearing},
id = {8640b02c-ffac-3d0d-b1aa-6689a1d57352},
created = {2020-01-06T19:15:01.816Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-07-06T09:08:48.686Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Jonell1357431},
source_type = {inproceedings},
notes = {QC 20191007},
folder_uuids = {b9e8b2b5-59ea-4ffd-818c-22a0d98c2144,f59b6e50-2bf8-4a1f-bfbb-f7865b907fde},
private_publication = {false},
abstract = {Non-verbal behavior is crucial for positive perception of humanoid robots. If modeled well it can improve the interaction and leave the user with a positive experience, on the other hand, if it is modelled poorly it may impede the interaction and become a source of distraction. Most of the existing work on modeling non-verbal behavior show limited variability due to the fact that the models employed are deterministic and the generated motion can be perceived as repetitive and predictable. In this paper, we present a novel method for generation of a limited set of facial expressions and head movements, based on a probabilistic generative deep learning architecture called Glow. We have implemented a workflow which takes videos directly from YouTube, extracts relevant features, and trains a model that generates gestures that can be realized in a robot without any post processing. A user study was conducted and illustrated the importance of having any kind of non-verbal behavior while most differences between the ground truth, the proposed method, and a random control were not significant (however, the differences that were significant were in favor of the proposed method).},
bibtype = {inproceedings},
author = {Jonell, Patrik and Kucherenko, Taras and Ekstedt, Erik and Beskow, Jonas},
booktitle = {ICDL-EpiRob Workshop on Naturalistic Non-Verbal and Affective Human-Robot Interactions}
}
@inproceedings{
title = {Using Social and Physiological Signals for User Adaptation in Conversational Agents},
type = {inproceedings},
year = {2019},
keywords = {communication,deep learning,learning agent capabilities (agent models,observation),single and multi-agent planning and scheduling},
pages = {2420–2422},
publisher = {International Foundation for Autonomous Agents and Multiagent Systems},
city = {Richland, SC},
series = {AAMAS ’19},
id = {27653ef2-8f05-3acb-856e-0be729ecdb0c},
created = {2020-01-06T19:15:02.181Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:15:02.181Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {10.5555/3306127.3332133},
source_type = {inproceedings},
private_publication = {false},
bibtype = {inproceedings},
author = {Jonell, Patrik},
booktitle = {Proceedings of the 18th International Conference on Autonomous Agents and MultiAgent Systems}
}
@inproceedings{
title = {The visual prominence of whispered speech in swedish},
type = {inproceedings},
year = {2019},
keywords = {gestures,head,orofacial gestures,prosody,voice quality,whisper},
pages = {235-239},
websites = {https://pdfs.semanticscholar.org/c361/2c4da6e7dd80066203ef5e7d6c01e89d8e0a.pdf},
id = {6cee0154-b895-3e7b-a686-b189ee8feb0e},
created = {2020-01-06T19:15:02.207Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-14T19:02:45.277Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
citation_key = {Malisz2019},
private_publication = {false},
bibtype = {inproceedings},
author = {Malisz, Zofia and Jonell, Patrik and Beskow, Jonas},
booktitle = {Proceedings of the 19th International Congress of Phonetic Sciences}
}
@inproceedings{
title = {Farmi: A framework for recording multi-modal interactions},
type = {inproceedings},
year = {2019},
keywords = {Human-robot interaction,Multimodal interaction,Multisensory processing},
id = {5e463f09-8d0c-330d-bb2e-2c11895cabb2},
created = {2018-12-19T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-24T18:20:24.565Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {false},
abstract = {© LREC 2018 - 11th International Conference on Language Resources and Evaluation. All rights reserved. In this paper we present (1) a processing architecture used to collect multi-modal sensor data, both for corpora collection and real-time processing, (2) an open-source implementation thereof and (3) a use-case where we deploy the architecture in a multi-party deception game, featuring six human players and one robot. The architecture is agnostic to the choice of hardware (e.g. microphones, cameras, etc.) and programming languages, although our implementation is mostly written in Python. In our use-case, different methods of capturing verbal and non-verbal cues from the participants were used. These were processed in real-time and used to inform the robot about the participants' deceptive behaviour. The framework is of particular interest for researchers who are interested in the collection of multi-party, richly recorded corpora and the design of conversational systems. Moreover for researchers who are interested in human-robot interaction the available modules offer the possibility to easily create both autonomous and wizard-of-Oz interactions.},
bibtype = {inproceedings},
author = {Jonell, P. and Bystedt, M. and Fallgren, P. and Kontogiorgos, D. and Lopes, J. and Malisz, Z. and Mascarenhas, S. and Oertel, C. and Raveh, E. and Shore, T.},
booktitle = {LREC 2018 - 11th International Conference on Language Resources and Evaluation}
}
@inproceedings{
title = {Crowdsourced multimodal corpora collection tool},
type = {inproceedings},
year = {2019},
keywords = {Crowdsourcing,Human-computer interaction,Multimodal corpus},
id = {1af28926-8f0d-3444-9338-d525e5dc6a61},
created = {2019-01-22T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-11-04T20:22:31.316Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {false},
abstract = {© LREC 2018 - 11th International Conference on Language Resources and Evaluation. All rights reserved. In recent years, more and more multimodal corpora have been created. To our knowledge there is no publicly available tool which allows for acquiring controlled multimodal data of people in a rapid and scalable fashion. We therefore are proposing (1) a novel tool which will enable researchers to rapidly gather large amounts of multimodal data spanning a wide demographic range, and (2) an example of how we used this tool for corpus collection of our “Attentive listener” multimodal corpus. The code is released under an Apache License 2.0 and available as an open-source repository, which can be found at https://github.com/kth-social-robotics/multimodal-crowdsourcing-tool. This tool will allow researchers to set-up their own multimodal data collection system quickly and create their own multimodal corpora. Finally, this paper provides a discussion about the advantages and disadvantages with a crowd-sourced data collection tool, especially in comparison to a lab recorded corpora.},
bibtype = {inproceedings},
author = {Jonell, P. and Oertel, C. and Kontogiorgos, D. and Beskow, J. and Gustafson, J.},
booktitle = {LREC 2018 - 11th International Conference on Language Resources and Evaluation}
}
@inproceedings{
title = {A multimodal corpus for mutual gaze and joint attention in multiparty situated interaction},
type = {inproceedings},
year = {2019},
keywords = {Joint attention,Multimodal situated interaction,Mutual gaze,Reference resolution,Referential gaze,Social eye-gaze},
id = {8430c7e3-46a6-32d4-bef7-c75a91b64c01},
created = {2019-01-22T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-11-05T04:47:37.810Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {false},
abstract = {© LREC 2018 - 11th International Conference on Language Resources and Evaluation. All rights reserved. In this paper we present a corpus of multiparty situated interaction where participants collaborated on moving virtual objects on a large touch screen. A moderator facilitated the discussion and directed the interaction. The corpus contains recordings of a variety of multimodal data, in that we captured speech, eye gaze and gesture data using a multisensory setup (wearable eye trackers, motion capture and audio/video). Furthermore, in the description of the multimodal corpus, we investigate four different types of social gaze: referential gaze, joint attention, mutual gaze and gaze aversion by both perspectives of a speaker and a listener. We annotated the groups' object references during object manipulation tasks and analysed the group's proportional referential eye-gaze with regards to the referent object. When investigating the distributions of gaze during and before referring expressions we could corroborate the differences in time between speakers' and listeners' eye gaze found in earlier studies. This corpus is of particular interest to researchers who are interested in social eye-gaze patterns in turn-taking and referring language in situated multi-party interaction.},
bibtype = {inproceedings},
author = {Kontogiorgos, D. and Avramova, V. and Alexanderson, S. and Jonell, P. and Oertel, C. and Beskow, J. and Skantze, G. and Gustafson, J.},
booktitle = {LREC 2018 - 11th International Conference on Language Resources and Evaluation}
}
@inproceedings{
title = {Using social and physiological signals for user adaptation in conversational agents: Doctoral consortium},
type = {inproceedings},
year = {2019},
keywords = {Deep learning,Learning agent capabilities (agent models, communi,Single and multi-agent planning and scheduling},
volume = {4},
id = {c4f429cb-ec70-3998-9a01-ef91e02096c4},
created = {2020-01-02T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2021-03-02T14:08:28.648Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {true},
abstract = {© 2019 International Foundation for Autonomous Agents and Multiagent Systems (www.ifaamas.org). All rights reserved. In face-to-face communication, humans subconsciously emit social signals which are picked up and used by their interlocutors as feedback for how well the previously communicated messages have been received. The feedback is then used in order to adapt the way the coming messages are being produced and sent to the interlocutor, leading to the communication to become as efficient and enjoyable as possible. Currently however, it is rare to find conversational agents utilizing this feedback channel for altering how the multimodal output is produced during interactions with users, largely due to the complex nature of the problem. In most regards, humans have a significant advantage over conversational agents in interpreting and acting on social signals. Humans are however restricted to a limited set of sensors, "the five senses", which conversational agents are not. This makes it possible for conversational agents to use specialized sensors to pick up physiological signals, such as skin temperature, respiratory rate or pupil dilation, which carry valuable information about the user with respect to the conversation. This thesis work aims at developing methods for utilizing both social and physiological signals emitted by humans in order to adapt the output of the conversational agent, allowing for an increase in conversation quality. These methods will primarily be based on automatically learning adaptive behavior from examples of real human interactions using machine learning methods.},
bibtype = {inproceedings},
author = {Jonell, P.},
booktitle = {Proceedings of the International Joint Conference on Autonomous Agents and Multiagent Systems, AAMAS}
}
@inproceedings{
title = {A Multimodal Corpus for Mutual Gaze and Joint Attention in Multiparty Situated Interaction},
type = {inproceedings},
year = {2018},
pages = {119-127},
websites = {http://www.lrec-conf.org/proceedings/lrec2018/pdf/987.pdf},
institution = {Ecole Polytechnique Fédérale de Lausanne (EPFL), Switzerland},
id = {2ce1d3eb-1dde-3ed1-9e11-6a3b20b09a5d},
created = {2020-01-06T19:15:01.568Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:15:01.568Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Kontogiorgos1217277},
source_type = {inproceedings},
notes = {QC 20180614},
private_publication = {false},
abstract = {In this paper we present a corpus of multiparty situated interaction where participants collaborated on moving virtual objects on a large touch screen. A moderator facilitated the discussion and directed the interaction. The corpus contains recordings of a variety of multimodal data, in that we captured speech, eye gaze and gesture data using a multisensory setup (wearable eye trackers, motion capture and audio/video). Furthermore, in the description of the multimodal corpus, we investigate four different types of social gaze: referential gaze, joint attention, mutual gaze and gaze aversion by both perspectives of a speaker and a listener. We annotated the groups’ object references during object manipulation tasks and analysed the group’s proportional referential eye-gaze with regards to the referent object. When investigating the distributions of gaze during and before referring expressions we could corroborate the differences in time between speakers’ and listeners’ eye gaze found in earlier studies. This corpus is of particular interest to researchers who are interested in social eye-gaze patterns in turn-taking and referring language in situated multi-party interaction. },
bibtype = {inproceedings},
author = {Kontogiorgos, Dimosthenis and Avramova, Vanya and Alexanderson, Simon and Jonell, Patrik and Oertel, Catharine and Beskow, Jonas and Skantze, Gabriel and Gustafson, Joakim},
booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) :}
}
@inproceedings{
title = {FARMI: A Framework for Recording Multi-Modal Interactions},
type = {inproceedings},
year = {2018},
pages = {3969-3974},
institution = {Multimodal Computing and Interaction, Saarland University, Germany},
id = {02d9c304-9a36-3a9e-aa94-b0ed8ee4a78f},
created = {2020-01-06T19:15:01.624Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:15:01.624Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Jonell1217276},
source_type = {inproceedings},
notes = {QC 20180618},
private_publication = {false},
abstract = {In this paper we present (1) a processing architecture used to collect multi-modal sensor data, both for corpora collection and real-time processing, (2) an open-source implementation thereof and (3) a use-case where we deploy the architecture in a multi-party deception game, featuring six human players and one robot. The architecture is agnostic to the choice of hardware (e.g. microphones, cameras, etc.) and programming languages, although our implementation is mostly written in Python. In our use-case, different methods of capturing verbal and non-verbal cues from the participants were used. These were processed in real-time and used to inform the robot about the participants’ deceptive behaviour. The framework is of particular interest for researchers who are interested in the collection of multi-party, richly recorded corpora and the design of conversational systems. Moreover for researchers who are interested in human-robot interaction the available modules offer the possibility to easily create both autonomous and wizard-of-Oz interactions. },
bibtype = {inproceedings},
author = {Jonell, Patrik and Bystedt, Mattias and Fallgren, Per and Kontogiorgos, Dimosthenis and Lopes, José David Aguas and Malisz, Zofia and Mascarenhas, Samuel and Oertel, Catharine and Raveh, Eran and Shore, Todd},
booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018) :}
}
@article{
title = {Fantom: A Crowdsourced Social Chatbot using an Evolving Dialog Graph},
type = {article},
year = {2018},
pages = {1-20},
websites = {https://wit.ai/},
id = {7e31239c-24d9-3b86-863d-7bb7b8892bb6},
created = {2020-01-06T19:15:02.009Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:27:16.569Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Jonell2018},
private_publication = {false},
abstract = {In this paper we present Fantom, a social chatbot competing in the Amazon Alexa Prize 2018 1. The system uses a dialog graph for retrieving an approximation of the current dialog context in order to find suitable response candidates in this context. The graph is gradually built using user utterances from actual interactions, and system responses suggested by crowd workers. To this end, we developed an automatic system for finding dialog contexts that were often visited but lacked system responses in order to automatically post tasks on Amazon Mechanical Turk. Workers could see a brief excerpt of past conversation history and were asked to suggest a good response, based on a description of the system's persona and a set of rules that would help foster more engaging conversations. Our main contributions are 1) describing the use of a graph-based approach for context modeling, 2) techniques used in order to make the crowd workers author good content, and 3) discussion of learning outcomes from the Alexa Prize challenge.},
bibtype = {article},
author = {Jonell, Patrik and Bystedt, Mattias and Doğan, Fethiye Irmak and Fallgren, Per and Ivarsson, Jonas and Slukova, Marketa and Wennberg, Ulme and Lopes, José and Boye, Johan and Skantze, Gabriel},
journal = {1st Proceedings of Alexa Prize (Alexa Prize 2018).}
}
@inproceedings{
title = {Crowdsourced Multimodal Corpora Collection Tool},
type = {inproceedings},
year = {2018},
pages = {728-734},
institution = {KTH, Speech, Music and Hearing, TMH},
id = {d4b5d513-4bbc-38aa-a85a-6960becdcec1},
created = {2020-01-06T19:15:02.031Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:15:02.031Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Jonell1217275},
source_type = {inproceedings},
notes = {QC 20180618},
private_publication = {false},
abstract = {In recent years, more and more multimodal corpora have been created. To our knowledge there is no publicly available tool which allows for acquiring controlled multimodal data of people in a rapid and scalable fashion. We therefore are proposing (1) a novel tool which will enable researchers to rapidly gather large amounts of multimodal data spanning a wide demographic range, and (2) an example of how we used this tool for corpus collection of our "Attentive listener'' multimodal corpus. The code is released under an Apache License 2.0 and available as an open-source repository, which can be found at https://github.com/kth-social-robotics/multimodal-crowdsourcing-tool. This tool will allow researchers to set-up their own multimodal data collection system quickly and create their own multimodal corpora. Finally, this paper provides a discussion about the advantages and disadvantages with a crowd-sourced data collection tool, especially in comparison to a lab recorded corpora. },
bibtype = {inproceedings},
author = {Jonell, Patrik and Oertel, Catharine and Kontogiorgos, Dimosthenis and Beskow, Jonas and Gustafson, Joakim},
booktitle = {Proceedings of the Eleventh International Conference on Language Resources and Evaluation (LREC 2018)}
}
@inproceedings{
title = {Crowd-Sourced Design of Artificial Attentive Listeners},
type = {inproceedings},
year = {2017},
keywords = {Crowd-sourcing,Human-robot interaction,Multi-modal feedback tokens},
pages = {854-858},
volume = {2017-August},
websites = {http://www.isca-speech.org/archive/Interspeech_2017/abstracts/0926.html},
month = {8},
publisher = {ISCA},
day = {20},
city = {ISCA},
id = {7ab128fc-5749-3cb5-9326-7517642e7789},
created = {2018-02-10T01:33:17.610Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-21T13:29:36.984Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Oertel2017a},
private_publication = {false},
abstract = {Copyright © 2017 ISCA. Feedback generation is an important component of human-human communication. Humans can choose to signal support, understanding, agreement or also scepticism by means of feedback tokens. Many studies have focused on the timing of feedback behaviours. In the current study, however, we keep the timing constant and instead focus on the lexical form and prosody of feedback tokens as well as their sequential patterns. For this we crowdsourced participants' feedback behaviour in identical interactional contexts in order to model a virtual agent that is able to provide feedback as an attentive/supportive as well as attentive/sceptical listener. The resulting models were realised in a robot which was evaluated by third-party observers.},
bibtype = {inproceedings},
author = {Oertel, Catharine and Jonell, Patrik and Kontogiorgos, Dimosthenis and Mendelson, Joseph and Beskow, Jonas and Gustafson, Joakim},
doi = {10.21437/Interspeech.2017-926},
booktitle = {Interspeech 2017}
}
@inproceedings{
title = {Using crowd-sourcing for the design of listening agents: Challenges and opportunities},
type = {inproceedings},
year = {2017},
keywords = {Artificial listener,Listening agent,Multimodal behaviour generation},
id = {d69d895b-31ae-366b-a829-4ea1ba8488dc},
created = {2018-02-10T03:32:58.684Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:27:16.813Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
citation_key = {Oertel2017},
private_publication = {false},
abstract = {© 2017 Copyright is held by the owner/author(s). In this paper we are describing how audio-visual corpora recordings using crowd-sourcing techniques can be used for the audio-visual synthesis of attitudinal non-verbal feedback expressions for virtual agents. We are discussing the limitations of this approach as well as where we see the opportunities for this technology.},
bibtype = {inproceedings},
author = {Oertel, C. and Jonell, P. and Haddad, K.E. and Szekely, E. and Gustafson, J.},
doi = {10.1145/3139491.3139499},
booktitle = {ISIAA 2017 - Proceedings of the 1st ACM SIGCHI International Workshop on Investigating Social Interactions with Artificial Agents, Co-located with ICMI 2017}
}
@inproceedings{
title = {Crowd-Powered Design of Virtual Attentive Listeners},
type = {inproceedings},
year = {2017},
pages = {188-191},
websites = {http://link.springer.com/10.1007/978-3-319-67401-8_21},
id = {8e61cf69-cd9e-3d05-ba85-c597386a0128},
created = {2018-02-10T05:05:27.331Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-10-21T13:29:36.894Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Jonell2017},
private_publication = {false},
abstract = {This demo presents a web-based system that generates attentive listening behaviours in a virtual agent acquired from audio-visual recordings of attitudinal feedback behaviour of crowdworkers.},
bibtype = {inproceedings},
author = {Jonell, Patrik and Oertel, Catharine and Kontogiorgos, Dimosthenis and Beskow, Jonas and Gustafson, Joakim},
doi = {10.1007/978-3-319-67401-8_21},
booktitle = {International Conference on Intelligent Virtual Agents (IVA'17)}
}
@misc{
title = {Machine Learning and Social Robotics for Detecting Early Signs of Dementia},
type = {misc},
year = {2017},
id = {91b3eec3-78e3-39cd-967d-9f0aa944f375},
created = {2020-01-06T19:15:02.008Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:27:19.496Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {jonell2017machine},
source_type = {misc},
private_publication = {false},
abstract = {This paper presents the EACare project, an ambitious multi-disciplinary collaboration with the aim to develop an embodied system, capable of carrying out neuropsychological tests to detect early signs of dementia, e.g., due to Alzheimer's disease. The system will use methods from Machine Learning and Social Robotics, and be trained with examples of recorded clinician-patient interactions. The interaction will be developed using a participatory design approach. We describe the scope and method of the project, and report on a first Wizard of Oz prototype.},
bibtype = {misc},
author = {Jonell, Patrik and Mendelson, Joseph and Storskog, Thomas and Hagman, Goran and Ostberg, Per and Leite, Iolanda and Kucherenko, Taras and Mikheeva, Olga and Akenine, Ulrika and Jelic, Vesna and Solomon, Alina and Beskow, Jonas and Gustafson, Joakim and Kivipelto, Miia and Kjellstrom, Hedvig}
}
@misc{
title = {Machine learning and social robotics for detecting early signs of dementia},
type = {misc},
year = {2017},
source = {arXiv},
id = {f4c0604d-dc4d-388f-bcbc-7f940bbfe5d1},
created = {2020-11-03T23:59:00.000Z},
file_attached = {false},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-11-04T16:36:16.239Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {false},
hidden = {false},
private_publication = {false},
abstract = {Copyright © 2017, arXiv, All rights reserved. This paper presents the EACare project, an ambitious multidisciplinary collaboration with the aim to develop an embodied system, capable of carrying out neuropsychological tests to detect early signs of dementia, e.g., due to Alzheimer's disease. The system will use methods from Machine Learning and Social Robotics, and be trained with examples of recorded clinician-patient interactions. The interaction will be developed using a participatory design approach. We describe the scope and method of the project, and report on a first Wizard of Oz prototype.},
bibtype = {misc},
author = {Jonell, P. and Mendelson, J. and Storskog, T. and Hagman, G. and Ostberg, P. and Leite, I. and Kucherenko, T. and Mikheeva, O. and Akenine, U. and Jelic, V. and Solomon, A. and Beskow, J. and Gustafson, J. and Kivipelto, M. and Kjellstrom, H.}
}
@inproceedings{
title = {Affordance++: Allowing objects to communicate dynamic use},
type = {inproceedings},
year = {2015},
keywords = {Electrical muscle stimulation,Affordance},
volume = {2015-April},
id = {1a3fcfc5-53dd-3765-8de3-741775f64bc5},
created = {2017-01-08T22:58:07.000Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:27:16.803Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Lopes2015a},
private_publication = {false},
abstract = {© Copyright 2015 ACM.We propose extending the affordance of objects by allowing them to communicate dynamic use, such as (1) motion (e.g., spray can shakes when touched), (2) multi-step processes (e.g., spray can sprays only after shaking), and (3) behaviors that change over time (e.g., empty spray can does not allow spraying anymore). Rather than enhancing objects directly, however, we implement this concept by enhancing the user. We call this affordance++. By stimulating the user's arms using electrical muscle stimulation, our prototype allows objects not only to make the user actuate them, but also perform required movements while merely approaching the object, such as not to touch objects that do not "want" to be touched. In our user study, affordance++ helped participants to successfully operate devices of poor natural affordance, such as a multi-functional slicer tool or a magnetic nail sweeper, and to stay away from cups filled with hot liquids.},
bibtype = {inproceedings},
author = {Lopes, P. and Jonell, P. and Baudisch, P.},
doi = {10.1145/2702123.2702128},
booktitle = {Conference on Human Factors in Computing Systems - Proceedings}
}
@inproceedings{
title = {Proprioceptive interaction},
type = {inproceedings},
year = {2015},
keywords = {Muscle actuation,Proprioception,IO},
volume = {2015-April},
id = {61eecf91-d5a8-39a2-8ab5-37a471925da2},
created = {2017-01-08T22:58:07.000Z},
file_attached = {true},
profile_id = {fb8d345a-1d79-3791-a6c6-00233ea44521},
last_modified = {2020-01-06T19:27:16.362Z},
read = {false},
starred = {false},
authored = {true},
confirmed = {true},
hidden = {false},
citation_key = {Lopes2015},
private_publication = {false},
abstract = {© Copyright 2015 ACM.We propose a new way of eyes-free interaction for wearables. It is based on the user's proprioceptive sense, i.e., rather than seeing, hearing, or feeling an outside stimulus, users feel the pose of their own body. We have implemented a wearable device called Pose-IO that offers input and output based on proprioception. Users communicate with Pose-IO through the pose of their wrists. Users enter information by performing an input gesture by flexing their wrist, which the device senses using a 3-axis accelerometer. Users receive output from Pose-IO by finding their wrist posed in an output gesture, which Pose-IO actuates using electrical muscle stimulation. This mechanism allows users to interact with Pose-IO without visual or auditory senses, but through the proprioceptive sense alone. We developed three simple applications that demonstrate symmetric proprioceptive interaction, where input and output occur through the same limb, as well as asymmetric interaction, where input and output occur through different limbs. In a first user study, participants using a symmetric proprioceptive interface re-entered poses received from Pose-IO with an average accuracy of 5.8° despite the minimal bandwidth offered by the device. In a second, exploratory study, we investigated participants' emotional response to asymmetric proprioceptive interaction and the concept of the user's body serving as interface. Participants reported to enjoy the experience (mean= 4.6 out of 5).},
bibtype = {inproceedings},
author = {Lopes, P. and Ion, A. and Mueller, W. and Hoffmann, D. and Jonell, P. and Baudisch, P.},
doi = {10.1145/2702123.2702461},
booktitle = {Conference on Human Factors in Computing Systems - Proceedings}
}