Entity-Aware Cross-Modal Pretraining for Knowledge-based Visual Question Answering. Adjali, O., Ferret, O., Ghannay, S., & Le Borgne, H. In European Conference on Information Retrieval (ECIR), 2025.
Abstract: Knowledge-Aware Visual Question Answering about Entities (KVQAE) is a recent multimodal task aiming to answer visual questions about named entities from a multimodal knowledge base. In this context, we focus more particularly on cross-modal retrieval and propose to inject information about entities in the representations of both texts and images during their building through two pretraining auxiliary tasks, namely entity-level masked language modeling and entity type prediction. We show competitive results over existing approaches on 3 KVQAE standard benchmarks, revealing the benefit of raising entity awareness during cross-modal pretraining, specifically for the KVQAE task.
@inproceedings{adjali2025entitiy_aware,
  title     = {Entity-Aware Cross-Modal Pretraining for Knowledge-based Visual Question Answering},
  author    = {Adjali, Omar and Ferret, Olivier and Ghannay, Sahar and Le Borgne, Herv{\'e}},
  booktitle = {European Conference on Information Retrieval (ECIR)},
  year      = {2025},
  url_HAL   = {https://hal-lara.archives-ouvertes.fr/SHARP/cea-04910767},
  abstract  = {Knowledge-Aware Visual Question Answering about Entities (KVQAE) is a recent multimodal task aiming to answer visual questions about named entities from a multimodal knowledge base. In this context, we focus more particularly on cross-modal retrieval and propose to inject information about entities in the representations of both texts and images during their building through two pretraining auxiliary tasks, namely entity-level masked language modeling and entity type prediction. We show competitive results over existing approaches on 3 KVQAE standard benchmarks, revealing the benefit of raising entity awareness during cross-modal pretraining, specifically for the KVQAE task.},
  keywords  = {kvqae},
}
Downloads: 1
{"_id":"jYS5BCBHtGGYmTD5K","bibbaseid":"adjali-ferret-ghannay-leborgne-entityawarecrossmodalpretrainingforknowledgebasedvisualquestionanswering-2025","author_short":["Adjali, O.","Ferret, O.","Ghannay, S.","Le Borgne, H."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","title":"Entity-Aware Cross-Modal Pretraining for Knowledge-based Visual Question Answering","author":[{"propositions":[],"lastnames":["Adjali"],"firstnames":["Omar"],"suffixes":[]},{"propositions":[],"lastnames":["Ferret"],"firstnames":["Olivier"],"suffixes":[]},{"propositions":[],"lastnames":["Ghannay"],"firstnames":["Sahar"],"suffixes":[]},{"propositions":[],"lastnames":["Le","Borgne"],"firstnames":["Hervé"],"suffixes":[]}],"booktitle":"European Conference on Information Retrieval (ECIR)","year":"2025","url_hal":"https://hal-lara.archives-ouvertes.fr/SHARP/cea-04910767","abstract":"Knowledge-Aware Visual Question Answering about Entities (KVQAE) is a recent multimodal task aiming to answer visual questions about named entities from a multimodal knowledge base. In this context, we focus more particularly on cross-modal retrieval and propose to inject information about entities in the representations of both texts and images during their building through two pretraining auxiliary tasks, namely entity-level masked language modeling and entity type prediction. 
We show competitive results over existing approaches on 3 KVQAE standard benchmarks, revealing the benefit of raising entity awareness during cross-modal pretraining, specifically for the KVQAE task.","keywords":"kvqae","bibtex":"@inproceedings{adjali2025entitiy_aware,\n title = {Entity-Aware Cross-Modal Pretraining for Knowledge-based Visual Question Answering},\n author = {Adjali, Omar and Ferret, Olivier and Ghannay, Sahar and Le Borgne, Herv{\\'e}},\n booktitle = {European Conference on Information Retrieval (ECIR)},\n year = {2025},\n url_HAL = {https://hal-lara.archives-ouvertes.fr/SHARP/cea-04910767},\n abstract = {Knowledge-Aware Visual Question Answering about Entities (KVQAE) is a recent multimodal task aiming to answer visual questions about named entities from a multimodal knowledge base. In this context, we focus more particularly on cross-modal retrieval and propose to inject information about entities in the representations of both texts and images during their building through two pretraining auxiliary tasks, namely entity-level masked language modeling and entity type prediction. 
We show competitive results over existing approaches on 3 KVQAE standard benchmarks, revealing the benefit of raising entity awareness during cross-modal pretraining, specifically for the KVQAE task.},\n keywords = {kvqae}\n}\n\n","author_short":["Adjali, O.","Ferret, O.","Ghannay, S.","Le Borgne, H."],"key":"adjali2025entitiy_aware","id":"adjali2025entitiy_aware","bibbaseid":"adjali-ferret-ghannay-leborgne-entityawarecrossmodalpretrainingforknowledgebasedvisualquestionanswering-2025","role":"author","urls":{" hal":"https://hal-lara.archives-ouvertes.fr/SHARP/cea-04910767"},"keyword":["kvqae"],"metadata":{"authorlinks":{}},"downloads":1,"html":""},"bibtype":"inproceedings","biburl":"https://hleborgne.github.io/files/hleborgne-publications.bib","dataSources":["sJzmxoNKfHCgQoayi"],"keywords":["kvqae"],"search_terms":["entity","aware","cross","modal","pretraining","knowledge","based","visual","question","answering","adjali","ferret","ghannay","le borgne"],"title":"Entity-Aware Cross-Modal Pretraining for Knowledge-based Visual Question Answering","year":2025,"downloads":1}