Describe Anything Model for Visual Question Answering on Text-rich Images. Vu, Y., Duong, D., Duong, T., Nguyen, A., Nguyen, T., Nguyen, L. T. P., Xing, J., Li, X., Wang, T., Bagci, U., & Xu, M. CoRR, 2025.
Paper doi bibtex @article{DBLP:journals/corr/abs-2507-12441,
author = {Yen{-}Linh Vu and
Dinh{-}Thang Duong and
Truong{-}Binh Duong and
Anh{-}Khoi Nguyen and
Thanh{-}Huy Nguyen and
Le Thien Phuc Nguyen and
Jianhua Xing and
Xingjian Li and
Tianyang Wang and
Ulas Bagci and
Min Xu},
title = {Describe Anything Model for Visual Question Answering on Text-rich
Images},
journal = {CoRR},
volume = {abs/2507.12441},
year = {2025},
url = {https://doi.org/10.48550/arXiv.2507.12441},
doi = {10.48550/ARXIV.2507.12441},
eprinttype = {arXiv},
eprint = {2507.12441},
timestamp = {Sat, 06 Sep 2025 01:00:00 +0200},
biburl = {https://dblp.org/rec/journals/corr/abs-2507-12441.bib},
bibsource = {dblp computer science bibliography, https://dblp.org}
}
Downloads: 0
{"_id":"Ks6KTcjR7EAb8t2Rn","bibbaseid":"vu-duong-duong-nguyen-nguyen-nguyen-xing-li-etal-describeanythingmodelforvisualquestionansweringontextrichimages-2025","author_short":["Vu, Y.","Duong, D.","Duong, T.","Nguyen, A.","Nguyen, T.","Nguyen, L. T. P.","Xing, J.","Li, X.","Wang, T.","Bagci, U.","Xu, M."],"bibdata":{"bibtype":"article","type":"article","author":[{"firstnames":["Yen-Linh"],"propositions":[],"lastnames":["Vu"],"suffixes":[]},{"firstnames":["Dinh-Thang"],"propositions":[],"lastnames":["Duong"],"suffixes":[]},{"firstnames":["Truong-Binh"],"propositions":[],"lastnames":["Duong"],"suffixes":[]},{"firstnames":["Anh-Khoi"],"propositions":[],"lastnames":["Nguyen"],"suffixes":[]},{"firstnames":["Thanh-Huy"],"propositions":[],"lastnames":["Nguyen"],"suffixes":[]},{"firstnames":["Le","Thien","Phuc"],"propositions":[],"lastnames":["Nguyen"],"suffixes":[]},{"firstnames":["Jianhua"],"propositions":[],"lastnames":["Xing"],"suffixes":[]},{"firstnames":["Xingjian"],"propositions":[],"lastnames":["Li"],"suffixes":[]},{"firstnames":["Tianyang"],"propositions":[],"lastnames":["Wang"],"suffixes":[]},{"firstnames":["Ulas"],"propositions":[],"lastnames":["Bagci"],"suffixes":[]},{"firstnames":["Min"],"propositions":[],"lastnames":["Xu"],"suffixes":[]}],"title":"Describe Anything Model for Visual Question Answering on Text-rich Images","journal":"CoRR","volume":"abs/2507.12441","year":"2025","url":"https://doi.org/10.48550/arXiv.2507.12441","doi":"10.48550/ARXIV.2507.12441","eprinttype":"arXiv","eprint":"2507.12441","timestamp":"Sat, 06 Sep 2025 01:00:00 +0200","biburl":"https://dblp.org/rec/journals/corr/abs-2507-12441.bib","bibsource":"dblp computer science bibliography, https://dblp.org","bibtex":"@article{DBLP:journals/corr/abs-2507-12441,\n author = {Yen{-}Linh Vu and\n Dinh{-}Thang Duong and\n Truong{-}Binh Duong and\n Anh{-}Khoi Nguyen and\n Thanh{-}Huy Nguyen and\n Le Thien Phuc Nguyen and\n Jianhua Xing and\n Xingjian Li and\n Tianyang Wang and\n Ulas Bagci and\n Min Xu},\n title = {Describe Anything Model for Visual Question Answering on Text-rich\n Images},\n journal = {CoRR},\n volume = {abs/2507.12441},\n year = {2025},\n url = {https://doi.org/10.48550/arXiv.2507.12441},\n doi = {10.48550/ARXIV.2507.12441},\n eprinttype = {arXiv},\n eprint = {2507.12441},\n timestamp = {Sat, 06 Sep 2025 01:00:00 +0200},\n biburl = {https://dblp.org/rec/journals/corr/abs-2507-12441.bib},\n bibsource = {dblp computer science bibliography, https://dblp.org}\n}\n\n","author_short":["Vu, Y.","Duong, D.","Duong, T.","Nguyen, A.","Nguyen, T.","Nguyen, L. T. P.","Xing, J.","Li, X.","Wang, T.","Bagci, U.","Xu, M."],"key":"DBLP:journals/corr/abs-2507-12441","id":"DBLP:journals/corr/abs-2507-12441","bibbaseid":"vu-duong-duong-nguyen-nguyen-nguyen-xing-li-etal-describeanythingmodelforvisualquestionansweringontextrichimages-2025","role":"author","urls":{"Paper":"https://doi.org/10.48550/arXiv.2507.12441"},"metadata":{"authorlinks":{}}},"bibtype":"article","biburl":"http://dblp.org/pers/tb2/b/Bagci:Ulas","dataSources":["yGTvE6gpRZxrrNnsS"],"keywords":[],"search_terms":["describe","anything","model","visual","question","answering","text","rich","images","vu","duong","duong","nguyen","nguyen","nguyen","xing","li","wang","bagci","xu"],"title":"Describe Anything Model for Visual Question Answering on Text-rich Images","year":2025}