Describe Anything Model for Visual Question Answering on Text-rich Images. Vu, Y., Duong, D., Duong, T., Nguyen, A., Nguyen, T., Nguyen, L. T. P., Xing, J., Li, X., Wang, T., Bagci, U., & Xu, M. CoRR, 2025.
Describe Anything Model for Visual Question Answering on Text-rich Images [link]Paper  doi  bibtex   
@comment{arXiv preprint 2507.12441, exported from dblp. Author names use the
  unambiguous "Last, First" form; the model name in the title is brace-protected
  so sentence-casing styles keep its capitals.}
@article{DBLP:journals/corr/abs-2507-12441,
  author       = {Vu, Yen{-}Linh and
                  Duong, Dinh{-}Thang and
                  Duong, Truong{-}Binh and
                  Nguyen, Anh{-}Khoi and
                  Nguyen, Thanh{-}Huy and
                  Nguyen, Le Thien Phuc and
                  Xing, Jianhua and
                  Li, Xingjian and
                  Wang, Tianyang and
                  Bagci, Ulas and
                  Xu, Min},
  title        = {{Describe Anything Model} for Visual Question Answering on Text-rich
                  Images},
  journal      = {CoRR},
  volume       = {abs/2507.12441},
  year         = {2025},
  url          = {https://doi.org/10.48550/arXiv.2507.12441},
  doi          = {10.48550/ARXIV.2507.12441},
  eprinttype   = {arXiv},
  eprint       = {2507.12441},
  timestamp    = {Sat, 06 Sep 2025 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2507-12441.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

Downloads: 0