AlanaVLM: A Multimodal Embodied AI Foundation Model for Egocentric Video Understanding. Suglia, A., Greco, C., Baker, K., Part, J. L., Papaioannou, I., Eshghi, A., Konstas, I., & Lemon, O. CoRR, 2024.
AlanaVLM: A Multimodal Embodied AI Foundation Model for Egocentric Video Understanding [link]Paper  doi  bibtex   
@comment{arXiv (CoRR) preprint record as exported from dblp. The model name
  in the title is brace-protected so that sentence-casing bibliography styles
  keep its capitalisation; author names use the unambiguous "Last, First" form.}
@article{DBLP:journals/corr/abs-2406-13807,
  author       = {Suglia, Alessandro and
                  Greco, Claudio and
                  Baker, Katie and
                  Part, Jose L. and
                  Papaioannou, Ioannis and
                  Eshghi, Arash and
                  Konstas, Ioannis and
                  Lemon, Oliver},
  title        = {{AlanaVLM}: {A} Multimodal Embodied {AI} Foundation Model for Egocentric
                  Video Understanding},
  journal      = {CoRR},
  volume       = {abs/2406.13807},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2406.13807},
  doi          = {10.48550/arXiv.2406.13807},
  eprinttype   = {arXiv},
  eprint       = {2406.13807},
  timestamp    = {Fri, 12 Jul 2024 01:00:00 +0200},
  biburl       = {https://dblp.org/rec/journals/corr/abs-2406-13807.bib},
  bibsource    = {dblp computer science bibliography, https://dblp.org}
}

Downloads: 0