BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models. Li, J., Li, D., Savarese, S., & Hoi, S. Proceedings of Machine Learning Research, 202:20351-20383, PMLR, 2023.
The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pre-trained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pre-trained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions.
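At a high level, the architecture the abstract describes reduces to a simple data flow: a frozen image encoder produces patch features, a small set of learned query embeddings attends to those features through the lightweight Querying Transformer (Q-Former), and the query outputs are projected into the input embedding space of a frozen language model as a soft visual prefix. Only the bridge in the middle is trained, which is why the trainable-parameter count stays small. The following is a minimal PyTorch sketch of that flow under stated assumptions, not the authors' implementation: the module sizes, the use of nn.TransformerDecoder as a stand-in for the BERT-based Q-Former, and all names (QFormerBridge, num_queries, llm_dim) are illustrative.

# Minimal sketch of the BLIP-2 data flow (illustrative only; not the
# authors' code). A frozen image encoder yields patch features, learned
# queries cross-attend to them via a lightweight transformer standing in
# for the Q-Former, and a linear projection feeds a frozen LLM.
import torch
import torch.nn as nn

class QFormerBridge(nn.Module):
    def __init__(self, num_queries=32, dim=768, llm_dim=2560):
        super().__init__()
        # Learned query embeddings -- the only image-derived tokens the LLM sees.
        self.queries = nn.Parameter(torch.randn(num_queries, dim) * 0.02)
        # Stand-in for the Q-Former: cross-attention from queries to frozen
        # image features (dims assumed; the real Q-Former is BERT-based).
        layer = nn.TransformerDecoderLayer(d_model=dim, nhead=12, batch_first=True)
        self.qformer = nn.TransformerDecoder(layer, num_layers=2)
        # Projects query outputs into the frozen LLM's embedding space.
        self.proj = nn.Linear(dim, llm_dim)

    def forward(self, image_feats):
        # image_feats: (batch, num_patches, dim) from a frozen image encoder.
        q = self.queries.unsqueeze(0).expand(image_feats.size(0), -1, -1)
        out = self.qformer(tgt=q, memory=image_feats)
        return self.proj(out)  # (batch, num_queries, llm_dim)

# Only the bridge is trainable; the encoder and the LLM stay frozen.
bridge = QFormerBridge()
image_feats = torch.randn(1, 257, 768)   # e.g. ViT patch features, assumed shape
soft_prompt = bridge(image_feats)        # prefix tokens for a frozen LLM
print(soft_prompt.shape)                 # torch.Size([1, 32, 2560])

In the paper's two-stage schedule, a bridge like this is first trained against the frozen image encoder for representation learning and then against the frozen LLM for generative learning; prepending the projected queries to an instruction prompt is what enables the zero-shot instructed image-to-text generation the abstract mentions.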
@article{li2023blip2,
 title = {BLIP-2: Bootstrapping Language-Image Pre-training with Frozen Image Encoders and Large Language Models},
 type = {article},
 year = {2023},
 pages = {20351-20383},
 volume = {202},
 url = {https://arxiv.org/abs/2301.12597v3},
 publisher = {PMLR},
 abstract = {The cost of vision-and-language pre-training has become increasingly prohibitive due to end-to-end training of large-scale models. This paper proposes BLIP-2, a generic and efficient pre-training strategy that bootstraps vision-language pre-training from off-the-shelf frozen pre-trained image encoders and frozen large language models. BLIP-2 bridges the modality gap with a lightweight Querying Transformer, which is pre-trained in two stages. The first stage bootstraps vision-language representation learning from a frozen image encoder. The second stage bootstraps vision-to-language generative learning from a frozen language model. BLIP-2 achieves state-of-the-art performance on various vision-language tasks, despite having significantly fewer trainable parameters than existing methods. For example, our model outperforms Flamingo80B by 8.7% on zero-shot VQAv2 with 54x fewer trainable parameters. We also demonstrate the model's emerging capabilities of zero-shot image-to-text generation that can follow natural language instructions.},
 author = {Li, Junnan and Li, Dongxu and Savarese, Silvio and Hoi, Steven},
 journal = {Proceedings of Machine Learning Research}
}
