Castro, R., Pineda, I., Lim, W., and Morocho-Cayamcela, M.E. Deep Learning Approaches Based on Transformer Architectures for Image Captioning Tasks. IEEE Access, vol. 10, 2022.
@article{castro2022deep,
  title = {Deep Learning Approaches Based on Transformer Architectures for Image Captioning Tasks},
  author = {Castro, R. and Pineda, I. and Lim, W. and Morocho-Cayamcela, M.E.},
  journal = {IEEE Access},
  volume = {10},
  year = {2022},
  doi = {10.1109/ACCESS.2022.3161428},
  keywords = {Image captioning, artificial intelligence, computer vision, supervised learning, visual attention},
  abstract = {This paper focuses on visual attention, a state-of-the-art approach for image captioning tasks within the computer vision research area. We study the impact that different hyperparameter configurations have on an encoder-decoder visual attention architecture in terms of efficiency. Results show that the correct selection of both the cost function and the gradient-based optimizer can significantly impact the captioning results. Our system considers the cross-entropy, Kullback-Leibler divergence, mean squared error, and negative log-likelihood loss functions, as well as the adaptive moment estimation (Adam), AdamW, RMSprop, stochastic gradient descent, and Adadelta optimizers. Experimentation shows that the combination of cross-entropy with Adam is the best alternative, returning a Top-5 accuracy of 73.092 and a BLEU-4 score of 20.10. Furthermore, a comparative analysis of alternative convolutional architectures evaluated their performance as the encoder. Our results show that ResNext-101 stands out with a Top-5 accuracy of 73.128 and a BLEU-4 of 19.80, positioning itself as the best option when looking for optimum captioning quality. However, MobileNetV3 proved to be a much more compact alternative with 2,971,952 parameters and 0.23 Giga fixed-point Multiply-Accumulate operations per Second (GMACS). Consequently, MobileNetV3 offers competitive output quality at the cost of lower computational performance, supported by values of 19.50 and 72.928 for the BLEU-4 and Top-5 accuracy, respectively. Finally, when testing vision transformer (ViT) and data-efficient image transformer (DeiT) models to replace the convolutional component of the architecture, DeiT achieved an improvement over ViT, obtaining a value of 34.44 in the BLEU-4 metric.}
}
This paper focuses on visual attention, a state-of-the-art approach for image captioning tasks within the computer vision research area. We study the impact that different hyperparameter configurations have on an encoder-decoder visual attention architecture in terms of efficiency. Results show that the correct selection of both the cost function and the gradient-based optimizer can significantly impact the captioning results. Our system considers the cross-entropy, Kullback-Leibler divergence, mean squared error, and negative log-likelihood loss functions, as well as the adaptive moment estimation (Adam), AdamW, RMSprop, stochastic gradient descent, and Adadelta optimizers. Experimentation shows that the combination of cross-entropy with Adam is the best alternative, returning a Top-5 accuracy of 73.092 and a BLEU-4 score of 20.10. Furthermore, a comparative analysis of alternative convolutional architectures evaluated their performance as the encoder. Our results show that ResNext-101 stands out with a Top-5 accuracy of 73.128 and a BLEU-4 of 19.80, positioning itself as the best option when looking for optimum captioning quality. However, MobileNetV3 proved to be a much more compact alternative with 2,971,952 parameters and 0.23 Giga fixed-point Multiply-Accumulate operations per Second (GMACS). Consequently, MobileNetV3 offers competitive output quality at the cost of lower computational performance, supported by values of 19.50 and 72.928 for the BLEU-4 and Top-5 accuracy, respectively. Finally, when testing vision transformer (ViT) and data-efficient image transformer (DeiT) models to replace the convolutional component of the architecture, DeiT achieved an improvement over ViT, obtaining a value of 34.44 in the BLEU-4 metric.
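For readers who want a concrete picture of the loss-function/optimizer sweep and the interchangeable encoder backbones the abstract describes, the short PyTorch sketch below shows how such a grid could be wired up. It is a minimal illustration under assumed settings: the learning rates, the specific torchvision model variants, and the suggestion of timm as a DeiT source are assumptions for the example, not the authors' released code or hyperparameters.

# Minimal sketch (not the authors' code) of the cost-function / optimizer grid and the
# candidate encoder backbones described in the abstract. Learning rates and model
# variants are illustrative assumptions.
import torch
import torch.nn as nn
import torchvision.models as models

# The four cost functions compared in the paper.
loss_functions = {
    "cross_entropy": nn.CrossEntropyLoss(),
    "kl_divergence": nn.KLDivLoss(reduction="batchmean"),
    "mse": nn.MSELoss(),
    "nll": nn.NLLLoss(),
}

# The five gradient-based optimizers compared in the paper; construction is deferred
# until a model's parameters are available. Learning rates here are assumed values.
optimizer_factories = {
    "adam": lambda params: torch.optim.Adam(params, lr=1e-4),
    "adamw": lambda params: torch.optim.AdamW(params, lr=1e-4),
    "rmsprop": lambda params: torch.optim.RMSprop(params, lr=1e-4),
    "sgd": lambda params: torch.optim.SGD(params, lr=1e-2, momentum=0.9),
    "adadelta": lambda params: torch.optim.Adadelta(params),
}

def build_encoder(name: str) -> nn.Module:
    """Instantiate a candidate encoder backbone from torchvision (ViT requires a
    recent torchvision). DeiT is not in torchvision; it is commonly loaded through
    the timm library, e.g. timm.create_model("deit_base_patch16_224")."""
    if name == "resnext101":
        return models.resnext101_32x8d()
    if name == "mobilenet_v3":
        return models.mobilenet_v3_large()
    if name == "vit":
        return models.vit_b_16()
    raise ValueError(f"unknown encoder: {name}")

# Enumerate the grid for one encoder. In a real experiment the classification head
# would be removed, the features fed to an attention-based caption decoder, and each
# configuration trained and scored with BLEU-4 and Top-5 accuracy (all omitted here).
encoder = build_encoder("resnext101")
for loss_name, criterion in loss_functions.items():
    for opt_name, make_optimizer in optimizer_factories.items():
        optimizer = make_optimizer(encoder.parameters())
        print(f"configured: loss={loss_name}, optimizer={opt_name}")

A complete reproduction would also require the visual-attention caption decoder, the captioning dataset, and the training and evaluation loops, none of which are shown in this sketch.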