Castro, R., Pineda, I., Lim, W., and Morocho-Cayamcela, M.E. Deep Learning Approaches Based on Transformer Architectures for Image Captioning Tasks. IEEE Access, vol. 10, 2022.
@article{castro2022deep,
  title = {Deep Learning Approaches Based on Transformer Architectures for Image Captioning Tasks},
  author = {Castro, R. and Pineda, I. and Lim, W. and Morocho-Cayamcela, M.E.},
  journal = {IEEE Access},
  volume = {10},
  year = {2022},
  doi = {10.1109/ACCESS.2022.3161428},
  keywords = {Image captioning, artificial intelligence, computer vision, supervised learning, visual attention},
  abstract = {This paper focuses on visual attention, a state-of-the-art approach for image captioning tasks within the computer vision research area. We study the impact that different hyperparameter configurations have on an encoder-decoder visual attention architecture in terms of efficiency. Results show that the correct selection of both the cost function and the gradient-based optimizer can significantly impact the captioning results. Our system considers the cross-entropy, Kullback-Leibler divergence, mean squared error, and negative log-likelihood loss functions, as well as the adaptive moment estimation (Adam), AdamW, RMSprop, stochastic gradient descent, and Adadelta optimizers. Experimentation shows that the combination of cross-entropy with Adam is the best alternative, returning a Top-5 accuracy of 73.092 and a BLEU-4 score of 20.10. Furthermore, a comparative analysis of alternative convolutional architectures evaluated their performance as the encoder. Our results show that ResNext-101 stands out with a Top-5 accuracy of 73.128 and a BLEU-4 of 19.80, positioning itself as the best option when looking for optimum captioning quality. However, MobileNetV3 proved to be a much more compact alternative with 2,971,952 parameters and 0.23 Giga fixed-point Multiply-Accumulate operations per Second (GMACS). Consequently, MobileNetV3 offers competitive output quality at the cost of lower computational performance, supported by values of 19.50 and 72.928 for the BLEU-4 and Top-5 accuracy, respectively. Finally, when testing vision transformer (ViT) and data-efficient image transformer (DeiT) models to replace the convolutional component of the architecture, DeiT achieved an improvement over ViT, obtaining a value of 34.44 in the BLEU-4 metric.}
}
This paper focuses on visual attention, a state-of-the-art approach for image captioning tasks within the computer vision research area. We study the impact that different hyperparameter configurations have on an encoder-decoder visual attention architecture in terms of efficiency. Results show that the correct selection of both the cost function and the gradient-based optimizer can significantly impact the captioning results. Our system considers the cross-entropy, Kullback-Leibler divergence, mean squared error, and negative log-likelihood loss functions, as well as the adaptive moment estimation (Adam), AdamW, RMSprop, stochastic gradient descent, and Adadelta optimizers. Experimentation shows that the combination of cross-entropy with Adam is the best alternative, returning a Top-5 accuracy of 73.092 and a BLEU-4 score of 20.10. Furthermore, a comparative analysis of alternative convolutional architectures evaluated their performance as the encoder. Our results show that ResNext-101 stands out with a Top-5 accuracy of 73.128 and a BLEU-4 of 19.80, positioning itself as the best option when looking for optimum captioning quality. However, MobileNetV3 proved to be a much more compact alternative with 2,971,952 parameters and 0.23 Giga fixed-point Multiply-Accumulate operations per Second (GMACS). Consequently, MobileNetV3 offers competitive output quality at the cost of lower computational performance, supported by values of 19.50 and 72.928 for the BLEU-4 and Top-5 accuracy, respectively. Finally, when testing vision transformer (ViT) and data-efficient image transformer (DeiT) models to replace the convolutional component of the architecture, DeiT achieved an improvement over ViT, obtaining a value of 34.44 in the BLEU-4 metric.
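For readers who want a concrete picture of the loss-function/optimizer sweep and the interchangeable encoder backbones the abstract describes, the short PyTorch sketch below shows how such a grid could be wired up. It is a minimal illustration under assumed settings: the learning rates, the specific torchvision model variants, and the suggestion of timm as a DeiT source are assumptions for the example, not the authors' released code or hyperparameters.

# Minimal sketch (not the authors' code) of the cost-function / optimizer grid and the
# candidate encoder backbones described in the abstract. Learning rates and model
# variants are illustrative assumptions.
import torch
import torch.nn as nn
import torchvision.models as models

# The four cost functions compared in the paper.
loss_functions = {
    "cross_entropy": nn.CrossEntropyLoss(),
    "kl_divergence": nn.KLDivLoss(reduction="batchmean"),
    "mse": nn.MSELoss(),
    "nll": nn.NLLLoss(),
}

# The five gradient-based optimizers compared in the paper; construction is deferred
# until a model's parameters are available. Learning rates here are assumed values.
optimizer_factories = {
    "adam": lambda params: torch.optim.Adam(params, lr=1e-4),
    "adamw": lambda params: torch.optim.AdamW(params, lr=1e-4),
    "rmsprop": lambda params: torch.optim.RMSprop(params, lr=1e-4),
    "sgd": lambda params: torch.optim.SGD(params, lr=1e-2, momentum=0.9),
    "adadelta": lambda params: torch.optim.Adadelta(params),
}

def build_encoder(name: str) -> nn.Module:
    """Instantiate a candidate encoder backbone from torchvision (ViT requires a
    recent torchvision). DeiT is not in torchvision; it is commonly loaded through
    the timm library, e.g. timm.create_model("deit_base_patch16_224")."""
    if name == "resnext101":
        return models.resnext101_32x8d()
    if name == "mobilenet_v3":
        return models.mobilenet_v3_large()
    if name == "vit":
        return models.vit_b_16()
    raise ValueError(f"unknown encoder: {name}")

# Enumerate the grid for one encoder. In a real experiment the classification head
# would be removed, the features fed to an attention-based caption decoder, and each
# configuration trained and scored with BLEU-4 and Top-5 accuracy (all omitted here).
encoder = build_encoder("resnext101")
for loss_name, criterion in loss_functions.items():
    for opt_name, make_optimizer in optimizer_factories.items():
        optimizer = make_optimizer(encoder.parameters())
        print(f"configured: loss={loss_name}, optimizer={opt_name}")

A complete reproduction would also require the visual-attention caption decoder, the captioning dataset, and the training and evaluation loops, none of which are shown in this sketch.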