TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models. Li, M., Lv, T., Chen, J., Cui, L., Lu, Y., Florencio, D., Zhang, C., Li, Z., & Wei, F. September 2022. arXiv:2109.10282 [cs].

Abstract: Text recognition is a long-standing research problem for document digitalization. Existing approaches are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments show that the TrOCR model outperforms the current state-of-the-art models on the printed, handwritten and scene text recognition tasks. The TrOCR models and code are publicly available at https://aka.ms/trocr.
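As a minimal usage sketch (not from the paper itself): the released TrOCR checkpoints can be run through the Hugging Face transformers library, whose TrOCRProcessor and VisionEncoderDecoderModel classes pair the image-Transformer encoder with the wordpiece-level text decoder described in the abstract. The checkpoint name microsoft/trocr-base-handwritten is one of the published models; the input path line.png is a placeholder.

from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel

# The processor bundles the image feature extractor and wordpiece tokenizer;
# the model is an image-Transformer encoder with a text-Transformer decoder.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

# Read a single text-line image (placeholder path) and normalize to RGB.
image = Image.open("line.png").convert("RGB")

# Preprocess into pixel values, autoregressively generate wordpiece ids,
# then decode the ids back into a text string.
pixel_values = processor(images=image, return_tensors="pt").pixel_values
generated_ids = model.generate(pixel_values)
text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(text)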
@misc{li_trocr_2022,
title = {{TrOCR}: {Transformer}-based {Optical} {Character} {Recognition} with {Pre}-trained {Models}},
shorttitle = {{TrOCR}},
url = {http://arxiv.org/abs/2109.10282},
doi = {10.48550/arXiv.2109.10282},
abstract = {Text recognition is a long-standing research problem for document digitalization. Existing approaches are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments show that the TrOCR model outperforms the current state-of-the-art models on the printed, handwritten and scene text recognition tasks. The TrOCR models and code are publicly available at \url{https://aka.ms/trocr}.},
urldate = {2023-05-11},
publisher = {arXiv},
author = {Li, Minghao and Lv, Tengchao and Chen, Jingye and Cui, Lei and Lu, Yijuan and Florencio, Dinei and Zhang, Cha and Li, Zhoujun and Wei, Furu},
month = sep,
year = {2022},
note = {arXiv:2109.10282 [cs]},
keywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Remember},
}
{"_id":"mqQkuQFnNBdExqDoh","bibbaseid":"li-lv-chen-cui-lu-florencio-zhang-li-etal-trocrtransformerbasedopticalcharacterrecognitionwithpretrainedmodels-2022","author_short":["Li, M.","Lv, T.","Chen, J.","Cui, L.","Lu, Y.","Florencio, D.","Zhang, C.","Li, Z.","Wei, F."],"bibdata":{"bibtype":"misc","type":"misc","title":"TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models","shorttitle":"TrOCR","url":"http://arxiv.org/abs/2109.10282","doi":"10.48550/arXiv.2109.10282","abstract":"Text recognition is a long-standing research problem for document digitalization. Existing approaches are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments show that the TrOCR model outperforms the current state-of-the-art models on the printed, handwritten and scene text recognition tasks. The TrOCR models and code are publicly available at \\url\\https://aka.ms/trocr\\.","urldate":"2023-05-11","publisher":"arXiv","author":[{"propositions":[],"lastnames":["Li"],"firstnames":["Minghao"],"suffixes":[]},{"propositions":[],"lastnames":["Lv"],"firstnames":["Tengchao"],"suffixes":[]},{"propositions":[],"lastnames":["Chen"],"firstnames":["Jingye"],"suffixes":[]},{"propositions":[],"lastnames":["Cui"],"firstnames":["Lei"],"suffixes":[]},{"propositions":[],"lastnames":["Lu"],"firstnames":["Yijuan"],"suffixes":[]},{"propositions":[],"lastnames":["Florencio"],"firstnames":["Dinei"],"suffixes":[]},{"propositions":[],"lastnames":["Zhang"],"firstnames":["Cha"],"suffixes":[]},{"propositions":[],"lastnames":["Li"],"firstnames":["Zhoujun"],"suffixes":[]},{"propositions":[],"lastnames":["Wei"],"firstnames":["Furu"],"suffixes":[]}],"month":"September","year":"2022","note":"arXiv:2109.10282 [cs]","keywords":"Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Remember","bibtex":"@misc{li_trocr_2022,\n\ttitle = {{TrOCR}: {Transformer}-based {Optical} {Character} {Recognition} with {Pre}-trained {Models}},\n\tshorttitle = {{TrOCR}},\n\turl = {http://arxiv.org/abs/2109.10282},\n\tdoi = {10.48550/arXiv.2109.10282},\n\tabstract = {Text recognition is a long-standing research problem for document digitalization. Existing approaches are usually built based on CNN for image understanding and RNN for char-level text generation. In addition, another language model is usually needed to improve the overall accuracy as a post-processing step. In this paper, we propose an end-to-end text recognition approach with pre-trained image Transformer and text Transformer models, namely TrOCR, which leverages the Transformer architecture for both image understanding and wordpiece-level text generation. The TrOCR model is simple but effective, and can be pre-trained with large-scale synthetic data and fine-tuned with human-labeled datasets. Experiments show that the TrOCR model outperforms the current state-of-the-art models on the printed, handwritten and scene text recognition tasks. 
The TrOCR models and code are publicly available at {\\textbackslash}url\\{https://aka.ms/trocr\\}.},\n\turldate = {2023-05-11},\n\tpublisher = {arXiv},\n\tauthor = {Li, Minghao and Lv, Tengchao and Chen, Jingye and Cui, Lei and Lu, Yijuan and Florencio, Dinei and Zhang, Cha and Li, Zhoujun and Wei, Furu},\n\tmonth = sep,\n\tyear = {2022},\n\tnote = {arXiv:2109.10282 [cs]},\n\tkeywords = {Computer Science - Computation and Language, Computer Science - Computer Vision and Pattern Recognition, Remember},\n}\n\n","author_short":["Li, M.","Lv, T.","Chen, J.","Cui, L.","Lu, Y.","Florencio, D.","Zhang, C.","Li, Z.","Wei, F."],"key":"li_trocr_2022","id":"li_trocr_2022","bibbaseid":"li-lv-chen-cui-lu-florencio-zhang-li-etal-trocrtransformerbasedopticalcharacterrecognitionwithpretrainedmodels-2022","role":"author","urls":{"Paper":"http://arxiv.org/abs/2109.10282"},"keyword":["Computer Science - Computation and Language","Computer Science - Computer Vision and Pattern Recognition","Remember"],"metadata":{"authorlinks":{}}},"bibtype":"misc","biburl":"https://bibbase.org/network/files/LQTAQ9TepEbwDwNyw","dataSources":["7PTj9uCrffbdGYthn"],"keywords":["computer science - computation and language","computer science - computer vision and pattern recognition","remember"],"search_terms":["trocr","transformer","based","optical","character","recognition","pre","trained","models","li","lv","chen","cui","lu","florencio","zhang","li","wei"],"title":"TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models","year":2022}