A reproduction of Apple's bi-directional LSTM models for language identification in short strings. Toftrup, M., Asger Sørensen, S., Ciosici, M. R., & Assent, I. In Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop, pages 36–42, Online, April, 2021. Association for Computational Linguistics. Paper doi abstract bibtex Language Identification is the task of identifying a document's language. For applications like automatic spell checker selection, language identification must use very short strings such as text message fragments. In this work, we reproduce a language identification architecture that Apple briefly sketched in a blog post. We confirm the bi-LSTM model's performance and find that it outperforms current open-source language identifiers. We further find that its language identification mistakes are due to confusion between related languages.
% Conference paper from the EACL 2021 Student Research Workshop proceedings (ACL Anthology id 2021.eacl-srw.6).
% In the title, the proper noun "Apple" and the acronym "LSTM" are brace-protected so that
% bibliography styles which sentence-case titles keep their capitalisation.
@inproceedings{toftrup-etal-2021-reproduction,
  abstract = {Language Identification is the task of identifying a document{'}s language. For applications like automatic spell checker selection, language identification must use very short strings such as text message fragments. In this work, we reproduce a language identification architecture that Apple briefly sketched in a blog post. We confirm the bi-LSTM model{'}s performance and find that it outperforms current open-source language identifiers. We further find that its language identification mistakes are due to confusion between related languages.},
  address = {Online},
  author = {Toftrup, Mads and Asger S{\o}rensen, S{\o}ren and Ciosici, Manuel R. and Assent, Ira},
  booktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},
  doi = {10.18653/v1/2021.eacl-srw.6},
  month = apr,
  pages = {36--42},
  publisher = {Association for Computational Linguistics},
  title = {A reproduction of {Apple's} bi-directional {LSTM} models for language identification in short strings},
  url = {https://aclanthology.org/2021.eacl-srw.6},
  year = {2021},
  bdsk-url-1 = {https://aclanthology.org/2021.eacl-srw.6},
  bdsk-url-2 = {https://doi.org/10.18653/v1/2021.eacl-srw.6}
}
Downloads: 0
{"_id":"cJZvwcoKkBv89yvHb","bibbaseid":"toftrup-asgersrensen-ciosici-assent-areproductionofapplesbidirectionallstmmodelsforlanguageidentificationinshortstrings-2021","author_short":["Toftrup, M.","Asger Sørensen, S.","Ciosici, M. R.","Assent, I."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"Language Identification is the task of identifying a document's language. For applications like automatic spell checker selection, language identification must use very short strings such as text message fragments. In this work, we reproduce a language identification architecture that Apple briefly sketched in a blog post. We confirm the bi-LSTM model's performance and find that it outperforms current open-source language identifiers. We further find that its language identification mistakes are due to confusion between related languages.","address":"Online","author":[{"propositions":[],"lastnames":["Toftrup"],"firstnames":["Mads"],"suffixes":[]},{"propositions":[],"lastnames":["Asger","Sørensen"],"firstnames":["Søren"],"suffixes":[]},{"propositions":[],"lastnames":["Ciosici"],"firstnames":["Manuel","R."],"suffixes":[]},{"propositions":[],"lastnames":["Assent"],"firstnames":["Ira"],"suffixes":[]}],"booktitle":"Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop","doi":"10.18653/v1/2021.eacl-srw.6","month":"April","pages":"36–42","publisher":"Association for Computational Linguistics","title":"A reproduction of Apple's bi-directional LSTM models for language identification in short strings","url":"https://aclanthology.org/2021.eacl-srw.6","year":"2021","bdsk-url-1":"https://aclanthology.org/2021.eacl-srw.6","bdsk-url-2":"https://doi.org/10.18653/v1/2021.eacl-srw.6","bibtex":"@inproceedings{toftrup-etal-2021-reproduction,\n\tabstract = {Language Identification is the task of identifying a document{'}s language. 
For applications like automatic spell checker selection, language identification must use very short strings such as text message fragments. In this work, we reproduce a language identification architecture that Apple briefly sketched in a blog post. We confirm the bi-LSTM model{'}s performance and find that it outperforms current open-source language identifiers. We further find that its language identification mistakes are due to confusion between related languages.},\n\taddress = {Online},\n\tauthor = {Toftrup, Mads and Asger S{\\o}rensen, S{\\o}ren and Ciosici, Manuel R. and Assent, Ira},\n\tbooktitle = {Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Student Research Workshop},\n\tdoi = {10.18653/v1/2021.eacl-srw.6},\n\tmonth = apr,\n\tpages = {36--42},\n\tpublisher = {Association for Computational Linguistics},\n\ttitle = {A reproduction of Apple{'}s bi-directional {LSTM} models for language identification in short strings},\n\turl = {https://aclanthology.org/2021.eacl-srw.6},\n\tyear = {2021},\n\tbdsk-url-1 = {https://aclanthology.org/2021.eacl-srw.6},\n\tbdsk-url-2 = {https://doi.org/10.18653/v1/2021.eacl-srw.6}}\n\n","author_short":["Toftrup, M.","Asger Sørensen, S.","Ciosici, M. R.","Assent, I."],"bibbaseid":"toftrup-asgersrensen-ciosici-assent-areproductionofapplesbidirectionallstmmodelsforlanguageidentificationinshortstrings-2021","role":"author","urls":{"Paper":"https://aclanthology.org/2021.eacl-srw.6"},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://bibbase.org/f/rgWMwNyg47s6MAuEJ/manuelc-2023.bib","dataSources":["bNHGR4jWCTLTnHxvM"],"keywords":[],"search_terms":["reproduction","apple","directional","lstm","models","language","identification","short","strings","toftrup","asger sørensen","ciosici","assent"],"title":"A reproduction of Apple's bi-directional LSTM models for language identification in short strings","year":2021}