var bibbase_data = {"data":"\"Loading..\"\n\n
\n\n \n\n \n\n \n \n\n \n\n \n \n\n \n\n \n
\n generated by\n \n \"bibbase.org\"\n\n \n
\n \n\n
\n\n \n\n\n
\n\n Excellent! Next you can\n create a new website with this list, or\n embed it in an existing web page by copying & pasting\n any of the following snippets.\n\n
\n JavaScript\n (easiest)\n
\n \n <script src=\"https://bibbase.org/show?bib=http%3A%2F%2Fwww.cardamom-project.org%2Fwp-content%2Fuploads%2F2023%2F11%2Fcardamom_bib.txt&commas=true&noBootstrap=1&jsonp=1&jsonp=1\"></script>\n \n
\n\n PHP\n
\n \n <?php\n $contents = file_get_contents(\"https://bibbase.org/show?bib=http%3A%2F%2Fwww.cardamom-project.org%2Fwp-content%2Fuploads%2F2023%2F11%2Fcardamom_bib.txt&commas=true&noBootstrap=1&jsonp=1\");\n print_r($contents);\n ?>\n \n
\n\n iFrame\n (not recommended)\n
\n \n <iframe src=\"https://bibbase.org/show?bib=http%3A%2F%2Fwww.cardamom-project.org%2Fwp-content%2Fuploads%2F2023%2F11%2Fcardamom_bib.txt&commas=true&noBootstrap=1&jsonp=1\"></iframe>\n \n
\n\n

\n For more details see the documention.\n

\n
\n
\n\n
\n\n This is a preview! To use this list on your own web site\n or create a new web site from it,\n create a free account. The file will be added\n and you will be able to edit it in the File Manager.\n We will show you instructions once you've created your account.\n
\n\n
\n\n

To the site owner:

\n\n

Action required! Mendeley is changing its\n API. In order to keep using Mendeley with BibBase past April\n 14th, you need to:\n

    \n
  1. renew the authorization for BibBase on Mendeley, and
  2. \n
  3. update the BibBase URL\n in your page the same way you did when you initially set up\n this page.\n
  4. \n
\n

\n\n

\n \n \n Fix it now\n

\n
\n\n
\n\n\n
\n \n \n
\n
\n  \n 2023\n \n \n (4)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n The Cardamom Workbench for Historical and Under-Resourced Languages.\n \n \n \n \n\n\n \n Doyle, A., Fransen, T., Stearns, B., McCrae, J. P., Dereza, O., & Rani, P.\n\n\n \n\n\n\n In Carvalho, S., Khan, A. F., Anić, A. O., Spahiu, B., Gracia, J., McCrae, J. P., Gromann, D., Heinisch, B., & Salgado, A., editor(s), Proceedings of the 4th Conference on Language, Data and Knowledge, pages 109–120, Vienna, Austria, September 2023. NOVA CLUNL, Portugal\n \n\n\n\n
\n\n\n\n \n \n \"ThePaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 11 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{doyle-etal-2023-cardamom,\n    title = "The Cardamom Workbench for Historical and Under-Resourced Languages",\n    author = "Doyle, Adrian  and\n      Fransen, Theodorus  and\n      Stearns, Bernardo  and\n      McCrae, John P.  and\n      Dereza, Oksana  and\n      Rani, Priya",\n    editor = "Carvalho, Sara  and\n      Khan, Anas Fahad  and\n      Ani{\\'c}, Ana Ostro{\\v{s}}ki  and\n      Spahiu, Blerina  and\n      Gracia, Jorge  and\n      McCrae, John P.  and\n      Gromann, Dagmar  and\n      Heinisch, Barbara  and\n      Salgado, Ana",\n    booktitle = "Proceedings of the 4th Conference on Language, Data and Knowledge",\n    month = sep,\n    year = "2023",\n    address = "Vienna, Austria",\n    publisher = "NOVA CLUNL, Portugal",\n    url = "https://aclanthology.org/2023.ldk-1.10",\n    pages = "109--120",\n}\n\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Do not Trust the Experts: How the Lack of Standard Complicates NLP for Historical Irish.\n \n \n \n \n\n\n \n Dereza, O., Fransen, T., & Mccrae, J. P.\n\n\n \n\n\n\n In The Fourth Workshop on Insights from Negative Results in NLP, pages 82–87, Dubrovnik, Croatia, May 2023. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"DoPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{dereza-etal-2023-trust,\n    title = "Do not Trust the Experts: How the Lack of Standard Complicates {NLP} for Historical {I}rish",\n    author = "Dereza, Oksana  and\n      Fransen, Theodorus  and\n      Mccrae, John P.",\n    booktitle = "The Fourth Workshop on Insights from Negative Results in NLP",\n    month = may,\n    year = "2023",\n    address = "Dubrovnik, Croatia",\n    publisher = "Association for Computational Linguistics",\n    url = "https://aclanthology.org/2023.insights-1.10",\n    pages = "82--87",\n    abstract = "In this paper, we describe how we unearthed some fundamental problems while building an analogy dataset modelled on BATS (Gladkova et al., 2016) to evaluate historical Irish embeddings on their ability to detect orthographic, morphological and semantic similarity.The performance of our models in the analogy task was extremely poor regardless of the architecture, hyperparameters and evaluation metrics, while the qualitative evaluation revealed positive tendencies. We argue that low agreement between field experts on fundamental lexical and orthographic issues, and the lack of a unified editorial standard in available resources make it impossible to build reliable evaluation datasets for computational models and obtain interpretable results. We emphasise the need for such a standard, particularly for NLP applications, and prompt Celticists and historical linguists to engage in further discussion. We would also like to draw NLP scholars{'} attention to the role of data and its (extra)linguistic properties in testing new models, technologies and evaluation scenarios.",\n}\n\n
\n
\n\n\n
\n In this paper, we describe how we unearthed some fundamental problems while building an analogy dataset modelled on BATS (Gladkova et al., 2016) to evaluate historical Irish embeddings on their ability to detect orthographic, morphological and semantic similarity.The performance of our models in the analogy task was extremely poor regardless of the architecture, hyperparameters and evaluation metrics, while the qualitative evaluation revealed positive tendencies. We argue that low agreement between field experts on fundamental lexical and orthographic issues, and the lack of a unified editorial standard in available resources make it impossible to build reliable evaluation datasets for computational models and obtain interpretable results. We emphasise the need for such a standard, particularly for NLP applications, and prompt Celticists and historical linguists to engage in further discussion. We would also like to draw NLP scholars' attention to the role of data and its (extra)linguistic properties in testing new models, technologies and evaluation scenarios.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Temporal Domain Adaptation for Historical Irish.\n \n \n \n \n\n\n \n Dereza, O., Fransen, T., & Mccrae, J. P.\n\n\n \n\n\n\n In Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023), pages 55–66, Dubrovnik, Croatia, May 2023. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"TemporalPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{dereza-etal-2023-temporal,\n    title = "Temporal Domain Adaptation for Historical {I}rish",\n    author = "Dereza, Oksana  and\n      Fransen, Theodorus  and\n      Mccrae, John P.",\n    booktitle = "Tenth Workshop on NLP for Similar Languages, Varieties and Dialects (VarDial 2023)",\n    month = may,\n    year = "2023",\n    address = "Dubrovnik, Croatia",\n    publisher = "Association for Computational Linguistics",\n    url = "https://aclanthology.org/2023.vardial-1.6",\n    pages = "55--66",\n    abstract = "The digitisation of historical texts has provided new horizons for NLP research, but such data also presents a set of challenges, including scarcity and inconsistency. The lack of editorial standard during digitisation exacerbates these difficulties.This study explores the potential for temporal domain adaptation in Early Modern Irish and pre-reform Modern Irish data. We describe two experiments carried out on the book subcorpus of the Historical Irish Corpus, which includes Early Modern Irish and pre-reform Modern Irish texts from 1581 to 1926. We also propose a simple orthographic normalisation method for historical Irish that reduces the type-token ratio by 21.43{\\%} on average in our data.The results demonstrate that the use of out-of-domain data significantly improves a language model{'}s performance. Providing a model with additional input from another historical stage of the language improves its quality by 12.49{\\%} on average on non-normalised texts and by 27.02{\\%} on average on normalised (demutated) texts. Most notably, using only out-of-domain data for both pre-training and training stages allowed for up to 86.81{\\%} of the baseline model quality on non-normalised texts and up to 95.68{\\%} on normalised texts without any target domain data. Additionally, we investigate the effect of temporal distance between the training and test data. 
The hypothesis that there is a positive correlation between performance and temporal proximity of training and test data has been validated, which manifests best in normalised data. Expanding this approach even further back, to Middle and Old Irish, and testing it on other languages is a further research direction.",\n}\n\n
\n
\n\n\n
\n The digitisation of historical texts has provided new horizons for NLP research, but such data also presents a set of challenges, including scarcity and inconsistency. The lack of editorial standard during digitisation exacerbates these difficulties.This study explores the potential for temporal domain adaptation in Early Modern Irish and pre-reform Modern Irish data. We describe two experiments carried out on the book subcorpus of the Historical Irish Corpus, which includes Early Modern Irish and pre-reform Modern Irish texts from 1581 to 1926. We also propose a simple orthographic normalisation method for historical Irish that reduces the type-token ratio by 21.43% on average in our data.The results demonstrate that the use of out-of-domain data significantly improves a language model's performance. Providing a model with additional input from another historical stage of the language improves its quality by 12.49% on average on non-normalised texts and by 27.02% on average on normalised (demutated) texts. Most notably, using only out-of-domain data for both pre-training and training stages allowed for up to 86.81% of the baseline model quality on non-normalised texts and up to 95.68% on normalised texts without any target domain data. Additionally, we investigate the effect of temporal distance between the training and test data. The hypothesis that there is a positive correlation between performance and temporal proximity of training and test data has been validated, which manifests best in normalised data. Expanding this approach even further back, to Middle and Old Irish, and testing it on other languages is a further research direction.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Findings of the SIGTYP 2023 Shared task on Cognate and Derivative Detection For Low-Resourced Languages.\n \n \n \n \n\n\n \n Rani, P., Goswami, K., Doyle, A., Fransen, T., Stearns, B., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP, pages 126–131, Dubrovnik, Croatia, May 2023. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"FindingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{rani-etal-2023-findings,\n    title = "Findings of the {SIGTYP} 2023 Shared task on Cognate and Derivative Detection For Low-Resourced Languages",\n    author = "Rani, Priya  and\n      Goswami, Koustava  and\n      Doyle, Adrian  and\n      Fransen, Theodorus  and\n      Stearns, Bernardo  and\n      McCrae, John P.",\n    booktitle = "Proceedings of the 5th Workshop on Research in Computational Linguistic Typology and Multilingual NLP",\n    month = may,\n    year = "2023",\n    address = "Dubrovnik, Croatia",\n    publisher = "Association for Computational Linguistics",\n    url = "https://aclanthology.org/2023.sigtyp-1.13",\n    pages = "126--131",\n    abstract = "This paper describes the structure and findings of the SIGTYP 2023 shared task on cognate and derivative detection for low-resourced languages, broken down into a supervised and unsupervised sub-task. The participants were asked to submit the test data{'}s final prediction. A total of nine teams registered for the shared task where seven teams registered for both sub-tasks. Only two participants ended up submitting system descriptions, with only one submitting systems for both sub-tasks. While all systems show a rather promising performance, all could be within the baseline score for the supervised sub-task. However, the system submitted for the unsupervised sub-task outperforms the baseline score.",\n}\n\n
\n
\n\n\n
\n This paper describes the structure and findings of the SIGTYP 2023 shared task on cognate and derivative detection for low-resourced languages, broken down into a supervised and unsupervised sub-task. The participants were asked to submit the test data's final prediction. A total of nine teams registered for the shared task where seven teams registered for both sub-tasks. Only two participants ended up submitting system descriptions, with only one submitting systems for both sub-tasks. While all systems show a rather promising performance, all could be within the baseline score for the supervised sub-task. However, the system submitted for the unsupervised sub-task outperforms the baseline score.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2022\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Toward an Integrative Approach for Making Sense Distinctions.\n \n \n \n \n\n\n \n McCrae, J. P., Fransen, T., Ahmadi, S., Buitelaar, P., & Goswami, K.\n\n\n \n\n\n\n Frontiers in Artificial Intelligence, 5. 2022.\n \n\n\n\n
\n\n\n\n \n \n \"TowardPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{10.3389/frai.2022.745626,  \nauthor={McCrae, John P. and Fransen, Theodorus and Ahmadi, Sina and Buitelaar, Paul and Goswami, Koustava},   \ntitle={Toward an Integrative Approach for Making Sense Distinctions},  \njournal={Frontiers in Artificial Intelligence},\nvolume={5},           \nyear={2022},      \nurl={https://www.frontiersin.org/articles/10.3389/frai.2022.745626},       \ndoi={10.3389/frai.2022.745626},      \nissn={2624-8212},   \nabstract={Word senses are the fundamental unit of description in lexicography, yet it is rarely the case that different dictionaries reach any agreement on the number and definition of senses in a language. With the recent rise in natural language processing and other computational approaches there is an increasing demand for quantitatively validated sense catalogues of words, yet no consensus methodology exists. In this paper, we look at four main approaches to making sense distinctions: formal, cognitive, distributional, and intercultural and examine the strengths and weaknesses of each approach. We then consider how these may be combined into a single sound methodology. We illustrate this by examining two English words, “wing” and “fish,” using existing resources for each of these four approaches and illustrate the weaknesses of each. We then look at the impact of such an integrated method and provide some future perspectives on the research that is necessary to reach a principled method for making sense distinctions.}\n}\n\n
\n
\n\n\n
\n Word senses are the fundamental unit of description in lexicography, yet it is rarely the case that different dictionaries reach any agreement on the number and definition of senses in a language. With the recent rise in natural language processing and other computational approaches there is an increasing demand for quantitatively validated sense catalogues of words, yet no consensus methodology exists. In this paper, we look at four main approaches to making sense distinctions: formal, cognitive, distributional, and intercultural and examine the strengths and weaknesses of each approach. We then consider how these may be combined into a single sound methodology. We illustrate this by examining two English words, “wing” and “fish,” using existing resources for each of these four approaches and illustrate the weaknesses of each. We then look at the impact of such an integrated method and provide some future perspectives on the research that is necessary to reach a principled method for making sense distinctions.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2021\n \n \n (9)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Mufin: Enriching Semantic Understanding of Sentence Embedding using Dual Tune Framework.\n \n \n \n\n\n \n Goswami, K., Dutta, S., & Assem, H.\n\n\n \n\n\n\n In 2021 IEEE International Conference on Big Data (Big Data), pages 2034–2039, 2021. \n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{goswami-etal-2021-mufin,\n  author={Goswami, Koustava and Dutta, Sourav and Assem, Haytham},\n  booktitle={2021 IEEE International Conference on Big Data (Big Data)}, \n  title={Mufin: Enriching Semantic Understanding of Sentence Embedding using Dual Tune Framework}, \n  year={2021},\n  volume={},\n  number={},\n  pages={2034--2039},\n  doi={10.1109/BigData52589.2021.9671614},\n  abstract={With the advancements of Natural Language Understanding (NLU), diverse industrial applications like user intent classification, smart chatbots, sentiment analysis and question answering have be-come a primary paradigm. Transformers-based multi-lingual language models such as XLM have performed significantly well in diverse semantic understanding and classification tasks. However, fine-tuning such large pre-trained architectures is resource and compute intensive, limiting its wide adoption in enterprise environments.We present a novel efficient and light-weight frame-work based on sentence embeddings to obtain enhanced multi-lingual text representations for domain-specific NLU applications. Our framework combines the concepts of up-projection, alignment and meta-embeddings enhancing the textual semantic similarity knowledge of smaller sentence embedding architectures. Extensive experiments on diverse cross-lingual classification tasks showcase the proposed framework to be comparable to state-of-the-art large language models (in mono-lingual and zero-shot settings), even with lesser training and resource requirements.}\n}\n\n
\n
\n\n\n
\n With the advancements of Natural Language Understanding (NLU), diverse industrial applications like user intent classification, smart chatbots, sentiment analysis and question answering have be-come a primary paradigm. Transformers-based multi-lingual language models such as XLM have performed significantly well in diverse semantic understanding and classification tasks. However, fine-tuning such large pre-trained architectures is resource and compute intensive, limiting its wide adoption in enterprise environments.We present a novel efficient and light-weight frame-work based on sentence embeddings to obtain enhanced multi-lingual text representations for domain-specific NLU applications. Our framework combines the concepts of up-projection, alignment and meta-embeddings enhancing the textual semantic similarity knowledge of smaller sentence embedding architectures. Extensive experiments on diverse cross-lingual classification tasks showcase the proposed framework to be comparable to state-of-the-art large language models (in mono-lingual and zero-shot settings), even with lesser training and resource requirements.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Cross-lingual Sentence Embedding using Multi-Task Learning.\n \n \n \n \n\n\n \n Goswami, K., Dutta, S., Assem, H., Fransen, T., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing, pages 9099–9113, Online and Punta Cana, Dominican Republic, November 2021. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"Cross-lingualPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{goswami-etal-2021-cross,\n    title = "Cross-lingual Sentence Embedding using Multi-Task Learning",\n    author = "Goswami, Koustava  and\n      Dutta, Sourav  and\n      Assem, Haytham  and\n      Fransen, Theodorus  and\n      McCrae, John P.",\n    booktitle = "Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing",\n    month = nov,\n    year = "2021",\n    address = "Online and Punta Cana, Dominican Republic",\n    publisher = "Association for Computational Linguistics",\n    url = "https://aclanthology.org/2021.emnlp-main.716",\n    doi = "10.18653/v1/2021.emnlp-main.716",\n    pages = "9099--9113",\n    abstract = "Multilingual sentence embeddings capture rich semantic information not only for measuring similarity between texts but also for catering to a broad range of downstream cross-lingual NLP tasks. State-of-the-art multilingual sentence embedding models require large parallel corpora to learn efficiently, which confines the scope of these models. In this paper, we propose a novel sentence embedding framework based on an unsupervised loss function for generating effective multilingual sentence embeddings, eliminating the need for parallel corpora. We capture semantic similarity and relatedness between sentences using a multi-task loss function for training a dual encoder model mapping different languages onto the same vector space. We demonstrate the efficacy of an unsupervised as well as a weakly supervised variant of our framework on STS, BUCC and Tatoeba benchmark tasks. The proposed unsupervised sentence embedding framework outperforms even supervised state-of-the-art methods for certain under-resourced languages on the Tatoeba dataset and on a monolingual benchmark. Further, we show enhanced zero-shot learning capabilities for more than 30 languages, with the model being trained on only 13 languages. 
Our model can be extended to a wide range of languages from any language family, as it overcomes the requirement of parallel corpora for training.",\n}\n\n
\n
\n\n\n
\n Multilingual sentence embeddings capture rich semantic information not only for measuring similarity between texts but also for catering to a broad range of downstream cross-lingual NLP tasks. State-of-the-art multilingual sentence embedding models require large parallel corpora to learn efficiently, which confines the scope of these models. In this paper, we propose a novel sentence embedding framework based on an unsupervised loss function for generating effective multilingual sentence embeddings, eliminating the need for parallel corpora. We capture semantic similarity and relatedness between sentences using a multi-task loss function for training a dual encoder model mapping different languages onto the same vector space. We demonstrate the efficacy of an unsupervised as well as a weakly supervised variant of our framework on STS, BUCC and Tatoeba benchmark tasks. The proposed unsupervised sentence embedding framework outperforms even supervised state-of-the-art methods for certain under-resourced languages on the Tatoeba dataset and on a monolingual benchmark. Further, we show enhanced zero-shot learning capabilities for more than 30 languages, with the model being trained on only 13 languages. Our model can be extended to a wide range of languages from any language family, as it overcomes the requirement of parallel corpora for training.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Findings of the LoResMT 2021 Shared Task on COVID and Sign Language for Low-Resource Languages.\n \n \n \n \n\n\n \n Ojha, A. K., Liu, C., Kann, K., Ortega, J., Satam, S., & Fransen, T.\n\n\n \n\n\n\n In Proceedings of the 4th Workshop on Technologies for MT of Low Resource Languages, pages 114–123, Boston, MA, August 2021. Association for Machine Translation in the Americas\n \n\n\n\n
\n\n\n\n \n \n \"FindingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 22 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ojha-etal-2021-findings,\n  title        = {Findings of the {LoResMT 2021 Shared Task on COVID and Sign Language for Low-Resource Languages}},\n  author       = {Ojha, Atul Kr. and Liu, Chao-Hong and Kann, Katharina and Ortega, John and Satam, Sheetal and Fransen, Theodorus},\n  year         = 2021,\n  month        = aug,\n  booktitle    = {Proceedings of the 4th Workshop on Technologies for MT of Low Resource Languages},\n  publisher    = {Association for Machine Translation in the Americas},\n  address      = {Boston, MA},\n  pages        = {114--123},\n  url          = {https://aclanthology.org/2021.mtsummit-loresmt.11/},\n  abstract     = {We present the findings of the LoResMT 2021 shared task which focuses on machine translation (MT) of COVID-19 data for both low-resource spoken and sign languages. The organization of this task was conducted as part of the fourth workshop on technologies for machine translation of low resource languages (LoResMT). Parallel corpora is presented and publicly available which includes the following directions: English↔Irish, English↔Marathi, and Taiwanese Sign language↔Traditional Chinese. Training data consists of 8112, 20933 and 128608 segments, respectively. There are additional monolingual data sets for Marathi and English that consist of 21901 segments. The results presented here are based on entries from a total of eight teams. Three teams submitted systems for English↔Irish while five teams submitted systems for English↔Marathi. Unfortunately, there were no systems submissions for the Taiwanese Sign language↔Traditional Chinese task. Maximum system performance was computed using BLEU and follow as 36.0 for English–Irish, 34.6 for Irish–English, 24.2 for English–Marathi, and 31.3 for Marathi–English.}\n}\n\n
\n
\n\n\n
\n We present the findings of the LoResMT 2021 shared task which focuses on machine translation (MT) of COVID-19 data for both low-resource spoken and sign languages. The organization of this task was conducted as part of the fourth workshop on technologies for machine translation of low resource languages (LoResMT). Parallel corpora is presented and publicly available which includes the following directions: English↔Irish, English↔Marathi, and Taiwanese Sign language↔Traditional Chinese. Training data consists of 8112, 20933 and 128608 segments, respectively. There are additional monolingual data sets for Marathi and English that consist of 21901 segments. The results presented here are based on entries from a total of eight teams. Three teams submitted systems for English↔Irish while five teams submitted systems for English↔Marathi. Unfortunately, there were no systems submissions for the Taiwanese Sign language↔Traditional Chinese task. Maximum system performance was computed using BLEU and follow as 36.0 for English–Irish, 34.6 for Irish–English, 24.2 for English–Marathi, and 31.3 for Marathi–English.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n How Computers Can Future-Proof Minority Languages.\n \n \n \n \n\n\n \n Fransen, T., & McCrae, J.\n\n\n \n\n\n\n Cois Coiribe. 2021.\n \n\n\n\n
\n\n\n\n \n \n \"HowPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{fransen-mccrae-2021-cois-coiribe,\n  title        = {How Computers Can Future-Proof Minority Languages},\n  author       = {Fransen, Theodorus and McCrae, John},\n  year         = 2021,\n  journal      = {Cois Coiribe},\n  url          = {https://impact.nuigalway.ie/arts/how-computers-can-future-proof-minority-languages/},\n  date         = {2021-07-29},\n  abstract     = {Dr. Theodorus Fransen &amp;amp; Dr. John McCrae explore how digital language tools can potentially resolve the underrepresentation of minority languages in terms of digital technology and the Web.}\n}\n\n\n
\n
\n\n\n
\n Dr. Theodorus Fransen &amp; Dr. John McCrae explore how digital language tools can potentially resolve the underrepresentation of minority languages in terms of digital technology and the Web.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Enriching a terminology for under-resourced languages using knowledge graphs.\n \n \n \n \n\n\n \n McCrae, J. P., Ojha, A. K., Chakravarthi, B. R., Kelly, I., Buffini, P., Tang, G., Paquin, E., & Locria, M.\n\n\n \n\n\n\n In Proceedings of The Seventh Biennial Conference on Electronic Lexicography, eLex 2021, pages 560–571, July 2021. \n \n\n\n\n
\n\n\n\n \n \n \"EnrichingPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{mccrae-etal-2021-enriching,\n  title        = {{Enriching a terminology for under-resourced languages using knowledge graphs}},\n  author       = {McCrae, John P. and Ojha, Atul Kumar and Chakravarthi, Bharathi Raja and Kelly, Ian and Buffini, Patricia and Tang, Grace and Paquin, Eric and Locria, Manuel},\n  year         = 2021,\n  month        = jul,\n  booktitle    = {Proceedings of The Seventh Biennial Conference on Electronic Lexicography, eLex 2021},\n  pages        = {560--571},\n  url          = {https://elex.link/elex2021/proceedings-download/},\n  affiliation  = {['NUIG', 'NUIG', 'NUIG', 'Dublin City University', 'Dublin City University', 'Translators without Borders', 'Translators without Borders', 'Translators without Borders']},\n  abstract     = {Translated terminology for severely under-resourced languages is a vital tool for aid workers working in humanitarian crises. However there are generally no lexical resources that can be used for this purpose. Translators without Borders (TWB) is a non-profit whose goal is to help get vital information, including developing lexical resources for aid workers. In order to help with the resource construction, TWB has worked with the ADAPT Centre to develop tools to help with the development of their resources for crisis response. In particular, we have enriched these resources by linking with open lexical resources such as WordNet and Wikidata as well as the derivation of a novel extended corpus. In particular, this work has focused on the development of resources for languages useful for aid workers working with Rohingya refugees, namely, Rohingya, Chittagonian, Bengali and Burmese. These languages are all under-resourced and for Rohingya and Chittagonian there are only very limited major lexical resources available. For these languages, we have constructed some of the first corpora resources that will allow automatic construction of lexical resources. 
We have also used the Naisc tool for monolingual dictionary linking in order to connect the existing English parts of the lexical resources with information from WordNet and Wikidata and this has provided a wealth of extra information including images, alternative definitions, translations (in Bengali, Burmese and other languages) as well as many related terms that may guide TWB linguists and terminologists in the process of extending their resources. We have presented these results in an interface allowing the lexicographers to browse through the results extracted from the external resources and select those that they wish to include in their resource. We present results on the quality of the linking inferred by the Naisc system as well as qualitative analysis of the effectiveness of the tool in the development of the TWB glossaries.}\n}\n\n\n
\n
\n\n\n
\n Translated terminology for severely under-resourced languages is a vital tool for aid workers working in humanitarian crises. However there are generally no lexical resources that can be used for this purpose. Translators without Borders (TWB) is a non-profit whose goal is to help get vital information, including developing lexical resources for aid workers. In order to help with the resource construction, TWB has worked with the ADAPT Centre to develop tools to help with the development of their resources for crisis response. In particular, we have enriched these resources by linking with open lexical resources such as WordNet and Wikidata as well as the derivation of a novel extended corpus. In particular, this work has focused on the development of resources for languages useful for aid workers working with Rohingya refugees, namely, Rohingya, Chittagonian, Bengali and Burmese. These languages are all under-resourced and for Rohingya and Chittagonian there are only very limited major lexical resources available. For these languages, we have constructed some of the first corpora resources that will allow automatic construction of lexical resources. We have also used the Naisc tool for monolingual dictionary linking in order to connect the existing English parts of the lexical resources with information from WordNet and Wikidata and this has provided a wealth of extra information including images, alternative definitions, translations (in Bengali, Burmese and other languages) as well as many related terms that may guide TWB linguists and terminologists in the process of extending their resources. We have presented these results in an interface allowing the lexicographers to browse through the results extracted from the external resources and select those that they wish to include in their resource. 
We present results on the quality of the linking inferred by the Naisc system as well as qualitative analysis of the effectiveness of the tool in the development of the TWB glossaries.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Survey of Orthographic Information in Machine Translation.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Rani, P., Arcan, M., & McCrae, J. P.\n\n\n \n\n\n\n SN Computer Science, 2(4). June 2021.\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{chakravarthi-etal-2021-survey,\n  title        = {A Survey of Orthographic Information in Machine Translation},\n  author       = {Chakravarthi, Bharathi Raja and Rani, Priya and Arcan, Mihael and McCrae, John P.},\n  year         = 2021,\n  month        = jun,\n  journal      = {SN Computer Science},\n  publisher    = {Springer Science and Business Media LLC},\n  volume       = 2,\n  number       = 4,\n  doi          = {10.1007/s42979-021-00723-4},\n  issn         = {2661-8907},\n  url          = {https://doi.org/10.1007/s42979-021-00723-4},\n  abstract     = {Machine translation is one of the applications of natural language processing which has been explored in different languages. Recently researchers started paying attention towards machine translation for resource-poor languages and closely related languages. A widespread and underlying problem for these machine translation systems is the linguistic difference and variation in orthographic conventions which causes many issues to traditional approaches. Two languages written in two different orthographies are not easily comparable but orthographic information can also be used to improve the machine translation system. This article offers a survey of research regarding orthography{\\textquoteright}s influence on machine translation of under-resourced languages. It introduces under-resourced languages in terms of machine translation and how orthographic information can be utilised to improve machine translation. We describe previous work in this area, discussing what underlying assumptions were made, and showing how orthographic knowledge improves the performance of machine translation of under-resourced languages. We discuss different types of machine translation and demonstrate a recent trend that seeks to link orthographic information with well-established machine translation methods. 
Considerable attention is given to current efforts using cognate information at different levels of machine translation and the lessons that can be drawn from this. Additionally, multilingual neural machine translation of closely related languages is given a particular focus in this survey. This article ends with a discussion of the way forward in machine translation with orthographic information, focusing on multilingual settings and bilingual lexicon induction.}\n}\n\n\n
\n
\n\n\n
\n Machine translation is one of the applications of natural language processing which has been explored in different languages. Recently researchers started paying attention towards machine translation for resource-poor languages and closely related languages. A widespread and underlying problem for these machine translation systems is the linguistic difference and variation in orthographic conventions which causes many issues to traditional approaches. Two languages written in two different orthographies are not easily comparable but orthographic information can also be used to improve the machine translation system. This article offers a survey of research regarding orthography's influence on machine translation of under-resourced languages. It introduces under-resourced languages in terms of machine translation and how orthographic information can be utilised to improve machine translation. We describe previous work in this area, discussing what underlying assumptions were made, and showing how orthographic knowledge improves the performance of machine translation of under-resourced languages. We discuss different types of machine translation and demonstrate a recent trend that seeks to link orthographic information with well-established machine translation methods. Considerable attention is given to current efforts using cognate information at different levels of machine translation and the lessons that can be drawn from this. Additionally, multilingual neural machine translation of closely related languages is given a particular focus in this survey. This article ends with a discussion of the way forward in machine translation with orthographic information, focusing on multilingual settings and bilingual lexicon induction.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Findings of the Shared Task on Machine Translation in Dravidian languages.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Priyadharshini, R., Banerjee, S., Saldanha, R., McCrae, J. P., M, A. K., Krishnamurthy, P., & Johnson, M.\n\n\n \n\n\n\n In Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages, pages 119–125, Kyiv, April 2021. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"FindingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-etal-2021-findings,\n  title        = {Findings of the Shared Task on Machine Translation in {Dravidian} languages},\n  author       = {Chakravarthi, Bharathi Raja and Priyadharshini, Ruba  and Banerjee, Shubhanker and Saldanha, Richard and McCrae, John P. and M, Anand Kumar and Krishnamurthy, Parameswari and Johnson, Melvin},\n  year         = 2021,\n  month        = apr,\n  booktitle    = {Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages},\n  publisher    = {Association for Computational Linguistics},\n  address      = {Kyiv},\n  pages        = {119--125},\n  url          = {https://www.aclweb.org/anthology/2021.dravidianlangtech-1.15},\n  abstract     = {This paper presents an overview of the shared task on machine translation of Dravidian languages. We presented the shared task results at the EACL 2021 workshop on Speech and Language Technologies for Dravidian Languages. This paper describes the datasets used, the methodology used for the evaluation of participants, and the experiments{'} overall results. As a part of this shared task, we organized four sub-tasks corresponding to machine translation of the following language pairs: English to Tamil, English to Malayalam, English to Telugu and Tamil to Telugu which are available at https://competitions.codalab.org/competitions/27650. We provided the participants with training and development datasets to perform experiments, and the results were evaluated on unseen test data. In total, 46 research groups participated in the shared task and 7 experimental runs were submitted for evaluation. We used BLEU scores for assessment of the translations.}\n}\n\n\n
\n
\n\n\n
\n This paper presents an overview of the shared task on machine translation of Dravidian languages. We presented the shared task results at the EACL 2021 workshop on Speech and Language Technologies for Dravidian Languages. This paper describes the datasets used, the methodology used for the evaluation of participants, and the experiments' overall results. As a part of this shared task, we organized four sub-tasks corresponding to machine translation of the following language pairs: English to Tamil, English to Malayalam, English to Telugu and Tamil to Telugu which are available at https://competitions.codalab.org/competitions/27650. We provided the participants with training and development datasets to perform experiments, and the results were evaluated on unseen test data. In total, 46 research groups participated in the shared task and 7 experimental runs were submitted for evaluation. We used BLEU scores for assessment of the translations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Findings of the Shared Task on Troll Meme Classification in Tamil.\n \n \n \n \n\n\n \n Suryawanshi, S., & Chakravarthi, B. R.\n\n\n \n\n\n\n In Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages, pages 126–132, Kyiv, April 2021. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"FindingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{suryawanshi-chakravarthi-2021-findings,\n  title        = {Findings of the Shared Task on Troll Meme Classification in {Tamil}},\n  author       = {Suryawanshi, Shardul and Chakravarthi, Bharathi Raja},\n  year         = 2021,\n  month        = apr,\n  booktitle    = {Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages},\n  publisher    = {Association for Computational Linguistics},\n  address      = {Kyiv},\n  pages        = {126--132},\n  url          = {https://www.aclweb.org/anthology/2021.dravidianlangtech-1.16},\n  abstract     = {The internet has facilitated its user-base with a platform to communicate and express their views without any censorship. On the other hand, this freedom of expression or free speech can be abused by its user or a troll to demean an individual or a group. Demeaning people based on their gender, sexual orientation, religious believes or any other characteristics {--}trolling{--} could cause great distress in the online community. Hence, the content posted by a troll needs to be identified and dealt with before causing any more damage. Amongst all the forms of troll content, memes are most prevalent due to their popularity and ability to propagate across cultures. A troll uses a meme to demean, attack or offend its targetted audience. In this shared task, we provide a resource (TamilMemes) that could be used to train a system capable of identifying a troll meme in the Tamil language. In our TamilMemes dataset, each meme has been categorized into either a {``}troll{''} or a {``}not{\\_}troll{''} class. Along with the meme images, we also provided the Latin transcripted text from memes. We received 10 system submissions from the participants which were evaluated using the weighted average F1-score. The system with the weighted average F1-score of 0.55 secured the first rank.}\n}\n\n\n
\n
\n\n\n
\n The internet has facilitated its user-base with a platform to communicate and express their views without any censorship. On the other hand, this freedom of expression or free speech can be abused by its user or a troll to demean an individual or a group. Demeaning people based on their gender, sexual orientation, religious believes or any other characteristics –trolling– could cause great distress in the online community. Hence, the content posted by a troll needs to be identified and dealt with before causing any more damage. Amongst all the forms of troll content, memes are most prevalent due to their popularity and ability to propagate across cultures. A troll uses a meme to demean, attack or offend its targetted audience. In this shared task, we provide a resource (TamilMemes) that could be used to train a system capable of identifying a troll meme in the Tamil language. In our TamilMemes dataset, each meme has been categorized into either a “troll” or a “not_troll” class. Along with the meme images, we also provided the Latin transcripted text from memes. We received 10 system submissions from the participants which were evaluated using the weighted average F1-score. The system with the weighted average F1-score of 0.55 secured the first rank.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Findings of the Shared Task on Offensive Language Identification in Tamil, Malayalam, and Kannada.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Priyadharshini, R., Jose, N., Kumar M, A., Mandl, T., Kumaresan, P. K., Ponnusamy, R., R L, H., McCrae, J. P., & Sherly, E.\n\n\n \n\n\n\n In Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages, pages 133–145, Kyiv, April 2021. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"FindingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-etal-2021-findings-shared,\n  title        = {Findings of the Shared Task on Offensive Language Identification in {Tamil}, {Malayalam}, and {Kannada}},\n  author       = {Chakravarthi, Bharathi Raja and Priyadharshini, Ruba and Jose, Navya and Kumar M, Anand and Mandl, Thomas and Kumaresan, Prasanna Kumar and Ponnusamy, Rahul and R L, Hariharan and McCrae, John P. and Sherly, Elizabeth},\n  year         = 2021,\n  month        = apr,\n  booktitle    = {Proceedings of the First Workshop on Speech and Language Technologies for Dravidian Languages},\n  publisher    = {Association for Computational Linguistics},\n  address      = {Kyiv},\n  pages        = {133--145},\n  url          = {https://www.aclweb.org/anthology/2021.dravidianlangtech-1.17},\n  abstract     = {Detecting offensive language in social media in local languages is critical for moderating user-generated content. Thus, the field of offensive language identification in under-resourced Tamil, Malayalam and Kannada languages are essential. As the user-generated content is more code-mixed and not well studied for under-resourced languages, it is imperative to create resources and conduct benchmarking studies to encourage research in under-resourced Dravidian languages. We created a shared task on offensive language detection in Dravidian languages. We summarize here the dataset for this challenge which are openly available at https://competitions.codalab.org/competitions/27654, and present an overview of the methods and the results of the competing systems.}\n}\n\n\n
\n
\n\n\n
\n Detecting offensive language in social media in local languages is critical for moderating user-generated content. Thus, the field of offensive language identification in under-resourced Tamil, Malayalam and Kannada languages are essential. As the user-generated content is more code-mixed and not well studied for under-resourced languages, it is imperative to create resources and conduct benchmarking studies to encourage research in under-resourced Dravidian languages. We created a shared task on offensive language detection in Dravidian languages. We summarize here the dataset for this challenge which are openly available at https://competitions.codalab.org/competitions/27654, and present an overview of the methods and the results of the competing systems.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2020\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n HopeEDI: A Multilingual Hope Speech Detection Dataset for Equality, Diversity, and Inclusion.\n \n \n \n \n\n\n \n Chakravarthi, B. R.\n\n\n \n\n\n\n In Proceedings of the Third Workshop on Computational Modeling of People's Opinions, Personality, and Emotion's in Social Media, pages 41–53, Barcelona, Spain (Online), December 2020. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"HopeEDI:Paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-2020-hopeedi,\n  title        = {{HopeEDI}: A Multilingual Hope Speech Detection Dataset for Equality, Diversity, and Inclusion},\n  author       = {Chakravarthi, Bharathi Raja},\n  year         = 2020,\n  month        = dec,\n  booktitle    = {Proceedings of the Third Workshop on Computational Modeling of People's Opinions, Personality, and Emotion's in Social Media},\n  publisher    = {Association for Computational Linguistics},\n  address      = {Barcelona, Spain (Online)},\n  pages        = {41--53},\n  url          = {https://www.aclweb.org/anthology/2020.peoples-1.5},\n  abstract     = {Over the past few years, systems have been developed to control online content and eliminate abusive, offensive or hate speech content. However, people in power sometimes misuse this form of censorship to obstruct the democratic right of freedom of speech. Therefore, it is imperative that research should take a positive reinforcement approach towards online content that is encouraging, positive and supportive contents. Until now, most studies have focused on solving this problem of negativity in the English language, though the problem is much more than just harmful content. Furthermore, it is multilingual as well. Thus, we have constructed a Hope Speech dataset for Equality, Diversity and Inclusion (HopeEDI) containing user-generated comments from the social media platform YouTube with 28,451, 20,198 and 10,705 comments in English, Tamil and Malayalam, respectively, manually labelled as containing hope speech or not. To our knowledge, this is the first research of its kind to annotate hope speech for equality, diversity and inclusion in a multilingual setting. We determined that the inter-annotator agreement of our dataset using Krippendorff{'}s alpha. Further, we created several baselines to benchmark the resulting dataset and the results have been expressed using precision, recall and F1-score. 
The dataset is publicly available for the research community. We hope that this resource will spur further research on encouraging inclusive and responsive speech that reinforces positiveness.}\n}\n\n\n
\n
\n\n\n
\n Over the past few years, systems have been developed to control online content and eliminate abusive, offensive or hate speech content. However, people in power sometimes misuse this form of censorship to obstruct the democratic right of freedom of speech. Therefore, it is imperative that research should take a positive reinforcement approach towards online content that is encouraging, positive and supportive contents. Until now, most studies have focused on solving this problem of negativity in the English language, though the problem is much more than just harmful content. Furthermore, it is multilingual as well. Thus, we have constructed a Hope Speech dataset for Equality, Diversity and Inclusion (HopeEDI) containing user-generated comments from the social media platform YouTube with 28,451, 20,198 and 10,705 comments in English, Tamil and Malayalam, respectively, manually labelled as containing hope speech or not. To our knowledge, this is the first research of its kind to annotate hope speech for equality, diversity and inclusion in a multilingual setting. We determined that the inter-annotator agreement of our dataset using Krippendorff's alpha. Further, we created several baselines to benchmark the resulting dataset and the results have been expressed using precision, recall and F1-score. The dataset is publicly available for the research community. We hope that this resource will spur further research on encouraging inclusive and responsive speech that reinforces positiveness.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n KanCMD: Kannada CodeMixed Dataset for Sentiment Analysis and Offensive Language Detection.\n \n \n \n \n\n\n \n Hande, A., Priyadharshini, R., & Chakravarthi, B. R.\n\n\n \n\n\n\n In Proceedings of the Third Workshop on Computational Modeling of People's Opinions, Personality, and Emotion's in Social Media, pages 54–63, Barcelona, Spain (Online), December 2020. Association for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"KanCMD:Paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{hande-etal-2020-kancmd,\n  title        = {{KanCMD}: {Kannada} {CodeMixed} Dataset for Sentiment Analysis and Offensive Language Detection},\n  author       = {Hande, Adeep and Priyadharshini, Ruba and Chakravarthi, Bharathi Raja},\n  year         = 2020,\n  month        = dec,\n  booktitle    = {Proceedings of the Third Workshop on Computational Modeling of People's Opinions, Personality, and Emotion's in Social Media},\n  publisher    = {Association for Computational Linguistics},\n  address      = {Barcelona, Spain (Online)},\n  pages        = {54--63},\n  url          = {https://www.aclweb.org/anthology/2020.peoples-1.6},\n  abstract     = {We introduce Kannada CodeMixed Dataset (KanCMD), a multi-task learning dataset for sentiment analysis and offensive language identification. The KanCMD dataset highlights two real-world issues from the social media text. First, it contains actual comments in code mixed text posted by users on YouTube social media, rather than in monolingual text from the textbook. Second, it has been annotated for two tasks, namely sentiment analysis and offensive language detection for under-resourced Kannada language. Hence, KanCMD is meant to stimulate research in under-resourced Kannada language on real-world code-mixed social media text and multi-task learning. KanCMD was obtained by crawling the YouTube, and a minimum of three annotators annotates each comment. We release KanCMD 7,671 comments for multitask learning research purpose.}\n}\n\n\n
\n
\n\n\n
\n We introduce Kannada CodeMixed Dataset (KanCMD), a multi-task learning dataset for sentiment analysis and offensive language identification. The KanCMD dataset highlights two real-world issues from the social media text. First, it contains actual comments in code mixed text posted by users on YouTube social media, rather than in monolingual text from the textbook. Second, it has been annotated for two tasks, namely sentiment analysis and offensive language detection for under-resourced Kannada language. Hence, KanCMD is meant to stimulate research in under-resourced Kannada language on real-world code-mixed social media text and multi-task learning. KanCMD was obtained by crawling the YouTube, and a minimum of three annotators annotates each comment. We release KanCMD 7,671 comments for multitask learning research purpose.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bilingual Lexicon Induction across Orthographically-distinct Under-Resourced Dravidian Languages.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Rajasekaran, N., Arcan, M., McGuinness, K., E. O'Connor, N., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects, pages 57–69, Barcelona, Spain (Online), December 2020. International Committee on Computational Linguistics (ICCL)\n \n\n\n\n
\n\n\n\n \n \n \"BilingualPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-etal-2020-bilingual,\n  title        = {Bilingual Lexicon Induction across Orthographically-distinct Under-Resourced {Dravidian} Languages},\n  author       = {Chakravarthi, Bharathi Raja and Rajasekaran, Navaneethan and Arcan, Mihael and McGuinness, Kevin and E. O{'}Connor, Noel and McCrae, John P.},\n  year         = 2020,\n  month        = dec,\n  booktitle    = {Proceedings of the 7th Workshop on NLP for Similar Languages, Varieties and Dialects},\n  publisher    = {International Committee on Computational Linguistics (ICCL)},\n  address      = {Barcelona, Spain (Online)},\n  pages        = {57--69},\n  url          = {https://www.aclweb.org/anthology/2020.vardial-1.6},\n  abstract     = {Bilingual lexicons are a vital tool for under-resourced languages and recent state-of-the-art approaches to this leverage pretrained monolingual word embeddings using supervised or semi-supervised approaches. However, these approaches require cross-lingual information such as seed dictionaries to train the model and find a linear transformation between the word embedding spaces. Especially in the case of low-resourced languages, seed dictionaries are not readily available, and as such, these methods produce extremely weak results on these languages. In this work, we focus on the Dravidian languages, namely Tamil, Telugu, Kannada, and Malayalam, which are even more challenging as they are written in unique scripts. To take advantage of orthographic information and cognates in these languages, we bring the related languages into a single script. Previous approaches have used linguistically sub-optimal measures such as the Levenshtein edit distance to detect cognates, whereby we demonstrate that the longest common sub-sequence is linguistically more sound and improves the performance of bilingual lexicon induction. 
We show that our approach can increase the accuracy of bilingual lexicon induction methods on these languages many times, making bilingual lexicon induction approaches feasible for such under-resourced languages.}\n}\n\n\n
\n
\n\n\n
\n Bilingual lexicons are a vital tool for under-resourced languages and recent state-of-the-art approaches to this leverage pretrained monolingual word embeddings using supervised or semi-supervised approaches. However, these approaches require cross-lingual information such as seed dictionaries to train the model and find a linear transformation between the word embedding spaces. Especially in the case of low-resourced languages, seed dictionaries are not readily available, and as such, these methods produce extremely weak results on these languages. In this work, we focus on the Dravidian languages, namely Tamil, Telugu, Kannada, and Malayalam, which are even more challenging as they are written in unique scripts. To take advantage of orthographic information and cognates in these languages, we bring the related languages into a single script. Previous approaches have used linguistically sub-optimal measures such as the Levenshtein edit distance to detect cognates, whereby we demonstrate that the longest common sub-sequence is linguistically more sound and improves the performance of bilingual lexicon induction. We show that our approach can increase the accuracy of bilingual lexicon induction methods on these languages many times, making bilingual lexicon induction approaches feasible for such under-resourced languages.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Unsupervised Deep Language and Dialect Identification for Short Texts.\n \n \n \n \n\n\n \n Goswami, K., Sarkar, R., Chakravarthi, B. R., Fransen, T., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 28th International Conference on Computational Linguistics, pages 1606–1617, Barcelona, Spain (Online), December 2020. International Committee on Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"UnsupervisedPaper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{goswami-etal-2020-unsupervised,\n  title        = {Unsupervised Deep Language and Dialect Identification for Short Texts},\n  author       = {Goswami, Koustava and Sarkar, Rajdeep and Chakravarthi, Bharathi Raja and Fransen, Theodorus and McCrae, John P.},\n  year         = 2020,\n  month        = dec,\n  booktitle    = {Proceedings of the 28th International Conference on Computational Linguistics},\n  publisher    = {International Committee on Computational Linguistics},\n  address      = {Barcelona, Spain (Online)},\n  pages        = {1606--1617},\n  doi          = {10.18653/v1/2020.coling-main.141},\n  url          = {https://www.aclweb.org/anthology/2020.coling-main.141},\n  abstract     = {Automatic Language Identification (LI) or Dialect Identification (DI) of short texts of closely related languages or dialects, is one of the primary steps in many natural language processing pipelines. Language identification is considered a solved task in many cases; however, in the case of very closely related languages, or in an unsupervised scenario (where the languages are not known in advance), performance is still poor. In this paper, we propose the Unsupervised Deep Language and Dialect Identification (UDLDI)method, whichcan simultaneously learn sentence embeddings and cluster assignments from short texts. The UDLDI modelunderstands the sentence constructions of languages by applying attention to character relations which helps to optimize the clustering of languages. We have performed our experiments on three short-text datasets for different language families, each consisting of closely related languages or dialects, with very minimal training sets. Our experimental evaluations on these datasets have shown significant improvement over state-of-the-artunsupervised methods and our model has outperformed state-of-the-art LI and DI systems in supervised settings.}\n}\n\n\n
\n
\n\n\n
\n Automatic Language Identification (LI) or Dialect Identification (DI) of short texts of closely related languages or dialects, is one of the primary steps in many natural language processing pipelines. Language identification is considered a solved task in many cases; however, in the case of very closely related languages, or in an unsupervised scenario (where the languages are not known in advance), performance is still poor. In this paper, we propose the Unsupervised Deep Language and Dialect Identification (UDLDI) method, which can simultaneously learn sentence embeddings and cluster assignments from short texts. The UDLDI model understands the sentence constructions of languages by applying attention to character relations which helps to optimize the clustering of languages. We have performed our experiments on three short-text datasets for different language families, each consisting of closely related languages or dialects, with very minimal training sets. Our experimental evaluations on these datasets have shown significant improvement over state-of-the-art unsupervised methods and our model has outperformed state-of-the-art LI and DI systems in supervised settings.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Comparative Study of Different State-of-the-Art Hate Speech Detection Methods in Hindi-English Code-Mixed Data.\n \n \n \n \n\n\n \n Rani, P., Suryawanshi, S., Goswami, K., Chakravarthi, B. R., Fransen, T., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying, pages 42–48, Marseille, France, May 2020. European Language Resources Association (ELRA)\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{rani-etal-2020-comparative,\n  title        = {A Comparative Study of Different State-of-the-Art Hate Speech Detection Methods in {Hindi}-{English} Code-Mixed Data},\n  author       = {Rani, Priya and Suryawanshi, Shardul and Goswami, Koustava and Chakravarthi, Bharathi Raja and Fransen, Theodorus and McCrae, John Philip},\n  year         = 2020,\n  month        = may,\n  booktitle    = {Proceedings of the Second Workshop on Trolling, Aggression and Cyberbullying},\n  publisher    = {European Language Resources Association (ELRA)},\n  address      = {Marseille, France},\n  pages        = {42--48},\n  isbn         = {979-10-95546-56-6},\n  url          = {https://www.aclweb.org/anthology/2020.trac-1.7},\n  abstract     = {Hate speech detection in social media communication has become one of the primary concerns to avoid conflicts and curb undesired activities. In an environment where multilingual speakers switch among multiple languages, hate speech detection becomes a challenging task using methods that are designed for monolingual corpora. In our work, we attempt to analyze, detect and provide a comparative study of hate speech in a code-mixed social media text. We also provide a Hindi-English code-mixed data set consisting of Facebook and Twitter posts and comments. Our experiments show that deep learning models trained on this code-mixed corpus perform better.},\n  language     = {English},\n  keywords     = {Hate Speech, Code mixing, Convolutional Neural Networks}\n}\n\n\n
\n
\n\n\n
\n Hate speech detection in social media communication has become one of the primary concerns to avoid conflicts and curb undesired activities. In an environment where multilingual speakers switch among multiple languages, hate speech detection becomes a challenging task using methods that are designed for monolingual corpora. In our work, we attempt to analyze, detect and provide a comparative study of hate speech in a code-mixed social media text. We also provide a Hindi-English code-mixed data set consisting of Facebook and Twitter posts and comments. Our experiments show that deep learning models trained on this code-mixed corpus perform better.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n ULD@NUIG at SemEval-2020 Task 9: Generative Morphemes with an Attention Model for Sentiment Analysis in Code-Mixed Text.\n \n \n \n \n\n\n \n Goswami, K., Rani, P., Chakravarthi, B. R., Fransen, T., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the Fourteenth Workshop on Semantic Evaluation, pages 968–974, Barcelona (online), December 2020. International Committee for Computational Linguistics\n \n\n\n\n
\n\n\n\n \n \n \"ULD@NUIGPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{goswami-etal-2020-uld,\n  title        = {{ULD}@{NUIG} at {SemEval}-2020 Task 9: Generative Morphemes with an Attention Model for Sentiment Analysis in Code-Mixed Text},\n  author       = {Goswami, Koustava and Rani, Priya and Chakravarthi, Bharathi Raja and Fransen, Theodorus and McCrae, John P.},\n  year         = 2020,\n  month        = dec,\n  booktitle    = {Proceedings of the Fourteenth Workshop on Semantic Evaluation},\n  publisher    = {International Committee for Computational Linguistics},\n  address      = {Barcelona (online)},\n  pages        = {968--974},\n  url          = {https://www.aclweb.org/anthology/2020.semeval-1.125},\n  abstract     = {Code mixing is a common phenomena in multilingual societies where people switch from one language to another for various reasons. Recent advances in public communication over different social media sites have led to an increase in the frequency of code-mixed usage in written language. In this paper, we present the Generative Morphemes with Attention (GenMA) Model sentiment analysis system contributed to SemEval 2020 Task 9 SentiMix. The system aims to predict the sentiments of the given English-Hindi code-mixed tweets without using word-level language tags instead inferring this automatically using a morphological model. The system is based on a novel deep neural network (DNN) architecture, which has outperformed the baseline F1-score on the test data-set as well as the validation data-set. Our results can be found under the user name {``}koustava{''} on the {``}Sentimix Hindi English{''} page.}\n}\n\n\n
\n
\n\n\n
\n Code mixing is a common phenomena in multilingual societies where people switch from one language to another for various reasons. Recent advances in public communication over different social media sites have led to an increase in the frequency of code-mixed usage in written language. In this paper, we present the Generative Morphemes with Attention (GenMA) Model sentiment analysis system contributed to SemEval 2020 Task 9 SentiMix. The system aims to predict the sentiments of the given English-Hindi code-mixed tweets without using word-level language tags instead inferring this automatically using a morphological model. The system is based on a novel deep neural network (DNN) architecture, which has outperformed the baseline F1-score on the test data-set as well as the validation data-set. Our results can be found under the user name “koustava” on the “Sentimix Hindi English” page.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Rekenen met taal: computationele taalkunde en historisch Iers.\n \n \n \n \n\n\n \n Fransen, T.\n\n\n \n\n\n\n Kelten: Mededelingen van de Stichting A. G. van Hamel voor Keltische Studies, 84. 2020.\n \n\n\n\n
\n\n\n\n \n \n \"RekenenPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{fransen-2020-rekenen,\n  title        = {Rekenen met taal: computationele taalkunde en historisch {Iers}},\n  author       = {Theodorus Fransen},\n  year         = 2020,\n  journal      = {Kelten: Mededelingen van de Stichting A. G. van Hamel voor Keltische Studies},\n  volume       = 84,\n  url          = {https://kelten.vanhamel.nl/k84-2020-fransen-oudiers-computationele-taalkunde-morfologie-deep-learning},\n  date         = {2020-08-17},\n  abstract     = {Language and mathematics meet in the dynamic field of computational linguistics. Using the latest deep learning techniques and transferring results from better resourced languages, Theodorus Fransen of the Cardamom project in Galway hopes to develop an algorithm capable of analysing the various forms of the Irish verb across time and space, adding to the linguistic tools available to both students and scholars of historical Irish.}\n}\n\n\n
\n
\n\n\n
\n Language and mathematics meet in the dynamic field of computational linguistics. Using the latest deep learning techniques and transferring results from better resourced languages, Theodorus Fransen of the Cardamom project in Galway hopes to develop an algorithm capable of analysing the various forms of the Irish verb across time and space, adding to the linguistic tools available to both students and scholars of historical Irish.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Corpus Creation for Sentiment Analysis in Code-Mixed Tamil-English Text.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Muralidaran, V., Priyadharshini, R., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL), pages 202–210, Marseille, France, May 2020. European Language Resources association\n \n\n\n\n
\n\n\n\n \n \n \"CorpusPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-etal-2020-corpus,\n  title        = {Corpus Creation for Sentiment Analysis in Code-Mixed {Tamil}-{English} Text},\n  author       = {Chakravarthi, Bharathi Raja and Muralidaran, Vigneshwaran and Priyadharshini, Ruba and McCrae, John Philip},\n  year         = 2020,\n  month        = may,\n  booktitle    = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n  publisher    = {European Language Resources association},\n  address      = {Marseille, France},\n  pages        = {202--210},\n  isbn         = {979-10-95546-35-1},\n  url          = {https://www.aclweb.org/anthology/2020.sltu-1.28},\n  abstract     = {Understanding the sentiment of a comment from a video or an image is an essential task in many applications. Sentiment analysis of a text can be useful for various decision-making processes. One such application is to analyse the popular sentiments of videos on social media based on viewer comments. However, comments from social media do not follow strict rules of grammar, and they contain mixing of more than one language, often written in non-native scripts. Non-availability of annotated code-mixed data for a low-resourced language like Tamil also adds difficulty to this problem. To overcome this, we created a gold standard Tamil-English code-switched, sentiment-annotated corpus containing 15,744 comment posts from YouTube. In this paper, we describe the process of creating the corpus and assigning polarities. We present inter-annotator agreement and show the results of sentiment analysis trained on this corpus as a benchmark.},\n  language     = {English}\n}\n\n\n
\n
\n\n\n
\n Understanding the sentiment of a comment from a video or an image is an essential task in many applications. Sentiment analysis of a text can be useful for various decision-making processes. One such application is to analyse the popular sentiments of videos on social media based on viewer comments. However, comments from social media do not follow strict rules of grammar, and they contain mixing of more than one language, often written in non-native scripts. Non-availability of annotated code-mixed data for a low-resourced language like Tamil also adds difficulty to this problem. To overcome this, we created a gold standard Tamil-English code-switched, sentiment-annotated corpus containing 15,744 comment posts from YouTube. In this paper, we describe the process of creating the corpus and assigning polarities. We present inter-annotator agreement and show the results of sentiment analysis trained on this corpus as a benchmark.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Leveraging orthographic information to improve machine translation of under-resourced languages.\n \n \n \n\n\n \n Chakravarthi, B. R.\n\n\n \n\n\n\n Ph.D. Thesis, National University of Ireland Galway, 2020.\n \n\n\n\n
\n\n\n\n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@phdthesis{chakravarthi-2020-leveraging,\n  title        = {Leveraging orthographic information to improve machine translation of under-resourced languages},\n  author       = {Bharathi Raja Chakravarthi},\n  year         = 2020,\n  doi          = {http://hdl.handle.net/10379/16100},\n  date         = {2020-07-27},\n  school       = {National University of Ireland Galway},\n  abstract     = {This thesis describes our improvement of word sense translation for under-resourced languages utilizing orthographic information with a particular focus on creating resources using machine translation. The first target of this thesis is cleaning the noisy corpus in the form of code-mixed content at word-level based on orthographic information to improve machine translation quality. Our results indicate that the proposed removing of code-mixed text based on orthography results in improvement for Dravidian languages. We then turn our interest to the usage of training data from closely-related languages. While languages within the same language family share many properties, many under-resourced languages are written in their own native script, which makes taking advantage of these language similarities difficult. We propose to alleviate the problem of different scripts by transcribing the native script into a common representation such as the Latin script or the International Phonetic Alphabet (IPA). We also show that our method could aid the creation or improvement of wordnets for under-resourced languages using machine translation. Further, we investigate bilingual lexicon induction using pre-trained monolingual word embeddings and orthographic information. We use existing resources such as IndoWordNet entries as a seed dictionary and test set for the under-resourced Dravidian languages. 
To take advantage of orthographic information, we propose to bring the related languages into a single script before creating word embeddings, and use the longest common subsequence to take advantage of cognate information. Our methods for under-resourced word sense translation of Dravidian languages outperformed state-of-the art systems in terms of both automatic and manual evaluation.},\n  tppubtype    = {phdthesis}\n}\n\n\n
\n
\n\n\n
\n This thesis describes our improvement of word sense translation for under-resourced languages utilizing orthographic information with a particular focus on creating resources using machine translation. The first target of this thesis is cleaning the noisy corpus in the form of code-mixed content at word-level based on orthographic information to improve machine translation quality. Our results indicate that the proposed removing of code-mixed text based on orthography results in improvement for Dravidian languages. We then turn our interest to the usage of training data from closely-related languages. While languages within the same language family share many properties, many under-resourced languages are written in their own native script, which makes taking advantage of these language similarities difficult. We propose to alleviate the problem of different scripts by transcribing the native script into a common representation such as the Latin script or the International Phonetic Alphabet (IPA). We also show that our method could aid the creation or improvement of wordnets for under-resourced languages using machine translation. Further, we investigate bilingual lexicon induction using pre-trained monolingual word embeddings and orthographic information. We use existing resources such as IndoWordNet entries as a seed dictionary and test set for the under-resourced Dravidian languages. To take advantage of orthographic information, we propose to bring the related languages into a single script before creating word embeddings, and use the longest common subsequence to take advantage of cognate information. Our methods for under-resourced word sense translation of Dravidian languages outperformed state-of-the art systems in terms of both automatic and manual evaluation.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Sentiment Analysis Dataset for Code-Mixed Malayalam-English.\n \n \n \n \n\n\n \n Chakravarthi, B. R., Jose, N., Suryawanshi, S., Sherly, E., & McCrae, J. P.\n\n\n \n\n\n\n In Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL), pages 177–184, Marseille, France, May 2020. European Language Resources association\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{chakravarthi-etal-2020-sentiment,\n  title        = {A Sentiment Analysis Dataset for Code-Mixed {Malayalam}-{English}},\n  author       = {Chakravarthi, Bharathi Raja and Jose, Navya and Suryawanshi, Shardul and Sherly, Elizabeth and McCrae, John Philip},\n  year         = 2020,\n  month        = may,\n  booktitle    = {Proceedings of the 1st Joint Workshop on Spoken Language Technologies for Under-resourced languages (SLTU) and Collaboration and Computing for Under-Resourced Languages (CCURL)},\n  publisher    = {European Language Resources association},\n  address      = {Marseille, France},\n  pages        = {177--184},\n  isbn         = {979-10-95546-35-1},\n  url          = {https://www.aclweb.org/anthology/2020.sltu-1.25},\n  abstract     = {There is an increasing demand for sentiment analysis of text from social media which are mostly code-mixed. Systems trained on monolingual data fail for code-mixed data due to the complexity of mixing at different levels of the text. However, very few resources are available for code-mixed data to create models specific for this data. Although much research in multilingual and cross-lingual sentiment analysis has used semi-supervised or unsupervised methods, supervised methods still performs better. Only a few datasets for popular languages such as English-Spanish, English-Hindi, and English-Chinese are available. There are no resources available for Malayalam-English code-mixed data. This paper presents a new gold standard corpus for sentiment analysis of code-mixed text in Malayalam-English annotated by voluntary annotators. This gold standard corpus obtained a Krippendorff{'}s alpha above 0.8 for the dataset. We use this new corpus to provide the benchmark for sentiment analysis in Malayalam-English code-mixed texts.},\n  language     = {English}\n}\n\n\n
\n
\n\n\n
\n There is an increasing demand for sentiment analysis of text from social media which are mostly code-mixed. Systems trained on monolingual data fail for code-mixed data due to the complexity of mixing at different levels of the text. However, very few resources are available for code-mixed data to create models specific for this data. Although much research in multilingual and cross-lingual sentiment analysis has used semi-supervised or unsupervised methods, supervised methods still performs better. Only a few datasets for popular languages such as English-Spanish, English-Hindi, and English-Chinese are available. There are no resources available for Malayalam-English code-mixed data. This paper presents a new gold standard corpus for sentiment analysis of code-mixed text in Malayalam-English annotated by voluntary annotators. This gold standard corpus obtained a Krippendorff's alpha above 0.8 for the dataset. We use this new corpus to provide the benchmark for sentiment analysis in Malayalam-English code-mixed texts.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Some Issues with Building a Multilingual Wordnet.\n \n \n \n \n\n\n \n Bond, F., Morgado da Costa, L., Goodman, M. W., McCrae, J. P., & Lohk, A.\n\n\n \n\n\n\n In Proceedings of the 12th Language Resources and Evaluation Conference, pages 3189–3197, Marseille, France, May 2020. European Language Resources Association\n \n\n\n\n
\n\n\n\n \n \n \"SomePaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{bond-etal-2020-issues,\n  title        = {Some Issues with Building a Multilingual {Wordnet}},\n  author       = {Bond, Francis and Morgado da Costa, Luis and Goodman, Michael Wayne and McCrae, John Philip and Lohk, Ahti},\n  year         = 2020,\n  month        = may,\n  booktitle    = {Proceedings of the 12th Language Resources and Evaluation Conference},\n  publisher    = {European Language Resources Association},\n  address      = {Marseille, France},\n  pages        = {3189--3197},\n  isbn         = {979-10-95546-34-4},\n  url          = {https://www.aclweb.org/anthology/2020.lrec-1.390},\n  abstract     = {In this paper we discuss the experience of bringing together over 40 different wordnets. We introduce some extensions to the GWA wordnet LMF format proposed in Vossen et al. (2016) and look at how this new information can be displayed. Notable extensions include: confidence, corpus frequency, orthographic variants, lexicalized and non-lexicalized synsets and lemmas, new parts of speech, and more. Many of these extensions already exist in multiple wordnets {--} the challenge was to find a compatible representation. To this end, we introduce a new version of the Open Multilingual Wordnet (Bond and Foster, 2013), that integrates a new set of tools that tests the extensions introduced by this new format, while also ensuring the integrity of the Collaborative Interlingual Index (CILI: Bond et al., 2016), avoiding the same new concept to be introduced through multiple projects.},\n  language     = {English},\n  keywords     = {multilingual lexicon, wordnet, collaborative development}\n}\n\n\n
\n
\n\n\n
\n In this paper we discuss the experience of bringing together over 40 different wordnets. We introduce some extensions to the GWA wordnet LMF format proposed in Vossen et al. (2016) and look at how this new information can be displayed. Notable extensions include: confidence, corpus frequency, orthographic variants, lexicalized and non-lexicalized synsets and lemmas, new parts of speech, and more. Many of these extensions already exist in multiple wordnets – the challenge was to find a compatible representation. To this end, we introduce a new version of the Open Multilingual Wordnet (Bond and Foster, 2013), that integrates a new set of tools that tests the extensions introduced by this new format, while also ensuring the integrity of the Collaborative Interlingual Index (CILI: Bond et al., 2016), avoiding the same new concept to be introduced through multiple projects.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Cardamom: Comparative Deep Models for Minority and Historical Languages.\n \n \n \n \n\n\n \n McCrae, J. P., & Fransen, T.\n\n\n \n\n\n\n In Proceedings of the 1st International Conference on Language Technologies for All, pages 276–279, Paris, France, December 2019. European Language Resources Association (ELRA)\n \n\n\n\n
\n\n\n\n \n \n \"Cardamom:Paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 17 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{mccrae-fransen-2019-cardamom,\n  title        = {Cardamom: Comparative Deep Models for Minority and Historical Languages},\n  author       = {McCrae, John P. and Fransen, Theodorus},\n  year         = 2019,\n  month        = dec,\n  booktitle    = {Proceedings of the 1st International Conference on Language Technologies for All},\n  publisher    = {European Language Resources Association (ELRA)},\n  address      = {Paris, France},\n  pages        = {276--279},\n  url          = {https://lt4all.elra.info/proceedings/lt4all2019/pdf/2019.lt4all-1.69.pdf},\n  abstract     = {This paper gives an overview of the Cardamom project, which aims to close the resource gap for minority and under-resourced languages by means of deep-learning-based natural language processing (NLP) and exploiting similarities of closely-related languages. The project further extends this idea to historical languages, which can be considered as closely related to their modern form, and as such aims to provide NLP through both space and time for languages that have been ignored by current approaches.}\n}\n\n\n
\n
\n\n\n
\n This paper gives an overview of the Cardamom project, which aims to close the resource gap for minority and under-resourced languages by means of deep-learning-based natural language processing (NLP) and exploiting similarities of closely-related languages. The project further extends this idea to historical languages, which can be considered as closely related to their modern form, and as such aims to provide NLP through both space and time for languages that have been ignored by current approaches.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);