Spoken Language Identification Using Convolutional Neural Network In Nepalese Context. Sapkota, S. S., Shakya, A., & Joshi, B. In Proceedings of the 26th International Conference of the ORIENTAL-COCOSDA (O-COCOSDA 2023), pages 1-6, December, 2023. IEEE. Paper doi abstract bibtex In this work we perform a language identification (LID) task that classifies few regional languages spoken in Nepal. We use a Convolutional Neural Network (CNN) that operates on spectrogram of the provided audio utterances. Datasets for three languages Nepali, Hindi and Sanskrit were downloaded from voxlingua107. Additional two languages Newari and Maithili were extracted from YouTube news channels, since there were no standard datasets available for these languages. Nepal Mandel TV and TV TODAY JANAKPUR channels were used for audio extraction. Classification accuracy for three languages from Voxlingua107 datasets are 95%, 92% and 89% for Sanskrit, Nepali and Hindi respectively. Experiment carried out in YouTube datasets along with voxlingua107 datasets produces accuracy of Sanskrit-74%, Nepali-72%, Hindi-68%, Newari-66% and Maithili-63%. Significant reduction in accuracy in this case is due to the quality of non-standard datasets obtained from YouTube. With suitable preprocessing and filtration performance can be enhanced further. In our experiments we show that our model is capable to classify between few Nepalese regional languages from short speech utterances and can be easily extended to include more group of languages.
@comment{
  Conference paper from O-COCOSDA 2023, published by IEEE.
  Field order: bibliographic data first, then identifiers, keywords and
  abstract, and finally the bookkeeping fields exported by BibSonomy
  (added-at, timestamp, biburl, interhash, intrahash), which citation
  styles ignore.
  Percent signs in the abstract are escaped so the field is safe to typeset.
  The abstract wording follows the published text and is intentionally not
  copy-edited. The keywords value is kept exactly as exported.
}
@inproceedings{sapkota2023spoken,
  author     = {Sapkota, Shiva Sagar and Shakya, Aman and Joshi, Basanta},
  title      = {Spoken Language Identification Using Convolutional Neural Network In {Nepalese} Context},
  booktitle  = {Proceedings of the 26th International Conference of the {ORIENTAL-COCOSDA} ({O-COCOSDA} 2023)},
  eventtitle = {26th International Conference of the {ORIENTAL-COCOSDA} ({O-COCOSDA} 2023)},
  eventdate  = {2023-12},
  venue      = {Delhi, India},
  pages      = {1--6},
  publisher  = {IEEE},
  year       = {2023},
  month      = dec,
  doi        = {10.1109/O-COCOSDA60357.2023.10482929},
  url        = {https://ieeexplore.ieee.org/abstract/document/10482929},
  isbn       = {979-8-3503-4402-8},
  issn       = {2472-7695},
  keywords   = {Spectrogram;ASR Video analysis;LID;CNN;Voxlingua107;Audio demand;TV;Filtration;Databases;Web myown; networks;Task neural on sites;Convolutional},
  abstract   = {In this work we perform a language identification (LID) task that classifies few regional languages spoken in Nepal. We use a Convolutional Neural Network (CNN) that operates on spectrogram of the provided audio utterances. Datasets for three languages Nepali, Hindi and Sanskrit were downloaded from voxlingua107. Additional two languages Newari and Maithili were extracted from YouTube news channels, since there were no standard datasets available for these languages. Nepal Mandel TV and TV TODAY JANAKPUR channels were used for audio extraction. Classification accuracy for three languages from Voxlingua107 datasets are 95\%, 92\% and 89\% for Sanskrit, Nepali and Hindi respectively. Experiment carried out in YouTube datasets along with voxlingua107 datasets produces accuracy of Sanskrit-74\%, Nepali-72\%, Hindi-68\%, Newari-66\% and Maithili-63\%. Significant reduction in accuracy in this case is due to the quality of non-standard datasets obtained from YouTube. With suitable preprocessing and filtration performance can be enhanced further. In our experiments we show, that our model is capable to classify between few Nepalese regional languages from short speech utterances and can be easily extended to include more group of languages.},
  added-at   = {2023-12-16T16:51:56.000+0100},
  timestamp  = {2024-04-07T19:25:56.000+0200},
  biburl     = {https://www.bibsonomy.org/bibtex/21f1966e38e5a652f01d8ae96733c338e/amanshakya},
  interhash  = {4a0aafdee100c99097766037ccf557e5},
  intrahash  = {1f1966e38e5a652f01d8ae96733c338e},
}
Downloads: 0
{"_id":"uGLSuZ359zisAfN9e","bibbaseid":"sapkota-shakya-joshi-spokenlanguageidentificationusingconvolutionalneuralnetworkinnepalesecontext-2023","author_short":["Sapkota, S. S.","Shakya, A.","Joshi, B."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"In this work we perform a language identification (LID) task that classifies few regional languages spoken in Nepal. We use a Convolutional Neural Network (CNN) that operates on spectrogram of the provided audio utterances. Datasets for three languages Nepali, Hindi and Sanskrit were downloaded from voxlingua107. Additional two languages Newari and Maithili were extracted from YouTube news channels, since there were no standard datasets available for these languages. Nepal Mandel TV and TV TODAY JANAKPUR channels were used for audio extraction. Classification accuracy for three languages from Voxlingua107 datasets are 95%, 92% and 89% for Sanskrit, Nepali and Hindi respectively. Experiment carried out in YouTube datasets along with voxlingua107 datasets produces accuracy of Sanskrit-74%, Nepali-72%, Hindi-68%, Newari-66% and Maithili-63%. Significant reduction in accuracy in this case is due to the quality of non-standard datasets obtained from YouTube. With suitable preprocessing and filtration performance can be enhanced further. 
In our experiments we show, that our model is capable to classify between few Nepalese regional languages from short speech utterances and can be easily extended to include more group of languages.","added-at":"2023-12-16T16:51:56.000+0100","author":[{"propositions":[],"lastnames":["Sapkota"],"firstnames":["Shiva","Sagar"],"suffixes":[]},{"propositions":[],"lastnames":["Shakya"],"firstnames":["Aman"],"suffixes":[]},{"propositions":[],"lastnames":["Joshi"],"firstnames":["Basanta"],"suffixes":[]}],"biburl":"https://www.bibsonomy.org/bibtex/21f1966e38e5a652f01d8ae96733c338e/amanshakya","booktitle":"Proceedings of the 26th International Conference of the ORIENTAL- COCOSDA (O-COCOSDA 2023)","doi":"10.1109/O-COCOSDA60357.2023.10482929","eventdate":"December, 2023","eventtitle":"26th International Conference of the ORIENTAL- COCOSDA (O-COCOSDA 2023)","interhash":"4a0aafdee100c99097766037ccf557e5","intrahash":"1f1966e38e5a652f01d8ae96733c338e","isbn":"979-8-3503-4402-8","issn":"2472-7695","keywords":"Spectrogram;ASR Video analysis;LID;CNN;Voxlingua107;Audio demand;TV;Filtration;Databases;Web myown; networks;Task neural on sites;Convolutional","month":"December","pages":"1-6","publisher":"IEEE","timestamp":"2024-04-07T19:25:56.000+0200","title":"Spoken Language Identification Using Convolutional Neural Network In Nepalese Context","url":"https://ieeexplore.ieee.org/abstract/document/10482929","venue":"Delhi, India","year":"2023","bibtex":"@inproceedings{sapkota2023spoken,\n abstract = {In this work we perform a language identification (LID) task that classifies few regional languages spoken in Nepal.\r\nWe use a Convolutional Neural Network (CNN) that operates on spectrogram of the provided audio utterances. Datasets for three languages Nepali, Hindi and Sanskrit were downloaded from voxlingua107. Additional two languages Newari and Maithili were extracted from YouTube news channels, since there were no standard datasets available for these languages. 
Nepal Mandel TV and TV TODAY JANAKPUR channels were used for audio extraction. Classification accuracy for three languages from Voxlingua107 datasets are 95%, 92% and 89% for Sanskrit, Nepali and Hindi respectively. Experiment carried out in YouTube datasets along with voxlingua107 datasets produces accuracy of Sanskrit-74%, Nepali-72%, Hindi-68%, Newari-66% and Maithili-63%. Significant reduction in accuracy in this case is due to the quality of non-standard datasets obtained from YouTube. With suitable preprocessing and filtration performance can be enhanced further. In our experiments we show, that our model is capable to classify between few Nepalese regional languages from short speech utterances and can be easily extended to include more group of languages.},\n added-at = {2023-12-16T16:51:56.000+0100},\n author = {Sapkota, Shiva Sagar and Shakya, Aman and Joshi, Basanta},\n biburl = {https://www.bibsonomy.org/bibtex/21f1966e38e5a652f01d8ae96733c338e/amanshakya},\n booktitle = {Proceedings of the 26th International Conference of the ORIENTAL- COCOSDA (O-COCOSDA 2023)},\n doi = {10.1109/O-COCOSDA60357.2023.10482929},\n eventdate = {December, 2023},\n eventtitle = {26th International Conference of the ORIENTAL- COCOSDA (O-COCOSDA 2023)},\n interhash = {4a0aafdee100c99097766037ccf557e5},\n intrahash = {1f1966e38e5a652f01d8ae96733c338e},\n isbn = {979-8-3503-4402-8},\n issn = {2472-7695},\n keywords = {Spectrogram;ASR Video analysis;LID;CNN;Voxlingua107;Audio demand;TV;Filtration;Databases;Web myown; networks;Task neural on sites;Convolutional},\n month = {December},\n pages = {1-6},\n publisher = {IEEE},\n timestamp = {2024-04-07T19:25:56.000+0200},\n title = {Spoken Language Identification Using Convolutional Neural Network In Nepalese Context},\n url = {https://ieeexplore.ieee.org/abstract/document/10482929},\n venue = {Delhi, India},\n year = 2023\n}\n\n","author_short":["Sapkota, S. 
S.","Shakya, A.","Joshi, B."],"key":"sapkota2023spoken","id":"sapkota2023spoken","bibbaseid":"sapkota-shakya-joshi-spokenlanguageidentificationusingconvolutionalneuralnetworkinnepalesecontext-2023","role":"author","urls":{"Paper":"https://ieeexplore.ieee.org/abstract/document/10482929"},"keyword":["Spectrogram;ASR Video analysis;LID;CNN;Voxlingua107;Audio demand;TV;Filtration;Databases;Web myown; networks;Task neural on sites;Convolutional"],"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"https://www.bibsonomy.org/bib/user/amanshakya?items=1000","dataSources":["Z3HkGSt6ayrcPfAbv","jvJNqbNyAwTbrnSFr","MweK3w55kX6Mvg33y"],"keywords":["spectrogram;asr video analysis;lid;cnn;voxlingua107;audio demand;tv;filtration;databases;web myown; networks;task neural on sites;convolutional"],"search_terms":["spoken","language","identification","using","convolutional","neural","network","nepalese","context","sapkota","shakya","joshi"],"title":"Spoken Language Identification Using Convolutional Neural Network In Nepalese Context","year":2023}