PlantLncBoost: key features for plant lncRNA identification and significant improvement in accuracy and generalization. Tian, X., Nie, S., Domingues, D., Rossi Paschoal, A., Jiang, L., & Mao, J. New Phytologist. eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/nph.70211
Paper doi abstract bibtex Long noncoding RNAs (lncRNAs) are critical regulators of numerous biological processes in plants. Nevertheless, their identification is challenging due to the low sequence conservation across various species. Existing computational methods for lncRNA identification often face difficulties in generalizing across diverse plant species, highlighting the need for more robust and versatile identification models. Here, we present PlantLncBoost, a novel computational tool designed to improve the generalization in plant lncRNA identification. By integrating advanced gradient boosting algorithms with comprehensive feature selection, our approach achieves both high accuracy and generalizability. We conducted an extensive analysis of 1662 features and identified three key features – ORF coverage, complex Fourier average, and atomic Fourier amplitude – that effectively distinguish lncRNAs from mRNAs. We assessed the performance of PlantLncBoost using comprehensive datasets from 20 plant species. The model exhibited exceptional performance, with an accuracy of 96.63%, a sensitivity of 98.42%, and a specificity of 94.93%, significantly outperforming existing tools. Further analysis revealed that the features we selected effectively capture the differences between lncRNAs and mRNAs across a variety of plant species. PlantLncBoost represents a significant advancement in plant lncRNA identification. It is freely accessible on GitHub (https://github.com/xuechantian/PlantLncBoost) and has been integrated into a comprehensive analysis pipeline, Plant-LncRNA-pipeline v.2 (https://github.com/xuechantian/Plant-LncRNA-pipeline-v2).
% New Phytologist article introducing PlantLncBoost (DOI 10.1111/nph.70211).
% Review notes:
%   - The citation key is left unchanged so existing citations still resolve.
%   - The export carried placeholder "n/a" values for volume and number; they are
%     omitted here so styles do not print them. Presumably the article had not
%     been assigned to an issue at export time -- add the real values once known.
%   - year is taken from the copyright statement (2025); confirm against the
%     publisher record once the final issue is out.
%   - The PDF link that the export placed in note (prefixed "_eprint:") is kept
%     in the non-standard field pdfurl, which styles ignore, so it is no longer
%     printed as part of the reference.
@article{tian_plantlncboost_nodate,
  author     = {Tian, Xue-Chan and Nie, Shuai and Domingues, Douglas and Rossi Paschoal, Alexandre and Jiang, Li-Bo and Mao, Jian-Feng},
  title      = {{PlantLncBoost}: key features for plant {lncRNA} identification and significant improvement in accuracy and generalization},
  shorttitle = {{PlantLncBoost}},
  journal    = {New Phytologist},
  year       = {2025},
  issn       = {1469-8137},
  doi        = {10.1111/nph.70211},
  url        = {https://onlinelibrary.wiley.com/doi/abs/10.1111/nph.70211},
  urldate    = {2025-05-30},
  pdfurl     = {https://onlinelibrary.wiley.com/doi/pdf/10.1111/nph.70211},
  language   = {en},
  copyright  = {© 2025 The Author(s). New Phytologist © 2025 New Phytologist Foundation.},
  keywords   = {Fourier transform, ORF coverage, feature selection, gradient boosting algorithms, long noncoding RNAs (lncRNAs), model selection},
  abstract   = {Long noncoding RNAs (lncRNAs) are critical regulators of numerous biological processes in plants. Nevertheless, their identification is challenging due to the low sequence conservation across various species. Existing computational methods for lncRNA identification often face difficulties in generalizing across diverse plant species, highlighting the need for more robust and versatile identification models. Here, we present PlantLncBoost, a novel computational tool designed to improve the generalization in plant lncRNA identification. By integrating advanced gradient boosting algorithms with comprehensive feature selection, our approach achieves both high accuracy and generalizability. We conducted an extensive analysis of 1662 features and identified three key features – ORF coverage, complex Fourier average, and atomic Fourier amplitude – that effectively distinguish lncRNAs from mRNAs. We assessed the performance of PlantLncBoost using comprehensive datasets from 20 plant species. The model exhibited exceptional performance, with an accuracy of 96.63\%, a sensitivity of 98.42\%, and a specificity of 94.93\%, significantly outperforming existing tools. Further analysis revealed that the features we selected effectively capture the differences between lncRNAs and mRNAs across a variety of plant species. PlantLncBoost represents a significant advancement in plant lncRNA identification. It is freely accessible on GitHub (https://github.com/xuechantian/PlantLncBoost) and has been integrated into a comprehensive analysis pipeline, Plant-LncRNA-pipeline v.2 (https://github.com/xuechantian/Plant-LncRNA-pipeline-v2).},
}
Downloads: 0
{"_id":"xNdfdwZyhkmrvuwos","bibbaseid":"tian-nie-domingues-rossipaschoal-jiang-mao-plantlncboostkeyfeaturesforplantlncrnaidentificationandsignificantimprovementinaccuracyandgeneralization","author_short":["Tian, X.","Nie, S.","Domingues, D.","Rossi Paschoal, A.","Jiang, L.","Mao, J."],"bibdata":{"bibtype":"article","type":"article","title":"PlantLncBoost: key features for plant lncRNA identification and significant improvement in accuracy and generalization","volume":"n/a","copyright":"© 2025 The Author(s). New Phytologist © 2025 New Phytologist Foundation.","issn":"1469-8137","shorttitle":"PlantLncBoost","url":"https://onlinelibrary.wiley.com/doi/abs/10.1111/nph.70211","doi":"10.1111/nph.70211","abstract":"Long noncoding RNAs (lncRNAs) are critical regulators of numerous biological processes in plants. Nevertheless, their identification is challenging due to the low sequence conservation across various species. Existing computational methods for lncRNA identification often face difficulties in generalizing across diverse plant species, highlighting the need for more robust and versatile identification models. Here, we present PlantLncBoost, a novel computational tool designed to improve the generalization in plant lncRNA identification. By integrating advanced gradient boosting algorithms with comprehensive feature selection, our approach achieves both high accuracy and generalizability. We conducted an extensive analysis of 1662 features and identified three key features – ORF coverage, complex Fourier average, and atomic Fourier amplitude – that effectively distinguish lncRNAs from mRNAs. We assessed the performance of PlantLncBoost using comprehensive datasets from 20 plant species. The model exhibited exceptional performance, with an accuracy of 96.63%, a sensitivity of 98.42%, and a specificity of 94.93%, significantly outperforming existing tools. 
Further analysis revealed that the features we selected effectively capture the differences between lncRNAs and mRNAs across a variety of plant species. PlantLncBoost represents a significant advancement in plant lncRNA identification. It is freely accessible on GitHub (https://github.com/xuechantian/PlantLncBoost) and has been integrated into a comprehensive analysis pipeline, Plant-LncRNA-pipeline v.2 (https://github.com/xuechantian/Plant-LncRNA-pipeline-v2).","language":"en","number":"n/a","urldate":"2025-05-30","journal":"New Phytologist","author":[{"propositions":[],"lastnames":["Tian"],"firstnames":["Xue-Chan"],"suffixes":[]},{"propositions":[],"lastnames":["Nie"],"firstnames":["Shuai"],"suffixes":[]},{"propositions":[],"lastnames":["Domingues"],"firstnames":["Douglas"],"suffixes":[]},{"propositions":[],"lastnames":["Rossi","Paschoal"],"firstnames":["Alexandre"],"suffixes":[]},{"propositions":[],"lastnames":["Jiang"],"firstnames":["Li-Bo"],"suffixes":[]},{"propositions":[],"lastnames":["Mao"],"firstnames":["Jian-Feng"],"suffixes":[]}],"note":"_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/nph.70211","keywords":"Fourier transform, ORF coverage, feature selection, gradient boosting algorithms, long noncoding RNAs (lncRNAs), model selection","bibtex":"@article{tian_plantlncboost_nodate,\n\ttitle = {{PlantLncBoost}: key features for plant {lncRNA} identification and significant improvement in accuracy and generalization},\n\tvolume = {n/a},\n\tcopyright = {© 2025 The Author(s). New Phytologist © 2025 New Phytologist Foundation.},\n\tissn = {1469-8137},\n\tshorttitle = {{PlantLncBoost}},\n\turl = {https://onlinelibrary.wiley.com/doi/abs/10.1111/nph.70211},\n\tdoi = {10.1111/nph.70211},\n\tabstract = {Long noncoding RNAs (lncRNAs) are critical regulators of numerous biological processes in plants. Nevertheless, their identification is challenging due to the low sequence conservation across various species. 
Existing computational methods for lncRNA identification often face difficulties in generalizing across diverse plant species, highlighting the need for more robust and versatile identification models. Here, we present PlantLncBoost, a novel computational tool designed to improve the generalization in plant lncRNA identification. By integrating advanced gradient boosting algorithms with comprehensive feature selection, our approach achieves both high accuracy and generalizability. We conducted an extensive analysis of 1662 features and identified three key features – ORF coverage, complex Fourier average, and atomic Fourier amplitude – that effectively distinguish lncRNAs from mRNAs. We assessed the performance of PlantLncBoost using comprehensive datasets from 20 plant species. The model exhibited exceptional performance, with an accuracy of 96.63\\%, a sensitivity of 98.42\\%, and a specificity of 94.93\\%, significantly outperforming existing tools. Further analysis revealed that the features we selected effectively capture the differences between lncRNAs and mRNAs across a variety of plant species. PlantLncBoost represents a significant advancement in plant lncRNA identification. 
It is freely accessible on GitHub (https://github.com/xuechantian/PlantLncBoost) and has been integrated into a comprehensive analysis pipeline, Plant-LncRNA-pipeline v.2 (https://github.com/xuechantian/Plant-LncRNA-pipeline-v2).},\n\tlanguage = {en},\n\tnumber = {n/a},\n\turldate = {2025-05-30},\n\tjournal = {New Phytologist},\n\tauthor = {Tian, Xue-Chan and Nie, Shuai and Domingues, Douglas and Rossi Paschoal, Alexandre and Jiang, Li-Bo and Mao, Jian-Feng},\n\tnote = {\\_eprint: https://onlinelibrary.wiley.com/doi/pdf/10.1111/nph.70211},\n\tkeywords = {Fourier transform, ORF coverage, feature selection, gradient boosting algorithms, long noncoding RNAs (lncRNAs), model selection},\n}\n\n\n\n","author_short":["Tian, X.","Nie, S.","Domingues, D.","Rossi Paschoal, A.","Jiang, L.","Mao, J."],"key":"tian_plantlncboost_nodate","id":"tian_plantlncboost_nodate","bibbaseid":"tian-nie-domingues-rossipaschoal-jiang-mao-plantlncboostkeyfeaturesforplantlncrnaidentificationandsignificantimprovementinaccuracyandgeneralization","role":"author","urls":{"Paper":"https://onlinelibrary.wiley.com/doi/abs/10.1111/nph.70211"},"keyword":["Fourier transform","ORF coverage","feature selection","gradient boosting algorithms","long noncoding RNAs (lncRNAs)","model selection"],"metadata":{"authorlinks":{}}},"bibtype":"article","biburl":"https://bibbase.org/zotero/upscpub","dataSources":["9cGcv2t8pRzC92kzs"],"keywords":["fourier transform","orf coverage","feature selection","gradient boosting algorithms","long noncoding rnas (lncrnas)","model selection"],"search_terms":["plantlncboost","key","features","plant","lncrna","identification","significant","improvement","accuracy","generalization","tian","nie","domingues","rossi paschoal","jiang","mao"],"title":"PlantLncBoost: key features for plant lncRNA identification and significant improvement in accuracy and generalization","year":null}