High-performing feature selection for text classification

High-performing feature selection for text classification. Rogati, M. & Yang, Y. In International Conference on Information and Knowledge Management, Proceedings, pages 659–661, 2002.
abstract bibtex

This paper reports a controlled study on a large number of filter feature selection methods for text classification. Over 100 variants of five major feature selection criteria were examined using four well-known classification algorithms: a Naive Bayesian (NB) approach, a Rocchio-style classifier, a k-nearest neighbor (kNN) method and a Support Vector Machine (SVM) system. Two benchmark collections were chosen as the testbeds: Reuters-21578 and a small portion of Reuters Corpus Version 1 (RCV1), making the new results comparable to published results. We found that feature selection methods based on χ² statistics consistently outperformed those based on other criteria (including information gain) for all four classifiers and both data collections, and that a further increase in performance was obtained by combining uncorrelated and high-performing feature selection methods. The results we obtained using only 3% of the available features are among the best reported, including results obtained with the full feature set.

@comment{Rogati2002: abstract cleaned of exporter artefacts (line breaks had been written as literal backslash-n sequences); duplicate keyword removed.}
@inproceedings{Rogati2002,
	author = {Rogati, Monica and Yang, Yiming},
	title = {High-performing feature selection for text classification},
	booktitle = {International {Conference} on {Information} and {Knowledge} {Management}, {Proceedings}},
	year = {2002},
	pages = {659--661},
	isbn = {1-58113-492-4},
	keywords = {Feature selection, Text classification},
	abstract = {This paper reports a controlled study on a large number of filter feature selection methods for text classification. Over 100 variants of five major feature selection criteria were examined using four well-known classification algorithms: a Naive Bayesian (NB) approach, a Rocchio-style classifier, a k-nearest neighbor (kNN) method and a Support Vector Machine (SVM) system. Two benchmark collections were chosen as the testbeds: Reuters-21578 and a small portion of Reuters Corpus Version 1 (RCV1), making the new results comparable to published results. We found that feature selection methods based on $\chi^2$ statistics consistently outperformed those based on other criteria (including information gain) for all four classifiers and both data collections, and that a further increase in performance was obtained by combining uncorrelated and high-performing feature selection methods. The results we obtained using only 3\% of the available features are among the best reported, including results obtained with the full feature set.},
}

Downloads: 0

{"_id":"NGL6c8tfS7iueTzc5","bibbaseid":"rogati-yang-highperformingfeatureselectionfortextclassification-2002","author_short":["Rogati, M.","Yang, Y."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","title":"High-performing feature selection for text classification","isbn":"1-58113-492-4","abstract":"This paper reports a controlled study on a large number of filter\\nfeature selection methods for text classification. Over 100 variants\\nof five major feature selection criteria were examined using four\\nwell-known classification algorithms: a Naive Bayesian (NB) approach,\\na Rocchio-style classifier, a k-nearest neighbor (kNN) method and\\na Support Vector Machine (SVM) system. Two benchmark collections\\nwere chosen as the testbeds: Reuters-21578 and small portion of Reuters\\nCorpus Version 1 (RCV1), making the new results comparable to published\\nresults. We found that feature selection methods based on chi2 statistics\\nconsistently outperformed those based on other criteria (including\\ninformation gain) for all four classifiers and both data collections,\\nand that a further increase in performance was obtained by combining\\nuncorrelated and high-performing feature selection methods.The results\\nwe obtained using only 3% of the available features are among the\\nbest reported, including results obtained with the full feature set.","booktitle":"International Conference on Information and Knowledge Management, Proceedings","author":[{"propositions":[],"lastnames":["Rogati"],"firstnames":["Monica"],"suffixes":[]},{"propositions":[],"lastnames":["Yang"],"firstnames":["Yiming"],"suffixes":[]}],"year":"2002","keywords":"Feature selection, Text classification, feature selection","pages":"659–661","bibtex":"@inproceedings{Rogati2002,\n\ttitle = {High-performing feature selection for text classification},\n\tisbn = {1-58113-492-4},\n\tabstract = {This paper reports a controlled study on a large number of filter{\\textbackslash}nfeature selection 
methods for text classification. Over 100 variants{\\textbackslash}nof five major feature selection criteria were examined using four{\\textbackslash}nwell-known classification algorithms: a Naive Bayesian (NB) approach,{\\textbackslash}na Rocchio-style classifier, a k-nearest neighbor (kNN) method and{\\textbackslash}na Support Vector Machine (SVM) system. Two benchmark collections{\\textbackslash}nwere chosen as the testbeds: Reuters-21578 and small portion of Reuters{\\textbackslash}nCorpus Version 1 (RCV1), making the new results comparable to published{\\textbackslash}nresults. We found that feature selection methods based on chi2 statistics{\\textbackslash}nconsistently outperformed those based on other criteria (including{\\textbackslash}ninformation gain) for all four classifiers and both data collections,{\\textbackslash}nand that a further increase in performance was obtained by combining{\\textbackslash}nuncorrelated and high-performing feature selection methods.The results{\\textbackslash}nwe obtained using only 3\\% of the available features are among the{\\textbackslash}nbest reported, including results obtained with the full feature set.},\n\tbooktitle = {International {Conference} on {Information} and {Knowledge} {Management}, {Proceedings}},\n\tauthor = {Rogati, Monica and Yang, Yiming},\n\tyear = {2002},\n\tkeywords = {Feature selection, Text classification, feature selection},\n\tpages = {659--661},\n}\n\n","author_short":["Rogati, M.","Yang, Y."],"key":"Rogati2002","id":"Rogati2002","bibbaseid":"rogati-yang-highperformingfeatureselectionfortextclassification-2002","role":"author","urls":{},"keyword":["Feature selection","Text classification","feature selection"],"metadata":{"authorlinks":{}},"html":""},"bibtype":"inproceedings","biburl":"https://bibbase.org/zotero/ifromm","dataSources":["N4kJAiLiJ7kxfNsoh"],"keywords":["feature selection","text classification","feature 
selection"],"search_terms":["high","performing","feature","selection","text","classification","rogati","yang"],"title":"High-performing feature selection for text classification","year":2002}