Robust, lexicalized native language identification. Brooke, J. & Hirst, G. In Proceedings, 24th International Conference on Computational Linguistics (COLING-2012), Mumbai, December, 2012. Abstract: Previous approaches to the task of native language identification (Koppel et al., 2005) have been limited to small, within-corpus evaluations. Because these are restrictive and unreliable, we apply cross-corpus evaluation to the task. We demonstrate the efficacy of lexical features, which had previously been avoided due to the within-corpus topic confounds, and provide a detailed evaluation of various options, including a simple bias adaptation technique and a number of classifier algorithms. Using a new web corpus as a training set, we reach high classification accuracy for a 7-language task, performance which is robust across two independent test sets. Although we show that even higher accuracy is possible using cross-validation, we present strong evidence calling into question the validity of cross-validation evaluation using the standard dataset.
@comment{Conference paper by Brooke and Hirst, COLING 2012. The month is given as the standard BibTeX macro so that the bibliography style decides how it is printed. The "download" field is non-standard and is kept verbatim.}
@inproceedings{brooke12,
  author    = {Brooke, Julian and Hirst, Graeme},
  title     = {Robust, lexicalized native language identification},
  booktitle = {Proceedings, 24th International Conference on
               Computational Linguistics ({COLING-2012})},
  year      = {2012},
  address   = {Mumbai},
  month     = dec,
  abstract  = {Previous approaches to the task of native language
               identification (Koppel et al., 2005) have been limited to
               small, within-corpus evaluations. Because these are
               restrictive and unreliable, we apply cross-corpus
               evaluation to the task. We demonstrate the efficacy of
               lexical features, which had previously been avoided due to
               the within-corpus topic confounds, and provide a detailed
               evaluation of various options, including a simple bias
               adaptation technique and a number of classifier algorithms.
               Using a new web corpus as a training set, we reach high
               classification accuracy for a 7-language task, performance
               which is robust across two independent test sets. Although
               we show that even higher accuracy is possible using
               cross-validation, we present strong evidence calling into
               question the validity of cross-validation evaluation using
               the standard dataset.},
  download  = {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-COLING-2012.pdf},
}
Downloads: 0
{"_id":{"_str":"53d57a48f414ae191e0003a4"},"__v":0,"authorIDs":[],"author_short":["Brooke, J.","Hirst, G."],"bibbaseid":"brooke-hirst-robustlexicalizednativelanguageidentification-2012","bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"firstnames":["Julian"],"propositions":[],"lastnames":["Brooke"],"suffixes":[]},{"firstnames":["Graeme"],"propositions":[],"lastnames":["Hirst"],"suffixes":[]}],"title":"Robust, lexicalized native language identification","booktitle":"Proceedings, 24th International Conference on Computational Linguistics (COLING-2012)","year":"2012","address":"Mumbai","month":"December","abstract":"Previous approaches to the task of native language identification (Koppel et al., 2005) have been limited to small, within-corpus evaluations. Because these are restrictive and unreliable, we apply cross-corpus evaluation to the task. We demonstrate the efficacy of lexical features, which had previously been avoided due to the within-corpus topic confounds, and provide a detailed evaluation of various options, including a simple bias adaptation technique and a number of classifier algorithms. Using a new web corpus as a training set, we reach high classification accuracy for a 7-language task, performance which is robust across two independent test sets. 
Although we show that even higher accuracy is possible using cross-validation, we present strong evidence calling into question the validity of cross-validation evaluation using the standard dataset.","download":"http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-COLING-2012.pdf","bibtex":"@InProceedings{\t brooke12,\n author\t= {Julian Brooke and Graeme Hirst},\n title\t\t= {Robust, lexicalized native language identification},\n booktitle\t= {Proceedings, 24th International Conference on\n\t\t Computational Linguistics (COLING-2012)},\n year\t\t= 2012,\n address\t= {Mumbai},\n month\t\t= {December},\n abstract\t= {Previous approaches to the task of native language\n\t\t identification (Koppel et al., 2005) have been limited to\n\t\t small, within-corpus evaluations. Because these are\n\t\t restrictive and unreliable, we apply cross-corpus\n\t\t evaluation to the task. We demonstrate the efficacy of\n\t\t lexical features, which had previously been avoided due to\n\t\t the within-corpus topic confounds, and provide a detailed\n\t\t evaluation of various options, including a simple bias\n\t\t adaptation technique and a number of classifier algorithms.\n\t\t Using a new web corpus as a training set, we reach high\n\t\t classification accuracy for a 7-language task, performance\n\t\t which is robust across two independent test sets. 
Although\n\t\t we show that even higher accuracy is possible using\n\t\t cross-validation, we present strong evidence calling into\n\t\t question the validity of cross-validation evaluation using\n\t\t the standard dataset.},\n download\t= {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-COLING-2012.pdf}\n\t\t \n}\n\n","author_short":["Brooke, J.","Hirst, G."],"key":"brooke12","id":"brooke12","bibbaseid":"brooke-hirst-robustlexicalizednativelanguageidentification-2012","role":"author","urls":{},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"www.cs.toronto.edu/~fritz/tmp/compling.bib","creationDate":"2014-07-27T22:16:40.602Z","downloads":0,"keywords":[],"search_terms":["robust","lexicalized","native","language","identification","brooke","hirst"],"title":"Robust, lexicalized native language identification","year":2012,"dataSources":["n8jB5BJxaeSmH6mtR","6b6A9kbkw4CsEGnRX"]}