Measuring interlanguage: Native language identification with L1-influence metrics. Brooke, J. & Hirst, G. In Proceedings, 8th ELRA Conference on Language Resources and Evaluation (LREC 2012), Istanbul, May, 2012. Abstract: The task of native language (L1) identification suffers from a relative paucity of useful training corpora, and standard within-corpus evaluation is often problematic due to topic bias. In this paper, we introduce a method for L1 identification in second language (L2) texts that relies only on much more plentiful L1 data, rather than the L2 texts that are traditionally used for training. In particular, we do word-by-word translation of large L1 blog corpora to create a mapping to L2 forms that are a possible result of language transfer, and then use that information for unsupervised classification. We show this method is effective in several different learner corpora, with bigram features being particularly useful.
@InProceedings{ brooke6,
author = {Julian Brooke and Graeme Hirst},
title = {Measuring interlanguage: Native language identification
with L1-influence metrics},
address = {Istanbul},
booktitle = {Proceedings, 8th ELRA Conference on Language Resources and
Evaluation (LREC 2012)},
year = {2012},
month = {May},
download = {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-LREC-2012.pdf},
abstract = {The task of native language (L1) identification suffers
from a relative paucity of useful training corpora, and
standard within-corpus evaluation is often problematic due
to topic bias. In this paper, we introduce a method for L1
identification in second language (L2) texts that relies
only on much more plentiful L1 data, rather than the L2
texts that are traditionally used for training. In
particular, we do word-by-word translation of large L1 blog
corpora to create a mapping to L2 forms that are a possible
result of language transfer, and then use that information
for unsupervised classification. We show this method is
effective in several different learner corpora, with bigram
features being particularly useful.}
}
Downloads: 0
{"_id":{"_str":"53d57a48f414ae191e00038d"},"__v":0,"authorIDs":[],"author_short":["Brooke, J.","Hirst, G."],"bibbaseid":"brooke-hirst-measuringinterlanguagenativelanguageidentificationwithl1influencemetrics-2012","bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"firstnames":["Julian"],"propositions":[],"lastnames":["Brooke"],"suffixes":[]},{"firstnames":["Graeme"],"propositions":[],"lastnames":["Hirst"],"suffixes":[]}],"title":"Measuring interlanguage: Native language identification with L1-influence metrics","address":"Istanbul","booktitle":"Proceedings, 8th ELRA Conference on Language Resources and Evaluation (LREC 2012) ","year":"2012","month":"May","download":"http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-LREC-2012.pdf","abstract":"The task of native language (L1) identification suffers from a relative paucity of useful training corpora, and standard within-corpus evaluation is often problematic due to topic bias. In this paper, we introduce a method for L1 identification in second language (L2) texts that relies only on much more plentiful L1 data, rather than the L2 texts that are traditionally used for training. In particular, we do word-by-word translation of large L1 blog corpora to create a mapping to L2 forms that are a possible result of language transfer, and then use that information for unsupervised classification. We show this method is effective in several different learner corpora, with bigram features being particularly useful.","bibtex":"@InProceedings{\t brooke6,\n author\t= {Julian Brooke and Graeme Hirst},\n title\t\t= {Measuring interlanguage: Native language identification\n\t\t with L1-influence metrics},\n address\t= {Istanbul},\n booktitle\t= {Proceedings, 8th ELRA Conference on Language Resources and\n\t\t Evaluation (LREC 2012) },\n year\t\t= {2012},\n month\t\t= {May},\n download\t= {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-LREC-2012.pdf}\n\t\t ,\n abstract\t= {The task of native language (L1) identification suffers\n\t\t from a relative paucity of useful training corpora, and\n\t\t standard within-corpus evaluation is often problematic due\n\t\t to topic bias. In this paper, we introduce a method for L1\n\t\t identification in second language (L2) texts that relies\n\t\t only on much more plentiful L1 data, rather than the L2\n\t\t texts that are traditionally used for training. In\n\t\t particular, we do word-by-word translation of large L1 blog\n\t\t corpora to create a mapping to L2 forms that are a possible\n\t\t result of language transfer, and then use that information\n\t\t for unsupervised classification. We show this method is\n\t\t effective in several different learner corpora, with bigram\n\t\t features being particularly useful.}\n}\n\n","author_short":["Brooke, J.","Hirst, G."],"key":"brooke6","id":"brooke6","bibbaseid":"brooke-hirst-measuringinterlanguagenativelanguageidentificationwithl1influencemetrics-2012","role":"author","urls":{},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"www.cs.toronto.edu/~fritz/tmp/compling.bib","creationDate":"2014-07-27T22:16:40.275Z","downloads":0,"keywords":[],"search_terms":["measuring","interlanguage","native","language","identification","influence","metrics","brooke","hirst"],"title":"Measuring interlanguage: Native language identification with L1-influence metrics","year":2012,"dataSources":["n8jB5BJxaeSmH6mtR","6b6A9kbkw4CsEGnRX"]}