Paragraph Clustering for Intrinsic Plagiarism Detection using a Stylistic Vector-Space Model with Extrinsic Features. Brooke, J. & Hirst, G. In Proceedings, PAN 2012 Lab: Uncovering Plagiarism, Authorship and Social Software Misuse — at the CLEF 2012 Conference and Labs of the Evaluation Forum: Information Access Evaluation meets Multilinguality, Multimodality, and Visual Analytics, Rome, September, 2012. abstract bibtex Our approach to the task of intrinsic plagiarism detection uses a vector-space model which eschews surface features in favor of richer extrinsic features, including those based on latent semantic analysis in a larger external corpus. We posit that the popularity and success of surface n-gram features is mostly due to the topic-biased nature of current artificial evaluations, a problem which unfortunately extends to the present PAN evaluation. One interesting of aspect of our approach is our way of dealing with small, imbalanced span sizes; we improved performance considerably in our development evaluation by countering these effect using the expected difference of sums of random variables.
@comment{Brooke and Hirst, PAN 2012 lab paper (CLEF 2012, Rome).
  The citation key is kept unchanged, although the paper is dated 2012, so
  that existing citations of this entry keep resolving. The "download" field
  is non-standard: standard styles ignore it, and it is preserved verbatim
  for whatever tooling reads it.}
@InProceedings{ brooke11,
  author    = {Brooke, Julian and Hirst, Graeme},
  title     = {Paragraph Clustering for Intrinsic Plagiarism Detection
               using a Stylistic Vector-Space Model with Extrinsic
               Features},
  booktitle = {Proceedings, {PAN} 2012 Lab: {Uncovering} Plagiarism,
               Authorship and Social Software Misuse --- at the {CLEF}
               2012 Conference and Labs of the Evaluation Forum:
               Information Access Evaluation meets Multilinguality,
               Multimodality, and Visual Analytics},
  year      = 2012,
  address   = {Rome},
  month     = sep,
  abstract  = {Our approach to the task of intrinsic plagiarism detection
               uses a vector-space model which eschews surface features in
               favor of richer extrinsic features, including those based
               on latent semantic analysis in a larger external corpus. We
               posit that the popularity and success of surface n-gram
               features is mostly due to the topic-biased nature of
               current artificial evaluations, a problem which
               unfortunately extends to the present PAN evaluation. One
               interesting aspect of our approach is our way of dealing
               with small, imbalanced span sizes; we improved performance
               considerably in our development evaluation by countering
               these effects using the expected difference of sums of
               random variables.},
  download  = {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-PAN-2012.pdf}
}
Downloads: 0
{"_id":{"_str":"53d57a48f414ae191e00038a"},"__v":0,"authorIDs":[],"author_short":["Brooke, J.","Hirst, G."],"bibbaseid":"brooke-hirst-paragraphclusteringforintrinsicplagiarismdetectionusingastylisticvectorspacemodelwithextrinsicfeatures-2012","bibdata":{"bibtype":"inproceedings","type":"inproceedings","author":[{"firstnames":["Julian"],"propositions":[],"lastnames":["Brooke"],"suffixes":[]},{"firstnames":["Graeme"],"propositions":[],"lastnames":["Hirst"],"suffixes":[]}],"title":"Paragraph Clustering for Intrinsic Plagiarism Detection using a Stylistic Vector-Space Model with Extrinsic Features","booktitle":"Proceedings, PAN 2012 Lab: Uncovering Plagiarism, Authorship and Social Software Misuse — at the CLEF 2012 Conference and Labs of the Evaluation Forum: Information Access Evaluation meets Multilinguality, Multimodality, and Visual Analytics","year":"2012","address":"Rome","month":"September","abstract":"Our approach to the task of intrinsic plagiarism detection uses a vector-space model which eschews surface features in favor of richer extrinsic features, including those based on latent semantic analysis in a larger external corpus. We posit that the popularity and success of surface n-gram features is mostly due to the topic-biased nature of current artificial evaluations, a problem which unfortunately extends to the present PAN evaluation. 
One interesting of aspect of our approach is our way of dealing with small, imbalanced span sizes; we improved performance considerably in our development evaluation by countering these effect using the expected difference of sums of random variables.","download":"http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-PAN-2012.pdf","bibtex":"@InProceedings{\t brooke11,\n author\t= {Julian Brooke and Graeme Hirst},\n title\t\t= {Paragraph Clustering for Intrinsic Plagiarism Detection\n\t\t using a Stylistic Vector-Space Model with Extrinsic\n\t\t Features},\n booktitle\t= {Proceedings, {PAN} 2012 Lab: {U}ncovering Plagiarism,\n\t\t Authorship and Social Software Misuse --- at the {CLEF}\n\t\t 2012 Conference and Labs of the Evaluation Forum:\n\t\t Information Access Evaluation meets Multilinguality,\n\t\t Multimodality, and Visual Analytics},\n year\t\t= 2012,\n address\t= {Rome},\n month\t\t= {September},\n abstract\t= {Our approach to the task of intrinsic plagiarism detection\n\t\t uses a vector-space model which eschews surface features in\n\t\t favor of richer extrinsic features, including those based\n\t\t on latent semantic analysis in a larger external corpus. We\n\t\t posit that the popularity and success of surface n-gram\n\t\t features is mostly due to the topic-biased nature of\n\t\t current artificial evaluations, a problem which\n\t\t unfortunately extends to the present PAN evaluation. 
One\n\t\t interesting of aspect of our approach is our way of dealing\n\t\t with small, imbalanced span sizes; we improved performance\n\t\t considerably in our development evaluation by countering\n\t\t these effect using the expected difference of sums of\n\t\t random variables.},\n download\t= {http://ftp.cs.toronto.edu/pub/gh/Brooke+Hirst-PAN-2012.pdf}\n\t\t \n}\n\n","author_short":["Brooke, J.","Hirst, G."],"key":"brooke11","id":"brooke11","bibbaseid":"brooke-hirst-paragraphclusteringforintrinsicplagiarismdetectionusingastylisticvectorspacemodelwithextrinsicfeatures-2012","role":"author","urls":{},"metadata":{"authorlinks":{}}},"bibtype":"inproceedings","biburl":"www.cs.toronto.edu/~fritz/tmp/compling.bib","creationDate":"2014-07-27T22:16:40.268Z","downloads":0,"keywords":[],"search_terms":["paragraph","clustering","intrinsic","plagiarism","detection","using","stylistic","vector","space","model","extrinsic","features","brooke","hirst"],"title":"Paragraph Clustering for Intrinsic Plagiarism Detection using a Stylistic Vector-Space Model with Extrinsic Features","year":2012,"dataSources":["n8jB5BJxaeSmH6mtR","6b6A9kbkw4CsEGnRX"]}