Are Neural Language Models Good Plagiarists? A Benchmark for Neural Paraphrase Detection. Wahle, J. P., Ruas, T., Meuschke, N., & Gipp, B. In Proceedings of the ACM/IEEE Joint Conference on Digital Libraries (JCDL), Virtual Event, September, 2021.
Paper
Code/data abstract bibtex 2 downloads Neural language models such as BERT allow for human-like text paraphrasing. This ability threatens academic integrity, as it aggravates identifying machine-obfuscated plagiarism. We make two contributions to foster the research on detecting these novel machine-paraphrases. First, we provide the first large-scale dataset of documents paraphrased using the Transformer-based models BERT, RoBERTa, and Longformer. The dataset includes paragraphs from scientific papers on arXiv, theses, and Wikipedia articles and their paraphrased counterparts (1.5M paragraphs in total). We show the paraphrased text maintains the semantics of the original source. Second, we benchmark how well neural classification models can distinguish the original and paraphrased text. The dataset and source code of our study are publicly available.
@inproceedings{WahleRMG21,
  address   = {Virtual Event},
  title     = {Are Neural Language Models Good Plagiarists? {A} Benchmark for Neural Paraphrase Detection},
  url       = {https://www.gipp.com/wp-content/papercite-data/pdf/wahle2021.pdf},
  abstract  = {Neural language models such as BERT allow for human-like text paraphrasing. This ability threatens academic integrity, as it aggravates identifying machine-obfuscated plagiarism. We make two contributions to foster the research on detecting these novel machine-paraphrases. First, we provide the first large-scale dataset of documents paraphrased using the Transformer-based models BERT, RoBERTa, and Longformer. The dataset includes paragraphs from scientific papers on arXiv, theses, and Wikipedia articles and their paraphrased counterparts (1.5M paragraphs in total). We show the paraphrased text maintains the semantics of the original source. Second, we benchmark how well neural classification models can distinguish the original and paraphrased text. The dataset and source code of our study are publicly available.},
  booktitle = {Proceedings of the {ACM}/{IEEE} Joint Conference on Digital Libraries ({JCDL})},
  author    = {Wahle, Jan Philip and Ruas, Terry and Meuschke, Norman and Gipp, Bela},
  month     = sep,
  year      = {2021},
  note      = {Code and data: https://doi.org/10.5281/zenodo.4621403},
  keywords  = {archived},
}
Downloads: 2
{"_id":"HSnPjuNnSqnQLD2YE","bibbaseid":"wahle-ruas-meuschke-gipp-areneurallanguagemodelsgoodplagiaristsabenchmarkforneuralparaphrasedetection-2021","author_short":["Wahle, J. P.","Ruas, T.","Meuschke, N.","Gipp, B."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","address":"Virtual Event","title":"Are Neural Language Models Good Plagiarists? A Benchmark for Neural Paraphrase Detection","abstract":"Neural language models such as BERT allow for human-like text paraphrasing. This ability threatens academic integrity, as it aggravates identifying machine-obfuscated plagiarism. We make two contributions to foster the research on detecting these novel machine-paraphrases. First, we provide the first large-scale dataset of documents paraphrased using the Transformer-based models BERT, RoBERTa, and Longformer. The dataset includes paragraphs from scientific papers on arXiv, theses, and Wikipedia articles and their paraphrased counterparts (1.5M paragraphs in total). We show the paraphrased text maintains the semantics of the original source. Second, we benchmark how well neural classification models can distinguish the original and paraphrased text. The dataset and source code of our study are publicly available.","booktitle":"Proceedings of the ACM/IEEE Joint Conference on Digital Libraries (JCDL)","author":[{"propositions":[],"lastnames":["Wahle"],"firstnames":["Jan","Philip"],"suffixes":[]},{"propositions":[],"lastnames":["Ruas"],"firstnames":["Terry"],"suffixes":[]},{"propositions":[],"lastnames":["Meuschke"],"firstnames":["Norman"],"suffixes":[]},{"propositions":[],"lastnames":["Gipp"],"firstnames":["Bela"],"suffixes":[]}],"month":"September","year":"2021","keywords":"archived","bibtex":"@inproceedings{WahleRMG21,\n\taddress = {Virtual Event},\n\ttitle = {Are {Neural} {Language} {Models} {Good} {Plagiarists}? 
{A} {Benchmark} for {Neural} {Paraphrase} {Detection}},\n\turl = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/wahle2021.pdf code/data=https://doi.org/10.5281/zenodo.4621403},\n\tabstract = {Neural language models such as BERT allow for human-like text paraphrasing. This ability threatens academic integrity, as it aggravates identifying machine-obfuscated plagiarism. We make two contributions to foster the research on detecting these novel machine-paraphrases. First, we provide the first large-scale dataset of documents paraphrased using the Transformer-based models BERT, RoBERTa, and Longformer. The dataset includes paragraphs from scientific papers on arXiv, theses, and Wikipedia articles and their paraphrased counterparts (1.5M paragraphs in total). We show the paraphrased text maintains the semantics of the original source. Second, we benchmark how well neural classification models can distinguish the original and paraphrased text. The dataset and source code of our study are publicly available.},\n\tbooktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})},\n\tauthor = {Wahle, Jan Philip and Ruas, Terry and Meuschke, Norman and Gipp, Bela},\n\tmonth = sep,\n\tyear = {2021},\n\tkeywords = {archived},\n}\n\n","author_short":["Wahle, J. 
P.","Ruas, T.","Meuschke, N.","Gipp, B."],"urlpaper":"https://www.gipp.com/wp-content/papercite-data/pdf/wahle2021.pdf","urlcode/data":"https://doi.org/10.5281/zenodo.4621403","key":"WahleRMG21","id":"WahleRMG21","bibbaseid":"wahle-ruas-meuschke-gipp-areneurallanguagemodelsgoodplagiaristsabenchmarkforneuralparaphrasedetection-2021","role":"author","urls":{"Paper":"https://www.gipp.com/wp-content/papercite-data/pdf/wahle2021.pdf","Code/data":"https://doi.org/10.5281/zenodo.4621403"},"keyword":["archived"],"metadata":{"authorlinks":{}},"downloads":2},"bibtype":"inproceedings","biburl":"https://api.zotero.org/groups/2532143/items?key=DOjJ33bOgISaFjBIBr7jCV5S&format=bibtex&limit=100","dataSources":["Zp98Nuv7ftsXLefzT","aEHCfX6B2taJt8dfa","9qTaLWxMN5hLpMP8m","xteq4cdC6ATE2G6Fg","JNgeyAG2vQ8k88oYh","FPjHiAkAja6XvmScK","RTGAqwGfLTSqYQMsS","Y7kZGjoN5Erk3Lo2J","yM7MefT3mRkY9m7i4","jnWJCpbQCoWvxj9kz","F32umBkhFrpeJbp7A","BWzEyLkMvdMGpHpr6","hBAe6Z5DsNbrQtje2","e3AdWzdxYmb85Fn5D","MtqPmSRuq4X8FJqNT","YCwvFifyPbazBYMQD","6oZMeYhGKA2Mp8xhF","gYMS6DBXsNosXKcRC","bQwdfx3o8Q3vnsqfH","SzFkcrpurPzNHEyqX","6KJgnNtYZiwwFkcGq","XJBi8b8xDjDoWPzcZ","kHqqD8pzLteJJWS2X","hG7rv86o2PDG2z44d","aJH3D6QaHCDgg2JGg"],"keywords":["archived"],"search_terms":["neural","language","models","good","plagiarists","benchmark","neural","paraphrase","detection","wahle","ruas","meuschke","gipp"],"title":"Are Neural Language Models Good Plagiarists? A Benchmark for Neural Paraphrase Detection","year":2021,"downloads":2}