@misc{SpindeHHR24, title = {The {Media} {Bias} {Taxonomy}: {A} {Systematic} {Literature} {Review} on the {Forms} and {Automated} {Detection} of {Media} {Bias}}, shorttitle = {The {Media} {Bias} {Taxonomy}}, url = {paper=https://arxiv.org/pdf/2312.16148}, abstract = {The way the media presents events can significantly affect public perception, which in turn can alter people's beliefs and views. Media bias describes a one-sided or polarizing perspective on a topic. This article summarizes the research on computational methods to detect media bias by systematically reviewing 3140 research papers published between 2019 and 2022. To structure our review and support a mutual understanding of bias across research domains, we introduce the Media Bias Taxonomy, which provides a coherent overview of the current state of research on media bias from different perspectives. We show that media bias detection is a highly active research field, in which transformer-based classification approaches have led to significant improvements in recent years. These improvements include higher classification accuracy and the ability to detect more fine-granular types of bias. However, we have identified a lack of interdisciplinarity in existing projects, and a need for more awareness of the various types of media bias to support methodologically thorough performance evaluations of media bias detection systems. Concluding from our analysis, we see the integration of recent machine learning advancements with reliable and diverse bias assessment strategies from other research areas as the most promising area for future research contributions in the field.}, urldate = {2024-05-13}, publisher = {arXiv}, author = {Spinde, Timo and Hinterreiter, Smi and Haak, Fabian and Ruas, Terry and Giese, Helge and Meuschke, Norman and Gipp, Bela}, month = jan, year = {2024}, note = {arXiv:2312.16148 [cs]}, }
@article{McDowellBBC23, title = {Future {Applications} for {Data} {Storytelling} {Toolkits}}, volume = {3}, issn = {1556-5068}, url = {paper=https://github.com/nmeuschke/nmeuschke.github.io/blob/master/pdf/McDowellBBC23.pdf}, doi = {10.2139/ssrn.4513189}, language = {en}, number = {6}, urldate = {2023-09-17}, journal = {Information Matters}, author = {McDowell, Kate and Bettivia, Rhiannon and Bonn, Maria S. and Campbell-Meier, Jennifer and Cheng, Jessica Yi-Yun and Corieri, Isabella Lena and Dinh, Ly and Ehlinger, Sam and Enwald, Heidi and Hossain, Md Khalid and Meuschke, Norman and Ocepek, Melissa and Anwar, Misita and Sharma, Sarika}, year = {2023}, }
@book{Meuschke23, title = {Analyzing {Non}-{Textual} {Content} {Elements} to {Detect} {Academic} {Plagiarism}}, isbn = {978-3-658-42061-1 978-3-658-42062-8}, url = {preprint=https://zenodo.org/record/4913345/files/Meuschke2021Thesis.pdf publisher=https://link.springer.com/10.1007/978-3-658-42062-8}, abstract = {Identifying plagiarism is a pressing problem for research institutions, publishers, and funding bodies. Current detection methods focus on textual analysis and find copied, moderately reworded, or translated content. However, detecting more subtle forms of plagiarism, including strong paraphrasing, sense-for-sense translations, or the reuse of non-textual content and ideas, remains a challenge. This book presents a novel approach to address this problem—analyzing non-textual elements in academic documents, such as citations, images, and mathematical content. The proposed detection techniques are validated in five evaluations using confirmed plagiarism cases and exploratory searches for new instances. The results show that non-textual elements contain much semantic information, are language-independent, and resilient to typical tactics for concealing plagiarism. Incorporating non-textual content analysis complements text-based detection approaches and increases the detection effectiveness, particularly for disguised forms of plagiarism. The book introduces the first integrated plagiarism detection system that combines citation, image, math, and text similarity analysis. Its user interface features visual aids that significantly reduce the time and effort users must invest in examining content similarity.}, language = {en}, publisher = {Springer Fachmedien Wiesbaden}, author = {Meuschke, Norman}, year = {2023}, doi = {10.1007/978-3-658-42062-8}, }
@article{IhleTSM23, title = {Incentive {Mechanisms} in {Peer}-to-{Peer} {Networks} — {A} {Systematic} {Literature} {Review}}, volume = {55}, issn = {0360-0300, 1557-7341}, url = {paper=https://gipplab.org/wp-content/papercite-data/pdf/ihle2023.pdf}, doi = {10.1145/3578581}, abstract = {Centralized networks inevitably exhibit single points of failure that malicious actors regularly target. Decentralized networks are more resilient if numerous participants contribute to the network’s functionality. Most decentralized networks employ incentive mechanisms to coordinate the participation and cooperation of peers and thereby ensure the functionality and security of the network. This article systematically reviews incentive mechanisms for decentralized networks and networked systems by covering 165 prior literature reviews and 178 primary research papers published between 1993 and October 2022. Of the considered sources, we analyze eleven literature reviews and 105 primary research papers in detail by categorizing and comparing the distinctive properties of the presented incentive mechanisms. The reviewed incentive mechanisms establish fairness and reward participation and cooperative behavior. We review work that substitutes central authority through independent and subjective mechanisms run in isolation at each participating peer and work that applies multiparty computation. We use monetary, reputation, and service rewards as categories to differentiate the implementations and evaluate each incentive mechanism’s data management, attack resistance, and contribution model. Further, we highlight research gaps and deficiencies in reproducibility and comparability. Finally, we summarize our assessments and provide recommendations to apply incentive mechanisms to decentralized networks that share computational resources.}, language = {en}, number = {14}, urldate = {2023-04-19}, journal = {ACM Computing Surveys}, author = {Ihle, Cornelius and Trautwein, Dennis and Schubotz, Moritz and Meuschke, Norman and Gipp, Bela}, month = jul, year = {2023}, }
@inproceedings{SatputeGSM23, address = {Santa Fe, NM, USA}, title = {{TEIMMA}: {The} {First} {Content} {Reuse} {Annotator} for {Text}, {Images}, and {Math}}, url = {paper=https://gipplab.org/wp-content/papercite-data/pdf/satpute2023.pdf demo=https://teimma.gipplab.org/}, doi = {10.1109/JCDL57899.2023.00056}, abstract = {This demo paper presents the first tool to annotate the reuse of text, images, and mathematical formulae in a document pair—TEIMMA. Annotating content reuse is particularly useful to develop plagiarism detection algorithms. Real-world content reuse is often obfuscated, which makes it challenging to identify such cases. TEIMMA allows entering the obfuscation type to enable novel classifications for confirmed cases of plagiarism. It enables recording different reuse types for text, images, and mathematical formulae in HTML and supports users by visualizing the content reuse in a document pair using similarity detection methods for text and math.}, booktitle = {Proceedings of 23rd {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Satpute, Ankit and Greiner-Petter, André and Schubotz, Moritz and Meuschke, Norman and Aizawa, Akiko and Gipp, Bela}, year = {2023}, }
@inproceedings{WahleRMM23, address = {Santa Fe, NM, USA}, title = {{AI} {Usage} {Card}: {Responsibly} {Reporting} {AI}-generated {Content}}, url = {paper=https://gipplab.org/wp-content/papercite-data/pdf/wahle2023.pdf demo=https://ai-cards.org}, doi = {10.1109/JCDL57899.2023.00060}, abstract = {There are growing concerns about the responsible use of content-generating AI systems. Current guidelines for using AI are specific to certain scenarios and not applicable to scientific research. We propose a three-dimensional model consisting of transparency, integrity, and accountability to define responsible AI use in science and introduce AI Usage Cards to report the use of AI in scientific research. Our model and reporting system promotes the ethical and responsible use of AI and provides a standardized approach for reporting AI across research fields. We also offer a free service to generate AI Usage Cards via a questionnaire at https://ai-cards.org.}, booktitle = {Proceedings of 23rd {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {IEEE Computer Society}, author = {Wahle, Jan Philip and Ruas, Terry and Mohammad, Saif M. and Meuschke, Norman and Gipp, Bela}, year = {2023}, }
@techreport{GippGSM23, title = {Final {Report} for the {DFG}-{Project} {Methods} and {Tools} to {Advance} the {Retrieval} of {Mathematical} {Knowledge} from {Digital} {Libraries} for {Search}-, {Recommendation}- and {Assistance}-{Systems}}, copyright = {Creative Commons Attribution 4.0 International, Open Access}, url = {paper=https://zenodo.org/record/7924634/files/Gipp2023_DFG_Report_MathIR.pdf demo_1=https://lacast.wmflabs.org/ demo_2=https://physwikiquiz.wmflabs.org}, abstract = {This project investigated new approaches and technologies to enhance the accessibility of mathematical content and its semantic information for a broad range of information retrieval applications. To achieve this goal, the project addressed three main research challenges: (1) syntactic analysis of mathematical expressions, (2) semantic enrichment of mathematical expressions, and (3) evaluation using quality metrics and demonstrators. To make our research useful for the research community, we published tools that enable researchers to process mathematical expressions more effectively and efficiently. The project has made significant research contributions to various Mathematical Information Retrieval (MathIR) tasks and systems, including plagiarism detection and recommendation systems, search engines, the first mathematical type assistance system, math question answering and tutoring systems, automatic plausibility checks for mathematical expressions on Wikipedia, automatic computability of mathematical content via Computer Algebra Systems (CAS), and others. Although our project focused on MathIR tasks, its impact on other natural language research was significant, leading to a more extensive range of demonstrators than originally expected. Many of these demonstrators introduced novel applications, such as the tutoring system PhysWikiQuiz or LaCASt, which automatically verifies the correctness of math formulae on Wikipedia or the Digital Library of Mathematical Functions (DLMF) via commercial CAS. During the project, we published 29 peer-reviewed articles in international venues, including prestigious conferences like the Joint Conference on Digital Libraries (JCDL) and The Web Conference (WWW) (CORE rank A*), as well as journals such as IEEE Transactions on Pattern Analysis and Machine Intelligence (TPAMI) (IF: 24.314) and Scientometrics (IF: 3.801). Our Wikipedia demonstrator was also featured in public media. Furthermore, we actively presented our contributions, especially demonstrators, to the research community in multiple workshops. This project has strengthened our international collaborations, particularly with colleagues at the National Institute of Standards and Technology (NIST) in the US and the National Institute of Informatics (NII) in Japan. Several subprojects were partially developed in course projects and theses at the Universities of Konstanz, Wuppertal, and Göttingen, exposing junior researchers to cutting-edge technologies and sensitizing students and researchers to the outstanding issues in MathIR technologies. We firmly believe that this project will have a lasting effect on following MathIR technologies. Several of the subprojects initiated as part of this grant are ongoing and motivating follow-up DFG projects, such as Analyzing Mathematics to Detect Disguised Academic Plagiarism (project no. 437179652).}, language = {en}, urldate = {2023-05-15}, institution = {University of Goettingen}, author = {Gipp, Bela and Greiner-Petter, André and Schubotz, Moritz and Meuschke, Norman}, month = mar, year = {2023}, doi = {10.5281/ZENODO.7924634}, }
@incollection{MeuschkeJSM23, address = {Cham}, series = {{LNCS}}, title = {A {Benchmark} of {PDF} {Information} {Extraction} {Tools} {Using} a {Multi}-task and {Multi}-domain {Evaluation} {Framework} for {Academic} {Documents}}, volume = {13972}, isbn = {978-3-031-28031-3 978-3-031-28032-0}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2023.pdf code/data=https://github.com/gipplab/pdf-benchmark}, booktitle = {Information for a {Better} {World}: {Normality}, {Virtuality}, {Physicality}, {Inclusivity}}, publisher = {Springer Nature Switzerland}, author = {Meuschke, Norman and Jagdale, Apurva and Spinde, Timo and Mitrović, Jelena and Gipp, Bela}, year = {2023}, doi = {10.1007/978-3-031-28032-0_31}, pages = {383--405}, }
@article{WahleRMG22, title = {Incorporating {Word} {Sense} {Disambiguation} in {Neural} {Language} {Models}}, copyright = {Creative Commons Attribution Share Alike 4.0 International}, url = {paper=https://arxiv.org/pdf/2106.07967.pdf}, doi = {10.48550/ARXIV.2106.07967}, abstract = {We present two supervised (pre-)training methods to incorporate gloss definitions from lexical resources into neural language models (LMs). The training improves our models' performance for Word Sense Disambiguation (WSD) but also benefits general language understanding tasks while adding almost no parameters. We evaluate our techniques with seven different neural LMs and find that XLNet is more suitable for WSD than BERT. Our best-performing method exceeds state-of-the-art WSD techniques on the SemCor 3.0 dataset by 0.5\% F1 and increases BERT's performance on the GLUE benchmark by 1.1\% on average.}, urldate = {2022-09-22}, journal = {arXiv:2106.07967v3 [cs.CL]}, author = {Wahle, Jan Philip and Ruas, Terry and Meuschke, Norman and Gipp, Bela}, year = {2022}, }
@incollection{MeuschkeWG22, address = {Berlin, Boston}, edition = {7}, title = {F 5 {Plagiat}}, isbn = {978-3-11-076904-3}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2022.pdf}, abstract = {An introduction to the legal, methodological, and technological problems of and solutions for the prevention and detection of academic plagiarism}, language = {de}, urldate = {2022-11-25}, booktitle = {Grundlagen der {Informationswissenschaft}}, publisher = {De Gruyter Saur}, author = {Meuschke, Norman and Walger, Nicole and Gipp, Bela}, editor = {Kuhlen, Rainer and Lewandowski, Dirk and Semar, Wolfgang and Womser-Hacker, Christa}, month = nov, year = {2022}, doi = {10.1515/9783110769043-069}, pages = {817--828}, }
@inproceedings{BreitingerHFM22, address = {Cologne Germany}, title = {Recommending {Research} {Papers} to {Chemists}: {A} {Specialized} {Interface} for {Chemical} {Entity} {Exploration}}, isbn = {978-1-4503-9345-4}, shorttitle = {Recommending {Research} {Papers} to {Chemists}}, url = {paper=https://arxiv.org/ftp/arxiv/papers/2205/2205.05414.pdf code=https://github.com/gipplab/chem_formula_extractor}, doi = {10.1145/3529372.3533281}, abstract = {Researchers and scientists increasingly rely on specialized information retrieval (IR) or recommendation systems (RS) to support them in their daily research tasks. Paper recommender systems are one such tool scientists use to stay on top of the ever-increasing number of academic publications in their field. Improving research paper recommender systems is an active research field. However, less research has focused on how the interfaces of research paper recommender systems can be tailored to suit the needs of different research domains. For example, in the field of biomedicine and chemistry, researchers are not only interested in textual relevance but may also want to discover or compare the contained chemical entity information found in a paper’s full text. Existing recommender systems for academic literature do not support the discovery of this non-textual, but semantically valuable, chemical entity data. We present the first implementation of a specialized chemistry paper recommender system capable of visualizing the contained chemical structures, chemical formulae, and synonyms for chemical compounds within the document’s full text. We review existing tools and related research in this field before describing the implementation of our ChemVis system. With the help of chemists, we are expanding the functionality of ChemVis, and will perform an evaluation of recommendation performance and usability in future work.}, language = {en}, urldate = {2022-06-22}, booktitle = {Proceedings of the 22nd {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries}}, publisher = {ACM}, author = {Breitinger, Corinna and Herklotz, Kay and Flegelskamp, Tim and Meuschke, Norman}, month = jun, year = {2022}, }
@inproceedings{WahleARM22, address = {Virtual Event}, title = {Testing the {Generalization} of {Neural} {Language} {Models} for {COVID}-19 {Misinformation} {Detection}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/wahle2022a.pdf code/data=https://github.com/ag-gipp/iConference22_COVID_misinformation}, doi = {10.1007/978-3-030-96957-8_33}, abstract = {A drastic rise in potentially life-threatening misinformation has been a by-product of the COVID-19 pandemic. Computational support to identify false information within the massive body of data on the topic is crucial to prevent harm. Researchers proposed many methods for flagging online misinformation related to COVID-19. However, these methods predominantly target specific content types (e.g., news) or platforms (e.g., Twitter). The methods’ capabilities to generalize were largely unclear so far. We evaluate fifteen Transformer-based models on five COVID-19 misinformation datasets that include social media posts, news articles, and scientific papers to fill this gap. We show tokenizers and models tailored to COVID-19 data do not provide a significant advantage over general-purpose ones. Our study provides a realistic assessment of models for detecting COVID-19 misinformation. We expect that evaluating a broad spectrum of datasets and models will benefit future research in developing misinformation detection systems.}, booktitle = {Proceedings of the {iConference}}, author = {Wahle, Jan Philip and Ashok, Nischal and Ruas, Terry and Meuschke, Norman and Ghosal, Tirthankar and Gipp, Bela}, year = {2022}, }
@inproceedings{WahleRFM22, address = {Virtual Event}, title = {Identifying {Machine}-{Paraphrased} {Plagiarism}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/wahle2022b.pdf code=https://github.com/jpelhaW/ParaphraseDetection data=https://doi.org/10.5281/zenodo.3608000 demo=https://purl.org/spindetector}, abstract = {Employing paraphrasing tools to conceal plagiarized text is a severe threat to academic integrity. To enable the detection of machine-paraphrased text, we evaluate the effectiveness of five pre-trained word embedding models combined with machine learning classifiers and state-of-the-art neural language models. We analyze preprints of research papers, graduation theses, and Wikipedia articles, which we paraphrased using different configurations of the tools SpinBot and SpinnerChief. The best performing technique, Longformer, achieved an average F1 score of 80.99\% (F1=99.68\% for SpinBot and F1=71.64\% for SpinnerChief cases), while human evaluators achieved F1=78.4\% for SpinBot and F1=65.6\% for SpinnerChief cases. We show that the automated classification alleviates shortcomings of widely-used text-matching systems, such as Turnitin and PlagScan. To facilitate future research, all data, code, and two web applications showcasing our contributions are openly available.}, booktitle = {Proceedings of the {iConference}}, author = {Wahle, Jan Philip and Ruas, Terry and Foltýnek, Tomáš and Meuschke, Norman and Gipp, Bela}, year = {2022}, }
@phdthesis{Meuschke21, address = {Konstanz, Germany}, type = {Doctoral {Thesis}}, title = {Analyzing {Non}-{Textual} {Content} {Elements} to {Detect} {Academic} {Plagiarism}}, copyright = {Creative Commons Attribution 4.0 International, Open Access}, url = {paper=https://zenodo.org/record/4913345/files/Meuschke2021Thesis.pdf code/data=http://thesis.meuschke.org demo=https://gipplab.org/projects/hyplag}, abstract = {Identifying academic plagiarism is a pressing problem, among others, for research institutions, publishers, and funding organizations. Detection approaches proposed so far analyze lexical, syntactical, and semantic text similarity. These approaches find copied, moderately reworded, and literally translated text. However, reliably detecting disguised plagiarism, such as strong paraphrases, sense-for-sense translations, and the reuse of non-textual content and ideas, is an open research problem. The thesis addresses this problem by proposing plagiarism detection approaches that implement a different concept: analyzing non-textual content in academic documents, specifically citations, images, and mathematical content. To validate the effectiveness of the proposed detection approaches, the thesis presents five evaluations that use real cases of academic plagiarism and exploratory searches for unknown cases. The evaluation results show that non-textual content elements contain a high degree of semantic information, are language-independent, and largely immutable to the alterations that authors typically perform to conceal plagiarism. Analyzing non-textual content complements text-based detection approaches and increases the detection effectiveness, particularly for disguised forms of academic plagiarism. To demonstrate the benefit of combining non-textual and text-based detection methods, the thesis describes the first plagiarism detection system that integrates the analysis of citation-based, image-based, math-based, and text-based document similarity. The system's user interface employs visualizations that significantly reduce the effort and time users must invest in examining content similarity.}, language = {en}, school = {University of Konstanz, Dept. of Computer and Information Science}, author = {Meuschke, Norman}, year = {2021}, doi = {10.5281/zenodo.4913345}, }
@inproceedings{WahleRMG21, address = {Virtual Event}, title = {Are {Neural} {Language} {Models} {Good} {Plagiarists}? {A} {Benchmark} for {Neural} {Paraphrase} {Detection}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/wahle2021.pdf code/data=https://doi.org/10.5281/zenodo.4621403}, abstract = {Neural language models such as BERT allow for human-like text paraphrasing. This ability threatens academic integrity, as it aggravates identifying machine-obfuscated plagiarism. We make two contributions to foster the research on detecting these novel machine-paraphrases. First, we provide the first large-scale dataset of documents paraphrased using the Transformer-based models BERT, RoBERTa, and Longformer. The dataset includes paragraphs from scientific papers on arXiv, theses, and Wikipedia articles and their paraphrased counterparts (1.5M paragraphs in total). We show the paraphrased text maintains the semantics of the original source. Second, we benchmark how well neural classification models can distinguish the original and paraphrased text. The dataset and source code of our study are publicly available.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Wahle, Jan Philip and Ruas, Terry and Meuschke, Norman and Gipp, Bela}, month = sep, year = {2021}, keywords = {archived}, }
@inproceedings{Stegmueller2021, address = {Virtual Event}, title = {Detecting {Cross}-{Language} {Plagiarism} using {Open} {Knowledge} {Graphs}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/stegmueller2021.pdf code/data=https://doi.org/10.5281/zenodo.5159398}, abstract = {Identifying cross-language plagiarism is challenging, especially for distant language pairs and sense-for-sense translations. We introduce the new multilingual retrieval model Cross-Language Ontology-Based Similarity Analysis (CL-OSA) for this task. CL-OSA represents documents as entity vectors obtained from the open knowledge graph Wikidata. Opposed to other methods, CL-OSA does not require computationally expensive machine translation, nor pre-training using comparable or parallel corpora. It reliably disambiguates homonyms and scales to allow its application to Web-scale document collections. We show that CL-OSA outperforms state-of-the-art methods for retrieving candidate documents from five large, topically diverse test corpora that include distant language pairs like Japanese-English. For identifying cross-language plagiarism at the character level, CL-OSA primarily improves the detection of sense-for-sense translations. For these challenging cases, CL-OSA's performance in terms of the well-established PlagDet score exceeds that of the best competitor by more than a factor of two. The code and data of our study are openly available.}, booktitle = {2nd {Workshop} on {Extraction} and {Evaluation} of {Knowledge} {Entities} from {Scientific} {Documents} ({EEKE2021}) at the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} 2021 ({JCDL2021})}, publisher = {ACM}, author = {Stegmueller, Johannes and Bauer-Marquart, Fabian and Meuschke, Norman and Ruas, Terry and Schubotz, Moritz and Gipp, Bela}, month = sep, year = {2021}, }
@inproceedings{SpindeSMG21, address = {Virtual Event}, title = {{TASSY} - {A} {Text} {Annotation} {Survey} {System}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/spinde2021c.pdf code=http://tassy.gipplab.org demo=http://tassy-demo.gipplab.org}, abstract = {We present a free and open-source tool for creating web-based surveys that include text annotation tasks. Existing tools offer either text annotation or survey functionality but not both. Combining the two input types is particularly relevant for investigating a reader's perception of a text which also depends on the reader's background, such as age, gender, and education. Our tool caters primarily to the needs of researchers in the Library and Information Sciences, the Social Sciences, and the Humanities who apply Content Analysis to investigate, e.g., media bias, political communication, or fake news.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Spinde, Timo and Sinha, Kanishka and Meuschke, Norman and Gipp, Bela}, month = sep, year = {2021}, keywords = {!nm\_author}, }
@inproceedings{BeckSSM21, address = {Virtual Event}, title = {Strategies to {Record}, {Annotate} and {Visualize} {Parallel} {Structures} in {XML} {Documents}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/beck2021.pdf code=https://github.com/ag-gipp/parallelXmlHighlighting}, abstract = {We present a four-phase parallel approach for capturing, annotating, and visualizing parallel structures in XML documents. We designed a highlighting strategy that first decomposes XML documents into various data streams, including plain text, formulae, and images. Second, those streams are processed with external algorithms and tools optimized for specific tasks, such as analyzing similarities or differences in the respective formats. Third, we compute comparison metadata such as annotations and highlighting marks. Fourth, the position information is concatenated based on the computed positions in the original XML document. Eventually, the resulting comparison can be visualized or processed further while keeping the reference to the source documents intact. While our algorithm has been developed for visualizing similarities as part of plagiarism detection tasks, we expect that many applications will benefit from a well-designed and integrative method that separates addressing the match locations from inserting highlight marks. For example, our algorithm can also add comments in XML-unaware plaintext editors. We also treat edge cases, such as overlaps and multi-matches, with our approach.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Beck, Marco and Schubotz, Moritz and Stange, Vincent and Meuschke, Norman and Gipp, Bela}, month = sep, year = {2021}, }
@inproceedings{BreitingerKMM20, address = {Virtual Event}, title = {Supporting the {Exploration} of {Semantic} {Features} in {Academic} {Literature} using {Graph}-based {Visualizations}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/breitinger2020.pdf demo=https://purl.org/recvis}, doi = {10.1145/3383583.3398599}, abstract = {Literature search and recommendation systems have traditionally focused on improving recommendation accuracy through new algorithmic approaches. Less research has focused on the crucial task of visualizing the retrieved results to the user. Today, the most common visualization for literature search and recommendation systems remains the ranked list. However, this format exhibits several shortcomings, especially for academic literature. We present an alternative visual interface for exploring the results of an academic literature retrieval system using a force-directed graph layout. The interactive information visualization techniques we describe allow for a higher resolution search and discovery space tailored to the unique feature-based similarity present among academic literature. RecVis—the visual interface we propose—supports academics in exploring the scientific literature beyond textual similarity alone, since it enables the rapid identification of other forms of similarity, including the similarity of citations, figures, and mathematical expressions.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Breitinger, Corinna and Kolcu, Birkan and Meuschke, Monique and Meuschke, Norman and Gipp, Bela}, month = aug, year = {2020}, note = {Venue Rating: CORE A*}, keywords = {Literature Recommendation}, }
@inproceedings{FoltynekVMD20, address = {Virtual Event}, title = {Cross-{Language} {Source} {Code} {Plagiarism} {Detection} using {Explicit} {Semantic} {Analysis} and {Scored} {Greedy} {String} {Tiling}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/foltynek2020a.pdf}, doi = {10.1145/3383583.3398594}, abstract = {We present a method for source code plagiarism detection that is independent of the programming language. Our method EsaGst combines Explicit Semantic Analysis and Greedy String Tiling. Using 25 cases of source code plagiarism in C++, Java, JavaScript, PHP, and Python, we show that EsaGst outperforms a baseline method in identifying plagiarism across programming languages.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Foltýnek, Tomáš and Vsiansky, Richard and Meuschke, Norman and Dlabolova, Dita and Gipp, Bela}, month = aug, year = {2020}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, }
@inproceedings{IhleSMG20, address = {Virtual Event}, title = {A {First} {Step} {Towards} {Content} {Protecting} {Plagiarism} {Detection}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/ihle2020.pdf code=https://github.com/ag-gipp/20CppdData}, doi = {10.1145/3383583.3398620}, abstract = {Plagiarism detection systems are essential tools for safeguarding academic and educational integrity. However, today’s systems require disclosing the full content of the input documents and the document collection to which the input documents are compared. Moreover, the systems are centralized and under the control of individual, typically commercial providers. This situation raises procedural and legal concerns regarding the confidentiality of sensitive data, which can limit or prohibit the use of plagiarism detection services. To eliminate these weaknesses of current systems, we seek to devise a plagiarism detection approach that does not require a centralized provider nor exposing any content as cleartext. This paper presents the initial results of our research. Specifically, we employ Private Set Intersection to devise a content-protecting variant of the citation-based similarity measure Bibliographic Coupling implemented in our plagiarism detection system HyPlag. Our evaluation shows that the content-protecting method achieves the same detection effectiveness as the original method while making common attacks to disclose the protected content practically infeasible. Our future work will extend this successful proof-of-concept by devising plagiarism detection methods that can analyze the entire content of documents without disclosing it as cleartext.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Ihle, Cornelius and Schubotz, Moritz and Meuschke, Norman and Gipp, Bela}, month = aug, year = {2020}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, }
@inproceedings{ScharpfSYH20, address = {Virtual Event}, title = {Classification and {Clustering} of {arXiv} {Documents}, {Sections}, and {Abstracts} {Comparing} {Encodings} of {Natural} and {Mathematical} {Language}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/scharpf2020.pdf code=https://purl.org/class_clust_arxiv_code}, doi = {10.1145/3383583.3398529}, abstract = {In this paper, we show how selecting and combining encodings of natural and mathematical language affect classification and clustering of documents with mathematical content. We demonstrate this by using sets of documents, sections, and abstracts from the arXiv preprint server that are labelled by their subject class (mathematics, computer science, physics, etc.) to compare different encodings of text and formulae and evaluate the performance and runtimes of selected classification and clustering algorithms. Our encodings achieve classification accuracies up to 82.8\% and cluster purities up to 69.4\% (number of clusters equals number of classes), and 99.9\% (unspecified number of clusters) respectively. We observe a relatively low correlation between text and math similarity, which indicates the independence of text and formulae and motivates treating them as separate features of a document. The classification and clustering can be employed, e.g., for document search and recommendation. Furthermore, we show that the computer outperforms a human expert when classifying documents. Finally, we evaluate and discuss multi-label classification and formula semantification.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Scharpf, Philipp and Schubotz, Moritz and Youssef, Abdou and Hamborg, Felix and Meuschke, Norman and Gipp, Bela}, month = aug, year = {2020}, note = {Venue Rating: CORE A*}, keywords = {Math Information Retrieval}, }
@inproceedings{SchubotzGMT20, address = {Virtual Event}, title = {Mathematical {Formulae} in {Wikimedia} {Projects} 2020}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schubotz2020.pdf}, doi = {10.1145/3383583.3398557}, abstract = {This poster summarizes our contributions to Wikimedia's processing pipeline for mathematical formulae. We describe how we have supported the transition from rendering formulae as coarse-grained PNG images in 2001 to providing modern semantically enriched language-independent MathML formulae in 2020. Additionally, we describe our plans to improve the accessibility and discoverability of mathematical knowledge in Wikimedia projects further.}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Schubotz, Moritz and Greiner-Petter, André and Meuschke, Norman and Teschke, Olaf and Gipp, Bela}, month = aug, year = {2020}, note = {Venue Rating: CORE A*}, keywords = {Math Information Retrieval}, }
@incollection{FoltynekRSM20, address = {Cham}, title = {Detecting {Machine}-{Obfuscated} {Plagiarism}}, volume = {12051 LNCS}, isbn = {978-3-030-43686-5 978-3-030-43687-2}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/foltynek2020.pdf data=https://doi.org/10.7302/bewj-qx93 demo=https://purl.org/spindetector}, abstract = {Research on academic integrity has identified online paraphrasing tools as a severe threat to the effectiveness of plagiarism detection systems. To enable the automated identification of machine-paraphrased text, we make three contributions. First, we evaluate the effectiveness of six prominent word embedding models in combination with five classifiers for distinguishing human-written from machine-paraphrased text. The best performing classification approach achieves an accuracy of 99.0\% for documents and 83.4\% for paragraphs. Second, we show that the best approach outperforms human experts and established plagiarism detection systems for these classification tasks. Third, we provide a Web application that uses the best performing classification approach to indicate whether a text underwent machine-paraphrasing. The data and code of our study are openly available.}, booktitle = {Sustainable {Digital} {Communities}}, publisher = {Springer International Publishing}, author = {Foltýnek, Tomáš and Ruas, Terry and Scharpf, Philipp and Meuschke, Norman and Schubotz, Moritz and Grosky, William and Gipp, Bela}, editor = {Sundqvist, Anneli and Berget, Gerd and Nolin, Jan and Skjerdingstad, Kjell Ivar}, month = mar, year = {2020}, doi = {10.1007/978-3-030-43687-2_68}, keywords = {Plagiarism Detection}, pages = {816--827}, }
@inproceedings{SchubotzTSM19, address = {Czech Republic}, title = {Forms of {Plagiarism} in {Digital} {Mathematical} {Libraries}}, volume = {11617 LNCS}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schubotz2019.pdf}, doi = {10.1007/978-3-030-23250-4_18}, abstract = {We report on an exploratory analysis of the forms of plagiarism observable in mathematical publications, which we identified by investigating editorial notes from zbMATH. While most cases we encountered were simple copies of earlier work, we also identified several forms of disguised plagiarism. We investigated 11 cases in detail and evaluate how current plagiarism detection systems perform in identifying these cases. Moreover, we describe the steps required to discover these and potentially undiscovered cases in the future.}, booktitle = {Proceedings {International} {Conference} on {Intelligent} {Computer} {Mathematics}}, author = {Schubotz, Moritz and Teschke, Olaf and Stange, Vincent and Meuschke, Norman and Gipp, Bela}, month = jul, year = {2019}, keywords = {Plagiarism Detection}, pages = {258--274}, }
@inproceedings{MeuschkeSSK19, address = {Urbana-Champaign, Illinois, USA}, title = {Improving {Academic} {Plagiarism} {Detection} for {STEM} {Documents} by {Analyzing} {Mathematical} {Content} and {Citations}}, isbn = {978-1-72811-547-4}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2019.pdf code/data=https://purl.org/hybridPD}, doi = {10.1109/JCDL.2019.00026}, abstract = {Identifying academic plagiarism is a pressing task for educational and research institutions, publishers, and funding agencies. Current plagiarism detection systems reliably find instances of copied and moderately reworded text. However, reliably detecting concealed plagiarism, such as strong paraphrases, translations, and the reuse of nontextual content and ideas is an open research problem. In this paper, we extend our prior research on analyzing mathematical content and academic citations. Both are promising approaches for improving the detection of concealed academic plagiarism primarily in Science, Technology, Engineering and Mathematics (STEM). We make the following contributions: i) We present a two-stage detection process that combines similarity assessments of mathematical content, academic citations, and text. ii) We introduce new similarity measures that consider the order of mathematical features and outperform the measures in our prior research. iii) We compare the effectiveness of the math-based, citation-based, and text-based detection approaches using confirmed cases of academic plagiarism. iv) We demonstrate that the combined analysis of math-based and citation-based content features allows identifying potentially suspicious cases in a collection of 102K STEM documents. Overall, we show that analyzing the similarity of mathematical content and academic citations is a striking supplement for conventional text-based detection approaches for academic literature in the STEM disciplines. The data and code of our study are openly available at https://purl.org/hybridPD}, booktitle = {Proceedings of the {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Meuschke, Norman and Stange, Vincent and Schubotz, Moritz and Kramer, Michael and Gipp, Bela}, month = jun, year = {2019}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {120--129}, }
@article{FoltynekMG19, title = {Academic {Plagiarism} {Detection}: {A} {Systematic} {Literature} {Review}}, volume = {52}, issn = {0360-0300}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/foltynek2019.pdf}, doi = {10.1145/3345317}, abstract = {This article summarizes the research on computational methods to detect academic plagiarism by systematically reviewing 239 research papers published between 2013 and 2018. To structure the presentation of the research contributions, we propose novel technically oriented typologies for plagiarism prevention and detection efforts, the forms of academic plagiarism, and computational plagiarism detection methods. We show that academic plagiarism detection is a highly active research field. Over the period we review, the field has seen major advances regarding the automated detection of strongly obfuscated and thus hard-to-identify forms of academic plagiarism. These improvements mainly originate from better semantic text analysis methods, the investigation of non-textual content features, and the application of machine learning. We identify a research gap in the lack of methodologically thorough performance evaluations of plagiarism detection systems. Concluding from our analysis, we see the integration of heterogeneous analysis methods for textual and non-textual content features using machine learning as the most promising area for future research contributions to improve the detection of academic plagiarism further.}, number = {6}, journal = {ACM Computing Surveys}, author = {Foltýnek, Tomáš and Meuschke, Norman and Gipp, Bela}, month = oct, year = {2019}, note = {Venue Rating: SJR Q1}, keywords = {Plagiarism Detection}, pages = {112:1--112:42}, }
@inproceedings{MeuschkeGSB18, address = {Fort Worth, USA}, title = {An {Adaptive} {Image}-{Based} {Plagiarism} {Detection} {Approach}}, isbn = {978-1-4503-5178-2}, url = {paper=https://gipp.com/wp-content/papercite-data/pdf/meuschke2018.pdf code=https://purl.org/imagepd}, doi = {10.1145/3197026.3197042}, abstract = {Identifying plagiarized content is a crucial task for educational and research institutions, funding agencies, and academic publishers. Plagiarism detection systems available for productive use reliably identify copied text, or near-copies of text, but often fail to detect disguised forms of academic plagiarism, such as paraphrases, translations, and idea plagiarism. To improve the detection capabilities for disguised forms of academic plagiarism, we analyze the images in academic documents as text-independent features. We propose an adaptive, scalable, and extensible image-based plagiarism detection approach suitable for analyzing a wide range of image similarities that we observed in academic documents. The proposed detection approach integrates established image analysis methods, such as perceptual hashing, with newly developed similarity assessments for images, such as ratio hashing and position-aware OCR text matching. We evaluate our approach using 15 image pairs that are representative of the spectrum of image similarity we observed in alleged and confirmed cases of academic plagiarism. We embed the test cases in a collection of 4,500 related images from academic texts. Our detection approach achieved a recall of 0.73 and a precision of 1. These results indicate that our image-based approach can complement other content-based feature analysis approaches to retrieve potential source documents for suspiciously similar content from large collections. We provide our code as open source to facilitate future research on image-based plagiarism detection.}, booktitle = {Proceedings of the 18th {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Meuschke, Norman and Gondek, Christopher and Seebacher, Daniel and Breitinger, Corinna and Keim, Daniel and Gipp, Bela}, month = jun, year = {2018}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {131--140}, }
@article{HamborgMG18, title = {Bias-aware {News} {Analysis} {Using} {Matrix}-based {News} {Aggregation}}, issn = {1432-5012, 1432-1300}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/hamborg2018b.pdf code=https://github.com/fhamborg/NewsBirdServer}, doi = {10.1007/s00799-018-0239-9}, abstract = {Media bias describes differences in the content or presentation of news. It is a ubiquitous phenomenon in news coverage that can have severely negative effects on individuals and society. Identifying media bias is a challenging problem, for which current information systems offer little support. News aggregators are the most important class of systems to support users in coping with the large amount of news that is published nowadays. These systems focus on identifying and presenting important, common information in news articles, but do not reveal different perspectives on the same topic. Due to this analysis approach, current news aggregators cannot effectively reveal media bias. To address this problem, we present matrix-based news aggregation, a novel approach for news exploration that helps users gain a broad and diverse news understanding by presenting various perspectives on the same news topic. Additionally, we present NewsBird, an open-source news aggregator that implements matrix-based news aggregation for international news topics. The results of a user study showed that NewsBird more effectively broadens the user's news understanding than the list-based visualization approach employed by established news aggregators, while achieving comparable effectiveness and efficiency for the two main use cases of news consumption: getting an overview of and finding details on current news topics.}, journal = {International Journal on Digital Libraries (IJDL)}, author = {Hamborg, Felix and Meuschke, Norman and Gipp, Bela}, month = may, year = {2018}, note = {Venue Rating: SJR Q1}, keywords = {News Analysis}, }
@inproceedings{MeuschkeSSG18, address = {Ann Arbor, MI, USA}, title = {{HyPlag}: {A} {Hybrid} {Approach} to {Academic} {Plagiarism} {Detection}}, isbn = {978-1-4503-5657-2}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2018a.pdf demo=https://dke.uni-wuppertal.de/en/projects/hyplag}, doi = {10.1145/3209978.3210177}, abstract = {Current plagiarism detection systems reliably find instances of copied and moderately altered text, but often fail to detect strong paraphrases, translations, and the reuse of non-textual content and ideas. To improve upon the detection capabilities for such concealed content reuse in academic publications, we make four contributions: i) We present the first plagiarism detection approach that combines the analysis of mathematical expressions, images, citations and text. ii) We describe the implementation of this hybrid detection approach in the research prototype HyPlag. iii) We present novel visualization and interaction concepts to aid users in reviewing content similarities identified by the hybrid detection approach. iv) We demonstrate the usefulness of the hybrid detection and result visualization approaches by using HyPlag to analyze a confirmed case of content reuse present in a retracted research publication.}, booktitle = {Proceedings of the 41st {International} {ACM} {SIGIR} {Conference} on {Research} \& {Development} in {Information} {Retrieval}}, author = {Meuschke, Norman and Stange, Vincent and Schubotz, Moritz and Gipp, Bela}, month = jun, year = {2018}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {1321--1324}, }
@inproceedings{SchubotzGSM18, address = {Fort Worth, USA}, title = {Improving the {Representation} and {Conversion} of {Mathematical} {Formulae} by {Considering} their {Textual} {Context}}, isbn = {978-1-4503-5178-2}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schubotz2018.pdf data/demo=https://mathmlben.wmflabs.org}, doi = {10.1145/3197026.3197058}, abstract = {Mathematical formulae represent complex semantic information in a concise form. Especially in Science, Technology, Engineering, and Mathematics, mathematical formulae are crucial to communicate information, e.g., in scientific papers, and to perform computations using computer algebra systems. Enabling computers to access the information encoded in mathematical formulae requires machine-readable formats that can represent both the presentation and content, i.e., the semantics, of formulae. Exchanging such information between systems additionally requires conversion methods for mathematical representation formats. We analyze how the semantic enrichment of formulae improves the format conversion process and show that considering the textual context of formulae reduces the error rate of such conversions. Our main contributions are: (1) providing an openly available benchmark dataset for the mathematical format conversion task consisting of a newly created test collection, an extensive, manually curated gold standard and task-specific evaluation metrics; (2) performing a quantitative evaluation of state-of-the-art tools for mathematical format conversions; (3) presenting a new approach that considers the textual context of formulae to reduce the error rate for mathematical format conversions. Our benchmark dataset facilitates future research on mathematical format conversions as well as research on many problems in mathematical information retrieval. Because we annotated and linked all components of formulae, e.g., identifiers, operators and other entities, to Wikidata entries, the gold standard can, for instance, be used to train methods for formula concept discovery and recognition. Such methods can then be applied to improve mathematical information retrieval systems, e.g., for semantic formula search, recommendation of mathematical content, or detection of mathematical plagiarism.}, booktitle = {Proceedings of the 18th {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {ACM}, author = {Schubotz, Moritz and Greiner-Petter, André and Scharpf, Philipp and Meuschke, Norman and Cohl, Howard S. and Gipp, Bela}, month = jun, year = {2018}, note = {Venue Rating: CORE A*}, keywords = {Math Information Retrieval}, pages = {233--242}, }
@inproceedings{MeuschkeSHS17, address = {Singapore}, title = {Analyzing {Mathematical} {Content} to {Detect} {Academic} {Plagiarism}}, isbn = {978-1-4503-4918-5}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2017b.pdf code/data=https://purl.org/mathpd}, doi = {10.1145/3132847.3133144}, abstract = {This paper presents, to our knowledge, the first study on analyzing mathematical expressions to detect academic plagiarism. We make the following contributions. First, we investigate confirmed cases of plagiarism to categorize the similarities of mathematical content commonly found in plagiarized publications. From this investigation, we derive possible feature selection and feature comparison strategies for developing math-based detection approaches and a ground truth for our experiments. Second, we create a test collection by embedding confirmed cases of plagiarism into the NTCIR-11 MathIR Task dataset, which contains approx. 60 million mathematical expressions in 105,120 documents from arXiv.org. Third, we develop a first math-based detection approach by implementing and evaluating different feature comparison approaches using an open source parallel data processing pipeline built using the Apache Flink framework. The best performing approach identifies all but two of our real-world test cases at the top rank and achieves a mean reciprocal rank of 0.86. The results show that mathematical expressions are promising text-independent features to identify academic plagiarism in large collections. To facilitate future research on math-based plagiarism detection, we make our source code and data available.}, booktitle = {Proceedings of the {ACM} {Conference} on {Information} and {Knowledge} {Management} ({CIKM})}, publisher = {ACM}, author = {Meuschke, Norman and Schubotz, Moritz and Hamborg, Felix and Skopal, Tomas and Gipp, Bela}, month = nov, year = {2017}, note = {Venue Rating: CORE A}, keywords = {Plagiarism Detection}, pages = {2211--2214}, }
@inproceedings{HamborgMBG17, address = {Berlin}, title = {news-please: {A} {Generic} {News} {Crawler} and {Extractor}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/hamborg2017.pdf code=https://github.com/fhamborg/news-please}, doi = {10.18452/1447}, abstract = {The amount of news published and read online has increased tremendously in recent years, making news data an interesting resource for many research disciplines, such as the social sciences and linguistics. However, large scale collection of news data is cumbersome due to a lack of generic tools for crawling and extracting such data. We present news-please, a generic, multi-language, open-source crawler and extractor for news that works out-of-the-box for a large variety of news websites. Our system allows crawling arbitrary news websites and extracting the major elements of news articles on those websites, i.e., title, lead paragraph, main content, publication date, author, and main image. Compared to existing tools, news-please features full website extraction requiring only the root URL.}, booktitle = {Proceedings of the 15th {International} {Symposium} of {Information} {Science}}, author = {Hamborg, Felix and Meuschke, Norman and Breitinger, Corinna and Gipp, Bela}, editor = {Gaede, Maria and Trkulja, Violeta and Petras, Vivien}, month = mar, year = {2017}, keywords = {News Analysis}, pages = {218--223}, }
@incollection{SchubotzMHC17, series = {Lecture {Notes} in {Computer} {Science}}, title = {{VMEXT}: {A} {Visualization} {Tool} for {Mathematical} {Expression} {Trees}}, volume = {10383 LNCS}, isbn = {978-3-319-62074-9}, shorttitle = {{VMEXT}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schubotz2017a.pdf code=https://github.com/ag-gipp/vmext}, abstract = {Mathematical expressions can be represented as a tree consisting of terminal symbols, such as identifiers or numbers (leaf nodes), and functions or operators (non-leaf nodes). Expression trees are an important mechanism for storing and processing mathematical expressions as well as the most frequently used visualization of the structure of mathematical expressions. Typically, researchers and practitioners manually visualize expression trees using general-purpose tools. This approach is laborious, redundant, and error-prone. Manual visualizations represent a user's notion of what the markup of an expression should be, but not necessarily what the actual markup is. This paper presents VMEXT – a free and open source tool to directly visualize expression trees from parallel MathML. VMEXT simultaneously visualizes the presentation elements and the semantic structure of mathematical expressions to enable users to quickly spot deficiencies in the Content MathML markup that do not affect the presentation of the expression. Identifying such discrepancies previously required reading the verbose and complex MathML markup. VMEXT also allows one to visualize similar and identical elements of two expressions. Visualizing expression similarity can support developers in designing retrieval approaches and enable improved interaction concepts for users of mathematical information retrieval systems. We demonstrate VMEXT's visualizations in two web-based applications. The first application presents the visualizations alone. The second application shows a possible integration of the visualizations in systems for mathematical knowledge management and mathematical information retrieval. The application converts LaTeX input to parallel MathML, computes basic similarity measures for mathematical expressions, and visualizes the results using VMEXT.}, booktitle = {Intelligent {Computer} {Mathematics}}, publisher = {Springer}, author = {Schubotz, Moritz and Meuschke, Norman and Hepp, Thomas and Cohl, Howard S. and Gipp, Bela}, editor = {Geuvers, Herman and England, Matthew and Hasan, Osman and Rabe, Florian and Teschke, Olaf}, month = jul, year = {2017}, doi = {10.1007/978-3-319-62075-6_24}, keywords = {Math Information Retrieval}, pages = {340--355}, }
@inproceedings{SchwarzerBSM17, title = {Citolytics: {A} {Link}-based {Recommender} {System} for {Wikipedia}}, isbn = {978-1-4503-4652-8}, shorttitle = {Citolytics}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schwarzer2017.pdf code=https://github.com/wikimedia/citolytics}, doi = {10.1145/3109859.3109981}, abstract = {We present Citolytics - a novel link-based recommendation system for Wikipedia articles. In a preliminary study, Citolytics achieved promising results compared to the widely used text-based approach of Apache Lucene's MoreLikeThis (MLT). In this demo paper, we describe how we plan to integrate Citolytics into the Wikipedia infrastructure by using Elasticsearch and Apache Flink to serve recommendations for Wikipedia articles. Additionally, we propose a large-scale online evaluation design using the Wikipedia Android app. Working with Wikipedia data has several unique advantages. First, the availability of a very large user sample contributes to statistically significant results. Second, the openness of Wikipedia's architecture allows making our source code and evaluation data public, thus benefiting other researchers. If link-based recommendations show promise in our online evaluation, a deployment of the presented system within Wikipedia would have a far-reaching impact on Wikipedia's more than 30 million users.}, booktitle = {Proceedings of the 11th {ACM} {Conference} on {Recommender} {Systems} ({RecSys})}, publisher = {ACM}, author = {Schwarzer, Malte and Breitinger, Corinna and Schubotz, Moritz and Meuschke, Norman and Gipp, Bela}, month = aug, year = {2017}, note = {Venue Rating: CORE B}, keywords = {Literature Recommendation}, pages = {360--361}, }
@inproceedings{DahmSMG17, title = {A {Vision} for {Performing} {Social} and {Economic} {Data} {Analysis} using {Wikipedia}'s {Edit} {History}}, isbn = {978-1-4503-4914-7}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/dahm2017.pdf}, doi = {10.1145/3041021.3053363}, abstract = {In this vision paper, we suggest combining two lines of research to study the collective behavior of Wikipedia contributors. The first line of research analyzes Wikipedia's edit history to quantify the quality of individual contributions and the resulting reputation of the contributor. The second line of research surveys Wikipedia contributors to gain insights, e.g., on their personal and professional background, socioeconomic status, or motives to contribute to Wikipedia. While both lines of research are valuable on their own, we argue that the combination of both approaches could yield insights that exceed the sum of the individual parts. Linking survey data to contributor reputation and content-based quality metrics could provide a large-scale, public domain data set to perform user modeling, i.e., deducing interest profiles of user groups. User profiles can, among other applications, help to improve recommender systems. The resulting dataset can also enable a better understanding and improved prediction of high quality Wikipedia content and successful Wikipedia contributors. Furthermore, the dataset can enable novel research approaches to investigate team composition and collective behavior as well as help to identify domain experts and young talents. We report on the status of implementing our large-scale, content-based analysis of the Wikipedia edit history using the big data processing framework Apache Flink. Additionally, we describe our plans to conduct a survey among Wikipedia contributors to enhance the content-based quality metrics.}, booktitle = {Proceedings of the 26th {International} {Conference} on {World} {Wide} {Web} {Companion}}, publisher = {ACM}, author = {Dahm, Erik and Schubotz, Moritz and Meuschke, Norman and Gipp, Bela}, month = apr, year = {2017}, note = {Venue Rating: CORE A*}, keywords = {Miscellaneous}, pages = {1627--1634}, }
@inproceedings{GippBMB17, address = {Toronto, Canada}, title = {{CryptSubmit}: {Introducing} {Securely} {Timestamped} {Manuscript} {Submission} and {Peer} {Review} {Feedback} {Using} the {Blockchain}}, isbn = {978-1-5386-3861-3}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/gipp2017b.pdf}, doi = {10.1109/jcdl.2017.7991588}, abstract = {Manuscript submission systems are a central fixture in scholarly publishing. However, with existing systems, researchers must trust that their yet unpublished findings will not prematurely be disseminated due to technical weaknesses and that anonymous peer reviewers or committee members will not plagiarize unpublished content. To address this limitation, we present CryptSubmit - a system that automatically creates a decentralized, tamperproof, and publicly verifiable timestamp for each submitted manuscript by utilizing the blockchain of the cryptocurrency Bitcoin. The publicly accessible and tamperproof infrastructure of the blockchain allows researchers to independently verify the validity of the timestamp associated with their manuscript at the time of submission to a conference or journal. Our system supports researchers in protecting their intellectual property even in the face of vulnerable submission platforms or dishonest peer reviewers. Optionally, the system also generates trusted timestamps for the feedback shared by peer reviewers to increase the traceability of ideas. CryptSubmit integrates these features into the open source conference management system OJS. In the future, the method could be integrated at nearly no overhead cost into other manuscript submission systems, such as EasyChair, ConfTool, or Ambra. The introduced method can also improve electronic pre-print services and storage systems for research data.}, booktitle = {Proceedings of the 17th {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Gipp, Bela and Breitinger, Corinna and Meuschke, Norman and Beel, Joeran}, month = jun, year = {2017}, note = {Venue Rating: CORE A*}, keywords = {Blockchain}, pages = {1--4}, }
@inproceedings{HamborgMG17, title = {Matrix-{Based} {News} {Aggregation}: {Exploring} {Different} {News} {Perspectives}}, isbn = {978-1-5386-3861-3}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/hamborg2017b.pdf}, doi = {10.1109/jcdl.2017.7991561}, abstract = {News aggregators capably handle the large amount of news that is published nowadays. However, these systems focus on the presentation of important, common information in news, but do not reveal different perspectives on the same topic. Thus, current news aggregators suffer from media bias, i.e. differences in the content or presentation of news. Finding such differences is crucial to reduce the effects of media bias. This paper presents matrix-based news analysis (MNA), a novel design for news exploration. MNA helps users gain a broad and diverse news understanding by presenting various news perspectives on the same topic. Furthermore, we present NewsBird, a news aggregator that implements MNA to find different perspectives on international news topics. The results of a case study demonstrate that NewsBird broadens the user's news understanding while it also provides similar news aggregation functionalities as established systems.}, booktitle = {Proceedings of the 17th {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Hamborg, Felix and Meuschke, Norman and Gipp, Bela}, month = jun, year = {2017}, note = {Venue Rating: CORE A*}, keywords = {News Analysis}, pages = {1--10}, }
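The core idea of matrix-based news analysis, grouping reports on one topic along a second dimension such as the publishing country, can be sketched in a few lines of Python; the grouping key and the toy data are our illustrative assumptions, not the paper's implementation.

from collections import defaultdict

# Toy articles: (publisher_country, topic, headline).
articles = [
    ("US", "trade",   "Tariffs announced"),
    ("CN", "trade",   "Ministry responds to tariffs"),
    ("US", "climate", "New emissions rules proposed"),
]

# One matrix cell per (country, topic) pair, so different national
# perspectives on the same topic end up side by side.
matrix = defaultdict(list)
for country, topic, headline in articles:
    matrix[(country, topic)].append(headline)

for (country, topic), cell in sorted(matrix.items()):
    print(f"{country} x {topic}: {cell}")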
@inproceedings{MeuschkeSSG17, address = {Toronto, Canada}, title = {Analyzing {Semantic} {Concept} {Patterns} to {Detect} {Academic} {Plagiarism}}, isbn = {978-1-4503-5388-5}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/meuschke2017a.pdf}, doi = {10.1145/3127526.3127535}, abstract = {Detecting academic plagiarism is a pressing problem, e.g., for educational and research institutions, funding agencies, and academic publishers. Existing plagiarism detection systems reliably identify copied text, or near copies of text, but often fail to detect disguised forms of academic plagiarism, such as paraphrases, translations, and idea plagiarism. We present Semantic Concept Pattern Analysis - an approach that performs an integrated analysis of semantic text relatedness and structural text similarity. Using 25 officially retracted academic plagiarism cases, we demonstrate that our approach can detect plagiarism that established text matching approaches would not identify. We view our approach as a promising addition to improve the detection capabilities for strong paraphrases. We plan to further improve Semantic Concept Pattern Analysis and include the approach as part of an integrated detection process that analyzes heterogeneous similarity features to better identify the many possible forms of plagiarism in academic documents.}, booktitle = {Proceedings of the {International} {Workshop} on {Mining} {Scientific} {Publications} ({WOSP}) co-located with the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {IEEE Computer Society}, author = {Meuschke, Norman and Siebeck, Nicolas and Schubotz, Moritz and Gipp, Bela}, month = jun, year = {2017}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {46--53}, }
@inproceedings{HamborgMAG17, address = {Berlin}, title = {Identification and {Analysis} of {Media} {Bias} in {News} {Articles}}, url = {https://ag-gipp.github.io/bib/preprints/hamborg2017a.pdf}, doi = {10.18452/1446}, abstract = {Depending on the news source, a reader can be exposed to a different narrative and conflicting perceptions for the same event. Today, news aggregators help users cope with the large volume of news published daily. However, aggregators focus on presenting shared information, but do not expose the different perspectives from articles on the same topics. Thus, users of such aggregators suffer from media bias, which is often implemented intentionally to influence public opinion. In this paper, we present NewsBird, an aggregator that presents shared and different information on topics. Currently, NewsBird reveals different perspectives on international news. Our system has led to insights about media bias and news analysis, which we use to propose approaches to be investigated in future research. Our vision is to provide a system that reveals media bias, and thus ultimately allows users to make their own judgement on the potential bias inherent in news.}, booktitle = {Proceedings of the 15th {International} {Symposium} of {Information} {Science}}, author = {Hamborg, Felix and Meuschke, Norman and Aizawa, Akiko and Gipp, Bela}, editor = {Gaede, Maria and Trkulja, Violeta and Petra, Vivien}, month = mar, year = {2017}, keywords = {News Analysis}, pages = {224--236}, }
@incollection{SchubotzKMH17, address = {Cham}, series = {Lecture {Notes} in {Computer} {Science}}, title = {Evaluating and {Improving} the {Extraction} of {Mathematical} {Identifier} {Definitions}}, volume = {10456 LNCS}, isbn = {978-3-319-65812-4 978-3-319-65813-1}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/schubotz2017.pdf}, abstract = {Mathematical formulae in academic texts significantly contribute to the overall semantic content of such texts, especially in the fields of Science, Technology, Engineering and Mathematics. Knowing the definitions of the identifiers in mathematical formulae is essential to understand the semantics of the formulae. Similar to the sense-making process of human readers, mathematical information retrieval systems can analyze the text that surrounds formulae to extract the definitions of identifiers occurring in the formulae. Several approaches for extracting the definitions of mathematical identifiers from documents have been proposed in recent years. So far, these approaches have been evaluated using different collections and gold standard datasets, which prevented comparative performance assessments. To facilitate future research on the task of identifier definition extraction, we make three contributions. First, we provide an automated evaluation framework, which uses the dataset and gold standard of the NTCIR-11 Math Retrieval Wikipedia task. Second, we compare existing identifier extraction approaches using the developed evaluation framework. Third, we present a new identifier extraction approach that uses machine learning to combine the well-performing features of previous approaches. The new approach increases the precision of extracting identifier definitions from 17.85\% to 48.60\%, and increases the recall from 22.58\% to 28.06\%. The evaluation framework, the dataset and our source code are openly available at: https://ident.formulasearchengine.com.}, booktitle = {Experimental {IR} {Meets} {Multilinguality}, {Multimodality}, and {Interaction}}, publisher = {Springer International Publishing}, author = {Schubotz, Moritz and Krämer, Leonard and Meuschke, Norman and Hamborg, Felix and Gipp, Bela}, editor = {Jones, Gareth J.F. and Lawless, Séamus and Gonzalo, Julio and Kelly, Liadh and Goeuriot, Lorraine and Mandl, Thomas and Cappellato, Linda and Ferro, Nicola}, month = aug, year = {2017}, doi = {10.1007/978-3-319-65813-1_7}, keywords = {Math Information Retrieval}, pages = {82--94}, }
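As a rough illustration of the identifier-definition extraction task (not the paper's feature-based machine-learning approach), the sketch below ranks words near an identifier by token distance, since the defining term, e.g., "energy" for E, tends to appear close to the identifier; the tokenization and the candidate filter are deliberately naive.

def definition_candidates(sentence: str, identifier: str):
    """Rank candidate definition words by distance to the identifier;
    a crude stand-in for the paper's ML-based ranking of features."""
    tokens = sentence.split()
    positions = [i for i, tok in enumerate(tokens) if tok == identifier]
    candidates = [(i, tok) for i, tok in enumerate(tokens)
                  if tok.isalpha() and tok.islower() and len(tok) > 3]
    ranked = sorted((abs(i - p), tok)
                    for p in positions for i, tok in candidates)
    return [tok for _, tok in ranked]

sentence = "The energy E of a particle with mass m is given by E = m c^2"
print(definition_candidates(sentence, "E")[0])  # 'energy'
print(definition_candidates(sentence, "m")[0])  # 'mass'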
@inproceedings{MeschenmoserMHG16, address = {Newark, New Jersey, USA}, title = {Scraping {Scientific} {Web} {Repositories}: {Challenges} and {Solutions} for {Automated} {Content} {Extraction}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meschenmoser2016a.pdf code=https://github.com/ag-gipp/grespa}, doi = {10.1045/september2016-meschenmoser}, abstract = {Aside from improving the visibility and accessibility of scientific publications, many scientific Web repositories also assess researchers' quantitative and qualitative publication performance, e.g., by displaying metrics such as the h-index. These metrics have become important for research institutions and other stakeholders to support impactful decision making processes such as hiring or funding decisions. However, scientific Web repositories typically offer only simple performance metrics and limited analysis options. Moreover, the data and algorithms to compute performance metrics are usually not published. Hence, it is not transparent or verifiable which publications the systems include in the computation and how the systems rank the results. Many researchers are interested in accessing the underlying scientometric raw data to increase the transparency of these systems. In this paper, we discuss the challenges and present strategies to programmatically access such data in scientific Web repositories. We demonstrate the strategies as part of an open source tool (MIT license) that allows research performance comparisons based on Google Scholar data. We would like to emphasize that the scraper included in the tool should only be used if consent was given by the operator of a repository. In our experience, consent is often given if the research goals are clearly explained and the project is of a non-commercial nature.}, booktitle = {Proceedings of the 5th {International} {Workshop} on {Mining} {Scientific} {Publications} ({WOSP}) held in conjunction with the 16th {ACM}/{IEEE}-{CS} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Meschenmoser, Philipp and Meuschke, Norman and Hotz, Manuel and Gipp, Bela}, year = {2016}, note = {Venue Rating: CORE A*}, keywords = {Miscellaneous}, }
@inproceedings{SchwarzerSMB16, address = {Newark, New Jersey, USA}, title = {Evaluating {Link}-based {Recommendations} for {Wikipedia}}, isbn = {978-1-4503-4229-2}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/schwarzer2016.pdf code/data=https://github.com/wikimedia/citolytics}, doi = {10.1145/2910896.2910908}, abstract = {Literature recommender systems support users in filtering the vast and increasing number of documents in digital libraries and on the Web. For academic literature, research has proven the ability of citation-based document similarity measures, such as Co-Citation (CoCit), or Co-Citation Proximity Analysis (CPA) to improve recommendation quality. In this paper, we report on the first large-scale investigation of the performance of the CPA approach in generating literature recommendations for Wikipedia, which is fundamentally different from the academic literature domain. We analyze links instead of citations to generate article recommendations. We evaluate CPA, CoCit, and the Apache Lucene MoreLikeThis (MLT) function, which represents a traditional text-based similarity measure. We use two datasets of 779,716 and 2.57 million Wikipedia articles, the Big Data processing framework Apache Flink, and a ten-node computing cluster. To enable our large-scale evaluation, we derive two quasi-gold standards from the links in Wikipedia's "See also" sections and a comprehensive Wikipedia clickstream dataset. Our results show that the citation-based measures CPA and CoCit have complementary strengths compared to the text-based MLT measure. While MLT performs well in identifying narrowly similar articles that share similar words and structure, the citation-based measures are better able to identify topically related information, such as information on the city of a certain university or other technical universities in the region. The CPA approach, which consistently outperformed CoCit, is better suited for identifying a broader spectrum of related articles, as well as popular articles that typically exhibit a higher quality. Additional benefits of the CPA approach are its lower runtime requirements and its language-independence that allows for a cross-language retrieval of articles. We present a manual analysis of exemplary articles to demonstrate and discuss our findings. The raw data and source code of our study, together with a manual on how to use them, are openly available at: https://github.com/wikimedia/citolytics}, booktitle = {Proceedings of the 16th {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {ACM}, author = {Schwarzer, Malte and Schubotz, Moritz and Meuschke, Norman and Breitinger, Corinna and Markl, Volker and Gipp, Bela}, month = jun, year = {2016}, note = {Venue Rating: CORE A*}, keywords = {Literature Recommendation}, pages = {191--200}, }
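A simplified Python sketch of the Co-Citation Proximity idea applied to Wikipedia links follows. Real CPA distinguishes more proximity levels (sentence, paragraph, section), and the weights below are illustrative assumptions, not the ones used in the study.

from collections import defaultdict
from itertools import combinations

# Each citing article is modeled as a list of paragraphs; each
# paragraph is the set of articles it links to.
citing_articles = [
    [{"A", "B"}, {"C"}],
    [{"A", "B", "C"}],
    [{"B"}, {"C"}],
]

# Co-links in the same paragraph score 1.0; co-links in the same
# article but different paragraphs score 0.5 (illustrative weights).
cpi = defaultdict(float)
for article in citing_articles:
    for paragraph in article:
        for a, b in combinations(sorted(paragraph), 2):
            cpi[(a, b)] += 1.0
    for p1, p2 in combinations(article, 2):
        for a in p1:
            for b in p2:
                if a != b:
                    cpi[tuple(sorted((a, b)))] += 0.5

print(dict(cpi))  # higher score = more closely related article pair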
@inproceedings{GippMBB16, title = {Using the {Blockchain} of {Cryptocurrencies} for {Timestamping} {Digital} {Cultural} {Heritage}}, volume = {13}, url = {https://zenodo.org/record/3547510/files/Gipp2017a_BC4DCH.pdf}, doi = {10.5281/zenodo.3547510}, abstract = {The proportion of information that is exclusively available online is continuously increasing. Unlike physical print media, online news outlets, magazines, or blogs are not immune to retrospective modification. Even significant editing of text in online news sources can easily go unnoticed. This poses a challenge to the preservation of digital cultural heritage. It is nearly impossible for regular readers to verify whether the textual content they encounter online has at one point been modified from its initial state, and at what time or to what extent the text was modified to its current version. In this paper, we propose a web-based platform that allows users to submit the URL for any web content they wish to track for changes. The system automatically creates a trusted timestamp stored in the blockchain of the cryptocurrency Bitcoin for the hash of the HTML content available at the user-specified URL. By using trusted timestamping to secure a ‘snapshot’ of online information as it existed at a specific time, any subsequent changes made to the content can be identified.}, booktitle = {Proceedings of the {Workshop} on {Web} {Archiving} and {Digital} {Libraries} ({WADL}) held in conjunction with the 16th {ACM}/{IEEE}-{CS} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Gipp, Bela and Meuschke, Norman and Beel, Joeran and Breitinger, Corinna}, year = {2016}, note = {Venue Rating: CORE A*}, keywords = {Blockchain}, pages = {1--3}, }
@inproceedings{SchubotzGLC16, address = {New York, NY, USA}, series = {{SIGIR} '16}, title = {Semantification of {Identifiers} in {Mathematics} for {Better} {Math} {Information} {Retrieval}}, isbn = {978-1-4503-4069-4}, shorttitle = {Semantification of {Identifiers} in {Mathematics} for {MIR}}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/schubotz16.pdf}, doi = {10.1145/2911451.2911503}, abstract = {Mathematical formulae are essential in science, but face challenges of ambiguity, due to the use of a small number of identifiers to represent an immense number of concepts. Corresponding to word sense disambiguation in Natural Language Processing, we disambiguate mathematical identifiers. By regarding formulae and natural text as one monolithic information source, we are able to extract the semantics of identifiers in a process we term Mathematical Language Processing (MLP). As scientific communities tend to establish standard (identifier) notations, we use the document domain to infer the actual meaning of an identifier. Therefore, we adapt the software development concept of namespaces to mathematical notation. Thus, we learn namespace definitions by clustering the MLP results and mapping those clusters to subject classification schemata. In addition, this gives fundamental insights into the usage of mathematical notations in science, technology, engineering and mathematics. Our gold standard based evaluation shows that MLP extracts relevant identifier-definitions. Moreover, we discover that identifier namespaces improve the performance of automated identifier-definition extraction, and elevate it to a level that cannot be achieved within the document context alone.}, booktitle = {Proceedings of the 39th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}}, publisher = {ACM}, author = {Schubotz, Moritz and Grigorev, Alexey and Leich, Marcus and Cohl, Howard S. and Meuschke, Norman and Gipp, Bela and Youssef, Abdou S. and Markl, Volker}, month = jul, year = {2016}, note = {Venue Rating: CORE A*}, keywords = {Math Information Retrieval}, pages = {135--144}, }
@inproceedings{SchubotzMLG16, title = {Exploring the {One}-brain {Barrier}: a {Manual} {Contribution} to the {NTCIR}-12 {Math} {Task}}, shorttitle = {Exploring the one-brain-barrier}, url = {https://zenodo.org/record/3547436/files/Schubotz2016a_OneBrainBarrier.pdf}, doi = {10.5281/zenodo.3547436}, abstract = {This paper compares the search capabilities of a single human brain supported by the text search built into Wikipedia with state-of-the-art math search systems. To achieve this, we compare results of manual Wikipedia searches with the aggregated and assessed results of all systems participating in the NTCIR-12 MathIR Wikipedia Task. For 26 of the 30 topics, the average relevance score of our manually retrieved results exceeded the average relevance score of other participants by more than one standard deviation. However, math search engines at large achieved better recall and retrieved highly relevant results that our ‘single-brain system’ missed for 12 topics. By categorizing the topics of NTCIR-12 into six types of queries, we observe a particular strength of math search engines to answer queries of the types ‘definition lookup’ and ‘application lookup’. However, we see the low precision of current math search engines as the main challenge that prevents their widespread adoption in STEM research. By combining our results with highly relevant results of all other participants, we compile a new gold standard dataset and a dataset of duplicate content items. We discuss how the two datasets can be used to improve the query formulation and content augmentation capabilities of math search engines in the future.}, booktitle = {Proceedings of the 12th {NTCIR} {Conference} on {Evaluation} of {Information} {Access} {Technologies}}, author = {Schubotz, Moritz and Meuschke, Norman and Leich, Marcus and Gipp, Bela}, month = jun, year = {2016}, keywords = {Math Information Retrieval}, }
@inproceedings{GippMG15iConfTT, address = {Newport Beach, California}, title = {Decentralized {Trusted} {Timestamping} using the {Crypto} {Currency} {Bitcoin}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/gipp15a.pdf demo=https://originstamp.org}, doi = {10.5281/zenodo.3547488}, abstract = {Trusted timestamping is a process for proving that certain information existed at a given point in time. This paper presents a trusted timestamping concept and its implementation in form of a web-based service that uses the decentralized Bitcoin block chain to store anonymous, tamper-proof timestamps for digital content. The service allows users to hash files, such as text, photos or videos, and store the created hashes in the Bitcoin block chain. Users can then retrieve and verify the timestamps that have been committed to the block chain. The non-commercial service enables anyone, e.g., researchers, authors, journalists, students, or artists, to prove that they were in possession of certain information at a given point in time. Common use cases include proving that a contract has been signed, a photo taken, a video recorded, or a task completed prior to a certain date. All procedures maintain complete privacy of the user’s data.}, booktitle = {Proceedings of the {iConference} 2015}, author = {Gipp, Bela and Meuschke, Norman and Gernandt, Andre}, month = mar, year = {2015}, keywords = {Blockchain}, }
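The first step of the described service, hashing the content locally so that only a digest ever leaves the user's machine, can be sketched as follows; the file name is a placeholder, and the subsequent step of committing the digest to the Bitcoin block chain (e.g., via an OP_RETURN output) is omitted from the sketch.

import hashlib

def file_digest(path: str) -> str:
    """SHA-256 digest of a file, streamed in chunks so large photos
    or videos do not need to fit into memory."""
    sha = hashlib.sha256()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(65536), b""):
            sha.update(chunk)
    return sha.hexdigest()

# Placeholder file name; any text, photo, or video works the same way.
print(file_digest("manuscript.pdf"))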
@inproceedings{GippML15, address = {Newport Beach, California}, title = {{CITREC}: {An} {Evaluation} {Framework} for {Citation}-{Based} {Similarity} {Measures} based on {TREC} {Genomics} and {PubMed} {Central}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/gipp15b.pdf code=https://doi.org/10.5281/zenodo.3598371 data=https://doi.org/10.5281/zenodo.3598421}, doi = {10.5281/zenodo.3547372}, abstract = {Citation-based similarity measures such as Bibliographic Coupling and Co-Citation are an integral component of many information retrieval systems. However, comparisons of the strengths and weaknesses of measures are challenging due to the lack of suitable test collections. This paper presents CITREC, an open evaluation framework for citation-based and text-based similarity measures. CITREC prepares the data from the PubMed Central Open Access Subset and the TREC Genomics collection for a citation-based analysis and provides tools necessary for performing evaluations of similarity measures. To account for different evaluation purposes, CITREC implements 35 citation-based and text-based similarity measures, and features two gold standards. The first gold standard uses the Medical Subject Headings (MeSH) thesaurus and the second uses the expert relevance feedback that is part of the TREC Genomics collection to gauge similarity. CITREC additionally offers a system that allows creating user-defined gold standards to adapt the evaluation framework to individual information needs and evaluation purposes.}, booktitle = {Proceedings of the {iConference}}, author = {Gipp, Bela and Meuschke, Norman and Lipinski, Mario}, month = mar, year = {2015}, keywords = {Literature Recommendation}, }
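Two of the classic citation-based measures such a framework must implement can be stated compactly in Python; the toy citation graph below is our own example, and a production implementation would of course run over the full PubMed Central and TREC Genomics data.

# Toy citation graph: document -> set of cited sources.
references = {
    "d1": {"a", "b", "c"},
    "d2": {"b", "c", "d"},
    "d3": {"a", "e"},
}

def bibliographic_coupling(x: str, y: str) -> int:
    """Number of sources that both documents cite."""
    return len(references[x] & references[y])

def co_citation(x: str, y: str) -> int:
    """Number of documents in the graph that cite both x and y."""
    return sum(1 for refs in references.values() if {x, y} <= refs)

print(bibliographic_coupling("d1", "d2"))  # 2 (shared sources b and c)
print(co_citation("b", "c"))               # 2 (cited together by d1, d2)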
@inproceedings{GippMBP14, address = {Lisbon, Portugal}, title = {Web-based {Demonstration} of {Semantic} {Similarity} {Detection} {Using} {Citation} {Pattern} {Visualization} for a {Cross} {Language} {Plagiarism} {Case}}, volume = {2}, isbn = {978-989-758-028-4}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/gipp14a.pdf demo=https://citeplag.org/compare/6861131?substpat_a=(%25)&substpat_b=(%25)}, doi = {10.5220/0004985406770683}, abstract = {In a previous paper, we showed that analyzing citation patterns in the well-known plagiarized thesis by K. T. zu Guttenberg clearly outperformed current detection methods in identifying cross-language plagiarism. However, the experiment was a proof of concept and we did not provide a prototype. This paper presents a fully functional, web-based visualization of citation patterns for this verified cross-language plagiarism case, allowing the user to interactively experience the benefits of citation pattern analysis for plagiarism detection. Using examples from the Guttenberg plagiarism case, we demonstrate that the citation pattern visualization reduces the required examiner effort to verify the extent of plagiarism.}, booktitle = {Proceedings {International} {Conference} on {Enterprise} {Information} {Systems} ({ICEIS})}, author = {Gipp, Bela and Meuschke, Norman and Breitinger, Corinna and Pitman, Jim and Nürnberger, Andreas}, month = apr, year = {2014}, note = {Venue Rating: CORE C}, keywords = {Plagiarism Detection}, pages = {677--683}, }
@article{GippMB14, title = {Citation-based {Plagiarism} {Detection}: {Practicability} on a {Large}-{Scale} {Scientific} {Corpus}}, volume = {65}, issn = {2330-1635}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/gipp13b.pdf}, doi = {10.1002/asi.23228}, abstract = {The automated detection of plagiarism is an information retrieval task of increasing importance as the volume of readily accessible information on the web expands. A major shortcoming of current automated plagiarism detection approaches is their dependence on high character-based similarity. As a result, heavily disguised plagiarism forms, such as paraphrases, translated plagiarism, or structural and idea plagiarism, remain undetected. A recently proposed language-independent approach to plagiarism detection, Citation-based Plagiarism Detection (CbPD), allows the detection of semantic similarity even in the absence of text overlap by analyzing the citation placement in a document's full text to determine similarity. This article evaluates the performance of CbPD in detecting plagiarism with various degrees of disguise in a collection of 185,000 biomedical articles. We benchmark CbPD against two character-based detection approaches using a ground truth approximated in a user study. Our evaluation shows that the citation-based approach achieves superior ranking performance for heavily disguised plagiarism forms. Additionally, we demonstrate CbPD to be computationally more efficient than character-based approaches. Finally, upon combining the citation-based with the traditional character-based document similarity visualization methods in a hybrid detection prototype, we observe a reduction in the required user effort for document verification.}, number = {8}, journal = {Journal of the Association for Information Science and Technology}, author = {Gipp, Bela and Meuschke, Norman and Breitinger, Corinna}, month = aug, year = {2014}, note = {Venue Rating: SJR Q1}, keywords = {Plagiarism Detection}, pages = {1527--1540}, }
@inproceedings{MeuschkeG14, address = {London, UK}, title = {Reducing {Computational} {Effort} for {Plagiarism} {Detection} by {Using} {Citation} {Characteristics} to {Limit} {Retrieval} {Space}}, isbn = {978-1-4799-5569-5}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/meuschke14.pdf}, doi = {10.1109/JCDL.2014.6970168}, abstract = {This paper proposes a hybrid approach to plagiarism detection in academic documents that integrates detection methods using citations, semantic argument structure, and semantic word similarity with character-based methods to achieve a higher detection performance for disguised plagiarism forms. Currently available software for plagiarism detection exclusively performs text string comparisons. These systems find copies, but fail to identify disguised plagiarism, such as paraphrases, translations, or idea plagiarism. Detection approaches that consider semantic similarity on word and sentence level exist and have consistently achieved higher detection accuracy for disguised plagiarism forms compared to character-based approaches. However, the high computational effort of these semantic approaches makes them infeasible for use in real-world plagiarism detection scenarios. The proposed hybrid approach uses citation-based methods as a preliminary heuristic to reduce the retrieval space with a relatively low loss in detection accuracy. This preliminary step can then be followed by a computationally more expensive semantic and character-based analysis. We show that such a hybrid approach allows semantic plagiarism detection to become feasible even on large collections for the first time.}, booktitle = {Proceedings of the 14th {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Meuschke, Norman and Gipp, Bela}, month = sep, year = {2014}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {197--200}, }
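The proposed use of citations as a cheap prefilter can be illustrated with a Jaccard-style overlap test in Python; the threshold and the overlap measure are our illustrative choices, whereas the paper derives its heuristic from several citation characteristics.

def citation_prefilter(suspicious_refs, corpus, min_overlap=0.2):
    """Keep only corpus documents whose reference overlap with the
    suspicious document passes a threshold; the survivors then undergo
    the expensive semantic and character-based analysis."""
    survivors = []
    for doc_id, refs in corpus.items():
        overlap = len(suspicious_refs & refs) / len(suspicious_refs | refs)
        if overlap >= min_overlap:
            survivors.append(doc_id)
    return survivors

corpus = {"doc1": {"r1", "r2", "r3"}, "doc2": {"r9"}}
print(citation_prefilter({"r1", "r2", "r4"}, corpus))  # ['doc1']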
@article{MeuschkeG13, title = {State of the {Art} in {Detecting} {Academic} {Plagiarism}}, volume = {9}, issn = {1833-2595}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/meuschke13.pdf}, doi = {10.5281/zenodo.3482941}, abstract = {The problem of academic plagiarism has been present for centuries. Yet, the widespread dissemination of information technology, including the internet, made plagiarising much easier. Consequently, methods and systems aiding in the detection of plagiarism have attracted much research within the last two decades. Researchers proposed a variety of solutions, which we will review comprehensively in this article. Available detection systems use sophisticated and highly efficient character-based text comparisons, which can reliably identify verbatim and moderately disguised copies. Automatically detecting more strongly disguised plagiarism, such as paraphrases, translations or idea plagiarism, is the focus of current research. Proposed approaches for this task include intrinsic, cross-lingual and citation-based plagiarism detection. Each method offers unique strengths and weaknesses; however, none is currently mature enough for practical use. In the future, plagiarism detection systems may benefit from combining traditional character-based detection methods with these emerging detection approaches.}, number = {1}, journal = {International Journal for Educational Integrity}, author = {Meuschke, Norman and Gipp, Bela}, month = jun, year = {2013}, keywords = {Plagiarism Detection}, pages = {50--71}, }
@inproceedings{GippMBL13, address = {Dublin, UK}, title = {Demonstration of the {First} {Citation}-based {Plagiarism} {Detection} {Prototype}}, url = {paper=https://www.gipp.com/wp-content/papercite-data/pdf/gipp13.pdf demo=https://citeplag.org/}, doi = {10.1145/2484028.2484214}, booktitle = {Proceedings of the 36th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}}, publisher = {ACM}, author = {Gipp, Bela and Meuschke, Norman and Breitinger, Corinna and Lipinski, Mario and Nürnberger, Andreas}, month = jul, year = {2013}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {1119--1120}, }
@inproceedings{MeuschkeGB12, address = {Newcastle upon Tyne, UK}, title = {{CitePlag}: {A} {Citation}-based {Plagiarism} {Detection} {System} {Prototype}}, url = {paper=https://zenodo.org/record/3483088/files/Meuschke2012_CitePlag.pdf code=https://doi.org/10.5281/zenodo.1205460}, doi = {10.5281/zenodo.3483088}, abstract = {This paper presents an open-source prototype of a citation-based plagiarism detection system called CitePlag. The underlying idea of the system is to evaluate the citations of academic documents as language independent markers to detect plagiarism. CitePlag uses three different detection algorithms that analyze the citation sequence of academic documents for similar patterns that may indicate unduly used foreign text or ideas. The algorithms consider multiple citation related factors such as proximity and order of citations within the text, or their probability of co-occurrence in order to compute document similarity scores. We present technical details of CitePlag's detection algorithms and the acquisition of test data from the PubMed Central Open Access Subset. Future advancements of the prototype focus on increasing the reference database by enabling the system to process more document and citation formats. Furthermore, we aim to improve CitePlag's detection algorithms and scoring functions for reducing the number of false positives. Eventually, we plan to integrate text with citation-based detection algorithms within CitePlag.}, booktitle = {Proceedings of the 5th {International} {Plagiarism} {Conference}}, author = {Meuschke, Norman and Gipp, Bela and Breitinger, Corinna}, month = jul, year = {2012}, keywords = {Plagiarism Detection}, }
@inproceedings{GippM11, address = {Mountain, View, CA, USA}, title = {Citation {Pattern} {Matching} {Algorithms} for {Citation}-based {Plagiarism} {Detection}: {Greedy} {Citation} {Tiling}, {Citation} {Chunking} and {Longest} {Common} {Citation} {Sequence}}, isbn = {978-1-4503-0863-2}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/gipp11c.pdf}, doi = {10.1145/2034691.2034741}, abstract = {Plagiarism Detection Systems have been developed to locate instances of plagiarism e.g. within scientific papers. Studies have shown that the existing approaches deliver reasonable results in identifying copy\&paste plagiarism, but fail to detect more sophisticated forms such as paraphrased plagiarism, translation plagiarism or idea plagiarism. The authors of this paper demonstrated in recent studies that the detection rate can be significantly improved by not only relying on text analysis, but by additionally analyzing the citations of a document. Citations are valuable language independent markers that are similar to a fingerprint. In fact, our examinations of real world cases have shown that the order of citations in a document often remains similar even if the text has been strongly paraphrased or translated in order to disguise plagiarism. This paper introduces three algorithms and discusses their suitability for the purpose of citation-based plagiarism detection. Due to the numerous ways in which plagiarism can occur, these algorithms need to be versatile. They must be capable of detecting transpositions, scaling and combinations in a local and global form. The algorithms are coined Greedy Citation Tiling, Citation Chunking and Longest Common Citation Sequence. The evaluation showed that if these algorithms are combined, common forms of plagiarism can be detected reliably.}, booktitle = {Proceedings of the 11th {ACM} {Symposium} on {Document} {Engineering}}, publisher = {ACM}, author = {Gipp, Bela and Meuschke, Norman}, month = sep, year = {2011}, note = {Venue Rating: CORE B}, keywords = {Plagiarism Detection}, pages = {249--258}, }
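Of the three algorithms named in the title, Longest Common Citation Sequence maps directly onto textbook dynamic programming. The Python sketch below computes the LCS length of two documents' ordered citation sequences; Greedy Citation Tiling and Citation Chunking, which additionally handle transpositions and scaling, are not shown.

def longest_common_citation_sequence(seq_a, seq_b):
    """Length of the longest common subsequence of two citation
    sequences, i.e., shared citations appearing in the same order."""
    m, n = len(seq_a), len(seq_b)
    table = [[0] * (n + 1) for _ in range(m + 1)]
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if seq_a[i - 1] == seq_b[j - 1]:
                table[i][j] = table[i - 1][j - 1] + 1
            else:
                table[i][j] = max(table[i - 1][j], table[i][j - 1])
    return table[m][n]

doc_a = ["r1", "r2", "r3", "r4"]  # citation order in document A
doc_b = ["r1", "r3", "r4", "r9"]  # citation order in document B
print(longest_common_citation_sequence(doc_a, doc_b))  # 3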
@inproceedings{GippMB11, address = {Ottawa, Canada}, title = {Comparative {Evaluation} of {Text}- and {Citation}-based {Plagiarism} {Detection} {Approaches} using {GuttenPlag}}, isbn = {978-1-4503-0744-4}, url = {https://www.gipp.com/wp-content/papercite-data/pdf/gipp11.pdf}, doi = {10.1145/1998076.1998124}, abstract = {Various approaches for plagiarism detection exist. All are based on more or less sophisticated text analysis methods such as string matching, fingerprinting or style comparison. In this paper a new approach called Citation-based Plagiarism Detection is evaluated using a doctoral thesis [8], in which a volunteer crowd-sourcing project called GuttenPlag [1] identified substantial amounts of plagiarism through careful manual inspection. This new approach is able to identify similar and plagiarized documents based on the citations used in the text. It is shown that citation-based plagiarism detection performs significantly better than text-based procedures in identifying strong paraphrasing, translation and some idea plagiarism. Detection rates can be improved by combining citation-based with text-based plagiarism detection.}, booktitle = {Proceedings of 11th {Annual} {International} {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, author = {Gipp, Bela and Meuschke, Norman and Beel, Joeran}, month = jun, year = {2011}, note = {Venue Rating: CORE A*}, keywords = {Plagiarism Detection}, pages = {255--258}, }
@techreport{GomezQuesadaIMT08, title = {{BoKlok} {Sweet} {BoKlok}: {A} {Joint} {Innovation} of {Skanska} and {IKEA}}, url = {https://zenodo.org/record/3548729/files/BoKlok_sweet_BoKlok.pdf}, abstract = {This paper presents an innovative concept for functional, low-priced, privately owned housing products called BoKlok and analyses the development as well as the commercialization of this concept as a joint effort of the home furnishing company IKEA and the international construction enterprise Skanska. In the first chapter a short review on academic theory related to the paper is offered. In the second chapter the named housing products are introduced to the reader, their innovative characteristics are pointed out and an overview about the history of the concept as well as the current state of business related to the BoKlok products is given. In the third chapter the business model that is applied for merchandising BoKlok homes is analyzed in detail. Chapter four and five each present a general overview about the companies IKEA and Skanska, which cooperated in the development project for the BoKlok concept. The development process as such is reviewed explicitly in chapter six. In chapter seven factors that significantly influenced the project results are identified and discussed in respect to relevant academic findings. Chapter eight illustrates how the BoKlok concept is continuously developed further in the present. The work concludes with providing some future perspectives for the concept in chapter nine.}, author = {Gomez Quesada, Victoria and Idone, Claudia and Meuschke, Norman and Teboul, Nicolas}, month = jan, year = {2008}, doi = {10.5281/zenodo.3548729}, keywords = {Miscellaneous}, }