<script src="https://bibbase.org/show?bib=https%3A%2F%2Fapi.zotero.org%2Fusers%2F7689706%2Fcollections%2FIBJGRWZX%2Fitems%3Fkey%3DR0b523dc3oYLxTGap1H4YXgd%26format%3Dbibtex%26limit%3D100&jsonp=1"></script>
<?php
$contents = file_get_contents("https://bibbase.org/show?bib=https%3A%2F%2Fapi.zotero.org%2Fusers%2F7689706%2Fcollections%2FIBJGRWZX%2Fitems%3Fkey%3DR0b523dc3oYLxTGap1H4YXgd%26format%3Dbibtex%26limit%3D100");
print_r($contents);
?>
<iframe src="https://bibbase.org/show?bib=https%3A%2F%2Fapi.zotero.org%2Fusers%2F7689706%2Fcollections%2FIBJGRWZX%2Fitems%3Fkey%3DR0b523dc3oYLxTGap1H4YXgd%26format%3Dbibtex%26limit%3D100"></iframe>
For more details see the documention.
To the site owner:
Action required! Mendeley is changing its API. In order to keep using Mendeley with BibBase past April 14th, you need to:
@misc{BibbaseGreinerPetterTalkICSTI, type = {Invited talk}, title = {Why {ChatGPT} and its friends are bad at {Math}}, url = {https://doi.org/10.5446/s_1854}, language = {en}, author = {Greiner-Petter, Andre}, month = nov, year = {2024}, doi = {10.5446/69580}, note = {Invited Talk}, }
@inproceedings{BibbaseSatputeGGS24a, address = {Washington DC, USA}, title = {Can {LLMs} {Master} {Math}? {Investigating} {Large} {Language} {Models} on {Math} {Stack} {Exchange}}, shorttitle = {Can {LLMs} {Master} {Math}?}, url = {https://arxiv.org/abs/2404.00344}, doi = {10.1145/3626772.3657945}, abstract = {Large Language Models (LLMs) have demonstrated exceptional capabilities in various natural language tasks, often achieving performances that surpass those of humans. Despite these advancements, the domain of mathematics presents a distinctive challenge, primarily due to its specialized structure and the precision it demands. In this study, we adopted a two-step approach for investigating the proficiency of LLMs in answering mathematical questions. First, we employ the most effective LLMs, as identified by their performance on math question-answer benchmarks, to generate answers to 78 questions from the Math Stack Exchange (MSE). Second, a case analysis is conducted on the LLM that showed the highest performance, focusing on the quality and accuracy of its answers through manual evaluation. We found that GPT-4 performs best (nDCG of 0.48 and P@10 of 0.37) amongst existing LLMs fine-tuned for answering mathematics questions and outperforms the current best approach on ArqMATH3 Task1, considering P@10. Our Case analysis indicates that while the GPT-4 can generate relevant responses in certain instances, it does not consistently answer all questions accurately. This paper explores the current limitations of LLMs in navigating complex mathematical problem-solving. Through case analysis, we shed light on the gaps in LLM capabilities within mathematics, thereby setting the stage for future research and advancements in AI-driven mathematical reasoning. We make our code and findings publicly available for research at https://github.com/gipplab/LLM-Investig-MathStackExchange.}, booktitle = {47th {International} {ACM} {SIGIR} {Conference} on {Research} and {Development} in {Information} {Retrieval}}, publisher = {ACM}, author = {Satpute, Ankit and Gießing, Noah and Greiner-Petter, André and Schubotz, Moritz and Teschke, Olaf and Aizawa, Akiko and Gipp, Bela}, month = jun, year = {2024}, note = {Core Rank A*}, pages = {2316--2320}, }
@inproceedings{BibbaseHorychWWR24, address = {Torino, Italy}, title = {{MAGPIE}: {Multi}-{Task} {Analysis} of {Media}-{Bias} {Generalization} with {Pre}-{Trained} {Identification} of {Expressions}}, shorttitle = {{MAGPIE}}, url = {https://aclanthology.org/2024.lrec-main.952}, abstract = {Media bias detection poses a complex, multifaceted problem traditionally tackled using single-task models and small in-domain datasets, consequently lacking generalizability. To address this, we introduce MAGPIE, a large-scale multi-task pre-training approach explicitly tailored for media bias detection. To enable large-scale pre-training, we construct Large Bias Mixture (LBM), a compilation of 59 bias-related tasks. MAGPIE outperforms previous approaches in media bias detection on the Bias Annotation By Experts (BABE) dataset, with a relative improvement of 3.3\% F1-score. Furthermore, using a RoBERTa encoder, we show that MAGPIE needs only 15\% of fine-tuning steps compared to single-task approaches. We provide insight into task learning interference and show that sentiment analysis and emotion detection help learning of all other tasks, and scaling the number of tasks leads to the best results. MAGPIE confirms that MTL is a promising approach for addressing media bias detection, enhancing the accuracy and efficiency of existing models. Furthermore, LBM is the first available resource collection focused on media bias MTL.}, urldate = {2024-05-23}, booktitle = {Proc. {Joint} {Int}. {Conf}. {Computational} {Linguistics}, {Language} {Resources} and {Evaluation} ({LREC}-{COLING})}, publisher = {ELRA and ICCL}, author = {Horych, Tomáš and Wessel, Martin Paul and Wahle, Jan Philip and Ruas, Terry and Waßmuth, Jerome and Greiner-Petter, André and Aizawa, Akiko and Gipp, Bela and Spinde, Timo}, month = may, year = {2024}, note = {Core Rank B}, pages = {10903--10920}, }
@inproceedings{BibbaseSatputeGGB24, address = {Glasgow, UK}, title = {Taxonomy of {Mathematical} {Plagiarism}}, volume = {14611}, isbn = {978-3-031-56065-1}, url = {https://link.springer.com/10.1007/978-3-031-56066-8_2}, doi = {10.1007/978-3-031-56066-8_2}, abstract = {Plagiarism is a pressing concern, even more so with the availability of large language models. Existing plagiarism detection systems reliably find copied and moderately reworded text but fail for idea plagiarism, especially in mathematical science, which heavily uses formal mathematical notation. We make two contributions. First, we establish a taxonomy of mathematical content reuse by annotating potentially plagiarised 122 scientific document pairs. Second, we analyze the best-performing approaches to detect plagiarism and mathematical content similarity on the newly established taxonomy. We found that the best-performing methods for plagiarism and math content similarity achieve an overall detection score (PlagDet) of 0.06 and 0.16, respectively. The best-performing methods failed to detect most cases from all seven newly established math similarity types. Outlined contributions will benefit research in plagiarism detection systems, recommender systems, question-answering systems, and search engines. We make our experiment’s code and annotated dataset available to the community: https://github.com/gipplab/Taxonomy-of-Mathematical-Plagiarism.}, language = {en}, urldate = {2024-05-23}, booktitle = {46th {European} {Conference} on {Information} {Retrieval} ({ECIR})}, publisher = {Springer Nature Switzerland}, author = {Satpute, Ankit and Greiner-Petter, André and Gießing, Noah and Beckenbach, Isabel and Schubotz, Moritz and Teschke, Olaf and Aizawa, Akiko and Gipp, Bela}, month = mar, year = {2024}, note = {Core Rank A}, pages = {12--20}, }
@book{BibbaseGreinerPetter23, address = {Wiesbaden}, title = {Making {Presentation} {Math} {Computable}: {A} {Context}-{Sensitive} {Approach} for {Translating} {LaTeX} to {Computer} {Algebra} {Systems}}, isbn = {978-3-658-40472-7 978-3-658-40473-4}, shorttitle = {Making {Presentation} {Math} {Computable}}, url = {https://link.springer.com/10.1007/978-3-658-40473-4}, abstract = {This thesis addresses the issue of translating mathematical expressions from LaTeX to the syntax of Computer Algebra Systems (CAS). Over the past decades, especially in the domain of Sciences, Technology, Engineering, and Mathematics (STEM), LaTeX has become the de-facto standard to typeset mathematical formulae in publications. Since scientists are generally required to publish their work, LaTeX has become an integral part of today's publishing workflow. On the other hand, modern research increasingly relies on CAS to simplify, manipulate, compute, and visualize mathematics. However, existing LaTeX import functions in CAS are limited to simple arithmetic expressions and are, therefore, insufficient for most use cases. Consequently, the workflow of experimenting and publishing in the Sciences often includes time-consuming and error-prone manual conversions between presentational LaTeX and computational CAS formats. To address the lack of a reliable and comprehensive translation tool between LaTeX and CAS, this thesis makes the following three contributions. First, it provides an approach to semantically enhance LaTeX expressions with sufficient semantic information for translations into CAS syntaxes. This, so called, semantification process analyzes the structure of the formula and its textual context to conclude semantic information. The research for this semantification process additionally contributes towards related Mathematical Information Retrieval (MathIR) tasks, such as mathematical education assistance, math recommendation and question answering systems, search engines, automatic plagiarism detection, and math type assistance systems. Second, this thesis demonstrates the first context-aware LaTeX to CAS translation framework LaCASt. LaCASt uses the developed semantification approach to transform LaTeX expressions into an intermediate semantic LaTeX format, which is then further translated to CAS based on translation patterns. These patterns were manually crafted by mathematicians to assure accurate and reliable translations. In comparison, this thesis additionally elaborates a non-context aware neural machine translation approach trained on a mathematical library generated by Mathematica. Third, the thesis provides a novel approach to evaluate the performance for LaTeX to CAS translations on large-scaled datasets with an automatic verification of equations in digital mathematical libraries. This evaluation approach is based on the assumption that equations in digital mathematical libraries can be computationally verified by CAS, if a translation between both systems exists. In addition, the thesis provides an in-depth manual evaluation on mathematical articles from the English Wikipedia. The presented context-aware translation framework LaCASt increases the efficiency and reliability of translations to CAS. Via LaCASt, we strengthened the Digital Library of Mathematical Functions (DLMF) by identifying numerous of issues, from missing or wrong semantic annotations to sign errors. Further, via LaCASt, we were able to discover several issues with the commercial CAS Maple and Mathematica. The fundamental approaches to semantically enhance mathematics developed in this thesis additionally contributed towards several related MathIR tasks. For instance, the large-scale analysis of mathematical notations and the studies on math-embeddings motivated new approaches for math plagiarism detection systems, search engines, and allow typing assistance for mathematical inputs. Finally, LaCASt translations will have a direct real-world impact, as they are scheduled to be integrated into upcoming versions of the DLMF and Wikipedia.}, language = {en}, urldate = {2023-02-24}, publisher = {Springer Fachmedien Wiesbaden}, author = {Greiner-Petter, Andre}, year = {2023}, doi = {10.1007/978-3-658-40473-4}, note = {Doctoral Dissertation at University of Wuppertal, Germany}, }
@article{BibbaseGreinerPetter23b, title = {Do the {Math}: {Making} {Mathematics} in {Wikipedia} {Computable}}, volume = {45}, issn = {0162-8828, 1939-3539}, shorttitle = {Do the {Math}}, url = {https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=9847017}, doi = {10.1109/TPAMI.2022.3195261}, abstract = {Wikipedia combines the power of AI solutions and human reviewers to safeguard article quality. Quality control objectives include detecting malicious edits, fixing typos, and spotting inconsistent formatting. However, no automated quality control mechanisms currently exist for mathematical formulae. Spell checkers are widely used to highlight textual errors, yet no equivalent tool exists to detect algebraically incorrect formulae. Our paper addresses this shortcoming by making mathematical formulae computable. We present a method that (1) gathers the semantic information surrounding the context of each mathematical formulae, (2) provides access to the information in a graph-structured dependency hierarchy, and (3) performs automatic plausibility checks on equations. We evaluate the performance of our approach on 6,337 mathematical expressions contained in 104 Wikipedia articles on the topic of orthogonal polynomials and special functions. Our system, LaCASt, verified 358 out of 1,516 equations as error-free. LaCASt successfully translated 27\% of the mathematical expressions and outperformed existing translation approaches by 16\%. Additionally, LaCASt achieved an F1 score of .495 for annotating mathematical expressions with relevant textual descriptions, which is a significant step towards advancing searchability, readability, and accessibility of mathematical formulae in Wikipedia. A prototype of LaCASt and the semantically enhanced Wikipedia articles are available at: https://tpami.wmflabs.org.}, number = {4}, urldate = {2022-10-03}, journal = {IEEE Transactions on Pattern Analysis and Machine Intelligence}, author = {Greiner-Petter, Andre and Schubotz, Moritz and Breitinger, Corinna and Scharpf, Philipp and Aizawa, Akiko and Gipp, Bela}, year = {2023}, note = {Journal Rank Q1; IF: 24.314}, pages = {4384--4395}, }
@inproceedings{BibbaseSatpute23, address = {Santa Fe, NM, USA}, title = {{TEIMMA}: {The} {First} {Content} {Reuse} {Annotator} for {Text}, {Images}, and {Math}}, isbn = {9798350399318}, shorttitle = {{TEIMMA}}, url = {https://arxiv.org/pdf/2305.13193.pdf}, doi = {10.1109/JCDL57899.2023.00056}, abstract = {This demo paper presents the first tool to annotate the reuse of text, images, and mathematical formulae in a document pair -- TEIMMA. Annotating content reuse is particularly useful to develop plagiarism detection algorithms. Real-world content reuse is often obfuscated, which makes it challenging to identify such cases. TEIMMA allows entering the obfuscation type to enable novel classifications for confirmed cases of plagiarism. It enables recording different reuse types for text, images, and mathematical formulae in HTML and supports users by visualizing the content reuse in a document pair using similarity detection methods for text and math.}, urldate = {2023-11-03}, booktitle = {2023 {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {IEEE}, author = {Satpute, Ankit and Greiner-Petter, Andre and Schubotz, Moritz and Meuschke, Norman and Aizawa, Akiko and Teschke, Olaf and Gipp, Bela}, month = jun, year = {2023}, pages = {271--273}, }
@inproceedings{BibbasePetersen23, address = {Toronto, Canada}, title = {Neural {Machine} {Translation} for {Mathematical} {Formulae}}, url = {https://aclanthology.org/2023.acl-long.645}, doi = {10.18653/v1/2023.acl-long.645}, abstract = {We tackle the problem of neural machine translation of mathematical formulae between ambiguous presentation languages and unambiguous content languages. Compared to neural machine translation on natural language, mathematical formulae have a much smaller vocabulary and much longer sequences of symbols, while their translation requires extreme precision to satisfy mathematical information needs. In this work, we perform the tasks of translating from LaTeX to Mathematica as well as from LaTeX to semantic LaTeX. While recurrent, recursive, and transformer networks struggle with preserving all contained information, we find that convolutional sequence-to-sequence networks achieve 95.1\% and 90.7\% exact matches, respectively.}, language = {en}, urldate = {2023-11-03}, booktitle = {Proceedings of the 61st {Annual} {Meeting} of the {Association} for {Computational} {Linguistics} ({Volume} 1: {Long} {Papers})}, publisher = {Association for Computational Linguistics}, author = {Petersen, Felix and Schubotz, Moritz and Greiner-Petter, Andre and Gipp, Bela}, year = {2023}, note = {Core Rank A*}, pages = {11534--11550}, }
@inproceedings{BibbaseGreinerPetter22a, address = {Munich, Germany}, series = {Lecture {Notes} in {Computer} {Science}}, title = {Comparative {Verification} of the {Digital} {Library} of {Mathematical} {Functions} and {Computer} {Algebra} {Systems}}, url = {https://arxiv.org/abs/2201.09488}, doi = {10.1007/978-3-030-99524-9_5}, abstract = {Digital mathematical libraries assemble the knowledge of years of mathematical research. Numerous disciplines (e.g., physics, engineering, pure and applied mathematics) rely heavily on compendia gathered findings. Likewise, modern research applications rely more and more on computational solutions, which are often calculated and verified by computer algebra systems. Hence, the correctness, accuracy, and reliability of both digital mathematical libraries and computer algebra systems is a crucial attribute for modern research. In this paper, we present a novel approach to verify a digital mathematical library and two computer algebra systems with one another by converting mathematical expressions from one system to the other. We use our previously developed conversion tool (referred to as LaCASt) to translate formulae from the NIST Digital Library of Mathematical Functions to the computer algebra systems Maple and Mathematica. The contributions of our presented work are as follows: (1) we present the most comprehensive verification of computer algebra systems and digital mathematical libraries with one another; (2) we significantly enhance the performance of the underlying translator in terms of coverage and accuracy; and (3) we provide open access to translations for Maple and Mathematica of the formulae in the NIST Digital Library of Mathematical Functions.}, booktitle = {Tools and {Algorithms} for the {Construction} and {Analysis} of {Systems} - 28th {International} {Conference}, ({TACAS})}, publisher = {Springer}, author = {Greiner-Petter, Andre and Cohl, Howard S. and Youssef, Abdou and Schubotz, Moritz and Trost, Avi and Dey, Rajen and Aizawa, Akiko and Gipp, Bela}, month = apr, year = {2022}, note = {Core Rank A}, pages = {87--105}, }
@inproceedings{BibbaseScharpf22, address = {Hanghzou, China}, series = {{CEUR} {Workshop} {Proceedings}}, title = {Collaborative and {AI}-aided {Exam} {Question} {Generation} using {Wikidata} in {Education}}, volume = {3262}, url = {https://ceur-ws.org/Vol-3262/paper13.pdf}, abstract = {Since the COVID-19 outbreak, the use of digital learning or education platforms has significantly increased. Teachers now digitally distribute homework and provide exercise questions. In both cases, teachers need to continuously develop novel and individual questions. This process can be very time-consuming and should be facilitated and accelerated both through exchange with other teachers and by using Artificial Intelligence (AI) capabilities. To address this need, we propose a multilingual Wikimedia framework that allows for collaborative worldwide teacher knowledge engineering and subsequent AI-aided question generation, test, and correction. As a proof of concept, we present {\textgreater}{\textgreater}PhysWikiQuiz{\textless}{\textless}, a physics question generation and test engine. Our system (hosted by Wikimedia at https://physwikiquiz.wmflabs.org/) retrieves physics knowledge from the open community-curated database Wikidata. It can generate questions in different variations and verify answer values and units using a Computer Algebra System (CAS). We evaluate the performance on a public benchmark dataset at each stage of the system workflow. For an average formula with three variables, the system can generate and correct up to 300 questions for individual students based on a single formula concept name as input by the teacher.}, language = {en}, booktitle = {Proceedings of the 3rd {Wikidata} {Workshop} 2022 co-located with the 21st {International} {Semantic} {Web} {Conference} ({ISWC2022})}, publisher = {CEUR-WS.org}, author = {Scharpf, Philipp and Schubotz, Moritz and Spitz, Andreas and Greiner-Petter, Andre and Gipp, Bela}, year = {2022}, note = {Core Rank A}, }
@article{BibbaseSchubotzSGA22, title = {Caching and {Reproducibility}: {Making} {Data} {Science} {Experiments} {Faster} and {FAIRer}}, volume = {7}, issn = {2504-0537}, shorttitle = {Caching and {Reproducibility}}, url = {https://www.frontiersin.org/articles/10.3389/frma.2022.861944/full}, doi = {10.3389/frma.2022.861944}, abstract = {Small to medium-scale data science experiments often rely on research software developed ad-hoc by individual scientists or small teams. Often there is no time to make the research software fast, reusable, and open access. The consequence is twofold. First, subsequent researchers must spend significant work hours building upon the proposed hypotheses or experimental framework. In the worst case, others cannot reproduce the experiment and reuse the findings for subsequent research. Second, suppose the ad-hoc research software fails during often long-running computational expensive experiments. In that case, the overall effort to iteratively improve the software and rerun the experiments creates significant time pressure on the researchers. We suggest making caching an integral part of the research software development process, even before the first line of code is written. This article outlines caching recommendations for developing research software in data science projects. Our recommendations provide a perspective to circumvent common problems such as propriety dependence, speed, etc. At the same time, caching contributes to the reproducibility of experiments in the open science workflow. Concerning the four guiding principles, i.e., Findability, Accessibility, Interoperability, and Reusability (FAIR), we foresee that including the proposed recommendation in a research software development will make the data related to that software FAIRer for both machines and humans. We exhibit the usefulness of some of the proposed recommendations on our recently completed research software project in mathematical information retrieval.}, urldate = {2022-08-24}, journal = {Frontiers in Research Metrics and Analytics}, author = {Schubotz, Moritz and Satpute, Ankit and Greiner-Petter, Andre and Aizawa, Akiko and Gipp, Bela}, month = apr, year = {2022}, pages = {861944}, }
@misc{BibbaseGreinerPetterTalkSIGMathLing, type = {Invited talk}, title = {Comparative {Verification} of {Digital} {Mathematical} {Libraries} and {Computer} {Algebra} {Systems}}, url = {https://sigmathling.kwarc.info/seminar/}, author = {Greiner-Petter, Andre}, month = mar, year = {2021}, note = {Invited Talk}, }
@misc{BibbaseGreinerPetterTalkACMD, type = {Invited talk}, title = {Semantic {Preserving} {Translations} between {NIST}'s {Digital} {Library} of {Mathematical} {Functions} and {Computer} {Algebra} {Systems}}, url = {https://www.nist.gov/itl/math/acmd-seminar-semantic-preserving-translations-between-nists-digital-library-mathematical}, author = {Greiner-Petter, Andre}, month = jul, year = {2021}, note = {Invited Talk}, }
@inproceedings{BibbaseGreinerPetterSMB20, address = {Taipei, Taiwan}, title = {Discovering {Mathematical} {Objects} of {Interest} — {A} {Study} of {Mathematical} {Notations}}, isbn = {978-1-4503-7023-3}, url = {https://arxiv.org/abs/2002.02712}, doi = {10.1145/3366423.3380218}, abstract = {Mathematical notation, i.e., the writing system used to communicate concepts in mathematics, encodes valuable information for a variety of information search and retrieval systems. Yet, mathematical notations remain mostly unutilized by today's systems. In this paper, we present the first in-depth study on the distributions of mathematical notation in two large scientific corpora: the open access arXiv (2.5B mathematical objects) and the mathematical reviewing service for pure and applied mathematics zbMATH (61M mathematical objects). Our study lays a foundation for future research projects on mathematical information retrieval for large scientific corpora. Further, we demonstrate the relevance of our results to a variety of use-cases. For example, to assist semantic extraction systems, to improve scientific search engines, and to facilitate specialized math recommendation systems. The contributions of our presented research are as follows: (1) we present the first distributional analysis of mathematical formulae on arXiv and zbMATH; (2) we retrieve relevant mathematical objects for given textual search queries (e.g., linking \$P\_\{n\}{\textasciicircum}\{({\textbackslash}alpha, {\textbackslash}beta)\} {\textbackslash}left(x{\textbackslash}right)\$ with `Jacobi polynomial'); (3) we extend zbMATH's search engine by providing relevant mathematical formulae; and (4) we exemplify the applicability of the results by presenting auto-completion for math inputs as the first contribution to math recommendation systems. To expedite future research projects, we have made available our source code and data.}, language = {en}, urldate = {2021-07-30}, booktitle = {Proceedings of {The} {Web} {Conference} ({WWW})}, publisher = {ACM}, author = {Greiner-Petter, Andre and Schubotz, Moritz and Müller, Fabian and Breitinger, Corinna and Cohl, Howard and Aizawa, Akiko and Gipp, Bela}, month = apr, year = {2020}, note = {Core Rank A*}, pages = {1445--1456}, }
@inproceedings{BibbaseAsakuraGAM20, address = {Online}, title = {Towards {Grounding} of {Formulae}}, url = {https://www.aclweb.org/anthology/2020.sdp-1.16}, doi = {10/gjzg2r}, abstract = {A large amount of scientific knowledge is represented within mixed forms of natural language texts and mathematical formulae. Therefore, a collaboration of natural language processing and formula analyses, so-called mathematical language processing, is necessary to enable computers to understand and retrieve information from the documents. However, as we will show in this project, a mathematical notation can change its meaning even within the scope of a single paragraph. This flexibility makes it difficult to extract the exact meaning of a mathematical formula. In this project, we will propose a new task direction for grounding mathematical formulae. Particularly, we are addressing the widespread misconception of various research projects in mathematical information retrieval, which presume that mathematical notations have a fixed meaning within a single document. We manually annotated a long scientific paper to illustrate the task concept. Our high inter-annotator agreement shows that the task is well understood for humans. Our results indicate that it is worthwhile to grow the techniques for the proposed task to contribute to the further progress of mathematical language processing.}, language = {en}, urldate = {2021-08-02}, booktitle = {Proceedings of the {First} {Workshop} on {Scholarly} {Document} {Processing} ({SDP}@{EMNLP})}, publisher = {ACL}, author = {Asakura, Takuto and Greiner-Petter, Andre and Aizawa, Akiko and Miyao, Yusuke}, year = {2020}, note = {Core Rank A*}, pages = {138--147}, }
@inproceedings{BibbaseGreinerPetterSAG20, address = {Braunschweig, Germany}, series = {Lecture {Notes} in {Computer} {Science}}, title = {Making {Presentation} {Math} {Computable}: {Proposing} a {Context} {Sensitive} {Approach} for {Translating} {LaTeX} to {Computer} {Algebra} {Systems}}, volume = {12097}, isbn = {978-3-030-52199-8}, shorttitle = {Making {Presentation} {Math} {Computable}}, url = {https://link.springer.com/content/pdf/10.1007%2F978-3-030-52200-1_33.pdf}, doi = {10.1007/978-3-030-52200-1_33}, abstract = {Scientists increasingly rely on computer algebra systems and digital mathematical libraries to compute, validate, or experiment with mathematical formulae. However, the focus in digital mathematical libraries and scientific documents often lies more on an accurate presentation of the formulae rather than providing uniform access to the semantic information. But, presentational math formats do not provide exclusive access to the underlying semantic meanings. One has to derive the semantic information from the context. As a consequence, the workflow of experimenting and publishing in the Sciences often includes time-consuming, error-prone manual conversions between presentational and computational math formats. As a contribution to improve this workflow, we propose a context-sensitive approach that extracts semantic information from a given context, embeds the information into the given input, and converts the semantically enhanced expressions to computer algebra systems.}, language = {en}, urldate = {2021-07-30}, booktitle = {International {Congress} of {Mathematical} {Software} ({ICMS})}, publisher = {Springer}, author = {Greiner-Petter, Andre and Schubotz, Moritz and Aizawa, Akiko and Gipp, Bela}, editor = {Bigatti, Anna Maria and Carette, Jacques and Davenport, James H. and Joswig, Michael and de Wolff, Timo}, year = {2020}, pages = {335--341}, }
@article{BibbaseGreinerPetterYRM20, title = {Math-{Word} {Embedding} in {Math} {Search} and {Semantic} {Extraction}}, volume = {125}, issn = {0138-9130, 1588-2861}, url = {https://link.springer.com/10.1007/s11192-020-03502-9}, doi = {10.1007/s11192-020-03502-9}, abstract = {Word embedding, which represents individual words with semantically fixed-length vectors, has made it possible to successfully apply deep learning to natural language processing tasks such as semantic role-modeling, question answering, and machine translation. As math text consists of natural text, as well as math expressions that similarly exhibit linear correlation and contextual characteristics, word embedding techniques can also be applied to math documents. However, while mathematics is a precise and accurate science, it is usually expressed through imprecise and less accurate descriptions, contributing to the relative dearth of machine learning applications for information retrieval in this domain. Generally, mathematical documents communicate their knowledge with an ambiguous, context-dependent, and non-formal language. Given recent advances in word embedding, it is worthwhile to explore their use and effectiveness in math information retrieval tasks, such as math language processing and semantic knowledge extraction. In this paper, we explore math embedding by testing it on several different scenarios, namely, (1) math-term similarity, (2) analogy, (3) numerical concept-modeling based on the centroid of the keywords that characterize a concept, (4) math search using query expansions, and (5) semantic extraction, i.e., extracting descriptive phrases for math expressions. Due to the lack of benchmarks, our investigations were performed using the arXiv collection of STEM documents and carefully selected illustrations on the Digital Library of Mathematical Functions (DLMF: NIST digital library of mathematical functions. Release 1.0.20 of 2018-09-1, 2018). Our results show that math embedding holds much promise for similarity, analogy, and search tasks. However, we also observed the need for more robust math embedding approaches. Moreover, we explore and discuss fundamental issues that we believe thwart the progress in mathematical information retrieval in the direction of machine learning.}, language = {en}, number = {3}, urldate = {2021-06-30}, journal = {Scientometrics}, author = {Greiner-Petter, Andre and Youssef, Abdou and Ruas, Terry and Miller, Bruce R. and Schubotz, Moritz and Aizawa, Akiko and Gipp, Bela}, month = dec, year = {2020}, note = {Journal Rank Q1; IF: 3.702}, pages = {3017--3046}, }
@inproceedings{BibbaseScharpfSGOTG20, address = {Thessaloniki, Greece}, title = {{ARQMath} {Lab}: {An} {Incubator} for {Semantic} {Formula} {Search} in {zbMATH} {Open}?}, volume = {2696}, url = {http://ceur-ws.org/Vol-2696/paper_200.pdf}, abstract = {The zbMATH database contains more than 4 million bibliographic entries. We aim to provide easy access to these entries. Therefore, we maintain dif-ferent index structures including a formula index. To optimize the findability of the entries in our database, we constantly investigate new approaches to satisfy the information needs of our users. We believe that the findings from the ARQMath evaluation will generate new insights into which index struc-tures are most suitable to satisfy mathematical information needs. Search en-gines, recommender systems, plagiarism checking software, and many other added-value services acting on databases such as the arXiv and zbMATH need to combine natural and formula language. One initial approach to ad-dress this challenge is to enrich the mostly unstructured document data via Entity Linking. The ARQMath Task at CLEF 2020 aims to tackle the problem of linking newly posted questions from Math Stack Exchange (MSE) to exist-ing ones that were already answered by the community. To deeply under-stand MSE information needs, answer-, and formula types, we performed manual runs for tasks 1 and 2. Furthermore, we explored several formula re-trieval methods for task 2, such as fuzzy string search, k-nearest neighbors, and our recently introduced approach to retrieve Mathematical Objects of In-terest (MOI) with textual search queries. The task results show that neither our automated methods nor our manual runs archived good scores in the competition. However, the perceived quality of the hits returned by the MOI search particularly motivates us to conduct further research about MOI.}, booktitle = {Working {Notes} of ({CLEF}) 2020 - {Conference} and {Labs} of the {Evaluation} {Forum}}, publisher = {CEUR-WS.org}, author = {Scharpf, Philipp and Schubotz, Moritz and Greiner-Petter, Andre and Ostendorff, Malte and Teschke, Olaf and Gipp, Bela}, year = {2020}, }
@inproceedings{BibbaseSchubotzGMT20, address = {Virtual Event, China}, title = {Mathematical {Formulae} in {Wikimedia} {Projects} 2020}, isbn = {978-1-4503-7585-6}, url = {https://arxiv.org/abs/2003.09417}, doi = {10/ghn2t2}, abstract = {This poster summarizes our contributions to Wikimedia's processing pipeline for mathematical formulae. We describe how we have supported the transition from rendering formulae as course-grained PNG images in 2001 to providing modern semantically enriched language-independent MathML formulae in 2020. Additionally, we describe our plans to improve the accessibility and discoverability of mathematical knowledge in Wikimedia projects further.}, language = {en}, urldate = {2021-08-02}, booktitle = {Proceedings of the {ACM}/{IEEE} {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {ACM}, author = {Schubotz, Moritz and Greiner-Petter, Andre and Meuschke, Norman and Teschke, Olaf and Gipp, Bela}, month = aug, year = {2020}, note = {Core Rank A*}, pages = {447--448}, }
@article{BibbaseGreinerPetterD19, title = {Automatic {Mathematical} {Information} {Retrieval} to {Perform} {Translations} up to {Computer} {Algebra} {Systems}}, volume = {15}, url = {https://arxiv.org/pdf/2011.14616.pdf}, abstract = {In mathematics, LaTeX is the de facto standard to prepare documents, e.g., scientific publications. While some formulae are still developed using pen and paper, more complicated mathematical expressions used more and more often with computer algebra systems. Mathematical expressions are often manually transcribed to computer algebra systems. The goal of my doctoral thesis is to improve the efficiency of this workflow. My envisioned method will automatically semantically enrich mathematical expressions so that they can be imported to computer algebra systems and other systems which can take advantage of the semantics, such as search engines or automatic plagiarism detection systems. These imports should preserve essential semantic features of the expression.}, number = {1}, journal = {Bulletin of IEEE Technical Committee on Digital Libraries (TCDL)}, author = {Greiner-Petter, Andre}, year = {2019}, }
@inproceedings{BibbaseGreinerPetterRSA19, address = {Paris, France}, title = {Why {Machines} {Cannot} {Learn} {Mathematics}, {Yet}}, volume = {2414}, url = {http://ceur-ws.org/Vol-2414/paper14.pdf}, abstract = {Nowadays, Machine Learning (ML) is seen as the universal solution to improve the effectiveness of information retrieval (IR) methods. However, while mathematics is a precise and accurate science, it is usually expressed by less accurate and imprecise descriptions. Generally, mathematical documents communicate their knowledge with an ambiguous, context-dependent, and non-formal language. In this work, we apply text embedding techniques to the arXiv collection of STEM documents and explore how these are unable to properly understand mathematics from that corpus, while proposing alternative to mitigate such situation.}, booktitle = {Proceedings of the 4th {Joint} {Workshop} on {Bibliometric}-{Enhanced} {Information} {Retrieval} and {Natural} {Language} {Processing} for {Digital} {Libraries} ({BIRNDL}@{SIGIR})}, publisher = {CEUR-WS.org}, author = {Greiner-Petter, Andre and Ruas, Terry and Schubotz, Moritz and Aizawa, Akiko and Grosky, William I. and Gipp, Bela}, year = {2019}, note = {Core Rank A*}, }
@article{BibbaseGreinerPetterSCG19, title = {Semantic {Preserving} {Bijective} {Mappings} for {Expressions} involving {Special} {Functions} between {Computer} {Algebra} {Systems} and {Document} {Preparation} {Systems}}, volume = {71}, issn = {2050-3806}, url = {https://arxiv.org/abs/1906.11485}, doi = {10.1108/AJIM-08-2018-0185}, abstract = {Modern mathematicians and scientists of math-related disciplines often use Document Preparation Systems (DPS) to write and Computer Algebra Systems (CAS) to calculate mathematical expressions. Usually, they translate the expressions manually between DPS and CAS. This process is time-consuming and error-prone. The purpose of this paper is to automate this translation. This paper uses Maple and Mathematica as the CAS, and LaTeX as the DPS. Bruce Miller at the National Institute of Standards and Technology (NIST) developed a collection of special LaTeX macros that create links from mathematical symbols to their definitions in the NIST Digital Library of Mathematical Functions (DLMF). The authors are using these macros to perform rule-based translations between the formulae in the DLMF and CAS. Moreover, the authors develop software to ease the creation of new rules and to discover inconsistencies. The authors created 396 mappings and translated 58.8 percent of DLMF formulae (2,405 expressions) successfully between Maple and DLMF. For a significant percentage, the special function definitions in Maple and the DLMF were different. An atomic symbol in one system maps to a composite expression in the other system. The translator was also successfully used for automatic verification of mathematical online compendia and CAS. The evaluation techniques discovered two errors in the DLMF and one defect in Maple. This paper introduces the first translation tool for special functions between LaTeX and CAS. The approach improves error-prone manual translations and can be used to verify mathematical online compendia and CAS.}, language = {en}, number = {3}, urldate = {2021-09-06}, journal = {Aslib Journal of Information Management}, author = {Greiner-Petter, Andre and Schubotz, Moritz and Cohl, Howard S. and Gipp, Bela}, month = may, year = {2019}, note = {Journal Rank Q1; IF: 2.653}, pages = {415--439}, }
@inproceedings{BibbaseCohlGS18, address = {Hagenberg, Austria}, series = {Lecture {Notes} in {Computer} {Science}}, title = {Automated {Symbolic} and {Numerical} {Testing} of {DLMF} {Formulae} {Using} {Computer} {Algebra} {Systems}}, volume = {11006}, isbn = {978-3-319-96811-7}, url = {https://arxiv.org/abs/2109.08899}, doi = {10/ggv8dn}, abstract = {We have developed an automated procedure for symbolic and numerical testing of formulae extracted from the NIST Digital Library of Mathematical Functions (DLMF). For the NIST Digital Repository of Mathematical Formulae, we have developed conversion tools from semantic LaTeX to the Computer Algebra System (CAS) Maple which relies on Youssef's part-of-math tagger. We convert a test data subset of 4,078 semantic LaTeX DLMF formulae to the native CAS representation and then apply an automated scheme for symbolic and numerical testing and verification. Our framework is implemented using Java and Maple. We describe in detail the conversion process which is required so that the CAS can correctly interpret the mathematical representation of the formulae. We describe the improvement of the effectiveness of our automated scheme through incremental enhancement (making more precise) of the mathematical semantic markup for the formulae.}, urldate = {2021-09-08}, booktitle = {Proceedings {International} {Conference} on {Intelligent} {Computer} {Mathematics} ({CICM})}, publisher = {Springer International Publishing}, author = {Cohl, Howard S. and Greiner-Petter, Andre and Schubotz, Moritz}, editor = {Rabe, Florian and Farmer, William M. and Passmore, Grant O. and Youssef, Abdou}, year = {2018}, pages = {39--52}, }
@inproceedings{BibbaseGreinerPetterD18, address = {Hagenberg, Austria}, title = {Automatic {Mathematical} {Information} {Retrieval} to {Perform} {Translations} up to {Computer} {Algebra} {Systems}}, volume = {2307}, url = {http://ceur-ws.org/Vol-2307/DP1.pdf}, booktitle = {Joint {Proceedings} of the {CME}-{EI}, {FMM}, {CAAT}, {FVPS}, {M3SRD}, {OpenMath} {Workshops}, {Doctoral} {Program} and {Work} in {Progress} at the {Conference} on {Intelligent} {Computer} {Mathematics} ({CICM})}, publisher = {CEUR-WS.org}, author = {Greiner-Petter, Andre}, year = {2018}, }
@inproceedings{BibbaseGreinerPetterSCG18, address = {Hagenberg, Austria}, series = {Lecture {Notes} in {Computer} {Science}}, title = {{MathTools}: {An} {Open} {API} for {Convenient} {MathML} {Handling}}, volume = {11006}, isbn = {978-3-319-96811-7}, shorttitle = {{MathTools}}, url = {https://arxiv.org/abs/2109.08539}, doi = {10.1007/978-3-319-96812-4_9}, abstract = {Mathematical formulae carry complex and essential semantic information in a variety of formats. Accessing this information with different systems requires a standardized machine-readable format that is capable of encoding presentational and semantic information. Even though MathML is an official recommendation by W3C and an ISO standard for representing mathematical expressions, we could identify only very few systems which use the full descriptiveness of MathML. MathML's high complexity results in a steep learning curve for novice users. We hypothesize that this complexity is the reason why many community-driven projects refrain from using MathML, and instead develop problem-specific data formats for their purposes. We provide a user-friendly, open-source application programming interface for controlling MathML data. Our API is written in JAVA and allows to create, manipulate, and efficiently access commonly needed information in presentation and content MathML. Our interface also provides tools for calculating differences and similarities between MathML expressions. The API also allows to determine the distance between expressions using different similarity measures. In addition, we provide adapters for numerous conversion tools and the canonicalization project. Our toolkit facilitates processing of mathematics for digital libraries, without the need to obtain XML expertise.}, urldate = {2021-09-14}, booktitle = {Proceedings of the {International} {Conference} on {Intelligent} {Computer} {Mathematics} ({CICM})}, publisher = {Springer International Publishing}, author = {Greiner-Petter, Andre and Schubotz, Moritz and Cohl, Howard S. and Gipp, Bela}, editor = {Rabe, Florian and Farmer, William M. and Passmore, Grant O. and Youssef, Abdou}, year = {2018}, pages = {104--110}, }
@inproceedings{BibbaseSchubotzGSM18, address = {Fort Worth, Texas, USA}, title = {Improving the {Representation} and {Conversion} of {Mathematical} {Formulae} by {Considering} their {Textual} {Context}}, isbn = {978-1-4503-5178-2}, url = {https://arxiv.org/abs/1804.04956}, doi = {10/ggv8jk}, abstract = {Mathematical formulae represent complex semantic information in a concise form. Especially in Science, Technology, Engineering, and Mathematics, mathematical formulae are crucial to communicate information, e.g., in scientific papers, and to perform computations using computer algebra systems. Enabling computers to access the information encoded in mathematical formulae requires machine-readable formats that can represent both the presentation and content, i.e., the semantics, of formulae. Exchanging such information between systems additionally requires conversion methods for mathematical representation formats. We analyze how the semantic enrichment of formulae improves the format conversion process and show that considering the textual context of formulae reduces the error rate of such conversions. Our main contributions are: (1) providing an openly available benchmark dataset for the mathematical format conversion task consisting of a newly created test collection, an extensive, manually curated gold standard and task-specific evaluation metrics; (2) performing a quantitative evaluation of state-of-the-art tools for mathematical format conversions; (3) presenting a new approach that considers the textual context of formulae to reduce the error rate for mathematical format conversions. Our benchmark dataset facilitates future research on mathematical format conversions as well as research on many problems in mathematical information retrieval. Because we annotated and linked all components of formulae, e.g., identifiers, operators and other entities, to Wikidata entries, the gold standard can, for instance, be used to train methods for formula concept discovery and recognition. Such methods can then be applied to improve mathematical information retrieval systems, e.g., for semantic formula search, recommendation of mathematical content, or detection of mathematical plagiarism.}, language = {en}, urldate = {2021-09-06}, booktitle = {Proceedings of the 18th {ACM}/{IEEE} on {Joint} {Conference} on {Digital} {Libraries} ({JCDL})}, publisher = {ACM}, author = {Schubotz, Moritz and Greiner-Petter, Andre and Scharpf, Philipp and Meuschke, Norman and Cohl, Howard S. and Gipp, Bela}, month = may, year = {2018}, note = {Core Rank A*}, pages = {233--242}, }
@inproceedings{BibbaseCohlSYG17, address = {Edinburgh, UK}, series = {Lecture {Notes} in {Computer} {Science}}, title = {Semantic {Preserving} {Bijective} {Mappings} of {Mathematical} {Formulae} {Between} {Document} {Preparation} {Systems} and {Computer} {Algebra} {Systems}}, volume = {10383}, isbn = {978-3-319-62074-9}, url = {https://arxiv.org/abs/2109.08655}, doi = {10/ggv8dk}, abstract = {There are many different approaches to represent mathematical expressions on computers. Word processors like LaTeX offer the ability to render mathematical expressions as one would write these on paper. Using LaTeX, LaTeXML, and tools generated for use in the NIST Digital Library of Mathematical Functions, semantically enhanced mathematical LaTeX markup (semantic LaTeX) is achieved by using a semantic macro set. Computer algebra systems (CAS) such as Maple and Mathematica use alternative markup to represent mathematical expressions. For the conversion from semantic LaTeX to the CAS representations, we have adapted the approach of Part of Speech Tagging from Natural Language Processing. By taking advantage of Youssef's Part-of-Math Tagger, CAS internal representations, and locally developed software, we develop algorithms to convert between semantic LaTeX and representations from CAS. The goal of these efforts is to provide CAS formulae representations to the public for digital mathematics libraries. In connection with these efforts, we have developed software which has converted between CAS representations through semantic LaTeX to generate MediaWiki wikitext for the NIST Digital Repository of Mathematical Formulae for the Wolfram Encoding Continued Fraction dataset and the University of Antwerp Continued Fractions for Special Functions dataset.}, urldate = {2021-09-08}, booktitle = {Proceedings {International} {Conference} on {Intelligent} {Computer} {Mathematics} ({CICM})}, publisher = {Springer}, author = {Cohl, Howard S. and Schubotz, Moritz and Youssef, Abdou and Greiner-Petter, Andre and Gerhard, Jürgen and Saunders, Bonita V. and McClain, Marjorie A. and Bang, Joon and Chen, Kevin}, editor = {Geuvers, Herman and England, Matthew and Hasan, Osman and Rabe, Florian and Teschke, Olaf}, year = {2017}, pages = {115--131}, }