Beyond lexical frequencies: using R for text analysis in the digital humanities. Arnold, T., Ballier, N., Lissón, P., & Tilton, L. Language Resources and Evaluation, 53(4):707–733, December, 2019.
This paper presents a combination of R packages—user-contributed toolkits written in a common core programming language—to facilitate the humanistic investigation of digitised, text-based corpora. Our survey of text analysis packages includes those of our own creation (cleanNLP and fasttextM) as well as packages built by other research groups (stringi, readtext, hyphenatr, quanteda, and hunspell). By operating on generic object types, these packages unite research innovations in corpus linguistics, natural language processing, machine learning, statistics, and digital humanities. We begin by extrapolating on the theoretical benefits of R as an elaborate gluing language for bringing together several areas of expertise and compare it to linguistic concordancers and other tool-based approaches to text analysis in the digital humanities. We then showcase the practical benefits of an ecosystem by illustrating how R packages have been integrated into a digital humanities project. Throughout, the focus is on moving beyond the bag-of-words, lexical frequency model by incorporating linguistically-driven analyses in research.
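As a rough illustration of the package ecosystem the abstract describes (a sketch, not code from the paper), the snippet below builds a plain bag-of-words document-feature matrix with quanteda and then adds token-level linguistic annotation with cleanNLP; the function names reflect current CRAN releases and the sample documents are invented for the example.

library(quanteda)   # tokenisation and document-feature matrices
library(cleanNLP)   # linguistically annotated tokens (lemma, part of speech)

docs <- c(doc1 = "The quick brown fox jumps over the lazy dog.",
          doc2 = "Digital humanities projects often analyse large text corpora.")

# Lexical-frequency baseline: a document-feature matrix of word counts.
dfm_counts <- dfm(tokens(corpus(docs), remove_punct = TRUE))

# Beyond lexical frequencies: annotate the same texts with cleanNLP's
# udpipe backend, yielding one row per token with lemma and POS columns.
cnlp_init_udpipe()
anno <- cnlp_annotate(docs)
head(anno$token)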
@article{arnold2019b,
	title = {Beyond lexical frequencies: using {R} for text analysis in the digital humanities},
	volume = {53},
	issn = {1574-020X, 1574-0218},
	shorttitle = {Beyond lexical frequencies},
	url = {http://link.springer.com/10.1007/s10579-019-09456-6},
	doi = {10.1007/s10579-019-09456-6},
	abstract = {This paper presents a combination of R packages—user-contributed toolkits written in a common core programming language—to facilitate the humanistic investigation of digitised, text-based corpora. Our survey of text analysis packages includes those of our own creation (cleanNLP and fasttextM) as well as packages built by other research groups (stringi, readtext, hyphenatr, quanteda, and hunspell). By operating on generic object types, these packages unite research innovations in corpus linguistics, natural language processing, machine learning, statistics, and digital humanities. We begin by extrapolating on the theoretical benefits of R as an elaborate gluing language for bringing together several areas of expertise and compare it to linguistic concordancers and other tool-based approaches to text analysis in the digital humanities. We then showcase the practical benefits of an ecosystem by illustrating how R packages have been integrated into a digital humanities project. Throughout, the focus is on moving beyond the bag-of-words, lexical frequency model by incorporating linguistically-driven analyses in research.},
	language = {en},
	number = {4},
	urldate = {2024-03-05},
	journal = {Language Resources and Evaluation},
	author = {Arnold, Taylor and Ballier, Nicolas and Lissón, Paula and Tilton, Lauren},
	month = dec,
	year = {2019},
	pages = {707--733},
}
