Software Framework for Topic Modelling with Large Corpora

Software Framework for Topic Modelling with Large Corpora. Řehůřek, R. & Sojka, P.

Large corpora are ubiquitous in today's world and memory quickly becomes the limiting factor in practical applications of the Vector Space Model (VSM). In this paper, we identify a gap in existing implementations of many of the popular algorithms, which is their scalability and ease of use. We describe a Natural Language Processing software framework which is based on the idea of document streaming, i.e. processing corpora document after document, in a memory independent fashion. Within this framework, we implement several popular algorithms for topical inference, including Latent Semantic Analysis and Latent Dirichlet Allocation, in a way that makes them completely independent of the training corpus size. Particular emphasis is placed on straightforward and intuitive framework design, so that modifications and extensions of the methods and/or their application by interested practitioners are effortless. We demonstrate the usefulness of our approach on a real-world scenario of computing document similarities within an existing digital library DML-CZ.

@comment{Workshop paper presented at LREC 2010 (see the lrec2010 PDF link in
 the entry), hence inproceedings rather than article. The first author's
 surname is written with LaTeX accent groups so it sorts and prints correctly
 under classic BibTeX. The Mendeley export fields (type, websites, id, created,
 ..., bibtype) are carried over exactly as exported.}
@inproceedings{rehurek2010software,
 author = {{\v{R}}eh{\r{u}}{\v{r}}ek, Radim and Sojka, Petr},
 title = {Software Framework for Topic Modelling with Large Corpora},
 booktitle = {Proceedings of the {LREC} 2010 Workshop on New Challenges for {NLP} Frameworks},
 pages = {45--50},
 year = {2010},
 month = may,
 publisher = {ELRA},
 url = {https://radimrehurek.com/gensim/lrec2010_final.pdf},
 urldate = {2018-02-05},
 type = {article},
 websites = {https://radimrehurek.com/gensim/lrec2010_final.pdf},
 id = {8b891772-df36-3ddf-ab39-d344510b3c7b},
 created = {2018-02-05T17:50:23.275Z},
 accessed = {2018-02-05},
 file_attached = {true},
 profile_id = {371589bb-c770-37ff-8193-93c6f25ffeb1},
 group_id = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},
 last_modified = {2018-02-05T17:50:25.079Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {false},
 hidden = {false},
 private_publication = {false},
 abstract = {Large corpora are ubiquitous in today's world and memory quickly becomes the limiting factor in practical applications of the Vector Space Model (VSM). In this paper, we identify a gap in existing implementations of many of the popular algorithms, which is their scalability and ease of use. We describe a Natural Language Processing software framework which is based on the idea of document streaming, i.e. processing corpora document after document, in a memory independent fashion. Within this framework, we implement several popular algorithms for topical inference, including Latent Semantic Analysis and Latent Dirichlet Allocation, in a way that makes them completely independent of the training corpus size. Particular emphasis is placed on straightforward and intuitive framework design, so that modifications and extensions of the methods and/or their application by interested practitioners are effortless. We demonstrate the usefulness of our approach on a real-world scenario of computing document similarities within an existing digital library DML-CZ.},
 bibtype = {article},
}

Downloads: 0

{"_id":"98aPm6HRdRsyWRzB7","bibbaseid":"re-sojka-softwareframeworkfortopicmodellingwithlargecorpora","downloads":0,"creationDate":"2018-02-07T16:22:57.311Z","title":"Software Framework for Topic Modelling with Large Corpora","author_short":["Re, R.","Sojka, P."],"year":null,"bibtype":"article","biburl":null,"bibdata":{"title":"Software Framework for Topic Modelling with Large Corpora","type":"article","websites":"https://radimrehurek.com/gensim/lrec2010_final.pdf","id":"8b891772-df36-3ddf-ab39-d344510b3c7b","created":"2018-02-05T17:50:23.275Z","accessed":"2018-02-05","file_attached":"true","profile_id":"371589bb-c770-37ff-8193-93c6f25ffeb1","group_id":"f982cd63-7ceb-3aa2-ac7e-a953963d6716","last_modified":"2018-02-05T17:50:25.079Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"private_publication":false,"abstract":"Large corpora are ubiquitous in today's world and memory quickly becomes the limiting factor in practical applications of the Vector Space Model (VSM). In this paper, we identify a gap in existing implementations of many of the popular algorithms, which is their scalability and ease of use. We describe a Natural Language Processing software framework which is based on the idea of document streaming, i.e. processing corpora document after document, in a memory independent fashion. Within this framework, we implement several popular algorithms for topical inference, including Latent Semantic Analysis and Latent Dirichlet Allocation, in a way that makes them completely independent of the training corpus size. Particular emphasis is placed on straightforward and intuitive framework design, so that modifications and extensions of the methods and/or their application by interested practitioners are effortless. 
We demonstrate the usefulness of our approach on a real-world scenario of computing document similarities within an existing digital library DML-CZ.","bibtype":"article","author":"Re, Radi and Sojka, Petr","bibtex":"@article{\n title = {Software Framework for Topic Modelling with Large Corpora},\n type = {article},\n websites = {https://radimrehurek.com/gensim/lrec2010_final.pdf},\n id = {8b891772-df36-3ddf-ab39-d344510b3c7b},\n created = {2018-02-05T17:50:23.275Z},\n accessed = {2018-02-05},\n file_attached = {true},\n profile_id = {371589bb-c770-37ff-8193-93c6f25ffeb1},\n group_id = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},\n last_modified = {2018-02-05T17:50:25.079Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n private_publication = {false},\n abstract = {Large corpora are ubiquitous in today's world and memory quickly becomes the limiting factor in practical applications of the Vector Space Model (VSM). In this paper, we identify a gap in existing implementations of many of the popular algorithms, which is their scalability and ease of use. We describe a Natural Language Processing software framework which is based on the idea of document streaming, i.e. processing corpora document after document, in a memory independent fashion. Within this framework, we implement several popular algorithms for topical inference, including Latent Semantic Analysis and Latent Dirichlet Allocation, in a way that makes them completely independent of the training corpus size. Particular emphasis is placed on straightforward and intuitive framework design, so that modifications and extensions of the methods and/or their application by interested practitioners are effortless. 
We demonstrate the usefulness of our approach on a real-world scenario of computing document similarities within an existing digital library DML-CZ.},\n bibtype = {article},\n author = {Re, Radi and Sojka, Petr}\n}","author_short":["Re, R.","Sojka, P."],"urls":{"Paper":"http://bibbase.org/service/mendeley/371589bb-c770-37ff-8193-93c6f25ffeb1/file/6f328870-3f57-198b-2f94-5163b9bb4887/Software_Framework_for_Topic_Modelling_with_Large_Corpora.pdf.pdf","Website":"https://radimrehurek.com/gensim/lrec2010_final.pdf"},"bibbaseid":"re-sojka-softwareframeworkfortopicmodellingwithlargecorpora","role":"author","downloads":0},"search_terms":["software","framework","topic","modelling","large","corpora","re","sojka"],"keywords":[],"authorIDs":[]}