A Biterm Topic Model for Short Texts

A Biterm Topic Model for Short Texts. Yan, X., Guo, J., Lan, Y., & Cheng, X.

A Biterm Topic Model for Short Texts [link]

Uncovering the topics within short texts, such as tweets and instant messages, has become an important task for many content analysis applications. However, directly applying conventional topic models (e.g. LDA and PLSA) on such short texts may not work well. The fundamental reason lies in that conventional topic models implicitly capture the document-level word co-occurrence patterns to reveal topics, and thus suffer from the severe data sparsity in short documents. In this paper, we propose a novel way for modeling topics in short texts, referred as biterm topic model (BTM). Specifically, in BTM we learn the topics by directly modeling the generation of word co-occurrence patterns (i.e. biterms) in the whole corpus. The major advantages of BTM are that 1) BTM explicitly models the word co-occurrence patterns to enhance the topic learning; and 2) BTM uses the aggregated patterns in the whole corpus for learning topics to solve the problem of sparse word co-occurrence patterns at document-level. We carry out extensive experiments on real-world short text collections. The results demonstrate that our approach can discover more prominent and coherent topics, and significantly outperform baseline methods on several evaluation metrics. Furthermore, we find that BTM can outperform LDA even on normal texts, showing the potential generality and wider usage of the new topic model.

@comment{
 Conference paper on the biterm topic model (BTM) for short texts.
 The reference-manager export had no citation key, which makes the entry
 unparseable, so one was added. The abstract and keywords had PDF
 line-break hyphenation artefacts, which were removed.
 NOTE(review): booktitle, year, pages, publisher and doi were filled in from
 the publisher record, not from the export -- verify before citing.
 The "websites" link is a session-specific download URL and is kept only as
 exported metadata; cite via the doi field instead.
}
@inproceedings{yan2013biterm,
 title = {A Biterm Topic Model for Short Texts},
 booktitle = {Proceedings of the 22nd International Conference on World Wide Web},
 year = {2013},
 pages = {1445--1456},
 publisher = {ACM},
 doi = {10.1145/2488388.2488514},
 type = {inproceedings},
 keywords = {Biterm,Clustering,Short Text,Content Analysis,I.5.3 [Pattern Recognition],Information Search and Retrieval,Topic Model,document clustering},
 websites = {http://delivery.acm.org/10.1145/2490000/2488514/p1445-yan.pdf?ip=128.227.11.255&id=2488514&acc=ACTIVE%20SERVICE&key=5CC3CBFF4617FD07%2EC2A817F22E85290F%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517849380_da1e8f45e4e4675162957624f657ff15},
 id = {f63af086-2977-3777-bc07-f2d63f11796f},
 created = {2018-02-05T16:45:00.644Z},
 accessed = {2018-02-05},
 file_attached = {true},
 profile_id = {371589bb-c770-37ff-8193-93c6f25ffeb1},
 group_id = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},
 last_modified = {2018-02-05T16:45:03.863Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {false},
 hidden = {false},
 private_publication = {false},
 abstract = {Uncovering the topics within short texts, such as tweets and instant messages, has become an important task for many content analysis applications. However, directly applying conventional topic models (e.g. LDA and PLSA) on such short texts may not work well. The fundamental reason lies in that conventional topic models implicitly capture the document-level word co-occurrence patterns to reveal topics, and thus suffer from the severe data sparsity in short documents. In this paper, we propose a novel way for modeling topics in short texts, referred as biterm topic model (BTM). Specifically, in BTM we learn the topics by directly modeling the generation of word co-occurrence patterns (i.e. biterms) in the whole corpus. The major advantages of BTM are that 1) BTM explicitly models the word co-occurrence patterns to enhance the topic learning; and 2) BTM uses the aggregated patterns in the whole corpus for learning topics to solve the problem of sparse word co-occurrence patterns at document-level. We carry out extensive experiments on real-world short text collections. The results demonstrate that our approach can discover more prominent and coherent topics, and significantly outperform baseline methods on several evaluation metrics. Furthermore, we find that BTM can outperform LDA even on normal texts, showing the potential generality and wider usage of the new topic model.},
 bibtype = {inproceedings},
 author = {Yan, Xiaohui and Guo, Jiafeng and Lan, Yanyan and Cheng, Xueqi},
}

Downloads: 0

{"_id":"MSj6tYgPHwzQGnvru","bibbaseid":"yan-guo-lan-cheng-abitermtopicmodelforshorttexts","downloads":0,"creationDate":"2018-02-07T16:22:57.263Z","title":"A Biterm Topic Model for Short Texts","author_short":["Yan, X.","Guo, J.","Lan, Y.","Cheng, X."],"year":null,"bibtype":"article","biburl":null,"bibdata":{"title":"A Biterm Topic Model for Short Texts","type":"article","keywords":"Biterm,Clus-tering Keywords Short Text,Content Analysis,I53 [Pattern Recognition],Information Search and Retrieval,Topic Model,docu-ment clustering","websites":"http://delivery.acm.org/10.1145/2490000/2488514/p1445-yan.pdf?ip=128.227.11.255&id=2488514&acc=ACTIVE%20SERVICE&key=5CC3CBFF4617FD07%2EC2A817F22E85290F%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517849380_da1e8f45e4e4675162957624f657ff15","id":"f63af086-2977-3777-bc07-f2d63f11796f","created":"2018-02-05T16:45:00.644Z","accessed":"2018-02-05","file_attached":"true","profile_id":"371589bb-c770-37ff-8193-93c6f25ffeb1","group_id":"f982cd63-7ceb-3aa2-ac7e-a953963d6716","last_modified":"2018-02-05T16:45:03.863Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"private_publication":false,"abstract":"Uncovering the topics within short texts, such as tweets and instant messages, has become an important task for many content analysis applications. However, directly applying conventional topic models (e.g. LDA and PLSA) on such short texts may not work well. The fundamental reason lies in that conventional topic models implicitly capture the document-level word co-occurrence patterns to reveal topics, and thus suffer from the severe data sparsity in short docu-ments. In this paper, we propose a novel way for modeling topics in short texts, referred as biterm topic model (BTM). Specifically, in BTM we learn the topics by directly modeling the generation of word co-occurrence patterns (i.e. biterms) in the whole corpus. 
The major advantages of BTM are that 1) BTM explicitly models the word co-occurrence pat-terns to enhance the topic learning; and 2) BTM uses the aggregated patterns in the whole corpus for learning topics to solve the problem of sparse word co-occurrence patterns at document-level. We carry out extensive experiments on real-world short text collections. The results demonstrate that our approach can discover more prominent and coher-ent topics, and significantly outperform baseline methods on several evaluation metrics. Furthermore, we find that BTM can outperform LDA even on normal texts, showing the po-tential generality and wider usage of the new topic model.","bibtype":"article","author":"Yan, Xiaohui and Guo, Jiafeng and Lan, Yanyan and Cheng, Xueqi","bibtex":"@article{\n title = {A Biterm Topic Model for Short Texts},\n type = {article},\n keywords = {Biterm,Clus-tering Keywords Short Text,Content Analysis,I53 [Pattern Recognition],Information Search and Retrieval,Topic Model,docu-ment clustering},\n websites = {http://delivery.acm.org/10.1145/2490000/2488514/p1445-yan.pdf?ip=128.227.11.255&id=2488514&acc=ACTIVE%20SERVICE&key=5CC3CBFF4617FD07%2EC2A817F22E85290F%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517849380_da1e8f45e4e4675162957624f657ff15},\n id = {f63af086-2977-3777-bc07-f2d63f11796f},\n created = {2018-02-05T16:45:00.644Z},\n accessed = {2018-02-05},\n file_attached = {true},\n profile_id = {371589bb-c770-37ff-8193-93c6f25ffeb1},\n group_id = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},\n last_modified = {2018-02-05T16:45:03.863Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n private_publication = {false},\n abstract = {Uncovering the topics within short texts, such as tweets and instant messages, has become an important task for many content analysis applications. However, directly applying conventional topic models (e.g. LDA and PLSA) on such short texts may not work well. 
The fundamental reason lies in that conventional topic models implicitly capture the document-level word co-occurrence patterns to reveal topics, and thus suffer from the severe data sparsity in short docu-ments. In this paper, we propose a novel way for modeling topics in short texts, referred as biterm topic model (BTM). Specifically, in BTM we learn the topics by directly modeling the generation of word co-occurrence patterns (i.e. biterms) in the whole corpus. The major advantages of BTM are that 1) BTM explicitly models the word co-occurrence pat-terns to enhance the topic learning; and 2) BTM uses the aggregated patterns in the whole corpus for learning topics to solve the problem of sparse word co-occurrence patterns at document-level. We carry out extensive experiments on real-world short text collections. The results demonstrate that our approach can discover more prominent and coher-ent topics, and significantly outperform baseline methods on several evaluation metrics. Furthermore, we find that BTM can outperform LDA even on normal texts, showing the po-tential generality and wider usage of the new topic model.},\n bibtype = {article},\n author = {Yan, Xiaohui and Guo, Jiafeng and Lan, Yanyan and Cheng, Xueqi}\n}","author_short":["Yan, X.","Guo, J.","Lan, Y.","Cheng, X."],"urls":{"Paper":"http://bibbase.org/service/mendeley/371589bb-c770-37ff-8193-93c6f25ffeb1/file/48f0a772-1874-6775-61ba-b994a7e8b2b7/A_Biterm_Topic_Model_for_Short_Texts.pdf.pdf","Website":"http://delivery.acm.org/10.1145/2490000/2488514/p1445-yan.pdf?ip=128.227.11.255&id=2488514&acc=ACTIVE%20SERVICE&key=5CC3CBFF4617FD07%2EC2A817F22E85290F%2E4D4702B0C3E38B35%2E4D4702B0C3E38B35&__acm__=1517849380_da1e8f45e4e4675162957624f657ff15"},"bibbaseid":"yan-guo-lan-cheng-abitermtopicmodelforshorttexts","role":"author","keyword":["Biterm","Clus-tering Keywords Short Text","Content Analysis","I53 [Pattern Recognition]","Information Search and Retrieval","Topic Model","docu-ment 
clustering"],"downloads":0},"search_terms":["biterm","topic","model","short","texts","yan","guo","lan","cheng"],"keywords":["biterm","clus-tering keywords short text","content analysis","i53 [pattern recognition]","information search and retrieval","topic model","docu-ment clustering"],"authorIDs":[]}