A Latent Concept Topic Model for Robust Topic Inference Using Word Embeddings. Hu, W. & Tsujii, J., 2016.
Abstract: Uncovering thematic structures of SNS and blog posts is a crucial yet challenging task, because of the severe data sparsity induced by the short length of texts and diverse use of vocabulary. This hinders effective topic inference of traditional LDA because it infers topics based on document-level co-occurrence of words. To robustly infer topics in such contexts, we propose a latent concept topic model (LCTM). Unlike LDA, LCTM reveals topics via co-occurrence of latent concepts, which we introduce as latent variables to capture conceptual similarity of words. More specifically, LCTM models each topic as a distribution over the latent concepts, where each latent concept is a localized Gaussian distribution over the word embedding space. Since the number of unique concepts in a corpus is often much smaller than the number of unique words, LCTM is less susceptible to the data sparsity. Experiments on the 20Newsgroups show the effectiveness of LCTM in dealing with short texts as well as the capability of the model in handling held-out documents with a high degree of OOV words.
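To make the generative story in the abstract concrete, the following is a minimal NumPy sketch, not the authors' implementation: all sizes and hyperparameter names (K, C, D, alpha, beta, sigma) are illustrative assumptions, and in the actual model each observed word's pre-trained embedding is scored under its concept's Gaussian during inference rather than freshly sampled.

import numpy as np

# Illustrative sizes and priors (our assumptions, not from the paper).
K, C, D = 20, 100, 50     # topics, latent concepts, embedding dimension
alpha, beta = 0.1, 0.1    # Dirichlet concentrations for topics / concepts
sigma = 0.5               # spread of each localized concept Gaussian

rng = np.random.default_rng(0)

# Each latent concept is a localized Gaussian in word-embedding space;
# here its mean is drawn from a broad zero-centered prior.
concept_means = rng.normal(0.0, 1.0, size=(C, D))

# Unlike LDA, each topic is a distribution over concepts, not words.
topic_concept = rng.dirichlet(np.full(C, beta), size=K)

def generate_document(n_words):
    """Sample word embeddings for one document under the sketched model."""
    theta = rng.dirichlet(np.full(K, alpha))     # document-topic mixture
    draws = []
    for _ in range(n_words):
        z = rng.choice(K, p=theta)               # topic assignment
        c = rng.choice(C, p=topic_concept[z])    # latent concept assignment
        v = rng.normal(concept_means[c], sigma)  # embedding from the concept
        draws.append(v)                          # a real corpus would map v to
    return np.stack(draws)                       # the nearest vocabulary word

doc = generate_document(10)
print(doc.shape)  # (10, 50)

Because a document's words only need to share concepts rather than exact word forms, this construction illustrates why the model is less sensitive to short texts and out-of-vocabulary words than LDA.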
@article{hu2016lctm,
title = {A Latent Concept Topic Model for Robust Topic Inference Using Word Embeddings},
pages = {380-386},
url = {https://pdfs.semanticscholar.org/68d1/26a8a7080b7a67219c27456c873543376393.pdf},
abstract = {Uncovering thematic structures of SNS and blog posts is a crucial yet challenging task, because of the severe data sparsity induced by the short length of texts and diverse use of vocabulary. This hinders effective topic inference of traditional LDA because it infers topics based on document-level co-occurrence of words. To robustly infer topics in such contexts, we propose a latent concept topic model (LCTM). Unlike LDA, LCTM reveals topics via co-occurrence of latent concepts, which we introduce as latent variables to capture conceptual similarity of words. More specifically, LCTM models each topic as a distribution over the latent concepts, where each latent concept is a localized Gaussian distribution over the word embedding space. Since the number of unique concepts in a corpus is often much smaller than the number of unique words, LCTM is less susceptible to the data sparsity. Experiments on the 20Newsgroups show the effectiveness of LCTM in dealing with short texts as well as the capability of the model in handling held-out documents with a high degree of OOV words.},
author = {Hu, Weihua and Tsujii, Jun'ichi},
year = {2016}
}