Language Modeling by Clustering with Word Embeddings for Text Readability Assessment

Language Modeling by Clustering with Word Embeddings for Text Readability Assessment. Cha, M., Gwon, Y., & Kung, H. T.

We present a clustering-based language model using word embeddings for text readability prediction. Presumably, an Euclidean semantic space hypothesis holds true for word embeddings whose training is done by observing word co-occurrences. We argue that clustering with word embeddings in the metric space should yield feature representations in a higher semantic space appropriate for text regression. Also, by representing features in terms of histograms, our approach can naturally address documents of varying lengths. An empirical evaluation using the Common Core Standards corpus reveals that the features formed on our clustering-based language model significantly improve the previously known results for the same corpus in readability prediction. We also evaluate the task of sentence matching based on semantic relatedness using the Wiki-SimpleWiki corpus and find that our features lead to superior matching performance.

@comment{Cleaned-up Mendeley/BibBase export. A citation key was added, the year and arXiv eprint fields were derived from the arXiv URL, the meaningless identifiers placeholder was dropped, and PDF-extraction artifacts in the abstract and keywords were repaired. Mendeley bookkeeping fields are kept as-is; standard styles ignore unknown fields.}
@article{cha2017language,
  author              = {Cha, Miriam and Gwon, Youngjune and Kung, H. T.},
  title               = {Language Modeling by Clustering with Word Embeddings for Text Readability Assessment},
  year                = {2017},
  eprint              = {1709.01888},
  archiveprefix       = {arXiv},
  url                 = {https://arxiv.org/pdf/1709.01888.pdf},
  urldate             = {2018-02-05},
  keywords            = {Readability assessment, clustering-based language model},
  abstract            = {We present a clustering-based language model using word embeddings for text readability prediction. Presumably, an Euclidean semantic space hypothesis holds true for word embeddings whose training is done by observing word co-occurrences. We argue that clustering with word embeddings in the metric space should yield feature representations in a higher semantic space appropriate for text regression. Also, by representing features in terms of histograms, our approach can naturally address documents of varying lengths. An empirical evaluation using the Common Core Standards corpus reveals that the features formed on our clustering-based language model significantly improve the previously known results for the same corpus in readability prediction. We also evaluate the task of sentence matching based on semantic relatedness using the Wiki-SimpleWiki corpus and find that our features lead to superior matching performance.},
  type                = {article},
  bibtype             = {article},
  websites            = {https://arxiv.org/pdf/1709.01888.pdf},
  id                  = {91c05b91-b5c4-344e-890d-9dda9786f0b4},
  created             = {2018-02-05T19:14:20.670Z},
  accessed            = {2018-02-05},
  file_attached       = {true},
  profile_id          = {371589bb-c770-37ff-8193-93c6f25ffeb1},
  group_id            = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},
  last_modified       = {2018-02-05T19:14:23.177Z},
  read                = {false},
  starred             = {false},
  authored            = {false},
  confirmed           = {false},
  hidden              = {false},
  private_publication = {false},
}

Downloads: 0

{"_id":"MXu74XqgsiRMPuTkv","bibbaseid":"cha-gwon-kung-languagemodelingbyclusteringwithwordembeddingsfortextreadabilityassessment","downloads":0,"creationDate":"2018-02-07T16:22:57.341Z","title":"Language Modeling by Clustering with Word Embeddings for Text Readability Assessment","author_short":["Cha, M.","Gwon, Y.","Kung, H., T."],"year":null,"bibtype":"article","biburl":null,"bibdata":{"title":"Language Modeling by Clustering with Word Embeddings for Text Readability Assessment","type":"article","identifiers":"[object Object]","keywords":"KEYWORDS Readability assessment,clustering-based language model","websites":"https://arxiv.org/pdf/1709.01888.pdf","id":"91c05b91-b5c4-344e-890d-9dda9786f0b4","created":"2018-02-05T19:14:20.670Z","accessed":"2018-02-05","file_attached":"true","profile_id":"371589bb-c770-37ff-8193-93c6f25ffeb1","group_id":"f982cd63-7ceb-3aa2-ac7e-a953963d6716","last_modified":"2018-02-05T19:14:23.177Z","read":false,"starred":false,"authored":false,"confirmed":false,"hidden":false,"private_publication":false,"abstract":"We present a clustering-based language model using word em-beddings for text readability prediction. Presumably, an Euclidean semantic space hypothesis holds true for word embeddings whose training is done by observing word co-occurrences. We argue that clustering with word embeddings in the metric space should yield feature representations in a higher semantic space appropriate for text regression. Also, by representing features in terms of his-tograms, our approach can naturally address documents of varying lengths. An empirical evaluation using the Common Core Standards corpus reveals that the features formed on our clustering-based language model signiicantly improve the previously known results for the same corpus in readability prediction. 
We also evaluate the task of sentence matching based on semantic relatedness using the Wiki-SimpleWiki corpus and dnd that our features lead to superior matching performance.","bibtype":"article","author":"Cha, Miriam and Gwon, Youngjune and Kung, H T","bibtex":"@article{\n title = {Language Modeling by Clustering with Word Embeddings for Text Readability Assessment},\n type = {article},\n identifiers = {[object Object]},\n keywords = {KEYWORDS Readability assessment,clustering-based language model},\n websites = {https://arxiv.org/pdf/1709.01888.pdf},\n id = {91c05b91-b5c4-344e-890d-9dda9786f0b4},\n created = {2018-02-05T19:14:20.670Z},\n accessed = {2018-02-05},\n file_attached = {true},\n profile_id = {371589bb-c770-37ff-8193-93c6f25ffeb1},\n group_id = {f982cd63-7ceb-3aa2-ac7e-a953963d6716},\n last_modified = {2018-02-05T19:14:23.177Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {false},\n hidden = {false},\n private_publication = {false},\n abstract = {We present a clustering-based language model using word em-beddings for text readability prediction. Presumably, an Euclidean semantic space hypothesis holds true for word embeddings whose training is done by observing word co-occurrences. We argue that clustering with word embeddings in the metric space should yield feature representations in a higher semantic space appropriate for text regression. Also, by representing features in terms of his-tograms, our approach can naturally address documents of varying lengths. An empirical evaluation using the Common Core Standards corpus reveals that the features formed on our clustering-based language model signiicantly improve the previously known results for the same corpus in readability prediction. 
We also evaluate the task of sentence matching based on semantic relatedness using the Wiki-SimpleWiki corpus and dnd that our features lead to superior matching performance.},\n bibtype = {article},\n author = {Cha, Miriam and Gwon, Youngjune and Kung, H T}\n}","author_short":["Cha, M.","Gwon, Y.","Kung, H., T."],"urls":{"Paper":"http://bibbase.org/service/mendeley/371589bb-c770-37ff-8193-93c6f25ffeb1/file/4f934215-48a2-ae0b-3a1b-06a7f79f2757/Language_Modeling_by_Clustering_with_Word_Embeddings_for_Text_Readability_Assessment.pdf.pdf","Website":"https://arxiv.org/pdf/1709.01888.pdf"},"bibbaseid":"cha-gwon-kung-languagemodelingbyclusteringwithwordembeddingsfortextreadabilityassessment","role":"author","keyword":["KEYWORDS Readability assessment","clustering-based language model"],"downloads":0},"search_terms":["language","modeling","clustering","word","embeddings","text","readability","assessment","cha","gwon","kung"],"keywords":["keywords readability assessment","clustering-based language model"],"authorIDs":[]}