Improving Statistical Method using Known Terms for Automatic Term Extraction. Fahmi, I., Bouma, G., & Van Der Plas, L. Computational Linguistics in the Netherlands CLIN 17, 2007.
abstract   bibtex   
In this paper, we improve the statistical ranking of multi-word terms using known terms. We make use of linguistic knowledge to extract noun phrases as candidate terms in Dutch. After converting them into bigrams, we compare the performances of eight statistical methods (frequency, dice, log-likelihood, pair-wise mutual information, true mutual information, t-score, chi-square, and C-value) in measuring the bigram association and then we select the best one (log-likelihood) as a baseline and for further improvement. We propose a new scoring method to improve its term ranking by incorporating known terms. For evaluation, we use Elsevier's Medical Encyclopedia and Merck Manual as corpora, and compare the extracted terms against those encoded in the encyclopedia and a list of Dutch health terms collected from the Internet. We also apply manual evaluation for new terms. The evaluation using accuracy and figure of merit indicates that our method improves the ranking and successfully assigns higher scores to new terms.
@comment{Fahmi2007: reference-manager export. The entry key is taken from the
 entry's own citation_key field. Fields after month are exporter bookkeeping
 metadata that standard bibliography styles ignore.}
@article{Fahmi2007,
 author = {Fahmi, Ismail and Bouma, Gosse and Van Der Plas, Lonneke},
 title = {Improving Statistical Method using Known Terms for Automatic Term Extraction},
 journal = {Computational Linguistics in the Netherlands CLIN 17},
 year = {2007},
 month = jan,
 abstract = {In this paper, we improve the statistical ranking of multi-word terms using known terms. We make use of linguistic knowledge to extract noun phrases as candidate terms in Dutch. After converting them into bigrams, we compare the performances of eight statistical methods (frequency, dice, log-likelihood, pair-wise mutual information, true mutual information, t-score, chi-square, and C-value) in measuring the bigram association and then we select the best one (log-likelihood) as a baseline and for further improvement. We propose a new scoring method to improve its term ranking by incorporating known terms. For evaluation, we use Elsevier's Medical Encyclopedia and Merck Manual as corpora, and compare the extracted terms against those encoded in the encyclopedia and a list of Dutch health terms collected from the Internet. We also apply manual evaluation for new terms. The evaluation using accuracy and figure of merit indicates that our method improves the ranking and successfully assigns higher scores to new terms.},
 tags = {keyword extraction,term extraction},
 type = {article},
 bibtype = {article},
 citation_key = {Fahmi2007},
 id = {9e112f2e-047e-305d-9510-7eb02d1fce62},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 created = {2012-02-28T00:51:15.000Z},
 last_modified = {2017-03-14T14:36:19.698Z},
 file_attached = {false},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 private_publication = {false},
}

Downloads: 0