Constructing Evaluation Corpora for Automated Clinical Named Entity Recognition. Ogren, P. V., Savova, G. K., & Chute, C. G. In Proceedings of the 6th International Conference on Language Resources and Evaluation (LREC 2008), pages 3143-3150, 2008. European Language Resources Association (ELRA).
We report on the construction of a gold-standard dataset consisting of annotated clinical notes suitable for evaluating our biomedical named entity recognition system. The dataset is the result of consensus between four human annotators and contains 1,556 annotations on 160 clinical notes using 658 unique concept codes from SNOMED-CT corresponding to human disorders. Inter-annotator agreement was calculated on annotations from 100 of the documents for span (90.9%), concept code (81.7%), context (84.8%), and status (86.0%) agreement. Complete agreement for span, concept code, context, and status was 74.6%. We found that creating a consensus set based on annotations from two independently-created annotation sets can reduce inter-annotator disagreement by 32.3%. We found little benefit to pre-annotating the corpus with a third-party named entity recognizer.
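The abstract quotes per-attribute agreement percentages for span, concept code, context, and status. The following is a minimal sketch, not the authors' code, of how pairwise percent agreement between two annotators' annotation sets might be computed; the data structure, field names (concept_code, context, status), and the exact-span matching strategy are all assumptions made for illustration.

from dataclasses import dataclass

@dataclass(frozen=True)
class Annotation:
    doc_id: str          # clinical note identifier
    start: int           # character offset where the annotated span begins
    end: int             # character offset where the annotated span ends
    concept_code: str    # SNOMED-CT disorder code
    context: str         # assumed encoding, e.g. patient vs. family member
    status: str          # assumed encoding, e.g. current vs. historical

def span_agreement(set_a, set_b):
    """Spans marked identically by both annotators, relative to the union of spans."""
    spans_a = {(a.doc_id, a.start, a.end) for a in set_a}
    spans_b = {(b.doc_id, b.start, b.end) for b in set_b}
    union = spans_a | spans_b
    return 100.0 * len(spans_a & spans_b) / len(union) if union else 0.0

def attribute_agreement(set_a, set_b, attribute):
    """Share of span-matched annotation pairs that also agree on `attribute`."""
    index_b = {(b.doc_id, b.start, b.end): b for b in set_b}
    matched = [(a, index_b[(a.doc_id, a.start, a.end)])
               for a in set_a if (a.doc_id, a.start, a.end) in index_b]
    if not matched:
        return 0.0
    agree = sum(getattr(x, attribute) == getattr(y, attribute) for x, y in matched)
    return 100.0 * agree / len(matched)

Under these assumptions, attribute_agreement(set_a, set_b, "concept_code") would give a figure comparable in spirit to the 81.7% concept-code agreement reported above, though the paper's exact matching and consensus procedures may differ.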
@inProceedings{S2008,
 title = {Constructing Evaluation Corpora for Automated Clinical Named Entity Recognition},
 type = {inProceedings},
 year = {2008},
 pages = {3143-3150},
 websites = {http://www.lrec-conf.org/proceedings/lrec2008/},
 publisher = {European Language Resources Association (ELRA)},
 id = {d9b0db98-b54f-3b3c-8b4c-90884f2aa3ce},
 created = {2011-12-28T07:04:55.000Z},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 tags = {named entities},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {S2008},
 private_publication = {false},
 abstract = {We report on the construction of a gold-standard dataset consisting of annotated clinical notes suitable for evaluating our biomedical named entity recognition system. The dataset is the result of consensus between four human annotators and contains 1,556 annotations on 160 clinical notes using 658 unique concept codes from SNOMED-CT corresponding to human disorders. Inter-annotator agreement was calculated on annotations from 100 of the documents for span (90.9%), concept code (81.7%), context (84.8%), and status (86.0%) agreement. Complete agreement for span, concept code, context, and status was 74.6%. We found that creating a consensus set based on annotations from two independently-created annotation sets can reduce inter-annotator disagreement by 32.3%. We found little benefit to pre-annotating the corpus with a third-party named entity recognizer.},
 bibtype = {inProceedings},
 author = {Ogren, Philip V. and Savova, Guergana K. and Chute, Christopher G.},
 booktitle = {Proceedings of the 6th International Conference on Language Resources and Evaluation (LREC 2008)}
}
