WebSets: extracting sets of entities from the web using unsupervised information extraction. Dalvi, B. B., Cohen, W. W., & Callan, J. In Proceedings of the fifth ACM international conference on Web search and data mining - WSDM '12, page 243, 2012. ACM Press.
WebSets: extracting sets of entities from the web using unsupervised information extraction [link]Website  abstract   bibtex   
We describe an open-domain information extraction method for extracting concept-instance pairs from an HTML corpus. Most earlier approaches to this problem rely on combining clusters of distributionally similar terms and concept-instance pairs obtained with Hearst patterns. In contrast, our method relies on a novel approach for clustering terms found in HTML tables, and then assigning concept names to these clusters using Hearst patterns. The method can be efficiently applied to a large corpus, and experimental results on several datasets show that our method can accurately extract large numbers of concept-instance pairs.
@comment{
  Conference paper: WebSets (Dalvi, Cohen, Callan), WSDM 2012.
  Standard bibliographic fields come first; the remaining fields are
  reference-manager bookkeeping that BibTeX styles ignore.
  NOTE(review): pages holds only the first page (243); the end page is
  not recorded here -- confirm against the publisher record.
  NOTE(review): doi was derived from the doid parameter of the ACM URL
  in this entry -- confirm it resolves.
}
@inproceedings{Dalvi2012,
 author = {Dalvi, Bhavana Bharat and Cohen, William W. and Callan, Jamie},
 title = {{WebSets}: extracting sets of entities from the web using unsupervised information extraction},
 booktitle = {Proceedings of the fifth {ACM} international conference on Web search and data mining -- {WSDM} '12},
 year = {2012},
 pages = {243},
 publisher = {ACM Press},
 address = {New York, New York, USA},
 doi = {10.1145/2124295.2124327},
 url = {http://dl.acm.org/citation.cfm?doid=2124295.2124327},
 abstract = {We describe an open-domain information extraction method for extracting concept-instance pairs from an HTML corpus. Most earlier approaches to this problem rely on combining clusters of distributionally similar terms and concept-instance pairs obtained with Hearst patterns. In contrast, our method relies on a novel approach for clustering terms found in HTML tables, and then assigning concept names to these clusters using Hearst patterns. The method can be efficiently applied to a large corpus, and experimental results on several datasets show that our method can accurately extract large numbers of concept-instance pairs.},
 type = {inProceedings},
 bibtype = {inProceedings},
 id = {0778e0d0-e371-3ede-8eba-aac11d02526a},
 created = {2012-04-01T16:32:49.000Z},
 accessed = {2012-03-28},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 tags = {named entities},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Dalvi2012},
 private_publication = {false}
}

Downloads: 0