Adaptive record extraction from web pages. Park, J. & Barbosa, D. Proceedings of the 16th International Conference on World Wide Web (WWW '07), ACM Press, 2007.
Adaptive record extraction from web pages [link]Website  abstract   bibtex   
We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns. We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entries on blogs).
@comment{
  Park2007: conference paper from the WWW 2007 proceedings.
  Cleaned from a reference-manager export:
    - citation key restored from the exported citation_key field;
    - entry type changed to inproceedings, with the venue in booktitle;
    - doi taken from the doid parameter of the ACM URL (assumes the
      standard ACM prefix 10.1145 -- confirm against the publisher page);
    - the unusable identifiers value "[object Object]" and the redundant
      type/bibtype markers were dropped.
  The remaining non-standard fields (id, created, profile_id, ...) are
  reference-manager metadata; BibTeX styles ignore unknown fields.
}
@inproceedings{Park2007,
  author              = {Park, Justin and Barbosa, Denilson},
  title               = {Adaptive record extraction from web pages},
  booktitle           = {Proceedings of the 16th International Conference on World Wide Web ({WWW} '07)},
  publisher           = {ACM Press},
  year                = {2007},
  pages               = {1335},
  doi                 = {10.1145/1242572.1242838},
  url                 = {http://portal.acm.org/citation.cfm?doid=1242572.1242838},
  abstract            = {We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns. We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entries on blogs).},
  id                  = {668be039-8099-31c9-a7cf-dda44976f950},
  created             = {2012-02-28T00:51:15.000Z},
  last_modified       = {2017-03-14T14:36:19.698Z},
  profile_id          = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
  group_id            = {066b42c8-f712-3fc3-abb2-225c158d2704},
  citation_key        = {Park2007},
  file_attached       = {false},
  read                = {false},
  starred             = {false},
  authored            = {false},
  confirmed           = {true},
  hidden              = {false},
  private_publication = {false},
}

Downloads: 0