Adaptive record extraction from web pages

Adaptive record extraction from web pages. Park, J. & Barbosa, D. Proceedings of the 16th international conference on World Wide Web (WWW '07), ACM Press, 2007.

Website abstract bibtex

We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns. We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entries on blogs).

@comment{
  Park and Barbosa, "Adaptive record extraction from web pages" (2007).
  This is a conference paper, so it is typed as inproceedings with the
  venue stored in booktitle. The entry key mirrors the exporter's own
  citation_key field. The doi value is assumed from the "doid" parameter of
  the ACM URL combined with ACM's 10.1145 prefix -- TODO confirm against
  the publisher record. The fields from "id" down to "bibtype" are
  reference-manager export metadata kept verbatim; standard bibliography
  styles ignore unknown fields.
}
@inproceedings{Park2007,
 author              = {Park, Justin and Barbosa, Denilson},
 title               = {Adaptive record extraction from web pages},
 booktitle           = {Proceedings of the 16th international conference on World Wide Web ({WWW} '07)},
 publisher           = {ACM Press},
 year                = {2007},
 pages               = {1335},
 doi                 = {10.1145/1242572.1242838},
 url                 = {http://portal.acm.org/citation.cfm?doid=1242572.1242838},
 abstract            = {We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns. We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entries on blogs).},
 id                  = {668be039-8099-31c9-a7cf-dda44976f950},
 created             = {2012-02-28T00:51:15.000Z},
 file_attached       = {false},
 profile_id          = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id            = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified       = {2017-03-14T14:36:19.698Z},
 read                = {false},
 starred             = {false},
 authored            = {false},
 confirmed           = {true},
 hidden              = {false},
 citation_key        = {Park2007},
 private_publication = {false},
 type                = {article},
 bibtype             = {article}
}

Downloads: 0

{"_id":"AKYA3PRqmjxBYL9Nf","bibbaseid":"park-barbosa-adaptiverecordextractionfromwebpages-2007","authorIDs":[],"author_short":["Park, J.","Barbosa, D."],"bibdata":{"title":"Adaptive record extraction from web pages","type":"article","year":"2007","identifiers":"[object Object]","pages":"1335","websites":"http://portal.acm.org/citation.cfm?doid=1242572.1242838","publisher":"ACM Press","id":"668be039-8099-31c9-a7cf-dda44976f950","created":"2012-02-28T00:51:15.000Z","file_attached":false,"profile_id":"5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6","group_id":"066b42c8-f712-3fc3-abb2-225c158d2704","last_modified":"2017-03-14T14:36:19.698Z","read":false,"starred":false,"authored":false,"confirmed":"true","hidden":false,"citation_key":"Park2007","private_publication":false,"abstract":"We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns.We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entrieson blogs).","bibtype":"article","author":"Park, Justin and Barbosa, Denilson","journal":"Proceedings of the 16th international conference on World Wide Web WWW 07","bibtex":"@article{\n title = {Adaptive record extraction from web pages},\n type = {article},\n year = {2007},\n identifiers = {[object Object]},\n pages = {1335},\n websites = {http://portal.acm.org/citation.cfm?doid=1242572.1242838},\n publisher = {ACM Press},\n id = {668be039-8099-31c9-a7cf-dda44976f950},\n created = {2012-02-28T00:51:15.000Z},\n file_attached = {false},\n profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},\n group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},\n last_modified = {2017-03-14T14:36:19.698Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {true},\n hidden = {false},\n 
citation_key = {Park2007},\n private_publication = {false},\n abstract = {We describe an adaptive method for extracting records from web pages. Our algorithm combines a weighted tree matching metric with clustering for obtaining data extraction patterns.We compare our method experimentally to the state-of-the-art, and show that our approach is very competitive for rigidly-structured records (such as product descriptions) and far superior for loosely-structured records (such as entrieson blogs).},\n bibtype = {article},\n author = {Park, Justin and Barbosa, Denilson},\n journal = {Proceedings of the 16th international conference on World Wide Web WWW 07}\n}","author_short":["Park, J.","Barbosa, D."],"urls":{"Website":"http://portal.acm.org/citation.cfm?doid=1242572.1242838"},"bibbaseid":"park-barbosa-adaptiverecordextractionfromwebpages-2007","role":"author","downloads":0,"html":""},"bibtype":"article","creationDate":"2020-02-06T23:48:12.063Z","downloads":0,"keywords":[],"search_terms":["adaptive","record","extraction","web","pages","park","barbosa"],"title":"Adaptive record extraction from web pages","year":2007}