Robust Web Data Extraction with XML Path Expressions

Robust Web Data Extraction with XML Path Expressions. Myllymaki, J. & Jackson, J. Technical Report IBM Research Report, 2002.
abstract bibtex

Automated extraction of structured Web data has attracted considerable interest in both the academia and industry. A particularly promising approach is to employ XML technologies to translate semi-structured HTML documents to “pure” XML documents. In this approach, HTML documents are first normalized into XHTML and then mapped to the desired XML application format by using XML path expressions and regular expressions. In this paper we describe a methodology for creating XML path (XPath) expressions that are capable of extracting data from virtually any HTML page, while placing an emphasis on the persistent integrity of these expressions. This robustness is critical given the vulnerability of extraction technologies to the continually changing content, structure, and formatting of pages on the Web. We define categories of extraction rules in terms of their dependence on content, structural, or formatting features, and provide practical tips on how to create dependable data extraction patterns for the Web.

@comment{Research report by Myllymaki and Jackson, 2002. The entry key repeats the citation_key field. Fields other than author, title, type, institution and year are reference-manager export metadata.}
@techreport{Myllymaki2002,
 author = {Myllymaki, Jussi and Jackson, Jared},
 title = {Robust {Web} Data Extraction with {XML} Path Expressions},
 type = {Research Report},
 institution = {IBM Research},
 year = {2002},
 id = {498876f0-09f8-372c-932f-caa9cd2b88fe},
 created = {2012-12-24T15:02:36.000Z},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 tags = {web data extraction,xpath},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Myllymaki2002},
 private_publication = {false},
 abstract = {Automated extraction of structured Web data has attracted considerable interest in both the academia and industry. A particularly promising approach is to employ XML technologies to translate semi-structured HTML documents to ``pure'' XML documents. In this approach, HTML documents are first normalized into XHTML and then mapped to the desired XML application format by using XML path expressions and regular expressions. In this paper we describe a methodology for creating XML path (XPath) expressions that are capable of extracting data from virtually any HTML page, while placing an emphasis on the persistent integrity of these expressions. This robustness is critical given the vulnerability of extraction technologies to the continually changing content, structure, and formatting of pages on the Web. We define categories of extraction rules in terms of their dependence on content, structural, or formatting features, and provide practical tips on how to create dependable data extraction patterns for the Web.},
 bibtype = {techreport}
}

Downloads: 0

{"_id":"4g2ypTjGPyNXrJSKN","bibbaseid":"myllymaki-jackson-robustwebdataextractionwithxmlpathexpressions-2002","authorIDs":[],"author_short":["Myllymaki, J.","Jackson, J."],"bibdata":{"title":"Robust Web Data Extraction with XML Path Expressions","type":"techreport","year":"2002","institution":"IBM Research Report","id":"498876f0-09f8-372c-932f-caa9cd2b88fe","created":"2012-12-24T15:02:36.000Z","file_attached":false,"profile_id":"5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6","group_id":"066b42c8-f712-3fc3-abb2-225c158d2704","last_modified":"2017-03-14T14:36:19.698Z","tags":"web data extraction,xpath","read":false,"starred":false,"authored":false,"confirmed":"true","hidden":false,"citation_key":"Myllymaki2002","private_publication":false,"abstract":"Automated extraction of structured Web data has attracted considerable interest in both the academia and industry. A particularly promising approach is to employ XML technologies to translate semi-structured HTML documents to “pure ” XML documents. In this approach, HTML documents are first normalized into XHMTL and then mapped to the desired XML application format by using XML path expressions and regular expressions. In this paper we describe a methodology for creating XML path (XPath) expressions that are capable of extracting data from virtually any HTML page, while placing an emphasis on the persistent integrity of these expressions. This robustness is critical given the vulnerability of extraction technologies to the continually changing content, structure, and formatting of pages on the Web. 
We define categories of extraction rules in terms of their dependence on content, structural, or formatting features, and provide practical tips on how to create dependable data extraction patterns for the Web.","bibtype":"techreport","author":"Myllymaki, Jussi and Jackson, Jared","bibtex":"@techreport{\n title = {Robust Web Data Extraction with XML Path Expressions},\n type = {techreport},\n year = {2002},\n institution = {IBM Research Report},\n id = {498876f0-09f8-372c-932f-caa9cd2b88fe},\n created = {2012-12-24T15:02:36.000Z},\n file_attached = {false},\n profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},\n group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},\n last_modified = {2017-03-14T14:36:19.698Z},\n tags = {web data extraction,xpath},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {true},\n hidden = {false},\n citation_key = {Myllymaki2002},\n private_publication = {false},\n abstract = {Automated extraction of structured Web data has attracted considerable interest in both the academia and industry. A particularly promising approach is to employ XML technologies to translate semi-structured HTML documents to “pure ” XML documents. In this approach, HTML documents are first normalized into XHMTL and then mapped to the desired XML application format by using XML path expressions and regular expressions. In this paper we describe a methodology for creating XML path (XPath) expressions that are capable of extracting data from virtually any HTML page, while placing an emphasis on the persistent integrity of these expressions. This robustness is critical given the vulnerability of extraction technologies to the continually changing content, structure, and formatting of pages on the Web. 
We define categories of extraction rules in terms of their dependence on content, structural, or formatting features, and provide practical tips on how to create dependable data extraction patterns for the Web.},\n bibtype = {techreport},\n author = {Myllymaki, Jussi and Jackson, Jared}\n}","author_short":["Myllymaki, J.","Jackson, J."],"bibbaseid":"myllymaki-jackson-robustwebdataextractionwithxmlpathexpressions-2002","role":"author","urls":{},"downloads":0,"html":""},"bibtype":"techreport","creationDate":"2020-02-06T23:48:12.204Z","downloads":0,"keywords":[],"search_terms":["robust","web","data","extraction","xml","path","expressions","myllymaki","jackson"],"title":"Robust Web Data Extraction with XML Path Expressions","year":2002}