Wrapping Web Data into XML. Han, W., Buttler, D., & Pu, C. Sigmod Record, 30(3):33-38, ACM Press, 2001.
Wrapping Web Data into XML [link]Website  abstract   bibtex   
The vast majority of information that is available online, and coming online in this near future is only available in HTML. In order to use this information for more than human browsing, it must be converted into a machine-readable format. Wrappers have been the key tool to make the conversion from HTML into semantically meaningful and well-structured XML data. However, developing wrappers is slow and tedious work with typically brittle results. This paper describes XWRAP Elite, a tool to automatically generate robust wrappers, which breaks down the conversion process into three procedures: discovering where the data is located in an HTML page and separating the data into individual objects; decomposing objects into data elements; marking objects and elements in an output format. XWRAP Elite automates the first two procedures and requires minimal human involvement in marking output data. In addition, there is a code generation component to package all of the pieces into a stand-alone wrapper.
% Journal article on XWRAP Elite (automatic wrapper generation, HTML to XML).
% Fields after `abstract` are reference-manager bookkeeping; BibTeX styles ignore them.
@article{Han2001,
  author              = {Han, Wei and Buttler, David and Pu, Calton},
  title               = {Wrapping Web Data into {XML}},
  journal             = {Sigmod Record},
  year                = {2001},
  volume              = {30},
  number              = {3},
  pages               = {33--38},
  publisher           = {ACM Press},
  url                 = {http://dl.acm.org/citation.cfm?id=603867.603873},
  abstract            = {The vast majority of information that is available online, and coming online in this near future is only available in HTML. In order to use this information for more than human browsing, it must be converted into a machine-readable format. Wrappers have been the key tool to make the conversion from HTML into semantically meaningful and well-structured XML data. However, developing wrappers is slow and tedious work with typically brittle results. This paper describes XWRAP Elite, a tool to automatically generate robust wrappers, which breaks down the conversion process into three procedures: discovering where the data is located in an HTML page and separating the data into individual objects; decomposing objects into data elements; marking objects and elements in an output format. XWRAP Elite automates the first two procedures and requires minimal human involvement in marking output data. In addition, there is a code generation component to package all of the pieces into a stand-alone wrapper.},
  type                = {article},
  bibtype             = {article},
  citation_key        = {Han2001},
  tags                = {web data extraction},
  id                  = {5590cde7-b69a-39fd-985c-d8eacf256532},
  profile_id          = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
  group_id            = {066b42c8-f712-3fc3-abb2-225c158d2704},
  created             = {2012-12-24T15:02:36.000Z},
  last_modified       = {2017-03-14T14:36:19.698Z},
  file_attached       = {false},
  read                = {false},
  starred             = {false},
  authored            = {false},
  confirmed           = {true},
  hidden              = {false},
  private_publication = {false},
}

Downloads: 0