Using grammatical inference to automate information extraction from the Web. Hong, T. W. & Clark, K. L. Principles of Data Mining and Knowledge Discovery, 2168:216–227, Springer, 2001.
Using grammatical inference to automate information extraction from the Web [pdf]Website  abstract   bibtex   
The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-specific knowledge is explicitly separated out in the form of declarative rules. The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.
@comment{Hong and Clark (2001). The citation key is taken from the entry's own
citation_key field. Fields that standard styles do not know (id, created,
profile_id, group_id, last_modified, read, starred, authored, confirmed,
hidden, file_attached, private_publication, websites, bibtype, citation_key)
are kept verbatim; BibTeX silently ignores unknown fields.}
@article{Hong2001,
 author = {Hong, T. W. and Clark, K. L.},
 title = {Using grammatical inference to automate information extraction from the {Web}},
 journal = {Principles of Data Mining and Knowledge Discovery},
 volume = {2168},
 pages = {216--227},
 year = {2001},
 publisher = {Springer},
 url = {http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf},
 websites = {http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf},
 type = {article},
 bibtype = {article},
 citation_key = {Hong2001},
 id = {9716598e-681d-354f-93d5-1c58f2344569},
 created = {2012-04-01T16:32:49.000Z},
 last_modified = {2017-03-14T14:36:19.698Z},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 file_attached = {false},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 private_publication = {false},
 abstract = {The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-specific knowledge is explicitly separated out in the form of declarative rules. The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.}
}
Downloads: 0