Using grammatical inference to automate information extraction from the Web

Using grammatical inference to automate information extraction from the Web. Hong, T. W. & Clark, K. L. Principles of Data Mining and Knowledge Discovery, 2168:216–227, Springer, 2001.

Website abstract bibtex

The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-specific knowledge is explicitly separated out in the form of declarative rules. The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.

@comment{Record for Hong and Clark (2001), keyed by the value of its citation_key field. NOTE(review): a Springer volume numbered 2168 suggests this may be a paper in a proceedings book series rather than a journal article; confirm against the publisher record before changing the entry type or moving the venue out of the journal field.}
@article{Hong2001,
 title = {Using grammatical inference to automate information extraction from the {Web}},
 type = {article},
 year = {2001},
 pages = {216--227},
 volume = {2168},
 websites = {http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf},
 url = {http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf},
 publisher = {Springer},
 id = {9716598e-681d-354f-93d5-1c58f2344569},
 created = {2012-04-01T16:32:49.000Z},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Hong2001},
 private_publication = {false},
 abstract = {The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-specific knowledge is explicitly separated out in the form of declarative rules. The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.},
 bibtype = {article},
 author = {Hong, T. W. and Clark, K. L.},
 journal = {Principles of Data Mining and Knowledge Discovery}
}

Downloads: 0

{"_id":"mWgEMrfqscMspz5wq","bibbaseid":"hong-clark-usinggrammaticalinferencetoautomateinformationextractionfromtheweb-2001","authorIDs":[],"author_short":["Hong, T., W.","Clark, K., L."],"bibdata":{"title":"Using grammatical inference to automate information extraction from the Web","type":"article","year":"2001","pages":"216–227","volume":"2168","websites":"http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf","publisher":"Springer","id":"9716598e-681d-354f-93d5-1c58f2344569","created":"2012-04-01T16:32:49.000Z","file_attached":false,"profile_id":"5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6","group_id":"066b42c8-f712-3fc3-abb2-225c158d2704","last_modified":"2017-03-14T14:36:19.698Z","read":false,"starred":false,"authored":false,"confirmed":"true","hidden":false,"citation_key":"Hong2001","private_publication":false,"abstract":"The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-speci.c knowledge is explicitly separated out in the form of declarative rules. 
The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.","bibtype":"article","author":"Hong, T W and Clark, K L","journal":"Principles of Data Mining and Knowledge Discovery","bibtex":"@article{\n title = {Using grammatical inference to automate information extraction from the Web},\n type = {article},\n year = {2001},\n pages = {216–227},\n volume = {2168},\n websites = {http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf},\n publisher = {Springer},\n id = {9716598e-681d-354f-93d5-1c58f2344569},\n created = {2012-04-01T16:32:49.000Z},\n file_attached = {false},\n profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},\n group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},\n last_modified = {2017-03-14T14:36:19.698Z},\n read = {false},\n starred = {false},\n authored = {false},\n confirmed = {true},\n hidden = {false},\n citation_key = {Hong2001},\n private_publication = {false},\n abstract = {The World-Wide Web contains a wealth of semistructured information sources that often give partial/overlapping views on the same domains, such as real estate listings or book prices. These partial sources could be used more effectively if integrated into a single view; however, since they are typically formatted in diverse ways for human viewing, extracting their data for integration is a difficult challenge. Existing learning systems for this task generally use hardcoded ad hoc heuristics, are restricted in the domains and structures they can recognize, and/or require manual training. We describe a principled method for automatically generating extraction wrappers using grammatical inference that can recognize general structures and does not rely on manually-labelled examples. Domain-speci.c knowledge is explicitly separated out in the form of declarative rules. 
The method is demonstrated in a test setting by extracting real estate listings from web pages and integrating them into an interactive data visualization tool based on dynamic queries.},\n bibtype = {article},\n author = {Hong, T W and Clark, K L},\n journal = {Principles of Data Mining and Knowledge Discovery}\n}","author_short":["Hong, T., W.","Clark, K., L."],"urls":{"Website":"http://www.springerlink.com/index/3e1y25yqxmxd9kkm.pdf"},"bibbaseid":"hong-clark-usinggrammaticalinferencetoautomateinformationextractionfromtheweb-2001","role":"author","downloads":0,"html":""},"bibtype":"article","creationDate":"2020-02-06T23:48:12.142Z","downloads":0,"keywords":[],"search_terms":["using","grammatical","inference","automate","information","extraction","web","hong","clark"],"title":"Using grammatical inference to automate information extraction from the Web","year":2001}