Information Extraction from Semi-structured Web Documents. Yun, B. & Seo, C. In Knowledge Science Engineering and Management, volume 4092 of Lecture Notes in Computer Science, pages 586–598. Springer Berlin / Heidelberg, 2006.
Information Extraction from Semi-structured Web Documents [link]Website  abstract   bibtex   
This paper proposes the web information extraction system that extracts the pre-defined information automatically from web documents (i.e. HTML documents) and integrates the extracted information. The system recognizes entities without labels by the probabilistic based entity recognition method and extends the existing domain knowledge semiautomatically by using the extracted data. Moreover, the system extracts the sub-linked information linked to the basic page and integrates the similar results extracted from heterogeneous sources. The experimental result shows that the global precision of seven domain sites is 93.5%. The system using the sub-linked information and the probabilistic based entity recognition enhances the precision significantly against the system using only the domain knowledge. Moreover, the presented system can extract the more various information precisely due to applying the system with flexibility according to domains. Thus, the system can increase the degree of user satisfaction at its maximum and contribute the revitalization of e-business.
@comment{Yun and Seo (2006): chapter in Lecture Notes in Computer Science volume 4092. The exporter emitted the editor list as "[object Object]" placeholders, so no editor field is given here; TODO re-enter the volume editors from the publisher record.}
@incollection{Yun2006,
 author = {Yun, Bo-Hyun and Seo, Chang-Ho},
 title = {Information Extraction from Semi-structured Web Documents},
 booktitle = {Knowledge Science Engineering and Management},
 series = {Lecture Notes in Computer Science},
 volume = {4092},
 pages = {586--598},
 publisher = {Springer},
 address = {Berlin, Heidelberg},
 year = {2006},
 doi = {10.1007/11811220_50},
 id = {290ad02c-bc92-3bb6-bd61-7f4b49add61e},
 created = {2011-01-29T09:23:47.000Z},
 file_attached = {false},
 profile_id = {5284e6aa-156c-3ce5-bc0e-b80cf09f3ef6},
 group_id = {066b42c8-f712-3fc3-abb2-225c158d2704},
 last_modified = {2017-03-14T14:36:19.698Z},
 read = {false},
 starred = {false},
 authored = {false},
 confirmed = {true},
 hidden = {false},
 citation_key = {Yun2006},
 private_publication = {false},
 abstract = {This paper proposes the web information extraction system that extracts the pre-defined information automatically from web documents (i.e. HTML documents) and integrates the extracted information. The system recognizes entities without labels by the probabilistic based entity recognition method and extends the existing domain knowledge semiautomatically by using the extracted data. Moreover, the system extracts the sub-linked information linked to the basic page and integrates the similar results extracted from heterogeneous sources. The experimental result shows that the global precision of seven domain sites is 93.5\%. The system using the sub-linked information and the probabilistic based entity recognition enhances the precision significantly against the system using only the domain knowledge. Moreover, the presented system can extract the more various information precisely due to applying the system with flexibility according to domains. Thus, the system can increase the degree of user satisfaction at its maximum and contribute the revitalization of e-business.},
 bibtype = {inBook}
}

Downloads: 0