Web content extraction by using decision tree learning. Uzun, E., Agun, H. V., & Yerlikaya, T. In 2012 20th Signal Processing and Communications Applications Conference (SIU), pages 1-4, 2012.
Web content extraction by using decision tree learning [link]Website  doi  abstract   bibtex   
Via information extraction techniques, web pages are able to generate datasets for various studies such as natural language processing and data mining. However, nowadays the uninformative sections like advertisements, menus, and links are on the increase. The cleaning of web pages from uninformative sections and the extraction of informative content have become an important issue. In this study, we present a decision tree learning approach over DOM based features which aims to clean the uninformative sections and extract informative content in three classes: title, main content, and additional information. Through this approach, differently from previous studies, the learning model for the extraction of the main content is constructed on DIV and TD tags. The proposed method achieved 95.58% accuracy in cleaning uninformative sections and extraction of the informative content. Especially for the extraction of the main block, 0.96 f-measure is obtained.
@comment{Uzun2012: conference paper (SIU 2012). The citation key is taken from
the Mendeley "citation_key" field; fields after "doi" are Mendeley bookkeeping
metadata that bibliography styles ignore.}
@inproceedings{Uzun2012,
 author = {Uzun, Erdinç and Agun, Hayri Volkan and Yerlikaya, Tarık},
 title = {Web content extraction by using decision tree learning},
 booktitle = {2012 20th Signal Processing and Communications Applications Conference ({SIU})},
 year = {2012},
 pages = {1--4},
 doi = {10.1109/SIU.2012.6204476},
 keywords = {DOM,Decision tree,Web content extraction},
 abstract = {Via information extraction techniques, web pages are able to generate datasets for various studies such as natural language processing, and data mining. However, nowadays the uninformative sections like advertisement, menus, and links are in increase. The cleaning of web pages from uninformative sections, and extraction of informative content has become an important issue. In this study, we present an decision tree learning approach over DOM based features which aims to clean the uninformative sections and extract informative content in three classes: title, main content, and additional information. Through this approach, differently from previous studies, the learning model for the extraction of the main content constructed on DIV and TD tags. The proposed method achieved 95.58\% accuracy in cleaning uninformative sections and extraction of the informative content. Especially for the extraction of the main block, 0.96 f-measure is obtained.},
 type = {inproceedings},
 bibtype = {inproceedings},
 websites = {http://ieeexplore.ieee.org/document/6204476/},
 id = {7bcc560c-fc7c-3ff3-b320-0158acf6f924},
 created = {2018-03-16T13:30:30.712Z},
 file_attached = {false},
 profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},
 last_modified = {2018-07-04T12:59:46.813Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Uzun2012},
 private_publication = {false}
}

Downloads: 0