A hybrid approach for extracting informative content from web pages

A hybrid approach for extracting informative content from web pages. Uzun, E., Agun, H. V., & Yerlikaya, T. Information Processing and Management, 49(4):928-944, 2013.
doi abstract bibtex

Eliminating noisy information and extracting informative content have become important issues for web mining, search and accessibility. This extraction process can employ automatic techniques and hand-crafted rules. Automatic extraction techniques focus on various machine learning methods, but implementing these techniques increases time complexity of the extraction process. Conversely, extraction through hand-crafted rules is an efficient technique that uses string manipulation functions, but preparing these rules is difficult and cumbersome for users. In this paper, we present a hybrid approach that contains two steps that can invoke each other. The first step discovers informative content using Decision Tree Learning as an appropriate machine learning method and creates rules from the results of this learning method. The second step extracts informative content using rules obtained from the first step. However, if the second step does not return an extraction result, the first step gets invoked. In our experiments, the first step achieves high accuracy with 95.76% in extraction of the informative content. Moreover, 71.92% of the rules can be used in the extraction process, and it is approximately 240 times faster than the first step. © 2013 Elsevier Ltd. All rights reserved.

@comment{Journal article, Information Processing and Management 49 (4), 2013. Fields after the abstract-independent bibliographic data (id, created, profile_id, folder_uuids, read, starred, ...) are Mendeley export bookkeeping and are ignored by bibliography styles.}
@article{Uzun2013,
 title = {A hybrid approach for extracting informative content from web pages},
 type = {article},
 year = {2013},
 keywords = {Template Detection,Web Cleaning,Web Content Extraction,Web Learning Modeling},
 pages = {928--944},
 volume = {49},
 id = {43c947b2-4168-3b3c-b825-ea192abfefc5},
 created = {2018-03-16T13:30:30.710Z},
 file_attached = {false},
 profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},
 last_modified = {2018-07-04T12:00:48.056Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Uzun2013},
 folder_uuids = {6d42ffd6-e28f-468b-89db-871815ea39c8,d4ba7016-85a6-45e9-8213-463276800ac8},
 private_publication = {false},
 abstract = {Eliminating noisy information and extracting informative content have become important issues for web mining, search and accessibility. This extraction process can employ automatic techniques and hand-crafted rules. Automatic extraction techniques focus on various machine learning methods, but implementing these techniques increases time complexity of the extraction process. Conversely, extraction through hand-crafted rules is an efficient technique that uses string manipulation functions, but preparing these rules is difficult and cumbersome for users. In this paper, we present a hybrid approach that contains two steps that can invoke each other. The first step discovers informative content using Decision Tree Learning as an appropriate machine learning method and creates rules from the results of this learning method. The second step extracts informative content using rules obtained from the first step. However, if the second step does not return an extraction result, the first step gets invoked. In our experiments, the first step achieves high accuracy with 95.76\% in extraction of the informative content. Moreover, 71.92\% of the rules can be used in the extraction process, and it is approximately 240 times faster than the first step. {\copyright} 2013 Elsevier Ltd. All rights reserved.},
 bibtype = {article},
 author = {Uzun, Erdin{\c{c}} and Agun, Hayri Volkan and Yerlikaya, Tarik},
 doi = {10.1016/j.ipm.2013.02.005},
 journal = {Information Processing and Management},
 number = {4},
}

Downloads: 0

{"_id":"Ji7ut4yCtjtjbmgkg","bibbaseid":"uzun-agun-yerlikaya-ahybridapproachforextractinginformativecontentfromwebpages-2013","downloads":0,"creationDate":"2018-07-03T12:59:41.806Z","title":"A hybrid approach for extracting informative content from web pages","author_short":["Uzun, E.","Agun, H., V.","Yerlikaya, T."],"year":2013,"bibtype":"article","biburl":"https://bibbase.org/service/mendeley/37fa15c3-e5d0-3212-8e18-e4c72814fd47","bibdata":{"title":"A hybrid approach for extracting informative content from web pages","type":"article","year":"2013","keywords":"Template Detection,Web Cleaning,Web Content Extraction,Web Learning Modeling","pages":"928-944","volume":"49","id":"43c947b2-4168-3b3c-b825-ea192abfefc5","created":"2018-03-16T13:30:30.710Z","file_attached":false,"profile_id":"37fa15c3-e5d0-3212-8e18-e4c72814fd47","last_modified":"2018-07-04T12:00:48.056Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Uzun2013","folder_uuids":"6d42ffd6-e28f-468b-89db-871815ea39c8,d4ba7016-85a6-45e9-8213-463276800ac8","private_publication":false,"abstract":"Eliminating noisy information and extracting informative content have become important issues for web mining, search and accessibility. This extraction process can employ automatic techniques and hand-crafted rules. Automatic extraction techniques focus on various machine learning methods, but implementing these techniques increases time complexity of the extraction process. Conversely, extraction through hand-crafted rules is an efficient technique that uses string manipulation functions, but preparing these rules is difficult and cumbersome for users. In this paper, we present a hybrid approach that contains two steps that can invoke each other. The first step discovers informative content using Decision Tree Learning as an appropriate machine learning method and creates rules from the results of this learning method. 
The second step extracts informative content using rules obtained from the first step. However, if the second step does not return an extraction result, the first step gets invoked. In our experiments, the first step achieves high accuracy with 95.76% in extraction of the informative content. Moreover, 71.92% of the rules can be used in the extraction process, and it is approximately 240 times faster than the first step. © 2013 Elsevier Ltd. All rights reserved.","bibtype":"article","author":"Uzun, Erdinç and Agun, Hayri Volkan and Yerlikaya, Tarik","doi":"10.1016/j.ipm.2013.02.005","journal":"Information Processing and Management","number":"4","bibtex":"@article{\n title = {A hybrid approach for extracting informative content from web pages},\n type = {article},\n year = {2013},\n keywords = {Template Detection,Web Cleaning,Web Content Extraction,Web Learning Modeling},\n pages = {928-944},\n volume = {49},\n id = {43c947b2-4168-3b3c-b825-ea192abfefc5},\n created = {2018-03-16T13:30:30.710Z},\n file_attached = {false},\n profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},\n last_modified = {2018-07-04T12:00:48.056Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = {Uzun2013},\n folder_uuids = {6d42ffd6-e28f-468b-89db-871815ea39c8,d4ba7016-85a6-45e9-8213-463276800ac8},\n private_publication = {false},\n abstract = {Eliminating noisy information and extracting informative content have become important issues for web mining, search and accessibility. This extraction process can employ automatic techniques and hand-crafted rules. Automatic extraction techniques focus on various machine learning methods, but implementing these techniques increases time complexity of the extraction process. Conversely, extraction through hand-crafted rules is an efficient technique that uses string manipulation functions, but preparing these rules is difficult and cumbersome for users. 
In this paper, we present a hybrid approach that contains two steps that can invoke each other. The first step discovers informative content using Decision Tree Learning as an appropriate machine learning method and creates rules from the results of this learning method. The second step extracts informative content using rules obtained from the first step. However, if the second step does not return an extraction result, the first step gets invoked. In our experiments, the first step achieves high accuracy with 95.76% in extraction of the informative content. Moreover, 71.92% of the rules can be used in the extraction process, and it is approximately 240 times faster than the first step. © 2013 Elsevier Ltd. All rights reserved.},\n bibtype = {article},\n author = {Uzun, Erdinç and Agun, Hayri Volkan and Yerlikaya, Tarik},\n doi = {10.1016/j.ipm.2013.02.005},\n journal = {Information Processing and Management},\n number = {4}\n}","author_short":["Uzun, E.","Agun, H., V.","Yerlikaya, T."],"biburl":"https://bibbase.org/service/mendeley/37fa15c3-e5d0-3212-8e18-e4c72814fd47","bibbaseid":"uzun-agun-yerlikaya-ahybridapproachforextractinginformativecontentfromwebpages-2013","role":"author","urls":{},"keyword":["Template Detection","Web Cleaning","Web Content Extraction","Web Learning Modeling"],"metadata":{"authorlinks":{"uzun, e":"https://erdincuzun.com/yayinlar/"}},"downloads":0},"search_terms":["hybrid","approach","extracting","informative","content","web","pages","uzun","agun","yerlikaya"],"keywords":["template detection","web cleaning","web content extraction","web learning modeling"],"authorIDs":["QrE2Jk7Eehmqc5trT"],"dataSources":["mqdHLrE2gnaRYnL6B","ya2CyA73rpZseyrZ8","2252seNhipfTmjEBQ"]}