Evaluation of Hap, AngleSharp and HtmlDocument in web content extraction. Uzun, E., Buluş, H. N., Doruk, A., & Özhan, E. In International Scientific Conference’2017 (UNITECH’17), volume 2, pages 275–278, 2017. [Website] [abstract] [bibtex] [1 download] Abstract: With the DOM, programming languages can access and change all the HTML elements of a web page. There are several libraries for instantiating the DOM. In this study, we compare three different well-known .NET libraries, including HAP (Html Agility Pack), AngleSharp and MS_HtmlDocument to extract content from web pages. The experimental results indicate that AngleSharp achieves the best results with average 5.54 ms for preprocessing of the DOM and average 0.46 ms for extracting of a content from the DOM.
@comment{Conference paper record. The entry key below is taken from the entry's own citation_key field. Fields such as id, created, profile_id, last_modified, read, starred, authored, confirmed, hidden, file_attached, private_publication, bibtype and websites are not standard BibTeX fields; they look like reference-manager export metadata and are ignored by standard styles.}
@inproceedings{Uzun2017_G1,
 title = {Evaluation of {Hap}, {AngleSharp} and {HtmlDocument} in web content extraction},
 type = {inproceedings},
 year = {2017},
 keywords = {.NET C#,DOM,HTML,Web Extraction Performance},
 pages = {275--278},
 volume = {2},
 websites = {https://erdincuzun.com/wp-content/uploads/download/s5_p255.pdf},
 url = {https://erdincuzun.com/wp-content/uploads/download/s5_p255.pdf},
 id = {c4e12fa9-8c76-3455-b183-fda4a673dea8},
 created = {2018-06-05T12:53:51.408Z},
 file_attached = {false},
 profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},
 last_modified = {2020-01-16T20:29:39.192Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Uzun2017_G1},
 private_publication = {false},
 abstract = {With the DOM, programming languages can access and change all the HTML elements of a web page. There are several libraries for instantiating the DOM. In this study, we compare three different well-known .NET libraries, including HAP (Html Agility Pack), AngleSharp and MS_HtmlDocument to extract content from web pages. The experimental results indicate that AngleSharp achieves the best results with average 5.54 ms for preprocessing of the DOM and average 0.46 ms for extracting of a content from the DOM.},
 bibtype = {inproceedings},
 author = {Uzun, Erdinç and Buluş, Halil Nusret and Doruk, Alpay and Özhan, Erkan},
 booktitle = {International Scientific Conference’2017 (UNITECH’17)}
}
Downloads: 1
{"_id":"6xnbZNZRM2s3p9dgu","bibbaseid":"uzun-bulu-doruk-zhan-evaluationofhapanglesharpandhtmldocumentinwebcontentextraction-2017","downloads":1,"creationDate":"2018-07-03T12:59:41.813Z","title":"Evaluation of Hap, AngleSharp and HtmlDocument in web content extraction","author_short":["Uzun, E.","Buluş, H., N.","Doruk, A.","Özhan, E."],"year":2017,"bibtype":"inproceedings","biburl":"https://bibbase.org/service/mendeley/37fa15c3-e5d0-3212-8e18-e4c72814fd47","bibdata":{"title":"Evaluation of Hap, AngleSharp and HtmlDocument in web content extraction","type":"inproceedings","year":"2017","keywords":".NET C#,DOM,HTML,Web Extraction Performance","pages":"275-278","volume":"2","websites":"https://erdincuzun.com/wp-content/uploads/download/s5_p255.pdf","id":"c4e12fa9-8c76-3455-b183-fda4a673dea8","created":"2018-06-05T12:53:51.408Z","file_attached":false,"profile_id":"37fa15c3-e5d0-3212-8e18-e4c72814fd47","last_modified":"2020-01-16T20:29:39.192Z","read":false,"starred":false,"authored":"true","confirmed":"true","hidden":false,"citation_key":"Uzun2017_G1","private_publication":false,"abstract":"With the DOM, programming languages can access and change all the HTML elements of a web page. There are several libraries for instantiating the DOM. In this study, we compare three different well-known .NET libraries, including HAP (Html Agility Pack), AngleSharp and MS_HtmlDocument to extract content from web pages. 
The experimental results indicate that AngleSharp achieves the best results with average 5.54 ms for preprocessing of the DOM and average 0.46 ms for extracting of a content from the DOM.","bibtype":"inproceedings","author":"Uzun, Erdinç and Buluş, Halil Nusret and Doruk, Alpay and Özhan, Erkan","booktitle":"International Scientific Conference’2017 (UNITECH’17)","bibtex":"@inproceedings{\n title = {Evaluation of Hap, AngleSharp and HtmlDocument in web content extraction},\n type = {inproceedings},\n year = {2017},\n keywords = {.NET C#,DOM,HTML,Web Extraction Performance},\n pages = {275-278},\n volume = {2},\n websites = {https://erdincuzun.com/wp-content/uploads/download/s5_p255.pdf},\n id = {c4e12fa9-8c76-3455-b183-fda4a673dea8},\n created = {2018-06-05T12:53:51.408Z},\n file_attached = {false},\n profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},\n last_modified = {2020-01-16T20:29:39.192Z},\n read = {false},\n starred = {false},\n authored = {true},\n confirmed = {true},\n hidden = {false},\n citation_key = {Uzun2017_G1},\n private_publication = {false},\n abstract = {With the DOM, programming languages can access and change all the HTML elements of a web page. There are several libraries for instantiating the DOM. In this study, we compare three different well-known .NET libraries, including HAP (Html Agility Pack), AngleSharp and MS_HtmlDocument to extract content from web pages. 
The experimental results indicate that AngleSharp achieves the best results with average 5.54 ms for preprocessing of the DOM and average 0.46 ms for extracting of a content from the DOM.},\n bibtype = {inproceedings},\n author = {Uzun, Erdinç and Buluş, Halil Nusret and Doruk, Alpay and Özhan, Erkan},\n booktitle = {International Scientific Conference’2017 (UNITECH’17)}\n}","author_short":["Uzun, E.","Buluş, H., N.","Doruk, A.","Özhan, E."],"urls":{"Website":"https://erdincuzun.com/wp-content/uploads/download/s5_p255.pdf"},"biburl":"https://bibbase.org/service/mendeley/37fa15c3-e5d0-3212-8e18-e4c72814fd47","bibbaseid":"uzun-bulu-doruk-zhan-evaluationofhapanglesharpandhtmldocumentinwebcontentextraction-2017","role":"author","keyword":[".NET C#","DOM","HTML","Web Extraction Performance"],"metadata":{"authorlinks":{"uzun, e":"https://erdincuzun.com/yayinlar/"}},"downloads":1},"search_terms":["evaluation","hap","anglesharp","htmldocument","web","content","extraction","uzun","buluş","doruk","özhan"],"keywords":[".net c#","dom","html","web extraction performance"],"authorIDs":["QrE2Jk7Eehmqc5trT"],"dataSources":["mqdHLrE2gnaRYnL6B","ya2CyA73rpZseyrZ8"]}