A novel algorithm for extracting the user reviews from web pages. Uçar, E., Uzun, E., & Tüfekci, P. Journal of Information Science, 43(5):696–712, October 2017.
A novel algorithm for extracting the user reviews from web pages [link]Website  doi  abstract   bibtex   1 download  
Extracting the user reviews in websites such as forums, blogs, newspapers, commerce, trips, etc. is crucial for text processing applications (e.g. sentiment analysis, trend detection/monitoring and recommendation systems) which are needed to deal with structured data. Traditional algorithms have three processes consisting of Document Object Model (DOM) tree creation, extraction of features obtained from this tree and machine learning. However, these algorithms increase time complexity of extraction process. This study proposes a novel algorithm that involves two complementary stages. The first stage determines which HTML tags correspond to review layout for a web domain by using the DOM tree as well as its features and decision tree learning. The second stage extracts review layout for web pages in a web domain using the found tags obtained from the first stage. This stage is more time-efficient, being approximately 21 times faster compared to the first stage. Moreover, it achieves a relatively high accuracy of 96.67% in our experiments of review block extraction. © Chartered Institute of Library and Information Professionals.
@comment{Journal article record. The citation key is copied from this entry's own citation_key field so that the entry parses and can be cited. Fields other than the standard article fields are reference-manager bookkeeping and are kept unchanged.}
@article{Ucar2017,
 title = {A novel algorithm for extracting the user reviews from web pages},
 type = {article},
 year = {2017},
 keywords = {Efficient extraction,web data extraction,web user reviews},
 pages = {696--712},
 volume = {43},
 websites = {http://journals.sagepub.com/doi/10.1177/0165551516666446},
 month = oct,
 id = {95648a3d-c8d0-34be-a04a-fe1aa8f3ad56},
 created = {2018-03-16T13:30:30.629Z},
 file_attached = {false},
 profile_id = {37fa15c3-e5d0-3212-8e18-e4c72814fd47},
 last_modified = {2018-07-04T12:59:46.783Z},
 read = {false},
 starred = {false},
 authored = {true},
 confirmed = {true},
 hidden = {false},
 citation_key = {Ucar2017},
 folder_uuids = {6d42ffd6-e28f-468b-89db-871815ea39c8},
 private_publication = {false},
 abstract = {Extracting the user reviews in websites such as forums, blogs, newspapers, commerce, trips, etc. is crucial for text processing applications (e.g. sentiment analysis, trend detection/monitoring and recommendation systems) which are needed to deal with structured data. Traditional algorithms have three processes consisting of Document Object Model (DOM) tree creation, extraction of features obtained from this tree and machine learning. However, these algorithms increase time complexity of extraction process. This study proposes a novel algorithm that involves two complementary stages. The first stage determines which HTML tags correspond to review layout for a web domain by using the DOM tree as well as its features and decision tree learning. The second stage extracts review layout for web pages in a web domain using the found tags obtained from the first stage. This stage is more time-efficient, being approximately 21 times faster compared to the first stage. Moreover, it achieves a relatively high accuracy of 96.67\% in our experiments of review block extraction. © Chartered Institute of Library and Information Professionals.},
 bibtype = {article},
 author = {Uçar, Erdem and Uzun, Erdinç and Tüfekci, Pınar},
 doi = {10.1177/0165551516666446},
 journal = {Journal of Information Science},
 number = {5},
}

Downloads: 1