doi abstract bibtex

[Significance] We present a tractable algorithm that provides a near-optimal solution to the crawling problem, a fundamental challenge at the heart of web search: Given a large quantity of distributed and dynamic web content, what pages do we choose to update a local cache with the goal of serving up-to-date pages to client requests? Solving this optimization requires identifying the best set of pages to refresh given popularity rates and change rates – an intractable problem in the general case. To overcome this intractability, we show that the optimal randomized strategy can be efficiently determined (in near-linear time) and then use it to produce a deterministic policy that exhibits excellent performance in experiments. [Abstract] The problem of maintaining a local cache of n constantly changing pages arises in multiple mechanisms such as web crawlers and proxy servers. In these, the resources for polling pages for possible updates are typically limited. The goal is to devise a polling and fetching policy that maximizes the utility of served pages that are up to date. Cho and Garcia-Molina [(2003) ACM Trans Database Syst 28:390-426] formulated this as an optimization problem, which can be solved numerically for small values of n, but appears intractable in general. Here, we show that the optimal randomized policy can be found exactly in O(n log n) operations. Moreover, using the optimal probabilities to define in linear time a deterministic schedule yields a tractable policy that in experiments attains 99% of the optimum.

@article{azarTractableNearoptimalPolicies2018,
  author   = {Azar, Yossi and Horvitz, Eric and Lubetzky, Eyal and Peres, Yuval and Shahaf, Dafna},
  title    = {Tractable Near-Optimal Policies for Crawling},
  journal  = {Proceedings of the National Academy of Sciences},
  year     = {2018},
  month    = aug,
  volume   = {115},
  number   = {32},
  pages    = {8099--8103},
  issn     = {1091-6490},
  doi      = {10.1073/pnas.1801519115},
  abstract = {[Significance] We present a tractable algorithm that provides a near-optimal solution to the crawling problem, a fundamental challenge at the heart of web search: Given a large quantity of distributed and dynamic web content, what pages do we choose to update a local cache with the goal of serving up-to-date pages to client requests? Solving this optimization requires identifying the best set of pages to refresh given popularity rates and change rates -- an intractable problem in the general case. To overcome this intractability, we show that the optimal randomized strategy can be efficiently determined (in near-linear time) and then use it to produce a deterministic policy that exhibits excellent performance in experiments. [Abstract] The problem of maintaining a local cache of n constantly changing pages arises in multiple mechanisms such as web crawlers and proxy servers. In these, the resources for polling pages for possible updates are typically limited. The goal is to devise a polling and fetching policy that maximizes the utility of served pages that are up to date. Cho and Garcia-Molina [(2003) ACM Trans Database Syst 28:390-426] formulated this as an optimization problem, which can be solved numerically for small values of n, but appears intractable in general. Here, we show that the optimal randomized policy can be found exactly in {$O(n \log n)$} operations. Moreover, using the optimal probabilities to define in linear time a deterministic schedule yields a tractable policy that in experiments attains 99\,\% of the optimum.},
  keywords = {asynchronous-change, information-systems, networks, optimisation, web-and-information-technologies},
}

Downloads: 0