Near-Optimal Reinforcement Learning in Polynomial Time. Kearns, M. & Singh, S. 49(2):209-232.
Paper doi abstract bibtex We present new algorithms for reinforcement learning and prove that they have polynomial bounds on the resources required to achieve near-optimal return in general Markov decision processes. After observing that the number of actions required to approach the optimal return is lower bounded by the mixing time T of the optimal policy (in the undiscounted case) or by the horizon time T (in the discounted case), we then give algorithms requiring a number of actions and total computation time that are only polynomial in T and the number of states and actions, for both the undiscounted and discounted cases. An interesting aspect of our algorithms is their explicit handling of the Exploration-Exploitation trade-off.
@article{kearnsNearOptimalReinforcementLearning2002,
langid = {english},
title = {Near-{{Optimal Reinforcement Learning}} in {{Polynomial Time}}},
volume = {49},
issn = {1573-0565},
url = {https://doi.org/10.1023/A:1017984413808},
doi = {10.1023/A:1017984413808},
abstract = {We present new algorithms for reinforcement learning and prove that they have polynomial bounds on the resources required to achieve near-optimal return in general Markov decision processes. After observing that the number of actions required to approach the optimal return is lower bounded by the mixing time T of the optimal policy (in the undiscounted case) or by the horizon time T (in the discounted case), we then give algorithms requiring a number of actions and total computation time that are only polynomial in T and the number of states and actions, for both the undiscounted and discounted cases. An interesting aspect of our algorithms is their explicit handling of the Exploration-Exploitation trade-off.},
number = {2},
journaltitle = {Machine Learning},
shortjournal = {Machine Learning},
urldate = {2019-01-21},
date = {2002-11-01},
pages = {209-232},
keywords = {reinforcement learning,exploration versus exploitation,Markov decision processes},
author = {Kearns, Michael and Singh, Satinder},
file = {/home/dimitri/Nextcloud/Zotero/storage/QHHJB8YE/Kearns and Singh - 2002 - Near-Optimal Reinforcement Learning in Polynomial .pdf}
}