Model-Based Reinforcement Learning with Value-Targeted Regression. Ayoub, A., Jia, Z., Szepesvári, C., Wang, M., & Yang, L. In ICML, pages 463–474, June 2020.

Abstract: This paper studies model-based reinforcement learning (RL) for regret minimization. We focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\mathcal{P}$, a special case of which is when models in $\mathcal{P}$ take the form of linear mixtures: $P_{\theta} = \sum_{i=1}^{d} \theta_{i}P_{i}$. We propose a model based RL algorithm that is based on the optimism principle: In each episode, the set of models that are `consistent' with the data collected is constructed. The criterion of consistency is based on the total squared error that the model incurs on the task of predicting state values as determined by the last value estimate along the transitions. The next value function is then chosen by solving the optimistic planning problem with the constructed set of models. We derive a bound on the regret, which, in the special case of linear mixtures, takes the form $\tilde{\mathcal{O}}(d\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $\theta$, respectively. In particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $\Omega(\sqrt{HdT})$. For a general model family $\mathcal{P}$, the regret bound is derived based on the Eluder dimension.
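For the linear mixture case, the value-targeted regression step described in the abstract amounts to regressing realized next-state values (under the current value estimate) on the vector of expected next-state values under each basis model $P_i$. Below is a minimal sketch of that step, assuming a ridge-regularized least-squares estimate and illustrative function and variable names; it is not the authors' code.

# Sketch of the value-targeted regression step for a linear mixture
# model P_theta = sum_i theta_i P_i (illustrative names, not the paper's code).
import numpy as np

def vtr_estimate(features, targets, reg=1.0):
    """Ridge regression of value targets on per-step features.

    features : (n, d) array; row t holds x_t[i] = (P_i V_t)(s_t, a_t),
               the expected next-state value under each basis model P_i.
    targets  : (n,) array; entry t holds y_t = V_t(s_{t+1}), the realized
               next-state value under the last value estimate.
    Returns the ridge estimate theta_hat and the Gram matrix A, which can be
    used to define an ellipsoidal set {theta : ||theta - theta_hat||_A <= beta}
    of models treated as `consistent' with the data.
    """
    n, d = features.shape
    A = reg * np.eye(d) + features.T @ features   # regularized Gram matrix
    b = features.T @ targets                      # cross-term with value targets
    theta_hat = np.linalg.solve(A, b)             # ridge / least-squares solution
    return theta_hat, A

# Illustrative usage with synthetic data.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))                     # stand-in for (P_i V)(s_t, a_t)
theta_true = np.array([0.5, 0.3, 0.2])
y = X @ theta_true + 0.1 * rng.normal(size=100)   # noisy value targets
theta_hat, A = vtr_estimate(X, y)

The Gram matrix A returned here is what shapes the confidence set of models handed to the optimistic planner in the next episode.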
@inproceedings{AZSzWY20,
abstract = {This paper studies model-based reinforcement learning (RL) for regret minimization.
We focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\mathcal{P}$, a special case of which is when models in $\mathcal{P}$ take the form of linear mixtures:
$P_{\theta} = \sum_{i=1}^{d} \theta_{i}P_{i}$.
We propose a model based RL algorithm that is based on the optimism principle:
In each episode, the set of models that are `consistent' with the data collected is constructed.
The criterion of consistency is based on the total squared error that the model incurs on the task of predicting <em>state values</em> as determined by the last value estimate along the transitions.
The next value function is then chosen by solving the optimistic planning problem with the constructed set of models.
We derive a bound on the regret, which, in the special case of linear mixtures,
takes the form $\tilde{\mathcal{O}}(d\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $\theta$, respectively.
In particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $\Omega(\sqrt{HdT})$.
For a general model family $\mathcal{P}$, the regret bound is derived
based on the Eluder dimension.},
author = {Ayoub, Alex and Jia, Zeyu and Szepesv\'ari, Csaba and Wang, Mengdi and Yang, Lin},
crossref = {ICML2020},
date-modified = {2021-06-29 19:55:35 -0600},
month = {06},
pages = {463--474},
title = {Model-Based Reinforcement Learning with Value-Targeted Regression},
url_paper = {ICML2020_UCRL_VTR.pdf},
booktitle = {ICML},
year = {2020},
keywords = {exploration},
}
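As the abstract notes, each episode's value function is obtained by solving an optimistic planning problem over the constructed model set. For the linear mixture case with an ellipsoidal confidence set, one standard way to write this step (the notation $\hat{\theta}_k$, $A_k$, $\beta_k$ is assumed from a ridge-regression confidence set and is not taken verbatim from the paper) is the backward recursion

$V^k_{H+1} \equiv 0$, and for $h = H, \dots, 1$:
$Q^k_h(s,a) = r(s,a) + \max_{\theta \in B_k} \sum_{i=1}^{d} \theta_i \, (P_i V^k_{h+1})(s,a)$, with $B_k = \{\theta : \|\theta - \hat{\theta}_k\|_{A_k} \le \beta_k\}$,
$V^k_h(s) = \max_a Q^k_h(s,a)$.

Because $B_k$ is an ellipsoid, the inner maximization has the closed form $\max_{\theta \in B_k} \langle \theta, x \rangle = \langle \hat{\theta}_k, x \rangle + \beta_k \|x\|_{A_k^{-1}}$, where $x(s,a) = \big((P_1 V^k_{h+1})(s,a), \dots, (P_d V^k_{h+1})(s,a)\big)$.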
{"_id":"KYHMjmX4jwGozspSq","bibbaseid":"ayoub-jia-szepesvri-wang-yang-modelbasedreinforcementlearningwithvaluetargetedregression-2020","authorIDs":["279PY77kXFE8vWA2Z","3RfzECoweoi7whJcn","4QCWeGJDcuieMasAe","4Tjqo47EWWsMKkTsz","4rnd6s56kwkYuN4vj","596hfkzoGyduaHJsx","6ZE3ATLtdNK2XKNyM","99T5SjY7hztGpFBvH","9ptfi8y4NAbFtcFyE","A2yHTTtEd7BHAWKxd","BnDo6icizXoM3ZM6w","CEF7BzjRG82xSkYnM","CNNkdvJNYs6mrvzjX","CuaCYHTopgvGbd8zk","F2vs4LRcswWXavxfy","FaD78bpAgKLAq4DE2","G25PrkxMGXRRMcCc4","Ge5Rxopmc3SuMrwAH","GpEM5uuobmY3kpHTW","JYhYxghGatqr4mF3H","JdCvvY7vmDS37xtBu","KDMX7rrdf6AsAYDyL","KFpw9rYFeSRdATA4e","KRpsFoiZnaCs9spJb","KaaDW3CcB7w9jsdXT","KergaMvq5ySYJJ3ja","L79tQyaj5QPQQWbhg","MYwHnbXmgZ6kDo3rw","MwHsLe6xMSqRXNS2a","Px8xSNb3LrPQap6Kk","Q6itd4jKLZFdSnTf3","R2QWF4bMkcqfXtkFy","R4cZsfzoubPJYRrnK","Ro8w9jcjvoj73u7Xr","TFtNr7Gkec5KGNDtp","XKguNtDfpi65mQGoP","Xfkk7uQL8EdfTKvQr","ZuZsatkxppZCHnGih","ZxvYv4Qz5HX2uJuNy","abeZr8physSQM35kQ","aod4LHA2acYGGgTq5","dPLx5jQPTZ38sge6e","daaG2KorDDHmmfE8n","e6FLJXcbsWN389Nac","euwQteZ8dvXDgnTeJ","fCcZBpWoomHwsZhMc","fjJ4rCAY73hrX8FfN","jT9EgmjXvsKC8mchN","jqRm9piESHxML2fDN","jzYGL4nHWtXMxLrS2","o7eSSyiMrY5sM7Riu","qZG9eGoTDZQerwFFk","rLnbnm3N6z6ao7Sgs","tPEcG6gpERvBMQHXC","tcPCYiCfNx26iQvrG","tepS4j4xyQcYE9w6A","u8YWp79iEPkjZWt8B","vEGDZadANdDu7HE4S","xEkabBjTQjdvXWXbX","xyst9ZfRqvy2Qhf39","z3Gjh8c2ESrGnGcxb"],"author_short":["Ayoub, A.","Jia, Z.","Szepesvári, C.","Wang, M.","Yang, L."],"bibdata":{"bibtype":"inproceedings","type":"inproceedings","abstract":"This paper studies model-based reinforcement learning (RL) for regret minimization. We focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\\mathcal{P}$, a special case of which is when models in $\\mathcal{P}$ take the form of linear mixtures: $P_{θ} = ∑_{i=1}^{d} θ_{i}P_{i}$. We propose a model based RL algorithm that is based on the optimism principle: In each episode, the set of models that are `consistent' with the data collected is constructed. The criterion of consistency is based on the total squared error that the model incurs on the task of predicting <em>state values</em> as determined by the last value estimate along the transitions. The next value function is then chosen by solving the optimistic planning problem with the constructed set of models. We derive a bound on the regret, which, in the special case of linear mixtures, takes the form $\\̃mathcal{O}}(d\\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $θ$, respectively. In particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $Ω(\\sqrt{HdT})$. 
For a general model family $\\mathcal{P}$, the regret bound is derived based on the Eluder dimension.","author":[{"propositions":[],"lastnames":["Ayoub"],"firstnames":["Alex"],"suffixes":[]},{"propositions":[],"lastnames":["Jia"],"firstnames":["Zeyu"],"suffixes":[]},{"propositions":[],"lastnames":["Szepesvári"],"firstnames":["Csaba"],"suffixes":[]},{"propositions":[],"lastnames":["Wang"],"firstnames":["Mengdi"],"suffixes":[]},{"propositions":[],"lastnames":["Yang"],"firstnames":["Lin"],"suffixes":[]}],"crossref":"ICML2020","date-modified":"2021-06-29 19:55:35 -0600","month":"06","pages":"463–474","title":"Model-Based Reinforcement Learning with Value-Targeted Regression","url_paper":"ICML2020_UCRL_VTR.pdf","booktitle":"ICML","year":"2020","keywords":"exploration","bibtex":"@inproceedings{AZSzWY20,\n\tabstract = {This paper studies model-based reinforcement learning (RL) for regret minimization.\nWe focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\\mathcal{P}$, a special case of which is when models in $\\mathcal{P}$ take the form of linear mixtures:\n$P_{\\theta} = \\sum_{i=1}^{d} \\theta_{i}P_{i}$.\nWe propose a model based RL algorithm that is based on the optimism principle:\nIn each episode, the set of models that are `consistent' with the data collected is constructed.\nThe criterion of consistency is based on the total squared error that the model incurs on the task of predicting <em>state values</em> as determined by the last value estimate along the transitions.\nThe next value function is then chosen by solving the optimistic planning problem with the constructed set of models.\nWe derive a bound on the regret, which, in the special case of linear mixtures,\n takes the form $\\tilde{\\mathcal{O}}(d\\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $\\theta$, respectively.\nIn particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $\\Omega(\\sqrt{HdT})$.\nFor a general model family $\\mathcal{P}$, the regret bound is derived\nbased on the Eluder dimension.},\n\tauthor = {Ayoub, Alex and Jia, Zeyu and Szepesv\\'ari, Csaba and Wang, Mengdi and Yang, Lin},\n\tcrossref = {ICML2020},\n\tdate-modified = {2021-06-29 19:55:35 -0600},\n\tmonth = {06},\n\tpages = {463--474},\n\ttitle = {Model-Based Reinforcement Learning with Value-Targeted Regression},\n\turl_paper = {ICML2020_UCRL_VTR.pdf},\n booktitle = {ICML},\n\tyear = {2020},\n\tkeywords = {exploration},\t\t\n}\n\n","author_short":["Ayoub, A.","Jia, Z.","Szepesvári, C.","Wang, M.","Yang, L."],"key":"AZSzWY20","id":"AZSzWY20","bibbaseid":"ayoub-jia-szepesvri-wang-yang-modelbasedreinforcementlearningwithvaluetargetedregression-2020","role":"author","urls":{" paper":"https://www.ualberta.ca/~szepesva/papers/ICML2020_UCRL_VTR.pdf"},"keyword":["exploration"],"metadata":{"authorlinks":{"szepesvári, c":"https://sites.ualberta.ca/~szepesva/pubs.html"}},"downloads":89,"html":""},"bibtype":"inproceedings","biburl":"https://www.ualberta.ca/~szepesva/papers/p2.bib","creationDate":"2020-07-06T22:38:54.306Z","downloads":90,"keywords":["exploration"],"search_terms":["model","based","reinforcement","learning","value","targeted","regression","ayoub","jia","szepesvári","wang","yang"],"title":"Model-Based Reinforcement Learning with Value-Targeted Regression","year":2020,"dataSources":["dYMomj4Jofy8t4qmm","Ciq2jeFvPFYBCoxwJ","v2PxY4iCzrNyY9fhF","cd5AYQRw3RHjTgoQc"]}