2023 (16)
Context-lumpable stochastic bandits. Lee, C.; Liu, Q.; Abbasi-Yadkori, Y.; Jin, C.; Lattimore, T.; and Szepesvári, C. In NeurIPS, December 2023.
@inproceedings{LLAYJLSz23:lumpNeurIPS,\n\tauthor = {Chung-Wei Lee and Qinghua Liu and Yasin Abbasi-Yadkori and Chi Jin and Tor Lattimore and Csaba Szepesv\\'ari},\n\ttitle={Context-lumpable stochastic bandits},\n\tcrossref  = {NeurIPS2023poster},\n\tbooktitle = {NeurIPS},\n\tyear = {2023},\n\tmonth = {12},\n\turl={https://openreview.net/forum?id=EY7Hpj8Ok6},\n\tabstract = {We consider a contextual bandit problem with $S$ contexts and $K$ actions. In each round $t=1,2,..$ the learner\nobserves a random context and chooses an action based on its past experience. The learner then observes a random reward whose mean is a function of the context and the action for the round. Under the assumption that the contexts can be lumped into $r\\le \\min(S ,K)$ groups such that the mean reward for the various actions is the same for any two contexts that are in the same group, we give an algorithm that outputs an $\\epsilon$-optimal policy after using at most $\\tilde O(r (S +K )/\\epsilon^2)$ samples with high probability and provide a matching $\\tilde\\Omega(r(S +K )/\\epsilon^2)$ lower bound. In the regret minimization setting, we give an algorithm whose cumulative regret up to time $T$ is bounded by $\\tilde O(\\sqrt{r^3(S +K )T})$. To the best of our knowledge, we are the first to show the near-optimal sample complexity in the PAC setting and $\\tilde O{\\sqrt{\\text{poly}(r)(S+K)T}}$ minimax regret in the online setting for this problem.  We also show our algorithms can be applied to more general low-rank bandits and get improved regret bounds in some scenarios.},\n}\n\n
We consider a contextual bandit problem with $S$ contexts and $K$ actions. In each round $t=1,2,\ldots$ the learner observes a random context and chooses an action based on its past experience. The learner then observes a random reward whose mean is a function of the context and the action for the round. Under the assumption that the contexts can be lumped into $r \le \min(S,K)$ groups such that the mean reward for the various actions is the same for any two contexts that are in the same group, we give an algorithm that outputs an $\epsilon$-optimal policy after using at most $\tilde{O}(r(S+K)/\epsilon^2)$ samples with high probability and provide a matching $\tilde{\Omega}(r(S+K)/\epsilon^2)$ lower bound. In the regret minimization setting, we give an algorithm whose cumulative regret up to time $T$ is bounded by $\tilde{O}(\sqrt{r^3(S+K)T})$. To the best of our knowledge, we are the first to show the near-optimal sample complexity in the PAC setting and $\tilde{O}(\sqrt{\mathrm{poly}(r)(S+K)T})$ minimax regret in the online setting for this problem. We also show our algorithms can be applied to more general low-rank bandits and get improved regret bounds in some scenarios.
Regret Minimization via Saddle Point Optimization. Kirschner, J.; Bakhtiari, A.; Chandak, K.; Tkachuk, V.; and Szepesvári, C. In NeurIPS, December 2023.
@inproceedings{KBCTSz23:SaddleNeurIPS,\n\tauthor={Johannes Kirschner and Alireza Bakhtiari and Kushagra Chandak and Volodymyr Tkachuk and Csaba Szepesv\\'ari},\n\ttitle={Regret Minimization via Saddle Point Optimization},\n\tcrossref  = {NeurIPS2023poster},\n\tbooktitle = {NeurIPS},\n\tyear = {2023},\n\tmonth = {12},\n\turl={https://openreview.net/forum?id=VLnEFGu9V7},\n\tabstract = {A long line of works characterizes the sample complexity of regret minimization in sequential decision-making by min-max programs. In the corresponding saddle-point game, the min-player optimizes the sampling distribution against an adversarial max-player that chooses confusing models leading to large regret. The most recent instantiation of this idea is the decision-estimation coefficient (DEC), which was shown to provide nearly tight lower and upper bounds on the worst-case expected regret in structured bandits and reinforcement learning. By re-parametrizing the offset DEC with the confidence radius and solving the corresponding min-max program, we propose a novel anytime variant of the Estimation-To-Decisions algorithm (AETD). Importantly, the algorithm optimizes the exploration-exploitation trade-off online instead of via the analysis. Our formulation leads to a practical algorithm for finite model classes and linear feedback models.  We illustrate the results by deriving improved rates for high-dimensional linear bandits. Lastly, we point out connections to the information ratio, decoupling coefficient and PAC-DEC, and numerically evaluate the performance of E2D on simple examples.},\n}\n\n
\n A long line of works characterizes the sample complexity of regret minimization in sequential decision-making by min-max programs. In the corresponding saddle-point game, the min-player optimizes the sampling distribution against an adversarial max-player that chooses confusing models leading to large regret. The most recent instantiation of this idea is the decision-estimation coefficient (DEC), which was shown to provide nearly tight lower and upper bounds on the worst-case expected regret in structured bandits and reinforcement learning. By re-parametrizing the offset DEC with the confidence radius and solving the corresponding min-max program, we propose a novel anytime variant of the Estimation-To-Decisions algorithm (AETD). Importantly, the algorithm optimizes the exploration-exploitation trade-off online instead of via the analysis. Our formulation leads to a practical algorithm for finite model classes and linear feedback models. We illustrate the results by deriving improved rates for high-dimensional linear bandits. Lastly, we point out connections to the information ratio, decoupling coefficient and PAC-DEC, and numerically evaluate the performance of E2D on simple examples.\n
Optimistic Natural Policy Gradient: a Simple Efficient Policy Optimization Framework for Online RL. Liu, Q.; Weisz, G.; György, A.; Jin, C.; and Szepesvári, C. In NeurIPS, December 2023.
@inproceedings{LWGyJSz23:NPGNeurIPS,\n\tauthor = {Qinghua Liu and Gell\\'ert  Weisz and Andr\\'as Gy\\"orgy and Chi Jin and Csaba Szepesv{\\'a}ri},\n\ttitle={Optimistic Natural Policy Gradient: a Simple Efficient Policy Optimization Framework  for Online RL},\n\tcrossref  = {NeurIPS2023spotlight},\n\tbooktitle = {NeurIPS},\n\tyear = {2023},\n\tmonth = {12},\n\turl_link={https://openreview.net/forum?id=zaQ7wV9NOg},\n\tabstract = {While policy optimization algorithms have played an important role in recent empirical success of Reinforcement Learning (RL), the existing theoretical understanding of policy optimization remains rather limited---they are either restricted to tabular MDPs or suffer from highly suboptimal sample complexity, especial in online RL where exploration is necessary. This paper proposes a simple efficient policy optimization framework---Optimistic NPG for online RL. Optimistic NPG can be viewed as simply combining of the classic natural policy gradient (NPG) algorithm [Kakade, 2001]  with optimistic policy evaluation subroutines to encourage exploration. For $d$-dimensional linear MDPs, Optimistic NPG is computationally efficient, and learns an $\\epsilon$-optimal policy within  $\\tilde{O}(d^2/\\epsilon^3)$ samples, which is the first computationally efficient algorithm whose sample complexity has the optimal dimension dependence $\\tilde{\\Theta}  (d^2)$. It also improves over state-of-the-art results of policy optimization algorithms [Zanette et al., 2021] by a factor of $d$. For general function approximation that subsumes linear MDPs, Optimistic NPG, to our best knowledge, is also the first policy optimization algorithm that achieves the polynomial sample complexity for learning near-optimal policies.},\n}\n\n
While policy optimization algorithms have played an important role in the recent empirical success of Reinforcement Learning (RL), the existing theoretical understanding of policy optimization remains rather limited: existing results are either restricted to tabular MDPs or suffer from highly suboptimal sample complexity, especially in online RL where exploration is necessary. This paper proposes a simple, efficient policy optimization framework, Optimistic NPG, for online RL. Optimistic NPG can be viewed as simply combining the classic natural policy gradient (NPG) algorithm [Kakade, 2001] with optimistic policy evaluation subroutines to encourage exploration. For $d$-dimensional linear MDPs, Optimistic NPG is computationally efficient and learns an $\epsilon$-optimal policy within $\tilde{O}(d^2/\epsilon^3)$ samples, making it the first computationally efficient algorithm whose sample complexity has the optimal dimension dependence $\tilde{\Theta}(d^2)$. It also improves over state-of-the-art results of policy optimization algorithms [Zanette et al., 2021] by a factor of $d$. For general function approximation that subsumes linear MDPs, Optimistic NPG is, to our best knowledge, also the first policy optimization algorithm that achieves polynomial sample complexity for learning near-optimal policies.
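The abstract above describes Optimistic NPG only in words. As a rough illustration (not the authors' algorithm), the sketch below shows the generic ingredient it builds on: a softmax/NPG-style policy update driven by an optimistic action-value estimate. The bonus-inflated Q table and the step size eta are placeholder assumptions.

import numpy as np

def optimistic_npg_step(policy, q_hat, bonus, eta=0.1):
    """One softmax-NPG-style update for a single state.
    policy: action probabilities; q_hat: estimated action values;
    bonus: exploration bonus added to q_hat (optimism); eta: step size."""
    q_opt = q_hat + bonus                      # optimistic value estimate
    logits = np.log(policy) + eta * q_opt      # exponential-weights / NPG update
    logits -= logits.max()                     # numerical stability
    new_policy = np.exp(logits)
    return new_policy / new_policy.sum()

# toy usage: 3 actions, uniform start, larger bonus on less-explored actions
pi = np.ones(3) / 3
q = np.array([0.2, 0.5, 0.1])
b = np.array([0.3, 0.0, 0.4])
for _ in range(10):
    pi = optimistic_npg_step(pi, q, b)
print(pi)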
Ordering-based Conditions for Global Convergence of Policy Gradient Methods. Mei, J.; Dai, B.; Agarwal, A.; Ghavamzadeh, M.; Szepesvári, C.; and Schuurmans, D. In NeurIPS, December 2023.
@inproceedings{MDAGSS23:pgNeurIPS,\n\tauthor = {Jincheng Mei and Bo Dai and Alekh Agarwal and Mohammad Ghavamzadeh and Csaba Szepesv{\\'a}ri and Dale Schuurmans},\n\ttitle={Ordering-based Conditions for Global Convergence of Policy Gradient Methods},\n\tcrossref  = {NeurIPS2023oral},\n\tbooktitle = {NeurIPS},\n\tyear = {2023},\n\tmonth = {12},\n\turl_link={https://openreview.net/forum?id=sW8yGZ4uVJ},\n\tabstract = {We prove that, for finite-arm bandits with linear function approximation, the global convergence of policy gradient (PG) methods depends on inter-related properties between the policy update and the representation. <b>First</b>, we establish a few key observations that frame the study: </b>(i)</b> Global convergence can be achieved under linear function approximation without policy or reward realizability, both for the standard Softmax PG and natural policy gradient (NPG). <b>(ii)</b> Approximation error is not a key quantity for characterizing global convergence in either algorithm. <b>(iii)</b> The conditions on the representation that imply global convergence are different between these two algorithms. Overall, these observations call into question approximation error as an appropriate quantity for characterizing the global convergence of PG methods under linear function approximation. <b>Second</b>, motivated by these observations, we establish new general results: <b>(i)</b> NPG with linear function approximation achieves global convergence <em>if and only if</em> the projection of the reward onto the representable space preserves the optimal action's rank, a quantity that is not strongly related to approximation error. <b>(ii)</b> We show that the global convergence of Softmax PG occurs if the representation can merely preserve the ranking of rewards, a property that goes well beyond policy or reward realizability. We provide experimental results to support these theoretical findings.},\n}\n\n
\n We prove that, for finite-arm bandits with linear function approximation, the global convergence of policy gradient (PG) methods depends on inter-related properties between the policy update and the representation. First, we establish a few key observations that frame the study: (i) Global convergence can be achieved under linear function approximation without policy or reward realizability, both for the standard Softmax PG and natural policy gradient (NPG). (ii) Approximation error is not a key quantity for characterizing global convergence in either algorithm. (iii) The conditions on the representation that imply global convergence are different between these two algorithms. Overall, these observations call into question approximation error as an appropriate quantity for characterizing the global convergence of PG methods under linear function approximation. Second, motivated by these observations, we establish new general results: (i) NPG with linear function approximation achieves global convergence if and only if the projection of the reward onto the representable space preserves the optimal action's rank, a quantity that is not strongly related to approximation error. (ii) We show that the global convergence of Softmax PG occurs if the representation can merely preserve the ranking of rewards, a property that goes well beyond policy or reward realizability. We provide experimental results to support these theoretical findings.\n
Online RL in Linearly $q^{\pi}$-Realizable MDPs Is as Easy as in Linear MDPs If You Learn What to Ignore. Weisz, G.; György, A.; and Szepesvári, C. In NeurIPS, December 2023.
@inproceedings{WeGySz23:qpNeurIPS,\n\tauthor = {Weisz, Gell\\'ert and Gy\\"orgy, Andr\\'as and  Szepesv{\\'a}ri, Csaba},\n\ttitle={Online RL in Linearly $q^{\\pi}$-Realizable MDPs Is as Easy as in Linear MDPs If You Learn What to Ignore},\n\tcrossref  = {NeurIPS2023oral},\n\tbooktitle = {NeurIPS},\n\tyear = {2023},\n\tmonth = {12},\n\turl_link={https://openreview.net/forum?id=HV85SiyrsV},\n\tabstract = {We consider online reinforcement learning (RL) in episodic Markov decision processes (MDPs) under the  linear $q^\\pi$-realizability assumption, where it is assumed that the action-values of all policies can be  expressed as linear functions of state-action features. This class is known to be more general than  linear MDPs, where the transition kernel and the reward function are assumed to be linear functions of the feature vectors. As our first contribution, we show that the difference between the two classes is the presence of states in linearly $q^\\pi$-realizable MDPs where for any policy, all the actions have  approximately equal values, and skipping these states by following an arbitrarily fixed policy in those states transforms the problem to a linear MDP. Based on this observation, we derive a novel learning algorithm for linearly $q^\\pi$-realizable MDPs that simultaneously learns what states should be skipped and runs another learning algorithm on the linear MDP hidden in the problem. The new algorithm returns an $\\epsilon$-optimal policy after $\\text{polylog}(H, d)/\\epsilon^2$ interactions with the MDP, where $H$ is the time horizon and $d$ is the dimension of the feature vectors, giving the first polynomial-sample-complexity online RL algorithm for this setting. The results are proved for the misspecified case, where the sample complexity is shown to degrade gracefully with the misspecification error.},\n}\n\n
\n We consider online reinforcement learning (RL) in episodic Markov decision processes (MDPs) under the linear $q^π$-realizability assumption, where it is assumed that the action-values of all policies can be expressed as linear functions of state-action features. This class is known to be more general than linear MDPs, where the transition kernel and the reward function are assumed to be linear functions of the feature vectors. As our first contribution, we show that the difference between the two classes is the presence of states in linearly $q^π$-realizable MDPs where for any policy, all the actions have approximately equal values, and skipping these states by following an arbitrarily fixed policy in those states transforms the problem to a linear MDP. Based on this observation, we derive a novel learning algorithm for linearly $q^π$-realizable MDPs that simultaneously learns what states should be skipped and runs another learning algorithm on the linear MDP hidden in the problem. The new algorithm returns an $ε$-optimal policy after $\\text{polylog}(H, d)/ε^2$ interactions with the MDP, where $H$ is the time horizon and $d$ is the dimension of the feature vectors, giving the first polynomial-sample-complexity online RL algorithm for this setting. The results are proved for the misspecified case, where the sample complexity is shown to degrade gracefully with the misspecification error.\n
Regret Minimization via Saddle Point Optimization. Kirschner, J.; Bakhtiari, A.; Chandak, K.; Tkachuk, V.; and Szepesvári, C. In European Workshop on Reinforcement Learning (EWRL), September 2023.
@inproceedings{kirschner2023regret,\n\ttitle={Regret Minimization via Saddle Point Optimization},\n\tauthor={Johannes Kirschner and Alireza Bakhtiari and Kushagra Chandak and Volodymyr Tkachuk and Csaba Szepesv\\'ari},\n\tbooktitle={European Workshop on Reinforcement Learning (EWRL)},\n\tyear={2023},\n\tmonth = {09},\n\turl_pdf = {https://openreview.net/pdf?id=wH0SdQr0jRH},\n\turl_link ={https://openreview.net/forum?id=wH0SdQr0jRH},\n\tentrysubtype = {unrefereed},\t\n\tabstract = {A long line of works characterizes the sample complexity of regret minimization in sequential decision-making by min-max programs. In the corresponding saddle-point game, the min-player optimizes the sampling distribution against an adversarial max-player that chooses confusing models leading to large regret. The most recent instantiation of this idea is the decision-estimation coefficient (DEC), which was shown to provide nearly tight lower and upper bounds on the worst-case expected regret in structured bandits and reinforcement learning. By re-parametrizing the offset DEC with the confidence radius and solving the corresponding min-max program, we propose a novel anytime variant of the Estimation-To-Decisions algorithm. Importantly, the algorithm optimizes the exploration-exploitation trade-off online instead of via the analysis. Our formulation leads to a practical algorithm for finite model classes and linear feedback models. We illustrate the results by deriving improved rates for high-dimensional linear bandits. Lastly, we point out connections to the information ratio, decoupling coefficient and PAC-DEC, and numerically evaluate the performance of E2D on simple examples.},\n}\n\n\n
\n A long line of works characterizes the sample complexity of regret minimization in sequential decision-making by min-max programs. In the corresponding saddle-point game, the min-player optimizes the sampling distribution against an adversarial max-player that chooses confusing models leading to large regret. The most recent instantiation of this idea is the decision-estimation coefficient (DEC), which was shown to provide nearly tight lower and upper bounds on the worst-case expected regret in structured bandits and reinforcement learning. By re-parametrizing the offset DEC with the confidence radius and solving the corresponding min-max program, we propose a novel anytime variant of the Estimation-To-Decisions algorithm. Importantly, the algorithm optimizes the exploration-exploitation trade-off online instead of via the analysis. Our formulation leads to a practical algorithm for finite model classes and linear feedback models. We illustrate the results by deriving improved rates for high-dimensional linear bandits. Lastly, we point out connections to the information ratio, decoupling coefficient and PAC-DEC, and numerically evaluate the performance of E2D on simple examples.\n
Exponential Hardness of Reinforcement Learning with Linear Function Approximation. Liu, S.; Mahajan, G.; Kane, D.; Lovett, S.; Weisz, G.; and Szepesvári, C. In COLT, pages 1588–1617, July 2023.
@inproceedings{SiGaKaLoWSz23,\n  title = \t {Exponential Hardness of Reinforcement Learning with Linear Function Approximation},\n  author =       {Liu, Sihan and Mahajan, Gaurav and Kane, Daniel and Lovett, Shachar and Weisz, Gell\\'ert and Szepesv\\'ari, Csaba},\n  booktitle = \t {COLT},\n  pages = \t {1588--1617},\n  acceptrate = {165 out of 474 = 34.8\\%},\n  year = \t {2023},\n  month = \t {07},\n  url_pdf = \t {https://proceedings.mlr.press/v195/liu23b/liu23b.pdf},\n  url_link = \t {https://proceedings.mlr.press/v195/liu23b.html},\n  abstract = \t {A fundamental question in reinforcement learning theory is: suppose the optimal value functions are linear in given features, can we learn them efficiently? This problem's counterpart in supervised learning, linear regression, can be solved both statistically and computationally efficiently. Therefore, it was quite surprising when a recent work \\cite{kane2022computational} showed a computational-statistical gap for linear reinforcement learning: even though there are polynomial sample-complexity algorithms, unless NP = RP, there are no polynomial time algorithms for this setting.In this work, we build on their result to show a computational lower bound, which is exponential in feature dimension and horizon, for linear reinforcement learning under the Randomized Exponential Time Hypothesis. To prove this we build a round-based game where in each round the learner is searching for an unknown vector in a unit hypercube. The rewards in this game are chosen such that if the learner achieves large reward, then the learner's actions can be used to simulate solving a variant of 3-SAT, where (a) each variable shows up in a bounded number of clauses (b) if an instance has no solutions then it also has no solutions that satisfy more than (1-$\\epsilon$)-fraction of clauses. We use standard reductions to show this 3-SAT variant is approximately as hard as 3-SAT. Finally, we also show a lower bound optimized for horizon dependence that almost matches the best known upper bound of $\\exp(\\sqrt{H})$.}\n}\n\n\n
A fundamental question in reinforcement learning theory is: suppose the optimal value functions are linear in given features; can we learn them efficiently? This problem's counterpart in supervised learning, linear regression, can be solved both statistically and computationally efficiently. It was therefore quite surprising when a recent work of Kane et al. (2022) showed a computational-statistical gap for linear reinforcement learning: even though there are polynomial sample-complexity algorithms, unless NP = RP, there are no polynomial-time algorithms for this setting. In this work, we build on their result to show a computational lower bound, which is exponential in feature dimension and horizon, for linear reinforcement learning under the Randomized Exponential Time Hypothesis. To prove this we build a round-based game where in each round the learner is searching for an unknown vector in a unit hypercube. The rewards in this game are chosen such that if the learner achieves large reward, then the learner's actions can be used to simulate solving a variant of 3-SAT, where (a) each variable shows up in a bounded number of clauses and (b) if an instance has no solutions then it also has no solutions that satisfy more than a $(1-\epsilon)$-fraction of clauses. We use standard reductions to show this 3-SAT variant is approximately as hard as 3-SAT. Finally, we also show a lower bound optimized for horizon dependence that almost matches the best known upper bound of $\exp(\sqrt{H})$.
Optimistic MLE: A Generic Model-Based Algorithm for Partially Observable Sequential Decision Making. Liu, Q.; Netrapalli, P.; Szepesvári, C.; and Jin, C. In STOC, pages 363–376, November 2023.
@inproceedings{QiPraSzCh23,\n\tauthor = {Liu, Qinghua and Netrapalli, Praneeth and Szepesv\\'ari, Csaba and Jin, Chi},\n\ttitle = {Optimistic MLE: A Generic Model-Based Algorithm for Partially Observable Sequential Decision Making},\n\tyear = {2023},\n\tmonth = {11},\n\turl_link = {https://arxiv.org/abs/2209.14997},\n\turl_pdf = {https://arxiv.org/pdf/2209.14997.pdf},\n\tacceptrate = {155 out of 479 = 32\\%},\n\tdoi = {10.1145/3564246.3585161},\n\tabstract = {This paper introduces a simple efficient learning algorithms for general sequential decision making. The algorithm combines Optimism for exploration with Maximum Likelihood Estimation for model estimation, which is thus named OMLE. We prove that OMLE learns the near-optimal policies of an enormously rich class of sequential decision making problems in a polynomial number of samples. This rich class includes not only a majority of known tractable model-based Reinforcement Learning (RL) problems (such as tabular MDPs, factored MDPs, low witness rank problems, tabular weakly-revealing/observable POMDPs and multi-step decodable POMDPs ), but also many new challenging RL problems especially in the partially observable setting that were not previously known to be tractable. Notably, the new problems addressed by this paper include (1) observable POMDPs with continuous observation and function approximation, where we achieve the first sample complexity that is completely independent of the size of observation space; (2) well-conditioned low-rank sequential decision making problems (also known as Predictive State Representations (PSRs)), which include and generalize all known tractable POMDP examples under a more intrinsic representation; (3) general sequential decision making problems under SAIL condition, which unifies our existing understandings of model-based RL in both fully observable and partially observable settings. SAIL condition is identified by this paper, which can be viewed as a natural generalization of Bellman/witness rank to address partial observability. This paper also presents a reward-free variant of OMLE algorithm, which learns approximate dynamic models that enable the computation of near-optimal policies for all reward functions simultaneously.},\n\tbooktitle = {STOC},\n\tpages = {363--376},\n\tkeywords = {POMDPs, Optimistic maximum likelihood estimate, PSRs, reinforcement learning, partial observability},\n}\n\n\n
\n This paper introduces a simple efficient learning algorithms for general sequential decision making. The algorithm combines Optimism for exploration with Maximum Likelihood Estimation for model estimation, which is thus named OMLE. We prove that OMLE learns the near-optimal policies of an enormously rich class of sequential decision making problems in a polynomial number of samples. This rich class includes not only a majority of known tractable model-based Reinforcement Learning (RL) problems (such as tabular MDPs, factored MDPs, low witness rank problems, tabular weakly-revealing/observable POMDPs and multi-step decodable POMDPs ), but also many new challenging RL problems especially in the partially observable setting that were not previously known to be tractable. Notably, the new problems addressed by this paper include (1) observable POMDPs with continuous observation and function approximation, where we achieve the first sample complexity that is completely independent of the size of observation space; (2) well-conditioned low-rank sequential decision making problems (also known as Predictive State Representations (PSRs)), which include and generalize all known tractable POMDP examples under a more intrinsic representation; (3) general sequential decision making problems under SAIL condition, which unifies our existing understandings of model-based RL in both fully observable and partially observable settings. SAIL condition is identified by this paper, which can be viewed as a natural generalization of Bellman/witness rank to address partial observability. This paper also presents a reward-free variant of OMLE algorithm, which learns approximate dynamic models that enable the computation of near-optimal policies for all reward functions simultaneously.\n
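The abstract names only the two ingredients of OMLE (optimism and maximum likelihood estimation). The following is a minimal sketch of that combination for a finite candidate model class, not the paper's algorithm: log_likelihood, optimal_policy_and_value, and the confidence-set slack beta are hypothetical placeholders supplied by the user.

import numpy as np

def omle_round(models, data, log_likelihood, optimal_policy_and_value, beta):
    """One round of an optimistic-MLE style model selection.
    models: finite list of candidate environment models
    data:   trajectories collected so far
    log_likelihood(m, data): log-likelihood of the data under model m
    optimal_policy_and_value(m): planner returning (policy, value) for model m
    beta:   confidence-set slack around the maximum likelihood"""
    ll = np.array([log_likelihood(m, data) for m in models])
    confidence_set = [m for m, l in zip(models, ll) if l >= ll.max() - beta]
    # optimism: among statistically plausible models, follow the most promising one
    best_policy, best_value = None, -np.inf
    for m in confidence_set:
        pi, v = optimal_policy_and_value(m)
        if v > best_value:
            best_policy, best_value = pi, v
    return best_policy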
Efficient Planning in Combinatorial Action Spaces with Applications to Cooperative Multi-Agent Reinforcement Learning. Tkachuk, V.; Bakhtiari, S. A.; Kirschner, J.; Jusup, M.; Bogunovic, I.; and Szepesvári, C. In AISTATS, pages 6342–6370, April 2023.
@inproceedings{TkaBa-CombAct23,\n  title = \t {Efficient Planning in Combinatorial Action Spaces with Applications to Cooperative Multi-Agent Reinforcement Learning},\n  author =       {Tkachuk, Volodymyr and Bakhtiari, Seyed Alireza and Kirschner, Johannes and Jusup, Matej and Bogunovic, Ilija and Szepesv\\'ari, Csaba},\n  booktitle = \t {AISTATS},\n  acceptrate= {ca. 490 out of 1689 = 29\\%},    \n  pages = \t {6342--6370},\n  year = \t {2023},\n  month = \t {04},\n  url_pdf = \t {https://proceedings.mlr.press/v206/tkachuk23a/tkachuk23a.pdf},\n  url_link = \t {https://proceedings.mlr.press/v206/tkachuk23a.html},\n  abstract = \t {A practical challenge in reinforcement learning are combinatorial action spaces that make planning computationally demanding. For example, in cooperative multi-agent reinforcement learning, a potentially large number of agents jointly optimize a global reward function, which leads to a combinatorial blow-up in the action space by the number of agents. As a minimal requirement, we assume access to an argmax oracle that allows to efficiently compute the greedy policy for any Q-function in the model class. Building on recent work in planning with local access to a simulator and linear function approximation, we propose efficient algorithms for this setting that lead to polynomial compute and query complexity in all relevant problem parameters. For the special case where the feature decomposition is additive, we further improve the bounds and extend the results to the kernelized setting with an efficient algorithm.}\n}\n\n\n
\n A practical challenge in reinforcement learning are combinatorial action spaces that make planning computationally demanding. For example, in cooperative multi-agent reinforcement learning, a potentially large number of agents jointly optimize a global reward function, which leads to a combinatorial blow-up in the action space by the number of agents. As a minimal requirement, we assume access to an argmax oracle that allows to efficiently compute the greedy policy for any Q-function in the model class. Building on recent work in planning with local access to a simulator and linear function approximation, we propose efficient algorithms for this setting that lead to polynomial compute and query complexity in all relevant problem parameters. For the special case where the feature decomposition is additive, we further improve the bounds and extend the results to the kernelized setting with an efficient algorithm.\n
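The entry above assumes access to an argmax oracle over a combinatorial (multi-agent) action space. The snippet below only illustrates the special additive case mentioned at the end of the abstract, where the joint greedy action can be computed agent by agent; the per-agent Q-values here are toy stand-ins.

import numpy as np

def greedy_joint_action(per_agent_q):
    """If Q(s, a_1..a_m) decomposes additively as sum_i Q_i(s, a_i), the joint
    argmax is the per-agent argmax, costing O(m * A) instead of O(A^m)."""
    return [int(np.argmax(q_i)) for q_i in per_agent_q]

# toy example: 3 agents, 4 actions each
rng = np.random.default_rng(0)
q_components = [rng.standard_normal(4) for _ in range(3)]
print(greedy_joint_action(q_components))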
Sample Efficient Deep Reinforcement Learning via Local Planning. Yin, D.; Thiagarajan, S.; Lazic, N.; Rajaraman, N.; Hao, B.; and Szepesvári, C. arXiv e-prints, abs/2301.12579, January 2023.
@article{YiTh-deepRL23,\n       author = {Yin, Dong and Thiagarajan, Sridhar and Lazic, Nevena and Rajaraman, Nived and Hao, Botao and Szepesv\\'ari, Csaba},\n        title = {Sample Efficient Deep Reinforcement Learning via Local Planning},\n      journal = {arXiv e-prints},\n         year = {2023},\n        month = {01},\n       url_link = {https://arxiv.org/abs/2301.12579},\n       url_pdf = {https://arxiv.org/pdf/2301.12579.pdf},\n      volume = {abs/2301.12579},\n\tentrysubtype = {unrefereed},\t\n       abstract = {The focus of this work is sample-efficient deep reinforcement learning (RL) with a simulator. One useful property of simulators is that it is typically easy to reset the environment to a previously observed state. We propose an algorithmic framework, named uncertainty-first local planning (UFLP), that takes advantage of this property. Concretely, in each data collection iteration, with some probability, our meta-algorithm resets the environment to an observed state which has high uncertainty, instead of sampling according to the initial-state distribution. The agent-environment interaction then proceeds as in the standard online RL setting. We demonstrate that this simple procedure can dramatically improve the sample cost of several baseline RL algorithms on difficult exploration tasks. Notably, with our framework, we can achieve super-human performance on the notoriously hard Atari game, Montezuma's Revenge, with a simple (distributional) double DQN. Our work can be seen as an efficient approximate implementation of an existing algorithm with theoretical guarantees, which offers an interpretation of the positive empirical results.},\n}\n\n\n
\n The focus of this work is sample-efficient deep reinforcement learning (RL) with a simulator. One useful property of simulators is that it is typically easy to reset the environment to a previously observed state. We propose an algorithmic framework, named uncertainty-first local planning (UFLP), that takes advantage of this property. Concretely, in each data collection iteration, with some probability, our meta-algorithm resets the environment to an observed state which has high uncertainty, instead of sampling according to the initial-state distribution. The agent-environment interaction then proceeds as in the standard online RL setting. We demonstrate that this simple procedure can dramatically improve the sample cost of several baseline RL algorithms on difficult exploration tasks. Notably, with our framework, we can achieve super-human performance on the notoriously hard Atari game, Montezuma's Revenge, with a simple (distributional) double DQN. Our work can be seen as an efficient approximate implementation of an existing algorithm with theoretical guarantees, which offers an interpretation of the positive empirical results.\n
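The UFLP meta-algorithm is only described in prose above; the loop below is a hedged sketch of that description, assuming a simulator with a reset_to(state) facility and a user-supplied uncertainty score. Both are placeholder assumptions, not the paper's API.

import random

def uflp_collect(env, agent, uncertainty, replay, p_reset=0.5, horizon=100):
    """One data-collection episode in the UFLP style: with probability p_reset,
    restart from a previously observed high-uncertainty state instead of the
    initial-state distribution, then interact as in standard online RL."""
    if replay and random.random() < p_reset:
        start = max(replay, key=uncertainty)   # most uncertain observed state
        state = env.reset_to(start)            # assumed simulator capability
    else:
        state = env.reset()
    for _ in range(horizon):
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        agent.observe(state, action, reward, next_state, done)
        replay.append(next_state)
        state = next_state
        if done:
            break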
Learning Lipschitz Functions by GD-trained Shallow Overparameterized ReLU Neural Networks. Kuzborskij, I.; and Szepesvári, C. arXiv e-prints, abs/2212.13848, April 2023.
@article{kusze-GDRelU23,\n      author={Ilja Kuzborskij and Csaba Szepesv\\'ari},\n      title={Learning Lipschitz Functions by GD-trained Shallow Overparameterized ReLU Neural Networks}, \n      url_link = {https://arxiv.org/abs/2212.13848},\n      url_pdf = {https://arxiv.org/pdf/2212.13848.pdf},\n      year={2023},\n      month = {04},\n      journal = {arXiv e-prints},      \n      volume = {abs/2212.13848},\n\tentrysubtype = {unrefereed},\t\n\t\tabstract = {We explore the ability of overparameterized shallow ReLU neural networks to learn Lipschitz, nondifferentiable, bounded functions with additive noise when trained by Gradient Descent (GD). To avoid the problem that in the presence of noise, neural networks trained to nearly zero training error are inconsistent in this class, we focus on the early-stopped GD which allows us to show consistency and optimal rates. In particular, we explore this problem from the viewpoint of the Neural Tangent Kernel (NTK) approximation of a GD-trained finite-width neural network. We show that whenever some early stopping rule is guaranteed to give an optimal rate (of excess risk) on the Hilbert space of the kernel induced by the ReLU activation function, the same rule can be used to achieve minimax optimal rate for learning on the class of considered Lipschitz functions by neural networks. We discuss several data-free and data-dependent practically appealing stopping rules that yield optimal rates.},\n}\n\n\n
\n We explore the ability of overparameterized shallow ReLU neural networks to learn Lipschitz, nondifferentiable, bounded functions with additive noise when trained by Gradient Descent (GD). To avoid the problem that in the presence of noise, neural networks trained to nearly zero training error are inconsistent in this class, we focus on the early-stopped GD which allows us to show consistency and optimal rates. In particular, we explore this problem from the viewpoint of the Neural Tangent Kernel (NTK) approximation of a GD-trained finite-width neural network. We show that whenever some early stopping rule is guaranteed to give an optimal rate (of excess risk) on the Hilbert space of the kernel induced by the ReLU activation function, the same rule can be used to achieve minimax optimal rate for learning on the class of considered Lipschitz functions by neural networks. We discuss several data-free and data-dependent practically appealing stopping rules that yield optimal rates.\n
Optimistic Exploration with Learned Features Provably Solves Markov Decision Processes with Neural Dynamics. Zheng, S.; Wang, L.; Qiu, S.; Fu, Z.; Yang, Z.; Szepesvári, C.; and Wang, Z. In ICLR, May 2023.
@inproceedings{zheng2023optimistic-ICLR,\n\ttitle={Optimistic Exploration with Learned Features Provably Solves Markov Decision Processes with Neural Dynamics},\n\tauthor={Sirui Zheng and Lingxiao Wang and Shuang Qiu and Zuyue Fu and Zhuoran Yang and Csaba Szepesv\\'ari and Zhaoran Wang},\n\tbooktitle={ICLR},\n\tacceptrate = {ca. 1590 out of ca. 5000 = 31.8\\%},\n\tmonth = {05},\n\tyear={2023},\n\turl_pdf={https://openreview.net/pdf?id=9kBCMNb5mc},\n\turl_link={https://openreview.net/forum?id=9kBCMNb5mc},\n\tabstract = {Incorporated with the recent advances in deep learning, deep reinforcement learning (DRL) has achieved tremendous success in empirical study. However, analyzing DRL is still challenging due to the complexity of the neural network class. In this paper, we address such a challenge by analyzing the Markov decision process (MDP) with neural dynamics, which covers several existing models as special cases, including the kernelized nonlinear regulator (KNR) model and the linear MDP. We propose a novel algorithm that designs exploration incentives via learnable representations of the dynamics model by embedding the neural dynamics into a kernel space induced by the system noise. We further establish an upper bound on the sample complexity of the algorithm, which demonstrates the sample efficiency of the algorithm. We highlight that, unlike previous analyses of RL algorithms with function approximation, our bound on the sample complexity does not depend on the Eluder dimension of the neural network class, which is known to be exponentially large (Dong et al., 2021).},\n}\n\n\n
\n Incorporated with the recent advances in deep learning, deep reinforcement learning (DRL) has achieved tremendous success in empirical study. However, analyzing DRL is still challenging due to the complexity of the neural network class. In this paper, we address such a challenge by analyzing the Markov decision process (MDP) with neural dynamics, which covers several existing models as special cases, including the kernelized nonlinear regulator (KNR) model and the linear MDP. We propose a novel algorithm that designs exploration incentives via learnable representations of the dynamics model by embedding the neural dynamics into a kernel space induced by the system noise. We further establish an upper bound on the sample complexity of the algorithm, which demonstrates the sample efficiency of the algorithm. We highlight that, unlike previous analyses of RL algorithms with function approximation, our bound on the sample complexity does not depend on the Eluder dimension of the neural network class, which is known to be exponentially large (Dong et al., 2021).\n
Revisiting Simple Regret: Fast Rates for Returning a Good Arm. Zhao, Y.; Stephens, C.; Szepesvári, C.; and Jun, K. In ICML, pages 42110–42158, July 2023.
@inproceedings{YaoCoSzKS-ICML23,\n  title = \t {Revisiting Simple Regret: Fast Rates for Returning a Good Arm},\n  author =       {Zhao, Yao and Stephens, Connor and Szepesv{\\'a}ri, Csaba and Jun, Kwang-Sung},\n  booktitle = \t {ICML},\n  pages = \t {42110--42158},\n  crossref  = {ICML2023},\n  year = {2023},\n  month = {07},\n  url_link = \t {https://proceedings.mlr.press/v202/zhao23g.html},\n  url_pdf = \t {https://proceedings.mlr.press/v202/zhao23g/zhao23g.pdf},\n  abstract = \t {Simple regret is a natural and parameter-free performance criterion for pure exploration in multi-armed bandits yet is less popular than the probability of missing the best arm or an $\\epsilon$-good arm, perhaps due to lack of easy ways to characterize it. In this paper, we make a significant progress on minimizing simple regret in both data-rich ($T\\ge n$) and data-poor regime ($T \\le n$) where $n$ is the number of arms and $T$ is the number of samples. At its heart is our improved instance-dependent analysis of the well-known Sequential Halving (SH) algorithm where we bound the probability of returning an arm whose mean reward is not within $\\epsilon$ from the best (i.e., not $\\epsilon$-good) for <em>any</em> choice of $\\epsilon\\ge 0$, although $\\epsilon$ is not an input to SH. Our bound not only leads to an optimal worst-case simple regret bound of $\\sqrt{n/T}$ up to logarithmic factors but also essentially matches the instance-dependent lower bound for returning an $\\epsilon$-good arm reported by Katz-Samuels and Jamieson (2020). For the more challenging data-poor regime, we propose Bracketing SH (BSH) that enjoys the same improvement even without sampling each arm at least once. Our empirical study shows that BSH outperforms existing methods on real-world tasks.}\n}\n\n\n
Simple regret is a natural and parameter-free performance criterion for pure exploration in multi-armed bandits, yet it is less popular than the probability of missing the best arm or an $\epsilon$-good arm, perhaps due to a lack of easy ways to characterize it. In this paper, we make significant progress on minimizing simple regret in both the data-rich ($T \ge n$) and data-poor ($T \le n$) regimes, where $n$ is the number of arms and $T$ is the number of samples. At its heart is our improved instance-dependent analysis of the well-known Sequential Halving (SH) algorithm, where we bound the probability of returning an arm whose mean reward is not within $\epsilon$ of the best (i.e., not $\epsilon$-good) for any choice of $\epsilon \ge 0$, although $\epsilon$ is not an input to SH. Our bound not only leads to an optimal worst-case simple regret bound of $\sqrt{n/T}$ up to logarithmic factors but also essentially matches the instance-dependent lower bound for returning an $\epsilon$-good arm reported by Katz-Samuels and Jamieson (2020). For the more challenging data-poor regime, we propose Bracketing SH (BSH) that enjoys the same improvement even without sampling each arm at least once. Our empirical study shows that BSH outperforms existing methods on real-world tasks.
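Sequential Halving, whose analysis this paper refines, is simple enough to state in a few lines. Below is the textbook version (uniform sampling of the surviving arms, halving each round), not the Bracketing SH variant proposed in the paper; the budget split is the standard one.

import numpy as np

def sequential_halving(pull, n_arms, budget):
    """Return the recommended arm after spending roughly `budget` pulls.
    pull(i) samples a reward from arm i."""
    arms = list(range(n_arms))
    n_rounds = int(np.ceil(np.log2(n_arms)))
    for _ in range(n_rounds):
        pulls_per_arm = max(1, budget // (len(arms) * n_rounds))
        means = [np.mean([pull(i) for _ in range(pulls_per_arm)]) for i in arms]
        order = np.argsort(means)[::-1]                            # best empirical arms first
        arms = [arms[j] for j in order[:max(1, len(arms) // 2)]]   # keep top half
    return arms[0]

# toy usage: Bernoulli arms
rng = np.random.default_rng(1)
means = [0.2, 0.5, 0.45, 0.3]
print(sequential_halving(lambda i: float(rng.random() < means[i]), len(means), budget=2000))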
Stochastic Gradient Succeeds for Bandits. Mei, J.; Zhong, Z.; Dai, B.; Agarwal, A.; Szepesvári, C.; and Schuurmans, D. In ICML, pages 24325–24360, July 2023.
@inproceedings{MeiZi-ICML23,\n  title = \t {Stochastic Gradient Succeeds for Bandits},\n  author =       {Mei, Jincheng and Zhong, Zixin and Dai, Bo and Agarwal, Alekh and Szepesv{\\'a}ri, Csaba and Schuurmans, Dale},\n  booktitle = \t {ICML},\n  pages = \t {24325--24360},\n  crossref  = {ICML2023},\n  year = {2023},\n  month = {07},\n  url_link = \t {https://proceedings.mlr.press/v202/mei23a.html},\n  url_pdf = \t {https://proceedings.mlr.press/v202/mei23a/mei23a.pdf},\n  abstract = \t {We show that the stochastic gradient bandit algorithm converges to a globally optimal policy at an $O(1/t)$ rate, even with a constant step size. Remarkably, global convergence of the stochastic gradient bandit algorithm has not been previously established, even though it is an old algorithm known to be applicable to bandits. The new result is achieved by establishing two novel technical findings: first, the noise of the stochastic updates in the gradient bandit algorithm satisfies a strong ``growth condition'' property, where the variance diminishes whenever progress becomes small, implying that additional noise control via diminishing step sizes is unnecessary; second, a form of ``weak exploration'' is automatically achieved through the stochastic gradient updates, since they prevent the action probabilities from decaying faster than $O(1/t)$, thus ensuring that every action is sampled infinitely often with probability $1$. These two findings can be used to show that the stochastic gradient update is already ``sufficient'' for bandits in the sense that exploration versus exploitation is automatically balanced in a manner that ensures almost sure convergence to a global optimum. These novel theoretical findings are further verified by experimental results.}\n}\n\n\n
\n We show that the stochastic gradient bandit algorithm converges to a globally optimal policy at an $O(1/t)$ rate, even with a constant step size. Remarkably, global convergence of the stochastic gradient bandit algorithm has not been previously established, even though it is an old algorithm known to be applicable to bandits. The new result is achieved by establishing two novel technical findings: first, the noise of the stochastic updates in the gradient bandit algorithm satisfies a strong ``growth condition'' property, where the variance diminishes whenever progress becomes small, implying that additional noise control via diminishing step sizes is unnecessary; second, a form of ``weak exploration'' is automatically achieved through the stochastic gradient updates, since they prevent the action probabilities from decaying faster than $O(1/t)$, thus ensuring that every action is sampled infinitely often with probability $1$. These two findings can be used to show that the stochastic gradient update is already ``sufficient'' for bandits in the sense that exploration versus exploitation is automatically balanced in a manner that ensures almost sure convergence to a global optimum. These novel theoretical findings are further verified by experimental results.\n
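The stochastic gradient bandit algorithm analyzed here is the classical softmax policy-gradient update; a minimal version with the constant step size studied in the paper looks as follows. The baseline-free REINFORCE-style gradient estimate below is one common variant and an assumption, not necessarily the paper's exact estimator.

import numpy as np

def softmax(theta):
    z = np.exp(theta - theta.max())
    return z / z.sum()

def gradient_bandit(reward_fn, n_arms, steps=10_000, eta=0.1, seed=0):
    """Stochastic gradient bandit with softmax policy and constant step size.
    Uses the unbiased gradient estimate r * (e_a - pi) of the expected reward."""
    rng = np.random.default_rng(seed)
    theta = np.zeros(n_arms)
    for _ in range(steps):
        pi = softmax(theta)
        a = rng.choice(n_arms, p=pi)
        r = reward_fn(a, rng)
        grad = -r * pi
        grad[a] += r
        theta += eta * grad            # constant step size, as studied in the paper
    return softmax(theta)

# toy usage: Bernoulli arms with means 0.3 / 0.7 / 0.5
means = np.array([0.3, 0.7, 0.5])
print(gradient_bandit(lambda a, rng: float(rng.random() < means[a]), 3))  # mass concentrates on arm 1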
The Optimal Approximation Factors in Misspecified Off-Policy Value Function Estimation. Amortila, P.; Jiang, N.; and Szepesvári, C. In ICML, pages 768–790, July 2023.
@inproceedings{AnNanSz-ICML23,\n  title = \t {The Optimal Approximation Factors in Misspecified Off-Policy Value Function Estimation},\n  author =       {Amortila, Philip and Jiang, Nan and Szepesv{\\'a}ri, Csaba},\n  booktitle = \t {ICML},\n  pages = \t {768--790},\n  crossref  = {ICML2023},\n  year = {2023},\n  month = {07},\n  url_link = \t {https://proceedings.mlr.press/v202/amortila23a.html},\n  url_pdf = \t {https://proceedings.mlr.press/v202/amortila23a/amortila23a.pdf},\n  abstract = \t {Theoretical guarantees in reinforcement learning (RL) are known to suffer multiplicative blow-up factors with respect to the misspecification error of function approximation. Yet, the nature of such <em>approximation factors</em> -- especially their optimal form in a given learning problem -- is poorly understood. In this paper we study this question in linear off-policy value function estimation, where many open questions remain. We study the approximation factor in a broad spectrum of settings, such as presence vs. absence of state aliasing and full vs. partial coverage of the state space. Our core results include instance-dependent upper bounds on the approximation factors with respect to both the weighted $L_2$-norm (where the weighting is the offline state distribution) and the $L_\\infty$ norm. We show that these approximation factors are optimal (in an instance-dependent sense) for a number of these settings. In other cases, we show that the instance-dependent parameters which appear in the upper bounds are necessary, and that the finiteness of either alone cannot guarantee a finite approximation factor even in the limit of infinite data.}\n}\n\n\n
\n Theoretical guarantees in reinforcement learning (RL) are known to suffer multiplicative blow-up factors with respect to the misspecification error of function approximation. Yet, the nature of such approximation factors – especially their optimal form in a given learning problem – is poorly understood. In this paper we study this question in linear off-policy value function estimation, where many open questions remain. We study the approximation factor in a broad spectrum of settings, such as presence vs. absence of state aliasing and full vs. partial coverage of the state space. Our core results include instance-dependent upper bounds on the approximation factors with respect to both the weighted $L_2$-norm (where the weighting is the offline state distribution) and the $L_∞$ norm. We show that these approximation factors are optimal (in an instance-dependent sense) for a number of these settings. In other cases, we show that the instance-dependent parameters which appear in the upper bounds are necessary, and that the finiteness of either alone cannot guarantee a finite approximation factor even in the limit of infinite data.\n
Regularization and Variance-Weighted Regression Achieves Minimax Optimality in Linear MDPs: Theory and Practice. Kitamura, T.; Kozuno, T.; Tang, Y.; Vieillard, N.; Valko, M.; Yang, W.; Mei, J.; Menard, P.; Gheshlaghi Azar, M.; Munos, R.; Pietquin, O.; Geist, M.; Szepesvári, C.; Kumagai, W.; and Matsuo, Y. In ICML, pages 17135–17175, July 2023.
@inproceedings{ToKoICML23,\n  title = {Regularization and Variance-Weighted Regression Achieves Minimax Optimality in Linear MDPs: Theory and Practice},\n  author = {Kitamura, Toshinori and Kozuno, Tadashi and Tang, Yunhao and Vieillard, Nino and Valko, Michal and Yang, Wenhao and Mei, Jincheng and Menard, Pierre and Gheshlaghi Azar, Mohammad and Munos, Remi and Pietquin, Olivier and Geist, Matthieu and Szepesv{\\'a}ri, Csaba and Kumagai, Wataru and Matsuo, Yutaka},\n  booktitle = {ICML},\n  pages = \t{17135--17175},\n  crossref  = {ICML2023},\n  year = {2023},\n  month = {07},\n  url_pdf= \t {https://proceedings.mlr.press/v202/kitamura23a/kitamura23a.pdf},\n  url_link = \t {https://proceedings.mlr.press/v202/kitamura23a.html},\n  abstract = \t {Mirror descent value iteration (MDVI), an abstraction of Kullback-Leibler (KL) and entropy-regularized reinforcement learning (RL), has served as the basis for recent high-performing practical RL algorithms. However, despite the use of function approximation in practice, the theoretical understanding of MDVI has been limited to tabular Markov decision processes (MDPs). We study MDVI with linear function approximation through its sample complexity required to identify an $\\varepsilon$-optimal policy with probability $1-\\delta$ under the settings of an infinite-horizon linear MDP, generative model, and G-optimal design. We demonstrate that least-squares regression weighted by the variance of an estimated optimal value function of the next state is crucial to achieving minimax optimality. Based on this observation, we present Variance-Weighted Least-Squares MDVI (VWLS-MDVI), the first theoretical algorithm that achieves nearly minimax optimal sample complexity for infinite-horizon linear MDPs. Furthermore, we propose a practical VWLS algorithm for value-based deep RL, Deep Variance Weighting (DVW). Our experiments demonstrate that DVW improves the performance of popular value-based deep RL algorithms on a set of MinAtar benchmarks.}\n}\n\n\n
\n
\n\n\n
\n Mirror descent value iteration (MDVI), an abstraction of Kullback-Leibler (KL) and entropy-regularized reinforcement learning (RL), has served as the basis for recent high-performing practical RL algorithms. However, despite the use of function approximation in practice, the theoretical understanding of MDVI has been limited to tabular Markov decision processes (MDPs). We study MDVI with linear function approximation through its sample complexity required to identify an $\varepsilon$-optimal policy with probability $1-\delta$ under the settings of an infinite-horizon linear MDP, generative model, and G-optimal design. We demonstrate that least-squares regression weighted by the variance of an estimated optimal value function of the next state is crucial to achieving minimax optimality. Based on this observation, we present Variance-Weighted Least-Squares MDVI (VWLS-MDVI), the first theoretical algorithm that achieves nearly minimax optimal sample complexity for infinite-horizon linear MDPs. Furthermore, we propose a practical VWLS algorithm for value-based deep RL, Deep Variance Weighting (DVW). Our experiments demonstrate that DVW improves the performance of popular value-based deep RL algorithms on a set of MinAtar benchmarks.\n
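For orientation only: the variance-weighted regression step highlighted above reduces, in its simplest form, to weighted least squares with weights inversely proportional to estimated target variances. A minimal numpy sketch of that building block (not the authors' VWLS-MDVI; `Phi`, `y` and `var` are hypothetical placeholders for features, regression targets and variance estimates):

import numpy as np

def variance_weighted_ls(Phi, y, var, reg=1e-6):
    # Weight each sample by the inverse of its estimated target variance,
    # then solve the regularized normal equations.
    w = 1.0 / np.maximum(var, 1e-8)          # guard against division by zero
    A = Phi.T @ (w[:, None] * Phi) + reg * np.eye(Phi.shape[1])
    b = Phi.T @ (w * y)
    return np.linalg.solve(A, b)

# toy usage with random data
rng = np.random.default_rng(0)
Phi = rng.normal(size=(100, 5))
y = Phi @ np.ones(5) + rng.normal(size=100)
var = rng.uniform(0.5, 2.0, size=100)
theta = variance_weighted_ls(Phi, y, var)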
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2022\n \n \n (13)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Towards painless policy optimization for constrained MDPs.\n \n \n \n \n\n\n \n Jain, A.; Vaswani, S.; Babanezhad, R.; Szepesvári, C.; and Precup, D.\n\n\n \n\n\n\n In UAI, pages 895–905, 08 2022. \n \n\n\n\n
\n\n\n\n \n \n \"Towards link\n  \n \n \n \"Towards pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{JaVa-UAI22,\n  title = \t {Towards painless policy optimization for constrained MDPs},\n  author =       {Jain, Arushi and Vaswani, Sharan and Babanezhad, Reza and Szepesv\\'ari, Csaba and Precup, Doina},\n  booktitle = \t {UAI},\n  pages = \t {895--905},\n  acceptrate= {230 out of 712 = 32\\%},  \n  year = \t {2022},\n  month = \t {08},\n  url_link = \t {https://proceedings.mlr.press/v180/jain22a.html},\n  url_pdf = \t {https://proceedings.mlr.press/v180/jain22a/jain22a.pdf},\n  abstract = \t {We study policy optimization in an infinite horizon, $\\gamma$-discounted constrained Markov decision process (CMDP). Our objective is to return a policy that achieves large expected reward with a small constraint violation. We consider the online setting with linear function approximation and assume global access to the corresponding features. We propose a generic primal-dual framework that allows us to bound the reward sub-optimality and constraint violation for arbitrary algorithms in terms of their primal and dual regret on online linear optimization problems. We instantiate this framework to use coin-betting algorithms and propose the <em>Coin Betting Politex (CBP)</em> algorithm. Assuming that the action-value functions are $\\epsilon_{\\text{b}}$-close to the span of the $d$-dimensional state-action features and no sampling errors, we prove that $T$ iterations of CBP result in an $O(\\frac{1}{(1 - \\gamma)^3 \\sqrt{T}} + \\frac{\\epsilon_{\\text{b}} \\sqrt{d}}{(1 - \\gamma)^2} )$ reward sub-optimality and an $O(\\frac{1}{(1 - \\gamma)^2 \\sqrt{T}} + \\frac{\\epsilon_{\\text{b}} \\sqrt{d}}{1 - \\gamma} )$ constraint violation. Importantly, unlike gradient descent-ascent and other recent methods, CBP does not require extensive hyperparameter tuning. Via experiments on synthetic and Cartpole environments, we demonstrate the effectiveness and robustness of CBP.}\n}\n\n\n
\n
\n\n\n
\n We study policy optimization in an infinite horizon, $\gamma$-discounted constrained Markov decision process (CMDP). Our objective is to return a policy that achieves large expected reward with a small constraint violation. We consider the online setting with linear function approximation and assume global access to the corresponding features. We propose a generic primal-dual framework that allows us to bound the reward sub-optimality and constraint violation for arbitrary algorithms in terms of their primal and dual regret on online linear optimization problems. We instantiate this framework to use coin-betting algorithms and propose the Coin Betting Politex (CBP) algorithm. Assuming that the action-value functions are $\epsilon_{\text{b}}$-close to the span of the $d$-dimensional state-action features and no sampling errors, we prove that $T$ iterations of CBP result in an $O(\frac{1}{(1 - \gamma)^3 \sqrt{T}} + \frac{\epsilon_{\text{b}} \sqrt{d}}{(1 - \gamma)^2})$ reward sub-optimality and an $O(\frac{1}{(1 - \gamma)^2 \sqrt{T}} + \frac{\epsilon_{\text{b}} \sqrt{d}}{1 - \gamma})$ constraint violation. Importantly, unlike gradient descent-ascent and other recent methods, CBP does not require extensive hyperparameter tuning. Via experiments on synthetic and Cartpole environments, we demonstrate the effectiveness and robustness of CBP.\n
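A minimal sketch of the generic primal-dual template described above, using plain projected gradient ascent on the Lagrange multiplier instead of the coin-betting update that CBP actually employs; `policy_update` and `constraint_value` are hypothetical callbacks supplied by the user.

def primal_dual_cmdp(policy_update, constraint_value, budget, T, lam_lr=0.1):
    """Generic primal-dual loop for a constrained MDP.

    Each round, the primal player improves the policy against the
    Lagrangian reward r + lam * c; the dual variable lam is then nudged
    up whenever the constraint value falls short of the required budget.
    This is a template for illustration, not the CBP algorithm itself.
    """
    lam, policy = 0.0, None
    for _ in range(T):
        policy = policy_update(policy, lam)              # primal step
        violation = budget - constraint_value(policy)    # > 0 means infeasible
        lam = max(0.0, lam + lam_lr * violation)         # projected dual step
    return policy, lam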
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Near-Optimal Sample Complexity Bounds for Constrained MDPs.\n \n \n \n \n\n\n \n Vaswani, S.; Yang, L. F.; and Szepesvári, C.\n\n\n \n\n\n\n In NeurIPS, 11 2022. \n \n\n\n\n
\n\n\n\n \n \n \"Near-Optimal link\n  \n \n \n \"Near-Optimal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{vayasz22,\ntitle = {Near-Optimal Sample Complexity Bounds for Constrained MDPs},\nauthor = {Vaswani, Sharan and Yang, Lin F.  and Szepesv{\\'a}ri, Csaba},\nbooktitle = {NeurIPS},\nacceptrate= {2665 out of 10411 = 25.6\\%},\nmonth = {11},\nyear = {2022},\nurl_link={},\nurl_paper = {NeurIPS2022_CMDP.pdf},\nabstract = {In contrast to the advances in characterizing the sample complexity for solving Markov decision processes (MDPs), the optimal statistical complexity for solving constrained MDPs (CMDPs) remains unknown. We resolve this question by providing <em>minimax</em> upper and lower bounds on the sample complexity for learning near-optimal policies in a discounted CMDP with access to a generative model (simulator). In particular, we design a model-based algorithm that addresses two settings: (i) <em>relaxed feasibility</em>,  where small constraint violations are allowed, and (ii) <em>strict feasibility</em>, where the output policy is required to satisfy the constraint. For (i), we prove that our algorithm returns an $\\epsilon$-optimal policy with probability $1 - \\delta$, by making  $\\tilde{O}(\\frac{S A \\log(1/\\delta)}{(1 - \\gamma)^3 \\epsilon^2})$ queries to the generative model, thus matching the sample-complexity for unconstrained MDPs. For (ii), we show that the algorithm's sample complexity is upper-bounded by $\\tilde{O} (\\frac{S A \\log(1/\\delta)}{(1 - \\gamma)^5  \\epsilon^2 \\zeta^2} )$ where $\\zeta$ is the problem-dependent Slater constant that characterizes the size of the feasible region. Finally, we prove a matching lower-bound for the strict feasibility setting, thus obtaining the first near minimax optimal bounds for discounted CMDPs. Our results show that learning CMDPs is as easy as MDPs when small constraint violations are allowed, but inherently more difficult when we demand zero constraint violation.},\nkeywords = {reinforcement learning, MDPs, constrained MDPs, sample complexity, primal-dual methods}\n}\n\n\n
\n
\n\n\n
\n In contrast to the advances in characterizing the sample complexity for solving Markov decision processes (MDPs), the optimal statistical complexity for solving constrained MDPs (CMDPs) remains unknown. We resolve this question by providing minimax upper and lower bounds on the sample complexity for learning near-optimal policies in a discounted CMDP with access to a generative model (simulator). In particular, we design a model-based algorithm that addresses two settings: (i) relaxed feasibility, where small constraint violations are allowed, and (ii) strict feasibility, where the output policy is required to satisfy the constraint. For (i), we prove that our algorithm returns an $\epsilon$-optimal policy with probability $1 - \delta$, by making $\tilde{O}(\frac{S A \log(1/\delta)}{(1 - \gamma)^3 \epsilon^2})$ queries to the generative model, thus matching the sample-complexity for unconstrained MDPs. For (ii), we show that the algorithm's sample complexity is upper-bounded by $\tilde{O}(\frac{S A \log(1/\delta)}{(1 - \gamma)^5 \epsilon^2 \zeta^2})$ where $\zeta$ is the problem-dependent Slater constant that characterizes the size of the feasible region. Finally, we prove a matching lower-bound for the strict feasibility setting, thus obtaining the first near minimax optimal bounds for discounted CMDPs. Our results show that learning CMDPs is as easy as MDPs when small constraint violations are allowed, but inherently more difficult when we demand zero constraint violation.\n
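For a sense of scale (an illustrative calculation with numbers chosen here, not taken from the paper), the strict-feasibility bound exceeds the relaxed-feasibility one by the factor
\[
\frac{\tilde{O}\big(S A \log(1/\delta)/((1-\gamma)^5\epsilon^2\zeta^2)\big)}
     {\tilde{O}\big(S A \log(1/\delta)/((1-\gamma)^3\epsilon^2)\big)}
= \frac{1}{(1-\gamma)^2\zeta^2},
\]
which for $\gamma = 0.99$ and $\zeta = 0.1$ is already $10^4 \cdot 10^2 = 10^6$.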
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Confident Approximate Policy Iteration for Efficient Local Planning in $q^π$-realizable MDPs.\n \n \n \n \n\n\n \n Weisz, G.; György, A.; Kozuno, T.; and Szepesvári, C.\n\n\n \n\n\n\n In NeurIPS, 11 2022. \n \n\n\n\n
\n\n\n\n \n \n \"Confident link\n  \n \n \n \"Confident paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{wegysz22,\ntitle = {Confident Approximate Policy Iteration for Efficient Local Planning in $q^\\pi$-realizable MDPs},\nauthor = {Weisz, Gell\\'ert and Gy\\"orgy, Andr\\'as and Kozuno, Tadashi and Szepesv{\\'a}ri, Csaba},\nbooktitle = {NeurIPS},\nacceptrate= {2665 out of 10411 = 25.6\\%},\nmonth = {11},\nyear = {2022},\nurl_link={},\nurl_paper = {NeurIPS2022_CAPI.pdf},\nabstract = {We consider approximate dynamic programming in $\\gamma$-discounted Markov decision processes and apply it to approximate planning with linear value-function approximation. Our first contribution is a new variant of approximate policy iteration (API), called confident approximate policy iteration (CAPI), which computes a deterministic stationary policy with an optimal error bound scaling linearly with the product of the effective horizon $H$ and the worst-case approximation error $\\varepsilon$ of the action-value functions of stationary policies. This improvement over API (whose error scales with $H^2$) comes at the price of an $H$-fold increase in memory cost. Unlike Scherrer and Lesner [2012], who recommended computing a non-stationary policy to achieve a similar improvement (with the same memory overhead), we are able to stick to stationary policies. This allows for our second contribution, the application of CAPI to planning with local access to a simulator and $d$-dimensional linear function approximation. As such, we design a planning algorithm that applies CAPI to obtain a sequence of policies with successively refined accuracies on a dynamically evolving set of states. The algorithm outputs an $\\tilde{O}( \\sqrt{dH\\varepsilon})$-optimal policy after issuing $\\tilde{O}(dH^4/\\varepsilon^2)$ queries to the simulator, simultaneously achieving the optimal accuracy bound and the best known query complexity bound, while earlier algorithms in the literature achieve only one of them. This query complexity is shown to be tight in all parameters except $H$. These improvements come at the expense of a mild (polynomial) increase in memory and computational costs of both the algorithm and its output policy.},\nkeywords = {reinforcement learning, policy iteration, local planning, simulators, MDPs, linear function approximation}\n}\n\n\n
\n
\n\n\n
\n We consider approximate dynamic programming in $\gamma$-discounted Markov decision processes and apply it to approximate planning with linear value-function approximation. Our first contribution is a new variant of approximate policy iteration (API), called confident approximate policy iteration (CAPI), which computes a deterministic stationary policy with an optimal error bound scaling linearly with the product of the effective horizon $H$ and the worst-case approximation error $\varepsilon$ of the action-value functions of stationary policies. This improvement over API (whose error scales with $H^2$) comes at the price of an $H$-fold increase in memory cost. Unlike Scherrer and Lesner [2012], who recommended computing a non-stationary policy to achieve a similar improvement (with the same memory overhead), we are able to stick to stationary policies. This allows for our second contribution, the application of CAPI to planning with local access to a simulator and $d$-dimensional linear function approximation. As such, we design a planning algorithm that applies CAPI to obtain a sequence of policies with successively refined accuracies on a dynamically evolving set of states. The algorithm outputs an $\tilde{O}(\sqrt{dH\varepsilon})$-optimal policy after issuing $\tilde{O}(dH^4/\varepsilon^2)$ queries to the simulator, simultaneously achieving the optimal accuracy bound and the best known query complexity bound, while earlier algorithms in the literature achieve only one of them. This query complexity is shown to be tight in all parameters except $H$. These improvements come at the expense of a mild (polynomial) increase in memory and computational costs of both the algorithm and its output policy.\n
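For context, a bare-bones approximate policy iteration loop on a tabular MDP, with the evaluation error $\varepsilon$ modeled as additive noise. This is generic API shown for illustration, not CAPI (no confidence-based state handling, no refined error propagation); `P` and `R` are hypothetical inputs.

import numpy as np

def approximate_policy_iteration(P, R, gamma, n_iters=50, eval_noise=0.0, seed=0):
    """Plain approximate policy iteration on a tabular MDP.

    P: transition tensor of shape (S, A, S); R: rewards of shape (S, A).
    eval_noise plays the role of the approximation error discussed above.
    """
    rng = np.random.default_rng(seed)
    S, A, _ = P.shape
    pi = np.zeros(S, dtype=int)
    for _ in range(n_iters):
        P_pi = P[np.arange(S), pi]                  # (S, S) transitions under pi
        r_pi = R[np.arange(S), pi]                  # (S,) rewards under pi
        v = np.linalg.solve(np.eye(S) - gamma * P_pi, r_pi)   # exact evaluation
        q = R + gamma * P @ v                       # (S, A) action values
        q += eval_noise * rng.normal(size=q.shape)  # inject approximation error
        pi = q.argmax(axis=1)                       # greedy improvement
    return pi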
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The Role of Baselines in Policy Gradient Optimization.\n \n \n \n \n\n\n \n Mei, J.; Chung, W.; Thomas, V.; Dai, B.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In NeurIPS, 11 2022. \n \n\n\n\n
\n\n\n\n \n \n \"The link\n  \n \n \n \"The pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{mei2022the,\ntitle={The Role of Baselines in Policy Gradient Optimization},\nauthor={Jincheng Mei and Wesley Chung and Valentin Thomas and Bo Dai and Csaba Szepesv{\\'a}ri and Dale Schuurmans},\nbooktitle={NeurIPS},\nacceptrate= {2665 out of 10411 = 25.6\\%},\nmonth = {11},\nyear={2022},\nurl_link={https://openreview.net/forum?id=XzeTJBq1Ce2},\nurl_pdf={https://openreview.net/pdf?id=XzeTJBq1Ce2},\nabstract = {We study the effect of baselines in on-policy stochastic policy gradient optimization, and close the gap between the theory and practice of policy optimization methods. Our first contribution is to show that the <em>state value</em> baseline allows on-policy stochastic <em>natural</em> policy gradient (NPG) to converge to a globally optimal policy at an  rate, which was not previously known. The analysis relies on two novel findings: the expected progress of the NPG update satisfies a stochastic version of the non-uniform \\L{}ojasiewicz (N\\L{}) inequality, and with probability one the state value baseline prevents the optimal action's probability from vanishing, thus ensuring sufficient exploration. Importantly, these results provide a new understanding of the role of baselines in stochastic policy gradient: by showing that the variance of natural policy gradient estimates remains unbounded with or without a baseline, we find that variance reduction <em>cannot</em> explain their utility in this setting. Instead, the analysis reveals that the primary effect of the value baseline is to <strong>reduce the aggressiveness of the updates</strong> rather than their variance. That is, we demonstrate that a finite variance is <em>not necessary</em> for almost sure convergence of stochastic NPG, while controlling update aggressiveness is both necessary and sufficient. Additional experimental results verify these theoretical findings.},\nkeywords = {reinforcement learning, policy gradient}\n}\n\n\n
\n
\n\n\n
\n We study the effect of baselines in on-policy stochastic policy gradient optimization, and close the gap between the theory and practice of policy optimization methods. Our first contribution is to show that the state value baseline allows on-policy stochastic natural policy gradient (NPG) to converge to a globally optimal policy at an rate, which was not previously known. The analysis relies on two novel findings: the expected progress of the NPG update satisfies a stochastic version of the non-uniform Łojasiewicz (NŁ) inequality, and with probability one the state value baseline prevents the optimal action's probability from vanishing, thus ensuring sufficient exploration. Importantly, these results provide a new understanding of the role of baselines in stochastic policy gradient: by showing that the variance of natural policy gradient estimates remains unbounded with or without a baseline, we find that variance reduction cannot explain their utility in this setting. Instead, the analysis reveals that the primary effect of the value baseline is to reduce the aggressiveness of the updates rather than their variance. That is, we demonstrate that a finite variance is not necessary for almost sure convergence of stochastic NPG, while controlling update aggressiveness is both necessary and sufficient. Additional experimental results verify these theoretical findings.\n
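An illustrative toy, not the paper's natural-policy-gradient analysis: vanilla REINFORCE on a multi-armed bandit with a softmax parameterization and an optional running-average baseline, which is enough to experiment with the update-aggressiveness-versus-variance question discussed above. All names and constants here are hypothetical.

import numpy as np

def softmax(theta):
    z = np.exp(theta - theta.max())
    return z / z.sum()

def reinforce_bandit(mean_rewards, use_baseline, T=5000, lr=0.1, seed=0):
    """REINFORCE on a K-armed bandit, with or without a value baseline."""
    rng = np.random.default_rng(seed)
    K = len(mean_rewards)
    theta, baseline = np.zeros(K), 0.0
    for _ in range(T):
        pi = softmax(theta)
        a = rng.choice(K, p=pi)
        r = mean_rewards[a] + rng.normal(scale=0.1)
        adv = r - (baseline if use_baseline else 0.0)
        grad = -pi * adv
        grad[a] += adv                     # grad log pi(a) = e_a - pi
        theta += lr * grad
        baseline += 0.05 * (r - baseline)  # running average of rewards
    return softmax(theta)

# e.g. reinforce_bandit(np.array([1.0, 0.5, 0.2]), use_baseline=True)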
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Sample-Efficient Reinforcement Learning of Partially Observable Markov Games.\n \n \n \n \n\n\n \n Liu, Q.; Szepesvári, C.; and Jin, C.\n\n\n \n\n\n\n In NeurIPS, 11 2022. \n \n\n\n\n
\n\n\n\n \n \n \"Sample-Efficient link\n  \n \n \n \"Sample-Efficient pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{liu2022sampleefficient,\ntitle={Sample-Efficient Reinforcement Learning of Partially Observable Markov Games},\nauthor={Qinghua Liu and Csaba Szepesv{\\'a}ri and Chi Jin},\nbooktitle={NeurIPS},\nacceptrate= {2665 out of 10411 = 25.6\\%},\nmonth = {11},\nyear={2022},\nurl_link={https://openreview.net/forum?id=HnIQrSY7vPI},\nurl_pdf={https://openreview.net/pdf?id=HnIQrSY7vPI},\nabstract = {This paper considers the challenging tasks of Multi-Agent Reinforcement Learning (MARL) under partial observability, where each agent only sees her own individual observations and actions that reveal incomplete information about the underlying state of system. This paper studies these tasks under the general model of multiplayer general-sum Partially Observable Markov Games (POMGs), which is significantly larger than the standard model of Imperfect Information Extensive-Form Games (IIEFGs). We identify a rich subclass of POMGs---weakly revealing POMGs---in which sample-efficient learning is tractable. In the self-play setting, we prove that a simple algorithm combining optimism and Maximum Likelihood Estimation (MLE) is sufficient to find approximate Nash equilibria, correlated equilibria, as well as coarse correlated equilibria of weakly revealing POMGs, in a polynomial number of samples when the number of agents is small. In the setting of playing against adversarial opponents, we show that a variant of our optimistic MLE algorithm is capable of achieving sublinear regret when being compared against the optimal maximin policies. To our best knowledge, this work provides the first line of sample-efficient results for learning POMGs.},\nkeywords = {reinforcement learning, partial observability, Markov games, sample complexity, finite model}\n}\n\n\n
\n
\n\n\n
\n This paper considers the challenging tasks of Multi-Agent Reinforcement Learning (MARL) under partial observability, where each agent only sees her own individual observations and actions that reveal incomplete information about the underlying state of the system. This paper studies these tasks under the general model of multiplayer general-sum Partially Observable Markov Games (POMGs), which is significantly larger than the standard model of Imperfect Information Extensive-Form Games (IIEFGs). We identify a rich subclass of POMGs—weakly revealing POMGs—in which sample-efficient learning is tractable. In the self-play setting, we prove that a simple algorithm combining optimism and Maximum Likelihood Estimation (MLE) is sufficient to find approximate Nash equilibria, correlated equilibria, as well as coarse correlated equilibria of weakly revealing POMGs, in a polynomial number of samples when the number of agents is small. In the setting of playing against adversarial opponents, we show that a variant of our optimistic MLE algorithm is capable of achieving sublinear regret when compared against the optimal maximin policies. To the best of our knowledge, this work provides the first line of sample-efficient results for learning POMGs.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bandit Theory and Thompson Sampling-Guided Directed Evolution for Sequence Optimization.\n \n \n \n \n\n\n \n Yuan, H.; Ni, C.; Wang, H.; Zhang, X.; Cong, L.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In NeurIPS, 11 2022. \n \n\n\n\n
\n\n\n\n \n \n \"Bandit link\n  \n \n \n \"Bandit pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{yuan2022bandit,\ntitle={Bandit Theory and Thompson Sampling-Guided Directed Evolution for Sequence Optimization},\nauthor={Hui Yuan and Chengzhuo Ni and Huazheng Wang and Xuezhou Zhang and Le Cong and Csaba Szepesv{\\'a}ri and Mengdi Wang},\nbooktitle={NeurIPS},\nacceptrate= {2665 out of 10411 = 25.6\\%},\nmonth = {11},\nyear={2022},\nurl_link={https://openreview.net/forum?id=drVX99PekKf},\nurl_pdf={https://openreview.net/pdf?id=drVX99PekKf},\nabstract = {Directed Evolution (DE), a landmark wet-lab method originated in 1960s, enables discovery of novel protein designs via evolving a population of candidate sequences. Recent advances in biotechnology has made it possible to collect high-throughput data, allowing the use of machine learning to map out a protein's sequence-to-function relation. There is a growing interest in machine learning-assisted DE for accelerating protein optimization. Yet the theoretical understanding of DE, as well as the use of machine learning in DE, remains limited. In this paper, we connect DE with the bandit learning theory and make a first attempt to study regret minimization in DE. We propose a Thompson Sampling-guided Directed Evolution (TS-DE) framework for sequence optimization, where the sequence-to-function mapping is unknown and querying a single value is subject to costly and noisy measurements. TS-DE updates a posterior of the function based on collected measurements. It uses a posterior-sampled function estimate to guide the crossover recombination and mutation steps in DE. In the case of a linear model, we show that TS-DE enjoys a Bayesian regret of order $\\tilde{O}(d^2\\sqrt{MT})$, where $d$ is feature dimension, $M$ is population size and $T$ is number of rounds. This regret bound is nearly optimal, confirming that bandit learning can provably accelerate DE. It may have implications for more general sequence optimization and evolutionary algorithms.},\nkeywords = {directed evolution, optimization}\n}\n\n\n
\n
\n\n\n
\n Directed Evolution (DE), a landmark wet-lab method that originated in the 1960s, enables discovery of novel protein designs via evolving a population of candidate sequences. Recent advances in biotechnology have made it possible to collect high-throughput data, allowing the use of machine learning to map out a protein's sequence-to-function relation. There is a growing interest in machine learning-assisted DE for accelerating protein optimization. Yet the theoretical understanding of DE, as well as the use of machine learning in DE, remains limited. In this paper, we connect DE with bandit learning theory and make a first attempt to study regret minimization in DE. We propose a Thompson Sampling-guided Directed Evolution (TS-DE) framework for sequence optimization, where the sequence-to-function mapping is unknown and querying a single value is subject to costly and noisy measurements. TS-DE updates a posterior of the function based on collected measurements. It uses a posterior-sampled function estimate to guide the crossover recombination and mutation steps in DE. In the case of a linear model, we show that TS-DE enjoys a Bayesian regret of order $\tilde{O}(d^2\sqrt{MT})$, where $d$ is the feature dimension, $M$ is the population size and $T$ is the number of rounds. This regret bound is nearly optimal, confirming that bandit learning can provably accelerate DE. It may have implications for more general sequence optimization and evolutionary algorithms.\n
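A toy sketch of the Thompson-sampling-guided evolution idea under a linear fitness model: keep a Bayesian linear-regression posterior over fitness weights, draw one sample per round, and let it guide selection and mutation. This is an illustration only, not the paper's TS-DE (in particular, crossover recombination is omitted); `fitness` is an assumed black-box returning noisy measurements.

import numpy as np

def ts_directed_evolution(fitness, d, M=32, T=20, noise=0.1, seed=0):
    """Thompson-sampling-guided evolution over binary feature vectors."""
    rng = np.random.default_rng(seed)
    A = np.eye(d)            # posterior precision of the linear fitness model
    b = np.zeros(d)          # precision-weighted mean
    pop = rng.integers(0, 2, size=(M, d)).astype(float)
    for _ in range(T):
        mean = np.linalg.solve(A, b)
        cov = np.linalg.inv(A) * noise**2
        theta = rng.multivariate_normal(mean, cov)      # posterior sample
        scores = pop @ theta
        parents = pop[np.argsort(scores)[-M // 2:]]     # keep the top half
        children = parents.copy()
        flip = rng.random(children.shape) < 1.0 / d     # point mutations
        children[flip] = 1.0 - children[flip]
        pop = np.vstack([parents, children])
        y = np.array([fitness(x) for x in pop])         # noisy measurements
        A += pop.T @ pop                                 # Bayesian update
        b += pop.T @ y
    return pop[np.argmax(pop @ np.linalg.solve(A, b))]

# e.g. ts_directed_evolution(lambda x: float(x.sum()) + 0.1 * np.random.randn(), d=8)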
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Free Lunch from the Noise: Provable and Practical Exploration for Representation Learning.\n \n \n \n \n\n\n \n Ren, T.; Zhang, T.; Szepesvári, C.; and Dai, B.\n\n\n \n\n\n\n In UAI, 08 2022. \n \n\n\n\n
\n\n\n\n \n \n \"A link\n  \n \n \n \"A pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ren2022a,\ntitle={A Free Lunch from the Noise: Provable and Practical Exploration for Representation Learning},\nauthor={Tongzheng Ren and Tianjun Zhang and Csaba Szepesv{\\'a}ri and Bo Dai},\nbooktitle={UAI},\nacceptrate = {230 out of 712 = 32\\%},\nmonth = {08},\nyear={2022},\nurl_link={https://openreview.net/forum?id=SHg8gwUsqxc},\nurl_pdf={https://openreview.net/pdf?id=SHg8gwUsqxc},\nabstract = {Representation learning lies at the heart of the empirical success of deep learning for dealing with the curse of dimensionality. However, the power of representation learning has not been fully exploited yet in reinforcement learning (RL), due to i), the trade-off between expressiveness and tractability; and ii), the coupling between exploration and representation learning. In this paper, we first reveal the fact that under some noise assumption in the stochastic control model, we can obtain the linear spectral feature of its corresponding Markov transition operator in closed-form for free. Based on this observation, we propose Spectral Dynamics Embedding (SPEDE), which breaks the tradeoff and completes optimistic exploration for representation learning by exploiting the structure of the noise. We provide rigorous theoretical analysis of SPEDE, and demonstrate the practical superior performance over the existing state-of-the-art empirical algorithms on several benchmarks.},\nkeywords = {reinforcement learning, exploration, representation learning}\n}\n\n
\n
\n\n\n
\n Representation learning lies at the heart of the empirical success of deep learning for dealing with the curse of dimensionality. However, the power of representation learning has not been fully exploited yet in reinforcement learning (RL), due to (i) the trade-off between expressiveness and tractability, and (ii) the coupling between exploration and representation learning. In this paper, we first show that, under a suitable noise assumption on the stochastic control model, we can obtain the linear spectral feature of its corresponding Markov transition operator in closed form for free. Based on this observation, we propose Spectral Dynamics Embedding (SPEDE), which breaks the trade-off and completes optimistic exploration for representation learning by exploiting the structure of the noise. We provide a rigorous theoretical analysis of SPEDE, and demonstrate its superior practical performance over existing state-of-the-art empirical algorithms on several benchmarks.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n When Is Partially Observable Reinforcement Learning Not Scary?.\n \n \n \n \n\n\n \n Liu, Q.; Chung, A.; Szepesvári, C.; and Jin, C.\n\n\n \n\n\n\n In COLT, 07 2022. \n \n\n\n\n
\n\n\n\n \n \n \"When link\n  \n \n \n \"When pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LCSzJ22,\n  author    = {Liu, Qinghua  and Chung, Alan  and Szepesv\\'ari, Csaba and Jin, Chi},\n  title     = {When Is Partially Observable Reinforcement Learning Not Scary?},\n  booktitle = {COLT},\n  acceptrate = {155 out of 484 = 32\\%},\n  month     = {07},\n  year      = {2022},\n  url_link  = {https://doi.org/10.48550/arXiv.2204.08967},\n  url_pdf = {https://arxiv.org/pdf/2204.08967.pdf},\n  abstract  = {Partial observability is ubiquitous in applications of Reinforcement Learning (RL), in which agents learn to make a sequence of decisions despite lacking complete information about the latent states of the controlled system. Partially observable RL is notoriously difficult in theory---well-known complexity-theoretic results show that learning partially observable Markov decision processes (POMDPs) requires an exponential number of samples in the worst case. Yet, this does not rule out the possible existence of interesting subclasses of POMDPs, which include a large set of partial observable applications in practice while being tractable. In this paper we identify a rich family of tractable POMDPs, which we call weakly revealing POMDPs. This family rules out the pathological instances of POMDPs with non-informative observations. We prove that for weakly revealing POMDPs, a simple algorithm combining optimism and Maximum Likelihood Estimation (MLE) is sufficient to guarantee a polynomial sample complexity. To the best of our knowledge, this is the first provably sample-efficient result for learning in overcomplete POMDPs---where the number of latent states can be larger than the number of observations---in settings where exploration is necessary.},\nkeywords = {reinforcement learning, exploration, POMDPs}  \n}\n\n
\n
\n\n\n
\n Partial observability is ubiquitous in applications of Reinforcement Learning (RL), in which agents learn to make a sequence of decisions despite lacking complete information about the latent states of the controlled system. Partially observable RL is notoriously difficult in theory—well-known complexity-theoretic results show that learning partially observable Markov decision processes (POMDPs) requires an exponential number of samples in the worst case. Yet, this does not rule out the possible existence of interesting subclasses of POMDPs, which include a large set of partially observable applications encountered in practice while being tractable. In this paper we identify a rich family of tractable POMDPs, which we call weakly revealing POMDPs. This family rules out the pathological instances of POMDPs with non-informative observations. We prove that for weakly revealing POMDPs, a simple algorithm combining optimism and Maximum Likelihood Estimation (MLE) is sufficient to guarantee a polynomial sample complexity. To the best of our knowledge, this is the first provably sample-efficient result for learning in overcomplete POMDPs—where the number of latent states can be larger than the number of observations—in settings where exploration is necessary.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The Curse of Passive Data Collection in Batch Reinforcement Learning.\n \n \n \n \n\n\n \n Xiao, C.; Lee, I.; Dai, B.; Schuurmans, D.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 8413–8438, 01 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"The link\n  \n \n \n \"The pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{XiaoLDSS22,\n  author    = {Xiao, Chenjun  and Lee, Ilbin  and Dai, Bo  and Schuurmans, Dale  and Szepesv\\'ari, Csaba},\n  title     = {The Curse of Passive Data Collection in Batch Reinforcement Learning},\n  booktitle = {AISTATS},\n  crossref  = {AISTATS2022},\n  month = {01},\n  pages     = {8413--8438},\n  publisher = {PMLR},\n  year      = {2022},\n  url_link  = {https://proceedings.mlr.press/v151/xiao22b.html},\n  url_pdf = {https://proceedings.mlr.press/v151/xiao22b/xiao22b.pdf},\n  abstract  = {In high stake applications, active experimentation may be considered too risky and thus data are often collected passively. While in simple cases, such as in bandits, passive and active data collection are similarly effective, the price of passive sampling can be much higher when collecting data from a system with controlled states. The main focus of the current paper is the characterization of this price. For example, when learning in episodic finite state-action Markov decision processes (MDPs) with $S$ states and $A$ actions, we show that even with the best (but passively chosen) logging policy, $\\Omega(A^{\\min(S−1,H)/\\varepsilon^2})$ episodes are necessary (and sufficient) to obtain an $\\varepsilon$-optimal policy, where $H$ is the length of episodes. Note that this shows that the sample complexity blows up exponentially compared to the case of active data collection, a result which is not unexpected, but, as far as we know, have not been published beforehand and perhaps the form of the exact expression is a little surprising. We also extend these results in various directions, such as other criteria or learning in the presence of function approximation, with similar conclusions. A remarkable feature of our result is the sharp characterization of the exponent that appears, which is critical for understanding what makes passive learning hard.},\n  keywords = {reinforcement learning, batch learning, sample complexity}\n}\n\n
\n
\n\n\n
\n In high-stakes applications, active experimentation may be considered too risky and thus data are often collected passively. While in simple cases, such as in bandits, passive and active data collection are similarly effective, the price of passive sampling can be much higher when collecting data from a system with controlled states. The main focus of the current paper is the characterization of this price. For example, when learning in episodic finite state-action Markov decision processes (MDPs) with $S$ states and $A$ actions, we show that even with the best (but passively chosen) logging policy, $\Omega(A^{\min(S-1,H)}/\varepsilon^2)$ episodes are necessary (and sufficient) to obtain an $\varepsilon$-optimal policy, where $H$ is the length of episodes. Note that this shows that the sample complexity blows up exponentially compared to the case of active data collection, a result which is not unexpected but which, as far as we know, has not been published before, and perhaps the form of the exact expression is a little surprising. We also extend these results in various directions, such as other criteria or learning in the presence of function approximation, with similar conclusions. A remarkable feature of our result is the sharp characterization of the exponent that appears, which is critical for understanding what makes passive learning hard.\n
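To make the scale of this blow-up concrete (numbers chosen here purely for illustration):
\[
S = 6,\quad A = 3,\quad H \ge 5,\quad \varepsilon = 0.1
\;\Longrightarrow\;
A^{\min(S-1,H)}/\varepsilon^2 = 3^{5}\cdot 100 = 24{,}300
\]
episodes for even a tiny MDP under passive data collection, whereas active (generative-model) data collection pays only polynomially in $S$, $A$ and $H$.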
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Faster Rates, Adaptive Algorithms, and Finite-Time Bounds for Linear Composition Optimization and Gradient TD Learning.\n \n \n \n \n\n\n \n Raj, A.; Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 7176–7186, 01 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Faster link\n  \n \n \n \"Faster pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{RajJ0S22,\n  author    = {Raj, Anant  and Joulani, Pooria  and Gy\\"orgy, Andr\\'as and Szepesv\\'ari, Csaba},\n  title     = {Faster Rates, Adaptive Algorithms, and Finite-Time Bounds for Linear Composition Optimization and Gradient TD Learning},\n  booktitle = {AISTATS},\n  crossref  = {AISTATS2022},\n  month = {01},\n  pages     = {7176--7186},\n  publisher = {PMLR},\n  year      = {2022},\n  url_link  = {https://proceedings.mlr.press/v151/raj22a.html},\n  url_pdf = {https://proceedings.mlr.press/v151/raj22a/raj22a.pdf},\n  abstract  = {Gradient temporal difference (GTD) algorithms are provably convergent policy evaluation methods for off-policy reinforcement learning. Despite much progress, proper tuning of the stochastic approximation methods used to solve the resulting saddle point optimization problem requires the knowledge of several (unknown) problem-dependent parameters. In this paper we apply adaptive step-size tuning strategies to greatly reduce this dependence on prior knowledge, and provide algorithms with adaptive convergence guarantees. In addition, we use the underlying refined analysis technique to obtain new $O(1/T)$ rates that do not depend on the strong-convexity parameter of the problem, and also apply to the Markov noise setting, as well as the unbounded i.i.d. noise setting.},\n  keywords = {reinforcement learning, TD learning, GTD, value function estimation, stochastic approximation}\n}\n\n
\n
\n\n\n
\n Gradient temporal difference (GTD) algorithms are provably convergent policy evaluation methods for off-policy reinforcement learning. Despite much progress, proper tuning of the stochastic approximation methods used to solve the resulting saddle point optimization problem requires the knowledge of several (unknown) problem-dependent parameters. In this paper we apply adaptive step-size tuning strategies to greatly reduce this dependence on prior knowledge, and provide algorithms with adaptive convergence guarantees. In addition, we use the underlying refined analysis technique to obtain new $O(1/T)$ rates that do not depend on the strong-convexity parameter of the problem, and also apply to the Markov noise setting, as well as the unbounded i.i.d. noise setting.\n
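For reference, one step of standard GTD2 with fixed step sizes (Sutton et al., 2009), shown only to fix notation; the paper above replaces the hand-tuned `alpha`/`beta` with adaptive step-size strategies.

import numpy as np

def gtd2_step(theta, w, phi, phi_next, reward, gamma, alpha, beta):
    """One GTD2 update on a single off-policy transition (phi, reward, phi_next)."""
    delta = reward + gamma * theta @ phi_next - theta @ phi   # TD error
    theta = theta + alpha * (phi - gamma * phi_next) * (phi @ w)
    w = w + beta * (delta - phi @ w) * phi                    # auxiliary weights
    return theta, w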
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Confident Least Square Value Iteration with Local Access to a Simulator.\n \n \n \n \n\n\n \n Hao, B.; Lazic, N.; Yin, D.; Abbasi-Yadkori , Y.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 2420–2435, 01 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Confident link\n  \n \n \n \"Confident pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HaoLYAS22,\n  author    = {Hao, Botao  and Lazic, Nevena  and Yin, Dong  and Abbasi-Yadkori , Yasin and Szepesv\\'ari, Csaba},\n  title     = {Confident Least Square Value Iteration with Local Access to a Simulator},\n  booktitle = {AISTATS},\n  crossref  = {AISTATS2022},\n  month = {01},\n  pages     = {2420--2435},\n  publisher = {PMLR},\n  year      = {2022},\n  url_link  = {https://proceedings.mlr.press/v151/hao22a.html},\n  url_pdf = {https://proceedings.mlr.press/v151/hao22a/hao22a.pdf},\n  abstract  = {Learning with simulators is ubiquitous in modern reinforcement learning (RL). The simulator can either correspond to a simplified version of the real environment (such as a physics simulation of a robot arm) or to the environment itself (such as in games like Atari and Go). Among algorithms that are provably sample-efficient in this setting, most make the unrealistic assumption that all possible environment states are known before learning begins, or perform global optimistic planning which is computationally inefficient. In this work, we focus on simulation-based RL under a more realistic local access protocol, where the state space is unknown and the simulator can only be queried at states that have previously been observed (initial states and those returned by previous queries). We propose an algorithm named CONFIDENT-LSVI based on the template of least-square value iteration. CONFIDENT-LSVI incrementally builds a coreset of important states and uses the simulator to revisit them. Assuming that the linear function class has low approximation error under the Bellman optimality operator (a.k.a. low inherent Bellman error), we bound the algorithm performance in terms of this error, and show that it is query-and computationally-efficient.},\nkeywords = {reinforcement learning, value iteration, local planning, simulators, MDPs, linear function approximation, low inherent Bellman error}  \n}\n\n
\n
\n\n\n
\n Learning with simulators is ubiquitous in modern reinforcement learning (RL). The simulator can either correspond to a simplified version of the real environment (such as a physics simulation of a robot arm) or to the environment itself (such as in games like Atari and Go). Among algorithms that are provably sample-efficient in this setting, most make the unrealistic assumption that all possible environment states are known before learning begins, or perform global optimistic planning which is computationally inefficient. In this work, we focus on simulation-based RL under a more realistic local access protocol, where the state space is unknown and the simulator can only be queried at states that have previously been observed (initial states and those returned by previous queries). We propose an algorithm named CONFIDENT-LSVI based on the template of least-square value iteration. CONFIDENT-LSVI incrementally builds a coreset of important states and uses the simulator to revisit them. Assuming that the linear function class has low approximation error under the Bellman optimality operator (a.k.a. low inherent Bellman error), we bound the algorithm's performance in terms of this error, and show that it is query- and computationally efficient.\n
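A minimal sketch of the kind of coreset test such algorithms rely on: a newly visited state-action feature is added when its leverage against the current core set exceeds a threshold. This is a generic elliptical-potential rule for illustration, not the exact criterion of CONFIDENT-LSVI; `tau` and `reg` are hypothetical parameters.

import numpy as np

def maybe_add_to_coreset(core_feats, phi, tau=1.0, reg=1.0):
    """Return True if feature vector phi is 'uncertain' given the core set."""
    d = phi.shape[0]
    if len(core_feats) == 0:
        return True
    Phi = np.stack(core_feats)                       # (n, d) stored features
    cov = reg * np.eye(d) + Phi.T @ Phi              # regularized design matrix
    leverage = phi @ np.linalg.solve(cov, phi)       # elliptical potential
    return leverage > tau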
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n TensorPlan and the Few Actions Lower Bound for Planning in MDPs under Linear Realizability of Optimal Value Functions.\n \n \n \n \n\n\n \n Weisz, G.; Szepesvári, C.; and György, A.\n\n\n \n\n\n\n In ALT, pages 1097–1137, 03 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"TensorPlan link\n  \n \n \n \"TensorPlan pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WeiszS022,\n  author    = {Weisz, Gell\\'ert  and Szepesv\\'ari, Csaba and Gy\\"orgy, Andr\\'as },\n  title     = {TensorPlan and the Few Actions Lower Bound for Planning in MDPs under Linear Realizability of Optimal Value Functions},\n  booktitle = {ALT},\n  month = {03},\n  acceptrate = {42 out of 117 = 36\\%},\n  pages     = {1097--1137},\n  publisher = {PMLR},\n  year      = {2022},\n  url_link  = {https://proceedings.mlr.press/v167/weisz22a.html},\n  url_pdf = {https://proceedings.mlr.press/v167/weisz22a/weisz22a.pdf},\n  abstract  = {We consider the minimax query complexity of online planning with a generative model in fixed-horizon Markov decision processes (MDPs) with linear function approximation. Following recent works, we consider broad classes of problems where  either \n(i) the optimal value function $v^\\star$ or \n(ii) the optimal action-value function $q^\\star$ lie in the linear span of some features; or\n (iii) both $v^\\star$ and $q^\\star$ lie in the linear span when restricted to the states reachable from the starting state. \nRecently, Weisz et al. (ALT 2021) showed that under (ii) the minimax query complexity of any planning algorithm is at least exponential in the horizon $H$ or in the feature dimension $d$ when the size $A$ of the action set can be chosen to be exponential in $\\min(d,H)$. On the other hand, for the setting (i), Weisz et al. (COLT 2021) introduced TensorPlan, a planner whose query cost is polynomial in all relevant quantities when the number of actions is fixed. Among other things, these two works left open the question whether polynomial query complexity is possible when $A$ is subexponential in $\\min(d,H)$. In this paper we answer this question in the negative: we show that an exponentially large lower bound holds when $A=\\Omega( \\min(d^{1/4},H^{1/2}))$, under either (i), (ii) or (iii). In particular, this implies a perhaps surprising exponential separation of query complexity compared to the work of Du et al. (2021) who prove a polynomial upper bound when (iii) holds for all states. Furthermore, we show that the upper bound of TensorPlan can be extended to hold under (iii) and, for MDPs with deterministic transitions and stochastic rewards, also under (ii).},\nkeywords = {reinforcement learning, simulators, planning, MDPs, linear function approximation, $q^*$-realizability, $v^*$-realizability}\n}\n\n
\n
\n\n\n
\n We consider the minimax query complexity of online planning with a generative model in fixed-horizon Markov decision processes (MDPs) with linear function approximation. Following recent works, we consider broad classes of problems where either (i) the optimal value function $v^\star$ or (ii) the optimal action-value function $q^\star$ lie in the linear span of some features; or (iii) both $v^\star$ and $q^\star$ lie in the linear span when restricted to the states reachable from the starting state. Recently, Weisz et al. (ALT 2021) showed that under (ii) the minimax query complexity of any planning algorithm is at least exponential in the horizon $H$ or in the feature dimension $d$ when the size $A$ of the action set can be chosen to be exponential in $\min(d,H)$. On the other hand, for the setting (i), Weisz et al. (COLT 2021) introduced TensorPlan, a planner whose query cost is polynomial in all relevant quantities when the number of actions is fixed. Among other things, these two works left open the question whether polynomial query complexity is possible when $A$ is subexponential in $\min(d,H)$. In this paper we answer this question in the negative: we show that an exponentially large lower bound holds when $A=\Omega(\min(d^{1/4},H^{1/2}))$, under either (i), (ii) or (iii). In particular, this implies a perhaps surprising exponential separation of query complexity compared to the work of Du et al. (2021) who prove a polynomial upper bound when (iii) holds for all states. Furthermore, we show that the upper bound of TensorPlan can be extended to hold under (iii) and, for MDPs with deterministic transitions and stochastic rewards, also under (ii).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficient local planning with linear function approximation.\n \n \n \n \n\n\n \n Yin, D.; Hao, B.; Abbasi-Yadkori, Y.; Lazic, N.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, pages 1165–1192, 03 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Efficient link\n  \n \n \n \"Efficient pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YinHALS22,\n  title     = {Efficient local planning with linear function approximation},\n  author    = {Yin, Dong  and Hao, Botao  and Abbasi-Yadkori, Yasin and Lazic, Nevena  and Szepesv\\'ari, Csaba},\n  booktitle = {ALT},\n  month = {03},\n  acceptrate = {42 out of 117 = 36\\%},\n  pages     = {1165--1192},\n  publisher = {PMLR},\n  year      = {2022},\n  url_link  = {https://proceedings.mlr.press/v167/yin22a.html},\n  url_pdf = {https://proceedings.mlr.press/v167/yin22a/yin22a.pdf},\n  abstract  = {We study query and computationally efficient planning algorithms for discounted Markov decision processes (MDPs) with linear function approximation and a simulator. The agent is assumed to have local access to the simulator, meaning that the simulator can be queried only at states that have been encountered in previous steps. We propose two new algorithms for this setting, which we call confident Monte Carlo least-squares policy iteration (Confident MC-LSPI), and confident Monte Carlo Politex (Confident MC-Politex), respectively. The main novelty in our algorithms is that they gradually build a set of state-action pairs (“core set”) with which it can control the extrapolation errors. We show that our algorithms have polynomial query and computational cost in the dimension of the features, the effective planning horizon and the targeted sub-optimality, while the cost remains independent of the size of the state space. An interesting technical contribution of our work is the introduction of a novel proof technique that makes use of a virtual policy iteration algorithm. We use this method to leverage existing results on approximate policy iteration with $\\ell^\\infty$-bounded error to show that our algorithm can learn the optimal policy for the given initial state even only with local access to the simulator. We believe that this technique can be extended to broader settings beyond this work.},\nkeywords = {reinforcement learning, simulators, planning, MDPs, linear function approximation, local access, policy iteration}\n}\n\n
\n
\n\n\n
\n We study query- and computationally efficient planning algorithms for discounted Markov decision processes (MDPs) with linear function approximation and a simulator. The agent is assumed to have local access to the simulator, meaning that the simulator can be queried only at states that have been encountered in previous steps. We propose two new algorithms for this setting, which we call confident Monte Carlo least-squares policy iteration (Confident MC-LSPI), and confident Monte Carlo Politex (Confident MC-Politex), respectively. The main novelty in our algorithms is that they gradually build a set of state-action pairs (“core set”) with which they can control the extrapolation errors. We show that our algorithms have polynomial query and computational cost in the dimension of the features, the effective planning horizon and the targeted sub-optimality, while the cost remains independent of the size of the state space. An interesting technical contribution of our work is the introduction of a novel proof technique that makes use of a virtual policy iteration algorithm. We use this method to leverage existing results on approximate policy iteration with $\ell^\infty$-bounded error to show that our algorithm can learn the optimal policy for the given initial state even with only local access to the simulator. We believe that this technique can be extended to broader settings beyond this work.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2021\n \n \n (21)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Tighter Risk Certificates for Neural Networks.\n \n \n \n \n\n\n \n Ortiz, M.; Rivasplata, O.; Shawe-Taylor, J.; and Szepesvári, C.\n\n\n \n\n\n\n Journal of Machine Learning Research, 22: 227:1–227:40. 2021.\n \n\n\n\n
\n\n\n\n \n \n \"Tighter link\n  \n \n \n \"Tighter pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{Perez-OrtizRSS21,\n  author    = {Ortiz, Maria-P\\'erez  and Rivasplata, Omar  and Shawe-Taylor, John  and Szepesv\\'ari, Csaba},\n  title     = {Tighter Risk Certificates for Neural Networks},\n  journal   = {Journal of Machine Learning Research},\n  volume    = {22},\n  pages     = {227:1--227:40},\n  year      = {2021},\n  url_link  = {http://jmlr.org/papers/v22/20-879.html},\n  url_pdf = {https://jmlr.org/papers/volume22/20-879/20-879.pdf},\n  abstract  = {This paper presents an empirical study regarding training probabilistic neural networks using training objectives derived from PAC-Bayes bounds. In the context of probabilistic neural networks, the output of training is a probability distribution over network weights. We present two training objectives, used here for the first time in connection with training neural networks. These two training objectives are derived from tight PAC-Bayes bounds. We also re-implement a previously used training objective based on a classical PAC-Bayes bound, to compare the properties of the predictors learned using the different training objectives. We compute risk certificates for the learnt predictors, based on part of the data used to learn the predictors. We further experiment with different types of priors on the weights (both data-free and data-dependent priors) and neural network architectures. Our experiments on MNIST and CIFAR-10 show that our training methods produce competitive test set errors and non-vacuous risk bounds with much tighter values than previous results in the literature, showing promise not only to guide the learning algorithm through bounding the risk but also for model selection. These observations suggest that the methods studied here might be good candidates for self-certified learning, in the sense of using the whole data set for learning a predictor and certifying its risk on any unseen data (from the same distribution as the training data) potentially without the need for holding out test data.}\n}\n\n
\n
\n\n\n
\n This paper presents an empirical study regarding training probabilistic neural networks using training objectives derived from PAC-Bayes bounds. In the context of probabilistic neural networks, the output of training is a probability distribution over network weights. We present two training objectives, used here for the first time in connection with training neural networks. These two training objectives are derived from tight PAC-Bayes bounds. We also re-implement a previously used training objective based on a classical PAC-Bayes bound, to compare the properties of the predictors learned using the different training objectives. We compute risk certificates for the learnt predictors, based on part of the data used to learn the predictors. We further experiment with different types of priors on the weights (both data-free and data-dependent priors) and neural network architectures. Our experiments on MNIST and CIFAR-10 show that our training methods produce competitive test set errors and non-vacuous risk bounds with much tighter values than previous results in the literature, showing promise not only to guide the learning algorithm through bounding the risk but also for model selection. These observations suggest that the methods studied here might be good candidates for self-certified learning, in the sense of using the whole data set for learning a predictor and certifying its risk on any unseen data (from the same distribution as the training data) potentially without the need for holding out test data.\n
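For orientation, the flavor of computation behind a risk certificate: the classical McAllester-style PAC-Bayes bound, which is looser than the kl-inversion certificates the paper actually optimizes and reports, evaluated here on made-up numbers.

import math

def mcallester_bound(emp_risk, kl, n, delta=0.05):
    """Classical McAllester-style PAC-Bayes risk certificate.

    With probability at least 1 - delta over an i.i.d. sample of size n,
    the true risk of the randomized predictor is at most the returned value,
    where kl is KL(posterior || prior). Shown only for orientation; it is
    not the tighter training objective used in the paper.
    """
    return emp_risk + math.sqrt((kl + math.log(2 * math.sqrt(n) / delta)) / (2 * n))

# e.g. mcallester_bound(emp_risk=0.02, kl=2500.0, n=60000) is roughly 0.165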
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On the Convergence and Sample Efficiency of Variance-Reduced Policy Gradient Method.\n \n \n \n \n\n\n \n Zhang, J.; Ni, C.; Yu, Z.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In NeurIPS, pages 2228–2240, 2021. \n \n\n\n\n
\n\n\n\n \n \n \"On link\n  \n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ZNYSzW21,\n  author    = {Junyu Zhang and Chengzhuo Ni and Zheng Yu and Csaba Szepesv\\'ari and Mengdi Wang},\n  title     = {On the Convergence and Sample Efficiency of Variance-Reduced Policy Gradient Method},\n  pages     = {2228--2240},\n  year      = {2021},\n  booktitle = {NeurIPS},\n  crossref  = {NeurIPS2021spotlight},\n  url_link  = {https://proceedings.neurips.cc/paper/2021/hash/11c484ea9305ea4c7bb6b2e6d570d466-Abstract.html},\n  url_paper = {NeurIPS2021_TSIVR-PG.pdf},\n  abstract = {Policy gradient (PG) gives rise to a rich class of reinforcement learning (RL) methods. Recently, there has been an emerging trend to accelerate the existing PG methods such as REINFORCE by the <em>variance reduction</em> techniques.  However, all existing variance-reduced PG methods heavily rely on an uncheckable importance weight assumption made for every single iteration of the algorithms. In this paper, a simple gradient truncation mechanism is proposed to address this issue. Moreover, we design a Truncated Stochastic Incremental Variance-Reduced Policy Gradient (TSIVR-PG) method, which is able to maximize not only a cumulative sum of rewards but also a general utility function over a policy's long-term visiting distribution.  We show an $\\tilde{O}(\\epsilon^{-3})$ sample complexity for TSIVR-PG to find an $\\epsilon$-stationary policy. By assuming the overparameterizaiton of policy and exploiting the hidden convexity of the problem, we further show that TSIVR-PG converges to global $\\epsilon$-optimal policy with $\\tilde{O}(\\epsilon^{-2})$ samples.}\n}\n\n\n
\n
\n\n\n
\n Policy gradient (PG) gives rise to a rich class of reinforcement learning (RL) methods. Recently, there has been an emerging trend to accelerate existing PG methods such as REINFORCE via variance reduction techniques. However, all existing variance-reduced PG methods heavily rely on an uncheckable importance weight assumption made for every single iteration of the algorithms. In this paper, a simple gradient truncation mechanism is proposed to address this issue. Moreover, we design a Truncated Stochastic Incremental Variance-Reduced Policy Gradient (TSIVR-PG) method, which is able to maximize not only a cumulative sum of rewards but also a general utility function over a policy's long-term visiting distribution. We show an $\tilde{O}(\epsilon^{-3})$ sample complexity for TSIVR-PG to find an $\epsilon$-stationary policy. By assuming overparameterization of the policy and exploiting the hidden convexity of the problem, we further show that TSIVR-PG converges to a globally $\epsilon$-optimal policy with $\tilde{O}(\epsilon^{-2})$ samples.\n
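Illustration only, and possibly different from the paper's exact rule: the truncation ingredient alluded to above can be thought of as clipping per-step importance ratios at a constant, e.g.

import numpy as np

def truncated_is_weights(pi_new, pi_old, actions, clip=10.0):
    """Per-step importance weights with hard truncation.

    pi_new / pi_old are (T, A) arrays of action probabilities under the new
    and behavior policies; actions is the (T,) array of taken actions. This
    sketches only a generic clipping step, not the full TSIVR-PG estimator.
    """
    idx = np.arange(len(actions))
    ratios = pi_new[idx, actions] / pi_old[idx, actions]
    return np.minimum(ratios, clip)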
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Understanding the Effect of Stochasticity in Policy Optimization.\n \n \n \n \n\n\n \n Mei, J.; Dai, B.; Xiao, C.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In NeurIPS, pages 19339–19351, 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Understanding link\n  \n \n \n \"Understanding paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{MeiDXSzS21,\n  author    = {Jincheng Mei and Bo Dai and Chenjun Xiao and Csaba Szepesv\\'ari and Dale Schuurmans},\n  title     = {Understanding the Effect of Stochasticity in Policy Optimization},\n  booktitle = {NeurIPS},\n  crossref  = {NeurIPS2021poster},\n  pages     = {19339--19351},\n  year      = {2021},\n  url_link       = {https://proceedings.neurips.cc/paper/2021/hash/a12f69495f41bb3b637ba1b6238884d6-Abstract.html},\n  url_paper = {NeurIPS2021_commitalrate.pdf},\n  abstract  = {We study the effect of stochasticity in on-policy policy optimization, and make the following four contributions. \n  First, we show that the preferability of optimization methods depends critically on whether stochastic versus exact gradients are used. In particular, unlike the true gradient setting, geometric information <em>cannot</em> be easily exploited in the stochastic case for accelerating policy optimization without detrimental consequences or impractical assumptions.\n  Second,  to explain these findings we introduce the concept of <em>committal rate</em> for stochastic policy optimization, and show that this can serve as a criterion for determining almost sure convergence to global optimality.\n  Third, we show that in the absence of external oracle information, which allows an algorithm to determine the difference between optimal and sub-optimal actions given only on-policy samples, there is an inherent trade-off between exploiting geometry to accelerate convergence versus achieving optimality almost surely.  That is, an uninformed algorithm either converges to a globally optimal policy with probability $1$ but at a rate no better than $O(1/t)$, or it achieves faster than $O(1/t)$ convergence but then must fail to converge to the globally optimal policy with some positive probability.\n  Finally, we use the committal rate theory to explain why practical policy optimization methods are sensitive to random initialization, then develop an ensemble method that can be guaranteed to achieve near-optimal solutions with high probability.}\n}\n\n
\n
\n\n\n
\n We study the effect of stochasticity in on-policy policy optimization, and make the following four contributions. First, we show that the preferability of optimization methods depends critically on whether stochastic versus exact gradients are used. In particular, unlike the true gradient setting, geometric information cannot be easily exploited in the stochastic case for accelerating policy optimization without detrimental consequences or impractical assumptions. Second, to explain these findings we introduce the concept of committal rate for stochastic policy optimization, and show that this can serve as a criterion for determining almost sure convergence to global optimality. Third, we show that in the absence of external oracle information, which allows an algorithm to determine the difference between optimal and sub-optimal actions given only on-policy samples, there is an inherent trade-off between exploiting geometry to accelerate convergence versus achieving optimality almost surely. That is, an uninformed algorithm either converges to a globally optimal policy with probability $1$ but at a rate no better than $O(1/t)$, or it achieves faster than $O(1/t)$ convergence but then must fail to converge to the globally optimal policy with some positive probability. Finally, we use the committal rate theory to explain why practical policy optimization methods are sensitive to random initialization, then develop an ensemble method that can be guaranteed to achieve near-optimal solutions with high probability.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n No Regrets for Learning the Prior in Bandits.\n \n \n \n \n\n\n \n Basu, S.; Kveton, B.; Zaheer, M.; and Szepesvári, C.\n\n\n \n\n\n\n In NeurIPS, pages 28029–28041, 2021. \n \n\n\n\n
\n\n\n\n \n \n \"No link\n  \n \n \n \"No paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 8 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{BasuKZS21,\n  author    = {Soumya Basu and Branislav Kveton and Manzil Zaheer and Csaba Szepesv\\'ari},\n  title     = {No Regrets for Learning the Prior in Bandits},\n  booktitle = {NeurIPS},\n  crossref  = {NeurIPS2021poster},\n  pages     = {28029--28041},\n  year      = {2021},\n  url_link  = {https://proceedings.neurips.cc/paper/2021/hash/ec1f764517b7ffb52057af6df18142b7-Abstract.html},\n  url_paper = {NeurIPS2021_AdaTS.pdf},\n  abstract  = {We propose AdaTS, a Thompson sampling algorithm that adapts sequentially to bandit tasks that it interacts with. The key idea in AdaTS is to adapt to an unknown task prior distribution by maintaining a distribution over its parameters. When solving a bandit task, that uncertainty is marginalized out and properly accounted for. AdaTS is a fully-Bayesian algorithm that can be implemented efficiently in several classes of bandit problems. We derive upper bounds on its Bayes regret that quantify the loss due to not knowing the task prior, and show that it is small. Our theory is supported by experiments, where AdaTS outperforms prior algorithms and works well even in challenging real-world problems.}\n}\n\n
\n
\n\n\n
\n We propose AdaTS, a Thompson sampling algorithm that adapts sequentially to bandit tasks that it interacts with. The key idea in AdaTS is to adapt to an unknown task prior distribution by maintaining a distribution over its parameters. When solving a bandit task, that uncertainty is marginalized out and properly accounted for. AdaTS is a fully-Bayesian algorithm that can be implemented efficiently in several classes of bandit problems. We derive upper bounds on its Bayes regret that quantify the loss due to not knowing the task prior, and show that it is small. Our theory is supported by experiments, where AdaTS outperforms prior algorithms and works well even in challenging real-world problems.\n
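\n One way to picture the "maintain a distribution over prior parameters and marginalize it out" idea is the following hierarchical Thompson-sampling step for Gaussian bandits; everything here (names, the hyper-belief over the unknown prior mean, the toy statistics) is an illustrative assumption rather than the AdaTS algorithm itself, and the cross-task hyper-posterior update is omitted.\n
<pre>
import numpy as np

def hierarchical_ts_step(arm_sum, arm_cnt, m_hyper, q_hyper, sigma0=1.0, sigma=1.0, rng=None):
    """One step of hierarchical Thompson sampling: sample an unknown prior mean mu0
    from the current hyper-belief N(m_hyper, q_hyper^2), form each arm's Gaussian
    posterior under the prior N(mu0, sigma0^2), sample arm means and act greedily."""
    rng = rng or np.random.default_rng()
    mu0 = rng.normal(m_hyper, q_hyper)                      # sample the task-prior parameter
    post_var = 1.0 / (1.0 / sigma0**2 + arm_cnt / sigma**2)  # per-arm posterior variance
    post_mean = post_var * (mu0 / sigma0**2 + arm_sum / sigma**2)
    samples = rng.normal(post_mean, np.sqrt(post_var))
    return int(np.argmax(samples))

# toy usage: 5 arms with a few fake observations per arm
rng = np.random.default_rng(1)
cnt = np.array([3, 1, 0, 2, 5])
sums = 0.5 * cnt + np.sqrt(cnt) * rng.normal(size=5)
print(hierarchical_ts_step(sums, cnt, m_hyper=0.0, q_hyper=1.0, rng=rng))
</pre>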
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On the Role of Optimization in Double Descent: A Least Squares Study.\n \n \n \n \n\n\n \n Kuzborskij, I.; Szepesvári, C.; Rivasplata, O.; Rannen-Triki, A.; and Pascanu, R.\n\n\n \n\n\n\n In NeurIPS, pages 29567–29577, 2021. \n \n\n\n\n
\n\n\n\n \n \n \"On link\n  \n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{KuzborskijSRRP21,\n  author    = {Ilja Kuzborskij and Csaba Szepesv\\'ari and Omar Rivasplata and Amal Rannen-Triki and Razvan Pascanu},\n  title     = {On the Role of Optimization in Double Descent: A Least Squares Study},\n  booktitle = {NeurIPS},\n  crossref  = {NeurIPS2021poster},\n  pages     = {29567--29577},\n  year      = {2021},\n  url_link       = {https://proceedings.neurips.cc/paper/2021/hash/f754186469a933256d7d64095e963594-Abstract.html},\n  url_paper = {NeurIPS2021_DoubleDescent.pdf},\n  abstract = {Empirically it has been observed that the performance of deep neural networks steadily improves with increased model size, contradicting the classical view on overfitting and generalization. Recently, the double descent phenomenon has been proposed to reconcile this observation with theory, suggesting that the test error has a second descent when the model becomes sufficiently overparameterized, as the model size itself acts as an implicit regularizer. In this paper we add to the growing body of work in this space, providing a careful study of learning dynamics as a function of model size for the least squares scenario. We show an excess risk bound for the gradient descent solution of the least squares objective. The bound depends on the smallest non-zero eigenvalue of the sample covariance matrix of the input features, via a functional form that has the double descent behaviour. This gives a new perspective on the double descent curves reported in the literature, as our analysis of the excess risk allows to decouple the effect of optimization and generalization error. In particular, we find that in the case of noiseless regression, double descent is explained solely by optimization-related quantities, which was missed in studies focusing on the Moore-Penrose pseudoinverse solution. We believe that our derivation provides an alternative view compared to existing works, shedding some light on a possible cause of this phenomenon, at least in the considered least squares setting. We empirically explore if our predictions hold for neural networks, in particular whether the spectrum of the sample covariance of features at intermediary hidden layers has a similar behaviour as the one predicted by our derivations in the least squares setting.}\n}\n\n
\n
\n\n\n
\n Empirically it has been observed that the performance of deep neural networks steadily improves with increased model size, contradicting the classical view on overfitting and generalization. Recently, the double descent phenomenon has been proposed to reconcile this observation with theory, suggesting that the test error has a second descent when the model becomes sufficiently overparameterized, as the model size itself acts as an implicit regularizer. In this paper we add to the growing body of work in this space, providing a careful study of learning dynamics as a function of model size for the least squares scenario. We show an excess risk bound for the gradient descent solution of the least squares objective. The bound depends on the smallest non-zero eigenvalue of the sample covariance matrix of the input features, via a functional form that has the double descent behaviour. This gives a new perspective on the double descent curves reported in the literature, as our analysis of the excess risk allows to decouple the effect of optimization and generalization error. In particular, we find that in the case of noiseless regression, double descent is explained solely by optimization-related quantities, which was missed in studies focusing on the Moore-Penrose pseudoinverse solution. We believe that our derivation provides an alternative view compared to existing works, shedding some light on a possible cause of this phenomenon, at least in the considered least squares setting. We empirically explore if our predictions hold for neural networks, in particular whether the spectrum of the sample covariance of features at intermediary hidden layers has a similar behaviour as the one predicted by our derivations in the least squares setting.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Nearly Minimax Optimal Reinforcement Learning for Linear Mixture Markov Decision Processes.\n \n \n \n \n\n\n \n Zhou, D.; Gu, Q.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 4532–4576, 08 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Nearly paper\n  \n \n \n \"Nearly link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ZhGuSz21,\n  title = {Nearly Minimax Optimal Reinforcement Learning for Linear Mixture Markov Decision Processes},\n  author = {Zhou, Dongruo and Gu, Quanquan and Szepesv{\\'a}ri, Csaba},\n  pages = {4532--4576},\n  url_paper = {COLT2021-BernsteinBonus.pdf},\n  url_link = {https://proceedings.mlr.press/v134/zhou21a.html},\n  abstract = {We study reinforcement learning (RL) with linear function approximation where the underlying transition probability kernel of the Markov decision process (MDP) is a linear mixture model (Jia et al., 2020; Ayoub et al., 2020; Zhou et al., 2020) and the learning agent has access to either an integration or a sampling oracle of the individual basis kernels. For the fixed-horizon episodic setting with inhomogeneous transition kernels, we propose a new, computationally efficient algorithm that uses the basis kernels to approximate value functions. We show that the new algorithm, which we call ${\\text{UCRL-VTR}^{+}}$, attains an $\\tilde O(dH\\sqrt{T})$ regret where $d$ is the number of basis kernels, $H$ is the length of the episode and $T$ is the number of interactions with the MDP. We also prove a matching lower bound $\\Omega(dH\\sqrt{T})$ for this setting, which shows that ${\\text{UCRL-VTR}^{+}}$ is minimax optimal up to logarithmic factors. At the core of our results are (1) a weighted least squares estimator for the unknown transitional probability; and (2) a new Bernstein-type concentration inequality for self-normalized vector-valued martingales with bounded increments. Together, these new tools enable tight control of the Bellman error and lead to a nearly minimax regret. To the best of our knowledge, this is the first computationally efficient, nearly minimax optimal algorithm for RL with linear function approximation.},\n  crossref  = {COLT2021},\n  booktitle = {COLT},\n  month = {08},\n  year = {2021},\n}\n\n\n
\n
\n\n\n
\n We study reinforcement learning (RL) with linear function approximation where the underlying transition probability kernel of the Markov decision process (MDP) is a linear mixture model (Jia et al., 2020; Ayoub et al., 2020; Zhou et al., 2020) and the learning agent has access to either an integration or a sampling oracle of the individual basis kernels. For the fixed-horizon episodic setting with inhomogeneous transition kernels, we propose a new, computationally efficient algorithm that uses the basis kernels to approximate value functions. We show that the new algorithm, which we call ${\text{UCRL-VTR}^{+}}$, attains an $Õ(dH\sqrt{T})$ regret where $d$ is the number of basis kernels, $H$ is the length of the episode and $T$ is the number of interactions with the MDP. We also prove a matching lower bound $Ω(dH\sqrt{T})$ for this setting, which shows that ${\text{UCRL-VTR}^{+}}$ is minimax optimal up to logarithmic factors. At the core of our results are (1) a weighted least squares estimator for the unknown transitional probability; and (2) a new Bernstein-type concentration inequality for self-normalized vector-valued martingales with bounded increments. Together, these new tools enable tight control of the Bellman error and lead to a nearly minimax regret. To the best of our knowledge, this is the first computationally efficient, nearly minimax optimal algorithm for RL with linear function approximation.\n
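\n Ingredient (1), a weighted least-squares estimator, has the familiar weighted ridge-regression form; the generic numpy sketch below (notation, weights and constants are assumptions, not the paper's estimator for transition kernels) shows the computation.\n
<pre>
import numpy as np

def weighted_ridge(X, y, w, lam=1.0):
    """Weighted regularized least squares:
    argmin_theta sum_t w_t (y_t - <x_t, theta>)^2 + lam * ||theta||^2."""
    d = X.shape[1]
    A = X.T @ (w[:, None] * X) + lam * np.eye(d)
    b = X.T @ (w * y)
    return np.linalg.solve(A, b)

# toy usage: down-weight high-variance samples
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 4))
theta_true = np.array([1.0, -2.0, 0.5, 0.0])
noise_sd = rng.uniform(0.1, 2.0, size=100)
y = X @ theta_true + noise_sd * rng.normal(size=100)
print(weighted_ridge(X, y, w=1.0 / noise_sd**2))
</pre>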
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Query-efficient Planning in MDPs under Linear Realizability of the Optimal State-value Function.\n \n \n \n \n\n\n \n Weisz, G.; Amortila, P.; Janzer, B.; Abbasi-Yadkori, Y.; Jiang, N.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 4355–4385, 08 2021. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n \n \"On link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 16 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{Weiszvstar21,\n  title = \t {On Query-efficient Planning in MDPs under Linear Realizability of the Optimal State-value Function},\n  author =       {Weisz, Gell{\\'e}rt and Amortila, Philip and Janzer, Barnab\\'as and Abbasi-Yadkori, Yasin and Jiang, Nan and Szepesv{\\'a}ri, Csaba},\n  pages = \t {4355--4385},\n  url_paper = \t {COLT2021-weisz21a.pdf},\n  url_link = \t {http://proceedings.mlr.press/v134/weisz21a.html},\n  abstract = \t {We consider the problem of local planning in fixed-horizon Markov Decision Processes (MDPs) with a generative model under the assumption that the optimal value function lies close to the span of a feature map. The generative model provides a restricted, “local” access to the MDP: The planner can ask for random transitions from previously returned states and arbitrary actions, and the features are also only accessible for the states that are encountered in this process. As opposed to previous work (e.g. Lattimore et al. (2020)) where linear realizability of all policies was assumed, we consider the significantly relaxed assumption of a single linearly realizable (deterministic) policy. A recent lower bound by Weisz et al. (2020) established that the related problem when the action-value function of the optimal policy is linearly realizable requires an exponential number of queries, either in $H$ (the horizon of the MDP) or $d$ (the dimension of the feature mapping). Their construction crucially relies on having an exponentially large action set. In contrast, in this work, we establish that $\\poly(H,d)$ planning is possible with state value function realizability whenever the action set has a constant size. In particular, we present the TensorPlan algorithm which uses $\\poly((dH/\\delta)^A)$ simulator queries to find a $\\delta$-optimal policy relative to any deterministic policy for which the value function is linearly realizable with some bounded parameter (with a known bound). This is the first algorithm to give a polynomial query complexity guarantee using only linear-realizability of a single competing value function. Whether the computation cost is similarly bounded remains an interesting open question. We also extend the upper bound to the near-realizable case and to the infinite-horizon discounted MDP setup. The upper bounds are complemented by a lower bound which states that in the infinite-horizon episodic setting, planners that achieve constant suboptimality need exponentially many queries, either in the dimension or the number of actions.},\n  crossref  = {COLT2021},\n  booktitle = {COLT},\n  month = {08},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n We consider the problem of local planning in fixed-horizon Markov Decision Processes (MDPs) with a generative model under the assumption that the optimal value function lies close to the span of a feature map. The generative model provides a restricted, “local” access to the MDP: The planner can ask for random transitions from previously returned states and arbitrary actions, and the features are also only accessible for the states that are encountered in this process. As opposed to previous work (e.g. Lattimore et al. (2020)) where linear realizability of all policies was assumed, we consider the significantly relaxed assumption of a single linearly realizable (deterministic) policy. A recent lower bound by Weisz et al. (2020) established that the related problem when the action-value function of the optimal policy is linearly realizable requires an exponential number of queries, either in $H$ (the horizon of the MDP) or $d$ (the dimension of the feature mapping). Their construction crucially relies on having an exponentially large action set. In contrast, in this work, we establish that $\text{poly}(H,d)$ planning is possible with state value function realizability whenever the action set has a constant size. In particular, we present the TensorPlan algorithm which uses $\text{poly}((dH/δ)^A)$ simulator queries to find a $δ$-optimal policy relative to any deterministic policy for which the value function is linearly realizable with some bounded parameter (with a known bound). This is the first algorithm to give a polynomial query complexity guarantee using only linear-realizability of a single competing value function. Whether the computation cost is similarly bounded remains an interesting open question. We also extend the upper bound to the near-realizable case and to the infinite-horizon discounted MDP setup. The upper bounds are complemented by a lower bound which states that in the infinite-horizon episodic setting, planners that achieve constant suboptimality need exponentially many queries, either in the dimension or the number of actions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Nonparametric Regression with Shallow Overparameterized Neural Networks Trained by GD with Early Stopping.\n \n \n \n \n\n\n \n Kuzborskij, I.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 2853–2890, 08 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Nonparametric paper\n  \n \n \n \"Nonparametric link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 14 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{KuSze21,\n  title = \t {Nonparametric Regression with Shallow Overparameterized Neural Networks Trained by GD with Early Stopping},\n  author =       {Kuzborskij, Ilja and Szepesv{\\'a}ri, Csaba},\n  pages = \t {2853--2890},\n  url_paper = \t {COLT2021-kuzborskij21a.pdf},\n  url_link = \t {http://proceedings.mlr.press/v134/kuzborskij21a.html},\n  abstract = \t {We explore the ability of overparameterized shallow neural networks to learn Lipschitz regression functions with and without label noise when trained by Gradient Descent (GD). To avoid the problem that in the presence of noisy labels, neural networks trained to nearly zero training error are inconsistent on this class, we propose an early stopping rule that allows us to show optimal rates. This provides an alternative to the result of Hu et al. (2021) who studied the performance of $\\ell_2$-regularized GD for training shallow networks in nonparametric regression which fully relied on the infinite-width network (Neural Tangent Kernel (NTK)) approximation. Here we present a simpler analysis which is based on a partitioning argument of the input space (as in the case of 1-nearest-neighbor rule) coupled with the fact that trained neural networks are smooth with respect to their inputs when trained by GD. In the noise-free case the proof does not rely on any kernelization and can be regarded as a finite-width result. In the case of label noise, by slightly modifying the proof, the noise is controlled using a technique of Yao, Rosasco, and Caponnetto (2007).},\n  crossref  = {COLT2021},\n  booktitle = {COLT},\n  month = {08},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n We explore the ability of overparameterized shallow neural networks to learn Lipschitz regression functions with and without label noise when trained by Gradient Descent (GD). To avoid the problem that in the presence of noisy labels, neural networks trained to nearly zero training error are inconsistent on this class, we propose an early stopping rule that allows us to show optimal rates. This provides an alternative to the result of Hu et al. (2021) who studied the performance of $\\ell_2$-regularized GD for training shallow networks in nonparametric regression which fully relied on the infinite-width network (Neural Tangent Kernel (NTK)) approximation. Here we present a simpler analysis which is based on a partitioning argument of the input space (as in the case of 1-nearest-neighbor rule) coupled with the fact that trained neural networks are smooth with respect to their inputs when trained by GD. In the noise-free case the proof does not rely on any kernelization and can be regarded as a finite-width result. In the case of label noise, by slightly modifying the proof, the noise is controlled using a technique of Yao, Rosasco, and Caponnetto (2007).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Asymptotically Optimal Information-Directed Sampling.\n \n \n \n \n\n\n \n Kirschner, J.; Lattimore, T.; Vernade, C.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 2777–2821, 08 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Asymptotically paper\n  \n \n \n \"Asymptotically link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{kirschner21a,\n  title =  {Asymptotically Optimal Information-Directed Sampling},\n  author = {Kirschner, Johannes and Lattimore, Tor and Vernade, Claire and Szepesv{\\'a}ri, Csaba},\n  pages =  {2777--2821},\n  url_paper =    {COLT2021-kirschner21a.pdf},\n  url_link =    {http://proceedings.mlr.press/v134/kirschner21a.html},\n  abstract = {We introduce a simple and efficient algorithm for stochastic linear bandits with finitely many actions that is asymptotically optimal and (nearly) worst-case optimal in finite time. The approach is based on the frequentist information-directed sampling (IDS) framework, with a surrogate for the information gain that is informed by the optimization problem that defines the asymptotic lower bound. Our analysis sheds light on how IDS balances the trade-off between regret and information and uncovers a surprising connection between the recently proposed primal-dual methods and the IDS algorithm. We demonstrate empirically that IDS is competitive with UCB in finite-time, and can be significantly better in the asymptotic regime.},\n  crossref  = {COLT2021},\n  booktitle = {COLT},\n  month = {08},\n  year = {2021},\n}\n\n\n
\n
\n\n\n
\n We introduce a simple and efficient algorithm for stochastic linear bandits with finitely many actions that is asymptotically optimal and (nearly) worst-case optimal in finite time. The approach is based on the frequentist information-directed sampling (IDS) framework, with a surrogate for the information gain that is informed by the optimization problem that defines the asymptotic lower bound. Our analysis sheds light on how IDS balances the trade-off between regret and information and uncovers a surprising connection between the recently proposed primal-dual methods and the IDS algorithm. We demonstrate empirically that IDS is competitive with UCB in finite-time, and can be significantly better in the asymptotic regime.\n
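\n The information-ratio computation behind IDS can be sketched as follows: given per-action (estimated) regret gaps and information gains, choose the action distribution minimizing (expected gap)^2 / (expected information gain); since the minimizer is supported on at most two actions, a pairwise grid search suffices for a toy version. The gap and information-gain numbers below are assumptions, and this is not the paper's surrogate.\n
<pre>
import numpy as np

def ids_distribution(gaps, info, grid=101):
    """Pick a two-point action distribution minimizing the information ratio
    (expected gap)^2 / (expected information gain)."""
    k = len(gaps)
    best = (np.inf, None)
    ps = np.linspace(0.0, 1.0, grid)
    for a in range(k):
        for b in range(k):
            eg = ps * gaps[a] + (1 - ps) * gaps[b]
            ei = ps * info[a] + (1 - ps) * info[b]
            ratio = np.where(ei > 0, eg**2 / np.maximum(ei, 1e-12), np.inf)
            i = int(np.argmin(ratio))
            if ratio[i] < best[0]:
                dist = np.zeros(k)
                dist[a] += ps[i]
                dist[b] += 1 - ps[i]
                best = (ratio[i], dist)
    return best[1]

# toy usage: 3 actions with estimated gaps and information gains
print(ids_distribution(np.array([0.1, 0.3, 0.8]), np.array([0.05, 0.2, 0.5])))
</pre>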
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On the Optimality of Batch Policy Optimization Algorithms.\n \n \n \n \n\n\n \n Xiao, C.; Wu, Y.; Mei, J.; Dai, B.; Lattimore, T.; Li, L.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In ICML, pages 11362–11371, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n \n \"On link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 8 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{XiaoWMDL0SS21,\n  author    = {Chenjun Xiao and Yifan Wu and Jincheng Mei and Bo Dai and Tor Lattimore and Lihong Li and Csaba Szepesv\\'ari and Dale Schuurmans},\n  title     = {On the Optimality of Batch Policy Optimization Algorithms},\n  pages     = {11362--11371},\n  crossref  = {ICML2021},\n  url_paper = {ICML2021-BatchPO.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/xiao21b.html},\n  abstract  = {Batch policy optimization considers leveraging existing data for policy construction before interacting with an environment. Although interest in this problem has grown significantly in recent years, its theoretical foundations remain under-developed. To advance the understanding of this problem, we provide three results that characterize the limits and possibilities of batch policy optimization in the finite-armed stochastic bandit setting. First, we introduce a class of confidence-adjusted index algorithms that unifies optimistic and pessimistic principles in a common framework, which enables a general analysis. For this family, we show that any confidence-adjusted index algorithm is minimax optimal, whether it be optimistic, pessimistic or neutral. Our analysis reveals that instance-dependent optimality, commonly used to establish optimality of on-line stochastic bandit algorithms, cannot be achieved by any algorithm in the batch setting. In particular, for any algorithm that performs optimally in some environment, there exists another environment where the same algorithm suffers arbitrarily larger regret. Therefore, to establish a framework for distinguishing algorithms, we introduce a new weighted-minimax criterion that considers the inherent difficulty of optimal value prediction. We demonstrate how this criterion can be used to justify commonly used pessimistic principles for batch policy optimization.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n Batch policy optimization considers leveraging existing data for policy construction before interacting with an environment. Although interest in this problem has grown significantly in recent years, its theoretical foundations remain under-developed. To advance the understanding of this problem, we provide three results that characterize the limits and possibilities of batch policy optimization in the finite-armed stochastic bandit setting. First, we introduce a class of confidence-adjusted index algorithms that unifies optimistic and pessimistic principles in a common framework, which enables a general analysis. For this family, we show that any confidence-adjusted index algorithm is minimax optimal, whether it be optimistic, pessimistic or neutral. Our analysis reveals that instance-dependent optimality, commonly used to establish optimality of on-line stochastic bandit algorithms, cannot be achieved by any algorithm in the batch setting. In particular, for any algorithm that performs optimally in some environment, there exists another environment where the same algorithm suffers arbitrarily larger regret. Therefore, to establish a framework for distinguishing algorithms, we introduce a new weighted-minimax criterion that considers the inherent difficulty of optimal value prediction. We demonstrate how this criterion can be used to justify commonly used pessimistic principles for batch policy optimization.\n
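\n The confidence-adjusted index family is easy to state concretely: index = empirical mean + beta times a confidence width, with beta &gt; 0 optimistic, beta &lt; 0 pessimistic and beta = 0 neutral. A toy sketch (the width formula and the numbers are assumptions):\n
<pre>
import numpy as np

def confidence_adjusted_index(means, counts, beta):
    """Index = empirical mean + beta * confidence width.
    beta > 0 is optimistic (UCB-like), beta < 0 pessimistic (LCB-like), beta = 0 greedy."""
    width = np.sqrt(1.0 / np.maximum(counts, 1))
    return means + beta * width

# toy batch data: pessimism (beta < 0) penalizes the poorly-covered arm
means = np.array([0.6, 0.9])
counts = np.array([500, 3])
print(np.argmax(confidence_adjusted_index(means, counts, beta=-2.0)))   # picks arm 0
print(np.argmax(confidence_adjusted_index(means, counts, beta=+2.0)))   # picks arm 1
</pre>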
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Leveraging Non-uniformity in First-order Non-convex Optimization.\n \n \n \n \n\n\n \n Mei, J.; Gao, Y.; Dai, B.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In ICML, pages 7555–7564, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Leveraging paper\n  \n \n \n \"Leveraging link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{MeiGDSS21,\n  author    = {Jincheng Mei and Yue Gao and Bo Dai and Csaba Szepesv\\'ari and Dale Schuurmans},\n  title     = {Leveraging Non-uniformity in First-order Non-convex Optimization},\n  pages     = {7555--7564},\n  crossref  = {ICML2021},\n  url_paper = {ICML2021-NonunifPG.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/mei21a.html},\n  abstract  = {Classical global convergence results for first-order methods rely on uniform smoothness and the Ł{}ojasiewicz inequality. Motivated by properties of objective functions that arise in machine learning, we propose a non-uniform refinement of these notions, leading to <em>Non-uniform Smoothness</em> (NS) and <em>Non-uniform Ł{}ojasiewicz inequality</em> (NŁ{}). The new definitions inspire new geometry-aware first-order methods that are able to converge to global optimality faster than the classical $\\Omega(1/t^2)$ lower bounds. To illustrate the power of these geometry-aware methods and their corresponding non-uniform analysis, we consider two important problems in machine learning: policy gradient optimization in reinforcement learning (PG), and generalized linear model training in supervised learning (GLM). For PG, we find that normalizing the gradient ascent method can accelerate convergence to $O(e^{- c \\cdot t})$ (where $c &gt; 0$) while incurring less overhead than existing algorithms. For GLM, we show that geometry-aware normalized gradient descent can also achieve a linear convergence rate, which significantly improves the best known results. We additionally show that the proposed geometry-aware gradient descent methods escape landscape plateaus faster than standard gradient descent. Experimental results are used to illustrate and complement the theoretical findings.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n Classical global convergence results for first-order methods rely on uniform smoothness and the Łojasiewicz inequality. Motivated by properties of objective functions that arise in machine learning, we propose a non-uniform refinement of these notions, leading to Non-uniform Smoothness (NS) and Non-uniform Łojasiewicz inequality (NŁ). The new definitions inspire new geometry-aware first-order methods that are able to converge to global optimality faster than the classical $Ω(1/t^2)$ lower bounds. To illustrate the power of these geometry-aware methods and their corresponding non-uniform analysis, we consider two important problems in machine learning: policy gradient optimization in reinforcement learning (PG), and generalized linear model training in supervised learning (GLM). For PG, we find that normalizing the gradient ascent method can accelerate convergence to $O(e^{- c · t})$ (where $c > 0$) while incurring less overhead than existing algorithms. For GLM, we show that geometry-aware normalized gradient descent can also achieve a linear convergence rate, which significantly improves the best known results. We additionally show that the proposed geometry-aware gradient descent methods escape landscape plateaus faster than standard gradient descent. Experimental results are used to illustrate and complement the theoretical findings.\n
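\n For the PG case, the geometry-aware method amounts to normalizing the gradient before stepping; a generic normalized-gradient-ascent sketch on a toy concave objective (step size, iteration count and the objective are assumptions) is:\n
<pre>
import numpy as np

def normalized_gradient_ascent(grad_fn, theta, eta=0.1, steps=200, eps=1e-12):
    """Geometry-aware update: step along grad / ||grad|| instead of the raw gradient.
    With a fixed step size this lands within about one step size of the optimum."""
    for _ in range(steps):
        g = grad_fn(theta)
        theta = theta + eta * g / (np.linalg.norm(g) + eps)
    return theta

# toy usage: maximize the concave quadratic f(x) = -||x - 1||^2
grad = lambda x: -2.0 * (x - 1.0)
print(normalized_gradient_ascent(grad, theta=np.zeros(3)))
</pre>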
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Improved Regret Bound and Experience Replay in Regularized Policy Iteration.\n \n \n \n \n\n\n \n Lazic, N.; Yin, D.; Abbasi-Yadkori, Y.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 6032–6042, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Improved paper\n  \n \n \n \"Improved link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{LazicYAS21,\n  author    = {Nevena Lazic and Dong Yin and Yasin Abbasi-Yadkori and Csaba Szepesv\\'ari},\n  title     = {Improved Regret Bound and Experience Replay in Regularized Policy Iteration},\n  pages     = {6032--6042},\n  crossref  = {ICML2021longpres},\n  url_paper = {ICML2021-Politex.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/lazic21a.html},\n  abstract  = {In this work, we study algorithms for learning in infinite-horizon undiscounted Markov decision processes (MDPs) with function approximation. We first show that the regret analysis of the Politex algorithm (a version of regularized policy iteration) can be sharpened from $O(T^{3/4})$ to $O(\\sqrt{T})$ under nearly identical assumptions, and instantiate the bound with linear function approximation. Our result provides the first high-probability $O(\\sqrt{T})$ regret bound for a computationally efficient algorithm in this setting. The exact implementation of Politex with neural network function approximation is inefficient in terms of memory and computation. Since our analysis suggests that we need to approximate the average of the action-value functions of past policies well, we propose a simple efficient implementation where we train a single Q-function on a replay buffer with past data. We show that this often leads to superior performance over other implementation choices, especially in terms of wall-clock time. Our work also provides a novel theoretical justification for using experience replay within policy iteration algorithms.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n In this work, we study algorithms for learning in infinite-horizon undiscounted Markov decision processes (MDPs) with function approximation. We first show that the regret analysis of the Politex algorithm (a version of regularized policy iteration) can be sharpened from $O(T^{3/4})$ to $O(\\sqrt{T})$ under nearly identical assumptions, and instantiate the bound with linear function approximation. Our result provides the first high-probability $O(\\sqrt{T})$ regret bound for a computationally efficient algorithm in this setting. The exact implementation of Politex with neural network function approximation is inefficient in terms of memory and computation. Since our analysis suggests that we need to approximate the average of the action-value functions of past policies well, we propose a simple efficient implementation where we train a single Q-function on a replay buffer with past data. We show that this often leads to superior performance over other implementation choices, especially in terms of wall-clock time. Our work also provides a novel theoretical justification for using experience replay within policy iteration algorithms.\n
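\n The proposed implementation trains a single Q-function on a replay buffer of past data and uses it in place of the running sum of past action-value estimates inside the softmax policy; a rough linear-Q sketch of that idea (all names, the least-squares fit and the temperature choice are assumptions, not the paper's implementation) is:\n
<pre>
import numpy as np

def fit_q_on_replay(buffer_phi, buffer_targets, lam=1e-3):
    """Fit one linear Q-function on the whole replay buffer by regularized least squares."""
    Phi = np.asarray(buffer_phi)
    y = np.asarray(buffer_targets)
    d = Phi.shape[1]
    return np.linalg.solve(Phi.T @ Phi + lam * np.eye(d), Phi.T @ y)

def politex_style_policy(q_values, num_phases, eta=0.1):
    """Softmax policy proportional to exp(eta * sum of past Q's), with the sum
    approximated by num_phases times the single Q fitted on the replay buffer."""
    z = eta * num_phases * q_values
    z = z - z.max()
    p = np.exp(z)
    return p / p.sum()

# toy usage: 2 actions, feature dimension 3
rng = np.random.default_rng(0)
phi = rng.normal(size=(50, 3))
targets = phi @ np.array([0.5, -0.2, 1.0]) + 0.1 * rng.normal(size=50)
w = fit_q_on_replay(phi, targets)
q_sa = np.array([phi[0] @ w, phi[1] @ w])     # toy Q-values for 2 actions in some state
print(politex_style_policy(q_sa, num_phases=4))
</pre>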
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Meta-Thompson Sampling.\n \n \n \n \n\n\n \n Kveton, B.; Konobeev, M.; Zaheer, M.; Hsu, C.; Mladenov, M.; Boutilier, C.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 5884–5893, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Meta-Thompson paper\n  \n \n \n \"Meta-Thompson link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{KvetonKZHMBS21,\n  author    = {Branislav Kveton and Mikhail Konobeev and Manzil Zaheer and Chih-Wei Hsu and Martin Mladenov and Craig Boutilier and Csaba Szepesv\\'ari},\n  crossref  = {ICML2021},\n  title     = {Meta-Thompson Sampling},\n  pages     = {5884--5893},\n  url_paper = {ICML2021-MetaTS.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/kveton21a.html},\n  abstract  = \t {Efficient exploration in bandits is a fundamental online learning problem. We propose a variant of Thompson sampling that learns to explore better as it interacts with bandit instances drawn from an unknown prior. The algorithm meta-learns the prior and thus we call it MetaTS. We propose several efficient implementations of MetaTS and analyze it in Gaussian bandits. Our analysis shows the benefit of meta-learning and is of a broader interest, because we derive a novel prior-dependent Bayes regret bound for Thompson sampling. Our theory is complemented by empirical evaluation, which shows that MetaTS quickly adapts to the unknown prior.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n Efficient exploration in bandits is a fundamental online learning problem. We propose a variant of Thompson sampling that learns to explore better as it interacts with bandit instances drawn from an unknown prior. The algorithm meta-learns the prior and thus we call it MetaTS. We propose several efficient implementations of MetaTS and analyze it in Gaussian bandits. Our analysis shows the benefit of meta-learning and is of a broader interest, because we derive a novel prior-dependent Bayes regret bound for Thompson sampling. Our theory is complemented by empirical evaluation, which shows that MetaTS quickly adapts to the unknown prior.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Distribution-dependent Analysis of Meta Learning.\n \n \n \n \n\n\n \n Konobeev, M.; Kuzborskij, I.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 5697–5706, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n \n \"A link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{KonobeevKS21,\n  author    = {Mikhail Konobeev and Ilja Kuzborskij and Csaba Szepesv\\'ari},\n  title     = {A Distribution-dependent Analysis of Meta Learning},\n  crossref  = {ICML2021},\n  pages     = {5697--5706},\n  url_paper = {ICML2021-KonobeevMetaLR.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/konobeev21a.html},\n  abstract  = {A key problem in the theory of meta-learning is to understand how the task distributions influence transfer risk, the expected error of a meta-learner on a new task drawn from the unknown task distribution. In this paper, focusing on fixed design linear regression with Gaussian noise and a Gaussian task (or parameter) distribution, we give distribution-dependent lower bounds on the transfer risk of any algorithm, while we also show that a novel, weighted version of the so-called biased regularized regression method is able to match these lower bounds up to a fixed constant factor. Notably, the weighting is derived from the covariance of the Gaussian task distribution. Altogether, our results provide a precise characterization of the difficulty of meta-learning in this Gaussian setting. While this problem setting may appear simple, we show that it is rich enough to unify the “parameter sharing” and “representation learning” streams of meta-learning; in particular, representation learning is obtained as the special case when the covariance matrix of the task distribution is unknown. For this case we propose to adopt the EM method, which is shown to enjoy efficient updates in our case. The paper is completed by an empirical study of EM. In particular, our experimental results show that the EM algorithm can attain the lower bound as the number of tasks grows, while the algorithm is also successful in competing with its alternatives when used in a representation learning context.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n A key problem in the theory of meta-learning is to understand how the task distributions influence transfer risk, the expected error of a meta-learner on a new task drawn from the unknown task distribution. In this paper, focusing on fixed design linear regression with Gaussian noise and a Gaussian task (or parameter) distribution, we give distribution-dependent lower bounds on the transfer risk of any algorithm, while we also show that a novel, weighted version of the so-called biased regularized regression method is able to match these lower bounds up to a fixed constant factor. Notably, the weighting is derived from the covariance of the Gaussian task distribution. Altogether, our results provide a precise characterization of the difficulty of meta-learning in this Gaussian setting. While this problem setting may appear simple, we show that it is rich enough to unify the “parameter sharing” and “representation learning” streams of meta-learning; in particular, representation learning is obtained as the special case when the covariance matrix of the task distribution is unknown. For this case we propose to adopt the EM method, which is shown to enjoy efficient updates in our case. The paper is completed by an empirical study of EM. In particular, our experimental results show that the EM algorithm can attain the lower bound as the number of tasks grows, while the algorithm is also successful in competing with its alternatives when used in a representation learning context.\n
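\n The weighted, biased regularized regression can be read as ridge regression shrunk toward the task-distribution mean, with the regularizer weighted by the task covariance; a numpy sketch under that reading (the paper's exact weighting may differ) is:\n
<pre>
import numpy as np

def biased_weighted_ridge(X, y, w0, Sigma_task, noise_var=1.0):
    """Regression biased toward the task-distribution mean w0, weighted by the
    (assumed known) task covariance Sigma_task:
    argmin_w ||y - Xw||^2 + noise_var * (w - w0)^T Sigma_task^{-1} (w - w0)."""
    Lam = noise_var * np.linalg.inv(Sigma_task)
    A = X.T @ X + Lam
    b = X.T @ y + Lam @ w0
    return np.linalg.solve(A, b)

# toy usage: a new task whose parameter is drawn around the meta-learned mean
rng = np.random.default_rng(0)
d, n = 4, 10
w0 = np.array([1.0, 0.0, -1.0, 0.5])
Sigma_task = 0.1 * np.eye(d)
w_task = rng.multivariate_normal(w0, Sigma_task)
X = rng.normal(size=(n, d))
y = X @ w_task + rng.normal(size=n)
print(biased_weighted_ridge(X, y, w0, Sigma_task))
</pre>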
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bootstrapping Fitted Q-Evaluation for Off-Policy Inference.\n \n \n \n \n\n\n \n Hao, B.; Ji, X.; Duan, Y.; Lu, H.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In ICML, pages 4074–4084, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Bootstrapping paper\n  \n \n \n \"Bootstrapping link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{HaoJDLSW21,\n  author    = {Botao Hao and Xiang Ji and Yaqi Duan and Hao Lu and Csaba Szepesv\\'ari and Mengdi Wang},\n  title     = {Bootstrapping Fitted Q-Evaluation for Off-Policy Inference},\n  crossref  = {ICML2021},\n  pages     = {4074--4084},\n  url_paper = {IMCL2021-Botao-BootstrappingFQE.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/hao21b.html},\n  abstract = \t {Bootstrapping provides a flexible and effective approach for assessing the quality of batch reinforcement learning, yet its theoretical properties are poorly understood. In this paper, we study the use of bootstrapping in off-policy evaluation (OPE), and in particular, we focus on the fitted Q-evaluation (FQE) that is known to be minimax-optimal in the tabular and linear-model cases. We propose a bootstrapping FQE method for inferring the distribution of the policy evaluation error and show that this method is asymptotically efficient and distributionally consistent for off-policy statistical inference. To overcome the computation limit of bootstrapping, we further adapt a subsampling procedure that improves the runtime by an order of magnitude. We numerically evaluate the bootrapping method in classical RL environments for confidence interval estimation, estimating the variance of off-policy evaluator, and estimating the correlation between multiple off-policy evaluators.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n Bootstrapping provides a flexible and effective approach for assessing the quality of batch reinforcement learning, yet its theoretical properties are poorly understood. In this paper, we study the use of bootstrapping in off-policy evaluation (OPE), and in particular, we focus on the fitted Q-evaluation (FQE) that is known to be minimax-optimal in the tabular and linear-model cases. We propose a bootstrapping FQE method for inferring the distribution of the policy evaluation error and show that this method is asymptotically efficient and distributionally consistent for off-policy statistical inference. To overcome the computation limit of bootstrapping, we further adapt a subsampling procedure that improves the runtime by an order of magnitude. We numerically evaluate the bootstrapping method in classical RL environments for confidence interval estimation, estimating the variance of the off-policy evaluator, and estimating the correlation between multiple off-policy evaluators.\n
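\n The bootstrap itself is simple to sketch: resample logged episodes with replacement, re-run the value estimator on each resample, and read confidence intervals off the resulting distribution. In the sketch below the estimator is a stand-in Monte-Carlo return average rather than FQE, the subsampled variant is shown without its rescaling details, and all names and numbers are assumptions rather than the paper's procedure.\n
<pre>
import numpy as np

def bootstrap_value_estimates(episodes, estimator, num_boot=200, subsample=None, rng=None):
    """Bootstrap an off-policy value estimator at the episode level.
    `estimator` maps a list of episodes to a scalar value estimate (e.g. an FQE routine);
    `subsample` < len(episodes) gives a cheaper m-out-of-n variant."""
    rng = rng or np.random.default_rng()
    n = len(episodes)
    m = subsample or n
    ests = []
    for _ in range(num_boot):
        idx = rng.integers(0, n, size=m)
        ests.append(estimator([episodes[i] for i in idx]))
    return np.array(ests)

# toy usage: stand-in estimator = average discounted return of the logged episodes
rng = np.random.default_rng(0)
episodes = [rng.normal(loc=1.0, size=rng.integers(3, 8)) for _ in range(100)]
mc_value = lambda eps: float(np.mean([np.sum(0.9 ** np.arange(len(r)) * r) for r in eps]))
boots = bootstrap_value_estimates(episodes, mc_value, subsample=50, rng=rng)
print(np.percentile(boots, [5, 95]))        # bootstrap confidence interval
</pre>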
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Sparse Feature Selection Makes Batch Reinforcement Learning More Sample Efficient.\n \n \n \n \n\n\n \n Hao, B.; Duan, Y.; Lattimore, T.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In ICML, pages 4063–4073, 07 2021. \n \n\n\n\n
\n\n\n\n \n \n \"Sparse paper\n  \n \n \n \"Sparse link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{HaoDLSW21,\n  author    = {Botao Hao and Yaqi Duan and Tor Lattimore and Csaba Szepesv\\'ari and Mengdi Wang},\n  title     = {Sparse Feature Selection Makes Batch Reinforcement Learning More Sample                Efficient},\n  crossref  = {ICML2021},\n  pages     = {4063--4073},\n  url_paper = {ICML2021-BotaoSparseBatchRL.pdf},\n  url_link  = {http://proceedings.mlr.press/v139/hao21a.html},\n  abstract = \t {This paper provides a statistical analysis of high-dimensional batch reinforcement learning (RL) using sparse linear function approximation. When there is a large number of candidate features, our result sheds light on the fact that sparsity-aware methods can make batch RL more sample efficient. We first consider the off-policy policy evaluation problem. To evaluate a new target policy, we analyze a Lasso fitted Q-evaluation method and establish a finite-sample error bound that has no polynomial dependence on the ambient dimension. To reduce the Lasso bias, we further propose a post model-selection estimator that applies fitted Q-evaluation to the features selected via group Lasso. Under an additional signal strength assumption, we derive a sharper instance-dependent error bound that depends on a divergence function measuring the distribution mismatch between the data distribution and occupancy measure of the target policy. Further, we study the Lasso fitted Q-iteration for batch policy optimization and establish a finite-sample error bound depending on the ratio between the number of relevant features and restricted minimal eigenvalue of the data’s covariance. In the end, we complement the results with minimax lower bounds for batch-data policy evaluation/optimization that nearly match our upper bounds. The results suggest that having well-conditioned data is crucial for sparse batch policy learning.},\n  booktitle = {ICML},\n  month = {07},\n  year = {2021},\n}\n\n
\n
\n\n\n
\n This paper provides a statistical analysis of high-dimensional batch reinforcement learning (RL) using sparse linear function approximation. When there is a large number of candidate features, our result sheds light on the fact that sparsity-aware methods can make batch RL more sample efficient. We first consider the off-policy policy evaluation problem. To evaluate a new target policy, we analyze a Lasso fitted Q-evaluation method and establish a finite-sample error bound that has no polynomial dependence on the ambient dimension. To reduce the Lasso bias, we further propose a post model-selection estimator that applies fitted Q-evaluation to the features selected via group Lasso. Under an additional signal strength assumption, we derive a sharper instance-dependent error bound that depends on a divergence function measuring the distribution mismatch between the data distribution and occupancy measure of the target policy. Further, we study the Lasso fitted Q-iteration for batch policy optimization and establish a finite-sample error bound depending on the ratio between the number of relevant features and restricted minimal eigenvalue of the data’s covariance. In the end, we complement the results with minimax lower bounds for batch-data policy evaluation/optimization that nearly match our upper bounds. The results suggest that having well-conditioned data is crucial for sparse batch policy learning.\n
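\n A single Lasso fitted Q-evaluation step is a sparse regression of the bootstrapped targets r + gamma * Q_prev(s', pi(s')) on the state-action features; the sketch below uses a plain ISTA solver and synthetic data, and every name and constant is an assumption for illustration.\n
<pre>
import numpy as np

def lasso_ista(X, y, lam, iters=500):
    """Plain ISTA solver for 0.5*||y - Xw||^2 / n + lam*||w||_1."""
    n, d = X.shape
    w = np.zeros(d)
    step = 1.0 / (np.linalg.norm(X, 2) ** 2 / n)     # inverse Lipschitz constant
    for _ in range(iters):
        g = X.T @ (X @ w - y) / n
        z = w - step * g
        w = np.sign(z) * np.maximum(np.abs(z) - step * lam, 0.0)
    return w

def lasso_fqe_iteration(phi_sa, rewards, phi_next_pi, w_prev, gamma=0.99, lam=0.05):
    """One fitted Q-evaluation step with an L1 penalty:
    regress r + gamma * Q_prev(s', pi(s')) on the current state-action features."""
    targets = rewards + gamma * phi_next_pi @ w_prev
    return lasso_ista(phi_sa, targets, lam)

# toy usage: sparse true Q, high-dimensional features
rng = np.random.default_rng(0)
n, d = 200, 50
phi_sa = rng.normal(size=(n, d))
phi_next_pi = rng.normal(size=(n, d))
w_star = np.zeros(d); w_star[:3] = [1.0, -0.5, 0.8]
rewards = phi_sa @ w_star - 0.99 * (phi_next_pi @ w_star) + 0.1 * rng.normal(size=n)
print(np.round(lasso_fqe_iteration(phi_sa, rewards, phi_next_pi, w_prev=w_star), 2)[:6])
</pre>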
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Exponential Lower Bounds for Planning in MDPs With Linearly-Realizable Optimal Action-Value Functions.\n \n \n \n \n\n\n \n Weisz, G.; Amortila, P.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, volume 132, pages 1237-1264, 03 2021. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Exponential link\n  \n \n \n \"Exponential paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 18 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{Weisz21ALT,\n\tabstract = {We consider the problem of local planning in fixed-horizon and discounted Markov Decision Processes (MDPs)\nwith linear function approximation and a generative model\nunder the assumption that the optimal action-value function lies in the span of a feature\nmap that is available to the planner. Previous work has left open the question of whether there exist sound planners that need only $\\text{poly}(H,d)$ queries\nregardless of the MDP, where $H$ is the horizon and $d$ is the dimensionality of the features.\nWe answer this question in the negative: we show that any sound planner must query at least\n$\\min(e^{\\Omega(d)},\\Omega(2^H))$ samples in the fized-horizon setting and $e^{\\Omega(d)}$ samples in the discounted setting.\\todog{added discounted setting}\nWe also show that for any $\\delta>0$, the least-squares value iteration algorithm with\n$\\tilde{\\mathcal{O}}(H^5 d^{H+1}/\\delta^2)$ queries can compute a $\\delta$-optimal policy in the fixed-horizon setting.\nWe discuss implications and remaining open questions.\n},\n\tacceptrate = {\\textbf{Best student paper award}, 2 out of 157 = 1\\%},\n\tauthor = {Weisz, Gell\\'ert and Amortila, Philip and Szepesv\\'ari, Csaba},\n\tbooktitle = {ALT},\n\tmonth = {03},\n\tpages = {1237-1264},\n\tpublisher = {PMLR},\n\ttitle = {Exponential Lower Bounds for Planning in MDPs With Linearly-Realizable Optimal Action-Value Functions},\n\turl_link = {http://proceedings.mlr.press/v132/weisz21a.html},\n\turl_paper = {alt21-weisz.pdf},\n\tvolume = 132,\n\tyear = 2021}\n\n
\n
\n\n\n
\n We consider the problem of local planning in fixed-horizon and discounted Markov Decision Processes (MDPs) with linear function approximation and a generative model under the assumption that the optimal action-value function lies in the span of a feature map that is available to the planner. Previous work has left open the question of whether there exist sound planners that need only $\text{poly}(H,d)$ queries regardless of the MDP, where $H$ is the horizon and $d$ is the dimensionality of the features. We answer this question in the negative: we show that any sound planner must query at least $\min(e^{Ω(d)},Ω(2^H))$ samples in the fixed-horizon setting and $e^{Ω(d)}$ samples in the discounted setting. We also show that for any $δ>0$, the least-squares value iteration algorithm with $\tilde{\mathcal{O}}(H^5 d^{H+1}/δ^2)$ queries can compute a $δ$-optimal policy in the fixed-horizon setting. We discuss implications and remaining open questions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Confident Off-Policy Evaluation and Selection through Self-Normalized Importance Weighting.\n \n \n \n \n\n\n \n Kuzborskij, I.; Vernade, C.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Banerjee, A.; and Fukumizu, K., editor(s), AISTATS, pages 640–648, 04 2021. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Confident link\n  \n \n \n \"Confident paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 20 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{Kuzborskij2021-xi,\n\tabstract = {We consider off-policy evaluation in the contextual bandit\n               setting for the purpose of obtaining a robust off-policy\n               selection strategy, where the selection strategy is evaluated\n               based on the value of the chosen policy in a set of proposal\n               (target) policies. We propose a new method to compute a lower\n               bound on the value of an arbitrary target policy given some\n               logged data in contextual bandits for a desired coverage. The\n               lower bound is built around the so-called Self-normalized\n               Importance Weighting (SN) estimator. It combines the use of a\n               semi-empirical Efron-Stein tail inequality to control the\n               concentration and Harris' inequality to control the bias. The\n               new approach is evaluated on a number of synthetic and real\n               datasets and is found to be superior to its main competitors,\n               both in terms of tightness of the confidence intervals and the\n               quality of the policies chosen.},\n\tacceptrate = {455 out of 1527 = 30\\%},\n\tauthor = {Kuzborskij, Ilja and Vernade, Claire and Gy{\\"o}rgy, Andr\\'as and Szepesv\\'ari, Csaba},\n\tbooktitle = {AISTATS},\n\teditor = {Banerjee, Arindam and Fukumizu, Kenji},\n\tmonth = {04},\n\tpages = {640--648},\n\tpublisher = {PMLR},\n\ttitle = {Confident Off-Policy Evaluation and Selection through Self-Normalized Importance Weighting},\n\turl_link = {http://proceedings.mlr.press/v130/kuzborskij21a.html},\n\turl_paper = {AISTATS2021_WIS.pdf},\n\tyear = 2021}\n\n
\n
\n\n\n
\n We consider off-policy evaluation in the contextual bandit setting for the purpose of obtaining a robust off-policy selection strategy, where the selection strategy is evaluated based on the value of the chosen policy in a set of proposal (target) policies. We propose a new method to compute a lower bound on the value of an arbitrary target policy given some logged data in contextual bandits for a desired coverage. The lower bound is built around the so-called Self-normalized Importance Weighting (SN) estimator. It combines the use of a semi-empirical Efron-Stein tail inequality to control the concentration and Harris' inequality to control the bias. The new approach is evaluated on a number of synthetic and real datasets and is found to be superior to its main competitors, both in terms of tightness of the confidence intervals and the quality of the policies chosen.\n
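\n The SN estimator at the heart of the bound is the ratio sum_i w_i r_i / sum_i w_i with importance weights w_i = pi_target(a_i|x_i) / pi_behaviour(a_i|x_i); a minimal numpy sketch with a toy logging policy (the data and policies are assumptions) is:\n
<pre>
import numpy as np

def snips_estimate(behaviour_probs, target_probs, rewards):
    """Self-normalized importance weighting (SN) estimate of the target policy's value:
    sum_i w_i r_i / sum_i w_i with w_i = pi_target(a_i|x_i) / pi_behaviour(a_i|x_i)."""
    w = target_probs / behaviour_probs
    return float(np.sum(w * rewards) / np.sum(w))

# toy usage: logged data from a uniform behaviour policy over 2 actions
rng = np.random.default_rng(0)
actions = rng.integers(0, 2, size=1000)
rewards = rng.binomial(1, np.where(actions == 1, 0.7, 0.3)).astype(float)
behaviour = np.full(1000, 0.5)
target = np.where(actions == 1, 0.9, 0.1)     # target policy prefers action 1
print(snips_estimate(behaviour, target, rewards))   # close to 0.9*0.7 + 0.1*0.3 = 0.66
</pre>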
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Sparse Reinforcement Learning.\n \n \n \n \n\n\n \n Hao, B.; Lattimore, T.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In Banerjee, A.; and Fukumizu, K., editor(s), AISTATS, pages 316–324, 04 2021. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Online link\n  \n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 17 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{Hao2021-ut,\n\tabstract = {We investigate the hardness of online reinforcement learning in\n               sparse linear Markov decision process (MDP), with a special\n               focus on the high-dimensional regime where the ambient dimension\n               is larger than the number of episodes. Our contribution is\n               two-fold. First, we provide a lower bound showing that linear\n               regret is generally unavoidable, even if there exists a policy\n               that collects well-conditioned data. Second, we show that if the\n               learner has oracle access to a policy that collects\n               well-conditioned data, then a variant of Lasso fitted\n               Q-iteration enjoys a regret of $O(N^{2/3})$ where $N$ is the\n               number of episodes.},\n\tacceptrate = {455 out of 1527 = 30\\%},\n\tauthor = {Hao, Botao and Lattimore, Tor and Szepesv\\'ari, Csaba and Wang, Mengdi},\n\tbooktitle = {AISTATS},\n\teditor = {Banerjee, Arindam and Fukumizu, Kenji},\n\tmonth = {04},\n\tpages = {316--324},\n\tpublisher = {PMLR},\n\ttitle = {Online Sparse Reinforcement Learning},\n\turl_link = {http://proceedings.mlr.press/v130/hao21a.html},\n\turl_paper = {AISTATS2021_SparseOnlineMDP.pdf},\n\tyear = 2021}\n\n
\n
\n\n\n
\n We investigate the hardness of online reinforcement learning in sparse linear Markov decision process (MDP), with a special focus on the high-dimensional regime where the ambient dimension is larger than the number of episodes. Our contribution is two-fold. First, we provide a lower bound showing that linear regret is generally unavoidable, even if there exists a policy that collects well-conditioned data. Second, we show that if the learner has oracle access to a policy that collects well-conditioned data, then a variant of Lasso fitted Q-iteration enjoys a regret of $O(N^{2/3})$ where $N$ is the number of episodes.\n
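For illustration, the Lasso regression step inside fitted Q-iteration mentioned above can be sketched as follows. This is a generic sketch with hypothetical names (phi, data, actions, q_next); the paper's algorithm additionally assumes oracle access to a policy that collects well-conditioned data and is stated for episodic MDPs.

import numpy as np
from sklearn.linear_model import Lasso

def lasso_fqi_step(phi, data, actions, q_next, alpha):
    """One backward-induction step of Lasso-regularized fitted Q-iteration.
    phi(s, a) returns a d-dimensional numpy feature vector; data is a list of
    (s, a, r, s_next) transitions; q_next estimates the value at the next stage."""
    X = np.array([phi(s, a) for (s, a, r, s_next) in data])
    y = np.array([r + max(q_next(s_next, b) for b in actions) for (s, a, r, s_next) in data])
    model = Lasso(alpha=alpha)                 # l1 penalty encourages a sparse weight vector
    model.fit(X, y)
    return lambda s, a: float(model.predict(phi(s, a).reshape(1, -1))[0])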
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Adaptive Approximate Policy Iteration.\n \n \n \n \n\n\n \n Hao, B.; Lazic, N.; Abbasi-Yadkori, Y.; Joulani, P.; and Szepesvári, C.\n\n\n \n\n\n\n In Banerjee, A.; and Fukumizu, K., editor(s), AISTATS, pages 523–531, 04 2021. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Adaptive link\n  \n \n \n \"Adaptive paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 9 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{Hao2021-ig,\n\tabstract = {Model-free reinforcement learning algorithms combined with value\n               function approximation have recently achieved impressive\n               performance in a variety of application domains. However, the\n               theoretical understanding of such algorithms is limited, and\n               existing results are largely focused on episodic or discounted\n               Markov decision processes (MDPs). In this work, we present\n               adaptive approximate policy iteration (AAPI), a learning scheme\n               which enjoys a $O(T^{2/3})$ regret bound for undiscounted,\n               continuing learning in uniformly ergodic MDPs. This is an\n               improvement over the best existing bound of $O(T^{3/4})$ for the\n               average-reward case with function approximation. Our algorithm\n               and analysis rely on online learning techniques, where value\n               functions are treated as losses. The main technical novelty is\n               the use of a data-dependent adaptive learning rate coupled with\n               a so-called optimistic prediction of upcoming losses. In\n               addition to theoretical guarantees, we demonstrate the\n               advantages of our approach empirically on several environments.},\n\tacceptrate = {455 out of 1527 = 30\\%},\n\tauthor = {Hao, Botao and Lazic, Nevena and Abbasi-Yadkori, Yasin and Joulani, Pooria and Szepesv\\'ari, Csaba},\n\tbooktitle = {AISTATS},\n\teditor = {Banerjee, Arindam and Fukumizu, Kenji},\n\tmonth = {04},\n\tpages = {523--531},\n\tpublisher = {PMLR},\n\ttitle = {Adaptive Approximate Policy Iteration},\n\turl_link = {http://proceedings.mlr.press/v130/hao21b.html},\n\turl_paper = {AISTATS2021_AAPI.pdf},\n\tyear = 2021}\n\n
\n
\n\n\n
\n Model-free reinforcement learning algorithms combined with value function approximation have recently achieved impressive performance in a variety of application domains. However, the theoretical understanding of such algorithms is limited, and existing results are largely focused on episodic or discounted Markov decision processes (MDPs). In this work, we present adaptive approximate policy iteration (AAPI), a learning scheme which enjoys a $O(T^{2/3})$ regret bound for undiscounted, continuing learning in uniformly ergodic MDPs. This is an improvement over the best existing bound of $O(T^{3/4})$ for the average-reward case with function approximation. Our algorithm and analysis rely on online learning techniques, where value functions are treated as losses. The main technical novelty is the use of a data-dependent adaptive learning rate coupled with a so-called optimistic prediction of upcoming losses. In addition to theoretical guarantees, we demonstrate the advantages of our approach empirically on several environments.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n NeurIPS.\n \n \n \n\n\n \n \n\n\n \n\n\n\n 12 2021.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@proceedingsNOTUSED{NeurIPS2021oral,\n      acceptrate = {\\textbf{oral}, 55 out of 9122 = 1\\%},\n      entrysubtype = {notmine},\n      month = {12},\n      title = {NeurIPS},\n      year = {2021}\n      }\n\n    
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2020\n \n \n (20)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Escaping the Gravitational Pull of Softmax.\n \n \n \n \n\n\n \n Mei, J.; Xiao, C; Dai, B.; Li, L.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In NeurIPS, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Escaping link\n  \n \n \n \"Escaping paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 46 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{MXDLSzS20,\n\tabstract = {The softmax is the standard transformation used in machine learning to map real-valued vectors to categorical distributions. Unfortunately, this transform poses serious drawbacks for gradient descent (ascent) optimization. We reveal this difficulty by establishing two negative results: (1) optimizing any expectation with respect to the softmax must exhibit sensitivity to parameter initialization (<code>softmax gravity well</code>), and (2) optimizing log-probabilities under the softmax must exhibit slow convergence (</code>softmax damping</code>). Both findings are based on an analysis of convergence rates using the Non-uniform \\L{}ojasiewicz (N\\L{}) inequalities. To circumvent these shortcomings we investigate an alternative transformation, the <em>escort</em> mapping, that demonstrates better optimization properties. The disadvantages of the softmax and the effectiveness of the escort transformation are further explained using the concept of N\\L{} coefficient. In addition to proving bounds on convergence rates to firmly establish these results, we also provide experimental evidence for the superiority of the escort transformation.},\n\tauthor = {Mei, J. and Xiao, C and Dai, B. and Li, L. and Szepesv{\\'a}ri, Cs. and Schuurmans, D.},\n\tcrossref = {NeurIPS2020oral},\n\tmonth = {12},\n\ttitle = {Escaping the Gravitational Pull of Softmax},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/f1cf2a082126bf02de0b307778ce73a7-Abstract.html},\n\turl_paper = {NeurIPS2020_pg.pdf},\n    booktitle = {NeurIPS},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n The softmax is the standard transformation used in machine learning to map real-valued vectors to categorical distributions. Unfortunately, this transform poses serious drawbacks for gradient descent (ascent) optimization. We reveal this difficulty by establishing two negative results: (1) optimizing any expectation with respect to the softmax must exhibit sensitivity to parameter initialization (softmax gravity well), and (2) optimizing log-probabilities under the softmax must exhibit slow convergence (softmax damping). Both findings are based on an analysis of convergence rates using the Non-uniform Łojasiewicz (NŁ) inequalities. To circumvent these shortcomings we investigate an alternative transformation, the escort mapping, that demonstrates better optimization properties. The disadvantages of the softmax and the effectiveness of the escort transformation are further explained using the concept of NŁ coefficient. In addition to proving bounds on convergence rates to firmly establish these results, we also provide experimental evidence for the superiority of the escort transformation.\n
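For illustration, a minimal numpy sketch of the two transforms compared above. The escort mapping is written here as probabilities proportional to |theta_i|^p (recalled from the paper, so treat the exact form as an assumption), with p a hyperparameter.

import numpy as np

def softmax(theta):
    z = np.asarray(theta, dtype=float)
    z = z - z.max()                      # shift for numerical stability
    e = np.exp(z)
    return e / e.sum()

def escort(theta, p=2.0):
    w = np.abs(np.asarray(theta, dtype=float)) ** p   # escort weights |theta_i|^p
    return w / w.sum()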
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficient Planning in Large MDPs with Weak Linear Function Approximation.\n \n \n \n \n\n\n \n Shariff, R.; and Szepesvári, C.\n\n\n \n\n\n\n In NeurIPS, pages 19163–19174, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Efficient link\n  \n \n \n \"Efficient paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 36 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{SSz20,\n\tabstract = {Large-scale Markov decision processes (MDPs) require planning algorithms with runtime independent of the number of states of the MDP. We consider the planning problem in MDPs using linear value function approximation with only weak requirements: low approximation error for the optimal value function, and a small set of ``core'' states whose features span those of other states. In particular, we make no assumptions about the representability of policies or value functions of non-optimal policies. Our algorithm produces almost-optimal actions for any state using a generative oracle (simulator) for the MDP, while its computation time scales polynomially with the number of features, core states, and actions and the effective horizon.},\n\tauthor = {Shariff, R. and Szepesv{\\'a}ri, Cs.},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {Efficient Planning in Large MDPs with Weak Linear Function Approximation},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/de07edeeba9f475c9395959494cd8f64-Abstract.html},\n\turl_paper = {NeurIPS2020_alp.pdf},\n    booktitle = {NeurIPS},\n    pages = {19163--19174},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n Large-scale Markov decision processes (MDPs) require planning algorithms with runtime independent of the number of states of the MDP. We consider the planning problem in MDPs using linear value function approximation with only weak requirements: low approximation error for the optimal value function, and a small set of ``core'' states whose features span those of other states. In particular, we make no assumptions about the representability of policies or value functions of non-optimal policies. Our algorithm produces almost-optimal actions for any state using a generative oracle (simulator) for the MDP, while its computation time scales polynomially with the number of features, core states, and actions and the effective horizon.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n ImpatientCapsAndRuns: Approximately Optimal Algorithm Configuration from an Infinite Pool.\n \n \n \n \n\n\n \n Weisz, G.; György, A.; Lin, W.; Graham, D.; Leyton-Brown, K.; Szepesvári, C.; and Lucier, B.\n\n\n \n\n\n\n In NeurIPS, pages 17478–17488, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"ImpatientCapsAndRuns: link\n  \n \n \n \"ImpatientCapsAndRuns: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 11 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{WGLGLBSzL20,\n\tabstract = {Algorithm configuration procedures optimize parameters of a given algorithm to perform well over a distribution of inputs. Recent theoretical work focused on the case of selecting between a small number of alternatives. In practice, parameter spaces are often very large or infinite, and so successful heuristic procedures discard parameters impatiently'', based on very few observations. Inspired by this idea, we introduce ImpatientCapsAndRuns, which quickly discards less promising configurations, significantly speeding up the search procedure compared to previous algorithms with theoretical guarantees, while still achieving optimal runtime up to logarithmic factors under mild assumptions. Experimental results demonstrate a practical improvement.},\n\tauthor = {Weisz, G. and Gy{\\"o}rgy, A. and Lin, W.-I and Graham, Devon and Leyton-Brown, K. and Szepesv{\\'a}ri, Cs. and Lucier, B.},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {ImpatientCapsAndRuns: Approximately Optimal Algorithm Configuration from an Infinite Pool},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/ca5520b5672ea120b23bde75c46e76c6-Abstract.html},\n\turl_paper = {NeurIPS2020_impcr.pdf},\n    booktitle = {NeurIPS},\n    pages = {17478--17488},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n Algorithm configuration procedures optimize parameters of a given algorithm to perform well over a distribution of inputs. Recent theoretical work focused on the case of selecting between a small number of alternatives. In practice, parameter spaces are often very large or infinite, and so successful heuristic procedures discard parameters ``impatiently'', based on very few observations. Inspired by this idea, we introduce ImpatientCapsAndRuns, which quickly discards less promising configurations, significantly speeding up the search procedure compared to previous algorithms with theoretical guarantees, while still achieving optimal runtime up to logarithmic factors under mild assumptions. Experimental results demonstrate a practical improvement.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayes Analysis Beyond the Usual Bounds.\n \n \n \n \n\n\n \n Rivasplata, O.; Kuzborskij, I.; Szepesvári, C.; and Shawe-Taylor, J.\n\n\n \n\n\n\n In NeurIPS, pages 16833–16845, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-Bayes link\n  \n \n \n \"PAC-Bayes paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 9 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{RKSzST20,\n\tabstract = {We focus on a stochastic learning model where the learner observes a finite set of training examples and the output of the learning process is a data-dependent distribution over a space of hypotheses. The learned data-dependent distribution is then used to make randomized predictions, and the high-level theme addressed here is guaranteeing the quality of predictions on examples that were not seen during training, i.e. generalization. In this setting the unknown quantity of interest is the expected risk of the data-dependent randomized predictor, for which upper bounds can be derived via a PAC-Bayes analysis, leading to PAC-Bayes bounds. Specifically, we present a basic PAC-Bayes inequality for stochastic kernels, from which one may derive extensions of various known PAC-Bayes bounds as well as novel bounds. We clarify the role of the requirements of fixed `data-free' priors, bounded losses, and i.i.d. data. We highlight that those requirements were used to upper-bound an exponential moment term, while the basic PAC-Bayes theorem remains valid without those restrictions. We present three bounds that illustrate the use of data-dependent priors, including one for the unbounded square loss.},\n\tacceptrate = {1900 out of 9454 = 20\\%},\n\tauthor = {Rivasplata, Omar and Kuzborskij, Ilja and Szepesv{\\'a}ri, Csaba and Shawe-Taylor, John},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {PAC-Bayes Analysis Beyond the Usual Bounds},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/c3992e9a68c5ae12bd18488bc579b30d-Abstract.html},\n\turl_paper = {NeurIPS2020_PACBayes.pdf},\n    booktitle = {NeurIPS},\n    pages = {16833--16845},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n We focus on a stochastic learning model where the learner observes a finite set of training examples and the output of the learning process is a data-dependent distribution over a space of hypotheses. The learned data-dependent distribution is then used to make randomized predictions, and the high-level theme addressed here is guaranteeing the quality of predictions on examples that were not seen during training, i.e. generalization. In this setting the unknown quantity of interest is the expected risk of the data-dependent randomized predictor, for which upper bounds can be derived via a PAC-Bayes analysis, leading to PAC-Bayes bounds. Specifically, we present a basic PAC-Bayes inequality for stochastic kernels, from which one may derive extensions of various known PAC-Bayes bounds as well as novel bounds. We clarify the role of the requirements of fixed `data-free' priors, bounded losses, and i.i.d. data. We highlight that those requirements were used to upper-bound an exponential moment term, while the basic PAC-Bayes theorem remains valid without those restrictions. We present three bounds that illustrate the use of data-dependent priors, including one for the unbounded square loss.\n
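For context only, a classical member of the PAC-Bayes family (with a fixed data-free prior $P$, losses in $[0,1]$ and i.i.d. data, i.e. exactly the restrictions the paper relaxes) is the following bound: with probability at least $1-\delta$, simultaneously for all posteriors $Q$,
\[
\mathbb{E}_{h\sim Q}\,R(h) \;\le\; \mathbb{E}_{h\sim Q}\,\hat R_n(h) \;+\; \sqrt{\frac{\mathrm{KL}(Q\,\|\,P) + \ln(2\sqrt{n}/\delta)}{2n}} ,
\]
where $n$ is the sample size, $R$ the expected risk and $\hat R_n$ the empirical risk. This is background, not one of the paper's new bounds.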
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model Selection in Contextual Stochastic Bandit Problems.\n \n \n \n \n\n\n \n Pacchiano, A.; Phan, M.; Abbasi-Yadkori, Y.; Rao, A.; Zimmert, J.; Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In NeurIPS, pages 10328–10337, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Model link\n  \n \n \n \"Model paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{PPARZLSz20,\n\tabstract = {We study bandit model selection in stochastic environments. Our approach relies on a master algorithm that selects between candidate base algorithms. We develop a master-base algorithm abstraction that can work with general classes of base algorithms and different type of adversarial master algorithms. Our methods rely on a novel and generic smoothing transformation for bandit algorithms that permits us to obtain optimal\n$O(\\sqrt{T})$\n model selection guarantees for stochastic contextual bandit problems as long as the optimal base algorithm satisfies a high probability regret guarantee. We show through a lower bound that even when one of the base algorithms has\n$O(\\log(T))$\n regret, in general it is impossible to get better than\n$\\Omega(\\sqrt{T})$\n regret in model selection, even asymptotically. Using our techniques, we address model selection in a variety of problems such as misspecified linear contextual bandits, linear bandit with unknown dimension  and reinforcement learning with unknown feature maps. Our algorithm requires the knowledge of the optimal base regret to adjust the master learning rate. We show that without such prior knowledge any master can suffer a regret larger than the optimal base regret.},\n\tauthor = {Pacchiano, A. and Phan, M. and Abbasi-Yadkori, Y. and Rao, A. and Zimmert, J. and Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {Model Selection in Contextual Stochastic Bandit Problems},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/751d51528afe5e6f7fe95dece4ed32ba-Abstract.html},\n\turl_paper = {NeurIPS2020_modelselect.pdf},\n    booktitle = {NeurIPS},\n    pages = {10328--10337},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n We study bandit model selection in stochastic environments. Our approach relies on a master algorithm that selects between candidate base algorithms. We develop a master-base algorithm abstraction that can work with general classes of base algorithms and different types of adversarial master algorithms. Our methods rely on a novel and generic smoothing transformation for bandit algorithms that permits us to obtain optimal $O(\sqrt{T})$ model selection guarantees for stochastic contextual bandit problems as long as the optimal base algorithm satisfies a high probability regret guarantee. We show through a lower bound that even when one of the base algorithms has $O(\log(T))$ regret, in general it is impossible to get better than $Ω(\sqrt{T})$ regret in model selection, even asymptotically. Using our techniques, we address model selection in a variety of problems such as misspecified linear contextual bandits, linear bandit with unknown dimension and reinforcement learning with unknown feature maps. Our algorithm requires the knowledge of the optimal base regret to adjust the master learning rate. We show that without such prior knowledge any master can suffer a regret larger than the optimal base regret.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n CoinDICE: Off-Policy Confidence Interval Estimation.\n \n \n \n \n\n\n \n Dai, B.; Nachum, O.; Chow, Y.; Li, L.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In NeurIPS, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"CoinDICE: link\n  \n \n \n \"CoinDICE: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{DBCKSzS20,\n\tabstract = {We study high-confidence behavior-agnostic off-policy evaluation in reinforcement learning, where the goal is to estimate a confidence interval on a target policy's value, given only access to a static experience dataset collected by unknown behavior policies. Starting from a function space embedding of the linear program formulation of the Q-function, we obtain an optimization problem with generalized estimating equation constraints. By applying the generalized empirical likelihood method to the resulting Lagrangian, we propose CoinDICE, a novel and efficient algorithm for computing confidence intervals. Theoretically, we prove the obtained confidence intervals are valid, in both asymptotic and finite-sample regimes. Empirically, we show in a variety of benchmarks that the confidence interval estimates are tighter and more accurate than existing methods.},\n\tauthor = {Dai, B. and Nachum, O. and Chow, Y. and Li, L. and Szepesv{\\'a}ri, Cs. and Schuurmans, D.},\n\tcrossref = {NeurIPS2020spotlight},\n\tmonth = {12},\n\ttitle = {CoinDICE: Off-Policy Confidence Interval Estimation},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/6aaba9a124857622930ca4e50f5afed2-Abstract.html},\n\turl_paper = {NeurIPS2020_coindice.pdf},\n    booktitle = {NeurIPS},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n We study high-confidence behavior-agnostic off-policy evaluation in reinforcement learning, where the goal is to estimate a confidence interval on a target policy's value, given only access to a static experience dataset collected by unknown behavior policies. Starting from a function space embedding of the linear program formulation of the Q-function, we obtain an optimization problem with generalized estimating equation constraints. By applying the generalized empirical likelihood method to the resulting Lagrangian, we propose CoinDICE, a novel and efficient algorithm for computing confidence intervals. Theoretically, we prove the obtained confidence intervals are valid, in both asymptotic and finite-sample regimes. Empirically, we show in a variety of benchmarks that the confidence interval estimates are tighter and more accurate than existing methods.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Variational Policy Gradient Method for Reinforcement Learning with General Utilities.\n \n \n \n \n\n\n \n Zhang, J.; Koppel, A.; Bedi, A.; Szepesvári, C.; and Wang, M.\n\n\n \n\n\n\n In NeurIPS, pages 4572–4583, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Variational link\n  \n \n \n \"Variational paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{ZKBSzW20,\n\tabstract = {In recent years, reinforcement learning systems with general goals beyond a cumulative sum of rewards have gained traction, such as in constrained problems, exploration, and acting upon prior experiences. In this paper, we consider policy optimization in Markov Decision Problems, where the objective is a general utility function of the state-action occupancy measure, which subsumes several of the aforementioned examples as special cases. Such generality invalidates the Bellman equation. As this means that dynamic programming no longer works, we focus on direct policy search. Analogously to the Policy Gradient Theorem available for RL with cumulative rewards, we derive a new Variational Policy Gradient Theorem for RL with general utilities, which establishes that the gradient may be obtained as the solution of a stochastic saddle point problem involving the Fenchel dual of the utility function. We develop a variational Monte Carlo gradient estimation algorithm to compute the policy gradient based on sample paths. Further, we prove that the variational policy gradient scheme converges globally to the optimal policy for the general objective, and we also establish its rate of convergence that matches or improves the convergence rate available in the case of RL with cumulative rewards.},\n\tauthor = {Zhang, J. and Koppel, A. and Bedi, A.S. and Szepesv{\\'a}ri, Cs. and Wang, M.},\n\tcrossref = {NeurIPS2020spotlight},\n\tmonth = {12},\n\ttitle = {Variational Policy Gradient Method for Reinforcement Learning with General Utilities},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/30ee748d38e21392de740e2f9dc686b6-Abstract.html},\n\turl_paper = {NeurIPS2020_rlg.pdf},\n    booktitle = {NeurIPS},\n    pages = {4572--4583},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n In recent years, reinforcement learning systems with general goals beyond a cumulative sum of rewards have gained traction, such as in constrained problems, exploration, and acting upon prior experiences. In this paper, we consider policy optimization in Markov Decision Problems, where the objective is a general utility function of the state-action occupancy measure, which subsumes several of the aforementioned examples as special cases. Such generality invalidates the Bellman equation. As this means that dynamic programming no longer works, we focus on direct policy search. Analogously to the Policy Gradient Theorem available for RL with cumulative rewards, we derive a new Variational Policy Gradient Theorem for RL with general utilities, which establishes that the gradient may be obtained as the solution of a stochastic saddle point problem involving the Fenchel dual of the utility function. We develop a variational Monte Carlo gradient estimation algorithm to compute the policy gradient based on sample paths. Further, we prove that the variational policy gradient scheme converges globally to the optimal policy for the general objective, and we also establish its rate of convergence that matches or improves the convergence rate available in the case of RL with cumulative rewards.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Differentiable Meta-Learning of Bandit Policies.\n \n \n \n \n\n\n \n Boutilier, C.; Hsu, C.; Kveton, B.; Mladenov, M.; Szepesvári, C.; and Zaheer, M.\n\n\n \n\n\n\n In NeurIPS, pages 2122–2134, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Differentiable link\n  \n \n \n \"Differentiable paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{BHKMSzZ20,\n\tabstract = {Exploration policies in Bayesian bandits maximize the average reward over problem instances drawn from some distribution P. In this work, we learn such policies for an unknown distribution P using samples from P. Our approach is a form of meta-learning and exploits properties of P without making strong assumptions about its form. To do this, we parameterize our policies in a differentiable way and optimize them by policy gradients, an approach that is pleasantly general and easy to implement. We derive effective gradient estimators and propose novel variance reduction techniques. We also analyze and experiment with various bandit policy classes, including neural networks and a novel softmax policy. The latter has regret guarantees and is a natural starting point for our optimization. Our experiments show the versatility of our approach. We also observe that neural network policies can learn implicit biases expressed only through the sampled instances.},\n\tauthor = {Boutilier, C. and Hsu, C.-w. and Kveton, B. and Mladenov, M. and Szepesv{\\'a}ri, Cs. and Zaheer, M.},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {Differentiable Meta-Learning of Bandit Policies},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/171ae1bbb81475eb96287dd78565b38b-Abstract.html},\n\turl_paper = {NeurIPS2020_dml.pdf},\n    booktitle = {NeurIPS},\n    pages = {2122--2134},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n Exploration policies in Bayesian bandits maximize the average reward over problem instances drawn from some distribution P. In this work, we learn such policies for an unknown distribution P using samples from P. Our approach is a form of meta-learning and exploits properties of P without making strong assumptions about its form. To do this, we parameterize our policies in a differentiable way and optimize them by policy gradients, an approach that is pleasantly general and easy to implement. We derive effective gradient estimators and propose novel variance reduction techniques. We also analyze and experiment with various bandit policy classes, including neural networks and a novel softmax policy. The latter has regret guarantees and is a natural starting point for our optimization. Our experiments show the versatility of our approach. We also observe that neural network policies can learn implicit biases expressed only through the sampled instances.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Algorithm for Unsupervised Sequential Selection with Contextual Information.\n \n \n \n \n\n\n \n Verma, A.; Hanawal, M.; Szepesvári, C.; and Saligrama, V.\n\n\n \n\n\n\n In NeurIPS, pages 778–788, 12 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Online link\n  \n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{VHSZS20,\n\tabstract = {n this paper, we study Contextual Unsupervised Sequential Selection (USS), a new variant of the stochastic contextual bandits problem where the loss of an arm cannot be inferred from the observed feedback. In our setup, arms are associated with fixed costs and are ordered, forming a cascade. In each round, a context is presented, and the learner selects the arms sequentially till some depth. The total cost incurred by stopping at an arm is the sum of fixed costs of arms selected and the stochastic loss associated with the arm. The learner's goal is to learn a decision rule that maps contexts to arms with the goal of minimizing the total expected loss. The problem is challenging as we are faced with an unsupervised setting as the total loss cannot be estimated. Clearly, learning is feasible only if the optimal arm can be inferred (explicitly or implicitly) from the problem structure. We observe that learning is still possible when the problem instance satisfies the so-called 'Contextual Weak Dominance' (CWD) property. Under CWD, we propose an algorithm for the contextual USS problem and demonstrate that it has sub-linear regret. Experiments on synthetic and real datasets validate our algorithm.},\n\tauthor = {Verma, A. and Hanawal, M. and Szepesv{\\'a}ri, Cs. and Saligrama, V.},\n\tcrossref = {NeurIPS2020poster},\n\tmonth = {12},\n\ttitle = {Online Algorithm for Unsupervised Sequential Selection with Contextual Information},\n\turl_link = {https://papers.nips.cc/paper/2020/hash/08e5d8066881eab185d0de9db3b36c7f-Abstract.html},\n\turl_paper = {NeurIPS2020_uss.pdf},\n    booktitle = {NeurIPS},\n    pages = {778--788},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n In this paper, we study Contextual Unsupervised Sequential Selection (USS), a new variant of the stochastic contextual bandits problem where the loss of an arm cannot be inferred from the observed feedback. In our setup, arms are associated with fixed costs and are ordered, forming a cascade. In each round, a context is presented, and the learner selects the arms sequentially till some depth. The total cost incurred by stopping at an arm is the sum of fixed costs of arms selected and the stochastic loss associated with the arm. The learner's goal is to learn a decision rule that maps contexts to arms with the goal of minimizing the total expected loss. The problem is challenging as we are faced with an unsupervised setting as the total loss cannot be estimated. Clearly, learning is feasible only if the optimal arm can be inferred (explicitly or implicitly) from the problem structure. We observe that learning is still possible when the problem instance satisfies the so-called 'Contextual Weak Dominance' (CWD) property. Under CWD, we propose an algorithm for the contextual USS problem and demonstrate that it has sub-linear regret. Experiments on synthetic and real datasets validate our algorithm.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Exploration by Optimisation in Partial Monitoring.\n \n \n \n \n\n\n \n Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, 06 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Exploration paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 47 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{LSz20,\n\tabstract = {We provide a novel algorithm for adversarial $k$-action $d$-outcome partial monitoring that is adaptive, intuitive and efficient.\nThe highlight is that for the non-degenerate locally observable games, the $n$-round minimax regret is bounded by\n$6m k^{3/2} \\sqrt{n \\log(k)}$, where $m$ is the number of signals.\nThis matches the best known information-theoretic upper bound derived via Bayesian minimax duality.\nThe same algorithm also achieves near-optimal regret for full information, bandit and\nglobally observable games. High probability bounds and simple experiments are also provided.\n},\n\tacceptrate = {120 out of 359=33\\%},\n\tauthor = {Lattimore, Tor and Szepesv\\'ari, Csaba},\n\tbooktitle = {COLT},\n\tmonth = {06},\n\ttitle = {Exploration by Optimisation in Partial Monitoring},\n\turl_paper = {COLT2020_explbyopt.pdf},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n We provide a novel algorithm for adversarial $k$-action $d$-outcome partial monitoring that is adaptive, intuitive and efficient. The highlight is that for the non-degenerate locally observable games, the $n$-round minimax regret is bounded by $6m k^{3/2} \sqrt{n \log(k)}$, where $m$ is the number of signals. This matches the best known information-theoretic upper bound derived via Bayesian minimax duality. The same algorithm also achieves near-optimal regret for full information, bandit and globally observable games. High probability bounds and simple experiments are also provided. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On the Global Convergence Rates of Softmax Policy Gradient Methods.\n \n \n \n \n\n\n \n Mei, J.; Xiao, C.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In ICML, 06 2020. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 52 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{JYSzWA20,\n\tabstract = {We make three contributions toward better understanding policy gradient methods in the tabular setting.\nFirst, we show that with the true gradient, policy gradient with a softmax parametrization converges at a $O(1/t)$ rate, with constants depending on the problem and initialization.\nThis result significantly expands the recent asymptotic convergence results.\nThe analysis relies on two findings:\nthat the softmax policy gradient satisfies a Lojasiewicz inequality, and the minimum probability of an optimal action during optimization can be bounded in terms of its initial value.\nSecond, we analyze entropy regularized policy gradient and show that it enjoys a significantly faster linear convergence rate $O(e^{-t})$ toward softmax optimal policy.\nThis result resolves an open question in the recent literature.\nFinally, combining the above two results and additional new $\\Omega(1/t)$ lower bound results, we explain how entropy regularization improves policy optimization, even with the true gradient, from the perspective of convergence rate. The separation of rates is further explained using the notion of  non-uniform Lojasiewicz degree.\nThese results provide a theoretical understanding of the impact of entropy and corroborate existing empirical studies.},\n\tauthor = {Mei, Jincheng and Xiao, Chenjun and Szepesv\\'ari, Csaba and Schuurmans, Dale},\n\tcrossref = {ICML2020},\n\tmonth = {06},\n\ttitle = {On the Global Convergence Rates of Softmax Policy Gradient Methods},\n\turl_paper = {ICML2020_pg.pdf},\n    booktitle = {ICML},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n We make three contributions toward better understanding policy gradient methods in the tabular setting. First, we show that with the true gradient, policy gradient with a softmax parametrization converges at a $O(1/t)$ rate, with constants depending on the problem and initialization. This result significantly expands the recent asymptotic convergence results. The analysis relies on two findings: that the softmax policy gradient satisfies a Lojasiewicz inequality, and the minimum probability of an optimal action during optimization can be bounded in terms of its initial value. Second, we analyze entropy regularized policy gradient and show that it enjoys a significantly faster linear convergence rate $O(e^{-t})$ toward softmax optimal policy. This result resolves an open question in the recent literature. Finally, combining the above two results and additional new $Ω(1/t)$ lower bound results, we explain how entropy regularization improves policy optimization, even with the true gradient, from the perspective of convergence rate. The separation of rates is further explained using the notion of non-uniform Lojasiewicz degree. These results provide a theoretical understanding of the impact of entropy and corroborate existing empirical studies.\n
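A minimal runnable instance of the setting analyzed above (single state, true gradient, softmax parametrization); the rewards, step size, and horizon below are made up for illustration.

import numpy as np

def softmax(theta):
    e = np.exp(theta - theta.max())
    return e / e.sum()

r = np.array([1.0, 0.9, 0.1])        # mean rewards of a 3-armed bandit
theta = np.zeros(3)                  # uniform initial policy
eta = 0.4                            # step size
for t in range(2000):
    pi = softmax(theta)
    grad = pi * (r - pi @ r)         # exact gradient of the expected reward pi . r
    theta += eta * grad
print(softmax(theta))                # probability mass concentrates on the best arm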
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A simpler approach to accelerated optimization: iterative averaging meets optimism.\n \n \n \n \n\n\n \n Joulani, P.; Raj, A.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, 06 2020. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 20 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{JRGySz20,\n\tabstract = {Recently there have been several attempts to improve the rates of convergence achievable for smooth objectives. In particular, several recent papers have attempted to extend Nesterov's accelerated algorithm to stochastic and variance-reduced optimization. In this paper, we show that there is a simpler approach to obtaining accelerated rates: applying generic, well-known optimistic online learning algorithms and using the online average of their predictions to query the (deterministic or stochastic) first-order optimization oracle at each time step. In particular, we tighten the recent results of Cutkosky (2019) to demonstrate theoretically that online averaging results in a reduced optimization gap, independently of the algorithm involved. Then, we show that a simple tuning of existing generic optimistic online learning algorithms (e.g., Joulani et al [2017]), when combined with the reduced error quantified above, naturally results in optimal accelerated rates. Importantly, the smooth objective may or may not be strongly-convex, and the rates are nevertheless optimal for both stochastic and deterministic first-order oracles. We further show that the same ideas transfer to variance-reduced optimization. In each case, the proofs are much simpler than the previous work, such as the new derivations of accelerated algorithms based on a primal-dual view (Wang and Abernethy, 2018) or the ideas based on linear coupling (Allen-Zhu and Orecchia, 2017). Importantly, we also provide algorithms that maintain the "universality" property, meaning that the same algorithm achieves the optimal rate for smooth and non-smooth objectives without further prior knowledge, generalizing the results of Kavis et al (2019) and solving a number of their open problems.},\n\tauthor = {Joulani, Pooria and Raj, Anant and Gy\\"orgy, Andr\\'as and Szepesv\\'ari, Csaba},\n\tcrossref = {ICML2020},\n\tmonth = {06},\n\ttitle = {A simpler approach to accelerated optimization: iterative averaging meets optimism},\n\turl_paper = {ICML2020_itavgopt.pdf},\n    booktitle = {ICML},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n Recently there have been several attempts to improve the rates of convergence achievable for smooth objectives. In particular, several recent papers have attempted to extend Nesterov's accelerated algorithm to stochastic and variance-reduced optimization. In this paper, we show that there is a simpler approach to obtaining accelerated rates: applying generic, well-known optimistic online learning algorithms and using the online average of their predictions to query the (deterministic or stochastic) first-order optimization oracle at each time step. In particular, we tighten the recent results of Cutkosky (2019) to demonstrate theoretically that online averaging results in a reduced optimization gap, independently of the algorithm involved. Then, we show that a simple tuning of existing generic optimistic online learning algorithms (e.g., Joulani et al [2017]), when combined with the reduced error quantified above, naturally results in optimal accelerated rates. Importantly, the smooth objective may or may not be strongly-convex, and the rates are nevertheless optimal for both stochastic and deterministic first-order oracles. We further show that the same ideas transfer to variance-reduced optimization. In each case, the proofs are much simpler than the previous work, such as the new derivations of accelerated algorithms based on a primal-dual view (Wang and Abernethy, 2018) or the ideas based on linear coupling (Allen-Zhu and Orecchia, 2017). Importantly, we also provide algorithms that maintain the \"universality\" property, meaning that the same algorithm achieves the optimal rate for smooth and non-smooth objectives without further prior knowledge, generalizing the results of Kavis et al (2019) and solving a number of their open problems.\n
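A minimal sketch of the central device described above: query the first-order oracle at the running average of the online learner's iterates (in the spirit of the anytime online-to-batch conversion of Cutkosky (2019) that the paper tightens). Plain gradient steps are used for the online learner here; the accelerated rates in the paper require optimistic, adaptively tuned learners.

import numpy as np

def averaged_queries(grad, x0, eta, T):
    """Online-to-batch with averaged query points: the oracle grad is always
    evaluated at the running average of the learner's iterates."""
    w = np.array(x0, dtype=float)     # online learner's iterate
    avg = w.copy()                    # running average of w_1, ..., w_t
    for t in range(1, T + 1):
        g = grad(avg)                 # first-order oracle queried at the average
        w = w - eta * g               # simple online gradient step on the linear loss <g, .>
        avg = avg + (w - avg) / (t + 1)
    return avg

# Example: f(x) = 0.5 * ||x - 1||^2, whose gradient is x - 1.
print(averaged_queries(lambda x: x - 1.0, np.zeros(2), eta=0.1, T=2000))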
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model-Based Reinforcement Learning with Value-Targeted Regression.\n \n \n \n \n\n\n \n Ayoub, A.; Jia, Z.; Szepesvári, C.; Wang, M.; and Yang, L.\n\n\n \n\n\n\n In ICML, pages 463–474, 06 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Model-Based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 88 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{AZSzWY20,\n\tabstract = {This paper studies model-based reinforcement learning (RL) for regret minimization.\nWe focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\\mathcal{P}$, a special case of which is when models in $\\mathcal{P}$ take the form of linear mixtures:\n$P_{\\theta} = \\sum_{i=1}^{d} \\theta_{i}P_{i}$.\nWe propose a model based RL algorithm that is based on the optimism principle:\nIn each episode, the set of models that are `consistent' with the data collected is constructed.\nThe criterion of consistency is based on the total squared error that the model incurs on the task of predicting <em>state values</em> as determined by the last value estimate along the transitions.\nThe next value function is then chosen by solving the optimistic planning problem with the constructed set of models.\nWe derive a bound on the regret, which, in the special case of linear mixtures,\n takes the form $\\tilde{\\mathcal{O}}(d\\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $\\theta$, respectively.\nIn particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $\\Omega(\\sqrt{HdT})$.\nFor a general model family $\\mathcal{P}$, the regret bound is derived\nbased on the Eluder dimension.},\n\tauthor = {Ayoub, Alex and Jia, Zeyu and Szepesv\\'ari, Csaba and Wang, Mengdi and Yang, Lin},\n\tcrossref = {ICML2020},\n\tdate-modified = {2021-06-29 19:55:35 -0600},\n\tmonth = {06},\n\tpages = {463--474},\n\ttitle = {Model-Based Reinforcement Learning with Value-Targeted Regression},\n\turl_paper = {ICML2020_UCRL_VTR.pdf},\n    booktitle = {ICML},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n This paper studies model-based reinforcement learning (RL) for regret minimization. We focus on finite-horizon episodic RL where the transition model $P$ belongs to a known family of models $\mathcal{P}$, a special case of which is when models in $\mathcal{P}$ take the form of linear mixtures: $P_{θ} = ∑_{i=1}^{d} θ_{i}P_{i}$. We propose a model based RL algorithm that is based on the optimism principle: In each episode, the set of models that are `consistent' with the data collected is constructed. The criterion of consistency is based on the total squared error that the model incurs on the task of predicting state values as determined by the last value estimate along the transitions. The next value function is then chosen by solving the optimistic planning problem with the constructed set of models. We derive a bound on the regret, which, in the special case of linear mixtures, takes the form $\tilde{\mathcal{O}}(d\sqrt{H^{3}T})$, where $H$, $T$ and $d$ are the horizon, the total number of steps and the dimension of $θ$, respectively. In particular, this regret bound is independent of the total number of states or actions, and is close to a lower bound $Ω(\sqrt{HdT})$. For a general model family $\mathcal{P}$, the regret bound is derived based on the Eluder dimension.\n
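For illustration, in the linear-mixture case the value-targeted regression step reduces to ridge regression of realized value targets on model-predicted values. The sketch below uses a hypothetical class name and omits the optimistic model-set construction and planning.

import numpy as np

class ValueTargetedRegression:
    """Regression step behind VTR for a linear mixture model P_theta = sum_i theta_i P_i.
    For a transition (s, a, s') and current value estimate V, the feature is
    x_i = E_{s'' ~ P_i(.|s,a)}[V(s'')] and the regression target is y = V(s')."""

    def __init__(self, d, lam=1.0):
        self.A = lam * np.eye(d)     # regularized Gram matrix of predicted-value features
        self.b = np.zeros(d)

    def update(self, x, y):
        x = np.asarray(x, dtype=float)
        self.A += np.outer(x, x)
        self.b += y * x
        return np.linalg.solve(self.A, self.b)   # ridge estimate of theta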
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning with Good Feature Representations in Bandits and in RL with a Generative Model.\n \n \n \n \n\n\n \n Lattimore, T.; Szepesvári, C.; and Weisz, G.\n\n\n \n\n\n\n In ICML, pages 5662-5670, 06 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 36 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{LSzW20,\n\tabstract = {\nThe construction by Du et al. (2019) implies that even if a learner is given linear features in $\\mathbb{R}^d$ that approximate the rewards in a bandit with a uniform error of $\\epsilon$,\nthen searching for an action that is optimal up to $O(\\epsilon)$ requires examining essentially all actions. We use the Kiefer--Wolfowitz\ntheorem to prove a positive result that by checking only a few actions, a learner can always find an action that is suboptimal with an error of at most $O(\\epsilon \\sqrt{d})$. Thus, features are useful when the approximation error is small\nrelative to the dimensionality of the features. The idea is applied to stochastic bandits and reinforcement learning with a generative model where the learner\nhas access to $d$-dimensional linear features that approximate the action-value functions for all policies to an accuracy of $\\epsilon$. For linear bandits, we\nprove a bound on the regret of order $\\sqrt{dn \\log(k)} + \\epsilon n \\sqrt{d} \\log(n)$ with $k$ the number of actions and $n$ the horizon. For RL we show that approximate policy\niteration can learn a policy that is optimal up to an additive error of order $\\epsilon \\sqrt{d}/(1 - \\gamma)^2$ and using $d/(\\epsilon^2(1 - \\gamma)^4)$ samples from a generative model.\nThese bounds are independent of the finer details of the features. We also investigate how the structure of the feature set impacts the tradeoff between sample complexity and estimation error.\n},\n\tauthor = {Lattimore, T. and Szepesv\\'ari, Cs. and Weisz, G.},\n\tcrossref = {ICML2020},\n\tmonth = {06},\n\ttitle = {Learning with Good Feature Representations in Bandits and in RL with a Generative Model},\n\turl_paper = {ICML2020_goodfeatures.pdf},\n    booktitle = {ICML},\n    pages = {5662-5670},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n The construction by Du et al. (2019) implies that even if a learner is given linear features in $ℝ^d$ that approximate the rewards in a bandit with a uniform error of $ε$, then searching for an action that is optimal up to $O(ε)$ requires examining essentially all actions. We use the Kiefer–Wolfowitz theorem to prove a positive result that by checking only a few actions, a learner can always find an action that is suboptimal with an error of at most $O(ε \sqrt{d})$. Thus, features are useful when the approximation error is small relative to the dimensionality of the features. The idea is applied to stochastic bandits and reinforcement learning with a generative model where the learner has access to $d$-dimensional linear features that approximate the action-value functions for all policies to an accuracy of $ε$. For linear bandits, we prove a bound on the regret of order $\sqrt{dn \log(k)} + ε n \sqrt{d} \log(n)$ with $k$ the number of actions and $n$ the horizon. For RL we show that approximate policy iteration can learn a policy that is optimal up to an additive error of order $ε \sqrt{d}/(1 - γ)^2$ and using $d/(ε^2(1 - γ)^4)$ samples from a generative model. These bounds are independent of the finer details of the features. We also investigate how the structure of the feature set impacts the tradeoff between sample complexity and estimation error. \n
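For reference, the Kiefer–Wolfowitz theorem invoked above, in its standard form (included as background rather than taken from the paper): if $\mathcal{A} \subset \mathbb{R}^d$ is compact and spans $\mathbb{R}^d$, then
\[
\min_{\pi \in \Delta(\mathcal{A})} \max_{a \in \mathcal{A}} \|a\|_{V(\pi)^{-1}}^{2} = d,
\qquad V(\pi) = \sum_{a \in \mathcal{A}} \pi(a)\, a a^{\top},
\]
and a minimizing (G-optimal) design can be chosen with support of size at most $d(d+1)/2$. Estimating rewards only at these support actions then controls the estimation error uniformly over all actions, which is how "checking only a few actions" suffices.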
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A modular analysis of adaptive (non-)convex optimization: Optimism, composite objectives, variance reduction, and variational bounds.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n Theoretical Computer Science, 808: 108–138. 01 2020.\n \n\n\n\n
\n\n\n\n \n \n \"A link\n  \n \n \n \"A paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{JouGySz20,\n\tabstract = {Recently, much work has been done on extending the scope of online learning and incremental stochastic optimization algorithms. In this paper we contribute to this effort in two ways: First, based on a generalization of Bregman divergences and a generic regret decomposition, we provide a self-contained, modular analysis of the two workhorses of online learning: (general) adaptive versions of Mirror Descent (MD) and the Follow-the-Regularized-Leader (FTRL) algorithms. The analysis is done with extra care so as not to introduce assumptions not needed in the proofs and allows to combine, in a straightforward way, different algorithmic ideas (e.g., adaptivity, optimism, implicit updates, variance reduction) and learning settings (e.g., strongly convex or composite objectives). This way we are able to reprove, extend and refine a large body of the literature, while keeping the proofs concise. The second contribution is a by-product of this careful analysis: We present algorithms with improved variational bounds for smooth, composite objectives, including a new family of optimistic MD algorithms with only one projection step per round. Furthermore, we provide a simple extension of adaptive regret bounds to a class of practically relevant non-convex problem settings (namely, star-convex loss functions and their extensions) with essentially no extra effort.},\n\tauthor = {Joulani, Pooria and Gy\\"orgy, Andr\\'as and Szepesv\\'ari, Csaba},\n\tdoi = {10.1016/j.tcs.2019.11.015},\n\tjournal = {Theoretical Computer Science},\n\tmonth = {01},\n\tpages = {108--138},\n\ttitle = {A modular analysis of adaptive (non-)convex optimization: Optimism, composite objectives, variance reduction, and variational bounds},\n\turl_link = {https://doi.org/10.1016/j.tcs.2019.11.015},\n\turl_paper = {TCS2020.pdf},\n\tvolume = {808},\n\tyear = {2020},\n\tBdsk-Url-1 = {https://doi.org/10.1016/j.tcs.2019.11.015}}\n\n
\n
\n\n\n
\n Recently, much work has been done on extending the scope of online learning and incremental stochastic optimization algorithms. In this paper we contribute to this effort in two ways: First, based on a generalization of Bregman divergences and a generic regret decomposition, we provide a self-contained, modular analysis of the two workhorses of online learning: (general) adaptive versions of Mirror Descent (MD) and the Follow-the-Regularized-Leader (FTRL) algorithms. The analysis is done with extra care so as not to introduce assumptions not needed in the proofs and allows to combine, in a straightforward way, different algorithmic ideas (e.g., adaptivity, optimism, implicit updates, variance reduction) and learning settings (e.g., strongly convex or composite objectives). This way we are able to reprove, extend and refine a large body of the literature, while keeping the proofs concise. The second contribution is a by-product of this careful analysis: We present algorithms with improved variational bounds for smooth, composite objectives, including a new family of optimistic MD algorithms with only one projection step per round. Furthermore, we provide a simple extension of adaptive regret bounds to a class of practically relevant non-convex problem settings (namely, star-convex loss functions and their extensions) with essentially no extra effort.\n
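As one concrete instance of the general Mirror Descent scheme analyzed above, the sketch below implements MD with the entropic mirror map on the probability simplex (exponentiated gradient); the adaptivity, optimism, and composite terms treated in the paper are omitted.

import numpy as np

def entropic_mirror_descent(loss_grads, x0, eta):
    """Mirror Descent with the entropic mirror map on the simplex.
    loss_grads is a sequence of gradient vectors g_1, g_2, ... observed
    after playing each iterate; returns the final iterate."""
    x = np.asarray(x0, dtype=float)
    for g in loss_grads:
        x = x * np.exp(-eta * np.asarray(g, dtype=float))  # argmin_y eta*<g,y> + KL(y||x)
        x = x / x.sum()                                     # stay on the probability simplex
    return x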
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bounds and dynamics for empirical game theoretic analysis.\n \n \n \n \n\n\n \n Tuyls, K.; Pérolat, J.; Lanctot, M.; Hughes, E.; Everett, R.; Leibo, J. Z.; Szepesvári, C.; and Graepel, T.\n\n\n \n\n\n\n Autonomous Agents and Multi-Agent Systems, 34(1): 7. 01 2020.\n \n\n\n\n
\n\n\n\n \n \n \"Bounds link\n  \n \n \n \"Bounds paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@article{Tuysetal2020,\n\tabstract = {This paper provides several theoretical results for empirical game theory. Specifically, we introduce bounds for empirical game theoretical analysis of complex multi-agent interactions. In doing so we provide insights in the empirical meta game showing that a Nash equilibrium of the estimated meta-game is an approximate Nash equilibrium of the true underlying meta-game. We investigate and show how many data samples are required to obtain a close enough approximation of the underlying game. Additionally, we extend the evolutionary dynamics analysis of meta-games using heuristic payoff tables (HPTs) to asymmetric games. The state-of-the-art has only considered evolutionary dynamics of symmetric HPTs in which agents have access to the same strategy sets and the payoff structure is symmetric, implying that agents are interchangeable. Finally, we carry out an empirical illustration of the generalised method in several domains, illustrating the theory and evolutionary dynamics of several versions of the AlphaGo algorithm (symmetric), the dynamics of the Colonel Blotto game played by human players on Facebook (symmetric), the dynamics of several teams of players in the capture the flag game (symmetric), and an example of a meta-game in Leduc Poker (asymmetric), generated by the policy-space response oracle multi-agent learning algorithm.},\n\tauthor = {Tuyls, Karl and P\\'erolat, Julien and Lanctot, Marc and Hughes, Edward and Everett, Richard and Leibo, Joel Z. and Szepesv\\'ari, Csaba and Graepel, Thore},\n\tjournal = {Autonomous Agents and Multi-Agent Systems},\n\tmonth = {01},\n\tnumber = {1},\n\tpages = {7},\n\ttitle = {Bounds and dynamics for empirical game theoretic analysis},\n\turl_link = {https://doi.org/10.1007/s10458-019-09432-y},\n\turl_paper = {Tuyls2019.pdf},\n\tvolume = {34},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n This paper provides several theoretical results for empirical game theory. Specifically, we introduce bounds for empirical game theoretical analysis of complex multi-agent interactions. In doing so we provide insights in the empirical meta game showing that a Nash equilibrium of the estimated meta-game is an approximate Nash equilibrium of the true underlying meta-game. We investigate and show how many data samples are required to obtain a close enough approximation of the underlying game. Additionally, we extend the evolutionary dynamics analysis of meta-games using heuristic payoff tables (HPTs) to asymmetric games. The state-of-the-art has only considered evolutionary dynamics of symmetric HPTs in which agents have access to the same strategy sets and the payoff structure is symmetric, implying that agents are interchangeable. Finally, we carry out an empirical illustration of the generalised method in several domains, illustrating the theory and evolutionary dynamics of several versions of the AlphaGo algorithm (symmetric), the dynamics of the Colonel Blotto game played by human players on Facebook (symmetric), the dynamics of several teams of players in the capture the flag game (symmetric), and an example of a meta-game in Leduc Poker (asymmetric), generated by the policy-space response oracle multi-agent learning algorithm.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Behaviour Suite for Reinforcement Learning.\n \n \n \n \n\n\n \n Osband, I.; Doron, Y.; Hessel, M.; Aslanides, J.; Sezener, E.; Saraiva, A.; McKinney, K.; Lattimore, T.; Szepesvári, C.; Singh, S.; Roy, B. V.; Sutton, R. S.; Silver, D.; and van Hasselt, H.\n\n\n \n\n\n\n In ICLR, 08 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Behaviour link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ODH20,\n\tabstract = {This paper introduces the Behaviour Suite for Reinforcement Learning, or bsuite for short. bsuite is a collection of carefully-designed experiments that investigate core capabilities of reinforcement learning (RL) agents with two objectives. First, to collect clear, informative and scalable problems that capture key issues in the design of general and efficient learning algorithms. Second, to study agent behaviour through their performance on these shared benchmarks. To complement this effort, we open source this, which automates evaluation and analysis of any agent on bsuite. This library facilitates reproducible and accessible research on the core issues in RL, and ultimately the design of superior learning algorithms. Our code is Python, and easy to use within existing projects. We include examples with OpenAI Baselines, Dopamine as well as new reference implementations. Going forward, we hope to incorporate more excellent experiments from the research community, and commit to a periodic review of bsuite from a committee of prominent researchers.},\n\tauthor = {Osband, Ian and Doron, Yotam and Hessel, Matteo and Aslanides, John and Sezener, Eren and Saraiva, Andre and McKinney, Katrina and Lattimore, Tor and Szepesv\\'ari, Csaba and Singh, Satinder and Roy, Benjamin Van and Sutton, Richard S. and Silver, David and van Hasselt, Hado},\n\tbooktitle = {ICLR},\n\tdate-added = {2020-03-08 16:04:53 -0600},\n\tdate-modified = {2020-03-08 16:07:01 -0600},\n\tkeywords = {reinforcement learning, empirical evaluation},\n\tmonth = {08},\n\ttitle = {Behaviour Suite for Reinforcement Learning},\n\turl_link = {http://arxiv.org/abs/1908.03568},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Adaptive Exploration in Linear Contextual Bandit.\n \n \n \n \n\n\n \n Hao, B.; Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, 03 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Adaptive link\n  \n \n \n \"Adaptive paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 13 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HLSz20,\n\tabstract = {Contextual bandits serve as a fundamental model for many sequential decision making tasks. The most popular theoretically justified approaches are based on the optimism principle. While these algorithms can be practical, they are known to be suboptimal asymptotically. On the other hand, existing asymptotically optimal algorithms for this problem do not exploit the linear structure in an optimal way and suffer from lower-order terms that dominate the regret in all practically interesting regimes. We start to bridge the gap by designing an algorithm that is asymptotically optimal and has good finite-time empirical performance. At the same time, we make connections to the recent literature on when exploration-free methods are effective. Indeed, if the distribution of contexts is well behaved, then our algorithm acts mostly greedily and enjoys sub-logarithmic regret. Furthermore, our approach is adaptive in the sense that it automatically detects the nice case. Numerical results demonstrate significant regret reductions by our method relative to several baselines.},\n\tauthor = {Hao, B. and Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {stochastic bandits, contextual linear bandits, finite-armed bandits, asymptotic optimality, asymptotic regret, adaptivity},\n\tmonth = {03},\n\ttitle = {Adaptive Exploration in Linear Contextual Bandit},\n\turl_link = {https://arxiv.org/abs/1910.06996},\n\turl_paper = {AISTATS2020-AdaptiveLinBandit.pdf},\n    acceptrate = {acceptance rate: 23\\%},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Randomized Exploration in Generalized Linear Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Zaheer, M.; Szepesvári, C.; Li, L.; Ghavamzadeh, M.; and Boutilier, C.\n\n\n \n\n\n\n In AISTATS, 03 2020. \n \n\n\n\n
\n\n\n\n \n \n \"Randomized link\n  \n \n \n \"Randomized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 16 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KZSZLGB20,\n\tabstract = {We study two randomized algorithms for generalized linear bandits, GLM-TSL and GLM-FPL. GLM-TSL samples a generalized linear model (GLM) from the Laplace approximation to the posterior distribution. GLM-FPL fits a GLM to a randomly perturbed history of past rewards. We prove C d (n log K)^(1/2) bounds (up to log factors) on the n-round regret of GLM-TSL and GLM-FPL, where d is the number of features and K is the number of arms. The regret bound of GLM-TSL improves upon prior work and the regret bound of GLM-FPL is the first of its kind. We apply both GLM-TSL and GLM-FPL to logistic and neural network bandits, and show that they perform well empirically. In more complex models, GLM-FPL is significantly faster. Our results showcase the role of randomization, beyond sampling from the posterior, in exploration.},\n\tauthor = {Kveton, B. and Zaheer, M. and Szepesv{\\'a}ri, Cs. and Li, L. and Ghavamzadeh, M. and Boutilier, C.},\n\tbooktitle = {AISTATS},\n\tkeywords = {stochastic bandits, finite-armed bandits, randomization, generalized linear bandit, follow-the-perturbed-leader, Thompson sampling},\n\tmonth = {03},\n\ttitle = {Randomized Exploration in Generalized Linear Bandits},\n\turl_link = {https://arxiv.org/abs/1906.08947},\n\turl_paper = {AISTATS2020-GLB.pdf},\n    acceptrate = {acceptance rate: 23\\%},\n\tyear = {2020}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
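The entry above describes GLM-FPL as fitting a generalized linear model to a randomly perturbed history of past rewards and then acting greedily. Below is a minimal Python sketch of that idea for a logistic (Bernoulli-reward) bandit; the Gaussian perturbation, its scale a, and the plain gradient-descent fit are illustrative assumptions, not the paper's exact algorithm or constants.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def fit_logistic(X, y, n_iter=200, lr=0.5, reg=1.0):
    # Ridge-regularized logistic regression fit by plain gradient descent (illustrative).
    theta = np.zeros(X.shape[1])
    for _ in range(n_iter):
        grad = X.T @ (sigmoid(X @ theta) - y) + reg * theta
        theta -= lr * grad / len(y)
    return theta

def glm_fpl_choose(hist_X, hist_y, arm_features, a=1.0, rng=None):
    # One GLM-FPL-style decision: perturb every past reward with Gaussian noise of
    # scale `a` (an assumed pseudo-reward scheme), refit the GLM, act greedily.
    rng = rng or np.random.default_rng()
    y_pert = hist_y + a * rng.standard_normal(len(hist_y))
    theta = fit_logistic(hist_X, y_pert)
    return int(np.argmax(sigmoid(arm_features @ theta)))

# Toy usage: 5 arms with 3-dimensional features and a short fake history.
rng = np.random.default_rng(0)
arms = rng.standard_normal((5, 3))
hist_X, hist_y = arms[[0, 1, 2]], np.array([1.0, 0.0, 1.0])
print(glm_fpl_choose(hist_X, hist_y, arms, rng=rng))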
\n \n\n \n \n \n \n \n \n Bandit Algorithms.\n \n \n \n \n\n\n \n Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n Cambridge University Press, 08 2020.\n \n\n\n\n
\n\n\n\n \n \n \"BanditPaper\n  \n \n \n \"Bandit link\n  \n \n \n \"Bandit pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 25 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@book{LaSze19:book,\n\tabstract = {Multi-armed bandits have now been studied for nearly a century. While research in the beginning was quite meandering, there is now a large community publishing hundreds of articles every year. Bandit algorithms are also finding their way into practical applications in industry, especially in on-line platforms where data is readily available and automation is the only way to scale. We had hoped to write a comprehensive book, but the literature is now so vast that many topics have been excluded. In the end we settled on the more modest goal of equipping our readers with enough expertise to explore the specialized literature by themselves, and to adapt existing algorithms to their applications. This latter point is important. Problems in theory are all alike; every application is different. A practitioner seeking to apply a bandit algorithm needs to understand which assumptions in the theory are important and how to modify the algorithm when the assumptions change. We hope this book can provide that understanding. What is covered in the book is covered in some depth. The focus is on the mathematical analysis of algorithms for bandit problems, but this is not a traditional mathematics book, where lemmas are followed by proofs, theorems and more lemmas. We worked hard to include guiding principles for designing algorithms and intuition for their analysis. Many algorithms are accompanied by empirical demonstrations that further aid intuition. We expect our readers to be familiar with basic analysis and calculus and some linear algebra. The book uses the notation of measure-theoretic probability theory, but does not rely on any deep results. A dedicated chapter is included to introduce the notation and provide intuitions for the basic results we need. This chapter is unusual for an introduction to measure theory in that it emphasizes the reasons to use σ-algebras beyond the standard technical justifications. We hope this will convince the reader that measure theory is an important and intuitive tool. Some chapters use techniques from information theory and convex analysis and we devote a short chapter to each.},\n\tauthor = {Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2020-03},\n\tdate-added = {2019-07-20 14:35:41 -0600},\n\tdate-modified = {2020-03-07 15:04:59 -0700},\n\tkeywords = {bandits},\n\tmonth = {08},\n\tpublisher = {Cambridge University Press},\n\ttitle = {Bandit Algorithms},\n\turl = {https://tor-lattimore.com/downloads/book/book.pdf},\n\turl_link = {https://banditalgs.com},\n\turl_pdf = {https://tor-lattimore.com/downloads/book/book.pdf},\n\tyear = {2020},\n\tBdsk-Url-1 = {https://tor-lattimore.com/downloads/book/book.pdf}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (18)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Think out of the \"Box\": Generically-Constrained Asynchronous Composite Optimization and Hedging.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems 32, pages 12246–12256, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Think link\n  \n \n \n \"Think paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{NeurIPS2019_9391,\n\tabstract = {We present two new algorithms, ASYNCADA and HEDGEHOG, for asynchronous sparse online and stochastic optimization. ASYNCADA is, to our knowledge, the first asynchronous stochastic optimization algorithm with finite-time data-dependent convergence guarantees for generic convex constraints. In addition, ASYNCADA: (a) allows for proximal (i.e., composite-objective) updates and adaptive step-sizes; (b) enjoys any-time convergence guarantees without requiring an exact global clock; and (c) when the data is sufficiently sparse, its convergence rate for (non-)smooth, (non-)strongly-convex, and even a limited class of non-convex objectives matches the corresponding serial rate, implying a theoretical ``linear speed-up''. The second algorithm, HEDGEHOG, is an asynchronous parallel version of the Exponentiated Gradient (EG) algorithm for optimization over the probability simplex (a.k.a. Hedge in online learning), and, to our knowledge, the first asynchronous algorithm enjoying linear speed-ups under sparsity with non-SGD-style updates. Unlike previous work, ASYNCADA and HEDGEHOG and their convergence and speed-up analyses are not limited to individual coordinate-wise (i.e., ``box-shaped'') constraints or smooth and strongly-convex objectives. Underlying both results is a generic analysis framework that is of independent interest, and further applicable to distributed and delayed feedback optimization.},\n\tauthor = {Joulani, Pooria and Gy\\"orgy, Andr\\'as and Szepesv\\'ari, Csaba},\n\tbooktitle = {Advances in Neural Information Processing Systems 32},\n\tpages = {12246--12256},\n\ttitle = {Think out of the "Box": Generically-Constrained Asynchronous Composite Optimization and Hedging},\n\turl_link = {http://papers.neurips.cc/paper/9391-think-out-of-the-box-generically-constrained-asynchronous-composite-optimization-and-hedging},\n\turl_paper = {outofbox-neurips2019.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Detecting overfitting via adversarial examples.\n \n \n \n \n\n\n \n Werpachowski, R.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 7856–7866, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Detecting link\n  \n \n \n \"Detecting paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 8 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@inproceedings{werpachowski2019detecting,\n\tabstract = {The repeated community-wide reuse of test sets in popular benchmark problems raises doubts about the credibility of reported test-error rates. Verifying whether a learned model is overfitted to a test set is challenging as independent test sets drawn from the same data distribution are usually unavailable, while other test sets may introduce a distribution shift. We propose a new hypothesis test that uses only the original test data to detect overfitting. It utilizes a new unbiased error estimate that is based on adversarial examples generated from the test data and importance weighting. Overfitting is detected if this error estimate is sufficiently different from the original test error rate. We develop a specialized variant of our test for multiclass image classification, and apply it to testing overfitting of recent models to the popular ImageNet benchmark. Our method correctly indicates overfitting of the trained model to the training set, but is not able to detect any overfitting to the test set, in line with other recent work on this topic.},\n\tauthor = {Werpachowski, Roman and Gy{\\"o}rgy, Andr{\\'a}s and Szepesv{\\'a}ri, Csaba},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tpages = {7856--7866},\n\ttitle = {Detecting overfitting via adversarial examples},\n\turl_link = {http://papers.neurips.cc/paper/9000-detecting-overfitting-via-adversarial-examples},\n\turl_paper = {advrisk_neurips2019.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Mixing Time Estimation in Reversible Markov Chains from a Single Sample Path.\n \n \n \n \n\n\n \n Hsu, D.; Kontorovich, A.; Levin, D. A.; Peres, Y.; Szepesvári, C.; and Wolfer, G.\n\n\n \n\n\n\n Annals of Applied Probability, 29(4): 2439–2480. 07 2019.\n \n\n\n\n
\n\n\n\n \n \n \"Mixing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{HsuKoLePeSzeWo19,\n\tabstract = {The spectral gap of a finite, ergodic, and reversible Markov chain is an important parameter measuring the asymptotic rate of convergence. In applications, the transition matrix P may be unknown, yet one sample of the chain up to a fixed time n may be observed. We consider here the problem of estimating the spectral gap from this data and give a fully empirical interval estimate, whose width is essentially unimprovable (shortened abstract).},\n\tauthor = {Hsu, Daniel and Kontorovich, Arieh and Levin, David A. and Peres, Yuval and Szepesv{\\'a}ri, Csaba and Wolfer, Geoffrey},\n\tdate = {2019-07},\n\tdate-added = {2019-07-24 20:08:32 -0600},\n\tdate-modified = {2019-07-24 20:17:30 -0600},\n\tjournal = {Annals of Applied Probability},\n\tkeywords = {mixing, data-dependent bounds, a posteriori bounds, Markov chains, finite-sample bounds, theory},\n\tmonth = {07},\n\tnumber = {4},\n\tpages = {2439--2480},\n\ttitle = {Mixing Time Estimation in Reversible Markov Chains from a Single Sample Path},\n\turl_paper = {AAP19_mixing.pdf},\n\tvolume = {29},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
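The entry above concerns estimating the spectral gap of a finite, ergodic, reversible chain from a single sample path. The sketch below computes only the naive plug-in point estimate (empirical transition matrix, symmetrization via the empirical state frequencies, second-largest absolute eigenvalue); the paper's actual contribution, a fully empirical confidence interval around such a quantity, is not reproduced here.

import numpy as np

def plugin_spectral_gap(path, n_states):
    # Count transitions along the single observed path.
    counts = np.zeros((n_states, n_states))
    for s, s_next in zip(path[:-1], path[1:]):
        counts[s, s_next] += 1.0
    row = counts.sum(axis=1, keepdims=True)
    P = counts / np.maximum(row, 1.0)              # empirical transition matrix
    pi = row.ravel() / max(row.sum(), 1.0)         # empirical state frequencies
    d = np.sqrt(np.maximum(pi, 1e-12))
    L = (d[:, None] * P) / d[None, :]              # symmetric (up to noise) if P is reversible
    eig = np.sort(np.linalg.eigvalsh((L + L.T) / 2.0))
    return 1.0 - max(abs(eig[0]), abs(eig[-2]))    # absolute spectral gap estimate

# Example: plugin_spectral_gap([0, 1, 0, 2, 1, 0, 2, 2, 1], n_states=3)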
\n \n\n \n \n \n \n \n \n BubbleRank: Safe Online Learning to Re-Rank via Implicit Click Feedback.\n \n \n \n \n\n\n \n Li, C.; Kveton, B.; Lattimore, T.; Markov, I.; de Rijke, M.; Szepesvári, C.; and Zoghi, M.\n\n\n \n\n\n\n In UAI, 08 2019. \n \n\n\n\n
\n\n\n\n \n \n \"BubbleRank: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ZoTuGKSzW:UAI19,\n\tabstract = {In this paper, we study the problem of safe on-line learning to re-rank, where user feedback is used to improve the quality of displayed lists. Learning to rank has traditionally been studied in two settings. In the offline setting, rankers are typically learned from relevance labels created by judges. This approach has generally become standard in industrial applications of ranking, such as search. However, this approach lacks exploration and thus is limited by the information content of the offline training data. In the online setting, an algorithm can experiment with lists and learn from feedback on them in a sequential fashion. Bandit algorithms are well-suited for this setting but they tend to learn user preferences from scratch, which results in a high initial cost of exploration. This poses an additional challenge of safe exploration in ranked lists. We propose BubbleRank, a bandit algorithm for safe re-ranking that combines the strengths of both the offline and online settings. The algorithm starts with an initial base list and improves it online by gradually exchanging higher-ranked less attractive items for lower-ranked more attractive items. We prove an upper bound on the n-step regret of BubbleRank that degrades gracefully with the quality of the initial base list. Our theoretical findings are supported by extensive experiments on a large-scale real-world click dataset.\n},\n\tauthor = {Li, C. and Kveton, B. and Lattimore, T. and Markov, I. and de Rijke, M. and Szepesv{\\'a}ri, Cs. and Zoghi, M.},\n\tcrossref = {UAI2019},\n\tkeywords = {reranking, warm start, ranking, online learning, partial information, stochastic online learning, online learning to rank},\n\tmonth = {08},\n\ttitle = {BubbleRank: Safe Online Learning to Re-Rank via Implicit Click Feedback},\n\turl_paper = {UAI2019-BubbleRank.pdf},\n    booktitle = {UAI},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
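The entry above describes BubbleRank as starting from a base list and safely promoting lower-ranked items that attract more clicks than their higher-ranked neighbours. The toy sketch below captures that bubble-sort-like mechanism only in spirit: the click simulator, the confidence threshold, and the even/odd pair schedule are assumptions, and items are assumed to be integer ids indexing click_prob.

import numpy as np

def bubblerank_sketch(base_list, click_prob, n_rounds=10000, threshold=2.0, rng=None):
    # Each round, randomly flip some adjacent pairs for exploration, record which item
    # of each examined pair was clicked, and permanently swap a pair once the
    # lower-ranked item is clearly more attractive (a crude test; the paper's differs).
    rng = rng or np.random.default_rng()
    ranking = list(base_list)
    n = len(ranking)
    wins = np.zeros((n, n))
    for t in range(n_rounds):
        offset = t % 2                                  # alternate even/odd adjacent pairs
        displayed = list(ranking)
        for k in range(offset, n - 1, 2):
            if rng.random() < 0.5:
                displayed[k], displayed[k + 1] = displayed[k + 1], displayed[k]
        clicks = rng.random(n) < click_prob[np.array(displayed)]   # toy position-free clicks
        for k in range(offset, n - 1, 2):
            a, b = displayed[k], displayed[k + 1]
            wins[a, b] += clicks[k]
            wins[b, a] += clicks[k + 1]
        for k in range(n - 1):
            i, j = ranking[k], ranking[k + 1]
            m = wins[i, j] + wins[j, i]
            if m > 0 and wins[j, i] - wins[i, j] > threshold * np.sqrt(m):
                ranking[k], ranking[k + 1] = ranking[k + 1], ranking[k]
    return ranking

# Example: bubblerank_sketch([0, 1, 2, 3], click_prob=np.array([0.2, 0.5, 0.3, 0.7]))
# should eventually move item 3 towards the top.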
\n \n\n \n \n \n \n \n \n Perturbed-History Exploration in Stochastic Linear Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Szepesvári, C.; Ghavamzadeh, M.; and Boutilier, C.\n\n\n \n\n\n\n In UAI, 07 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Perturbed-History paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 11 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KSzGB19:UAI,\n\tabstract = {We propose a new online algorithm for cumulative regret minimization in a stochastic linear bandit. The algorithm pulls the arm with the highest estimated reward in a linear model trained on its perturbed history. Therefore, we call it perturbed-history exploration in a linear bandit (LinPHE). The perturbed history is a mixture of observed rewards and randomly generated i.i.d. pseudo-rewards. We derive an O(d n^(1/2)) gap-free bound on the n-round regret of LinPHE, where d is the number of features. The key steps in our analysis are new concentration and anti-concentration bounds on the weighted sum of Bernoulli random variables. To show the generality of our design, we generalize LinPHE to a logistic model. We evaluate our algorithms empirically and show that they are practical.},\n\tauthor = {Kveton, B. and Szepesv{\\'a}ri, Cs. and Ghavamzadeh, M. and Boutilier, C.},\n\tcrossref = {UAI2019},\n    booktitle = {UAI},\n\tkeywords = {stochastic bandits, finite-armed bandits, randomization, follow-the-perturbed-leader, linear bandits},\n\tmonth = {07},\n\ttitle = {Perturbed-History Exploration in Stochastic Linear Bandits},\n\turl_paper = {UAI2019-PHE.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
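The entry above describes LinPHE as acting greedily on a linear model fit to a perturbed history that mixes observed rewards with i.i.d. pseudo-rewards. Here is a minimal ridge-regression sketch of one such decision; the Bernoulli(1/2) pseudo-rewards, their number a per observation, and the regularizer are assumptions, not the paper's exact design.

import numpy as np

def linphe_choose(hist_X, hist_r, arm_features, a=1, reg=1.0, rng=None):
    # Repeat each past context a+1 times, pair it with the observed reward plus `a`
    # Bernoulli(1/2) pseudo-rewards, solve ridge regression on this perturbed history,
    # and pull the arm with the largest estimated reward.
    rng = rng or np.random.default_rng()
    d = arm_features.shape[1]
    X = np.repeat(hist_X, a + 1, axis=0)
    pseudo = rng.integers(0, 2, size=(len(hist_r), a)).astype(float)
    r = np.concatenate([np.asarray(hist_r, dtype=float)[:, None], pseudo], axis=1).ravel()
    theta = np.linalg.solve(X.T @ X + reg * np.eye(d), X.T @ r)
    return int(np.argmax(arm_features @ theta))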
\n \n\n \n \n \n \n \n \n Perturbed-History Exploration in Stochastic Multi-Armed Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Szepesvári, C.; Ghavamzadeh, M.; and Boutilier, C.\n\n\n \n\n\n\n In IJCAI, 06 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Perturbed-History paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KSzGB19,\n\tabstract = {We propose an online algorithm for cumulative regret minimization in a stochastic multi-armed bandit. The algorithm adds O(t) i.i.d. pseudo-rewards to its history in round t and then pulls the arm with the highest average reward in its perturbed history. Therefore, we call it perturbed-history exploration (PHE). The pseudo-rewards are carefully designed to offset potentially underestimated mean rewards of arms with a high probability. We derive near-optimal gap-dependent and gap-free bounds on the n-round regret of PHE. The key step in our analysis is a novel argument that shows that randomized Bernoulli rewards lead to optimism. Finally, we empirically evaluate PHE and show that it is competitive with state-of-the-art baselines.},\n\tacceptrate = {850 out of 4752=18\\%},\n\tauthor = {Kveton, B. and Szepesv{\\'a}ri, Cs. and Ghavamzadeh, M. and Boutilier, C.},\n\tbooktitle = {IJCAI},\n\tkeywords = {stochastic bandits, finite-armed bandits, randomization, follow-the-perturbed-leader},\n\tmonth = {06},\n\ttitle = {Perturbed-History Exploration in Stochastic Multi-Armed Bandits},\n\turl_paper = {IJCAI2019-PHE.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
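The entry above states the PHE rule explicitly: in round t the learner mixes each arm's observed rewards with on the order of t i.i.d. pseudo-rewards and then acts greedily on the perturbed averages. Below is a minimal sketch for a Bernoulli bandit; the Bernoulli(1/2) pseudo-rewards and the scale a follow the spirit of the abstract but are assumptions, not the paper's exact constants.

import numpy as np

def phe_choose(pulls, reward_sums, a=2.0, rng=None):
    # One PHE decision: for each arm, mix its observed rewards with roughly a-times
    # as many Bernoulli(1/2) pseudo-rewards and act greedily on the perturbed average.
    rng = rng or np.random.default_rng()
    values = np.empty(len(pulls))
    for i in range(len(pulls)):
        if pulls[i] == 0:
            return i  # pull every arm once before perturbing anything
        n_pseudo = int(np.ceil(a * pulls[i]))
        pseudo_sum = rng.binomial(n_pseudo, 0.5)  # sum of i.i.d. Bernoulli(1/2) pseudo-rewards
        values[i] = (reward_sums[i] + pseudo_sum) / (pulls[i] + n_pseudo)
    return int(np.argmax(values))

# Toy run on a 3-armed Bernoulli bandit.
rng = np.random.default_rng(0)
true_means = np.array([0.2, 0.5, 0.7])
pulls, sums = np.zeros(3, dtype=int), np.zeros(3)
for t in range(1000):
    i = phe_choose(pulls, sums, rng=rng)
    r = float(rng.random() < true_means[i])
    pulls[i] += 1
    sums[i] += r
print(pulls)  # most pulls should go to the 0.7 arm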
\n \n\n \n \n \n \n \n \n Rigorous Agent Evaluation: An Adversarial Approach to Uncover Catastrophic Failures.\n \n \n \n \n\n\n \n Uesato, J.; Kumar, A.; Szepesvári, C.; Erez, T.; Ruderman, A.; Anderson, K.; Dvijotham, K.; Heess, N.; and Kohli, P.\n\n\n \n\n\n\n In ICLR, pages 3692–3702, 06 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Rigorous paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{UKSz19,\n\tabstract = {This paper addresses the problem of evaluating learning systems in safety critical domains such as autonomous driving, where failures can have catastrophic consequences. We focus on two problems: searching for scenarios when learned agents fail and assessing their probability of failure. The standard method for agent evaluation in reinforcement learning, Vanilla Monte Carlo, can miss failures entirely, leading to the deployment of unsafe agents. We demonstrate this is an issue for current agents, where even matching the compute used for training is sometimes insufficient for evaluation. To address this shortcoming, we draw upon the rare event probability estimation literature and propose an adversarial evaluation approach. Our approach focuses evaluation on adversarially chosen situations, while still providing unbiased estimates of failure probabilities. The key difficulty is in identifying these adversarial situations -- since failures are rare there is little signal to drive optimization. To solve this we propose a continuation approach that learns failure modes in related but less robust agents. Our approach also allows reuse of data already collected for training the agent. We demonstrate the efficacy of adversarial evaluation on two standard domains: humanoid control and simulated driving. Experimental results show that our methods can find catastrophic failures and estimate failures rates of agents multiple orders of magnitude faster than standard evaluation schemes, in minutes to hours rather than days.},\n\tacceptrate = {500 out of 1591=31\\%},\n\tauthor = {Uesato, J. and Kumar, A. and Szepesv{\\'a}ri, Cs. and Erez, T. and Ruderman, A. and Anderson, K. and Dvijotham, K. and Heess, N. and Kohli, P.},\n\tbooktitle = {ICLR},\n\tdate = {2019-06},\n\tdate-added = {2019-07-20 13:57:55 -0600},\n\tdate-modified = {2019-07-20 14:02:08 -0600},\n\tkeywords = {Monte Carlo methods, failure probability prediction, continuation method, reinforcement learning, adversarial evaluation},\n\tmonth = {06},\n\tpages = {3692--3702},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {Rigorous Agent Evaluation: An Adversarial Approach to Uncover Catastrophic Failures},\n\turl_paper = {ICLR2019-Risk.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Learning to Rank with Features.\n \n \n \n \n\n\n \n Li, S.; Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 3856–3865, 05 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LLSz19,\n\tabstract = {We introduce a new model for online ranking in which the click probability factors into an examination and attractiveness function and the attractiveness function is a linear function of a feature vector and an unknown parameter. Only relatively mild assumptions are made on the examination function. A novel algorithm for this setup is analysed, showing that the dependence on the number of items is replaced by a dependence on the dimension, allowing the new algorithm to handle a large number of items. When reduced to the orthogonal case, the regret of the algorithm improves on the state-of-the-art.},\n\tauthor = {Li, S. and Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tcrossref = {ICML2019},\n    booktitle = {ICML},\n\tkeywords = {stochastic bandits, finite-armed bandits, linear bandits, ranking, online learning to rank, ranking, online learning, partial information, stochastic online learning},\n\tmonth = {05},\n\tpages = {3856--3865},\n\ttitle = {Online Learning to Rank with Features},\n\turl_paper = {ICML2019-RecurRank.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
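The entry above specifies a click model in which the probability of clicking the item in position k factors into an examination term for the position and a linear attractiveness of that item's features. The sketch below only simulates that factored model; the particular examination curve and the clipping of the linear score to [0, 1] are assumptions for illustration.

import numpy as np

def simulate_clicks(ranked_features, theta, examination, rng=None):
    # Click indicator at position k ~ Bernoulli(examination[k] * attractiveness(item_k)),
    # with attractiveness linear in the item's features (clipped to [0, 1] here).
    rng = rng or np.random.default_rng()
    attract = np.clip(ranked_features @ theta, 0.0, 1.0)
    probs = np.asarray(examination) * attract
    return rng.random(len(probs)) < probs

# Example with a position-based examination curve that decays down the list.
rng = np.random.default_rng(0)
K, d = 5, 3
feats, theta = rng.random((K, d)) / d, rng.random(d)
print(simulate_clicks(feats, theta, examination=1.0 / np.arange(1, K + 1), rng=rng))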
\n \n\n \n \n \n \n \n \n CapsAndRuns: An Improved Method for Approximately Optimal Algorithm Configuration.\n \n \n \n \n\n\n \n Weisz, G.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 6707-6715, 05 2019. \n \n\n\n\n
\n\n\n\n \n \n \"CapsAndRuns: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WGySz19,\n\tabstract = {We consider the problem of configuring general-purpose solvers to run efficiently on problem instances drawn from an unknown distribution, a problem of major interest in solver autoconfiguration. Following previous work, we focus on designing algorithms that find a configuration with near-optimal expected capped runtime while doing the least amount of work, with the cap chosen in a configuration-specific way so that most instances are solved. In this paper we present a new algorithm, CapsAndRuns, which finds a near-optimal configuration while using time that scales (in a problem dependent way) with the optimal expected capped runtime, significantly strengthening previous results which could only guarantee a bound that scaled with the potentially much larger optimal expected uncapped runtime. The new algorithm is simpler and more intuitive than the previous methods: first it estimates the optimal runtime cap for each configuration, then it uses a Bernstein race to find a near optimal configuration given the caps. Experiments verify that our method can significantly outperform its competitors.},\n\tauthor = {Weisz, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tcrossref = {ICML2019},\n    booktitle = {ICML},\n\tkeywords = {algorithm configuration, theory, heavy tail data},\n\tmonth = {05},\n\tpages = {6707-6715},\n\ttitle = {CapsAndRuns: An Improved Method for Approximately Optimal Algorithm Configuration},\n\turl_paper = {ICML2019-CapsAndRuns.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n POLITEX: Regret Bounds for Policy Iteration using Expert Prediction.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Bartlett, P.; Bhatia, K.; Lazic, N.; Szepesvári, C.; and Weisz, G.\n\n\n \n\n\n\n In ICML, pages 3692–3702, 05 2019. \n \n\n\n\n
\n\n\n\n \n \n \"POLITEX: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 12 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYBBLSzW19,\n\tabstract = {We present POLITEX (POLicy ITeration with EXpert advice), a variant of policy iteration where each policy is a Boltzmann distribution over the sum of action-value function estimates of the previous policies, and analyze its regret in continuing RL problems. We assume that the value function error after running a policy for tau time steps scales as eps(tau) = eps_0 + O( (d/tau)^(1/2) ), where eps_0 is the worst-case approximation error and d is the number of features in a compressed representation of the state-action space. We establish that this condition is satisfied by the LSPE algorithm under certain assumptions on the MDP and policies. Under the error assumption, we show that the regret of POLITEX in uniformly mixing MDPs scales as O( d^(1/2) T^(3/4) + eps_0 T) up to logarithmic terms. Thus, we provide the first regret bound for a fully practical model-free method which only scales in the number of features, and not in the size of the underlying MDP. Experiments on a queuing problem confirm that POLITEX is competitive with some of its alternatives, while preliminary results on Ms Pacman (one of the standard Atari benchmark problems) confirm the viability of POLITEX beyond linear function approximation.},\n\tauthor = {Abbasi-Yadkori, Y. and Bartlett, P. and Bhatia, K. and Lazic, N. and Szepesv{\\'a}ri, Cs. and Weisz, G.},\n\tcrossref = {ICML2019},\n    booktitle = {ICML},\n\tkeywords = {Markov Decision Processes, online learning, theory, model-free RL, black-box RL},\n\tmonth = {05},\n\tpages = {3692--3702},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {POLITEX: Regret Bounds for Policy Iteration using Expert Prediction},\n\turl_paper = {ICML2019-Politex.pdf},\n\tyear = {2019},\n\tBdsk-File-1 = {YnBsaXN0MDDSAQIDBFxyZWxhdGl2ZVBhdGhZYWxpYXNEYXRhXxBKLi4vLi4vLi4vTGlicmFyeS5wYXBlcnMzL0ZpbGVzL0Q5L0Q5QkEzMkVGLTMyQ0QtNERBRS04MkQzLTgyNjY2QTgyOUY0Ri5wZGZPEQHaAAAAAAHaAAIAAAxNYWNpbnRvc2ggSEQAAAAAAAAAAAAAAAAAAAAAAAAAQkQAAf////8fRDlCQTMyRUYtMzJDRC00REFFI0ZGRkZGRkZGLnBkZgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA/////wAAAAAAAAAAAAAAAAADAAQAAAogY3UAAAAAAAAAAAAAAAAAAkQ5AAIAWS86VXNlcnM6Y3NhYmE6RG9jdW1lbnRzOkxpYnJhcnkucGFwZXJzMzpGaWxlczpEOTpEOUJBMzJFRi0zMkNELTREQUUtODJEMy04MjY2NkE4MjlGNEYucGRmAAAOAFIAKABEADkAQgBBADMAMgBFAEYALQAzADIAQwBEAC0ANABEAEEARQAtADgAMgBEADMALQA4ADIANgA2ADYAQQA4ADIAOQBGADQARgAuAHAAZABmAA8AGgAMAE0AYQBjAGkAbgB0AG8AcwBoACAASABEABIAV1VzZXJzL2NzYWJhL0RvY3VtZW50cy9MaWJyYXJ5LnBhcGVyczMvRmlsZXMvRDkvRDlCQTMyRUYtMzJDRC00REFFLTgyRDMtODI2NjZBODI5RjRGLnBkZgAAEwABLwAAFQACAAz//wAAAAgADQAaACQAcQAAAAAAAAIBAAAAAAAAAAUAAAAAAAAAAAAAAAAAAAJP},\n\tBdsk-Url-1 = {http://ieeexplore.ieee.org/document/8014469/},\n\tBdsk-Url-2 = {https://dx.doi.org/10.1109/TAC.2017.2743163}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
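The entry above defines the POLITEX update: each new policy is a Boltzmann distribution over the sum of the action-value estimates of all previous policies. The snippet below shows just that softmax step for tabular estimates; the learning rate eta and the commented outer loop (with a hypothetical evaluate_policy) are assumptions used only for illustration.

import numpy as np

def politex_policy(q_sum, eta=0.1):
    # Boltzmann (softmax) policy over the accumulated action-value estimates.
    # q_sum has shape (n_states, n_actions); eta is an assumed learning rate.
    z = eta * (q_sum - q_sum.max(axis=1, keepdims=True))  # shift rows for numerical stability
    p = np.exp(z)
    return p / p.sum(axis=1, keepdims=True)

# Sketch of the outer loop (evaluate_policy is hypothetical; the paper uses, e.g., LSPE):
# q_sum = np.zeros((n_states, n_actions))
# for phase in range(n_phases):
#     pi = politex_policy(q_sum)
#     q_sum += evaluate_policy(pi)  # estimate of the current policy's Q-values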
\n \n\n \n \n \n \n \n \n Garbage In, Reward Out: Bootstrapping Exploration in Multi-Armed Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Szepesvári, C.; Vaswani, S.; Wen, Z.; Ghavamzadeh, M.; and Lattimore, T.\n\n\n \n\n\n\n In ICML, pages 3601–3610, 05 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Garbage paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KSzVWGL19,\n\tabstract = {We propose a bandit algorithm that explores by randomizing its history of rewards. Specifically, it pulls the arm with the highest mean reward in a non-parametric bootstrap sample of its history with pseudo rewards. We design the pseudo rewards such that the bootstrap mean is optimistic with a sufficiently high probability. We call our algorithm Giro, which stands for garbage in, reward out. We analyze Giro in a Bernoulli bandit and derive a O( K log(n)/Delta ) bound on its n-round regret, where Delta is the difference in the expected rewards of the optimal and the best sub-optimal arms, and K is the number of arms. The main advantage of our exploration design is that it easily generalizes to structured problems. To show this, we propose contextual Giro with an arbitrary reward generalization model. We evaluate Giro and its contextual variant on multiple synthetic and real-world problems, and observe that it performs well.},\n\tauthor = {Kveton, B. and Szepesv{\\'a}ri, Cs. and Vaswani, S. and Wen, Z. and Ghavamzadeh, M. and Lattimore, T.},\n\tcrossref = {ICML2019},\n    booktitle = {ICML},\n\tkeywords = {stochastic bandits, finite-armed bandits, randomization, follow-the-perturbed-leader},\n\tmonth = {05},\n\tpages = {3601--3610},\n\ttitle = {Garbage In, Reward Out: Bootstrapping Exploration in Multi-Armed Bandits},\n\turl_paper = {ICML2019-Giro.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
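The entry above states Giro's decision rule directly: pull the arm with the highest mean reward in a non-parametric bootstrap sample of its history augmented with pseudo-rewards. A minimal sketch for a Bernoulli bandit follows; adding one artificial 0 and one artificial 1 per observation (a = 1) is in the spirit of the abstract, but the exact amount of pseudo-rewards is an assumption here.

import numpy as np

def giro_choose(histories, a=1, rng=None):
    # For each arm: augment its observed 0/1 rewards with `a` artificial zeros and
    # `a` artificial ones per observation, draw a bootstrap sample (with replacement)
    # of the augmented history, and act greedily on the bootstrap means.
    rng = rng or np.random.default_rng()
    best, best_val = None, -np.inf
    for i, h in enumerate(histories):
        h = np.asarray(h, dtype=float)
        if h.size == 0:
            return i  # pull unexplored arms first
        aug = np.concatenate([h, np.zeros(a * h.size), np.ones(a * h.size)])
        boot_mean = rng.choice(aug, size=aug.size, replace=True).mean()
        if boot_mean > best_val:
            best, best_val = i, boot_mean
    return best

# Example: giro_choose([[1, 0, 1], [0, 0], []]) returns 2 (the unexplored arm).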
\n \n\n \n \n \n \n \n \n An Information-Theoretic Approach to Minimax Regret in Partial Monitoring.\n \n \n \n \n\n\n \n Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, 04 2019. \n \n\n\n\n
\n\n\n\n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaSze19COLT,\n\tabstract = {We prove a new minimax theorem connecting the worst-case Bayesian regret and minimax regret under finite-action partial monitoring with no assumptions on the space of signals or decisions of the adversary. We then generalise the information-theoretic tools of Russo and Van Roy (2016) for proving Bayesian regret bounds and combine them with the minimax theorem to derive minimax regret bounds for various partial monitoring settings. The highlight is a clean analysis of `easy' and `hard' finite partial monitoring, with new regret bounds that are independent of arbitrarily large game-dependent constants and eliminate the logarithmic dependence on the horizon for easy games that appeared in earlier work. The power of the generalised machinery is further demonstrated by proving that the minimax regret for k-armed adversarial bandits is at most (2kn)^(1/2), improving on existing results by a factor of 2. Finally, we provide a simple analysis of the cops and robbers game, also improving best known constants.\n},\n\tacceptrate = {116 out of 393=30\\%},\n\tauthor = {Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {partial monitoring, partial information, online learning, adversarial setting, Bayes regret, minimax optimality},\n\tmonth = {04},\n\ttitle = {An Information-Theoretic Approach to Minimax Regret in Partial Monitoring},\n\turl_paper = {COLT2019_pm.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Distribution-Dependent Analysis of Gibbs-ERM Principle.\n \n \n \n \n\n\n \n Kuzborskij, I.; Cesa-Bianchi, N.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, 04 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Distribution-Dependent paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{KuCBSze19,\n\tabstract = {Gibbs-ERM is a natural idealized model of learning with stochastic optimization algorithms (such as Stochastic Gradient Langevin Dynamics and --- to some extent--- Stochastic Gradient Descent) which also appears in other contexts, including PAC-Bayesian theory, and sampling mechanisms. In this work we study the excess risk suffered by the Gibbs-ERM learner with non-convex, regularized empirical risk. Our goal is to understand the interplay between the data-generating distribution and the problem of learning in large hypothesis spaces. Our main results are distribution-dependent upper bounds on several notions of excess risk. We show that, in all cases, the distribution-dependent excess risk is essentially controlled by the "local" effective dimension of the problem, a well-established notion of effective dimension appearing in the analyses of several previous algorithms, including SGD and ridge regression. Ours is the first work that brings this notion of dimension to the analysis of learning via Gibbs densities. The distribution-dependent view we advocate here improves upon earlier results of Raginsky et al. (2017), and can yield much tighter bounds depending on the interplay between the data-generating distribution and the loss function. The first part of our analysis focuses on the localized excess risk in the vicinity of a fixed local minimizer. This result is then extended to bounds on the global excess risk, by characterizing probabilities of local minima (and their complement) under Gibbs densities, a result which might be of independent interest.},\n\tacceptrate = {116 out of 393=30\\%},\n\tauthor = {Kuzborskij, I. and Cesa-Bianchi, N. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {learning theory},\n\tmonth = {04},\n\ttitle = {Distribution-Dependent Analysis of Gibbs-ERM Principle},\n\turl_paper = {COLT2019_gibbs.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An Exponential Efron-Stein Inequality for Lq-Stable Learning Rules.\n \n \n \n \n\n\n \n Abou-Moustafa, K.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, 02 2019. \n \n\n\n\n
\n\n\n\n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AMSz19ALT,\n\tabstract = {There is an accumulating evidence in the literature that stability of learning algorithms is a key characteristic that permits a learning algorithm to generalize. Despite various insightful results in this direction, there seems to be an overlooked dichotomy in the type of stability-based generalization bounds we have in the literature. On one hand, the literature seems to suggest that exponential generalization bounds for the estimated risk, which are optimal, can be only obtained through stringent, distribution independent and computationally intractable notions of stability such as uniform stability. On the other hand, it seems that weaker notions of stability such as hypothesis stability, although it is distribution dependent and more amenable to computation, can only yield polynomial generalization bounds for the estimated risk, which are suboptimal. In this paper, we address the gap between these two regimes of results. In particular, the main question we address here is whether it is possible to derive exponential generalization bounds for the estimated risk using a notion of stability that is computationally tractable and distribution dependent, but weaker than uniform stability. Using recent advances in concentration inequalities, and using a notion of stability that is weaker than uniform stability but distribution dependent and amenable to computation, we derive an exponential tail bound for the concentration of the estimated risk of a hypothesis returned by a general learning rule, where the estimated risk is expressed in terms of either the resubstitution estimate (empirical error), or the deleted (or, leave-one-out) estimate. As an illustration we derive exponential tail bounds for ridge regression with unbounded responses -- a setting where uniform stability results of Bousquet and Elisseeff (2002) are not applicable.},\n\tacceptrate = {37 out of 78=47\\%},\n\tauthor = {Abou-Moustafa, K. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {stability, generalization bounds, learning theory, cross-validation, deleted estimate, Efron-Stein},\n\tmonth = {02},\n\ttitle = {An Exponential Efron-Stein Inequality for Lq-Stable Learning Rules},\n\turl_paper = {ALT2019_expefronsteindel.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n There is accumulating evidence in the literature that stability of learning algorithms is a key characteristic that permits a learning algorithm to generalize. Despite various insightful results in this direction, there seems to be an overlooked dichotomy in the type of stability-based generalization bounds we have in the literature. On one hand, the literature seems to suggest that exponential generalization bounds for the estimated risk, which are optimal, can only be obtained through stringent, distribution-independent and computationally intractable notions of stability such as uniform stability. On the other hand, it seems that weaker notions of stability such as hypothesis stability, although distribution dependent and more amenable to computation, can only yield polynomial generalization bounds for the estimated risk, which are suboptimal. In this paper, we address the gap between these two regimes of results. In particular, the main question we address here is whether it is possible to derive exponential generalization bounds for the estimated risk using a notion of stability that is computationally tractable and distribution dependent, but weaker than uniform stability. Using recent advances in concentration inequalities, and using a notion of stability that is weaker than uniform stability but distribution dependent and amenable to computation, we derive an exponential tail bound for the concentration of the estimated risk of a hypothesis returned by a general learning rule, where the estimated risk is expressed in terms of either the resubstitution estimate (empirical error) or the deleted (or, leave-one-out) estimate. As an illustration we derive exponential tail bounds for ridge regression with unbounded responses – a setting where uniform stability results of Bousquet and Elisseeff (2002) are not applicable.\n
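\n For concreteness, the two risk estimates mentioned in the abstract can be written as follows (standard definitions, our notation): given a sample $S=(z_1,\dots,z_n)$, a learning rule $A$ and a loss $\ell$,

\[
\hat R_{\mathrm{res}}(A,S) \;=\; \frac{1}{n}\sum_{i=1}^{n} \ell\big(A(S), z_i\big),
\qquad
\hat R_{\mathrm{del}}(A,S) \;=\; \frac{1}{n}\sum_{i=1}^{n} \ell\big(A(S^{\setminus i}), z_i\big),
\]

where $S^{\setminus i}$ denotes the sample with the $i$-th example deleted; the paper's tail bounds concern the deviation of these estimates from the risk of the returned hypothesis.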
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Cleaning up the neighborhood: A full classification for adversarial partial monitoring.\n \n \n \n \n\n\n \n Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, 02 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Cleaning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaSze19ALT,\n\tabstract = {Partial monitoring is a generalization of the well-known multi-armed bandit framework where the loss is not directly observed by the learner. We complete the classification of finite adversarial partial monitoring to include all games, solving an open problem posed by Bartok et al. (2014). Along the way we simplify and improve existing algorithms and correct errors in previous analyses. Our second contribution is a new algorithm for the class of games studied by Bartok (2013) where we prove upper and lower regret bounds that shed more light on the dependence of the regret on the game structure.},\n\tacceptrate = {37 out of 78=47\\%},\n\tauthor = {Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {partial monitoring, partial information, online learning, adversarial setting},\n\tmonth = {02},\n\ttitle = {Cleaning up the neighborhood: A full classification for adversarial partial monitoring},\n\turl_paper = {ALT2019_cleaning_pm.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n Partial monitoring is a generalization of the well-known multi-armed bandit framework where the loss is not directly observed by the learner. We complete the classification of finite adversarial partial monitoring to include all games, solving an open problem posed by Bartok et al. (2014). Along the way we simplify and improve existing algorithms and correct errors in previous analyses. Our second contribution is a new algorithm for the class of games studied by Bartok (2013) where we prove upper and lower regret bounds that shed more light on the dependence of the regret on the game structure.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Algorithm for Unsupervised Sensor Selection (long version).\n \n \n \n \n\n\n \n Verma, A.; Hanawal, M.; Szepesvári, C.; and Saligrama, V.\n\n\n \n\n\n\n In AISTATS, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{VeHaSzeSa19,\n\tabstract = {In many security and healthcare systems, the detection and diagnosis systems use a sequence of sensors/tests. Each test outputs a prediction of the latent state and carries an inherent cost. However, the correctness of the predictions cannot be evaluated due to unavailability of the ground-truth annotations. Our objective is to learn strategies for selecting a test that gives the best trade-off between accuracy and costs in such unsupervised sensor selection (USS) problems. Clearly, learning is feasible only if ground truth can be inferred (explicitly or implicitly) from the problem structure.  It is observed that this happens if the problem satisfies the `Weak Dominance' (WD) property.  We set up the USS problem as a stochastic partial monitoring problem and develop an algorithm with sub-linear regret under the WD property. We argue that our algorithm is optimal and evaluate its performance on problem instances generated from synthetic and real-world datasets.},\n\tacceptrate = {360 out of 1111=32\\%},\n\tauthor = {Verma, A. and Hanawal, M. and Szepesv{\\'a}ri, Cs. and Saligrama, V.},\n\tbooktitle = {AISTATS},\n\tkeywords = {stochastic bandits, unsupervised learning, stochastic partial monitoring, cascaded sensor selection, optimal stopping},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {Online Algorithm for Unsupervised Sensor Selection (long version)},\n\turl_paper = {aistats2019_uss.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n In many security and healthcare systems, the detection and diagnosis systems use a sequence of sensors/tests. Each test outputs a prediction of the latent state and carries an inherent cost. However, the correctness of the predictions cannot be evaluated due to unavailability of the ground-truth annotations. Our objective is to learn strategies for selecting a test that gives the best trade-off between accuracy and costs in such unsupervised sensor selection (USS) problems. Clearly, learning is feasible only if ground truth can be inferred (explicitly or implicitly) from the problem structure. It is observed that this happens if the problem satisfies the `Weak Dominance' (WD) property. We set up the USS problem as a stochastic partial monitoring problem and develop an algorithm with sub-linear regret under the WD property. We argue that our algorithm is optimal and evaluate its performance on problem instances generated from synthetic and real-world datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model-Free Linear Quadratic Control via Reduction to Expert Prediction.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Lazic, N.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"Model-Free paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYLaSz19,\n\tabstract = { Model-free approaches for reinforcement learning (RL) and continuous control find policies based only on past states and rewards, without fitting a model of the system dynamics. They are appealing as they are general purpose and easy to implement; however, they also come with fewer theoretical guarantees than model-based RL.  In this work, we present a new model-free algorithm for controlling linear quadratic (LQ) systems, and show that its regret scales as O(T^(x+2/3)) for any small x>0 if the time horizon satisfies T>C^(1/x) for a constant C. The algorithm is based on a reduction of control of Markov decision processes to an expert prediction problem. In practice, it corresponds to a variant of policy iteration with forced exploration, where the policy in each phase is greedy with respect to the average of all previous value functions.\nThis is the first model-free algorithm for adaptive control of LQ systems that provably achieves sublinear regret and has a polynomial computation cost. Empirically, our algorithm dramatically outperforms standard policy iteration, but performs worse than a model-based approach.\n},\n\tacceptrate = {360 out of 1111=32\\%},\n\tauthor = {Abbasi-Yadkori, Y. and Lazic, N. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {Markov Decision Processes, linear dynamical systems, exploration vs. exploitation, online learning, theory, LQR},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {Model-Free Linear Quadratic Control via Reduction to Expert Prediction},\n\turl_paper = {aistats2019_modelfreeLQR.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n Model-free approaches for reinforcement learning (RL) and continuous control find policies based only on past states and rewards, without fitting a model of the system dynamics. They are appealing as they are general purpose and easy to implement; however, they also come with fewer theoretical guarantees than model-based RL. In this work, we present a new model-free algorithm for controlling linear quadratic (LQ) systems, and show that its regret scales as O(T^(x+2/3)) for any small x>0 if the time horizon satisfies T>C^(1/x) for a constant C. The algorithm is based on a reduction of control of Markov decision processes to an expert prediction problem. In practice, it corresponds to a variant of policy iteration with forced exploration, where the policy in each phase is greedy with respect to the average of all previous value functions. This is the first model-free algorithm for adaptive control of LQ systems that provably achieves sublinear regret and has a polynomial computation cost. Empirically, our algorithm dramatically outperforms standard policy iteration, but performs worse than a model-based approach. \n
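\n To make the phased structure described above concrete, here is a rough, hypothetical sketch (a tiny tabular MDP rather than an LQ system, and not the paper's algorithm; all quantities below are made up): in each phase the policy is greedy with respect to the average of all previously computed action-value functions, with a decaying amount of forced exploration.

import numpy as np

# Hedged sketch only: phased "policy iteration with forced exploration" on a toy
# tabular MDP, where each phase acts greedily with respect to the AVERAGE of all
# previously estimated action-value functions.
rng = np.random.default_rng(0)
n_states, n_actions, gamma = 3, 2, 0.9
P = rng.dirichlet(np.ones(n_states), size=(n_states, n_actions))  # transition kernel
R = rng.uniform(size=(n_states, n_actions))                        # mean rewards

def evaluate_policy(pi):
    """Exact action-value function of a deterministic policy pi (array of actions)."""
    R_pi = R[np.arange(n_states), pi]
    P_pi = P[np.arange(n_states), pi]
    V = np.linalg.solve(np.eye(n_states) - gamma * P_pi, R_pi)
    return R + gamma * P @ V

q_estimates = [np.zeros((n_states, n_actions))]
for phase in range(20):
    q_avg = np.mean(q_estimates, axis=0)           # average of all past value functions
    eps = 0.5 / (phase + 1)                        # forced exploration, decaying
    pi = np.where(rng.uniform(size=n_states) < eps,
                  rng.integers(n_actions, size=n_states),
                  q_avg.argmax(axis=1))
    q_estimates.append(evaluate_policy(pi))        # in practice: estimated from rollouts

print(np.mean(q_estimates, axis=0).argmax(axis=1))  # greedy policy after the last phase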
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An Exponential Tail Bound for the Deleted Estimate.\n \n \n \n \n\n\n \n Abou-Moustafa, K.; and Szepesvári, C.\n\n\n \n\n\n\n In AAAI, 11 2019. \n \n\n\n\n
\n\n\n\n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AMSz19AAAI,\n\tabstract = {There is an accumulating evidence in the literature that stability of learning algorithms is a key characteristic that permits a learning algorithm to generalize. Despite various insightful results in this direction, there seems to be an overlooked dichotomy in the type of stability-based generalization bounds we have in the literature. On one hand, the literature seems to suggest that exponential generalization bounds for the estimated risk, which are optimal, can be only obtained through stringent, distribution independent and computationally intractable notions of stability such as uniform stability. On the other hand, it seems that weaker notions of stability such as hypothesis stability, although it is distribution dependent and more amenable to computation, can only yield polynomial generalization bounds for the estimated risk, which are suboptimal. In this paper, we address the gap between these two regimes of results. In particular, the main question we address here is whether it is possible to derive exponential generalization bounds for the estimated risk using a notion of stability that is computationally tractable and distribution dependent, but weaker than uniform stability. Using recent advances in concentration inequalities, and using a notion of stability that is weaker than uniform stability but distribution dependent and amenable to computation, we derive an exponential tail bound for the concentration of the estimated risk of a hypothesis returned by a general learning rule, where the estimated risk is expressed in terms of the deleted estimate. Interestingly, we note that our final bound has similarities to previous exponential generalization bounds for the deleted estimate, in particular, the result of Bousquet and Elisseeff (2002) for the regression case.},\n\tacceptrate = {1150 out of 7095=16\\%},\n\tauthor = {Abou-Moustafa, K. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AAAI},\n\tkeywords = {stability, generalization bounds, learning theory, cross-validation, deleted estimate, Efron-Stein},\n\tmonth = {11},\n\ttitle = {An Exponential Tail Bound for the Deleted Estimate},\n\turl_paper = {AAAI2019_expefronsteindel.pdf},\n\tyear = {2019}}\n\n
\n
\n\n\n
\n There is accumulating evidence in the literature that stability of learning algorithms is a key characteristic that permits a learning algorithm to generalize. Despite various insightful results in this direction, there seems to be an overlooked dichotomy in the type of stability-based generalization bounds we have in the literature. On one hand, the literature seems to suggest that exponential generalization bounds for the estimated risk, which are optimal, can only be obtained through stringent, distribution-independent and computationally intractable notions of stability such as uniform stability. On the other hand, it seems that weaker notions of stability such as hypothesis stability, although distribution dependent and more amenable to computation, can only yield polynomial generalization bounds for the estimated risk, which are suboptimal. In this paper, we address the gap between these two regimes of results. In particular, the main question we address here is whether it is possible to derive exponential generalization bounds for the estimated risk using a notion of stability that is computationally tractable and distribution dependent, but weaker than uniform stability. Using recent advances in concentration inequalities, and using a notion of stability that is weaker than uniform stability but distribution dependent and amenable to computation, we derive an exponential tail bound for the concentration of the estimated risk of a hypothesis returned by a general learning rule, where the estimated risk is expressed in terms of the deleted estimate. Interestingly, we note that our final bound has similarities to previous exponential generalization bounds for the deleted estimate, in particular, the result of Bousquet and Elisseeff (2002) for the regression case.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2018\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n PAC-Bayes bounds for stable algorithms with instance-dependent priors.\n \n \n \n \n\n\n \n Rivasplata, O.; Szepesvári, C.; Shawe-Taylor, J.; Parrado-Hernandez, E.; and Sun, S.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 09 2018. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-Bayes paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{RSz18,\n\tabstract = {PAC-Bayes bounds have been proposed to get risk estimates based on a training sample. In this paper the PAC-Bayes approach is combined with stability of the hypothesis learned by a Hilbert space valued algorithm. The PAC-Bayes setting is used with a Gaussian prior centered at the expected output. Thus a novelty of our paper is using priors defined in terms of the data-generating distribution. Our main result estimates the risk of the randomized algorithm in terms of the hypothesis stability coefficients. We also provide a new bound for the SVM classifier, which is compared to other known bounds experimentally. Ours appears to be the first uniform hypothesis stability-based bound that evaluates to non-trivial values.},\n\tacceptrate = {1011 out of 4856=21\\%},\n\tauthor = {Rivasplata, O. and Szepesv{\\'a}ri, Cs. and Shawe-Taylor, J. and Parrado-Hernandez, E. and Sun, S.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {stability, generalization bounds, learning theory, PAC-Bayes, uniform stability},\n\tmonth = {09},\n\ttitle = {PAC-Bayes bounds for stable algorithms with instance-dependent priors},\n\turl_paper = {NeurIPS2018-PACBayes.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n PAC-Bayes bounds have been proposed to get risk estimates based on a training sample. In this paper the PAC-Bayes approach is combined with stability of the hypothesis learned by a Hilbert space valued algorithm. The PAC-Bayes setting is used with a Gaussian prior centered at the expected output. Thus a novelty of our paper is using priors defined in terms of the data-generating distribution. Our main result estimates the risk of the randomized algorithm in terms of the hypothesis stability coefficients. We also provide a new bound for the SVM classifier, which is compared to other known bounds experimentally. Ours appears to be the first uniform hypothesis stability-based bound that evaluates to non-trivial values.\n
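\n As background (not the paper's theorem), one standard PAC-Bayes-kl inequality of the kind such analyses build on states that, for losses in $[0,1]$, any prior $P$ fixed before seeing the data and any $\delta\in(0,1)$, with probability at least $1-\delta$ over an i.i.d. sample of size $n$, simultaneously for all posteriors $Q$,

\[
\mathrm{kl}\!\big(\hat R_n(Q)\,\big\|\,R(Q)\big) \;\le\; \frac{\mathrm{KL}(Q\|P) + \ln\frac{2\sqrt{n}}{\delta}}{n},
\]

where $\hat R_n(Q)$ and $R(Q)$ are the $Q$-averaged empirical and population risks and $\mathrm{kl}$ is the binary relative entropy; the novelty described above lies in choosing $P$ as a Gaussian centered at the expected output of the (stable) algorithm.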
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n TopRank: A practical algorithm for online stochastic ranking.\n \n \n \n \n\n\n \n Lattimore, T.; Kveton, B.; Li, S.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 09 2018. \n \n\n\n\n
\n\n\n\n \n \n \"TopRank: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LKLSz18,\n\tabstract = {Online learning to rank is a sequential decision-making problem where in each round the learning agent chooses a list of items and receives feedback in the form of clicks from the user. Many sample-efficient algorithms have been proposed for this problem that assume a specific click model connecting rankings and user behavior. We propose a generalized click model that encompasses many existing models, including the position-based and cascade models. Our generalization motivates a novel online learning algorithm based on topological sort, which we call TopRank. TopRank is (a) more natural than existing algorithms, (b) has stronger regret guarantees than existing algorithms with comparable generality, (c) has a more insightful proof that leaves the door open to many generalizations, (d) outperforms existing algorithms empirically.},\n\tacceptrate = {1011 out of 4856=21\\%},\n\tauthor = {Lattimore, T. and Kveton, B. and Li, S. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {ranking, online learning, partial information, stochastic online learning, online learning to rank},\n\tmonth = {09},\n\ttitle = {TopRank: A practical algorithm for online stochastic ranking},\n\turl_paper = {NeurIPS2018-toprank.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n Online learning to rank is a sequential decision-making problem where in each round the learning agent chooses a list of items and receives feedback in the form of clicks from the user. Many sample-efficient algorithms have been proposed for this problem that assume a specific click model connecting rankings and user behavior. We propose a generalized click model that encompasses many existing models, including the position-based and cascade models. Our generalization motivates a novel online learning algorithm based on topological sort, which we call TopRank. TopRank (a) is more natural than existing algorithms, (b) has stronger regret guarantees than existing algorithms with comparable generality, (c) has a more insightful proof that leaves the door open to many generalizations, and (d) outperforms existing algorithms empirically.\n
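\n The topological-sort ingredient can be illustrated in isolation (this is not TopRank itself; the pairwise preferences below are a made-up input that, in the algorithm, would have to be learned from clicks):

from graphlib import TopologicalSorter

# Hedged sketch of the topological-sort idea only: given learned pairwise
# preferences "item should be shown above item", emit a consistent ranking.
preferences = {            # item -> set of items it should precede (made up)
    "a": {"b", "c"},
    "b": {"d"},
    "c": {"d"},
    "d": set(),
}
ts = TopologicalSorter({item: set() for item in preferences})
for above, belows in preferences.items():
    for below in belows:
        ts.add(below, above)          # edge: `above` must come before `below`
print(list(ts.static_order()))        # one valid order, e.g. ['a', 'b', 'c', 'd']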
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n LeapsAndBounds: A Method for Approximately Optimal Algorithm Configuration.\n \n \n \n \n\n\n \n Weisz, G.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, 07 2018. \n \n\n\n\n
\n\n\n\n \n \n \"LeapsAndBounds: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WeGySz18,\n\tabstract = {We consider the problem of configuring general-purpose solvers to run efficiently on problem instances drawn from an unknown distribution. The goal of the configurator is to find a configuration that runs fast on average on most instances, and do so with the least amount of total work. It can run a chosen solver on a random instance until the solver finishes or a timeout is reached. We propose LeapsAndBounds, an algorithm that tests configurations on randomly selected problem instances for longer and longer time. We prove that the capped expected runtime of the configuration returned by LeapsAndBounds is close to the optimal expected runtime, while our algorithm's running time is near-optimal. Our results show that LeapsAndBounds is more efficient than the recent algorithm of Kleinberg et al. (2017), which, to our knowledge, is the only other algorithm configuration method with non-trivial theoretical guarantees. Experimental results on configuring a public SAT solver on a new benchmark dataset also stand witness to the superiority of our method.},\n\tacceptrate = {618 out of 2473=25\\%},\n\tauthor = {Weisz, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {algorithm configuration, theory, heavy tail data},\n\tmonth = {07},\n\ttitle = {LeapsAndBounds: A Method for Approximately Optimal Algorithm Configuration},\n\turl_paper = {leapsandbounds-ICML18.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n We consider the problem of configuring general-purpose solvers to run efficiently on problem instances drawn from an unknown distribution. The goal of the configurator is to find a configuration that runs fast on average on most instances, and to do so with the least amount of total work. It can run a chosen solver on a random instance until the solver finishes or a timeout is reached. We propose LeapsAndBounds, an algorithm that tests configurations on randomly selected problem instances for longer and longer times. We prove that the capped expected runtime of the configuration returned by LeapsAndBounds is close to the optimal expected runtime, while our algorithm's running time is near-optimal. Our results show that LeapsAndBounds is more efficient than the recent algorithm of Kleinberg et al. (2017), which, to our knowledge, is the only other algorithm configuration method with non-trivial theoretical guarantees. Experimental results on configuring a public SAT solver on a new benchmark dataset also attest to the superiority of our method.\n
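\n The doubling-cap idea can be illustrated with a toy sketch (schematic only; the elimination rule, fake runtimes and all parameters below are invented for illustration and are not the paper's procedure):

import random

# Hedged sketch: test surviving configurations on fresh random instances with a
# runtime cap that doubles each phase, keeping configurations whose capped
# average runtime looks competitive.
def run_solver(config, instance, timeout):
    """Stand-in for running a solver; returns min(true runtime, timeout)."""
    true_runtime = abs(hash((config, instance))) % 100 / 10.0 + config  # fake runtimes
    return min(true_runtime, timeout)

def capped_runtime_search(configs, n_phases=6, samples_per_phase=50, seed=0):
    rng = random.Random(seed)
    alive = list(configs)
    timeout = 1.0
    for _ in range(n_phases):
        avg = {}
        for c in alive:
            runs = [run_solver(c, rng.random(), timeout) for _ in range(samples_per_phase)]
            avg[c] = sum(runs) / len(runs)
        best = min(avg.values())
        alive = [c for c in alive if avg[c] <= 2 * best]   # keep near-best configs
        timeout *= 2                                        # test for longer and longer
    return min(alive, key=lambda c: avg[c])

print(capped_runtime_search(configs=range(5)))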
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Gradient Descent for Sparse Rank-One Matrix Completion for Crowd-Sourced Aggregation of Sparsely Interacting Workers.\n \n \n \n \n\n\n \n Ma, Y.; Olshevsky, A.; Szepesvári, C.; and Saligrama, V.\n\n\n \n\n\n\n In ICML, 07 2018. \n \n\n\n\n
\n\n\n\n \n \n \"Gradient paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{MOSzS18,\n\tabstract = {We consider worker skill estimation for the single-coin Dawid-Skene crowdsourcing model. In practice skill-estimation is challenging because worker assignments are sparse and irregular due to the arbitrary, and uncontrolled availability of workers. We formulate skill estimation as a rank-one correlation-matrix completion problem, where the observed components correspond to observed label correlation between workers. We show that the correlation matrix can be successfully recovered and skills identifiable if and only if the sampling matrix (observed components) is irreducible and aperiodic. We then propose an efficient gradient descent scheme and show that skill estimates converges to the desired global optima for such sampling matrices. Our proof is original and the results are surprising in light of the fact that even the weighted rank-one matrix factorization problem is NP hard in general. Next we derive sample complexity bounds for the noisy case in terms of spectral properties of the signless Laplacian of the sampling matrix. Our proposed scheme achieves state-of-art performance on a number of real-world datasets.},\n\tacceptrate = {618 out of 2473=25\\%},\n\tauthor = {Ma, Y. and Olshevsky, A. and Szepesv{\\'a}ri, Cs. and Saligrama, V.},\n\tbooktitle = {ICML},\n\tkeywords = {crowdsourcing, rank-1 model, Dawid-Skene, weighted low-rank, nonconvex optimization, gradient descent},\n\tmonth = {07},\n\ttitle = {Gradient Descent for Sparse Rank-One Matrix Completion for Crowd-Sourced Aggregation of Sparsely Interacting Workers},\n\turl_paper = {GDForSparseRankOne-Full-ICML18.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n We consider worker skill estimation for the single-coin Dawid-Skene crowdsourcing model. In practice, skill estimation is challenging because worker assignments are sparse and irregular due to the arbitrary and uncontrolled availability of workers. We formulate skill estimation as a rank-one correlation-matrix completion problem, where the observed components correspond to observed label correlation between workers. We show that the correlation matrix can be successfully recovered and skills are identifiable if and only if the sampling matrix (observed components) is irreducible and aperiodic. We then propose an efficient gradient descent scheme and show that the skill estimates converge to the desired global optimum for such sampling matrices. Our proof is original and the results are surprising in light of the fact that even the weighted rank-one matrix factorization problem is NP-hard in general. Next we derive sample complexity bounds for the noisy case in terms of spectral properties of the signless Laplacian of the sampling matrix. Our proposed scheme achieves state-of-the-art performance on a number of real-world datasets.\n
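\n A minimal sketch of the kind of gradient scheme described (not the paper's exact update or step-size choice; noiseless observations and made-up parameters):

import numpy as np

# Hedged sketch: plain gradient descent on the observed entries of a rank-one
# matrix C ~ s s^T to recover the skill vector s. Recovery needs the observation
# pattern to satisfy conditions like those quoted above (e.g. irreducibility),
# and there is a global sign ambiguity in general.
rng = np.random.default_rng(0)
n = 8
s_true = rng.uniform(0.2, 0.9, size=n)
mask = rng.uniform(size=(n, n)) < 0.6
mask = np.triu(mask, 1)
mask = mask | mask.T                               # symmetric pattern, empty diagonal
C = np.where(mask, np.outer(s_true, s_true), 0.0)  # observed (noiseless) correlations

s = np.full(n, 0.5)                                # initial skill estimates
lr = 0.1
for _ in range(5000):
    residual = mask * (np.outer(s, s) - C)         # error on observed entries only
    s -= lr * (residual @ s)                       # descent direction for the squared error
for est, true in zip(s, s_true):
    print(f"estimated {est:.3f}   true {true:.3f}")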
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bandits with Delayed, Aggregated Anonymous Feedback.\n \n \n \n \n\n\n \n Pike-Burke, C.; Agrawal, S.; Szepesvári, C.; and Grünewälder, S.\n\n\n \n\n\n\n In ICML, 07 2018. \n \n\n\n\n
\n\n\n\n \n \n \"Bandits paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PBASzG18,\n\tabstract = {We study a variant of the stochastic K-armed bandit problem, which we call "bandits with delayed, aggregated anonymous feedback". In this problem, when the player pulls an arm, a reward is generated, however it is not immediately observed. Instead, at the end of each round the player observes only the sum of a number of previously generated rewards which happen to arrive in the given round. The rewards are stochastically delayed and due to the aggregated nature of the observations, the information of which arm led to a particular reward is lost. The question is what is the cost of the information loss due to this delayed, aggregated anonymous feedback? Previous works have studied bandits with stochastic, non-anonymous delays and found that the regret increases only by an additive factor relating to the expected delay. In this paper, we show that this additive regret increase can be maintained in the harder delayed, aggregated anonymous feedback setting when the expected delay (or a bound on it) is known. We provide an algorithm that matches the worst case regret of the non-anonymous problem exactly when the delays are bounded, and up to logarithmic factors or an additive variance term, for unbounded delays.},\n\tacceptrate = {618 out of 2473=25\\%},\n\tauthor = {Pike-Burke, C. and Agrawal, S. and Szepesv{\\'a}ri, Cs. and Gr{\\"u}new{\\"a}lder, S.},\n\tbooktitle = {ICML},\n\tkeywords = {bandits, delay, stochastic bandits, theory, aggregated feedback},\n\tmonth = {07},\n\ttitle = {Bandits with Delayed, Aggregated Anonymous Feedback},\n\turl_paper = {DelayedBandit-ICML18.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n We study a variant of the stochastic K-armed bandit problem, which we call \"bandits with delayed, aggregated anonymous feedback\". In this problem, when the player pulls an arm, a reward is generated; however, it is not immediately observed. Instead, at the end of each round the player observes only the sum of a number of previously generated rewards which happen to arrive in the given round. The rewards are stochastically delayed and, due to the aggregated nature of the observations, the information of which arm led to a particular reward is lost. The question is: what is the cost of the information loss due to this delayed, aggregated anonymous feedback? Previous works have studied bandits with stochastic, non-anonymous delays and found that the regret increases only by an additive factor relating to the expected delay. In this paper, we show that this additive regret increase can be maintained in the harder delayed, aggregated anonymous feedback setting when the expected delay (or a bound on it) is known. We provide an algorithm that matches the worst-case regret of the non-anonymous problem exactly when the delays are bounded, and up to logarithmic factors or an additive variance term for unbounded delays.\n
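\n The feedback model itself is easy to simulate; the following hypothetical sketch (uniform arm pulls, invented reward means and a geometric delay, none of which come from the paper) shows what the learner actually gets to see each round:

import numpy as np

# Hedged sketch of the feedback model only (not the paper's algorithm): each pull
# generates a reward with a random delay, and at the end of round t the learner
# observes only the SUM of all rewards whose delay expires at t.
rng = np.random.default_rng(1)
means = np.array([0.3, 0.5, 0.8])
T = 20
pending = {}          # arrival round -> accumulated anonymous reward

for t in range(T):
    arm = rng.integers(len(means))                 # placeholder policy: uniform pulls
    reward = rng.binomial(1, means[arm])           # reward generated now...
    delay = rng.geometric(0.3)                     # ...but observed only after a random delay
    pending[t + delay] = pending.get(t + delay, 0) + reward
    observed = pending.pop(t, 0)                   # anonymous, aggregated feedback
    print(f"round {t}: pulled arm {arm}, observed aggregated reward {observed}")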
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Linear Stochastic Approximation: How Far Does Constant Step-Size and Iterate Averaging Go?.\n \n \n \n \n\n\n \n Lakshminarayanan, C.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, 02 2018. \n \n\n\n\n
\n\n\n\n \n \n \"Linear paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 11 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaSze18:LSA,\n\tabstract = {In this paper we study study constant step-size averaged linear stochastic approximation. With an eye towards linear value estimation in reinforcement learning, we ask whether for a given class of linear estimation problems i) a single universal constant step-size with ii) a C/t worst-case expected error with a class-dependent constant C > 0 can be guaranteed when the error is measured via an appropriate weighted squared norm. Such a result has recently been obtained in the context of linear least squares regression. We give examples that show that the answer to these questions in general is no. On the positive side, we also characterize the instance dependent behavior of the error of the said algorithms, identify some conditions under which the answer to the above questions can be changed to the positive, and in particular show instance-dependent error bounds of magnitude O(1/t) for the constant step-size iterate averaged versions of TD(0) and a novel variant of GTD, where the stepsize is chosen independently of the value estimation instance. Computer simulations are used to illustrate and complement the theory.},\n\tacceptrate = {214 out of 645=33\\%},\n\tauthor = {Lakshminarayanan, C. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {TD learning, stochastic approximation, theory, finite-sample bounds},\n\tmonth = {02},\n\ttitle = {Linear Stochastic Approximation: How Far Does Constant Step-Size and Iterate Averaging Go?},\n\turl_paper = {aistats2018-flsa.pdf},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n In this paper we study constant step-size averaged linear stochastic approximation. With an eye towards linear value estimation in reinforcement learning, we ask whether, for a given class of linear estimation problems, i) a single universal constant step-size with ii) a C/t worst-case expected error with a class-dependent constant C > 0 can be guaranteed when the error is measured via an appropriate weighted squared norm. Such a result has recently been obtained in the context of linear least squares regression. We give examples that show that the answer to these questions in general is no. On the positive side, we also characterize the instance-dependent behavior of the error of the said algorithms, identify some conditions under which the answer to the above questions can be changed to the positive, and in particular show instance-dependent error bounds of magnitude O(1/t) for the constant step-size iterate-averaged versions of TD(0) and a novel variant of GTD, where the step-size is chosen independently of the value estimation instance. Computer simulations are used to illustrate and complement the theory.\n
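\n As a small illustration of the algorithmic template studied, here is constant step-size TD(0) with Polyak iterate averaging on a made-up Markov reward process (the features, step size and horizon are arbitrary choices, not the paper's):

import numpy as np

# Hedged sketch: linear TD(0) with a constant step size and a running average of
# the iterates, for value estimation on a random Markov reward process.
rng = np.random.default_rng(0)
n_states, d, gamma, alpha, T = 5, 3, 0.9, 0.05, 20000
P = rng.dirichlet(np.ones(n_states), size=n_states)   # transition matrix
r = rng.uniform(size=n_states)                         # rewards
Phi = rng.normal(size=(n_states, d))                   # feature map

theta = np.zeros(d)
theta_bar = np.zeros(d)
s = 0
for t in range(1, T + 1):
    s_next = rng.choice(n_states, p=P[s])
    td_error = r[s] + gamma * Phi[s_next] @ theta - Phi[s] @ theta
    theta = theta + alpha * td_error * Phi[s]          # constant step size
    theta_bar += (theta - theta_bar) / t               # running iterate average
    s = s_next

print("averaged TD(0) parameters:", theta_bar)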
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Stochastic Optimization in a Cumulative Prospect Theory Framework.\n \n \n \n \n\n\n \n Jie, C.; L.A., P.; Fu, M.; Marcus, S.; and Szepesvári, C.\n\n\n \n\n\n\n IEEE Transactions on Automatic Control, 63(9): 2867–2882. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Stochastic link\n  \n \n \n \"Stochastic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JiPrFuMaSz18,\n\tabstract = {Cumulative prospect theory (CPT) is a popular approach for modeling human preferences. It is based on probabilistic distortions and generalizes expected utility theory.  We bring CPT to a stochastic optimization framework and propose algorithms for both estimation and optimization of CPT-value objectives. We propose an empirical distribution function-based scheme to estimate the CPT-value and then use this scheme in the inner loop of a CPT-value optimization procedure. We propose both gradient-based as well as gradient-free CPT-value optimization algorithms that are based on two well-known simulation optimization ideas: simultaneous perturbation stochastic approximation (SPSA) and model-based parameter search (MPS), respectively.  We provide theoretical convergence guarantees for all the proposed algorithms\nand also illustrate the potential of CPT-based criteria in a traffic signal control application.\n},\n\tauthor = {Jie, C. and Prashanth L.A. and Fu, M.C. and Marcus, S. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2018-01},\n\tdate-added = {2018-03-11 17:24:12 +0000},\n\tdate-modified = {2019-07-20 10:15:20 -0600},\n\tjournal = {IEEE Transactions on Automatic Control},\n\tkeywords = {risk-sensitive criteria, Markov Decision Processes, SPSA, optimization, stochastic optimization, reinforcement learning, cumulative prospect theory},\n\tnumber = {9},\n\tpages = {2867--2882},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {Stochastic Optimization in a Cumulative Prospect Theory Framework},\n\turl_link = {http://ieeexplore.ieee.org/document/8014469/},\n\turl_paper = {2018-cpt-rl-tac.pdf},\n\tvolume = {63},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n Cumulative prospect theory (CPT) is a popular approach for modeling human preferences. It is based on probabilistic distortions and generalizes expected utility theory. We bring CPT to a stochastic optimization framework and propose algorithms for both estimation and optimization of CPT-value objectives. We propose an empirical distribution function-based scheme to estimate the CPT-value and then use this scheme in the inner loop of a CPT-value optimization procedure. We propose both gradient-based as well as gradient-free CPT-value optimization algorithms that are based on two well-known simulation optimization ideas: simultaneous perturbation stochastic approximation (SPSA) and model-based parameter search (MPS), respectively. We provide theoretical convergence guarantees for all the proposed algorithms and also illustrate the potential of CPT-based criteria in a traffic signal control application. \n
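\n A minimal sketch of an empirical-distribution-based CPT-value estimate (the utility and probability-weighting functions are illustrative choices, not the paper's):

import numpy as np

# Hedged sketch: plug-in estimate of the CPT-value
#   C(X) = int w(P(u+(X) > t)) dt  -  int w(P(u-(X) > t)) dt
# using the empirical survival function of the sample.
def weight(p, delta=0.69):
    """Tversky-Kahneman style probability distortion (illustrative choice)."""
    return p**delta / (p**delta + (1 - p)**delta) ** (1 / delta)

def cpt_value_estimate(samples, u_plus=lambda x: np.maximum(x, 0),
                       u_minus=lambda x: np.maximum(-x, 0)):
    samples = np.asarray(samples, dtype=float)
    n = len(samples)

    def half(u):
        vals = np.sort(u(samples))                 # order statistics of u(X) >= 0
        tail = weight(np.arange(n, 0, -1) / n)     # distorted empirical tail probabilities
        gaps = np.diff(np.concatenate(([0.0], vals)))
        return float(np.sum(tail * gaps))          # piecewise-constant integral

    return half(u_plus) - half(u_minus)

samples = np.random.default_rng(0).normal(loc=0.5, scale=1.0, size=10000)
print(cpt_value_estimate(samples))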
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Linearly Relaxed Approximate Linear Program for Markov Decision Processes.\n \n \n \n \n\n\n \n Lakshminarayanan, C.; Bhatnagar, S.; and Szepesvári, C.\n\n\n \n\n\n\n IEEE Transactions on Automatic Control, 63(4): 1185–1191. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"A link\n  \n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{LaBhSze18,\n\tabstract = {Approximate linear programming (ALP) and its variants have been widely applied to Markov Decision Processes (MDPs) with a large number of states. A serious limitation of ALP is that it has an intractable number of constraints, as a result of which constraint approximations are of interest. In this paper, we define a linearly relaxed approximate linear program (LRALP) that has a tractable number of constraints, obtained as positive linear combinations of the original constraints of the ALP. The main contribution is a novel performance bound for LRALP.\n},\n\tauthor = {Lakshminarayanan, C. and Bhatnagar, S. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2018-01},\n\tdate-added = {2018-03-11 16:50:30 +0000},\n\tdate-modified = {2019-07-20 10:14:41 -0600},\n\tjournal = {IEEE Transactions on Automatic Control},\n\tkeywords = {Markov Decision Processes, planning, control},\n\tnumber = {4},\n\tpages = {1185--1191},\n\trating = {0},\n\tread = {Yes},\n\ttitle = {A Linearly Relaxed Approximate Linear Program for Markov Decision Processes},\n\turl_link = {http://ieeexplore.ieee.org/document/8014469/},\n\turl_paper = {2018-lralp-ieee-tac.pdf},\n\tvolume = {63},\n\tyear = {2018}}\n\n
\n
\n\n\n
\n Approximate linear programming (ALP) and its variants have been widely applied to Markov Decision Processes (MDPs) with a large number of states. A serious limitation of ALP is that it has an intractable number of constraints, as a result of which constraint approximations are of interest. In this paper, we define a linearly relaxed approximate linear program (LRALP) that has a tractable number of constraints, obtained as positive linear combinations of the original constraints of the ALP. The main contribution is a novel performance bound for LRALP.\n
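\n In symbols (our notation, which need not match the paper's), with features $\Phi$, state-relevance weights $c$, discount $\alpha$, and per-action rewards and transitions $(g_a, P_a)$, the ALP and its linear relaxation read

\[
\text{ALP:}\;\; \min_{\theta} \; c^{\top}\Phi\theta \;\;\text{s.t.}\;\; \Phi\theta \ge g_a + \alpha P_a \Phi\theta \;\;\forall a,
\qquad
\text{LRALP:}\;\; \min_{\theta} \; c^{\top}\Phi\theta \;\;\text{s.t.}\;\; W^{\top}\!\big(\Phi\theta - g_a - \alpha P_a \Phi\theta\big) \ge 0 \;\;\forall a,
\]

where $W \ge 0$ (entrywise) has only a small number of columns, so each relaxed constraint is a positive linear combination of the original ones.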
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2017\n \n \n (10)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Multi-view Matrix Factorization for Linear Dynamical System Estimation.\n \n \n \n \n\n\n \n Karami, M.; White, M.; Schuurmans, D.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 7092–7101, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Multi-view link\n  \n \n \n \"Multi-view paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KaWhSchSz17,\n\tabstract = {We consider maximum likelihood estimation of linear dynamical systems with generalized-linear observation models. Maximum likelihood is typically considered to be hard in this setting since latent states and transition parameters must be inferred jointly. Given that expectation-maximization does not scale and is prone to local minima, moment-matching approaches from the subspace identification literature have become standard, despite known statistical efficiency issues. In this paper, we instead reconsider likelihood maximization and develop an optimization based strategy for recovering the latent states and transition parameters. Key to the approach is a two-view reformulation of maximum likelihood estimation for linear dynamical systems that enables the use of global optimization algorithms for matrix factorization. We show that the proposed estimation strategy outperforms widely-used identification algorithms such as subspace identification methods, both in terms of accuracy and runtime.},\n\tacceptrate = {678 out of 3270=21\\%},\n\tauthor = {Karami, M. and White, M. and Schuurmans, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {linear dynamics, linear dynamical systems, system identification, two-view model},\n\tpages = {7092--7101},\n\ttitle = {Multi-view Matrix Factorization for Linear Dynamical System Estimation},\n\turl_link = {https://papers.neurips.cc/paper/7284-multi-view-matrix-factorization-for-linear-dynamical-system-estimation},\n\turl_paper = {2017-NeurIPS-lds.pdf},\n\tyear = {2017},\n\tBdsk-Url-1 = {http://papers.neurips.cc/paper/7284-multi-view-matrix-factorization-for-linear-dynamical-system-estimation.pdf}}\n\n
\n
\n\n\n
\n We consider maximum likelihood estimation of linear dynamical systems with generalized-linear observation models. Maximum likelihood is typically considered to be hard in this setting since latent states and transition parameters must be inferred jointly. Given that expectation-maximization does not scale and is prone to local minima, moment-matching approaches from the subspace identification literature have become standard, despite known statistical efficiency issues. In this paper, we instead reconsider likelihood maximization and develop an optimization based strategy for recovering the latent states and transition parameters. Key to the approach is a two-view reformulation of maximum likelihood estimation for linear dynamical systems that enables the use of global optimization algorithms for matrix factorization. We show that the proposed estimation strategy outperforms widely-used identification algorithms such as subspace identification methods, both in terms of accuracy and runtime.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Structured Best Arm Identification with Fixed Confidence.\n \n \n \n \n\n\n \n Huang, R.; Ajallooeian, M.; Szepesvári, C.; and Müller, M.\n\n\n \n\n\n\n In ALT, volume 76, pages 593–616, 10 2017. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Structured link\n  \n \n \n \"Structured paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{pmlr-v76-huang17a,\n\tabstract = {We study the problem of identifying the best action among a set of possible options when the value of each action is given by a mapping from a number of noisy micro-observables in the so-called fixed confidence setting. Our main motivation is the application to minimax game search, which has been a major topic of interest in artificial intelligence. In this paper we introduce an abstract setting to clearly describe the essential properties of the problem. While previous work only considered a two-move-deep game tree search problem, our abstract setting can be applied to the general minimax games where the depth can be non-uniform and arbitrary, and transpositions are allowed. We introduce a new algorithm (LUCB-micro) for the abstract setting, and give its lower and upper sample complexity results. Our bounds recover some previous results, achieved in more limited settings, and also shed further light on how the structure of minimax problems influences sample complexity.},\n\tacceptrate = {33 out of 74=45\\%},\n\tauthor = {Huang, R. and Ajallooeian, M.M. and Szepesv{\\'a}ri, Cs. and M{\\"u}ller, M.},\n\tbooktitle = {ALT},\n\tkeywords = {optimization, best-arm identification, stochastic optimization, bandits, Monte-Carlo tree search},\n\tmonth = {10},\n\tpages = {593--616},\n\tpublisher = {PMLR},\n\ttitle = {Structured Best Arm Identification with Fixed Confidence},\n\turl_link = {http://proceedings.mlr.press/v76/huang17a.html},\n\turl_paper = {2018-alt-bestarm.pdf},\n\tvolume = {76},\n\tyear = {2017},\n\tBdsk-Url-1 = {http://proceedings.mlr.press/v76/huang17a.html}}\n\n
\n
\n\n\n
\n We study the problem of identifying the best action among a set of possible options when the value of each action is given by a mapping from a number of noisy micro-observables in the so-called fixed confidence setting. Our main motivation is the application to minimax game search, which has been a major topic of interest in artificial intelligence. In this paper we introduce an abstract setting to clearly describe the essential properties of the problem. While previous work only considered a two-move-deep game tree search problem, our abstract setting can be applied to the general minimax games where the depth can be non-uniform and arbitrary, and transpositions are allowed. We introduce a new algorithm (LUCB-micro) for the abstract setting, and give its lower and upper sample complexity results. Our bounds recover some previous results, achieved in more limited settings, and also shed further light on how the structure of minimax problems influences sample complexity.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Modular Analysis of Adaptive (Non-)Convex Optimization: Optimism, Composite Objectives, and Variational Bounds.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, volume 76, pages 681–720, 2017. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JoGySz:ALT17,\n\tabstract = {Recently, much work has been done on extending the scope of online learning and incremental stochastic optimization algorithms. In this paper we contribute to this effort in two ways: First, based on a new regret decomposition and a generalization of Bregman divergences, we provide a self-contained, modular analysis of the two workhorses of online learning: (general) adaptive versions of Mirror Descent (MD) and the Follow-the-Regularized-Leader (FTRL) algorithms. The analysis is done with extra care so as not to introduce assumptions not needed in the proofs and allows to combine, in a straightforward way, different algorithmic ideas (e.g., adaptivity, optimism, implicit updates) and learning settings (e.g., strongly convex or composite objectives). This way we are able to reprove, extend and refine a large body of the literature, while keeping the proofs concise. The second contribution is a byproduct of this careful analysis: We present algorithms with improved variational bounds for smooth, composite objectives, including a new family of optimistic MD algorithms with only one projection step per round. Furthermore, we provide a simple extension of adaptive regret bounds to practically relevant non-convex problem settings with essentially no extra effort.},\n\tacceptrate = {33 out of 74=45\\%},\n\tauthor = {Joulani, P. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {online learning, online convex optimization, theory, adversarial setting},\n\tpages = {681--720},\n\tpublisher = {PMLR},\n\ttitle = {A Modular Analysis of Adaptive (Non-)Convex Optimization: Optimism, Composite Objectives, and Variational Bounds},\n\turl_paper = {2017-alt-modol.pdf},\n\tvolume = {76},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n Recently, much work has been done on extending the scope of online learning and incremental stochastic optimization algorithms. In this paper we contribute to this effort in two ways: First, based on a new regret decomposition and a generalization of Bregman divergences, we provide a self-contained, modular analysis of the two workhorses of online learning: (general) adaptive versions of Mirror Descent (MD) and the Follow-the-Regularized-Leader (FTRL) algorithms. The analysis is done with extra care so as not to introduce assumptions not needed in the proofs, and it makes it possible to combine, in a straightforward way, different algorithmic ideas (e.g., adaptivity, optimism, implicit updates) and learning settings (e.g., strongly convex or composite objectives). This way we are able to reprove, extend and refine a large body of the literature, while keeping the proofs concise. The second contribution is a byproduct of this careful analysis: We present algorithms with improved variational bounds for smooth, composite objectives, including a new family of optimistic MD algorithms with only one projection step per round. Furthermore, we provide a simple extension of adaptive regret bounds to practically relevant non-convex problem settings with essentially no extra effort.\n
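\n For orientation (standard textbook forms, our notation), the two updates analyzed are, for losses with (sub)gradients $g_t$, regularizers $R_t$ and decision set $K$,

\[
\text{FTRL:}\;\; x_{t+1} \in \operatorname*{arg\,min}_{x \in K} \Big\langle \sum_{s\le t} g_s, x \Big\rangle + R_t(x),
\qquad
\text{MD:}\;\; x_{t+1} \in \operatorname*{arg\,min}_{x \in K} \langle g_t, x \rangle + B_{R_t}(x, x_t),
\]

where $B_R(x,y) = R(x) - R(y) - \langle \nabla R(y), x - y\rangle$ is the Bregman divergence induced by $R$; optimistic and implicit variants modify the linear term.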
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Following the Leader and Fast Rates in Online Linear Prediction: Curved Constraint Sets and Other Regularities.\n \n \n \n \n\n\n \n Huang, R.; Lattimore, T.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n Journal of Machine Learning Research, 18: 1–31. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"Following link\n  \n \n \n \"Following paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{HuLaGySz17,\n\tabstract = {Follow the leader (FTL) is a simple online learning algorithm that is known to perform well when the loss functions are convex and positively curved. In this paper we ask whether there are other settings when FTL achieves low regret. In particular, we study the fundamental problem of linear prediction over a convex, compact domain with non-empty interior. Amongst other results, we prove that the curvature of the boundary of the domain can act as if the losses were curved: In this case, we prove that as long as the mean of the loss vectors have positive lengths bounded away from zero, FTL enjoys logarithmic regret, while for polytope domains and stochastic data it enjoys finite expected regret. The former result is also extended to strongly convex domains by establishing an equivalence between the strong convexity of sets and the minimum curvature of their boundary, which may be of independent interest. Building on a previously known meta-algorithm, we also get an algorithm that simultaneously enjoys the worst-case guarantees and the smaller regret of FTL when the data is `easy'. Finally, we show that such guarantees are achievable directly (e.g., by the follow the regularized leader algorithm or by a shrinkage-based variant of FTL) when the constraint set is an ellipsoid.},\n\tauthor = {Huang, R. and Lattimore, T. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2017-10},\n\tdate-added = {2018-03-11 12:12:25 +0000},\n\tdate-modified = {2018-03-11 18:35:41 +0000},\n\tjournal = {Journal of Machine Learning Research},\n\tkeywords = {online learning, linear prediction},\n\tpages = {1--31},\n\ttitle = {Following the Leader and Fast Rates in Online Linear Prediction: Curved Constraint Sets and Other Regularities},\n\turl_link = {http://jmlr.org/papers/v18/17-079.html},\n\turl_paper = {2017-JMRL-FTL.pdf},\n\tvolume = {18},\n\tyear = {2017},\n\tBdsk-Url-1 = {http://jmlr.org/papers/v18/17-079.html}}\n\n
\n
\n\n\n
\n Follow the leader (FTL) is a simple online learning algorithm that is known to perform well when the loss functions are convex and positively curved. In this paper we ask whether there are other settings when FTL achieves low regret. In particular, we study the fundamental problem of linear prediction over a convex, compact domain with non-empty interior. Amongst other results, we prove that the curvature of the boundary of the domain can act as if the losses were curved: in this case, we prove that as long as the mean of the loss vectors has positive length bounded away from zero, FTL enjoys logarithmic regret, while for polytope domains and stochastic data it enjoys finite expected regret. The former result is also extended to strongly convex domains by establishing an equivalence between the strong convexity of sets and the minimum curvature of their boundary, which may be of independent interest. Building on a previously known meta-algorithm, we also get an algorithm that simultaneously enjoys the worst-case guarantees and the smaller regret of FTL when the data is `easy'. Finally, we show that such guarantees are achievable directly (e.g., by the follow-the-regularized-leader algorithm or by a shrinkage-based variant of FTL) when the constraint set is an ellipsoid.\n
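\n Concretely, in the linear prediction setting discussed here (our notation), FTL plays

\[
x_{t+1} \in \operatorname*{arg\,min}_{x \in K} \; \sum_{s=1}^{t} \langle f_s, x \rangle \;=\; \operatorname*{arg\,min}_{x \in K} \; \Big\langle \tfrac{1}{t}\sum_{s=1}^{t} f_s,\, x \Big\rangle,
\]

so its behavior is governed by where the running mean of the loss vectors points relative to the boundary of the constraint set $K$, which is why both the curvature of that boundary and the length of the mean enter the regret bounds.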
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Finite Time Bounds for Temporal Difference Learning with Function Approximation: Problems with some ``state-of-the-art'' results.\n \n \n \n \n\n\n \n Lakshminarayanan, C.; and Szepesvári, C.\n\n\n \n\n\n\n 08 2017.\n Technical Report\n\n\n\n
\n\n\n\n \n \n \"Finite paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@unpublished{LaSze17:Rep,\n\tabstract = {Online learning to rank is a core problem in information retrieval and machine learning. Many provably efficient algorithms have been recently proposed for this problem in specific click models. The click model is a model of how the user interacts with a list of documents. Though these results are significant, their impact on practice is limited, because all proposed algorithms are designed for specific click models and lack convergence guarantees in other models. In this work, we propose BatchRank, the first online learning to rank algorithm for a broad class of click models. The class encompasses two most fundamental click models, the cascade and position-based models. We derive a gap-dependent upper bound on the T-step regret of BatchRank and evaluate it on a range of web search queries. We observe that BatchRank outperforms ranked bandits and is more robust than CascadeKL-UCB, an existing algorithm for the cascade model. },\n\tauthor = {Lakshminarayanan, C. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2017-07},\n\tdate-added = {2017-07-30 15:39:59 +0000},\n\tdate-modified = {2017-07-30 15:44:30 +0000},\n\tkeywords = {TD learning, stochastic approximation},\n\tmonth = {08},\n\tnote = {Technical Report},\n\ttitle = {Finite Time Bounds for Temporal Difference Learning with Function Approximation: Problems with some ``state-of-the-art'' results},\n\turl_paper = {TD-issues17.pdf},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Learning to Rank in Stochastic Click Models (long version).\n \n \n \n \n\n\n \n Zoghi, M.; Tunys, T.; Ghavamzadeh, M.; Kveton, B.; Szepesvári, C.; and Wen, Z.\n\n\n \n\n\n\n In ICML, volume 70, of PMLR, pages 4199-4208, 08 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ZoTuGKSzW:ICML17,\n\tabstract = {Online learning to rank is a core problem in information retrieval and machine learning. Many provably efficient algorithms have been recently proposed for this problem in specific click models. The click model is a model of how the user interacts with a list of documents. Though these results are significant, their impact on practice is limited, because all proposed algorithms are designed for specific click models and lack convergence guarantees in other models. In this work, we propose BatchRank, the first online learning to rank algorithm for a broad class of click models. The class encompasses two most fundamental click models, the cascade and position-based models. We derive a gap-dependent upper bound on the T-step regret of BatchRank and evaluate it on a range of web search queries. We observe that BatchRank outperforms ranked bandits and is more robust than CascadeKL-UCB, an existing algorithm for the cascade model. },\n\tacceptrate = {434 out of 1676=26\\%},\n\tauthor = {Zoghi, M. and Tunys, T. and Ghavamzadeh, M. and Kveton, B. and Szepesv{\\'a}ri, Cs. and Wen, Z.},\n\tbooktitle = {ICML},\n\tkeywords = {ranking, online learning, partial information, stochastic online learning, online learning to rank},\n\tmonth = {08},\n\tpages = {4199-4208},\n\tseries = {PMLR},\n\ttitle = {Online Learning to Rank in Stochastic Click Models (long version)},\n\turl_paper = {ranking-icml2017.pdf},\n\tvolume = {70},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n Online learning to rank is a core problem in information retrieval and machine learning. Many provably efficient algorithms have been recently proposed for this problem in specific click models. The click model is a model of how the user interacts with a list of documents. Though these results are significant, their impact on practice is limited, because all proposed algorithms are designed for specific click models and lack convergence guarantees in other models. In this work, we propose BatchRank, the first online learning to rank algorithm for a broad class of click models. The class encompasses two most fundamental click models, the cascade and position-based models. We derive a gap-dependent upper bound on the T-step regret of BatchRank and evaluate it on a range of web search queries. We observe that BatchRank outperforms ranked bandits and is more robust than CascadeKL-UCB, an existing algorithm for the cascade model. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bernoulli Rank-1 Bandits for Click Feedback.\n \n \n \n \n\n\n \n Katariya, S.; Kveton, B.; Szepesvári, C.; Vernade, C.; and Wen, Z.\n\n\n \n\n\n\n In IJCAI, pages 2001–2007, 08 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Bernoulli paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KaKvSzVW:IIJCAI17,\n\tabstract = {The probability that a user will click a search result depends both on its relevance and its position on the results page. The position based model explains this behavior by ascribing to every item an attraction probability, and to every position an examination probability. To be clicked, a result must be both attractive and examined. The probabilities of an item-position pair being clicked thus form the entries of a rank-1 matrix. We propose the learning problem of a Bernoulli rank-1 bandit where at each step, the learning agent chooses a pair of row and column arms, and receives the product of their Bernoulli-distributed values as a reward. This is a special case of the stochastic rank-1 bandit problem considered in recent work that proposed an elimination based algorithm Rank1Elim, and showed that Rank1Elim's regret scales linearly with the number of rows and columns on "benign" instances. These are the instances where the minimum of the average row and column rewards mu is bounded away from zero. The issue with Rank1Elim is that it fails to be competitive with straightforward bandit strategies as mu approaches zero. In this paper we propose Rank1ElimKL, which replaces the crude confidence intervals of Rank1Elim with confidence intervals based on Kullback-Leibler (KL) divergences. With the help of a novel result concerning the scaling of KL divergences we prove that with this change, our algorithm will be competitive no matter the value of mu. Experiments with synthetic data confirm that on benign instances the performance of Rank1ElimKL is significantly better than that of even Rank1Elim. Similarly, experiments with models derived from real-data confirm that the improvements are significant across the board, regardless of whether the data is benign or not.},\n\tacceptrate = {660 out of 2540=26\\%},\n\tauthor = {Katariya, S. and Kveton, B. and Szepesv{\\'a}ri, Cs. and Vernade, C. and Wen, Z.},\n\tbooktitle = {IJCAI},\n\tkeywords = {stochastic bandits, rank-1 bandits, UCB policy},\n\tmonth = {08},\n\tpages = {2001--2007},\n\ttitle = {Bernoulli Rank-1 Bandits for Click Feedback},\n\turl_paper = {rank1klucb-ijcai17.pdf},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n The probability that a user will click a search result depends both on its relevance and its position on the results page. The position based model explains this behavior by ascribing to every item an attraction probability, and to every position an examination probability. To be clicked, a result must be both attractive and examined. The probabilities of an item-position pair being clicked thus form the entries of a rank-1 matrix. We propose the learning problem of a Bernoulli rank-1 bandit where at each step, the learning agent chooses a pair of row and column arms, and receives the product of their Bernoulli-distributed values as a reward. This is a special case of the stochastic rank-1 bandit problem considered in recent work that proposed an elimination based algorithm Rank1Elim, and showed that Rank1Elim's regret scales linearly with the number of rows and columns on \"benign\" instances. These are the instances where the minimum of the average row and column rewards mu is bounded away from zero. The issue with Rank1Elim is that it fails to be competitive with straightforward bandit strategies as mu approaches zero. In this paper we propose Rank1ElimKL, which replaces the crude confidence intervals of Rank1Elim with confidence intervals based on Kullback-Leibler (KL) divergences. With the help of a novel result concerning the scaling of KL divergences we prove that with this change, our algorithm will be competitive no matter the value of mu. Experiments with synthetic data confirm that on benign instances the performance of Rank1ElimKL is significantly better than that of even Rank1Elim. Similarly, experiments with models derived from real-data confirm that the improvements are significant across the board, regardless of whether the data is benign or not.\n
\n\n\n
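The key ingredient described above, replacing crude (Hoeffding-style) confidence intervals with KL-based ones, can be illustrated with a small stand-alone helper. This is a generic KL-UCB upper confidence bound for a Bernoulli mean, not the Rank1ElimKL code itself, and the confidence level and bisection tolerance are arbitrary choices for the example.

```python
import math

def bernoulli_kl(p, q, eps=1e-12):
    """KL divergence d(p, q) between Bernoulli(p) and Bernoulli(q)."""
    p = min(max(p, eps), 1 - eps)
    q = min(max(q, eps), 1 - eps)
    return p * math.log(p / q) + (1 - p) * math.log((1 - p) / (1 - q))

def kl_ucb(mean, pulls, t, tol=1e-6):
    """Largest q >= mean such that pulls * d(mean, q) <= log(t).

    Since d(mean, .) is increasing on [mean, 1], bisection finds it.
    """
    if pulls == 0:
        return 1.0
    level = math.log(max(t, 2))
    lo, hi = mean, 1.0
    while hi - lo > tol:
        mid = (lo + hi) / 2
        if pulls * bernoulli_kl(mean, mid) <= level:
            lo = mid
        else:
            hi = mid
    return lo

if __name__ == "__main__":
    # an arm pulled 100 times with empirical mean 0.05
    print("KL-UCB index:", kl_ucb(mean=0.05, pulls=100, t=1000))
```

Near the boundary of [0, 1] this bound is much tighter than a mean-plus-square-root Hoeffding bound, which is precisely the effect that helps when the parameter mu in the abstract approaches zero.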
\n\n\n
\n \n\n \n \n \n \n \n \n Unsupervised Sequential Sensor Acquisition (long version).\n \n \n \n \n\n\n \n Hanawal, M.; Szepesvári, C.; and Saligrama, V.\n\n\n \n\n\n\n In AISTATS, volume 54, of PMLR, pages 803–811, 04 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Unsupervised paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HaSzeSa:AISTATS17,\n\tabstract = {In many security and healthcare systems a sequence of sensors/tests are used for detection and diagnosis. Each test outputs a prediction of the latent state, and carries with it inherent costs. Our objective is to learn strategies for selecting tests to optimize accuracy and costs. Unfortunately it is often impossible to acquire in-situ ground truth annotations and we are left with the problem of unsupervised sensor selection (USS). We pose USS as a version of stochastic partial monitoring problem with an unusual reward structure (even noisy annotations are unavailable). Unsurprisingly no learner can achieve sublinear regret without further assumptions. To this end we propose the notion of weak-dominance. This is a condition on the joint probability distribution of test outputs and latent state and says that whenever a test is accurate on an example, a later test in the sequence is likely to be accurate as well.},\n\tacceptrate = {168 out of 530=32\\%},\n\tauthor = {Hanawal, M. and Szepesv{\\'a}ri, Cs. and Saligrama, V.},\n\tbooktitle = {AISTATS},\n\tkeywords = {stochastic bandits, unsupervised learning, stochastic partial monitoring, cascaded sensor selection, optimal stopping},\n\tmonth = {04},\n\tpages = {803--811},\n\tseries = {PMLR},\n\ttitle = {Unsupervised Sequential Sensor Acquisition (long version)},\n\turl_paper = {sensor_AISTATS17.pdf},\n\tvolume = {54},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n In many security and healthcare systems a sequence of sensors/tests are used for detection and diagnosis. Each test outputs a prediction of the latent state, and carries with it inherent costs. Our objective is to learn strategies for selecting tests to optimize accuracy and costs. Unfortunately it is often impossible to acquire in-situ ground truth annotations and we are left with the problem of unsupervised sensor selection (USS). We pose USS as a version of stochastic partial monitoring problem with an unusual reward structure (even noisy annotations are unavailable). Unsurprisingly no learner can achieve sublinear regret without further assumptions. To this end we propose the notion of weak-dominance. This is a condition on the joint probability distribution of test outputs and latent state and says that whenever a test is accurate on an example, a later test in the sequence is likely to be accurate as well.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The End of Optimism? An Asymptotic Analysis of Finite-Armed Linear Bandits (long version).\n \n \n \n \n\n\n \n Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, volume 54, of PMLR, pages 728–737, 04 2017. \n \n\n\n\n
\n\n\n\n \n \n \"The paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaSze17:AISTATS,\n\tabstract = {Stochastic linear bandits are a natural and simple generalisation of finite-armed bandits with numerous practical applications. Current approaches focus on generalising existing techniques for finite-armed bandits, notably the optimism principle and Thompson sampling. Prior analysis has mostly focussed on the worst-case setting. We analyse the asymptotic regret and show matching upper and lower bounds on what is achievable. Surprisingly, our results show that no algorithm based on optimism or Thompson sampling will ever achieve the optimal rate. In fact, they can be arbitrarily far from optimal, even in very simple cases. This is a disturbing result because these techniques are standard tools that are widely used for sequential optimisation, for example, generalised linear bandits and reinforcement learning. },\n\tacceptrate = {168 out of 530=32\\%},\n\tauthor = {Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {bandits, structured bandits, stochastic bandits, linear bandits, asymptotic optimality, asymptotic regret, instance dependent regret},\n\tmonth = {04},\n\tpages = {728--737},\n\tseries = {PMLR},\n\ttitle = {The End of Optimism? An Asymptotic Analysis of Finite-Armed Linear Bandits (long version)},\n\turl_paper = {linbandits_aistats17.pdf},\n\tvolume = {54},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n Stochastic linear bandits are a natural and simple generalisation of finite-armed bandits with numerous practical applications. Current approaches focus on generalising existing techniques for finite-armed bandits, notably the optimism principle and Thompson sampling. Prior analysis has mostly focussed on the worst-case setting. We analyse the asymptotic regret and show matching upper and lower bounds on what is achievable. Surprisingly, our results show that no algorithm based on optimism or Thompson sampling will ever achieve the optimal rate. In fact, they can be arbitrarily far from optimal, even in very simple cases. This is a disturbing result because these techniques are standard tools that are widely used for sequential optimisation, for example, generalised linear bandits and reinforcement learning. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Stochastic Rank-1 Bandits (long version).\n \n \n \n \n\n\n \n Katariya, S.; Kveton, B.; Szepesvári, C.; Vernade, C.; and Wen, Z.\n\n\n \n\n\n\n In AISTATS, volume 54, of PMLR, pages 392–401, 04 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Stochastic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KaKveSzVW:2017,\n\tabstract = {We propose stochastic rank-1 bandits, a class of online learning problems where at each step a learning agent chooses a pair of row and column arms, and receives the product of their values as a reward. The main challenge of the problem is that the individual values of the row and column are unobserved. We assume that these values are stochastic and drawn independently. We propose a computationally-efficient algorithm for solving our problem, which we call Rank1Elim. We derive a O((K + L) (1/Delta) log n) upper bound on its n-step regret, where K is the number of rows, L is the number of columns, and Delta is the minimum of the row and column gaps; under the assumption that the mean row and column rewards are bounded away from zero. To the best of our knowledge, we present the first bandit algorithm that finds the maximum entry of a rank-1 matrix whose regret is linear in K + L, 1/Delta, and log n. We also derive a nearly matching lower bound. Finally, we evaluate Rank1Elim empirically on multiple problems. We observe that it leverages the structure of our problems and can learn near-optimal solutions even if our modeling assumptions are mildly violated.\n},\n\tacceptrate = {168 out of 530=32\\%},\n\tauthor = {Katariya, S. and Kveton, B. and Szepesv{\\'a}ri, Cs. and Vernade, C. and Wen, Z.},\n\tbooktitle = {AISTATS},\n\tkeywords = {bandits, structured bandits, stochastic bandits, matrix bandits, linear bandits},\n\tmonth = {04},\n\tpages = {392--401},\n\tseries = {PMLR},\n\ttitle = {Stochastic Rank-1 Bandits (long version)},\n\turl_paper = {rank1aistats17.pdf},\n\tvolume = {54},\n\tyear = {2017}}\n\n
\n
\n\n\n
\n We propose stochastic rank-1 bandits, a class of online learning problems where at each step a learning agent chooses a pair of row and column arms, and receives the product of their values as a reward. The main challenge of the problem is that the individual values of the row and column are unobserved. We assume that these values are stochastic and drawn independently. We propose a computationally-efficient algorithm for solving our problem, which we call Rank1Elim. We derive a O((K + L) (1/Delta) log n) upper bound on its n-step regret, where K is the number of rows, L is the number of columns, and Delta is the minimum of the row and column gaps; under the assumption that the mean row and column rewards are bounded away from zero. To the best of our knowledge, we present the first bandit algorithm that finds the maximum entry of a rank-1 matrix whose regret is linear in K + L, 1/Delta, and log n. We also derive a nearly matching lower bound. Finally, we evaluate Rank1Elim empirically on multiple problems. We observe that it leverages the structure of our problems and can learn near-optimal solutions even if our modeling assumptions are mildly violated. \n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2016\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n SDP Relaxation with Randomized Rounding for Energy Disaggregation.\n \n \n \n \n\n\n \n Shaloudegi, K.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 4979–4987, 09 2016. \n \n\n\n\n
\n\n\n\n \n \n \"SDP paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ShGySze16,\n\tabstract = {We develop a scalable, computationally efficient method for the task of energy disaggregation for home appliance monitoring. In this problem the goal is to estimate the energy consumption of each appliance over time based on the total energy-consumption signal of a household. The current state of the art is to model the problem as inference in factorial HMMs, and use quadratic programming to find an approximate solution to the resulting quadratic integer program. Here we take a more principled approach, better suited to integer programming problems, and find an approximate optimum by combining convex semidefinite relaxations and randomized rounding, as well as a scalable ADMM method that exploits the special structure of the resulting semidefinite program. Simulation results on both synthetic and real-world datasets demonstrate the superiority of our method.\n},\n\tacceptrate = {568 out of 2500=23\\%},\n\tauthor = {Shaloudegi, K. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {factorial hidden Markov model, inference, relaxation, optimization, energy disaggregation},\n\tmonth = {09},\n\tpages = {4979--4987},\n\ttitle = {SDP Relaxation with Randomized Rounding for Energy Disaggregation},\n\turl_paper = {NeurIPS16-NILM.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We develop a scalable, computationally efficient method for the task of energy disaggregation for home appliance monitoring. In this problem the goal is to estimate the energy consumption of each appliance over time based on the total energy-consumption signal of a household. The current state of the art is to model the problem as inference in factorial HMMs, and use quadratic programming to find an approximate solution to the resulting quadratic integer program. Here we take a more principled approach, better suited to integer programming problems, and find an approximate optimum by combining convex semidefinite relaxations and randomized rounding, as well as a scalable ADMM method that exploits the special structure of the resulting semidefinite program. Simulation results on both synthetic and real-world datasets demonstrate the superiority of our method. \n
\n\n\n
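To make the "relaxation plus randomized rounding" step concrete, here is a generic Goemans-Williamson-style hyperplane rounding of a PSD matrix for a binary quadratic objective. It is only a sketch of the rounding idea on made-up data, not the paper's factorial-HMM disaggregation pipeline or its ADMM solver.

```python
import numpy as np

def randomized_rounding(X, objective, n_rounds=100, seed=0):
    """Round an SDP solution X (PSD, unit diagonal) to +/-1 variables.

    Factor X = V V^T, draw random hyperplanes g, round row i of V to
    sign(<v_i, g>), and keep the best assignment found under `objective`.
    """
    rng = np.random.default_rng(seed)
    evals, evecs = np.linalg.eigh(X)
    V = evecs * np.sqrt(np.clip(evals, 0.0, None))   # X = V V^T
    best_x, best_val = None, -np.inf
    for _ in range(n_rounds):
        x = np.sign(V @ rng.normal(size=V.shape[1]))
        x[x == 0] = 1.0
        val = objective(x)
        if val > best_val:
            best_x, best_val = x, val
    return best_x, best_val

if __name__ == "__main__":
    rng = np.random.default_rng(1)
    W = rng.normal(size=(8, 8)); W = (W + W.T) / 2   # toy quadratic objective
    # in a real pipeline X would come from an SDP solver; the identity is
    # used here only as a feasible stand-in with unit diagonal
    x, val = randomized_rounding(np.eye(8), lambda v: float(v @ W @ v))
    print(x, val)
```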
\n\n\n
\n \n\n \n \n \n \n \n \n Following the Leader and Fast Rates in Linear Prediction: Curved Constraint Sets and Other Regularities (extended version).\n \n \n \n \n\n\n \n Huang, R.; Lattimore, T.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 4970–4978, 09 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Following paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HuLaGySze16,\n\tabstract = {The follow the leader (FTL) algorithm, perhaps the simplest of all online learning algorithms, is known to perform well when the loss functions it is used on are positively curved. In this paper we ask whether there are other ``lucky'' settings when FTL achieves sublinear, ``small'' regret. In particular, we study the fundamental problem of linear prediction over a non-empty convex, compact domain. Amongst other results, we prove that the curvature of  the boundary of the domain can act as if the losses were curved: In this case, we prove that as long as  the mean of the loss vectors have positive lengths bounded away from zero,  FTL enjoys a logarithmic growth rate of regret, while, e.g., for polyhedral domains and stochastic data it enjoys finite expected regret. Building on a previously known meta-algorithm, we also get an algorithm that simultaneously enjoys the worst-case guarantees and the bound available for FTL.},\n\tacceptrate = {568 out of 2500=23\\%},\n\tauthor = {Huang, R. and Lattimore, T. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {adversarial setting, online learning, follow-the-leader, adaptivity, theory},\n\tmonth = {09},\n\tpages = {4970--4978},\n\ttitle = {Following the Leader and Fast Rates in Linear Prediction: Curved Constraint Sets and Other Regularities (extended version)},\n\turl_paper = {NeurIPS16_FTL.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n The follow the leader (FTL) algorithm, perhaps the simplest of all online learning algorithms, is known to perform well when the loss functions it is used on are positively curved. In this paper we ask whether there are other ``lucky'' settings when FTL achieves sublinear, ``small'' regret. In particular, we study the fundamental problem of linear prediction over a non-empty convex, compact domain. Amongst other results, we prove that the curvature of the boundary of the domain can act as if the losses were curved: In this case, we prove that as long as the mean of the loss vectors have positive lengths bounded away from zero, FTL enjoys a logarithmic growth rate of regret, while, e.g., for polyhedral domains and stochastic data it enjoys finite expected regret. Building on a previously known meta-algorithm, we also get an algorithm that simultaneously enjoys the worst-case guarantees and the bound available for FTL.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n (Bandit) Convex Optimization with Biased Noisy Gradient Oracles.\n \n \n \n \n\n\n \n Hu, X.; L.A., P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 819–828, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"(Bandit) paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HPGySz16:BCO,\n\tabstract = {Algorithms for bandit convex optimization and online learning often rely on constructing noisy gradient estimates, which are then used in appropriately adjusted first-order algorithms, replacing actual gradients. Depending on the properties of the function to be optimized and the nature of ``noise'' in the bandit feedback, the bias and variance of gradient estimates exhibit various tradeoffs. In this paper we propose a novel framework that replaces the specific gradient estimation methods with an abstract oracle. With the help of the new framework we unify previous works,\n reproducing  their results in a clean and concise fashion, while, perhaps more importantly, the framework also allows us to formally show that to achieve the optimal root-n rate either the algorithms that use existing gradient estimators, or the proof techniques used to analyze them have to go beyond what exists today.},\n\tacceptrate = {165 out of 537=31\\%},\n\tauthor = {Hu, X. and Prashanth L.A. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {bandits, online convex optimization, online learning, zeroth order optimization, noisy optimization},\n\tpages = {819--828},\n\ttitle = {(Bandit) Convex Optimization with Biased Noisy Gradient Oracles},\n\turl_paper = {AISTAT16-BGO.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n Algorithms for bandit convex optimization and online learning often rely on constructing noisy gradient estimates, which are then used in appropriately adjusted first-order algorithms, replacing actual gradients. Depending on the properties of the function to be optimized and the nature of ``noise'' in the bandit feedback, the bias and variance of gradient estimates exhibit various tradeoffs. In this paper we propose a novel framework that replaces the specific gradient estimation methods with an abstract oracle. With the help of the new framework we unify previous works, reproducing their results in a clean and concise fashion, while, perhaps more importantly, the framework also allows us to formally show that to achieve the optimal root-n rate either the algorithms that use existing gradient estimators, or the proof techniques used to analyze them have to go beyond what exists today.\n
\n\n\n
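A minimal sketch of the kind of gradient estimate the framework abstracts over: the classical two-point spherical-smoothing estimator (one of several estimators covered by the oracle model; the toy objective and the smoothing radius are arbitrary choices for the demo).

```python
import numpy as np

def two_point_gradient_estimate(f, x, delta, rng):
    """Two-point spherical gradient estimate from bandit (function-value)
    feedback: unbiased for the gradient of a delta-smoothed version of f,
    hence a slightly biased estimate of the gradient of f itself.
    """
    u = rng.normal(size=len(x))
    u /= np.linalg.norm(u)                     # uniform direction on the sphere
    return (len(x) / (2.0 * delta)) * (f(x + delta * u) - f(x - delta * u)) * u

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    f = lambda z: float(np.sum(z ** 2))        # toy smooth convex objective
    x = np.array([1.0, -2.0])
    est = np.mean([two_point_gradient_estimate(f, x, 0.01, rng)
                   for _ in range(20000)], axis=0)
    print(est)                                 # concentrates around grad f(x) = 2x
```

Shrinking delta reduces the smoothing bias but, when only noisy single evaluations are available, inflates the variance; that bias-variance tension is exactly what the abstract's oracle formalizes.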
\n\n\n
\n \n\n \n \n \n \n \n \n Shifting Regret, Mirror Descent, and Matrices.\n \n \n \n \n\n\n \n György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 2943–2951, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Shifting paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{GySz16:MirrorDescent,\n\tabstract = {We consider the problem of online prediction in changing environments. In this framework the performance of a predictor is evaluated as the loss relative to an arbitrarily changing predictor, whose individual components come from a base class of predictors. Typical results in the literature consider different base classes (experts, linear predictors on the simplex, etc.) separately. Introducing an arbitrary mapping inside the mirror descent algorithm, we provide a framework that unifies and extends existing results. As an example, we prove new shifting regret bounds for matrix prediction problems.},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {online learning, mirror descent, twisted mirror descent, nonstationary prediction},\n\tpages = {2943--2951},\n\ttitle = {Shifting Regret, Mirror Descent, and Matrices},\n\turl_paper = {ICML16-SHIFT.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We consider the problem of online prediction in changing environments. In this framework the performance of a predictor is evaluated as the loss relative to an arbitrarily changing predictor, whose individual components come from a base class of predictors. Typical results in the literature consider different base classes (experts, linear predictors on the simplex, etc.) separately. Introducing an arbitrary mapping inside the mirror descent algorithm, we provide a framework that unifies and extends existing results. As an example, we prove new shifting regret bounds for matrix prediction problems.\n
\n\n\n
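For readers who want a concrete anchor, the following is a textbook instance of mirror descent (the entropic mirror map on the probability simplex, i.e. the exponentiated-gradient update); it is not the twisted, mapping-based variant introduced in the paper, and the step size and loss sequence are invented for the demo.

```python
import numpy as np

def exponentiated_gradient(loss_grads, eta=0.1):
    """Mirror descent on the simplex with the entropic mirror map:
    w <- w * exp(-eta * g), renormalized after every round."""
    d = len(loss_grads[0])
    w = np.full(d, 1.0 / d)            # uniform starting point
    for g in loss_grads:
        w = w * np.exp(-eta * np.asarray(g))
        w = w / w.sum()                # projection back onto the simplex
    return w

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    # coordinate 0 has the smallest average gradient, so it should win
    grads = rng.normal(size=(500, 3)) + np.array([0.0, 0.2, 0.4])
    print(exponentiated_gradient(list(grads)))
```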
\n\n\n
\n \n\n \n \n \n \n \n \n Cumulative Prospect Theory Meets Reinforcement Learning: Prediction and Control.\n \n \n \n \n\n\n \n L.A., P.; Jie, C.; Fu, M.; Marcus, S.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1406–1415, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Cumulative paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PJFMSz16:CPT,\n\tabstract = {Cumulative prospect theory (CPT) is known to model human decisions well, with substantial empirical evidence supporting this claim.\nCPT works by distorting probabilities and is more general than the classic expected utility and coherent risk measures. We bring this idea to a risk-sensitive reinforcement learning (RL) setting and design algorithms for both estimation and control. The RL setting presents two particular challenges when CPT is applied: estimating the CPT objective requires estimations of the entire distribution of the value function and finding a randomized optimal policy. The estimation scheme that we propose uses the empirical distribution to estimate the CPT-value of a random variable. We then use this scheme in the inner loop of a CPT-value optimization procedure that is based on the well-known simulation optimization idea of simultaneous perturbation stochastic approximation (SPSA). We provide theoretical convergence guarantees for all the proposed algorithms and also illustrate the usefulness of CPT-based criteria in a traffic signal control application.},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Prashanth L.A. and Jie, C. and Fu, M. and Marcus, S. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {cumulative prospect theory, reinforcement learning, SPSA, risk-sensitive criteria,},\n\tpages = {1406--1415},\n\ttitle = {Cumulative Prospect Theory Meets Reinforcement Learning: Prediction and Control},\n\turl_paper = {ICML16-CPT.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n Cumulative prospect theory (CPT) is known to model human decisions well, with substantial empirical evidence supporting this claim. CPT works by distorting probabilities and is more general than the classic expected utility and coherent risk measures. We bring this idea to a risk-sensitive reinforcement learning (RL) setting and design algorithms for both estimation and control. The RL setting presents two particular challenges when CPT is applied: estimating the CPT objective requires estimations of the entire distribution of the value function and finding a randomized optimal policy. The estimation scheme that we propose uses the empirical distribution to estimate the CPT-value of a random variable. We then use this scheme in the inner loop of a CPT-value optimization procedure that is based on the well-known simulation optimization idea of simultaneous perturbation stochastic approximation (SPSA). We provide theoretical convergence guarantees for all the proposed algorithms and also illustrate the usefulness of CPT-based criteria in a traffic signal control application.\n
\n\n\n
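The estimation scheme mentioned above admits a compact plug-in form. The sketch below estimates the CPT value of a nonnegative reward with the identity utility, i.e. the quantity V = integral of w(P(X > t)) dt, from the sorted empirical distribution; the Tversky-Kahneman weighting function and its parameter are assumptions made only for illustration, and the loss side of the CPT functional is omitted.

```python
import numpy as np

def tk_weight(p, eta=0.61):
    """Illustrative probability-weighting (distortion) function."""
    p = np.asarray(p, dtype=float)
    return p ** eta / (p ** eta + (1 - p) ** eta) ** (1.0 / eta)

def cpt_value_estimate(samples, weight=tk_weight):
    """Plug-in estimate of integral_0^inf w(P(X > t)) dt for X >= 0:
    sort the samples and sum x_(i) * [w((n-i+1)/n) - w((n-i)/n)]."""
    x = np.sort(np.asarray(samples, dtype=float))
    n = len(x)
    i = np.arange(1, n + 1)
    increments = weight((n - i + 1) / n) - weight((n - i) / n)
    return float(np.sum(x * increments))

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    rewards = rng.exponential(scale=1.0, size=100000)
    print(cpt_value_estimate(rewards, weight=lambda p: p))  # no distortion: ~ mean = 1.0
    print(cpt_value_estimate(rewards))                      # distorted CPT value
```

With the identity weighting the estimator reduces to the sample mean, which gives a quick sanity check on the implementation.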
\n\n\n
\n \n\n \n \n \n \n \n \n Conservative Bandits.\n \n \n \n \n\n\n \n Shariff, R.; Wu, Y.; Lattimore, T.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1254–1262, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Conservative paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SWLSz16:Conservative,\n\tabstract = {  We study a novel multi-armed bandit problem that models the\n  challenge faced by a company wishing to explore new strategies to\n  maximize revenue whilst simultaneously maintaining their revenue\n  above a fixed baseline, uniformly over time.  While previous work\n  addressed the problem under the weaker requirement of maintaining\n  the revenue constraint only at a given fixed time in the future, the\n  algorithms previously proposed are unsuitable due to their design\n  under the more stringent constraints.  We consider both the\n  stochastic and the adversarial settings, where we propose, natural,\n  yet novel strategies and analyze the price for maintaining the\n  constraints.  Amongst other things, we prove both high probability\n  and expectation bounds on the regret, while we also consider both\n  the problem of maintaining the constraints with high probability or\n  expectation.  For the adversarial setting the price of maintaining\n  the constraint appears to be higher, at least for the algorithm\n  considered.  A lower bound is given showing that the algorithm for\n  the stochastic setting is almost optimal.  Empirical results\n  obtained in synthetic environments complement our theoretical\n  findings.},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Shariff, R. and Wu, Y. and Lattimore, T. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, finite-armed bandits, constrained learning},\n\tpages = {1254--1262},\n\ttitle = {Conservative Bandits},\n\turl_paper = {ICML16-cbandit.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We study a novel multi-armed bandit problem that models the challenge faced by a company wishing to explore new strategies to maximize revenue whilst simultaneously maintaining their revenue above a fixed baseline, uniformly over time. While previous work addressed the problem under the weaker requirement of maintaining the revenue constraint only at a given fixed time in the future, the algorithms previously proposed are unsuitable due to their design under the more stringent constraints. We consider both the stochastic and the adversarial settings, where we propose, natural, yet novel strategies and analyze the price for maintaining the constraints. Amongst other things, we prove both high probability and expectation bounds on the regret, while we also consider both the problem of maintaining the constraints with high probability or expectation. For the adversarial setting the price of maintaining the constraint appears to be higher, at least for the algorithm considered. A lower bound is given showing that the algorithm for the stochastic setting is almost optimal. Empirical results obtained in synthetic environments complement our theoretical findings.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Policy Error Bounds for Model-Based Reinforcement Learning with Factored Linear Models.\n \n \n \n \n\n\n \n Pires, B.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 121–151, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Policy paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 10 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PiSze16:FLM,\n\tabstract = {In this paper we study a model-based approach to calculating approximately optimal policies in Markovian Decision Processes. In particular, we derive novel bounds on the loss of using a policy derived from a factored linear model, a class of models which generalize virtually all previous models that come with strong computational guarantees. For the first time in the literature, we derive performance bounds for model-based techniques where the model inaccuracy is measured in weighted norms. Moreover, our bounds show a decreased sensitivity to the discount factor and, unlike similar bounds derived for other approaches, they are insensitive to measure mismatch. Similarly to previous works, our proofs are also based on contraction arguments, but with the main differences that we use carefully constructed norms building on Banach lattices, and the contraction property is only assumed for operators acting on ``compressed'' spaces, thus weakening previous assumptions, while strengthening previous results.\n},\n\tacceptrate = {63 out of 199=32\\%},\n\tauthor = {Pires, B.A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {factored linear models, reinforcement learning, Markov Decision Processes, function approximation, control, planning, control learning, abstraction, model-based RL, pseudo-MDPs},\n\tpages = {121--151},\n\ttitle = {Policy Error Bounds for Model-Based Reinforcement Learning with Factored Linear Models},\n\turl_paper = {COLT16-FLM.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n In this paper we study a model-based approach to calculating approximately optimal policies in Markovian Decision Processes. In particular, we derive novel bounds on the loss of using a policy derived from a factored linear model, a class of models which generalize virtually all previous models that come with strong computational guarantees. For the first time in the literature, we derive performance bounds for model-based techniques where the model inaccuracy is measured in weighted norms. Moreover, our bounds show a decreased sensitivity to the discount factor and, unlike similar bounds derived for other approaches, they are insensitive to measure mismatch. Similarly to previous works, our proofs are also based on contraction arguments, but with the main differences that we use carefully constructed norms building on Banach lattices, and the contraction property is only assumed for operators acting on ``compressed'' spaces, thus weakening previous assumptions, while strengthening previous results. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n DCM Bandits: Learning to Rank with Multiple Clicks.\n \n \n \n \n\n\n \n Katariya, S.; Kveton, B.; Szepesvári, C.; and Wen, Z.\n\n\n \n\n\n\n In ICML, pages 1215–1224, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"DCM paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KKSzw16:DCM,\n\tabstract = {Search engines recommend a list of web pages. The user examines this list, from the first page to the last, and may click on multiple attractive pages. This type of user behavior can be modeled by the dependent click model (DCM). In this work, we propose DCM bandits, an online learning variant of the DCM model where the objective is to maximize the probability of recommending a satisfactory item. The main challenge of our problem is that the learning agent does not observe the reward. It only observes the clicks. This imbalance between the feedback and rewards makes our setting challenging. We propose a computationally-efficient learning algorithm for our problem, which we call dcmKL-UCB; derive gap-dependent upper bounds on its regret under reasonable assumptions; and prove a matching lower bound up to logarithmic factors. We experiment with dcmKL-UCB on both synthetic and real-world problems. Our algorithm outperforms a range of baselines and performs well even when our modeling assumptions are violated. To the best of our knowledge, this is the first regret-optimal online learning algorithm for learning to rank with multiple clicks in a cascade-like model.\n},\n\tacceptrate = {322 out of 1327=24\\%},\n\tauthor = {Katariya, S. and Kveton, B. and Szepesv{\\'a}ri, Cs. and Wen, Z.},\n\tbooktitle = {ICML},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, nonlinear bandits, partial information, cascading bandits, multiclick},\n\tpages = {1215--1224},\n\ttitle = {DCM Bandits: Learning to Rank with Multiple Clicks},\n\turl_paper = {ICML16-DCMBandits.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n Search engines recommend a list of web pages. The user examines this list, from the first page to the last, and may click on multiple attractive pages. This type of user behavior can be modeled by the dependent click model (DCM). In this work, we propose DCM bandits, an online learning variant of the DCM model where the objective is to maximize the probability of recommending a satisfactory item. The main challenge of our problem is that the learning agent does not observe the reward. It only observes the clicks. This imbalance between the feedback and rewards makes our setting challenging. We propose a computationally-efficient learning algorithm for our problem, which we call dcmKL-UCB; derive gap-dependent upper bounds on its regret under reasonable assumptions; and prove a matching lower bound up to logarithmic factors. We experiment with dcmKL-UCB on both synthetic and real-world problems. Our algorithm outperforms a range of baselines and performs well even when our modeling assumptions are violated. To the best of our knowledge, this is the first regret-optimal online learning algorithm for learning to rank with multiple clicks in a cascade-like model. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regularized Policy Iteration with Nonparametric Function Spaces.\n \n \n \n \n\n\n \n Farahmand, A.; Ghavamzadeh, M.; Szepesvári, C.; and Mannor, S.\n\n\n \n\n\n\n JMLR, 17: 1–66. 01 2016.\n \n\n\n\n
\n\n\n\n \n \n \"Regularized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{farahmand2016,\n\tabstract = {We study two regularization-based approximate policy iteration algorithms, namely REG-LSPI and REG-BRM, to solve reinforcement learning and planning problems in discounted Markov Decision Processes with large state and finite action spaces.\nThe core of these algorithms are the regularized extensions of the Least-Squares Temporal Difference (LSTD) learning and Bellman Residual Minimization (BRM), which are used in the algorithms' policy evaluation steps.\nRegularization provides a convenient way to control the complexity of the function space to which the estimated value function belongs and as a result enables us to work with rich nonparametric function spaces.\nWe derive efficient implementations of our methods when the function space is a reproducing kernel Hilbert space.\nWe analyze the statistical properties of REG-LSPI and provide an upper bound on the policy evaluation error and the performance loss of the policy returned by this method. Our bound shows the dependence of the loss on the number of samples, the capacity of the function space, and some intrinsic properties of the underlying Markov Decision Process. The dependence of the policy evaluation bound on the number of samples is minimax optimal. This is the first work that provides such a strong guarantee for a nonparametric approximate policy iteration algorithm.},\n\tauthor = {Farahmand, A.m. and Ghavamzadeh, M. and Szepesv{\\'a}ri, Cs. and Mannor, S.},\n\tdate = {2016-01},\n\tdate-added = {2016-01-09 00:18:01 +0000},\n\tdate-modified = {2016-10-16 14:32:18 +0000},\n\tjournal = {JMLR},\n\tkeywords = {reinforcement learning, regularization, nonparametrics, theory, function approximation, policy iteration},\n\tmonth = {01},\n\tpages = {1--66},\n\ttitle = {Regularized Policy Iteration with Nonparametric Function Spaces},\n\turl_paper = {jmlr15-regrl.pdf},\n\tvolume = {17},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We study two regularization-based approximate policy iteration algorithms, namely REG-LSPI and REG-BRM, to solve reinforcement learning and planning problems in discounted Markov Decision Processes with large state and finite action spaces. The core of these algorithms are the regularized extensions of the Least-Squares Temporal Difference (LSTD) learning and Bellman Residual Minimization (BRM), which are used in the algorithms' policy evaluation steps. Regularization provides a convenient way to control the complexity of the function space to which the estimated value function belongs and as a result enables us to work with rich nonparametric function spaces. We derive efficient implementations of our methods when the function space is a reproducing kernel Hilbert space. We analyze the statistical properties of REG-LSPI and provide an upper bound on the policy evaluation error and the performance loss of the policy returned by this method. Our bound shows the dependence of the loss on the number of samples, the capacity of the function space, and some intrinsic properties of the underlying Markov Decision Process. The dependence of the policy evaluation bound on the number of samples is minimax optimal. This is the first work that provides such a strong guarantee for a nonparametric approximate policy iteration algorithm.\n
\n\n\n
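As a minimal parametric analogue (the paper itself works in a reproducing kernel Hilbert space), ridge-regularized LSTD for policy evaluation with linear features takes one linear solve; the feature matrices and noise level below are synthetic and chosen only for the demo.

```python
import numpy as np

def regularized_lstd(phi, phi_next, rewards, gamma=0.99, reg=1e-2):
    """Ridge-regularized LSTD: solve
        (Phi^T (Phi - gamma * Phi') + reg * I) w = Phi^T r
    so that V(s) ~ phi(s)^T w approximately satisfies the Bellman equation
    of the policy that generated the transitions."""
    A = phi.T @ (phi - gamma * phi_next) + reg * np.eye(phi.shape[1])
    b = phi.T @ rewards
    return np.linalg.solve(A, b)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    n, d = 5000, 10
    phi = rng.normal(size=(n, d))          # features of visited states
    phi_next = rng.normal(size=(n, d))     # features of successor states
    true_w = rng.normal(size=d)
    rewards = (phi - 0.99 * phi_next) @ true_w + 0.1 * rng.normal(size=n)
    print(regularized_lstd(phi, phi_next, rewards) - true_w)   # small residuals
```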
\n\n\n
\n \n\n \n \n \n \n \n \n Delay-Tolerant Online Convex Optimization: Unified Analysis and Adaptive-Gradient Algorithms.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AAAI-2016, pages 1744–1750, 11 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Delay-Tolerant paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JoGySz:AAAI16,\n\tabstract = {We present a unified, black-box-style method for developing and analyzing online convex optimization (OCO) algorithms for full-information online learning in delayed-feedback environments.  Our new, simplified analysis enables us to substantially improve upon previous work  and to solve a number of open problems from the literature. Specifically, we develop and analyze asynchronous AdaGrad-style algorithms from the Follow-the-Regularized-Leader (FTRL) and Mirror-Descent family that, unlike previous works, can handle projections and adapt both to the gradients and the delays, without relying  on  either strong convexity or smoothness of the objective function, or data sparsity. Our unified framework builds on a natural reduction from delayed-feedback to standard (non-delayed) online learning. This reduction, together with recent unification results for OCO algorithms, allows us to analyze the regret of generic FTRL and Mirror-Descent algorithms in the delayed-feedback setting in a unified manner using standard proof techniques. In addition, the reduction is exact and can be used to obtain both upper and lower bounds on the regret in the delayed-feedback setting.\n},\n\tacceptrate = {549 out of 2132=26\\%},\n\tauthor = {Joulani, P. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AAAI-2016},\n\tkeywords = {online learning, online convex optimization, theory, delay, adversarial setting},\n\tmonth = {11},\n\tpages = {1744--1750},\n\ttitle = {Delay-Tolerant Online Convex Optimization: Unified Analysis and Adaptive-Gradient Algorithms},\n\turl_paper = {AAAI16-stable-algs-linear.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We present a unified, black-box-style method for developing and analyzing online convex optimization (OCO) algorithms for full-information online learning in delayed-feedback environments. Our new, simplified analysis enables us to substantially improve upon previous work and to solve a number of open problems from the literature. Specifically, we develop and analyze asynchronous AdaGrad-style algorithms from the Follow-the-Regularized-Leader (FTRL) and Mirror-Descent family that, unlike previous works, can handle projections and adapt both to the gradients and the delays, without relying on either strong convexity or smoothness of the objective function, or data sparsity. Our unified framework builds on a natural reduction from delayed-feedback to standard (non-delayed) online learning. This reduction, together with recent unification results for OCO algorithms, allows us to analyze the regret of generic FTRL and Mirror-Descent algorithms in the delayed-feedback setting in a unified manner using standard proof techniques. In addition, the reduction is exact and can be used to obtain both upper and lower bounds on the regret in the delayed-feedback setting. \n
\n\n\n
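A minimal sketch of the reduction idea, assuming plain projected online gradient descent as the base learner rather than the paper's AdaGrad-style algorithms: the delayed-feedback learner keeps playing the base learner's current point and forwards each gradient to it in the round the feedback finally arrives. The loss and delay distributions are invented for the demo.

```python
import numpy as np
from collections import defaultdict

class OnlineGradientDescent:
    """Non-delayed base learner: projected OGD on the unit Euclidean ball."""
    def __init__(self, dim, lr=0.05, radius=1.0):
        self.x, self.lr, self.radius = np.zeros(dim), lr, radius

    def predict(self):
        return self.x.copy()

    def update(self, grad):
        self.x -= self.lr * grad
        norm = np.linalg.norm(self.x)
        if norm > self.radius:
            self.x *= self.radius / norm     # projection step

def run_with_delays(grads, delays, base):
    """Single-instance reduction (sketch): feed each delayed gradient to the
    base learner in the round it arrives; outstanding feedback stays pending."""
    arrivals = defaultdict(list)
    for t, delay in enumerate(delays):
        arrivals[t + delay].append(t)
    plays = []
    for t in range(len(grads)):
        plays.append(base.predict())
        for s in arrivals[t]:                # feedback from round s arrives now
            base.update(grads[s])
    return plays

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    grads = rng.normal(loc=[0.5, -0.5], scale=1.0, size=(1000, 2))
    delays = rng.integers(0, 20, size=1000)
    plays = run_with_delays(list(grads), list(delays), OnlineGradientDescent(dim=2))
    print(plays[-1])   # approaches the linear-loss minimizer on the ball boundary
```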
\n\n\n
\n \n\n \n \n \n \n \n \n Compressed Conditional Mean Embeddings for Model-Based Reinforcement Learning.\n \n \n \n \n\n\n \n Lever, G.; Shawe-Taylor, J.; Stafford, R.; and Szepesvári, C.\n\n\n \n\n\n\n In AAAI-2016, pages 1779–1787, 11 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Compressed paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LeSTSSz16,\n\tabstract = {We present a model-based approach to solving Markov decision processes (MDPs) in which the system dynamics are learned using conditional mean embeddings (CMEs). This class of methods comes with strong performance guarantees, and enables planning to be performed in an induced finite (pseudo-)MDP, which approximates the MDP, but can be solved exactly using dynamic programming. Two drawbacks of existing methods exist: firstly, the size of the induced finite (pseudo-)MDP scales quadratically with the amount of data used to learn the model, costing much memory and time when planning with the learned model; secondly, learning the CME itself using powerful kernel least-squares is costly -- a second computational bottleneck. We present an algorithm which maintains a rich kernelized CME model class, but solves both problems: firstly we demonstrate that the loss function for the CME model suggests a principled approach to compressing the induced (pseudo-)MDP, leading to faster planning, while maintaining guarantees; secondly we propose to learn the CME model itself using fast sparse-greedy kernel regression well-suited to the RL context. We demonstrate superior performance to existing methods in this class of model-based approaches on a range of MDPs.},\n\tacceptrate = {549 out of 2132=26\\%},\n\tauthor = {Lever, G. and Shawe-Taylor, J. and Stafford, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AAAI-2016},\n\tkeywords = {reinforcement learning, Markov Decision Processes,function approximation, control, planning, control learning, abstraction, model-based RL, pseudo-MDPs},\n\tmonth = {11},\n\tpages = {1779--1787},\n\ttitle = {Compressed Conditional Mean Embeddings for Model-Based Reinforcement Learning},\n\turl_paper = {AAAI16_CompCME4RLfinal.pdf},\n\tyear = {2016}}\n\n
\n
\n\n\n
\n We present a model-based approach to solving Markov decision processes (MDPs) in which the system dynamics are learned using conditional mean embeddings (CMEs). This class of methods comes with strong performance guarantees, and enables planning to be performed in an induced finite (pseudo-)MDP, which approximates the MDP, but can be solved exactly using dynamic programming. Two drawbacks of existing methods exist: firstly, the size of the induced finite (pseudo-)MDP scales quadratically with the amount of data used to learn the model, costing much memory and time when planning with the learned model; secondly, learning the CME itself using powerful kernel least-squares is costly – a second computational bottleneck. We present an algorithm which maintains a rich kernelized CME model class, but solves both problems: firstly we demonstrate that the loss function for the CME model suggests a principled approach to compressing the induced (pseudo-)MDP, leading to faster planning, while maintaining guarantees; secondly we propose to learn the CME model itself using fast sparse-greedy kernel regression well-suited to the RL context. We demonstrate superior performance to existing methods in this class of model-based approaches on a range of MDPs.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2015\n \n \n (14)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Online Learning with Gaussian Payoffs and Side Observations.\n \n \n \n \n\n\n \n Wu, Y.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1360–1368, 09 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WGySz:NeurIPS15,\n\tabstract = {We consider a sequential learning problem with Gaussian payoffs and side observations: after selecting an action i, the learner\nreceives information about the payoff of every action j in the form of Gaussian observations whose mean is the same as the mean payoff, but the variance depends on the pair (i,j) (and may be infinite). The setup allows a more refined information transfer from one action to another than previous partial monitoring setups, including the recently introduced graph-structured feedback case. For the first time in the literature, we provide non-asymptotic problem-dependent lower bounds on the regret of any algorithm, which recover existing asymptotic problem-dependent lower bounds and finite-time minimax lower bounds available in the literature. We also provide algorithms that achieve the problem-dependent lower bound (up to some universal constant factor) or the minimax lower bounds (up to  logarithmic factors). },\n\tacceptrate = {403 out of 1838=22\\%},\n\tauthor = {Wu, Y. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {online learning, partial information, learning with side-observations, minimax bounds, finite-sample bounds, asymptotic optimality, minimax optimality},\n\tmonth = {09},\n\tpages = {1360--1368},\n\ttitle = {Online Learning with Gaussian Payoffs and Side Observations},\n\turl_paper = {NeurIPS15-SideObs.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We consider a sequential learning problem with Gaussian payoffs and side observations: after selecting an action i, the learner receives information about the payoff of every action j in the form of Gaussian observations whose mean is the same as the mean payoff, but the variance depends on the pair (i,j) (and may be infinite). The setup allows a more refined information transfer from one action to another than previous partial monitoring setups, including the recently introduced graph-structured feedback case. For the first time in the literature, we provide non-asymptotic problem-dependent lower bounds on the regret of any algorithm, which recover existing asymptotic problem-dependent lower bounds and finite-time minimax lower bounds available in the literature. We also provide algorithms that achieve the problem-dependent lower bound (up to some universal constant factor) or the minimax lower bounds (up to logarithmic factors). \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Linear Multi-Resource Allocation with Semi-Bandit Feedback.\n \n \n \n \n\n\n \n Lattimore, T.; Crammer, K.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 964–972, 09 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Linear paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaCrSze15,\n\tabstract = {We study an idealised sequential resource allocation problem. In each time step the learner\nchooses an allocation of several resource types between a number of tasks. Assigning more\nresources to a task increases the probability that it is completed. The problem is challenging\nbecause the alignment of the tasks to the resource types is unknown and the feedback is noisy.\nOur main contribution is the new setting and an algorithm with nearly-optimal\nregret analysis. Along the way we draw connections to the problem of minimising regret\nfor stochastic linear bandits with heteroscedastic noise. We also present some new results for stochastic linear\nbandits on the hypercube that significantly improve on existing work, especially in the sparse case.\n},\n\tacceptrate = {403 out of 1838=22\\%},\n\tauthor = {Lattimore, T. and Crammer, K. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {partial information, online learning, theory, stochastic partial monitoring, bandits, resource allocation, linear bandits},\n\tmonth = {09},\n\tpages = {964--972},\n\ttitle = {Linear Multi-Resource Allocation with Semi-Bandit Feedback},\n\turl_paper = {NeurIPS15-mr-bandit.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We study an idealised sequential resource allocation problem. In each time step the learner chooses an allocation of several resource types between a number of tasks. Assigning more resources to a task increases the probability that it is completed. The problem is challenging because the alignment of the tasks to the resource types is unknown and the feedback is noisy. Our main contribution is the new setting and an algorithm with nearly-optimal regret analysis. Along the way we draw connections to the problem of minimising regret for stochastic linear bandits with heteroscedastic noise. We also present some new results for stochastic linear bandits on the hypercube that significantly improve on existing work, especially in the sparse case. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Combinatorial Cascading Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Wen, Z.; Ashkan, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1450–1458, 09 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Combinatorial paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KveWeAshSze15,\n\tabstract = {We consider learning to maximize reward in combinatorial cascading bandits, a new learning setting that unifies cascading and combinatorial bandits. The unification of these frameworks presents unique challenges in the analysis but allows for modeling a rich set of partial monitoring problems, such as learning to route in a communication network to minimize the probability of losing routed packets and recommending diverse items. We propose CombCascade, a computationally-efficient UCB-like algorithm for solving our problem; and derive gap-dependent and gap-free upper bounds on its regret. Our analysis builds on recent results in stochastic combinatorial semi-bandits but also addresses two novel challenges of our learning setting, a non-linear objective and partial observability. We evaluate CombCascade on two real-world problems and demonstrate that it performs well even when our modeling assumptions are violated. We also demonstrate that our setting requires new learning algorithms.\n},\n\tacceptrate = {403 out of 1838=22\\%},\n\tauthor = {Kveton, B. and Wen, Z. and Ashkan, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, nonlinear bandits, partial information, cascading bandits, combinatorial bandits},\n\tmonth = {09},\n\tpages = {1450--1458},\n\ttitle = {Combinatorial Cascading Bandits},\n\turl_paper = {NeurIPS15-CombCascadeBandit.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We consider learning to maximize reward in combinatorial cascading bandits, a new learning setting that unifies cascading and combinatorial bandits. The unification of these frameworks presents unique challenges in the analysis but allows for modeling a rich set of partial monitoring problems, such as learning to route in a communication network to minimize the probability of losing routed packets and recommending diverse items. We propose CombCascade, a computationally-efficient UCB-like algorithm for solving our problem; and derive gap-dependent and gap-free upper bounds on its regret. Our analysis builds on recent results in stochastic combinatorial semi-bandits but also addresses two novel challenges of our learning setting, a non-linear objective and partial observability. We evaluate CombCascade on two real-world problems and demonstrate that it performs well even when our modeling assumptions are violated. We also demonstrate that our setting requires new learning algorithms. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Mixing Time Estimation in Reversible Markov Chains from a Single Sample Path.\n \n \n \n \n\n\n \n Hsu, D.; Kontorovich, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1459–1467, 09 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Mixing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HsuKoSz15,\n\tabstract = {This article provides the first procedure for computing a fully\ndata-dependent interval that traps the mixing time t_mix of a finite\nreversible ergodic Markov chain at a prescribed confidence level.  The\ninterval is computed from a single finite-length sample path from the\nMarkov chain, and does not require the knowledge of any parameters of\nthe chain.  This stands in contrast to previous approaches, which\neither only provide point estimates, or require a reset mechanism, or\nadditional prior knowledge.\nThe interval is constructed around the relaxation time\nt_relax, which is strongly related to the mixing time, and\nthe width of the interval converges to zero roughly\nat a root-n rate, where n is the length of the sample path.\nUpper and lower bounds are given on the number of samples required to\nachieve constant-factor multiplicative accuracy.  The lower bounds\nindicate that, unless further restrictions are placed on the chain, no\nprocedure can achieve this accuracy level before seeing each state at\nleast Omega(t_relax) times on the average.  Finally, future\ndirections of research are identified.\n\n},\n\tacceptrate = {403 out of 1838=22\\%},\n\tauthor = {Hsu, D. and Kontorovich, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {mixing, data-dependent bounds, a posteriori bounds, Markov chains, finite-sample bounds, theory},\n\tmonth = {09},\n\tpages = {1459--1467},\n\ttitle = {Mixing Time Estimation in Reversible Markov Chains from a Single Sample Path},\n\turl_paper = {NeurIPS15_MixingTimeEst.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n This article provides the first procedure for computing a fully data-dependent interval that traps the mixing time t_mix of a finite reversible ergodic Markov chain at a prescribed confidence level. The interval is computed from a single finite-length sample path from the Markov chain, and does not require the knowledge of any parameters of the chain. This stands in contrast to previous approaches, which either only provide point estimates, or require a reset mechanism, or additional prior knowledge. The interval is constructed around the relaxation time t_relax, which is strongly related to the mixing time, and the width of the interval converges to zero roughly at a root-n rate, where n is the length of the sample path. Upper and lower bounds are given on the number of samples required to achieve constant-factor multiplicative accuracy. The lower bounds indicate that, unless further restrictions are placed on the chain, no procedure can achieve this accuracy level before seeing each state at least Omega(t_relax) times on the average. Finally, future directions of research are identified. \n
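A rough illustration of the quantity being trapped: the relaxation time can be point-estimated from the sample path via the spectral gap of the empirical (symmetrized) transition matrix. The Python sketch below, which assumes states labelled 0..n_states-1, reproduces only this plug-in point estimate; the paper's actual contribution, the fully data-dependent confidence interval around it, is not reproduced here.

import numpy as np

def relaxation_time_point_estimate(path, n_states):
    # Plug-in point estimate of t_relax from a single sample path; states are assumed to be
    # labelled 0..n_states-1. Sketch only: the paper's confidence interval is not computed.
    path = np.asarray(path)
    counts = np.zeros((n_states, n_states))
    np.add.at(counts, (path[:-1], path[1:]), 1.0)
    row = counts.sum(axis=1, keepdims=True)
    P_hat = np.divide(counts, row, out=np.full((n_states, n_states), 1.0 / n_states), where=row > 0)
    pi_hat = np.maximum(np.bincount(path, minlength=n_states) / len(path), 1e-12)
    # D^{1/2} P D^{-1/2} is symmetric for a reversible chain; symmetrize against sampling noise.
    S = np.sqrt(pi_hat)[:, None] * P_hat / np.sqrt(pi_hat)[None, :]
    S = (S + S.T) / 2
    lam = np.linalg.eigvalsh(S)                 # eigenvalues in ascending order
    lam_star = max(lam[-2], abs(lam[0]))        # second-largest eigenvalue magnitude
    return 1.0 / (1.0 - lam_star)               # estimated relaxation time t_relax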
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Bayesian Optimal Control of Smoothly Parameterized Systems.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; and Szepesvári, C.\n\n\n \n\n\n\n In UAI, 05 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Bayesian paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 10 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYSz15,\n\tabstract = {We study Bayesian optimal control of a general class of smoothly parameterized Markov decision problems (MDPs).\nWe propose a lazy version of the so-called posterior sampling method, a method that goes back to Thompson and Strens, more recently studied by Osband, Russo and van Roy.\nWhile Osband et al. derived a bound on the (Bayesian) regret of this method for undiscounted total cost episodic, finite state and action problems,\nwe consider the continuing, average cost setting with no cardinality restrictions on the state or action spaces.\nWhile in the episodic setting, it is natural to switch to a new policy at the episode-ends,\nin the continuing average cost framework we must introduce switching points explicitly and in a principled fashion, or the regret could grow linearly.\nOur lazy method introduces these switching points based on monitoring the uncertainty left about the unknown parameter. To develop a suitable and easy-to-compute uncertainty measure,  we introduce a new ``average local smoothness'' condition, which is shown to be satisfied in common examples. Under this, and some additional mild conditions, we derive rate-optimal bounds on the regret of our algorithm.\nOur general approach allows us to use a single algorithm and a single analysis for a wide range of problems, such as finite MDPs or linear quadratic regulation, both being instances of smoothly parameterized MDPs.\nThe effectiveness of our method is illustrated by means of a simulated example.\n},\n\tacceptrate = {100 out of 292=34\\%},\n\tauthor = {Abbasi-Yadkori, Y. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {UAI},\n\tkeywords = {theory, online learning, MDPs, Thompson sampling, reinforcement learning},\n\tmonth = {05},\n\ttitle = {Bayesian Optimal Control of Smoothly Parameterized Systems},\n\turl_paper = {uai15-lazypsrl.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We study Bayesian optimal control of a general class of smoothly parameterized Markov decision problems (MDPs). We propose a lazy version of the so-called posterior sampling method, a method that goes back to Thompson and Strens, more recently studied by Osband, Russo and van Roy. While Osband et al. derived a bound on the (Bayesian) regret of this method for undiscounted total cost episodic, finite state and action problems, we consider the continuing, average cost setting with no cardinality restrictions on the state or action spaces. While in the episodic setting, it is natural to switch to a new policy at the episode-ends, in the continuing average cost framework we must introduce switching points explicitly and in a principled fashion, or the regret could grow linearly. Our lazy method introduces these switching points based on monitoring the uncertainty left about the unknown parameter. To develop a suitable and easy-to-compute uncertainty measure, we introduce a new "average local smoothness" condition, which is shown to be satisfied in common examples. Under this, and some additional mild conditions, we derive rate-optimal bounds on the regret of our algorithm. Our general approach allows us to use a single algorithm and a single analysis for a wide range of problems, such as finite MDPs or linear quadratic regulation, both being instances of smoothly parameterized MDPs. The effectiveness of our method is illustrated by means of a simulated example. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Identifying Good Options under Combinatorially Structured Feedback in Finite Noisy Environments.\n \n \n \n \n\n\n \n Wu, Y.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1283–1291, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WGySz15,\n\tabstract = {We consider the problem of identifying a good option\nout of finite set of options under combinatorially structured, noise feedback\nabout the quality of the options\nin a sequential process:\nIn each round, a subset of the options, from an available set of subsets,\ncan be selected to receive noisy information about the quality of the options in the chosen subset.\nThe goal is to identify the highest quality option, or a group of options\nof the highest quality, with a small error probability, while using\nthe smallest number of measurements.\nThe problem generalizes best-arm identification problems.\nBy extending previous work, we design new algorithms that are shown\nto be able to exploit the combinatorial structure of the problem\nin a nontrivial fashion, while being unimprovable in special cases.\nThe algorithms call a set multi-covering oracle, hence their\nperformance and efficiency is strongly tied to whether\nthe associated set multi-covering problem can be efficiently solved.},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Wu, Y. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {online learning, best-arm identification, noisy optimization},\n\tpages = {1283--1291},\n\ttitle = {On Identifying Good Options under Combinatorially Structured Feedback in Finite Noisy Environments},\n\turl_paper = {ICML15-ExploSetObs.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We consider the problem of identifying a good option out of a finite set of options under combinatorially structured, noisy feedback about the quality of the options in a sequential process: In each round, a subset of the options, from an available set of subsets, can be selected to receive noisy information about the quality of the options in the chosen subset. The goal is to identify the highest quality option, or a group of options of the highest quality, with a small error probability, while using the smallest number of measurements. The problem generalizes best-arm identification problems. By extending previous work, we design new algorithms that are shown to be able to exploit the combinatorial structure of the problem in a nontrivial fashion, while being unimprovable in special cases. The algorithms call a set multi-covering oracle, hence their performance and efficiency are strongly tied to whether the associated set multi-covering problem can be efficiently solved.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Deterministic Independent Component Analysis.\n \n \n \n \n\n\n \n Huang, R.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 2521–2530, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Deterministic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HuGySze15,\n\tabstract = {We study independent component analysis with noisy observations.\nWe present, for the first time in the literature, consistent, polynomial-time algorithms to recover non-Gaussian source signals and the mixing matrix with\na reconstruction error that vanishes at a rate of T^{1/2} using T observations and scales only polynomially with\nthe natural parameters of the problem.\nOur algorithms and analysis also extend to deterministic source signals whose empirical distributions are approximately independent.\n},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Huang, R. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {learning theory, sample complexity, independent component analysis},\n\tpages = {2521--2530},\n\ttitle = {Deterministic Independent Component Analysis},\n\turl_paper = {ICML15-DICA.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n We study independent component analysis with noisy observations. We present, for the first time in the literature, consistent, polynomial-time algorithms to recover non-Gaussian source signals and the mixing matrix with a reconstruction error that vanishes at a rate of T^{-1/2} using T observations and scales only polynomially with the natural parameters of the problem. Our algorithms and analysis also extend to deterministic source signals whose empirical distributions are approximately independent. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Cascading Bandits: Learning to Rank in the Cascade Model.\n \n \n \n \n\n\n \n Kveton, B.; Wen, Z.; Ashkan, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 767–776, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Cascading paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KWASz15:Cascading,\n\tabstract = {The cascade model is a well-established model of user interaction with content. In this work, we propose cascading bandits, a learning variant of the model where the objective is to learn K most attractive items out of L ground items. We cast the problem as a stochastic combinatorial bandit with a non-linear reward function and partially observed weights of items. Both of these are challenging in the context of combinatorial bandits. We propose two computationally-efficient algorithms for our problem, CascadeUCB1 and CascadeKL-UCB, and prove gap-dependent upper bounds on their regret. We also derive a lower bound for cascading bandits and show that it matches the upper bound of CascadeKL-UCB up to a logarithmic factor. Finally, we evaluate our algorithms on synthetic problems. Our experiments demonstrate that the algorithms perform well and robustly even when our modeling assumptions are violated.\n},\n\tacceptrate = {270 out of 1037=26\\%},\n\tauthor = {Kveton, B. and Wen, Z. and Ashkan, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, nonlinear bandits, partial information, cascading bandits},\n\tpages = {767--776},\n\ttitle = {Cascading Bandits: Learning to Rank in the Cascade Model},\n\turl_paper = {ICML15-CascadingBandits.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n The cascade model is a well-established model of user interaction with content. In this work, we propose cascading bandits, a learning variant of the model where the objective is to learn the K most attractive items out of L ground items. We cast the problem as a stochastic combinatorial bandit with a non-linear reward function and partially observed weights of items. Both of these are challenging in the context of combinatorial bandits. We propose two computationally-efficient algorithms for our problem, CascadeUCB1 and CascadeKL-UCB, and prove gap-dependent upper bounds on their regret. We also derive a lower bound for cascading bandits and show that it matches the upper bound of CascadeKL-UCB up to a logarithmic factor. Finally, we evaluate our algorithms on synthetic problems. Our experiments demonstrate that the algorithms perform well and robustly even when our modeling assumptions are violated. \n
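To make the interaction protocol concrete, here is a minimal Python sketch of a CascadeUCB1-style learner talking to a simulated cascade user. The simulator, the constant 1.5 in the confidence radius, and the initialization are illustrative choices, not taken from the paper.

import numpy as np

def cascade_ucb1(attract_prob, K, T, seed=0):
    # attract_prob[i] is the unknown attraction probability of item i (used by the simulator only).
    rng = np.random.default_rng(seed)
    attract_prob = np.asarray(attract_prob, dtype=float)
    L = len(attract_prob)
    pulls, wins = np.zeros(L), np.zeros(L)
    for t in range(1, T + 1):
        # UCBs on attraction probabilities; unobserved items get +inf so each is tried at least once.
        radius = np.sqrt(1.5 * np.log(t) / np.maximum(pulls, 1))
        ucb = np.where(pulls > 0, wins / np.maximum(pulls, 1) + radius, np.inf)
        ranked = np.argsort(-ucb)[:K]                       # recommend the K items with highest UCBs
        clicks = rng.random(K) < attract_prob[ranked]       # simulated user scans items in order
        click_pos = int(np.argmax(clicks)) if clicks.any() else K
        for pos in range(min(click_pos + 1, K)):            # cascade feedback: items up to the click
            i = ranked[pos]
            pulls[i] += 1
            wins[i] += 1.0 if pos == click_pos else 0.0
    return wins / np.maximum(pulls, 1)                      # empirical attraction estimates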
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Fast Cross-Validation for Incremental Learning.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In IJCAI, pages 3597–3604, 04 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Fast paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JoGySz15,\n\tabstract = {Online learning with delayed feedback has received increasing attention recently due to its several applications in distributed, web-based learning problems. In this paper we provide a systematic study of the topic, and analyze how the delay effects the regret of online learning algorithms. Somewhat surprisingly, it turns out that delay increases the regret in a multiplicative way in adversarial problems, and in an additive way in stochastic problems. We give meta-algorithms that transform, in a black-box fashion, algorithms developed for the non-delayed case into ones that can handle the presence of delays in the feedback loop. Modifications of the well-known UCB algorithm are also developed for the bandit problem with delayed feedback, with the advantage over the meta-algorithms that they can be implemented with much lower complexity.},\n\tacceptrate = {575 out of 1996=28.6\\%},\n\tauthor = {Joulani, P. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {IJCAI},\n\tkeywords = {theory, big data, complexity analysis, computation, cross-validation, performance bounds},\n\tmonth = {04},\n\tpages = {3597--3604},\n\ttitle = {Fast Cross-Validation for Incremental Learning},\n\turl_paper = {tree-cv.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n Online learning with delayed feedback has received increasing attention recently due to its many applications in distributed, web-based learning problems. In this paper we provide a systematic study of the topic, and analyze how the delay affects the regret of online learning algorithms. Somewhat surprisingly, it turns out that delay increases the regret in a multiplicative way in adversarial problems, and in an additive way in stochastic problems. We give meta-algorithms that transform, in a black-box fashion, algorithms developed for the non-delayed case into ones that can handle the presence of delays in the feedback loop. Modifications of the well-known UCB algorithm are also developed for the bandit problem with delayed feedback, with the advantage over the meta-algorithms that they can be implemented with much lower complexity.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Decision-theoretic Clustering of Strategies.\n \n \n \n \n\n\n \n Bard, N.; Nicholas, D.; Szepesvári, C.; and Bowling, M. H.\n\n\n \n\n\n\n In AAMAS, pages 17–25, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Decision-theoretic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BaNSzBo15,\n\tabstract = {Clustering agents by their behaviour can be crucial for building effective agent models.  Traditional clustering typically aims to group entities together based on a distance metric, where a desirable clustering is one where the entities in a cluster are spatially close together.  Instead, one may desire to cluster based on actionability, or the capacity for the clusters to suggest how an agent should respond to maximize their utility with respect to the entities.  Segmentation problems examine this decision-theoretic clustering task.  Although finding optimal solutions to these problems is computationally hard, greedy-based approximation algorithms exist.  However, in settings where the agent has a combinatorially large number of candidate responses whose utilities must be considered, these algorithms are often intractable.  In this work, we show that in many cases the utility function can be factored to allow for an efficient greedy algorithm even when there are exponentially large response spaces.  We evaluate our technique theoretically, proving approximation bounds, and empirically using extensive-form games by clustering opponent strategies in toy poker games.  Our results demonstrate that these techniques yield dramatically improved clusterings compared to a traditional distance-based clustering approach in terms of both subjective quality and utility obtained by responding to the clusters.\n},\n\tacceptrate = {167 out of 670=25\\%},\n\tauthor = {Bard, N. and Nicholas, D. and Szepesv{\\'a}ri, Cs. and Bowling, M. H.},\n\tbooktitle = {AAMAS},\n\tkeywords = {clustering, decision-theory, poker},\n\tpages = {17--25},\n\ttitle = {Decision-theoretic Clustering of Strategies},\n\turl_paper = {AAMAS15-bard.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n Clustering agents by their behaviour can be crucial for building effective agent models. Traditional clustering typically aims to group entities together based on a distance metric, where a desirable clustering is one where the entities in a cluster are spatially close together. Instead, one may desire to cluster based on actionability, or the capacity for the clusters to suggest how an agent should respond to maximize their utility with respect to the entities. Segmentation problems examine this decision-theoretic clustering task. Although finding optimal solutions to these problems is computationally hard, greedy-based approximation algorithms exist. However, in settings where the agent has a combinatorially large number of candidate responses whose utilities must be considered, these algorithms are often intractable. In this work, we show that in many cases the utility function can be factored to allow for an efficient greedy algorithm even when there are exponentially large response spaces. We evaluate our technique theoretically, proving approximation bounds, and empirically using extensive-form games by clustering opponent strategies in toy poker games. Our results demonstrate that these techniques yield dramatically improved clusterings compared to a traditional distance-based clustering approach in terms of both subjective quality and utility obtained by responding to the clusters. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Near-optimal max-affine estimators for convex regression.\n \n \n \n \n\n\n \n Balázs, G.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 56–64, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Near-optimal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BaGySz15,\n\tabstract = {    This paper considers least squares estimators for regression\n    problems over convex, uniformly bounded, uniformly Lipschitz function\n    classes minimizing the empirical risk over max-affine functions\n    (the maximum of finitely many affine functions).\n    Based on new results on nonlinear nonparametric regression\n    and on the approximation accuracy of max-affine functions,\n    these estimators are proved to achieve the optimal rate of\n    convergence up to logarithmic factors.\n    Preliminary experiments indicate that a simple randomized approximation\n    to the optimal estimator is competitive with state-of-the-art alternatives.\n},\n\tacceptrate = {127 out of 442=29\\%},\n\tauthor = {Bal{\\'a}zs, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {regression, nonparametrics, convex regression},\n\tpages = {56--64},\n\ttitle = {Near-optimal max-affine estimators for convex regression},\n\turl_paper = {AISTAT15-cvxreg.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n This paper considers least squares estimators for regression problems over convex, uniformly bounded, uniformly Lipschitz function classes minimizing the empirical risk over max-affine functions (the maximum of finitely many affine functions). Based on new results on nonlinear nonparametric regression and on the approximation accuracy of max-affine functions, these estimators are proved to achieve the optimal rate of convergence up to logarithmic factors. Preliminary experiments indicate that a simple randomized approximation to the optimal estimator is competitive with state-of-the-art alternatives. \n
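The abstract mentions a simple randomized approximation to the optimal estimator; a common heuristic in that spirit is an alternating partition-and-refit least-squares procedure, sketched below. The number of pieces K, the random initialization, and the iteration count are illustrative assumptions; this is not the paper's estimator.

import numpy as np

def fit_max_affine(X, y, K=5, n_iter=30, seed=0):
    # Alternating partition-and-refit heuristic for f(x) ~= max_k (a_k' x + b_k).
    rng = np.random.default_rng(seed)
    n, d = X.shape
    Xb = np.hstack([X, np.ones((n, 1))])        # affine features [x, 1]
    labels = rng.integers(K, size=n)            # random initial partition of the data
    W = np.zeros((K, d + 1))
    for _ in range(n_iter):
        for k in range(K):
            idx = labels == k
            if idx.sum() >= d + 1:              # refit the k-th affine piece on its cell
                W[k], *_ = np.linalg.lstsq(Xb[idx], y[idx], rcond=None)
        labels = np.argmax(Xb @ W.T, axis=1)    # reassign each point to its active piece
    return W                                    # predict with: np.max(Xb @ W.T, axis=1)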
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Exploiting Symmetries to Construct Efficient MCMC Algorithms with an Application to SLAM.\n \n \n \n \n\n\n \n Shariff, R.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 866–874, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Exploiting paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ShGySz15,\n\tabstract = {  The Metropolis-Hastings (MH) algorithm is a flexible method to\n  generate samples from a target distribution, a key problem in\n  probabilistic inference. In this paper we propose a variation of the\n  MH algorithm based on group moves, where the next state is obtained\n  by first choosing a random transformation of the state space and\n  then applying this transformation to the current state. This adds\n  much-needed flexibility to the ``textbook'' MH algorithm where all\n  measures involved must be given in terms of densities with respect\n  to a common reference measure. Under mild conditions, our main\n  result extends the acceptance probability formula of the textbook\n  algorithm to MH algorithms with group moves. We work out how the new\n  algorithms can be used to exploit a problem's natural symmetries and\n  apply the technique to the simultaneous localization and mapping\n  (SLAM) problem, obtaining the first fully rigorous justification of\n  a previous MCMC-based SLAM method. New experimental results\n  comparing our method to existing state-of-the-art specialized\n  methods on a standard range-only SLAM benchmark problem validate the\n  strength of the approach.\n},\n\tacceptrate = {127 out of 442=29\\%},\n\tauthor = {Shariff, R. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {MCMC, SLAM},\n\tpages = {866--874},\n\ttitle = {Exploiting Symmetries to Construct Efficient MCMC Algorithms with an Application to SLAM},\n\turl_paper = {AISTAT15-MCMC.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n The Metropolis-Hastings (MH) algorithm is a flexible method to generate samples from a target distribution, a key problem in probabilistic inference. In this paper we propose a variation of the MH algorithm based on group moves, where the next state is obtained by first choosing a random transformation of the state space and then applying this transformation to the current state. This adds much-needed flexibility to the "textbook" MH algorithm where all measures involved must be given in terms of densities with respect to a common reference measure. Under mild conditions, our main result extends the acceptance probability formula of the textbook algorithm to MH algorithms with group moves. We work out how the new algorithms can be used to exploit a problem's natural symmetries and apply the technique to the simultaneous localization and mapping (SLAM) problem, obtaining the first fully rigorous justification of a previous MCMC-based SLAM method. New experimental results comparing our method to existing state-of-the-art specialized methods on a standard range-only SLAM benchmark problem validate the strength of the approach. \n
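For orientation, the sketch below shows the textbook Metropolis-Hastings loop with proposals given by random transformations of the current state, under two simplifying assumptions: every transformation preserves volume and each transformation and its inverse are drawn with equal probability, in which case the acceptance ratio reduces to the usual density ratio. The paper's group-move acceptance formula covers the general case; the example transformations are hypothetical.

import numpy as np

def mh_transformation_moves(log_density, x0, transforms, n_steps, seed=0):
    # Simplified sketch: volume-preserving transformations, with each move and its inverse
    # equally likely, so we accept with probability min(1, p(x')/p(x)).
    rng = np.random.default_rng(seed)
    x = np.asarray(x0, dtype=float)
    samples = []
    for _ in range(n_steps):
        g = transforms[rng.integers(len(transforms))]   # pick a random transformation
        x_prop = g(x)
        if rng.random() < np.exp(min(0.0, log_density(x_prop) - log_density(x))):
            x = x_prop
        samples.append(x.copy())
    return np.array(samples)

# Hypothetical transformations of a 2D state: a rotation and a shift, each paired with its inverse.
theta, delta = 0.3, np.array([0.5, -0.2])
R = np.array([[np.cos(theta), -np.sin(theta)], [np.sin(theta), np.cos(theta)]])
transforms = [lambda x: R @ x, lambda x: R.T @ x, lambda x: x + delta, lambda x: x - delta]
chain = mh_transformation_moves(lambda x: -0.5 * np.sum(x ** 2), np.zeros(2), transforms, 5000)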
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Toward Minimax Off-policy Value Estimation.\n \n \n \n \n\n\n \n Li, L.; Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 608–616, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Toward paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LiMuSz15,\n\tabstract = {This paper studies the off-policy evaluation problem, where one aims to estimate the value of a target policy based on a sample of observations collected by another policy.  We first consider the single-state, or multi-armed bandit case, establish a finite-time minimax risk lower bound, and analyze the risk of three standard estimators.\nFor the so-called regression estimator, we show that while it is asymptotically optimal, for small sample sizes it may perform suboptimally compared to an ideal oracle up to a multiplicative factor that depends on the number of actions.\nWe also show that the other two popular estimators can be arbitrarily worse than the optimal, even in the limit of infinitely many data points.\nThe performance of the estimators are studied in synthetic and real problems; illustrating the methods strengths and weaknesses.\nWe also discuss the implications of these results  for off-policy evaluation problems in contextual bandits and fixed-horizon Markov decision processes.\n},\n\tacceptrate = {127 out of 442=29\\%},\n\tauthor = {Li, L. and Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {off-policy learning, bandits, Markov Decision Processes, minimax bounds},\n\tpages = {608--616},\n\ttitle = {Toward Minimax Off-policy Value Estimation},\n\turl_paper = {AISTAT15-OffPolicy.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n This paper studies the off-policy evaluation problem, where one aims to estimate the value of a target policy based on a sample of observations collected by another policy. We first consider the single-state, or multi-armed bandit case, establish a finite-time minimax risk lower bound, and analyze the risk of three standard estimators. For the so-called regression estimator, we show that while it is asymptotically optimal, for small sample sizes it may perform suboptimally compared to an ideal oracle up to a multiplicative factor that depends on the number of actions. We also show that the other two popular estimators can be arbitrarily worse than the optimal, even in the limit of infinitely many data points. The performance of the estimators is studied in synthetic and real problems, illustrating the methods' strengths and weaknesses. We also discuss the implications of these results for off-policy evaluation problems in contextual bandits and fixed-horizon Markov decision processes. \n
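In the multi-armed bandit case, two of the standard estimators can be written in a few lines. The sketch below (names and data layout are assumptions) contrasts the importance-sampling estimator with the regression (plug-in) estimator.

import numpy as np

def off_policy_value_estimates(actions, rewards, behavior_probs, target_probs):
    # actions[t], rewards[t]: logged action and reward; behavior_probs[t] = mu(a_t) under the
    # logging policy mu; target_probs[a] = pi(a) for every action a of the target policy pi.
    actions = np.asarray(actions)
    rewards = np.asarray(rewards, dtype=float)
    behavior_probs = np.asarray(behavior_probs, dtype=float)
    target_probs = np.asarray(target_probs, dtype=float)
    # Importance-sampling estimator: reweight each logged reward by pi(a_t)/mu(a_t).
    is_estimate = float(np.mean(rewards * target_probs[actions] / behavior_probs))
    # Regression (plug-in) estimator: empirical mean reward per action, averaged under pi.
    means = np.zeros(len(target_probs))
    for a in range(len(target_probs)):
        mask = actions == a
        if mask.any():
            means[a] = rewards[mask].mean()
    reg_estimate = float(target_probs @ means)
    return is_estimate, reg_estimate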
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Tight Regret Bounds for Stochastic Combinatorial Semi-Bandits.\n \n \n \n \n\n\n \n Kveton, B.; Wen, Z.; Ashkan, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 535–543, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Tight paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KWASz15,\n\tabstract = {A stochastic combinatorial semi-bandit is an online learning problem where at each step a learning agent chooses a subset of ground items subject to constraints, and then observes stochastic weights of these items and receives their sum as a payoff. In this paper, we close the problem of computationally and sample efficient learning in stochastic combinatorial semi-bandits. In particular, we analyze a UCB-like algorithm for solving the problem, which is known to be computationally efficient; and prove O(K L (1 / Delta) log n) and O( (K L n log n)^{1/2} )$ upper bounds on its n-step regret, where L is the number of ground items, K is the maximum number of chosen items, and Delta is the gap between the expected returns of the optimal and best suboptimal solutions. The gap-dependent bound is tight up to a constant factor and the gap-free bound is tight up to a polylogarithmic factor.},\n\tacceptrate = {127 out of 442=29\\%},\n\tauthor = {Kveton, B. and Wen, Z. and Ashkan, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, linear bandits, combinatorial bandits, semi-bandits},\n\tpages = {535--543},\n\ttitle = {Tight Regret Bounds for Stochastic Combinatorial Semi-Bandits},\n\turl_paper = {AISTAT15-CombBand.pdf},\n\tyear = {2015}}\n\n
\n
\n\n\n
\n A stochastic combinatorial semi-bandit is an online learning problem where at each step a learning agent chooses a subset of ground items subject to constraints, and then observes stochastic weights of these items and receives their sum as a payoff. In this paper, we close the problem of computationally and sample efficient learning in stochastic combinatorial semi-bandits. In particular, we analyze a UCB-like algorithm for solving the problem, which is known to be computationally efficient; and prove O(K L (1 / Delta) log n) and O((K L n log n)^{1/2}) upper bounds on its n-step regret, where L is the number of ground items, K is the maximum number of chosen items, and Delta is the gap between the expected returns of the optimal and best suboptimal solutions. The gap-dependent bound is tight up to a constant factor and the gap-free bound is tight up to a polylogarithmic factor.\n
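A minimal sketch of a UCB-like semi-bandit strategy of this kind is below. The oracle interface, the constant 1.5 in the confidence radius, and the simulator are assumptions; the maximization oracle stands in for whatever combinatorial solver the application provides (a top-K selection is used in the example).

import numpy as np

def comb_ucb(oracle, sample_weights, L, T):
    # oracle(scores) returns a feasible subset of item indices maximizing the sum of the scores;
    # sample_weights(A) returns noisy weights in [0, 1] of the chosen items (semi-bandit feedback).
    pulls, means = np.zeros(L), np.zeros(L)
    for t in range(1, T + 1):
        radius = np.sqrt(1.5 * np.log(t) / np.maximum(pulls, 1))
        ucb = np.where(pulls > 0, means + radius, np.inf)   # optimism: untried items come first
        A = oracle(ucb)
        for i, w in zip(A, sample_weights(A)):
            pulls[i] += 1
            means[i] += (w - means[i]) / pulls[i]           # incremental mean update
    return means

# Example with a top-K oracle and Bernoulli item weights (purely illustrative).
rng = np.random.default_rng(0)
true_w = rng.random(20)
est = comb_ucb(oracle=lambda s: np.argsort(-s)[:5],
               sample_weights=lambda A: (rng.random(len(A)) < true_w[A]).astype(float),
               L=20, T=2000)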
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2014\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Proceedings of The 27th Conference on Learning Theory (COLT 2014).\n \n \n \n \n\n\n \n Balcan, M.; Feldman, V.; and Szepesvári, C.,\n editors.\n \n\n\n \n\n\n\n JMLR.org. 2014.\n \n\n\n\n
\n\n\n\n \n \n \"ProceedingsPaper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n\n\n\n
\n
@proceedings{COLT2014,\n\teditor = {Balcan, M.-F. and Feldman, V. and Szepesv{\\'a}ri, Cs.},\n\tpublisher = {JMLR.org},\n\ttitle = {Proceedings of The 27th Conference on Learning Theory (COLT 2014)},\n\turl = {http://proceedings.mlr.press/v35/},\n\tyear = {2014},\n\tBdsk-Url-1 = {http://proceedings.mlr.press/v35/}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Pseudo-MDPs and Factored Linear Action Models.\n \n \n \n \n\n\n \n Yao, H.; Szepesvári, C.; Pires, B.; and Zhang, X.\n\n\n \n\n\n\n In IEEE ADPRL, pages 189–197, 10 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Pseudo-MDPs paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YaoSze14,\n\tabstract = {In this paper we introduce the concept of pseudo-MDPs\nto develop abstractions.\nPseudo-MDPs relax the requirement that the transition kernel has to be a probability kernel.\nWe show that the new framework captures many existing abstractions.\nWe also introduce the concept of factored linear action models; a special case.\nAgain, the relation of factored linear action models and existing works are discussed.\nWe use the general framework to develop a theory for bounding the suboptimality of policies derived from pseudo-MDPs.\nSpecializing the framework, we recover existing results.\nWe give a least-squares approach and a constrained optimization approach of learning the factored linear model as well as efficient computation methods.\nWe demonstrate that the constrained optimization approach gives better performance than the least-squares approach with normalization.\n},\n\tauthor = {Yao, H. and Szepesv{\\'a}ri, Cs. and Pires, B.A. and Zhang, X.},\n\tbooktitle = {IEEE ADPRL},\n\tkeywords = {factored linear models, reinforcement learning, Markov Decision Processes, function approximation, control, planning, control learning, abstraction, model-based RL, pseudo-MDPs},\n\tmonth = {10},\n\tpages = {189--197},\n\ttitle = {Pseudo-MDPs and Factored Linear Action Models},\n\turl_paper = {ieee_adprl2014.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n In this paper we introduce the concept of pseudo-MDPs to develop abstractions. Pseudo-MDPs relax the requirement that the transition kernel has to be a probability kernel. We show that the new framework captures many existing abstractions. We also introduce the concept of factored linear action models, a special case. Again, the relation of factored linear action models to existing work is discussed. We use the general framework to develop a theory for bounding the suboptimality of policies derived from pseudo-MDPs. Specializing the framework, we recover existing results. We give a least-squares approach and a constrained optimization approach to learning the factored linear model, as well as efficient computation methods. We demonstrate that the constrained optimization approach gives better performance than the least-squares approach with normalization. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Universal Option Models.\n \n \n \n \n\n\n \n Yao, H.; Szepesvári, C.; Sutton, R.; Modayil, J.; and Bhatnagar, S.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 990–998, 09 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Universal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YaoSzeSuMoBha14,\n\tabstract = {We consider the problem of learning models of options for real-time abstract planning, in the setting where reward functions can be specified at any time and their expected returns must be efficiently computed. We introduce a new model for an option that is independent of any reward function, called the {\\it universal option model (UOM)}. We prove that the UOM of an option can construct a traditional option model given a reward function, and the option-conditional return is computed directly by a single dot-product of the UOM with the reward function. We extend the UOM to linear function approximation, and we show it gives the TD solution of option returns and value functions of policies over options. We provide a stochastic approximation algorithm for incrementally learning UOMs from data and prove its consistency. We demonstrate our method in two domains. The first domain is document recommendation, where each user query defines a new reward function and a document's relevance is the expected return of a simulated random-walk through the document's references. The second domain is a real-time strategy game, where the controller must select the best game unit to accomplish dynamically-specified tasks. Our experiments show that UOMs are substantially more efficient in evaluating option returns and policies than previously known methods.},\n\tacceptrate = {414 out of 1678=25\\%},\n\tauthor = {Yao, H. and Szepesv{\\'a}ri, Cs. and Sutton, R.S. and Modayil, J. and Bhatnagar, S.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {reinforcement learning, Markov Decision Processes,function approximation, control, planning, control learning, temporal difference learning, LSTD},\n\tmonth = {09},\n\tpages = {990--998},\n\ttitle = {Universal Option Models},\n\turl_paper = {lamapi.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n We consider the problem of learning models of options for real-time abstract planning, in the setting where reward functions can be specified at any time and their expected returns must be efficiently computed. We introduce a new model for an option that is independent of any reward function, called the universal option model (UOM). We prove that the UOM of an option can construct a traditional option model given a reward function, and the option-conditional return is computed directly by a single dot-product of the UOM with the reward function. We extend the UOM to linear function approximation, and we show it gives the TD solution of option returns and value functions of policies over options. We provide a stochastic approximation algorithm for incrementally learning UOMs from data and prove its consistency. We demonstrate our method in two domains. The first domain is document recommendation, where each user query defines a new reward function and a document's relevance is the expected return of a simulated random-walk through the document's references. The second domain is a real-time strategy game, where the controller must select the best game unit to accomplish dynamically-specified tasks. Our experiments show that UOMs are substantially more efficient in evaluating option returns and policies than previously known methods.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Sequential Learning for Multi-channel Wireless Network Monitoring with Channel Switching Costs.\n \n \n \n \n\n\n \n Le, T.; Zheng, R.; and Szepesvári, C.\n\n\n \n\n\n\n IEEE Transactions on Signal Processing, 62: 5919–5929. 09 2014.\n \n\n\n\n
\n\n\n\n \n \n \"Sequential paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{LeSzeZh14,\n\tabstract = {We consider the problem of optimally assigning p sniffers to K channels to\nmonitor the transmission activities in a multi-channel wireless network with\nswitching costs. The activity of users is initially unknown to the sniffers and\nis to be learned along with channel assignment decisions to maximize the\nbenefits of this assignment, resulting in the fundamental trade-off between\nexploration and exploitation. Switching costs are incurred when sniffers change\ntheir channel assignments. As a result, frequent changes are undesirable. We\nformulate the sniffer-channel assignment with switching costs as a linear\npartial monitoring problem, a super-class of multi-armed bandits. As the number\nof arms (sniffer-channel assignments) is exponential, novel techniques are\ncalled for, to allow efficient learning.  We use the linear bandit model to\ncapture the dependency amongst the arms and develop a policy that takes\nadvantage of this dependency. We prove the proposed Upper Confident Bound-based\n(UCB) policy enjoys a logarithmic regret bound in time t that depends\nsub-linearly on the number of arms, while its total switching cost grows with log(log(t)).},\n\tauthor = {Le, T. and Zheng, R. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2014-09-07 09:25:42 -0600},\n\tdate-modified = {2014-12-06 19:50:07 +0000},\n\tjournal = {IEEE Transactions on Signal Processing},\n\tkeywords = {local area networks, network monitoring, theory, networking, wireless networks, bandits, stochastic bandits},\n\tmonth = {09},\n\tpages = {5919--5929},\n\ttitle = {Sequential Learning for Multi-channel Wireless Network Monitoring with Channel Switching Costs},\n\turl_paper = {zheng-tsp2014.pdf},\n\tvolume = {62},\n\tyear = {2014},\n\tBdsk-Url-1 = {http://www.azn.nl/rrng/xray/digmam/iwdm98}}\n\n
\n
\n\n\n
\n We consider the problem of optimally assigning p sniffers to K channels to monitor the transmission activities in a multi-channel wireless network with switching costs. The activity of users is initially unknown to the sniffers and is to be learned along with channel assignment decisions to maximize the benefits of this assignment, resulting in the fundamental trade-off between exploration and exploitation. Switching costs are incurred when sniffers change their channel assignments. As a result, frequent changes are undesirable. We formulate the sniffer-channel assignment with switching costs as a linear partial monitoring problem, a super-class of multi-armed bandits. As the number of arms (sniffer-channel assignments) is exponential, novel techniques are called for to allow efficient learning. We use the linear bandit model to capture the dependency amongst the arms and develop a policy that takes advantage of this dependency. We prove the proposed Upper Confidence Bound-based (UCB) policy enjoys a logarithmic regret bound in time t that depends sub-linearly on the number of arms, while its total switching cost grows with log(log(t)).\n
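A generic linear-UCB sketch over assignment feature vectors is shown below for orientation. The arm features, regularizer, and confidence width are illustrative, and the switching-cost control that the paper adds (keeping the number of assignment changes of order log log t) is omitted.

import numpy as np

def linear_ucb(arms, sample_reward, T, beta=2.0, lam=1.0):
    # arms: (num_arms, d) feature matrix, e.g. indicator vectors of sniffer-channel assignments.
    # sample_reward(x): noisy reward with mean <theta*, x> for the chosen feature vector x.
    num_arms, d = arms.shape
    V = lam * np.eye(d)              # regularized design matrix
    b = np.zeros(d)
    for _ in range(T):
        V_inv = np.linalg.inv(V)
        theta_hat = V_inv @ b
        width = np.sqrt(np.einsum('ij,jk,ik->i', arms, V_inv, arms))
        x = arms[int(np.argmax(arms @ theta_hat + beta * width))]   # optimistic arm choice
        r = sample_reward(x)
        V += np.outer(x, x)
        b += r * x
    return np.linalg.inv(V) @ b      # final least-squares estimate of theta*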
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Learning the Optimal Waiting Time.\n \n \n \n \n\n\n \n Lattimore, T.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, 2014. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaGySze14,\n\tabstract = {Consider the problem of learning how long to wait for a bus before walking, experimenting each day and assuming\nthat the bus arrival times are independent and identically distributed random variables with an unknown distribution.\nSimilar uncertain optimal stopping problems arise when devising power-saving strategies, e.g., learning the optimal disk spin-down time for\nmobile computers, or speeding up certain types of satisficing search procedures by switching from\na potentially fast search method that is unreliable, to one that is reliable, but slower.\nFormally, the problem can be described as a repeated game. In each round of the game an agent is waiting for an event to occur.\nIf the event occurs while the agent is waiting, the agent suffers a loss that is the sum of  the event's ``arrival time''  and some\nfixed loss. If the agents decides to give up waiting before the event occurs, he suffers a loss that is the sum of the waiting time and some other\nfixed loss. It is assumed that the arrival times are independent random quantities with the same distribution, which is unknown, while the agent knows the\nloss associated with each outcome.  Two versions of the game are considered. In the full information case the agent\nobserves the arrival times regardless of its actions, while\nin the partial information case the arrival time is  observed only if it does not exceed the waiting time. After\nsome general structural observations about the problem,\nwe present a number of algorithms for both cases that learn the optimal weighting time with nearly matching\nminimax upper and lower bounds on their regret. },\n\tacceptrate = {20 out of 40=50\\%},\n\tauthor = {Lattimore, T. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {partial information, online learning, theory, stochastic partial monitoring, bandits, censored observations},\n\ttitle = {On Learning the Optimal Waiting Time},\n\turl_paper = {bus-ALT04.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n Consider the problem of learning how long to wait for a bus before walking, experimenting each day and assuming that the bus arrival times are independent and identically distributed random variables with an unknown distribution. Similar uncertain optimal stopping problems arise when devising power-saving strategies, e.g., learning the optimal disk spin-down time for mobile computers, or speeding up certain types of satisficing search procedures by switching from a potentially fast search method that is unreliable, to one that is reliable, but slower. Formally, the problem can be described as a repeated game. In each round of the game an agent is waiting for an event to occur. If the event occurs while the agent is waiting, the agent suffers a loss that is the sum of the event's "arrival time" and some fixed loss. If the agent decides to give up waiting before the event occurs, he suffers a loss that is the sum of the waiting time and some other fixed loss. It is assumed that the arrival times are independent random quantities with the same distribution, which is unknown, while the agent knows the loss associated with each outcome. Two versions of the game are considered. In the full information case the agent observes the arrival times regardless of its actions, while in the partial information case the arrival time is observed only if it does not exceed the waiting time. After some general structural observations about the problem, we present a number of algorithms for both cases that learn the optimal waiting time with nearly matching minimax upper and lower bounds on their regret. \n
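In the full-information case, a natural baseline is empirical risk minimization over waiting times; because the empirical risk increases linearly in the threshold between consecutive arrival times, it suffices to check zero and the observed arrival times. The sketch below implements this baseline under assumed names and loss constants; it is not the paper's algorithm and does not treat the censored (partial-information) case.

import numpy as np

def erm_waiting_time(arrival_times, wait_loss, walk_loss):
    # Full-information ERM baseline: if the bus arrives at time X <= b we pay X + wait_loss,
    # otherwise we give up at time b and pay b + walk_loss.
    X = np.sort(np.asarray(arrival_times, dtype=float))
    best_b, best_risk = 0.0, np.inf
    for b in np.concatenate(([0.0], X)):        # candidate thresholds: 0 and each arrival time
        caught = X <= b
        risk = np.mean(np.where(caught, X + wait_loss, b + walk_loss))
        if risk < best_risk:
            best_b, best_risk = float(b), float(risk)
    return best_b, best_risk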
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Optimal Resource Allocation with Semi-Bandit Feedback.\n \n \n \n \n\n\n \n Lattimore, T.; Crammer, K.; and Szepesvári, C.\n\n\n \n\n\n\n In UAI, pages 477–486, 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Optimal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LaCrSze14,\n\tabstract = {We study a sequential resource allocation problem involving a fixed number of recurring jobs.\nAt each time-step the manager should distribute available resources among the jobs in order to maximise the expected number of completed jobs.\nAllocating more resources to a given job increases the probability\nthat it completes, but with a cut-off.\nSpecifically, we assume a linear\nmodel where the probability increases linearly until it equals one, after which\nallocating additional resources is wasteful.\nWe assume the difficulty of each job is unknown\nand present the first algorithm for this problem and prove upper and lower bounds on its regret.\nDespite its apparent simplicity, the problem has a rich structure:\nwe show that an appropriate optimistic algorithm can improve its learning speed dramatically\nbeyond the results one normally expects for similar problems\nas the problem becomes resource-laden.},\n\tacceptrate = {oral presentation 24 out of 292=8\\%},\n\tauthor = {Lattimore, T. and Crammer, K. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {UAI},\n\tkeywords = {partial information, online learning, theory, stochastic partial monitoring, bandits, resource allocation},\n\tpages = {477--486},\n\ttitle = {Optimal Resource Allocation with Semi-Bandit Feedback},\n\turl_paper = {lcs14mem-alloc.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n We study a sequential resource allocation problem involving a fixed number of recurring jobs. At each time-step the manager should distribute available resources among the jobs in order to maximise the expected number of completed jobs. Allocating more resources to a given job increases the probability that it completes, but with a cut-off. Specifically, we assume a linear model where the probability increases linearly until it equals one, after which allocating additional resources is wasteful. We assume the difficulty of each job is unknown and present the first algorithm for this problem and prove upper and lower bounds on its regret. Despite its apparent simplicity, the problem has a rich structure: we show that an appropriate optimistic algorithm can improve its learning speed dramatically beyond the results one normally expects for similar problems as the problem becomes resource-laden.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Partial monitoring – classification, regret bounds, and algorithms.\n \n \n \n \n\n\n \n Bartók, G.; Foster, D.; Pál, D.; Rakhlin, A.; and Szepesvári, C.\n\n\n \n\n\n\n Mathematics of Operations Research, 39: 967–997. 2014.\n \n\n\n\n
\n\n\n\n \n \n \"Partial paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{BaFoPaRaSze14,\n\tabstract = { In a partial monitoring game, the learner repeatedly chooses an action, the\nenvironment responds with an outcome, and then the learner suffers a loss and\nreceives a feedback signal, both of which are fixed functions of the action and\nthe outcome. The goal of the learner is to minimize his regret, which is the\ndifference between his total cumulative loss and the total loss of the best\nfixed action in hindsight.\nIn this paper we characterize the minimax regret of any\npartial monitoring game with finitely many actions and\noutcomes. It turns out that the minimax regret of any such game is either zero,\nTheta~(T^{1/2}), Theta(T^{2/3}), or Theta(T).  We provide computationally efficient learning\nalgorithms that achieve the minimax regret within logarithmic factor for any game. In addition to the bounds on the minimax regret, if we assume that the outcomes are generated in an i.i.d. fashion, we prove individual upper bounds on the expected regret.},\n\tauthor = {Bart{\\'o}k, G. and Foster, D. and P{\\'a}l, D. and Rakhlin, A. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2014-05},\n\tdate-added = {2014-05-16 22:39:50 -0700},\n\tdate-modified = {2014-12-06 19:49:48 +0000},\n\tjournal = {Mathematics of Operations Research},\n\tkeywords = {partial information, online learning, adversarial setting, theory, stochastic partial monitoring},\n\tpages = {967--997},\n\ttitle = {Partial monitoring -- classification, regret bounds, and algorithms},\n\turl_paper = {partial_monitoring-mor.pdf},\n\tvolume = {39},\n\tyear = {2014},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/j.tcs.2012.10.008}}\n\n
\n
\n\n\n
\n In a partial monitoring game, the learner repeatedly chooses an action, the environment responds with an outcome, and then the learner suffers a loss and receives a feedback signal, both of which are fixed functions of the action and the outcome. The goal of the learner is to minimize his regret, which is the difference between his total cumulative loss and the total loss of the best fixed action in hindsight. In this paper we characterize the minimax regret of any partial monitoring game with finitely many actions and outcomes. It turns out that the minimax regret of any such game is either zero, Theta~(T^{1/2}), Theta(T^{2/3}), or Theta(T). We provide computationally efficient learning algorithms that achieve the minimax regret within a logarithmic factor for any game. In addition to the bounds on the minimax regret, if we assume that the outcomes are generated in an i.i.d. fashion, we prove individual upper bounds on the expected regret.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Adaptive Monte Carlo via Bandit Allocation (extended version).\n \n \n \n \n\n\n \n Neufeld, J.; György, A.; Schuurmans, D.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1944–1952, 05 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Adaptive paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{NeuGyoSchSze14,\n\tabstract = {We consider the problem of sequentially choosing between a set of\nunbiased Monte Carlo estimators to minimize the mean-squared-error (MSE) of a\nfinal combined estimate.\nBy reducing this task to a stochastic multi-armed bandit problem,\nwe show that well developed allocation strategies can be used to achieve\nan MSE that approaches that of the best estimator chosen in retrospect.\nWe then extend these developments to a scenario where alternative estimators\nhave different, possibly stochastic costs.\nThe outcome is a new set of adaptive Monte Carlo strategies that provide stronger\nguarantees than previous approaches while offering practical advantages.\n\n},\n\tacceptrate = {310 out of 1238=25\\% acceptance rate;as one of the 18 out of 1238=1.5\\% ICML excellent papers, one of the 18/1238=1.5\\% papers invited to JMLR fast track submisson\\%},\n\tauthor = {Neufeld, J. and Gy{\\"o}rgy, A. and Schuurmans, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {online learning, Monte Carlo methods, bandits, stochastic bandits},\n\tmonth = {05},\n\tpages = {1944--1952},\n\ttitle = {Adaptive Monte Carlo via Bandit Allocation (extended version)},\n\turl_paper = {mcbandit.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n We consider the problem of sequentially choosing between a set of unbiased Monte Carlo estimators to minimize the mean-squared-error (MSE) of a final combined estimate. By reducing this task to a stochastic multi-armed bandit problem, we show that well developed allocation strategies can be used to achieve an MSE that approaches that of the best estimator chosen in retrospect. We then extend these developments to a scenario where alternative estimators have different, possibly stochastic costs. The outcome is a new set of adaptive Monte Carlo strategies that provide stronger guarantees than previous approaches while offering practical advantages. \n
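One simple way to instantiate the reduction is to treat each estimator as an arm and allocate samples optimistically according to estimated variances, since for unbiased estimators the MSE of the combined estimate is driven by variance. The sketch below is an illustrative variance-LCB allocation with assumed names and constants, not the paper's algorithm.

import numpy as np

def adaptive_mc(estimators, budget):
    # estimators: zero-argument callables, each returning one unbiased sample of the same quantity.
    # Allocate samples to the estimator whose variance lower-confidence bound is smallest
    # (optimism for a minimization objective); a naive instantiation of the bandit reduction.
    k = len(estimators)
    samples = [[est() for _ in range(2)] for est in estimators]   # two warm-up samples per arm
    for t in range(2 * k, budget):
        n = np.array([len(s) for s in samples], dtype=float)
        var = np.array([np.var(s, ddof=1) for s in samples])
        i = int(np.argmin(var - np.sqrt(np.log(t + 1.0) / n)))    # optimistic (low) variance guess
        samples[i].append(estimators[i]())
    best = int(np.argmax([len(s) for s in samples]))   # the arm the allocation trusted the most
    return float(np.mean(samples[best]))               # one simple way to form the final estimate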
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Finite-Sample Generalization Bound for Semiparametric Regression: Partially Linear Models.\n \n \n \n \n\n\n \n Huang, R.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 402–410, 02 2014. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HuSze14,\n\tabstract = {In this paper we provide generalization bounds for semiparametric regression with the so-called partially linear models where the regression function is written as the sum of a linear parametric and a nonlinear, non- parametric function, the latter taken from a some set H with finite entropy-integral. The problem is technically challenging because the parametric part is unconstrained and the model is underdetermined, while the response is allowed to be unbounded with subgaussian tails. Under natural regularity conditions, we bound the generalization error as a function of the Rademacher complexity of H and that of the linear model. Our main tool is a ratio-type concentration inequality for increments of empirical processes, based on which we are able to give an exponential tail bound on the size of the parametric component. We also provide a comparison to alternatives of this technique and discuss why and when the un-constrained parametric part in the model may cause a problem in terms of the expected risk. We also explain by means of a specific example why this problem cannot be detected using the results of classical asymptotic analysis often seen in the statistics literature.},\n\tauthor = {Huang, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {nonparametrics, least-squares methods, rate of convergence},\n\tmonth = {02},\n\tpages = {402--410},\n\ttitle = {A Finite-Sample Generalization Bound for Semiparametric Regression: Partially Linear Models},\n\turl_paper = {semiparametric-aistat2014.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n In this paper we provide generalization bounds for semiparametric regression with the so-called partially linear models where the regression function is written as the sum of a linear parametric and a nonlinear, nonparametric function, the latter taken from some set H with finite entropy-integral. The problem is technically challenging because the parametric part is unconstrained and the model is underdetermined, while the response is allowed to be unbounded with subgaussian tails. Under natural regularity conditions, we bound the generalization error as a function of the Rademacher complexity of H and that of the linear model. Our main tool is a ratio-type concentration inequality for increments of empirical processes, based on which we are able to give an exponential tail bound on the size of the parametric component. We also provide a comparison to alternatives of this technique and discuss why and when the unconstrained parametric part in the model may cause a problem in terms of the expected risk. We also explain by means of a specific example why this problem cannot be detected using the results of classical asymptotic analysis often seen in the statistics literature.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Learning in Markov Decision Processes with Changing Cost Sequences.\n \n \n \n \n\n\n \n Dick, T.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 512–520, 01 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{DiGyoSze14,\n\tabstract = {In this paper we consider online learning in finite Markov decision processes (MDPs) with changing cost sequences under full and bandit-information.  We propose to view this problem as an instance of online linear optimization.  We propose two methods for this problem: MD^2 (mirror descent with approximate projections) and the continuous exponential weights algorithm with Dikin walks.  We provide a rigorous complexity analysis of these techniques, while providing near-optimal regret-bounds (in particular, we take into account the computational costs of performing approximate projections in MD^2).  In the case of full-information feedback, our results complement existing ones. In the case of bandit-information feedback we consider the online stochastic shortest path problem, a special case of the above MDP problems, and manage to improve the existing results by removing the previous restrictive assumption that the state-visitation probabilities are uniformly bounded away from zero under all policies.},\n\tacceptrate = {310 out of 1238=25\\%},\n\tauthor = {Dick, T. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {online learning, adversarial setting, finite MDPs, recurrent MDPs, shortest path problem, theory},\n\tmonth = {01},\n\tpages = {512--520},\n\ttitle = {Online Learning in Markov Decision Processes with Changing Cost Sequences},\n\turl_paper = {omdp_icml.pdf},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n In this paper we consider online learning in finite Markov decision processes (MDPs) with changing cost sequences under full and bandit-information. We propose to view this problem as an instance of online linear optimization. We propose two methods for this problem: MD^2 (mirror descent with approximate projections) and the continuous exponential weights algorithm with Dikin walks. We provide a rigorous complexity analysis of these techniques, while providing near-optimal regret-bounds (in particular, we take into account the computational costs of performing approximate projections in MD^2). In the case of full-information feedback, our results complement existing ones. In the case of bandit-information feedback we consider the online stochastic shortest path problem, a special case of the above MDP problems, and manage to improve the existing results by removing the previous restrictive assumption that the state-visitation probabilities are uniformly bounded away from zero under all policies.\n
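For context on the mirror-descent building block named in this abstract, here is a minimal sketch of a single entropic mirror-descent (exponentiated-gradient) update on the probability simplex; MD^2's approximate projection onto the set of occupancy measures, which is the paper's actual object of analysis, is not reproduced here, and the function name and step size are illustrative assumptions.

```python
import numpy as np

def entropic_md_step(p, grad, eta=0.1):
    """One mirror-descent update on the probability simplex with the entropic
    regularizer (exponentiated gradient): multiplicative update followed by
    renormalization, which is the Bregman projection back onto the simplex."""
    w = p * np.exp(-eta * grad)
    return w / w.sum()

# toy usage: a three-point simplex and an arbitrary loss gradient
print(entropic_md_step(np.array([1/3, 1/3, 1/3]), np.array([1.0, 0.0, -1.0])))
```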
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Markov Decision Processes under Bandit Feedback.\n \n \n \n \n\n\n \n Neu, G.; György, A.; Szepesvári, C.; and Antos, A.\n\n\n \n\n\n\n IEEE Transactions on Automatic Control, 59(3): 676–691. 12 2014.\n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{NeGySzA13,\n\tabstract = {We consider online learning in finite stochastic Markovian environments where in each time step a new reward function is chosen by an oblivious adversary. The goal of the learning agent is to compete with the best stationary policy in hindsight in terms of the total reward received. Specifically, in each time step the agent observes the current state and the reward associated with the last transition, however, the agent does not observe the rewards associated with other state-action pairs. The agent is assumed to know the transition probabilities. The state of the art result for this setting is an algorithm with an expected regret of $O(T^{2/3}ln T)$. In this paper, assuming that stationary policies mix uniformly fast, we show that after $T$ time steps, the expected regret of this algorithm (more precisely, a slightly modified version thereof) is $O(T^{1/2}ln T)$, giving the first rigorously proven, essentially tight regret bound for the problem.},\n\tauthor = {Neu, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs. and Antos, A.},\n\tdate = {2014-01},\n\tdate-added = {2013-11-29 18:51:13 +0200},\n\tdate-modified = {2015-03-02 01:15:25 +0000},\n\tjournal = {IEEE Transactions on Automatic Control},\n\tkeywords = {online learning, adversarial setting, finite MDPs, recurrent MDPs, theory},\n\tmonth = {12},\n\tnumber = {3},\n\tpages = {676--691},\n\ttitle = {Online Markov Decision Processes under Bandit Feedback},\n\turl_paper = {mdptac13.pdf},\n\tvolume = {59},\n\tyear = {2014}}\n\n
\n
\n\n\n
\n We consider online learning in finite stochastic Markovian environments where in each time step a new reward function is chosen by an oblivious adversary. The goal of the learning agent is to compete with the best stationary policy in hindsight in terms of the total reward received. Specifically, in each time step the agent observes the current state and the reward associated with the last transition; however, the agent does not observe the rewards associated with other state-action pairs. The agent is assumed to know the transition probabilities. The state-of-the-art result for this setting is an algorithm with an expected regret of $O(T^{2/3}\ln T)$. In this paper, assuming that stationary policies mix uniformly fast, we show that after $T$ time steps, the expected regret of this algorithm (more precisely, a slightly modified version thereof) is $O(T^{1/2}\ln T)$, giving the first rigorously proven, essentially tight regret bound for the problem.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2013\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Online Learning with Costly Features and Labels.\n \n \n \n \n\n\n \n Zolghadr, N.; Bartók, G.; Greiner, R.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1241–1249, 12 2013. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ZoBaGrGySz13,\n\tabstract = {This paper introduces the online probing problem:\nIn each round, the learner is able to purchase the values of a subset of feature values.\nAfter the learner uses this information to come up with a prediction for the given round,\nhe then has the option of paying\nto see the loss function\nthat he is evaluated against.\nEither way, the learner pays for\nboth the errors of his predictions and also whatever he chooses to observe,\nincluding the cost of observing the loss function for the given round and the cost of the observed features.\nWe consider two variations of this problem,\ndepending on whether the learner can observe the label for free or not.\nWe provide algorithms and upper and lower bounds on the regret for both variants.\nWe show that a positive cost for observing the label significantly increases the regret of the problem.\n},\n\tacceptrate = {360 out of 1420=25\\%},\n\tauthor = {Zolghadr, N. and Bart{\\'o}k, G. and Greiner, R. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {theory, adversarial setting, partial information, online learning},\n\tmonth = {12},\n\tpages = {1241--1249},\n\ttitle = {Online Learning with Costly Features and Labels},\n\turl_paper = {OnlineProbingNeurIPS2013.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n This paper introduces the online probing problem: In each round, the learner is able to purchase the values of a subset of feature values. After the learner uses this information to come up with a prediction for the given round, he then has the option of paying to see the loss function that he is evaluated against. Either way, the learner pays for both the errors of his predictions and also whatever he chooses to observe, including the cost of observing the loss function for the given round and the cost of the observed features. We consider two variations of this problem, depending on whether the learner can observe the label for free or not. We provide algorithms and upper and lower bounds on the regret for both variants. We show that a positive cost for observing the label significantly increases the regret of the problem. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Learning in Markov Decision Processes with Adversarially Chosen Transition Probability Distributions.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Bartlett, P.; Kanade, V.; Seldin, Y.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 2508–2516, 12 2013. \n \n\n\n\n
\n\n\n\n \n \n \"OnlineLink\n  \n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYBKSSz13,\n\tabstract = {We study the problem of online learning Markov Decision Processes\n(MDPs) when both the transition distributions and loss functions\nare chosen by an adversary. We present an algorithm that, under a\nmixing assumption, achieves \\sqrt{T\\log|\\Pi|}+\\log|\\Pi| regret\nwith respect to a comparison set of policies \\Pi.  The regret\nis independent of the size of the state and action spaces. When\nexpectations over sample paths can be computed efficiently and\nthe comparison set \\Pi has polynomial size,\nthis algorithm is efficient.\n\nWe also consider the episodic adversarial online shortest path\nproblem.  Here, in each episode an adversary may choose a weighted\ndirected acyclic graph with an identified start and finish node. The\ngoal of the learning algorithm is to choose a path that minimizes\nthe loss while traversing from the start to finish node. At the end\nof each episode the loss function (given by weights on the edges)\nis revealed to the learning algorithm. The goal is to minimize regret\nwith respect to a fixed policy for selecting paths. This problem is\na special case of the online MDP problem.\nIt was shown that for randomly chosen graphs\nand adversarial losses, the problem can be efficiently solved. We\nshow that it also can be efficiently solved for adversarial\ngraphs and randomly chosen losses.  When both graphs and losses\nare adversarially chosen, we show that designing efficient algorithms for the\nadversarial online shortest path problem (and hence for the\nadversarial MDP problem) is as hard as learning parity with noise, a\nnotoriously difficult problem that has been used to design efficient\ncryptographic schemes. Finally, we present an efficient algorithm whose\nregret scales linearly with the number of distinct graphs.\n},\n\tacceptrate = {360 out of 1420=25\\%},\n\tauthor = {Abbasi-Yadkori, Y. and Bartlett, P. and Kanade, V. and Seldin, Y. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tee = {http://papers.neurips.cc/paper/4975-online-learning-in-markov-decision-processes-with-adversarially-chosen-transition-probability-distributions},\n\tkeywords = {theory, online learning, finite MDPs, adversarial setting, reinforcement learning},\n\tmonth = {12},\n\tpages = {2508--2516},\n\ttitle = {Online Learning in Markov Decision Processes with Adversarially Chosen Transition Probability Distributions},\n\turl_paper = {ChangingTransNeurIPS2013.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n We study the problem of online learning Markov Decision Processes (MDPs) when both the transition distributions and loss functions are chosen by an adversary. We present an algorithm that, under a mixing assumption, achieves $\sqrt{T\log|\Pi|}+\log|\Pi|$ regret with respect to a comparison set of policies $\Pi$. The regret is independent of the size of the state and action spaces. When expectations over sample paths can be computed efficiently and the comparison set $\Pi$ has polynomial size, this algorithm is efficient. We also consider the episodic adversarial online shortest path problem. Here, in each episode an adversary may choose a weighted directed acyclic graph with an identified start and finish node. The goal of the learning algorithm is to choose a path that minimizes the loss while traversing from the start to finish node. At the end of each episode the loss function (given by weights on the edges) is revealed to the learning algorithm. The goal is to minimize regret with respect to a fixed policy for selecting paths. This problem is a special case of the online MDP problem. It was shown that for randomly chosen graphs and adversarial losses, the problem can be efficiently solved. We show that it also can be efficiently solved for adversarial graphs and randomly chosen losses. When both graphs and losses are adversarially chosen, we show that designing efficient algorithms for the adversarial online shortest path problem (and hence for the adversarial MDP problem) is as hard as learning parity with noise, a notoriously difficult problem that has been used to design efficient cryptographic schemes. Finally, we present an efficient algorithm whose regret scales linearly with the number of distinct graphs. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Toward a classification of finite partial-monitoring games.\n \n \n \n \n\n\n \n Antos, A.; Bartók, G.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n Theoretical Computer Science, 473: 77–99. 2013.\n \n\n\n\n
\n\n\n\n \n \n \"TowardLink\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{AnBaPaSz13,\n\tabstract = {Partial-monitoring games constitute\n a mathematical framework for sequential decision making problems with imperfect feedback:\nThe learner repeatedly chooses an action,\n nature responds with an outcome,\n and then the learner suffers a loss and receives a feedback signal,\n both of which are fixed functions of the action and the outcome.\nThe goal of the learner is to minimize his total cumulative loss.\nWe make progress towards the classification of these games based on their minimax expected regret.\nNamely, we classify almost all games with two outcomes and a finite number of actions:\nWe show that their minimax expected regret is either\n zero, Theta(T^{1/2}), Theta(T^{2/3}), or \\Theta(T),\n and we give a simple and efficiently computable classification of these four classes of games.\nOur hope is that the result can serve\n as a stepping stone toward classifying all finite partial-monitoring games.\n},\n\tauthor = {Antos, A. and Bart{\\'o}k, G. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tdoi = {10.1016/j.tcs.2012.10.008},\n\tee = {partial-monitoring-tcs.pdf},\n\tjournal = {Theoretical Computer Science},\n\tkeywords = {partial information, online learning, adversarial setting, theory},\n\tpages = {77--99},\n\ttitle = {Toward a classification of finite partial-monitoring games},\n\tvolume = {473},\n\tyear = {2013},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/j.tcs.2012.10.008}}\n\n
\n
\n\n\n
\n Partial-monitoring games constitute a mathematical framework for sequential decision making problems with imperfect feedback: The learner repeatedly chooses an action, nature responds with an outcome, and then the learner suffers a loss and receives a feedback signal, both of which are fixed functions of the action and the outcome. The goal of the learner is to minimize his total cumulative loss. We make progress towards the classification of these games based on their minimax expected regret. Namely, we classify almost all games with two outcomes and a finite number of actions: We show that their minimax expected regret is either zero, Theta(T^{1/2}), Theta(T^{2/3}), or Theta(T), and we give a simple and efficiently computable classification of these four classes of games. Our hope is that the result can serve as a stepping stone toward classifying all finite partial-monitoring games. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Alignment based kernel learning with a continuous set of base kernels.\n \n \n \n \n\n\n \n Afkanpour, A.; Szepesvári, C.; and Bowling, M. H.\n\n\n \n\n\n\n Machine Learning, 91(3): 305–324. 2013.\n \n\n\n\n
\n\n\n\n \n \n \"Alignment paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{AfSzeBo13,\n\tabstract = {The success of kernel-based learning methods depends on the choice of kernel. Recently, kernel learning methods have been proposed that use data to select the most appropriate kernel, usually by combining a set of base kernels. We introduce a new algorithm for kernel learning that combines a continuous set of base kernels, without the common step of discretizing the space of base kernels. We demonstrate that our new method achieves state-of-the-art performance across a variety of real-world datasets. Furthermore, we explicitly demonstrate the importance of combining the right dictionary of kernels, which is problematic for methods that combine a finite set of base kernels chosen a priori. Our method is not the first approach to work with continuously parameterized kernels. We adopt a two-stage kernel learning approach. We also show that our method requires substantially less computation than previous such approaches, and so is more amenable to multi-dimensional parameterizations of base kernels, which we demonstrate.},\n\tauthor = {Afkanpour, A. and Szepesv{\\'a}ri, Cs. and Bowling, M. H.},\n\tdate = {2013-06},\n\tdate-added = {2013-06-30 22:38:38 -0600},\n\tdate-modified = {2013-06-30 22:51:39 -0600},\n\tdoi = {10.1007/s10994-013-5361-8},\n\tjournal = {Machine Learning},\n\tkeywords = {multikernel learning; supervised learning},\n\tnumber = {3},\n\tpages = {305--324},\n\tpublisher = {Springer},\n\ttitle = {Alignment based kernel learning with a continuous set of base kernels},\n\turl_paper = {alignment_based_kernel_learning.pdf},\n\tvolume = {91},\n\tyear = {2013},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-013-5361-8}}\n\n
\n
\n\n\n
\n The success of kernel-based learning methods depends on the choice of kernel. Recently, kernel learning methods have been proposed that use data to select the most appropriate kernel, usually by combining a set of base kernels. We introduce a new algorithm for kernel learning that combines a continuous set of base kernels, without the common step of discretizing the space of base kernels. We demonstrate that our new method achieves state-of-the-art performance across a variety of real-world datasets. Furthermore, we explicitly demonstrate the importance of combining the right dictionary of kernels, which is problematic for methods that combine a finite set of base kernels chosen a priori. Our method is not the first approach to work with continuously parameterized kernels. We adopt a two-stage kernel learning approach. We also show that our method requires substantially less computation than previous such approaches, and so is more amenable to multi-dimensional parameterizations of base kernels, which we demonstrate.\n
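To make the alignment criterion concrete, the sketch below evaluates centered kernel-target alignment for a Gaussian kernel at a few bandwidths; the paper's point is to optimize over a continuous family of base kernels rather than a grid, so the grid, the kernel choice, and the toy data here are purely illustrative assumptions.

```python
import numpy as np

def centered_alignment(K, y):
    """Centered kernel-target alignment between a kernel matrix K and labels y,
    i.e. the normalized Frobenius inner product of the centered K with y y^T."""
    n = len(y)
    H = np.eye(n) - np.ones((n, n)) / n
    Kc = H @ K @ H
    Ky = np.outer(y, y)
    return float((Kc * Ky).sum() / (np.linalg.norm(Kc) * np.linalg.norm(Ky)))

def gaussian_kernel(X, sigma):
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))

# toy usage: score a few bandwidths from a continuously parameterized family
rng = np.random.default_rng(0)
X = rng.normal(size=(60, 3))
y = np.sign(X[:, 0] + 0.1 * rng.normal(size=60))
for sigma in (0.3, 1.0, 3.0):
    print(sigma, round(centered_alignment(gaussian_kernel(X, sigma), y), 3))
```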
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Cost-sensitive Multiclass Classification Risk Bounds.\n \n \n \n \n\n\n \n Pires, B.; Ghavamzadeh, M.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1391–1399, 06 2013. \n \n\n\n\n
\n\n\n\n \n \n \"Cost-sensitive paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PiGhSz13,\n\tabstract = {In this paper we present 0-1-like-risk bounds for multiclass cost-sensitive classifiers minimizing losses from an often-used family surrogate losses. To this end, we calculate an analytic expression that describe how the 0-1-like-risk-convergence rate of a classifier depends on the surrogate loss chosen while making less assumptions on the surrogate losses than previous work. We also show that calculating the values of the resulting expression is as easy as calculating these values for binary classification, and derive that some well-known losses share the same rate of convergence as their ``truncated versions'', a fact previously observed only through examples and for some of these losses.},\n\tacceptrate = {283 out of 1204=24\\%},\n\tauthor = {Pires, B.A. and Ghavamzadeh, M. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, classification, consistency, risk bound, calibration},\n\tmonth = {06},\n\tpages = {1391--1399},\n\ttitle = {Cost-sensitive Multiclass Classification Risk Bounds},\n\turl_paper = {icml2013multiclass.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n In this paper we present 0-1-like-risk bounds for multiclass cost-sensitive classifiers minimizing losses from an often-used family of surrogate losses. To this end, we calculate an analytic expression that describes how the 0-1-like-risk-convergence rate of a classifier depends on the surrogate loss chosen while making fewer assumptions on the surrogate losses than previous work. We also show that calculating the values of the resulting expression is as easy as calculating these values for binary classification, and derive that some well-known losses share the same rate of convergence as their ``truncated versions'', a fact previously observed only through examples and for some of these losses.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Learning under Delayed Feedback.\n \n \n \n \n\n\n \n Joulani, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1453–1461, 06 2013. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JoGySz13,\n\tabstract = {Online learning with delayed feedback has received increasing attention recently due to its several applications in distributed, web-based learning problems. In this paper we provide a systematic study of the topic, and analyze how the delay effects the regret of online learning algorithms. Somewhat surprisingly, it turns out that delay increases the regret in a multiplicative way in adversarial problems, and in an additive way in stochastic problems. We give meta-algorithms that transform, in a black-box fashion, algorithms developed for the non-delayed case into ones that can handle the presence of delays in the feedback loop. Modifications of the well-known UCB algorithm are also developed for the bandit problem with delayed feedback, with the advantage over the meta-algorithms that they can be implemented with much lower complexity.},\n\tacceptrate = {oral presentation 139 out of 1204=12\\%},\n\tauthor = {Joulani, P. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, bandits, delay, adversarial setting, stochastic bandits, online learning},\n\tmonth = {06},\n\tpages = {1453--1461},\n\ttitle = {Online Learning under Delayed Feedback},\n\turl_paper = {DelayedOnlineLearning.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n Online learning with delayed feedback has received increasing attention recently due to its many applications in distributed, web-based learning problems. In this paper we provide a systematic study of the topic, and analyze how the delay affects the regret of online learning algorithms. Somewhat surprisingly, it turns out that delay increases the regret in a multiplicative way in adversarial problems, and in an additive way in stochastic problems. We give meta-algorithms that transform, in a black-box fashion, algorithms developed for the non-delayed case into ones that can handle the presence of delays in the feedback loop. Modifications of the well-known UCB algorithm are also developed for the bandit problem with delayed feedback, with the advantage over the meta-algorithms that they can be implemented with much lower complexity.\n
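The black-box character of such meta-algorithms can be illustrated with the following sketch, which assumes a known, fixed delay D and runs D+1 independent copies of a simple base learner in round-robin so that each copy effectively plays an undelayed game; the epsilon-greedy base learner, the fixed-delay assumption, and this particular reduction are illustrative choices rather than the constructions analyzed in the paper.

```python
import random
from collections import deque

class EpsilonGreedy:
    """Minimal stochastic-bandit base learner used as an illustrative stand-in."""
    def __init__(self, n_arms, eps=0.1):
        self.n, self.eps = n_arms, eps
        self.counts = [0] * n_arms
        self.means = [0.0] * n_arms
    def select(self):
        if random.random() < self.eps or 0 in self.counts:
            return random.randrange(self.n)
        return max(range(self.n), key=lambda a: self.means[a])
    def update(self, arm, reward):
        self.counts[arm] += 1
        self.means[arm] += (reward - self.means[arm]) / self.counts[arm]

def run_with_delay(n_arms, horizon, delay, reward_fn):
    """Black-box reduction for feedback that arrives `delay` rounds late:
    run delay+1 independent copies of the base learner in round-robin."""
    copies = [EpsilonGreedy(n_arms) for _ in range(delay + 1)]
    pending = deque()                           # (arrival_time, copy_index, arm, reward)
    total = 0.0
    for t in range(horizon):
        while pending and pending[0][0] <= t:   # deliver feedback that just arrived
            _, i, arm, r = pending.popleft()
            copies[i].update(arm, r)
        i = t % (delay + 1)
        arm = copies[i].select()
        r = reward_fn(arm)                      # reward generated now,
        pending.append((t + delay, i, arm, r))  # revealed to the learner later
        total += r
    return total

# toy usage: two arms with Bernoulli rewards and a delay of 5 rounds
print(run_with_delay(2, 1000, 5, lambda a: random.random() < (0.4 if a == 0 else 0.6)))
```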
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A randomized mirror descent algorithm for large scale multiple kernel learning.\n \n \n \n \n\n\n \n Afkanpour, A.; György, A.; Bowling, M. H.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 374–382, 06 2013. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AfGyBoSze13,\n\tabstract = {We consider the problem of simultaneously learning to linearly combine a very large number of kernels and learn a good predictor based on the learnt kernel.\nWhen the number of kernels d to be combined is very large, multiple kernel learning methods whose computational cost scales linearly in d are intractable.\nWe propose a randomized version of the mirror descent algorithm to overcome this issue, under the objective of minimizing the group p-norm penalized empirical risk. The key to achieve the required exponential speed-up  is the computationally efficient construction of low-variance estimates of the gradient.\nWe propose importance sampling based estimates, and find that the ideal distribution samples a coordinate with a probability proportional to the magnitude of the corresponding gradient. We show the surprising result that in the case of learning the coefficients of a polynomial kernel, the combinatorial structure of the base kernels to be combined allows the implementation of sampling from this distribution to run in O(log(d)) time, making the total computational cost of the method to achieve an epsilon-optimal solution to be O(log(d)/epsilon^2), thereby allowing our method to operate for very large values of d. Experiments with simulated and real data confirm that the new algorithm is computationally more efficient than its state-of-the-art alternatives.},\n\tacceptrate = {283 out of 1204=24\\%},\n\tauthor = {Afkanpour, A. and Gy{\\"o}rgy, A. and Bowling, M. H. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, RKHS, kernels, optimization, multikernel learning, stochastic gradient methods, coordinate descent, mirror descent},\n\tmonth = {06},\n\tpages = {374--382},\n\ttitle = {A randomized mirror descent algorithm for large scale multiple kernel learning},\n\turl_paper = {mkl_icml2013.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n We consider the problem of simultaneously learning to linearly combine a very large number of kernels and learn a good predictor based on the learnt kernel. When the number of kernels d to be combined is very large, multiple kernel learning methods whose computational cost scales linearly in d are intractable. We propose a randomized version of the mirror descent algorithm to overcome this issue, under the objective of minimizing the group p-norm penalized empirical risk. The key to achieving the required exponential speed-up is the computationally efficient construction of low-variance estimates of the gradient. We propose importance sampling based estimates, and find that the ideal distribution samples a coordinate with a probability proportional to the magnitude of the corresponding gradient. We show the surprising result that in the case of learning the coefficients of a polynomial kernel, the combinatorial structure of the base kernels to be combined allows the implementation of sampling from this distribution to run in O(log(d)) time, making the total computational cost of reaching an epsilon-optimal solution O(log(d)/epsilon^2), thereby allowing our method to operate for very large values of d. Experiments with simulated and real data confirm that the new algorithm is computationally more efficient than its state-of-the-art alternatives.\n
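The key idea named in this abstract, sampling a single coordinate with probability proportional to the magnitude of the corresponding gradient entry and reweighting so the estimate stays unbiased, can be sketched as follows; the function name and the dense gradient input are assumptions, and the O(log(d)) sampling structure for polynomial kernels described in the paper is not reproduced.

```python
import numpy as np

def importance_sampled_gradient(grad, rng):
    """Return a one-coordinate estimate g_hat of `grad` with E[g_hat] = grad,
    sampling coordinate i with probability proportional to |grad[i]| (the
    low-variance choice discussed in the abstract)."""
    p = np.abs(grad)
    if p.sum() == 0.0:
        return np.zeros_like(grad)
    p = p / p.sum()
    i = rng.choice(len(grad), p=p)
    g_hat = np.zeros_like(grad)
    g_hat[i] = grad[i] / p[i]   # reweight by 1/p[i] to keep the estimate unbiased
    return g_hat

# toy usage: average many estimates to check unbiasedness
rng = np.random.default_rng(0)
g = np.array([3.0, -1.0, 0.5, 0.0])
print(np.mean([importance_sampled_gradient(g, rng) for _ in range(20000)], axis=0))
```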
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Characterizing the Representer Theorem.\n \n \n \n \n\n\n \n Yu, Y.; Chen, H.; Schuurmans, D.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 570–578, 06 2013. \n \n\n\n\n
\n\n\n\n \n \n \"Characterizing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YuChSchSz13,\n\tabstract = {The representer theorem assures that kernel methods retain optimality\nunder penalized empirical risk minimization.\nWhile a sufficient condition on the form of the regularizer guaranteeing\nthe representer theorem has been known since the initial development\nof kernel methods, necessary conditions have only been investigated recently.\nIn this paper we completely characterize the necessary and sufficient\nconditions on the regularizer that ensure the representer theorem holds.\nThe results are surprisingly simple yet broaden the conditions where the\nrepresenter theorem is known to hold.\nExtension to the matrix domain is also addressed.\n},\n\tacceptrate = {283 out of 1204=24\\%},\n\tauthor = {Yu, Y.-L. and Chen, H. and Schuurmans, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, representer theorem, RKHS, kernels},\n\tmonth = {06},\n\tpages = {570--578},\n\ttitle = {Characterizing the Representer Theorem},\n\turl_paper = {repthm_icml13.pdf},\n\tyear = {2013}}\n\n
\n
\n\n\n
\n The representer theorem assures that kernel methods retain optimality under penalized empirical risk minimization. While a sufficient condition on the form of the regularizer guaranteeing the representer theorem has been known since the initial development of kernel methods, necessary conditions have only been investigated recently. In this paper we completely characterize the necessary and sufficient conditions on the regularizer that ensure the representer theorem holds. The results are surprisingly simple yet broaden the conditions where the representer theorem is known to hold. Extension to the matrix domain is also addressed. \n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2012\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Deep Representations and Codes for Image Auto-Annotation.\n \n \n \n \n\n\n \n Kiros, R.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 917–925. 12 2012.\n \n\n\n\n
\n\n\n\n \n \n \"Deep link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@incollection{NeurIPS2012_0424,\n\tabstract = {The task of image auto-annotation, namely assigning a set of relevant tags to an\nimage, is challenging due to the size and variability of tag vocabularies. Consequently,\nmost existing algorithms focus on tag assignment and fix an often large\nnumber of hand-crafted features to describe image characteristics. In this paper\nwe introduce a hierarchical model for learning representations of standard sized\ncolor images from the pixel level, removing the need for engineered feature representations\nand subsequent feature selection for annotation. We benchmark our\nmodel on the STL-10 recognition dataset, achieving state-of-the-art performance.\nWhen our features are combined with TagProp (Guillaumin et al.), we compete\nwith or outperform existing annotation approaches that use over a dozen distinct\nhandcrafted image descriptors. Furthermore, using 256-bit codes and Hamming\ndistance for training TagProp, we exchange only a small reduction in performance\nfor efficient storage and fast comparisons. Self-taught learning is used in all of our\nexperiments and deeper architectures always outperform shallow ones.},\n\tacceptrate = {370 out of 1467=25\\%},\n\tannote = {The task of image auto-annotation, namely assigning a set of relevant tags to an image, is challenging due to the size and variability of tag vocabularies. Consequently, most existing algorithms focus on tag assignment and fix an often large number of hand-crafted features to describe image characteristics. In this paper we introduce a hierarchical model for learning representations of standard sized color images from the pixel level, removing the need for engineered feature representations and subsequent feature selection for annotation. We benchmark our model on the STL-10 recognition dataset, achieving state-of-the-art performance. When our features are combined with TagProp (Guillaumin et al.), we compete with or outperform existing annotation approaches that use over a dozen distinct handcrafted image descriptors. Furthermore, using 256-bit codes and Hamming distance for training TagProp, we exchange only a small reduction in performance for efficient storage and fast comparisons. Self-taught learning is used in all of our experiments and deeper architectures always outperform shallow ones.},\n\tauthor = {Kiros, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {deep learning, image processing},\n\tmonth = {12},\n\tpages = {917--925},\n\ttitle = {Deep Representations and Codes for Image Auto-Annotation},\n\turl_link = {http://papers.neurips.cc/paper/4704-deep-representations-and-codes-for-image-auto-annotation.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n The task of image auto-annotation, namely assigning a set of relevant tags to an image, is challenging due to the size and variability of tag vocabularies. Consequently, most existing algorithms focus on tag assignment and fix an often large number of hand-crafted features to describe image characteristics. In this paper we introduce a hierarchical model for learning representations of standard sized color images from the pixel level, removing the need for engineered feature representations and subsequent feature selection for annotation. We benchmark our model on the STL-10 recognition dataset, achieving state-of-the-art performance. When our features are combined with TagProp (Guillaumin et al.), we compete with or outperform existing annotation approaches that use over a dozen distinct handcrafted image descriptors. Furthermore, using 256-bit codes and Hamming distance for training TagProp, we exchange only a small reduction in performance for efficient storage and fast comparisons. Self-taught learning is used in all of our experiments and deeper architectures always outperform shallow ones.\n
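To illustrate the binary-code comparison mentioned at the end of this abstract, here is a minimal sketch of Hamming distances between packed 256-bit codes stored as 32 uint8 values per code; the packing convention and function name are assumptions, and the representation-learning pipeline itself is not shown.

```python
import numpy as np

def hamming_distances(query, codes):
    """Hamming distance between one packed 256-bit code (32 uint8 values) and a
    database of packed codes of shape (n, 32)."""
    xor = np.bitwise_xor(codes, query)              # differing bits, byte by byte
    return np.unpackbits(xor, axis=1).sum(axis=1)   # popcount per code

# toy usage: a random database of five 256-bit codes
rng = np.random.default_rng(0)
codes = rng.integers(0, 256, size=(5, 32), dtype=np.uint8)
print(hamming_distances(codes[0], codes))           # distance to itself is 0
```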
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Partial monitoring with side information.\n \n \n \n \n\n\n \n Bartók, G.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, pages 305–319, 10 2012. \n \n\n\n\n
\n\n\n\n \n \n \"Partial paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BarSze12,\n\tabstract = {We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both "easy" and "hard" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an "easy region" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).},\n\tacceptrate = {23 out of 47=49\\%},\n\tauthor = {Bart{\\'o}k, G. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {partial information, online learning, stochastic partial monitoring, theory},\n\tmonth = {10},\n\tpages = {305--319},\n\ttitle = {Partial monitoring with side information},\n\turl_paper = {adaptive_sideinfo.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both \"easy\" and \"hard\" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an \"easy region\" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Approximate Policy Iteration with Linear Action Models.\n \n \n \n \n\n\n \n Yao, H.; and Szepesvári, C.\n\n\n \n\n\n\n In AAAI-2012, pages 1212–1217, 07 2012. \n \n\n\n\n
\n\n\n\n \n \n \"Approximate paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YaoSze12,\n\tabstract = {In this paper we consider the problem of finding a good policy given some batch data. We propose a new approach, LAM-API, that first builds a so-called linear action model (LAM) from the data and then uses the learned model and the collected data in approximate policy iteration (API) to find a good policy. A natural choice for the policy evaluation step in this algorithm is to use least-squares temporal difference (LSTD) learning algorithm. Empirical results on three benchmark problems show that this particular instance of LAM-API performs competitively as compared with LSPI, both from the point of view of data and computational efficiency.},\n\tacceptrate = {294 out of 1129=26\\%},\n\tauthor = {Yao, H. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AAAI-2012},\n\tkeywords = {reinforcement learning, Markov Decision Processes,function approximation, control, planning, control learning, temporal difference learning, LSTD},\n\tmonth = {07},\n\tpages = {1212--1217},\n\ttitle = {Approximate Policy Iteration with Linear Action Models},\n\turl_paper = {lamapi.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n In this paper we consider the problem of finding a good policy given some batch data. We propose a new approach, LAM-API, that first builds a so-called linear action model (LAM) from the data and then uses the learned model and the collected data in approximate policy iteration (API) to find a good policy. A natural choice for the policy evaluation step in this algorithm is to use the least-squares temporal difference (LSTD) learning algorithm. Empirical results on three benchmark problems show that this particular instance of LAM-API performs competitively as compared with LSPI, both from the point of view of data and computational efficiency.\n
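Since this abstract singles out LSTD for the policy-evaluation step, below is a minimal sketch of batch LSTD for evaluating a fixed policy from (state, reward, next_state) transitions; the ridge term, the feature-map interface, and the function name are assumptions, and the construction of the linear action model used by LAM-API is not shown.

```python
import numpy as np

def lstd(transitions, phi, gamma=0.95, reg=1e-3):
    """Batch LSTD: given transitions (s, r, s_next) collected under the policy
    being evaluated and a feature map phi, return weights theta such that
    V(s) is approximated by phi(s) @ theta."""
    d = len(phi(transitions[0][0]))
    A = reg * np.eye(d)                 # small ridge term for invertibility
    b = np.zeros(d)
    for s, r, s_next in transitions:
        f, f_next = phi(s), phi(s_next)
        A += np.outer(f, f - gamma * f_next)
        b += r * f
    return np.linalg.solve(A, b)

# toy usage: a two-state chain with one-hot features
phi = lambda s: np.eye(2)[s]
data = [(0, 1.0, 1), (1, 0.0, 0), (0, 1.0, 1), (1, 0.0, 0)]
print(lstd(data, phi, gamma=0.9))
```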
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Statistical linear estimation with penalized estimators: an application to reinforcement learning.\n \n \n \n \n\n\n \n Pires, B.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1535–1542, 06 2012. \n \n\n\n\n
\n\n\n\n \n \n \"Statistical paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PiSze1206,\n\tabstract = {Motivated by value function estimation in reinforcement learning, we study statistical linear inverse problems, i.e., problems where the coefficients of a linear system to be solved are observed in noise. We consider penalized estimators, where performance is evaluated using a matrix-weighted two-norm of the defect of the estimator measured with respect to the true, unknown coefficients. Two objective functions are considered de- pending whether the error of the defect measured with respect to the noisy coefficients is squared or unsquared. We propose simple, yet novel and theoretically well-founded data-dependent choices for the regularization parameters for both cases that avoid data-splitting. A distinguishing feature of our analysis is that we derive deterministic error bounds in terms of the error of the coefficients, thus allowing the complete separation of the analysis of the stochastic properties of these errors. We show that our results lead to new insights and bounds for linear value function estimation in reinforcement learning.},\n\tacceptrate = {243 out of 890=27.3\\%},\n\tauthor = {Pires, B.A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, linear prediction, reinforcement learning, LSTD, performance bounds, theory, inverse problems},\n\tmonth = {06},\n\tpages = {1535--1542},\n\ttitle = {Statistical linear estimation with penalized estimators: an application to reinforcement learning},\n\turl_paper = {linear_estimation_icml2012_extended.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n Motivated by value function estimation in reinforcement learning, we study statistical linear inverse problems, i.e., problems where the coefficients of a linear system to be solved are observed in noise. We consider penalized estimators, where performance is evaluated using a matrix-weighted two-norm of the defect of the estimator measured with respect to the true, unknown coefficients. Two objective functions are considered, depending on whether the error of the defect measured with respect to the noisy coefficients is squared or unsquared. We propose simple, yet novel and theoretically well-founded data-dependent choices for the regularization parameters for both cases that avoid data-splitting. A distinguishing feature of our analysis is that we derive deterministic error bounds in terms of the error of the coefficients, thus allowing the complete separation of the analysis of the stochastic properties of these errors. We show that our results lead to new insights and bounds for linear value function estimation in reinforcement learning.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Analysis of Kernel Mean Matching under Covariate Shift.\n \n \n \n \n\n\n \n Yu, Y.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 607–614, 06 2012. \n \n\n\n\n
\n\n\n\n \n \n \"Analysis paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{YuSze1206,\n\tabstract = {In real supervised learning scenarios, it is not uncommon that the training and test sample follow different probability distributions, thus rendering the necessity to correct the sampling bias. Focusing on a particular co- variate shift problem, we derive high probability confidence bounds for the kernel mean matching (KMM) estimator, whose convergence rate turns out to depend on some regularity measure of the regression function and also on some capacity measure of the kernel. By comparing KMM with the natural plug-in estimator, we establish the superiority of the former hence provide concrete evidence/understanding to the effectiveness of KMM under covariate shift.},\n\tacceptrate = {243 out of 890=27.3\\%},\n\tauthor = {Yu, Y.-L. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {theory, nonparametrics, covariate shift},\n\tmonth = {06},\n\tpages = {607--614},\n\ttitle = {Analysis of Kernel Mean Matching under Covariate Shift},\n\turl_paper = {covshiftkmm.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n In real supervised learning scenarios, it is not uncommon that the training and test samples follow different probability distributions, making it necessary to correct the sampling bias. Focusing on a particular covariate shift problem, we derive high-probability confidence bounds for the kernel mean matching (KMM) estimator, whose convergence rate turns out to depend on some regularity measure of the regression function and also on some capacity measure of the kernel. By comparing KMM with the natural plug-in estimator, we establish the superiority of the former and hence provide concrete evidence for, and a better understanding of, the effectiveness of KMM under covariate shift.\n
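For readers who want to see the estimator whose convergence is analyzed here, below is a minimal sketch of kernel mean matching: it reweights the training sample so that its kernel mean matches the test sample's mean in the RKHS, solved by projected gradient descent with only a box constraint on the weights. The RBF kernel, the step size, and the omission of the usual sum constraint on the weights are simplifying assumptions.

```python
import numpy as np

def rbf(A, B, sigma=1.0):
    d2 = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2 * sigma ** 2))

def kmm_weights(X_train, X_test, sigma=1.0, B=10.0, iters=500):
    """Kernel mean matching sketch: minimize 0.5*b^T K b - kappa^T b over
    0 <= b <= B by projected gradient descent, so that the b-weighted training
    kernel mean approximates the test kernel mean."""
    n_tr, n_te = len(X_train), len(X_test)
    K = rbf(X_train, X_train, sigma)
    kappa = (n_tr / n_te) * rbf(X_train, X_test, sigma).sum(axis=1)
    beta = np.ones(n_tr)
    step = 1.0 / (np.linalg.norm(K, 2) + 1e-12)   # 1 / Lipschitz constant of the gradient
    for _ in range(iters):
        beta = np.clip(beta - step * (K @ beta - kappa), 0.0, B)
    return beta

# toy usage: test distribution shifted relative to the training distribution
rng = np.random.default_rng(0)
X_tr = rng.normal(0.0, 1.0, size=(100, 1))
X_te = rng.normal(1.0, 1.0, size=(80, 1))
print(kmm_weights(X_tr, X_te).round(2)[:10])      # larger weights near the test mass
```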
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An adaptive algorithm for finite stochastic partial monitoring (extended version).\n \n \n \n \n\n\n \n Bartók, G.; Zolghadr, N.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1–20, 06 2012. \n \n\n\n\n
\n\n\n\n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BarZolSze1206,\n\tabstract = {We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both "easy" and "hard" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an "easy region" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).},\n\tacceptrate = {243 out of 890=27.3\\%},\n\tauthor = {Bart{\\'o}k, G. and Zolghadr, N. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {partial information, online learning, stochastic partial monitoring, minimax bounds, theory},\n\tmonth = {06},\n\tpages = {1--20},\n\ttitle = {An adaptive algorithm for finite stochastic partial monitoring (extended version)},\n\turl_paper = {adaptive_partmon_full.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n We present a new anytime algorithm that achieves near-optimal regret for any instance of finite stochastic partial monitoring. In particular, the new algorithm achieves the minimax regret, within logarithmic factors, for both \"easy\" and \"hard\" problems. For easy problems, it additionally achieves logarithmic individual regret. Most importantly, the algorithm is adaptive in the sense that if the opponent strategy is in an \"easy region\" of the strategy space then the regret grows as if the problem was easy. As an implication, we show that under some reasonable additional assumptions, the algorithm enjoys an O(T^{1/2}) regret in Dynamic Pricing, proven to be hard by Bartok et al. (2011).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Randomized Strategy for Learning to Combine Many Features.\n \n \n \n \n\n\n \n Afkanpour, A.; György, A.; Szepesvári, C.; and Bowling, M. H.\n\n\n \n\n\n\n arXiv e-prints, abs/1205.0288. 05 2012.\n \n\n\n\n
\n\n\n\n \n \n \"ALink\n  \n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{AfGySzBo12,\n\tabstract = {We consider the problem of learning a predictor by combining possibly infinitely many linear predictors whose weights are to be learned, too, an instance of multiple kernel learning. To control overfitting a group p-norm penalty is used to penalize the empirical loss. We consider a reformulation of the problem that lets us implement a randomized version of the proximal point algorithm. The key idea of the new algorithm is to use randomized computation to alleviate the problem of dealing with possibly uncountably many predictors. Finite-time performance bounds are derived that show that under mild conditions the method finds the optimum of the penalized criterion in an efficient manner. Experimental results confirm the effectiveness of the new algorithm.},\n\tauthor = {Afkanpour, A. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs. and Bowling, M. H.},\n\tdate = {2012-05},\n\tee = {http://arxiv.org/abs/1205.0288},\n\tentrysubtype = {unrefereed},\t\n\tjournal = {arXiv e-prints},\n\tkeywords = {multikernel learning, optimization, theory},\n\tmonth = {05},\n\ttitle = {A Randomized Strategy for Learning to Combine Many Features},\n\turl_paper = {randomized-mkl.pdf},\n\tvolume = {abs/1205.0288},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n We consider the problem of learning a predictor by combining possibly infinitely many linear predictors whose weights are to be learned, too, an instance of multiple kernel learning. To control overfitting a group p-norm penalty is used to penalize the empirical loss. We consider a reformulation of the problem that lets us implement a randomized version of the proximal point algorithm. The key idea of the new algorithm is to use randomized computation to alleviate the problem of dealing with possibly uncountably many predictors. Finite-time performance bounds are derived that show that under mild conditions the method finds the optimum of the penalized criterion in an efficient manner. Experimental results confirm the effectiveness of the new algorithm.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The grand challenge of computer Go: Monte Carlo tree search and extensions.\n \n \n \n \n\n\n \n Gelly, S.; Kocsis, L.; Schoenauer, M.; Sebag, M.; Silver, D.; Szepesvári, C.; and Teytaud, O.\n\n\n \n\n\n\n Communications of the ACM, 55(3): 106–113. 2012.\n \n\n\n\n
\n\n\n\n \n \n \"TheLink\n  \n \n \n \"The paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{GKSSSST12,\n\tabstract = {The ancient oriental game of Go has long been considered a grand challenge for artificial intelligence. For decades, computer Go has defied the classical methods in game tree search that worked so successfully for chess and checkers. How- ever, recent play in computer Go has been transformed by a new paradigm for tree search based on Monte-Carlo methods. Programs based on Monte-Carlo tree search now play at human-master levels and are beginning to challenge top professional players. In this paper we describe the leading algorithms for Monte-Carlo tree search and explain how they have advanced the state of the art in computer Go.},\n\tauthor = {Gelly, S. and Kocsis, L. and Schoenauer, M. and Sebag, M. and Silver, D. and Szepesv{\\'a}ri, Cs. and Teytaud, O.},\n\tdate = {2012-03},\n\tee = {http://doi.acm.org/10.1145/2093548.2093574},\n\tjournal = {Communications of the ACM},\n\tkeywords = {Monte-Carlo tree search, UCT, Game of Go},\n\tnumber = {3},\n\tpages = {106--113},\n\ttitle = {The grand challenge of computer Go: Monte Carlo tree search and extensions},\n\turl_paper = {CACM-MCTS.pdf},\n\tvolume = {55},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n The ancient oriental game of Go has long been considered a grand challenge for artificial intelligence. For decades, computer Go has defied the classical methods in game tree search that worked so successfully for chess and checkers. However, recent play in computer Go has been transformed by a new paradigm for tree search based on Monte-Carlo methods. Programs based on Monte-Carlo tree search now play at human-master levels and are beginning to challenge top professional players. In this paper we describe the leading algorithms for Monte-Carlo tree search and explain how they have advanced the state of the art in computer Go.\n
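The selection rule at the heart of the Monte-Carlo tree search methods surveyed in this paper is UCB1 applied to the children of a tree node (the UCT rule); the sketch below shows only that selection step, with the exploration constant and data layout as illustrative assumptions rather than any particular Go program's implementation.

```python
import math

def uct_select(children, c=math.sqrt(2)):
    """UCT child selection: pick the child maximizing mean value plus a UCB1
    exploration bonus. `children` is a list of (visits, total_value) pairs;
    unvisited children are explored first."""
    total_visits = sum(v for v, _ in children)
    def score(child):
        visits, value = child
        if visits == 0:
            return float("inf")
        return value / visits + c * math.sqrt(math.log(total_visits) / visits)
    return max(range(len(children)), key=lambda i: score(children[i]))

# toy usage: three children with different empirical means and visit counts
print(uct_select([(10, 6.0), (3, 2.5), (0, 0.0)]))  # the unvisited child is chosen
```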
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online-to-confidence-set conversions and application to sparse stochastic bandits.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 1–9, 2012. \n \n\n\n\n
\n\n\n\n \n \n \"Online-to-confidence-setLink\n  \n \n \n \"Online-to-confidence-set paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYPSze12,\n\tabstract = {We introduce a novel technique, which we call online-to-confidence-set conversion. The technique allows us to construct high-probability confidence sets for linear prediction with correlated inputs given the predictions of any algorithm (e.g., online LASSO, exponentiated gradient algorithm, online least-squares, p-norm algorithm) targeting online learning with linear predictors and the quadratic loss. By construction, the size of the confidence set is directly governed by the regret of the online learning algorithm. Constructing tight confidence sets is interesting on its own, but the new technique is given extra weight by the fact having access tight confidence sets underlies a number of important problems. The advantage of our construction here is that progress in constructing better algorithms for online prediction problems directly translates into tighter confidence sets. In this paper, this is demonstrated in the case of linear stochastic bandits. In particular, we introduce the sparse variant of linear stochastic bandits and show that a recent online algorithm together with our online-to-confidence-set conversion allows one to derive algorithms that can exploit if the reward is a function of a sparse linear combination of the components of the chosen action.},\n\tacceptrate = {oral presentation 24 out of 400=6\\%},\n\tauthor = {Abbasi-Yadkori, Y. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tee = {http://jmlr.csail.mit.edu/proceedings/papers/v22/abbasi-yadkori12/abbasi-yadkori12.pdf},\n\tkeywords = {bandits, stochastic bandits, theory, online learning, linear bandits},\n\tpages = {1--9},\n\ttitle = {Online-to-confidence-set conversions and application to sparse stochastic bandits},\n\turl_paper = {online-to-confidenceset.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n We introduce a novel technique, which we call online-to-confidence-set conversion. The technique allows us to construct high-probability confidence sets for linear prediction with correlated inputs given the predictions of any algorithm (e.g., online LASSO, exponentiated gradient algorithm, online least-squares, p-norm algorithm) targeting online learning with linear predictors and the quadratic loss. By construction, the size of the confidence set is directly governed by the regret of the online learning algorithm. Constructing tight confidence sets is interesting on its own, but the new technique is given extra weight by the fact that having access to tight confidence sets underlies a number of important problems. The advantage of our construction here is that progress in constructing better algorithms for online prediction problems directly translates into tighter confidence sets. In this paper, this is demonstrated in the case of linear stochastic bandits. In particular, we introduce the sparse variant of linear stochastic bandits and show that a recent online algorithm together with our online-to-confidence-set conversion allows one to derive algorithms that can exploit the situation when the reward is a function of a sparse linear combination of the components of the chosen action.\n
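To make the shape of the construction concrete, the sketch below shows a schematic membership test for such a confidence set: the online learner's recorded predictions define the set through a residual test whose radius is driven by the learner's regret bound. The radius function is a placeholder and the names are made up; the paper's exact expression also involves noise-dependent terms.

import numpy as np

# Schematic online-to-confidence-set membership test:
# X holds the inputs x_1..x_t (rows), yhat the online learner's predictions.
# `radius` stands in for a bound of the form "regret of the learner plus
# noise-dependent terms"; the exact formula is given in the paper.
def in_confidence_set(theta, X, yhat, radius):
    residuals = yhat - X @ theta
    return float(residuals @ residuals) <= radius

# Illustrative (assumed) radius built from a known regret bound B_t.
def crude_radius(B_t, noise_term):
    return B_t + noise_term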
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The adversarial stochastic shortest path problem with unknown transition probabilities.\n \n \n \n \n\n\n \n Neu, G.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, pages 805–813, 04 2012. \n \n\n\n\n
\n\n\n\n \n \n \"TheLink\n  \n \n \n \"The paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{NeGySz12,\n\tabstract = {We consider online learning in a special class of episodic Markovian decision processes, namely, loop-free stochastic shortest path problems. In this problem, an agent has to traverse through a finite directed acyclic graph with random transitions while maximizing the obtained rewards along the way. We assume that the reward function can change arbitrarily between consecutive episodes, and is entirely revealed to the agent at the end of each episode. Previous work was concerned with the case when the stochastic dynamics is known ahead of time, whereas the main novelty of this paper is that this assumption is lifted. We propose an algorithm called "follow the perturbed optimistic policy" that combines ideas from the "follow the perturbed leader" method for online learning of arbitrary sequences and "upper confidence reinforcement learning", an algorithm for regret minimization in Markovian decision processes (with a fixed reward function). We prove that the expected cumulative regret of our algorithm is of order L|X| |A| T^{1/2} up to logarithmic factors, where L is the length of the longest path in the graph, X is the state space, A is the action space and T is the number of episodes. To our knowledge this is the first algorithm that learns and controls stochastic and adversarial components in an online fashion at the same time.},\n\tauthor = {Neu, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tee = {http://jmlr.csail.mit.edu/proceedings/papers/v22/neu12.html},\n\tkeywords = {online learning, adversarial setting, finite MDPs, shortest path problem, theory},\n\tmonth = {04},\n\tpages = {805--813},\n\ttitle = {The adversarial stochastic shortest path problem with unknown transition probabilities},\n\turl_paper = {neu12.pdf},\n\tyear = {2012}}\n\n
\n
\n\n\n
\n We consider online learning in a special class of episodic Markovian decision processes, namely, loop-free stochastic shortest path problems. In this problem, an agent has to traverse through a finite directed acyclic graph with random transitions while maximizing the obtained rewards along the way. We assume that the reward function can change arbitrarily between consecutive episodes, and is entirely revealed to the agent at the end of each episode. Previous work was concerned with the case when the stochastic dynamics is known ahead of time, whereas the main novelty of this paper is that this assumption is lifted. We propose an algorithm called \"follow the perturbed optimistic policy\" that combines ideas from the \"follow the perturbed leader\" method for online learning of arbitrary sequences and \"upper confidence reinforcement learning\", an algorithm for regret minimization in Markovian decision processes (with a fixed reward function). We prove that the expected cumulative regret of our algorithm is of order L|X| |A| T^{1/2} up to logarithmic factors, where L is the length of the longest path in the graph, X is the state space, A is the action space and T is the number of episodes. To our knowledge this is the first algorithm that learns and controls stochastic and adversarial components in an online fashion at the same time.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regularized Least-Squares Regression: Learning from a beta-mixing Sequence.\n \n \n \n \n\n\n \n Farahmand, A.; and Szepesvári, C.\n\n\n \n\n\n\n Journal of Statistical Planning and Inference, 142(2): 493–505. 02 2012.\n \n\n\n\n
\n\n\n\n \n \n \"Regularized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{FaSze11,\n\tabstract = {We analyze the rate of convergence of the estimation error in regularized least-squares regression when the data is exponentially beta-mixing. The results are proven under the assumption that the metric entropy of the balls in the chosen function space grows at most polynomially. In order to prove our main result, we also derive a relative deviation concentration inequality for beta-mixing processes, which might be of independent interest.  The other major techniques that we use are the independent-blocks technique and the peeling device.  An interesting aspect of our analysis is that in order to obtain fast rates we have to make the block sizes dependent on the layer of peeling. With this approach, up to a logarithmic factor, we recover the optimal minimax rates available for the i.i.d. case. In particular, our rate asymptotically matches  the optimal rate of convergence when the regression function belongs to a Sobolev space.},\n\tauthor = {Farahmand, A.m. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2012-01},\n\tdate-added = {2011-08-09 16:51:54 -0600},\n\tdate-modified = {2012-06-03 14:14:18 -0600},\n\tjournal = {Journal of Statistical Planning and Inference},\n\tkeywords = {theory, nonparametrics, regression, mixing},\n\tmonth = {02},\n\tnumber = {2},\n\tpages = {493--505},\n\ttitle = {Regularized Least-Squares Regression: Learning from a beta-mixing Sequence},\n\turl_paper = {RegularizedRegressionWithMixing.pdf},\n\tvolume = {142},\n\tyear = {2012},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}}\n\n
\n
\n\n\n
\n We analyze the rate of convergence of the estimation error in regularized least-squares regression when the data is exponentially beta-mixing. The results are proven under the assumption that the metric entropy of the balls in the chosen function space grows at most polynomially. In order to prove our main result, we also derive a relative deviation concentration inequality for beta-mixing processes, which might be of independent interest. The other major techniques that we use are the independent-blocks technique and the peeling device. An interesting aspect of our analysis is that in order to obtain fast rates we have to make the block sizes dependent on the layer of peeling. With this approach, up to a logarithmic factor, we recover the optimal minimax rates available for the i.i.d. case. In particular, our rate asymptotically matches the optimal rate of convergence when the regression function belongs to a Sobolev space.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2011\n \n \n (12)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Least Squares Temporal Difference Learning and Galerkin's Method.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Mini-Workshop: Mathematics of Machine Learning, volume 8, of Oberwolfach Reports, pages 2385–2388. European Mathematical Society, Oberwolfach, 2011.\n \n\n\n\n
\n\n\n\n \n \n \"Least paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@incollection{Sze12,\n\tabstract = {The problem of estimating the value function underlying a Markovian reward process is considered. As it is well known, the value function underlying a Markovian reward process satisfied a linear fixed point equation. One approach to learning the value function from finite data is to find a good approximation to the value function in a given (linear) subspace of the space of value functions. We review some of the issues that arise when following this approach, as well as some results that characterize the finite-sample performance of some of the algorithms.},\n\taddress = {Oberwolfach},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Mini-Workshop: Mathematics of Machine Learning},\n\tentrysubtype = {unrefereed},\n\tkeywords = {reinforcement learning, linear prediction, theory, inverse problems, LSTD},\n\tnumber = {3},\n\tpages = {2385--2388},\n\tpublisher = {European Mathematical Society},\n\tseries = {Oberwolfach Reports},\n\ttitle = {Least Squares Temporal Difference Learning and Galerkin's Method},\n\turl_paper = {Oberwolfach-Report.pdf},\n\tvolume = {8},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n The problem of estimating the value function underlying a Markovian reward process is considered. As is well known, the value function underlying a Markovian reward process satisfies a linear fixed point equation. One approach to learning the value function from finite data is to find a good approximation to the value function in a given (linear) subspace of the space of value functions. We review some of the issues that arise when following this approach, as well as some results that characterize the finite-sample performance of some of the algorithms.\n
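As a concrete anchor for the discussion, a minimal LSTD-style estimator of the linear fixed point can be written as follows. The transition format (feature vector, reward, next feature vector) is an assumption for illustration, and the small ridge term is added only to keep the linear system invertible.

import numpy as np

# Minimal LSTD sketch: solve A theta = b with
#   A = sum_t phi_t (phi_t - gamma * phi'_t)^T,   b = sum_t r_t phi_t.
def lstd(transitions, gamma=0.99, ridge=1e-6):
    d = len(transitions[0][0])
    A = ridge * np.eye(d)
    b = np.zeros(d)
    for phi, r, phi_next in transitions:
        phi = np.asarray(phi, dtype=float)
        phi_next = np.asarray(phi_next, dtype=float)
        A += np.outer(phi, phi - gamma * phi_next)
        b += r * phi
    theta = np.linalg.solve(A, b)
    return theta          # value estimate: V(s) ~= theta @ phi(s)

The Galerkin viewpoint of the note is visible in the structure of A and b: the Bellman equation is only enforced against the span of the features.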
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Non-trivial two-armed partial-monitoring games are bandits.\n \n \n \n \n\n\n \n Antos, A.; Bartók, G.; and Szepesvári, C.\n\n\n \n\n\n\n arXiv e-prints, abs/1108.4961. 2011.\n \n\n\n\n
\n\n\n\n \n \n \"Non-trivialLink\n  \n \n \n \"Non-trivial paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{AnBaSze11,\n\tabstract = {We consider online learning in partial-monitoring games against an oblivious adversary. We show that when the number of actions available to the learner is two and the game is nontrivial then it is reducible to a bandit-like game and thus the minimax regret is Theta(T^{1/2}).},\n\tauthor = {Antos, A. and Bart{\\'o}k, G. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2011-01},\n\tee = {http://arxiv.org/abs/1108.4961},\n\tjournal = {arXiv e-prints},\n\tkeywords = {minimax bounds, bandits, partial information, online learning, theory},\n\ttitle = {Non-trivial two-armed partial-monitoring games are bandits},\n\turl_paper = {twoarmed.pdf},\n\tentrysubtype = {unrefereed},\n\tvolume = {abs/1108.4961},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n We consider online learning in partial-monitoring games against an oblivious adversary. We show that when the number of actions available to the learner is two and the game is non-trivial, then it is reducible to a bandit-like game and thus the minimax regret is Theta(T^{1/2}).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Proceedings of Algorithmic Learning Theory - 22nd International Conference (ALT 2011).\n \n \n \n \n\n\n \n Kivinen, J.; Szepesvári, C.; Ukkonen, E.; and Zeugmann, T.,\n editors.\n \n\n\n \n\n\n\n Volume 6925, of Lecture Notes in Computer Science.Springer. Espoo, Finland, 10 2011.\n \n\n\n\n
\n\n\n\n \n \n \"ProceedingsLink\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@proceedings{KiSzeUkZeu11,\n\taddress = {Espoo, Finland},\n\tbooktitle = {Proceedings of Algorithmic Learning Theory - 22nd International Conference (ALT 2011)},\n\tdate = {2011-01},\n\teditor = {Kivinen, J. and Szepesv{\\'a}ri, Cs. and Ukkonen, E. and Zeugmann, T.},\n\tee = {http://dx.doi.org/10.1007/978-3-642-24412-4},\n\tisbn = {978-3-642-24411-7},\n\tkeywords = {learning theory},\n\tmonth = {10},\n\tpublisher = {Springer},\n\tseries = {Lecture Notes in Computer Science},\n\ttitle = {Proceedings of Algorithmic Learning Theory - 22nd International Conference (ALT 2011)},\n\tvolume = {6925},\n\tyear = {2011}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Improved Algorithms for Linear Stochastic Bandits (extended, corrected version).\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 2312–2320, 12 2011. \n \n\n\n\n
\n\n\n\n \n \n \"Improved paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 34 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYPSz11,\n\tabstract = {We improve the theoretical analysis and empirical performance of algorithms for\nthe stochastic multi-armed bandit problem and the linear stochastic multi-armed\nbandit problem. In particular, we show that a simple modification of Auer's UCB\nalgorithm (Auer, 2002) achieves with high probability constant regret. More importantly,\nwe modify and, consequently, improve the analysis of the algorithm\nfor the for linear stochastic bandit problem studied by Auer (2002), Dani et al.\n(2008), Rusmevichientong and Tsitsiklis (2010), Li et al. (2010). Our modification\nimproves the regret bound by a logarithmic factor, though experiments show\na vast improvement. In both cases, the improvement stems from the construction\nof smaller confidence sets. For their construction we use a novel tail inequality for\nvector-valued martingales.},\n\tacceptrate = {oral presentation 60 out of 1400=4.3\\%},\n\tauthor = {Abbasi-Yadkori, Y. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {bandits, stochastic bandits, theory, linear bandits, online learning},\n\tmonth = {12},\n\tpages = {2312--2320},\n\ttitle = {Improved Algorithms for Linear Stochastic Bandits (extended, corrected version)},\n\turl_paper = {linear-bandits-NeurIPS2011.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n We improve the theoretical analysis and empirical performance of algorithms for the stochastic multi-armed bandit problem and the linear stochastic multi-armed bandit problem. In particular, we show that a simple modification of Auer's UCB algorithm (Auer, 2002) achieves with high probability constant regret. More importantly, we modify and, consequently, improve the analysis of the algorithm for the linear stochastic bandit problem studied by Auer (2002), Dani et al. (2008), Rusmevichientong and Tsitsiklis (2010), and Li et al. (2010). Our modification improves the regret bound by a logarithmic factor, while experiments show a vast improvement. In both cases, the improvement stems from the construction of smaller confidence sets. For their construction we use a novel tail inequality for vector-valued martingales.\n
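A rough sketch of one round of a confidence-set based linear bandit algorithm of this kind is given below. The radius beta follows a commonly quoted form of a self-normalized bound; the constants (noise level R, parameter norm bound S, ridge lam, confidence level delta) are illustrative assumptions rather than the paper's exact tuning.

import numpy as np

# One optimistic round: estimate theta by ridge regression, build a confidence
# ellipsoid around it, and play the action with the largest optimistic value.
def linucb_round(actions, V, Xy, t, R=1.0, S=1.0, lam=1.0, delta=0.05):
    d = V.shape[0]
    theta_hat = np.linalg.solve(V, Xy)
    # schematic self-normalized confidence radius (constants are assumptions)
    beta = R * np.sqrt(d * np.log((1.0 + t / lam) / delta)) + np.sqrt(lam) * S
    V_inv = np.linalg.inv(V)
    def optimistic_value(x):
        x = np.asarray(x, dtype=float)
        return x @ theta_hat + beta * np.sqrt(x @ V_inv @ x)
    return max(actions, key=optimistic_value)

def linucb_update(V, Xy, x, reward):
    x = np.asarray(x, dtype=float)
    return V + np.outer(x, x), Xy + reward * x

A typical initialization (an assumption here) is V = lam * I and Xy = zeros(d); the "smaller confidence sets" of the paper enter only through the radius beta.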
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayesian Policy Evaluation for Reinforcement Learning.\n \n \n \n \n\n\n \n Fard, M.; Pineau, J.; and Szepesvári, C.\n\n\n \n\n\n\n In UAI, pages 195–202, 07 2011. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-Bayesian paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{FaSzePi11,\n\tabstract = {Bayesian priors offer a compact yet general means of incorporating domain knowledge into many learning tasks. The correctness of the Bayesian analysis and inference, however, largely depends on accuracy and correctness of these priors. PAC-Bayesian methods over- come this problem by providing bounds that hold regardless of the correctness of the prior distribution. This paper introduces the first PAC-Bayesian bound for the batch reinforce- ment learning problem with function approx- imation. We show how this bound can be used to perform model-selection in a trans- fer learning scenario. Our empirical results confirm that PAC-Bayesian policy evaluation is able to leverage prior distributions when they are informative and, unlike standard Bayesian RL approaches, ignore them when they are misleading.},\n\tacceptrate = {96 out of 285=34\\%},\n\tauthor = {Fard, M.M. and Pineau, J. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {UAI},\n\tkeywords = {reinforcement learning, theory, function approximation, transfer learning},\n\tmonth = {07},\n\tpages = {195--202},\n\ttitle = {PAC-Bayesian Policy Evaluation for Reinforcement Learning},\n\turl_paper = {PacBayesUAI.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n Bayesian priors offer a compact yet general means of incorporating domain knowledge into many learning tasks. The correctness of the Bayesian analysis and inference, however, largely depends on the accuracy and correctness of these priors. PAC-Bayesian methods overcome this problem by providing bounds that hold regardless of the correctness of the prior distribution. This paper introduces the first PAC-Bayesian bound for the batch reinforcement learning problem with function approximation. We show how this bound can be used to perform model selection in a transfer learning scenario. Our empirical results confirm that PAC-Bayesian policy evaluation is able to leverage prior distributions when they are informative and, unlike standard Bayesian RL approaches, ignore them when they are misleading.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model Selection in Reinforcement Learning.\n \n \n \n \n\n\n \n Farahmand, A.; and Szepesvári, C.\n\n\n \n\n\n\n Machine Learning Journal, 85(3): 299–332. 12 2011.\n \n\n\n\n
\n\n\n\n \n \n \"Model paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{farahmand2010,\n\tabstract = {(This version is identical to the MLJ version except that in the proof of Theorem 2 a minor issue in the proof is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) rein- forcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidates functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bellman error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem when the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.},\n\tauthor = {Farahmand, A.m. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2011-07},\n\tdate-added = {2011-07-03 21:08:30 -0600},\n\tdate-modified = {2012-06-03 14:11:12 -0600},\n\tdoi = {10.1007/s10994-011-5254-7},\n\tjournal = {Machine Learning Journal},\n\tkeywords = {model selection, Markov Decision Processes, reinforcement learning, Bellman residuals, theory, nonparametrics, adaptivity},\n\tmonth = {12},\n\tnumber = {3},\n\tpages = {299--332},\n\ttitle = {Model Selection in Reinforcement Learning},\n\turl_paper = {RLModelSelect.pdf},\n\tvolume = {85},\n\tyear = {2011},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}}\n\n
\n
\n\n\n
\n (This version is identical to the MLJ version except that a minor issue in the proof of Theorem 2 is corrected.) We consider the problem of model selection in the batch (offline, non-interactive) reinforcement learning setting when the goal is to find an action-value function with the smallest Bellman error among a countable set of candidate functions. We propose a complexity regularization-based model selection algorithm, BErMin, and prove that it enjoys an oracle-like property: the estimator's error differs from that of an oracle, who selects the candidate with the minimum Bellman error, by only a constant factor and a small remainder term that vanishes at a parametric rate as the number of samples increases. As an application, we consider a problem where the true action-value function belongs to an unknown member of a nested sequence of function spaces. We show that under some additional technical conditions BErMin leads to a procedure whose rate of convergence, up to a constant factor, matches that of an oracle who knows which of the nested function spaces the true action-value function belongs to, i.e., the procedure achieves adaptivity.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Least Squares Estimation with Self-Normalized Processes: An Application to Bandit Problems.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n arXiv e-prints, abs/1102.2670. 07 2011.\n \n\n\n\n
\n\n\n\n \n \n \"OnlineLink\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{AYPaSze11,\n\tabstract = {The analysis of online least squares estimation is at the heart of many stochastic sequential decision making problems. We employ tools from the self-normalized processes to provide a simple and self-contained proof of a tail bound of a vector-valued martingale. We use the bound to construct a new tighter confidence sets for the least squares estimate.\nWe apply the confidence sets to several online decision problems, such as the multi-armed and the linearly parametrized bandit problems. The confidence sets are potentially applicable to other problems such as sleeping bandits, generalized linear bandits, and other linear control problems.\nWe improve the regret bound of the Upper Confidence Bound (UCB) algorithm of Auer et al. (2002) and show that its regret is with high-probability a problem dependent constant. In the case of linear bandits (Dani et al., 2008), we improve the problem dependent bound in the dimension and number of time steps. Furthermore, as opposed to the previous result, we prove that our bound holds for small sample sizes, and at the same time the worst case bound is improved by a logarithmic factor and the constant is improved. },\n\tauthor = {Abbasi-Yadkori, Y. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2011-07},\n\tee = {http://arxiv.org/abs/1102.2670},\n\tjournal = {arXiv e-prints},\n\tkeywords = {bandits, theory, tail inequalities, method of mixtures, least-squares methods},\n\tmonth = {07},\n\ttitle = {Online Least Squares Estimation with Self-Normalized Processes: An Application to Bandit Problems},\n\tentrysubtype = {unrefereed},\n\tvolume = {abs/1102.2670},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n The analysis of online least squares estimation is at the heart of many stochastic sequential decision making problems. We employ tools from the theory of self-normalized processes to provide a simple and self-contained proof of a tail bound for a vector-valued martingale. We use the bound to construct new, tighter confidence sets for the least squares estimate. We apply the confidence sets to several online decision problems, such as the multi-armed and the linearly parametrized bandit problems. The confidence sets are potentially applicable to other problems such as sleeping bandits, generalized linear bandits, and other linear control problems. We improve the regret bound of the Upper Confidence Bound (UCB) algorithm of Auer et al. (2002) and show that its regret is, with high probability, a problem-dependent constant. In the case of linear bandits (Dani et al., 2008), we improve the problem-dependent bound in the dimension and the number of time steps. Furthermore, as opposed to the previous result, we prove that our bound holds for small sample sizes, and at the same time the worst-case bound is improved by a logarithmic factor and the constant is improved.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Agnostic KWIK learning and efficient approximate reinforcement learning.\n \n \n \n \n\n\n \n Szita, I.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 739–772, 07 2011. \n \n\n\n\n
\n\n\n\n \n \n \"Agnostic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SziSze11,\n\tabstract = {A popular approach in reinforcement learning is to use a model-based algorithm, i.e., an algorithm that utilizes a model learner to learn an approximate model to the environment. It has been shown such a model-based learner is efficient if the model learner is efficient in the so-called ``knows what it knows'' (KWIK) framework. A major limitation of the standard KWIK framework is that, by its very definition, it covers only the case when the (model) learner can represent the actual environment with no errors. In this paper, we introduce the agnostic KWIK learning model, where we relax this assumption by allowing nonzero approximation errors. We show that with the new definition that an efficient model learner still leads to an efficient reinforcement learning algorithm. At the same time, though, we find that learning within the new framework can be substantially slower as compared to the standard framework, even in the case of simple learning problems.},\n\tacceptrate = {36 out of 117=31\\%},\n\tauthor = {Szita, I. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {reinforcement learning, model selection, complexity regularization, adaptivity, offline learning, off-policy learning, finite-sample bounds, theory},\n\tmonth = {07},\n\tpages = {739--772},\n\ttitle = {Agnostic KWIK learning and efficient approximate reinforcement learning},\n\turl_paper = {agnosticKwik.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n A popular approach in reinforcement learning is to use a model-based algorithm, i.e., an algorithm that utilizes a model learner to learn an approximate model of the environment. It has been shown that such a model-based learner is efficient if the model learner is efficient in the so-called ``knows what it knows'' (KWIK) framework. A major limitation of the standard KWIK framework is that, by its very definition, it covers only the case when the (model) learner can represent the actual environment with no errors. In this paper, we introduce the agnostic KWIK learning model, where we relax this assumption by allowing nonzero approximation errors. We show that, with the new definition, an efficient model learner still leads to an efficient reinforcement learning algorithm. At the same time, though, we find that learning within the new framework can be substantially slower as compared to the standard framework, even in the case of simple learning problems.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Minimax Regret of Finite Partial-Monitoring Games in Stochastic Environments.\n \n \n \n \n\n\n \n Bartók, G.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 133–154, 07 2011. \n \n\n\n\n
\n\n\n\n \n \n \"Minimax paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BaPaSze11,\n\tabstract = {In a partial monitoring game, the learner repeatedly chooses an action, the\nenvironment responds with an outcome, and then the learner suffers a loss and\nreceives a feedback signal, both of which are fixed functions of the action and\nthe outcome. The goal of the learner is to minimize his regret, which is the\ndifference between his total cumulative loss and the total loss of the best\nfixed action in hindsight.\nAssuming that the outcomes are generated in an i.i.d. fashion from an arbitrary and\nunknown probability distribution, we characterize the minimax regret of any\npartial monitoring game with finitely many actions and\noutcomes. It turns out that the minimax regret of any such game is either zero,\nTheta(T^{1/2}), Theta(T^{2/3}), or Theta(T). We provide a computationally efficient learning\nalgorithm that achieves the minimax regret within logarithmic factor for any game.},\n\tacceptrate = {36 out of 117=31\\%},\n\tauthor = {Bart{\\'o}k, G. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {online learning, partial information, theory, game theory},\n\tmonth = {07},\n\tpages = {133--154},\n\ttitle = {Minimax Regret of Finite Partial-Monitoring Games in Stochastic Environments},\n\turl_paper = {partmon_colt_final.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n In a partial monitoring game, the learner repeatedly chooses an action, the environment responds with an outcome, and then the learner suffers a loss and receives a feedback signal, both of which are fixed functions of the action and the outcome. The goal of the learner is to minimize his regret, which is the difference between his total cumulative loss and the total loss of the best fixed action in hindsight. Assuming that the outcomes are generated in an i.i.d. fashion from an arbitrary and unknown probability distribution, we characterize the minimax regret of any partial monitoring game with finitely many actions and outcomes. It turns out that the minimax regret of any such game is either zero, Theta(T^{1/2}), Theta(T^{2/3}), or Theta(T). We provide a computationally efficient learning algorithm that achieves the minimax regret within a logarithmic factor for any game.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regret Bounds for the Adaptive Control of Linear Quadratic Systems.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 1–26, 07 2011. \n \n\n\n\n
\n\n\n\n \n \n \"Regret paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AYSze11,\n\tabstract = {We study the average cost Linear Quadratic (LQ) control problem with unknown model parameters, also known as the adaptive control problem in the control community. We design an algorithm and prove that apart from logarithmic factors its regret up to time T is O(T^{1/2}). Unlike previous approaches that use a forced-exploration scheme, we construct a high-probability confidence set around the model parameters and design an algorithm that plays optimistically with respect to this confidence set. The construction of the confidence set is based on the recent results from online least-squares estimation and leads to improved worst-case regret bound for the proposed algorithm. To the best of our knowledge this is the first time that a regret bound is derived for the LQ control problem.},\n\tacceptrate = {36 out of 117=31\\%},\n\tauthor = {Abbasi-Yadkori, Y. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {online learning, continuous-space MDPs, continuous action space, reinforcement learning, linear dynamics, theory},\n\tmonth = {07},\n\tpages = {1--26},\n\ttitle = {Regret Bounds for the Adaptive Control of Linear Quadratic Systems},\n\turl_paper = {lqr_colt_final.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n We study the average cost Linear Quadratic (LQ) control problem with unknown model parameters, also known as the adaptive control problem in the control community. We design an algorithm and prove that apart from logarithmic factors its regret up to time T is O(T^{1/2}). Unlike previous approaches that use a forced-exploration scheme, we construct a high-probability confidence set around the model parameters and design an algorithm that plays optimistically with respect to this confidence set. The construction of the confidence set is based on recent results from online least-squares estimation and leads to an improved worst-case regret bound for the proposed algorithm. To the best of our knowledge this is the first time that a regret bound is derived for the LQ control problem.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Sequential Learning for Optimal Monitoring of Multi-channel Wireless Networks.\n \n \n \n \n\n\n \n Pallavi, A.; Zheng, R.; and Szepesvári, C.\n\n\n \n\n\n\n In INFOCOM, pages 1152–1160, 04 2011. \n \n\n\n\n
\n\n\n\n \n \n \"Sequential paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PaZheSze11,\n\tabstract = {We consider the problem of optimally assigning p sniffers to K\nchannels to monitor the transmission activities in a multi-channel wireless\nnetwork. The activity of users is initially unknown to the sniffers and is to\nbe learned along with channel assignment decisions while maximizing the benifits of this assignment,\n resulting in the fundamental trade-off between exploration versus exploitation. We formulate it as the\nlinear partial monitoring problem, a super-class of multi-armed bandits. As the number of\narms (sniffer-channel assignments) is exponential, novel techniques are called for, to allow efficient learning.\nWe use the linear bandit model to capture the dependency amongst the arms and develop two policies that take advantage of this dependency.\nBoth policies enjoy logarithmic regret bound of time-slots with a term that is sub-linear in the number of\narms.},\n\tacceptrate = {291 out of 1823=16\\%},\n\tauthor = {Pallavi, A. and Zheng, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {INFOCOM},\n\tkeywords = {theory, networking, wireless networks, bandits, stochastic bandits},\n\tmonth = {04},\n\tpages = {1152--1160},\n\ttitle = {Sequential Learning for Optimal Monitoring of Multi-channel Wireless Networks},\n\turl_paper = {infocom11_final.pdf},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n We consider the problem of optimally assigning p sniffers to K channels to monitor the transmission activities in a multi-channel wireless network. The activity of users is initially unknown to the sniffers and is to be learned along with channel assignment decisions while maximizing the benefits of this assignment, resulting in the fundamental trade-off between exploration and exploitation. We formulate it as the linear partial monitoring problem, a super-class of multi-armed bandits. As the number of arms (sniffer-channel assignments) is exponential, novel techniques are called for to allow efficient learning. We use the linear bandit model to capture the dependency amongst the arms and develop two policies that take advantage of this dependency. Both policies enjoy a regret bound that is logarithmic in the number of time slots, with a term that is sub-linear in the number of arms.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n X-Armed Bandits.\n \n \n \n \n\n\n \n Bubeck, S.; Munos, R.; Stoltz, G.; and Szepesvári, C.\n\n\n \n\n\n\n Journal of Machine Learning Research, 12: 1655–1695. 06 2011.\n Submitted on 21/1/2010\n\n\n\n
\n\n\n\n \n \n \"X-ArmedLink\n  \n \n \n \"X-Armed paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{bubeck2010,\n\tabstract = {We consider a generalization of stochastic bandits where the set of\narms, X, is allowed to be a generic measurable space and the\nmean-payoff function is "locally Lipschitz" with respect to a\ndissimilarity function that is known to the decision maker. Under this\ncondition we construct an arm selection policy, called HOO\n(hierarchical optimistic optimization), with improved regret bounds\ncompared to previous results for a large class of problems. In\nparticular, our results imply that if X is the unit hypercube in a\nEuclidean space and the mean-payoff function has a finite number of\nglobal maxima around which the behavior of the function is locally\ncontinuous with a known smoothness degree, then the expected regret of\nHOO is bounded up to a logarithmic factor by sqrt(n), that is, the rate of\ngrowth of the regret is independent of the dimension of the space. We\nalso prove the minimax optimality of our algorithm when the\ndissimilarity is a metric. Our basic strategy has quadratic\ncomputational complexity as a function of the number of time steps and\ndoes not rely on the doubling trick. We also introduce a modified\nstrategy, which relies on the doubling trick but runs in linearithmic\ntime. Both results are improvements with respect to previous\napproaches.},\n\tauthor = {Bubeck, S. and Munos, R. and Stoltz, G. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2011-06},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2011-07-04 11:16:14 -0600},\n\tee = {http://www.jmlr.org/papers/volume12/bubeck11a/bubeck11a.pdf},\n\tjournal = {Journal of Machine Learning Research},\n\tkeywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds},\n\tmonth = {06},\n\tnote = {Submitted on 21/1/2010},\n\tpages = {1655--1695},\n\ttitle = {X-Armed Bandits},\n\turl_paper = {BMSS10.pdf},\n\tvolume = {12},\n\tyear = {2011}}\n\n
\n
\n\n\n
\n We consider a generalization of stochastic bandits where the set of arms, X, is allowed to be a generic measurable space and the mean-payoff function is \"locally Lipschitz\" with respect to a dissimilarity function that is known to the decision maker. Under this condition we construct an arm selection policy, called HOO (hierarchical optimistic optimization), with improved regret bounds compared to previous results for a large class of problems. In particular, our results imply that if X is the unit hypercube in a Euclidean space and the mean-payoff function has a finite number of global maxima around which the behavior of the function is locally continuous with a known smoothness degree, then the expected regret of HOO is bounded up to a logarithmic factor by sqrt(n), that is, the rate of growth of the regret is independent of the dimension of the space. We also prove the minimax optimality of our algorithm when the dissimilarity is a metric. Our basic strategy has quadratic computational complexity as a function of the number of time steps and does not rely on the doubling trick. We also introduce a modified strategy, which relies on the doubling trick but runs in linearithmic time. Both results are improvements with respect to previous approaches.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2010\n \n \n (17)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Error Propagation for Approximate Policy and Value Iteration (extended version).\n \n \n \n \n\n\n \n Farahmand, A.; Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 12 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Error paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{FaMuSz10,\n\tabstract = {We address the question of how the approximation error/Bellman residual at each\niteration of the Approximate Policy/Value Iteration algorithms influences the quality\nof the resulted policy. We quantify the performance loss as the L^p norm of the\napproximation error/Bellman residual at each iteration. Moreover, we show that\nthe performance loss depends on the expectation of the squared Radon-Nikodym\nderivative of a certain distribution rather than its supremum -- as opposed to what\nhas been suggested by the previous results. Also our results indicate that the\ncontribution of the approximation/Bellman error to the performance loss is more\nprominent in the later iterations of API/AVI, and the effect of an error term in the\nearlier iterations decays exponentially fast.},\n\tacceptrate = {293 out of 1219=24\\%},\n\tauthor = {Farahmand, A.m. and Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {reinforcement learning, theory},\n\tmonth = {12},\n\ttitle = {Error Propagation for Approximate Policy and Value Iteration (extended version)},\n\turl_paper = {ErrorPropAPVI-NeurIPS09.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We address the question of how the approximation error/Bellman residual at each iteration of the Approximate Policy/Value Iteration algorithms influences the quality of the resulting policy. We quantify the performance loss as the L^p norm of the approximation error/Bellman residual at each iteration. Moreover, we show that the performance loss depends on the expectation of the squared Radon-Nikodym derivative of a certain distribution rather than its supremum – as opposed to what has been suggested by previous results. Our results also indicate that the contribution of the approximation/Bellman error to the performance loss is more prominent in the later iterations of API/AVI, and the effect of an error term in the earlier iterations decays exponentially fast.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Estimation of Rényi Entropy and Mutual Information Based on Generalized Nearest-Neighbor Graphs (extended version).\n \n \n \n \n\n\n \n Pál, D.; Póczos, B.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 12 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Estimation paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{PaPoSze10,\n\tabstract = {We present simple and computationally efficient nonparametric estimators of\nR{\\'e}nyi entropy and mutual information based on an i.i.d. sample drawn from an\nunknown, absolutely continuous distribution over R^d.  The estimators are\ncalculated as the sum of p-th powers of the Euclidean lengths of the edges of\nthe `generalized nearest-neighbor' graph of the sample and the empirical copula\nof the sample respectively. For the first time,\nwe prove the almost sure consistency of these estimators and\n upper bounds on their rates of convergence, the latter of which under\nthe assumption that the density underlying the sample is Lipschitz continuous.\nExperiments demonstrate their usefulness in independent subspace analysis.\n},\n\tacceptrate = {293 out of 1219=24\\%},\n\tauthor = {P{\\'a}l, D. and P{\\'o}czos, B. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {information theory, theory, mutual information, entropy},\n\tmonth = {12},\n\ttitle = {Estimation of R{\\'e}nyi Entropy and Mutual Information Based on Generalized Nearest-Neighbor Graphs (extended version)},\n\turl_paper = {Renyi-NeurIPS2010.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We present simple and computationally efficient nonparametric estimators of Rényi entropy and mutual information based on an i.i.d. sample drawn from an unknown, absolutely continuous distribution over R^d. The estimators are calculated as the sum of p-th powers of the Euclidean lengths of the edges of the `generalized nearest-neighbor' graph of the sample and of the empirical copula of the sample, respectively. For the first time, we prove the almost sure consistency of these estimators and upper bounds on their rates of convergence, the latter under the assumption that the density underlying the sample is Lipschitz continuous. Experiments demonstrate their usefulness in independent subspace analysis.\n
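The core statistic behind such graph-based estimators is easy to compute; the sketch below sums the p-th powers of k-nearest-neighbor edge lengths using scipy's cKDTree. The normalization that turns this sum into a consistent Rényi-entropy estimate (a constant depending on p, d and k together with a power of the sample size) is deliberately omitted here; see the paper for the exact scaling.

import numpy as np
from scipy.spatial import cKDTree

# Sum of p-th powers of the edge lengths of a k-nearest-neighbor graph.
def knn_power_sum(X, k=1, p=1.0):
    X = np.asarray(X, dtype=float)
    tree = cKDTree(X)
    # query k+1 neighbors because each point's nearest neighbor is itself
    dists, _ = tree.query(X, k=k + 1)
    return float(np.sum(dists[:, 1:] ** p))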
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Parametric Bandits: The Generalized Linear Case (extended version).\n \n \n \n \n\n\n \n Filippi, S.; Cappé, O.; Garivier, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 586–594, 12 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Parametric paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 17 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{FiOlGaSze10,\n\tabstract = { We consider structured multi-armed bandit problems based on the Generalized Linear Model (GLM) framework of statistics.\nFor these bandits, we propose a new algorithm, called GLM-UCB.\nWe derive finite time, high probability bounds on the regret of the algorithm, extending previous analyses developed for the linear bandits to the non-linear case.\n  The analysis highlights a key difficulty in generalizing linear bandit algorithms to the non-linear case, which is\n  solved in GLM-UCB by focusing on the reward space rather than on the parameter space.\n  Moreover,\n  as the actual effectiveness of current parameterized bandit algorithms is often poor\n  in practice, we provide a tuning method based on asymptotic arguments,\n  which leads to significantly better practical performance.\n We present two numerical experiments on real-world data that\n  illustrate the potential of the GLM-UCB approach.\n},\n\tacceptrate = {293 out of 1219=24\\%},\n\tauthor = {Filippi, S. and Capp{\\'e}, O. and Garivier, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {bandits, stochastic bandits, theory},\n\tmonth = {12},\n\tpages = {586--594},\n\ttitle = {Parametric Bandits: The Generalized Linear Case (extended version)},\n\turl_paper = {GenLinBandits-NeurIPS2010.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We consider structured multi-armed bandit problems based on the Generalized Linear Model (GLM) framework of statistics. For these bandits, we propose a new algorithm, called GLM-UCB. We derive finite time, high probability bounds on the regret of the algorithm, extending previous analyses developed for the linear bandits to the non-linear case. The analysis highlights a key difficulty in generalizing linear bandit algorithms to the non-linear case, which is solved in GLM-UCB by focusing on the reward space rather than on the parameter space. Moreover, as the actual effectiveness of current parameterized bandit algorithms is often poor in practice, we provide a tuning method based on asymptotic arguments, which leads to significantly better practical performance. We present two numerical experiments on real-world data that illustrate the potential of the GLM-UCB approach. \n
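The sketch below illustrates the flavour of a GLM-UCB-style round for Bernoulli rewards with a logistic link: fit the GLM parameter by penalized maximum likelihood and add an exploration bonus measured in the reward space, as the abstract describes. The bonus rho_t, the ridge term and the Newton iteration count are illustrative placeholders, not the paper's tuning.

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

# Penalized logistic MLE via a few Newton steps
# (X: features of past plays, rows; y: observed 0/1 rewards).
def fit_logistic(X, y, lam=1.0, iters=25):
    d = X.shape[1]
    theta = np.zeros(d)
    for _ in range(iters):
        p = sigmoid(X @ theta)
        grad = X.T @ (p - y) + lam * theta
        H = (X * (p * (1 - p))[:, None]).T @ X + lam * np.eye(d)
        theta -= np.linalg.solve(H, grad)
    return theta

# One round: index each arm by predicted mean reward plus a bonus in reward space.
def glm_ucb_choose(arms, X, y, rho_t, lam=1.0):
    theta = fit_logistic(X, y, lam)
    V_inv = np.linalg.inv(X.T @ X + lam * np.eye(X.shape[1]))
    def index(x):
        x = np.asarray(x, dtype=float)
        return sigmoid(x @ theta) + rho_t * np.sqrt(x @ V_inv @ x)
    return max(arms, key=index)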
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Markov Decision Processes under Bandit Feedback (extended version).\n \n \n \n \n\n\n \n Neu, G.; György, A.; Antos, A.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 12 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{NeGyAnSze10,\n\tabstract = {We consider online learning in finite stochastic Markovian environments\nwhere in each time step a new reward function is chosen by an oblivious adversary.\nThe goal of the learning agent is to compete with the best stationary policy\nin terms of the total reward received.\nIn each time step the agent observes the current state and the reward associated with the last transition, however, the agent does not observe the rewards associated with other state-action pairs.\nThe agent is assumed to know the transition probabilities.\nThe state of the art result for this setting is a no-regret algorithm.\nIn this paper we propose a new learning algorithm and,\n assuming that stationary policies mix uniformly fast,\nwe show that after T time steps, the expected regret of the new algorithm\nis O( T^{2/3} (ln T)^(1/3),\ngiving the first rigorously proved regret bound for the problem.\n},\n\tacceptrate = {oral presentation 73 out of 1219=6\\%},\n\tauthor = {Neu, G. and Gy{\\"o}rgy, A. and Antos, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {online learning, Markov Decision Processes, bandits},\n\tmonth = {12},\n\ttitle = {Online Markov Decision Processes under Bandit Feedback (extended version)},\n\turl_paper = {OnlineMDP-NeurIPS2010.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We consider online learning in finite stochastic Markovian environments where in each time step a new reward function is chosen by an oblivious adversary. The goal of the learning agent is to compete with the best stationary policy in terms of the total reward received. In each time step the agent observes the current state and the reward associated with the last transition; however, the agent does not observe the rewards associated with other state-action pairs. The agent is assumed to know the transition probabilities. The state-of-the-art result for this setting is a no-regret algorithm. In this paper we propose a new learning algorithm and, assuming that stationary policies mix uniformly fast, we show that after T time steps, the expected regret of the new algorithm is O(T^{2/3} (ln T)^{1/3}), giving the first rigorously proved regret bound for the problem.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Toward a Classification of Finite Partial-Monitoring Games.\n \n \n \n \n\n\n \n Bartók, G.; Pál, D.; and Szepesvári, C.\n\n\n \n\n\n\n In Hutter, M.; Stephan, F.; Vovk, V.; and Zeugmann, T., editor(s), ALT, pages 224–238, 10 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Toward paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{bartok2010,\n\tabstract = {In a finite partial-monitoring game against Nature, the Learner repeatedly chooses one of finitely many actions, the Nature responds with one of finitely many outcomes, the Learner suffers a loss and receives feedback signal, both of which are fixed functions of the action and the outcome. The goal of the Learner is to minimize its total cumulative loss. We make progress towards classification of these games based on their minimax expected regret. Namely, we classify almost all games with two outcomes: We show that their minimax expected regret is either zero, $\\Theta(T^{1/2})$, $\\Theta(T^{2/3})$, or $\\Theta(T)$ and we give a simple and efficiently computable classification of these four classes of games. Our hope is that the result can serve as a stepping stone toward classifying all finite partial-monitoring games.},\n\tauthor = {Bart{\\'o}k, G. and P{\\'a}l, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tdoi = {10.1007/978-3-642-16108-7_20},\n\teditor = {Hutter, M. and Stephan, F. and Vovk, V. and Zeugmann, T.},\n\tkeywords = {game theory, online learning, adversarial setting, theory, partial information},\n\tmonth = {10},\n\tpages = {224--238},\n\ttitle = {Toward a Classification of Finite Partial-Monitoring Games},\n\turl_paper = {partial-monitoring.pdf},\n\tyear = {2010},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-642-16108-7_20}}\n\n
\n
\n\n\n
\n In a finite partial-monitoring game against Nature, the Learner repeatedly chooses one of finitely many actions, Nature responds with one of finitely many outcomes, and the Learner suffers a loss and receives a feedback signal, both of which are fixed functions of the action and the outcome. The goal of the Learner is to minimize its total cumulative loss. We make progress towards the classification of these games based on their minimax expected regret. Namely, we classify almost all games with two outcomes: we show that their minimax expected regret is either zero, Theta(T^{1/2}), Theta(T^{2/3}), or Theta(T), and we give a simple and efficiently computable classification of these four classes of games. Our hope is that the result can serve as a stepping stone toward classifying all finite partial-monitoring games.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Toward Off-Policy Learning Control with Function Approximation.\n \n \n \n \n\n\n \n Maei, H.; Szepesvári, C.; Bhatnagar, S.; and Sutton, R.\n\n\n \n\n\n\n In Fürnkranz, J.; and Joachims, T., editor(s), ICML, pages 719–726, 06 2010. Omnipress\n \n\n\n\n
\n\n\n\n \n \n \"Toward paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{maei2010,\n\tabstract = {We present the first temporal-difference learning algorithm for off-policy control with unrestricted linear function approximation whose per-time-step complexity is linear in the number of features. Our algorithm, Greedy-GQ, is an extension of recent work on gradient temporal-difference learning, which has hitherto been restricted to a prediction (policy evaluation) setting, to a control setting in which the target policy is greedy with respect to a linear approximation to the optimal action-value function. A limitation of our control setting is that we require the behavior policy to be stationary. We call this setting latent learning because the optimal policy, though learned, is not manifest in behavior. Popular off-policy algorithms such as Q-learning are known to be unstable in this setting when used with linear function approximation.},\n\tacceptrate = {152 out of 594=26\\%},\n\tauthor = {Maei, H.R. and Szepesv{\\'a}ri, Cs. and Bhatnagar, S. and Sutton, R.S.},\n\tbooktitle = {ICML},\n\teditor = {F{\\"u}rnkranz, J. and Joachims, T.},\n\tkeywords = {reinforcement learning, control learning, online learning, gradient algorithm, stochastic approximation, theory, function approximation, nonlinear function approximation, neural networks},\n\tmonth = {06},\n\tpages = {719--726},\n\tpublisher = {Omnipress},\n\ttitle = {Toward Off-Policy Learning Control with Function Approximation},\n\turl_paper = {ICML10_controlGQ.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We present the first temporal-difference learning algorithm for off-policy control with unrestricted linear function approximation whose per-time-step complexity is linear in the number of features. Our algorithm, Greedy-GQ, is an extension of recent work on gradient temporal-difference learning, which has hitherto been restricted to a prediction (policy evaluation) setting, to a control setting in which the target policy is greedy with respect to a linear approximation to the optimal action-value function. A limitation of our control setting is that we require the behavior policy to be stationary. We call this setting latent learning because the optimal policy, though learned, is not manifest in behavior. Popular off-policy algorithms such as Q-learning are known to be unstable in this setting when used with linear function approximation.\n
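A sketch of a Greedy-GQ-style two-timescale update with linear state-action features is shown below. It follows the general form of gradient temporal-difference methods with a greedy target policy; the exact correction term, sub-gradient handling and step-size conditions should be checked against the paper, and the feature representation is an assumption.

import numpy as np

# One transition of a Greedy-GQ-style update (sketch, not the paper's exact form).
#   phi:       feature vector of the (s, a) actually taken
#   next_phis: list of feature vectors phi(s', a') for every action a'
def greedy_gq_step(theta, w, phi, r, next_phis, gamma, alpha, beta):
    q_next = np.array([theta @ p for p in next_phis])
    phi_greedy = next_phis[int(np.argmax(q_next))]        # feature of the greedy a'
    delta = r + gamma * q_next.max() - theta @ phi        # TD error
    theta = theta + alpha * (delta * phi - gamma * (w @ phi) * phi_greedy)
    w = w + beta * (delta - w @ phi) * phi                # auxiliary weights (second timescale)
    return theta, w

Note that each step touches only the feature vectors of the current transition, which is what keeps the per-time-step complexity linear in the number of features.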
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds.\n \n \n \n \n\n\n \n Szita, I.; and Szepesvári, C.\n\n\n \n\n\n\n In Fürnkranz, J.; and Joachims, T., editor(s), ICML, pages 1031–1038, 06 2010. Omnipress\n \n\n\n\n
\n\n\n\n \n \n \"Model-basedLink\n  \n \n \n \"Model-based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{szita2010,\n\tabstract = {A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \\log N)$, in contrast to the $O(N \\log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \\log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.},\n\tacceptrate = {152 out of 594=26\\%},\n\tauthor = {Szita, I. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\teditor = {F{\\"u}rnkranz, J. and Joachims, T.},\n\tee = {http://www.icml2010.org/papers/546.pdf},\n\tkeywords = {reinforcement learning, PAC-learning, theory, exploration vs. exploitation, sequential algorithms},\n\tmonth = {06},\n\tpages = {1031--1038},\n\tpublisher = {Omnipress},\n\ttitle = {Model-based Reinforcement Learning with Nearly Tight Exploration Complexity Bounds},\n\turl_paper = {ICML10_rmax_improved.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n A strong selling point of using a model in reinforcement learning is that model-based algorithms can propagate the obtained experience more quickly, and are able to direct exploration better. As a consequence, fewer exploratory actions are enough to learn a good policy. Strangely enough, current theoretical results for model-based algorithms do not support this claim: In a Markov decision process with N states, the best bounds on the number of exploratory steps necessary are of order $O(N^2 \log N)$, in contrast to the $O(N \log N)$ bound available for the model-free, delayed Q-learning algorithm. In this paper we show that a modified version of the Rmax algorithm needs to make at most $O(N \log N)$ exploratory steps. This matches the lower bound up to logarithmic factors, as well as the upper bound of the state-of-the-art model-free algorithm, while our new bound improves the dependence on the discount factor gamma.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Active Learning in Heteroscedastic Noise.\n \n \n \n \n\n\n \n Antos, A.; Grover, V.; and Szepesvári, C.\n\n\n \n\n\n\n Theoretical Computer Science, 411(29–30): 2712–2728. 06 2010.\n \n\n\n\n
\n\n\n\n \n \n \"Active paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{a.antos2010,\n\tabstract = {We consider the problem of actively learning the mean values of distributions associated with a finite number of options. The decision maker can select which option to generate the next observation from, the goal being to produce estimates with equally good precision for all the options. If sample means are used to estimate the unknown values then the optimal solution, assuming that the distributions are known up to a shift, is to sample from each distribution proportional to its variance. No information other than the distributions' variances is needed to calculate the optimal solution. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss suffered by this algorithm, apart from logarithmic factors, scales as $n^{(-3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated on a simple problem.},\n\tauthor = {Antos, A. and Grover, V. and Szepesv{\\'a}ri, Cs.},\n\tdate = {2010-06},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-11-25 00:42:00 -0700},\n\tdoi = {10.1016/j.tcs.2010.04.007},\n\tissn = {0304-3975},\n\tjournal = {Theoretical Computer Science},\n\tkeywords = {active learning, regression, sequential algorithms, theory},\n\tmonth = {06},\n\tnumber = {29--30},\n\tpages = {2712--2728},\n\ttitle = {Active Learning in Heteroscedastic Noise},\n\turl_paper = {Allocation-TCS10.pdf},\n\tvolume = {411},\n\tyear = {2010},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/j.tcs.2010.04.007}}\n\n
\n
\n\n\n
\n We consider the problem of actively learning the mean values of distributions associated with a finite number of options. The decision maker can select which option to generate the next observation from, the goal being to produce estimates with equally good precision for all the options. If sample means are used to estimate the unknown values then the optimal solution, assuming that the distributions are known up to a shift, is to sample from each distribution proportional to its variance. No information other than the distributions' variances is needed to calculate the optimal solution. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss suffered by this algorithm, apart from logarithmic factors, scales as $n^{(-3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated on a simple problem.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Budgeted Distribution Learning of Belief Net Parameters.\n \n \n \n \n\n\n \n Li, L.; Póczos, B.; Greiner, R.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 879–886, 06 2010. Omnipress\n \n\n\n\n
\n\n\n\n \n \n \"BudgetedLink\n  \n \n \n \"Budgeted paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{b.poczos2010,\n\tabstract = {Most learning algorithms assume that a data set is given initially. We address the common situation where data is not available initially, but can be obtained, at a cost. We focus on learning Bayesian belief networks (BNs) over discrete variables. As such BNs are models of probabilistic distributions, we consider the ``generative'' challenge of learning the parameters, for a fixed structure, that best match the true distribution. We focus on the budgeted learning setting, where there is a known fixed cost $c_i$ for acquiring the value of the i-th feature for any specified instance, and a known total cost to spend acquiring all information. After formally defining this problem from a Bayesian perspective, we first consider allocation algorithms that must decide, before seeing any results, which features of which instances to probe. We show this is NP-hard, even if all variables are independent, then prove that the greedy allocation algorithm IGA is optimal when the costs are uniform and the features are independent, but can otherwise be sub-optimal. We then show that general (non-allocation) policies perform better, and explore the challenges of learning the parameters for general belief networks in this setting, describing conditions for when the obvious round-robin algorithm will, versus will not work optimally. We also explore the effectiveness of this and various other heuristic algorithms.},\n\tacceptrate = {152 out of 594=26\\%},\n\tauthor = {Li, L. and P{\\'o}czos, B. and Greiner, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tee = {http://www.icml2010.org/papers/406.pdf},\n\tkeywords = {budgeted learning, sequential algorithms, unsupervised learning, density estimation},\n\tmonth = {06},\n\towner = {Beata},\n\tpages = {879--886},\n\tpublisher = {Omnipress},\n\ttimestamp = {2010.08.31},\n\ttitle = {Budgeted Distribution Learning of Belief Net Parameters},\n\turl_paper = {BDL_ICML2010.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n Most learning algorithms assume that a data set is given initially. We address the common situation where data is not available initially, but can be obtained, at a cost. We focus on learning Bayesian belief networks (BNs) over discrete variables. As such BNs are models of probabilistic distributions, we consider the ``generative'' challenge of learning the parameters, for a fixed structure, that best match the true distribution. We focus on the budgeted learning setting, where there is a known fixed cost $c_i$ for acquiring the value of the i-th feature for any specified instance, and a known total cost to spend acquiring all information. After formally defining this problem from a Bayesian perspective, we first consider allocation algorithms that must decide, before seeing any results, which features of which instances to probe. We show this is NP-hard, even if all variables are independent, then prove that the greedy allocation algorithm IGA is optimal when the costs are uniform and the features are independent, but can otherwise be sub-optimal. We then show that general (non-allocation) policies perform better, and explore the challenges of learning the parameters for general belief networks in this setting, describing conditions for when the obvious round-robin algorithm will, versus will not work optimally. We also explore the effectiveness of this and various other heuristic algorithms.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Models of Active Learning in Group-structured State Spaces.\n \n \n \n \n\n\n \n Bartók, G.; Szepesvári, C.; and Zilles, S.\n\n\n \n\n\n\n Information and Computation, 208: 364–384. 04 2010.\n \n\n\n\n
\n\n\n\n \n \n \"Models paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{bartok2010a,\n\tabstract = {We investigate the problem of learning the transition dynamics of deterministic, discrete-state environments. We assume that an agent exploring such an environment is able to perform actions (from a finite set of actions) in the environment and to sense the state changes. The question investigated is whether the agent can learn the dynamics without visiting all states. Such a goal is unrealistic in general, hence we assume that the environment has structural properties an agent might exploit. In particular, we assume that the set of all action sequences forms an algebraic group.\n\n\t\t  We introduce a learning model in different variants and study under which circumstances the corresponding ``group structured environments'' can be learned efficiently by experimenting with group generators (actions). It turns out that for some classes of such environments the choice of actions given to the agent determines if efficient learning is possible. Negative results are presented, even without efficiency constraints, for rather general classes of groups, showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.},\n\tauthor = {Bart{\\'o}k, G. and Szepesv{\\'a}ri, Cs. and Zilles, S.},\n\tdate = {2010-04},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-11-25 00:42:38 -0700},\n\tdoi = {10.1016/j.ic.2009.09.001},\n\tjournal = {Information and Computation},\n\tkeywords = {active learning, sequential algorithms, theory},\n\tmonth = {04},\n\tpages = {364--384},\n\ttitle = {Models of Active Learning in Group-structured State Spaces},\n\turl_paper = {bartokSZ09groups.pdf},\n\tvolume = {208},\n\tyear = {2010},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/j.ic.2009.09.001}}\n\n
\n
\n\n\n
\n We investigate the problem of learning the transition dynamics of deterministic, discrete-state environments. We assume that an agent exploring such an environment is able to perform actions (from a finite set of actions) in the environment and to sense the state changes. The question investigated is whether the agent can learn the dynamics without visiting all states. Such a goal is unrealistic in general, hence we assume that the environment has structural properties an agent might exploit. In particular, we assume that the set of all action sequences forms an algebraic group. We introduce a learning model in different variants and study under which circumstances the corresponding ``group structured environments'' can be learned efficiently by experimenting with group generators (actions). It turns out that for some classes of such environments the choice of actions given to the agent determines if efficient learning is possible. Negative results are presented, even without efficiency constraints, for rather general classes of groups, showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n The Online Loop-free Stochastic Shortest-Path Problem.\n \n \n \n \n\n\n \n Neu, G.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 231–243, 06 2010. \n \n\n\n\n
\n\n\n\n \n \n \"The paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{g.neu2010,\n\tabstract = {We consider a stochastic extension of the loop-free shortest path problem with adversarial rewards. In this episodic Markov decision problem an agent traverses through an acyclic graph with random transitions: at each step of an episode the agent chooses an action, receives some reward, and arrives at a random next state, where the reward and the distribution of the next state depend on the actual state and the chosen action. We consider the bandit situation when only the reward of the just visited state-action pair is revealed to the agent. For this problem we develop algorithms that perform asymptotically as well as the best stationary policy in hindsight. Assuming that all states are reachable with probability $\\alpha > 0$ under all policies, we give an algorithm and prove that its regret is $O(L^2 \\sqrt(T|A|)/\\alpha)$, where $T$ is the number of episodes, $A$ denotes the (finite) set of actions, and $L$ is the length of the longest path in the graph. Variants of the algorithm are given that improve the dependence on the transition probabilities under specific conditions. The results are also extended to variations of the problem, including the case when the agent competes with time varying policies.},\n\tacceptrate = {41 out of 129=32\\%},\n\tauthor = {Neu, G. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {online learning, adversarial setting, finite MDPs, shortest path problem},\n\tmonth = {06},\n\tpages = {231--243},\n\ttitle = {The Online Loop-free Stochastic Shortest-Path Problem},\n\turl_paper = {ssp_col_final.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We consider a stochastic extension of the loop-free shortest path problem with adversarial rewards. In this episodic Markov decision problem an agent traverses through an acyclic graph with random transitions: at each step of an episode the agent chooses an action, receives some reward, and arrives at a random next state, where the reward and the distribution of the next state depend on the actual state and the chosen action. We consider the bandit situation when only the reward of the just visited state-action pair is revealed to the agent. For this problem we develop algorithms that perform asymptotically as well as the best stationary policy in hindsight. Assuming that all states are reachable with probability $\alpha > 0$ under all policies, we give an algorithm and prove that its regret is $O(L^2 \sqrt{T|A|}/\alpha)$, where $T$ is the number of episodes, $A$ denotes the (finite) set of actions, and $L$ is the length of the longest path in the graph. Variants of the algorithm are given that improve the dependence on the transition probabilities under specific conditions. The results are also extended to variations of the problem, including the case when the agent competes with time varying policies.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n REGO: Rank-based Estimation of Rényi Information using Euclidean Graph Optimization.\n \n \n \n \n\n\n \n Póczos, B.; Kirshner, S.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, volume 9, pages 852–859, 05 2010. \n \n\n\n\n
\n\n\n\n \n \n \"REGO: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{poczos2010,\n\tabstract = {We propose a new method for a non-parametric estimation of R{\\'e}nyi and Shannon information for a multivariate distribution using a corresponding copula, a multivariate distribution over normalized ranks of the data. As the information of the distribution is the same as the negative entropy of its copula, our method estimates this information by solving a Euclidean graph optimization problem on the empirical estimate of the distribution's copula. Owing to the properties of the copula, we show that the resulting estimator of R{\\'e}nyi information is strongly consistent and robust. Further, we demonstrate its applicability in image registration in addition to simulated experiments.},\n\tacceptrate = {oral presentation 24 out of 308=8\\%},\n\tauthor = {P{\\'o}czos, B. and Kirshner, S. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {information theory, theory, mutual information},\n\tmonth = {05},\n\tpages = {852--859},\n\ttitle = {REGO: Rank-based Estimation of R{\\'e}nyi Information using Euclidean Graph Optimization},\n\turl_paper = {information_AIstat_v4.pdf},\n\tvolume = {9},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We propose a new method for a non-parametric estimation of Rényi and Shannon information for a multivariate distribution using a corresponding copula, a multivariate distribution over normalized ranks of the data. As the information of the distribution is the same as the negative entropy of its copula, our method estimates this information by solving a Euclidean graph optimization problem on the empirical estimate of the distribution's copula. Owing to the properties of the copula, we show that the resulting estimator of Rényi information is strongly consistent and robust. Further, we demonstrate its applicability in image registration in addition to simulated experiments.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Reinforcement Learning Algorithms for MDPs.\n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Wiley Encyclopedia of Operations Research. Wiley, 09 2010.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@incollection{Szepesvari2010,\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Wiley Encyclopedia of Operations Research},\n\tkeywords = {survey, reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient},\n\tmonth = {09},\n\tpublisher = {Wiley},\n\ttitle = {Reinforcement Learning Algorithms for MDPs},\n\tyear = {2010}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Algorithms for Reinforcement Learning.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Morgan and Claypool, 07 2010.\n \n\n\n\n
\n\n\n\n \n \n \"AlgorithmsPaper\n  \n \n \n \"Algorithms link\n  \n \n \n \"Algorithms paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 14 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@book{Szepesvari2010c,\n\tabstract = {Reinforcement learning is a learning paradigm concerned with learning to control a system so as to maximize a numerical performance measure that expresses a long-term objective. What distinguishes reinforcement learning from supervised learning is that only partial feedback is given to the learner about the learner's predictions. Further, the predictions may have long term effects through influencing the future state of the controlled system. Thus, time plays a special role. The goal in reinforcement learning is to develop efficient learning algorithms, as well as to understand the algorithms' merits and limitations. Reinforcement learning is of great interest because of the large number of practical applications that it can be used to address, ranging from problems in artificial intelligence to operations research or control engineering. In this book we focus on those algorithms of reinforcement learning which build on the powerful theory of dynamic programming. We give a fairly comprehensive catalog of learning problems, describe the core ideas, a large number of state of the art algorithms, followed by the discussion of their theoretical properties and limitations.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate = {2010-07},\n\tdate-added = {2010-08-28 20:41:24 -0600},\n\tdate-modified = {2012-06-03 14:07:36 -0600},\n\tdoi = {10.2200/S00268ED1V01Y201005AIM009},\n\tkeywords = {reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient},\n\tmonth = {07},\n\tpublisher = {Morgan and Claypool},\n\ttitle = {Algorithms for Reinforcement Learning},\n\turl = {http://www.ualberta.ca/~szepesva/RLBook.html},\n\turl_link = {http://www.ualberta.ca/~szepesva/RLBook.html},\n\turl_paper = {RLAlgsInMDPs-lecture.pdf},\n\tyear = {2010},\n\tBdsk-Url-1 = {http://www.ualberta.ca/~szepesva/RLBook.html}}\n\n
\n
\n\n\n
\n Reinforcement learning is a learning paradigm concerned with learning to control a system so as to maximize a numerical performance measure that expresses a long-term objective. What distinguishes reinforcement learning from supervised learning is that only partial feedback is given to the learner about the learner's predictions. Further, the predictions may have long term effects through influencing the future state of the controlled system. Thus, time plays a special role. The goal in reinforcement learning is to develop efficient learning algorithms, as well as to understand the algorithms' merits and limitations. Reinforcement learning is of great interest because of the large number of practical applications that it can be used to address, ranging from problems in artificial intelligence to operations research or control engineering. In this book we focus on those algorithms of reinforcement learning which build on the powerful theory of dynamic programming. We give a fairly comprehensive catalog of learning problems, describe the core ideas, a large number of state of the art algorithms, followed by the discussion of their theoretical properties and limitations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Some Recent Algorithmic Results about the Exploration-vs-exploitation Dilemma.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n 03 2010.\n \n\n\n\n
\n\n\n\n \n \n \"Some link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@misc{Szepesvari2010b,\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate = {2010-03},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2015-06-23 19:28:12 +0000},\n\tkeywords = {survey, multi-armed bandits, exploration vs. exploitation},\n\tmonth = {03},\n\ttitle = {Some Recent Algorithmic Results about the Exploration-vs-exploitation Dilemma},\n\turl_link = {http://rstb.royalsocietypublishing.org/content/362/1481/933.e-letters#some-recent-algorithmic-results-about-the-exploration-vs-exploitation-dilemma},\n\tyear = {2010},\n\tentrysubtype = {unrefereed},\n\tBdsk-Url-1 = {http://rstb.royalsocietypublishing.org/content/362/1481/933.abstract/reply#royptb_el_55}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Markov-Chain Monte Carlo Approach to Simultaneous Localization and Mapping.\n \n \n \n \n\n\n \n Torma, P.; György, A.; and Szepesvári, C.\n\n\n \n\n\n\n In AISTATS, volume 9, pages 605–612, 05 2010. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2010,\n\tabstract = {A Markov-chain Monte Carlo based algorithm is provided to solve the Simultaneous localization and mapping (SLAM) problem with general dynamics and observation model under open-loop control and provided that the map-representation is finite dimensional. To our knowledge this is the first provably consistent yet (close-to) practical solution to this problem. The superiority of our algorithm over alternative SLAM algorithms is demonstrated in a difficult loop closing situation.},\n\tacceptrate = {125 out of 308=41\\%},\n\tauthor = {Torma, P. and Gy{\\"o}rgy, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\tkeywords = {SLAM, robotics, application, theory, Monte-Carlo methods, MCMC},\n\tmonth = {05},\n\tpages = {605--612},\n\ttitle = {A Markov-Chain Monte Carlo Approach to Simultaneous Localization and Mapping},\n\turl_paper = {mcmc-slam.pdf},\n\tvolume = {9},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n A Markov-chain Monte Carlo based algorithm is provided to solve the Simultaneous localization and mapping (SLAM) problem with general dynamics and observation model under open-loop control and provided that the map-representation is finite dimensional. To our knowledge this is the first provably consistent yet (close-to) practical solution to this problem. The superiority of our algorithm over alternative SLAM algorithms is demonstrated in a difficult loop closing situation.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Extending Rapidly-Exploring Random Trees for Asymptotically Optimal Anytime Motion Planning.\n \n \n \n \n\n\n \n Abbasi-Yadkori, Y.; Modayil, J.; and Szepesvári, C.\n\n\n \n\n\n\n In IROS, pages 127–132, 10 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Extending paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{y.abbasi2010,\n\tabstract = {We consider the problem of anytime planning in continuous state and action spaces with non-linear deterministic dynamics. We review the existing approaches to this problem and find no algorithms that both quickly find feasible solutions and also eventually approach optimal solutions with additional time. The state-of-the-art solution to this problem is the rapidly- exploring random tree (RRT) algorithm that quickly finds a feasible solution. However, the RRT algorithm does not return better results with additional time. We introduce RRT++ , an anytime extension of the basic RRT algorithm. We show that the new algorithm has desirable theoretical properties and experimentally show that it efficiently finds near optimal solutions.},\n\tacceptrate = {840 out of 1692=50\\%},\n\tauthor = {Abbasi-Yadkori, Y. and Modayil, J. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {IROS},\n\tkeywords = {robotics, Monte-Carlo tree search, motion planning, theory, application},\n\tmonth = {10},\n\tpages = {127--132},\n\ttitle = {Extending Rapidly-Exploring Random Trees for Asymptotically Optimal Anytime Motion Planning},\n\turl_paper = {iros-2010.pdf},\n\tyear = {2010}}\n\n
\n
\n\n\n
\n We consider the problem of anytime planning in continuous state and action spaces with non-linear deterministic dynamics. We review the existing approaches to this problem and find no algorithms that both quickly find feasible solutions and also eventually approach optimal solutions with additional time. The state-of-the-art solution to this problem is the rapidly-exploring random tree (RRT) algorithm that quickly finds a feasible solution. However, the RRT algorithm does not return better results with additional time. We introduce RRT++, an anytime extension of the basic RRT algorithm. We show that the new algorithm has desirable theoretical properties and experimentally show that it efficiently finds near optimal solutions.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2009\n \n \n (12)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Learning to Segment from a Few Well-Selected Training Images.\n \n \n \n \n\n\n \n Farhangfar, A.; Greiner, R.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 1– 2, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{a.farhangfar2009,\n\tabstract = {We address the task of actively learning a segmentation system: given a large number of unsegmented images, and access to an oracle that can segment a given image, decide which images to provide, to quickly produce a segmenter (here, a discriminative random field) that is accurate over this distribution of images. We extend the standard models for active learner to define a system for this task that first selects the image whose expected label will reduce the uncertainty of the other unlabeled images the most, and then after greedily selects, from the pool of unsegmented images, the most informative image. The results of our experiments, over two real-world datasets (segmenting brain tumors within magnetic resonance images; and segmenting the sky in real images) show that training on very few informative images (here, as few as 2) can produce a segmenter that is as good as training on the entire dataset.},\n\tacceptrate = {27\\%},\n\tauthor = {Farhangfar, A. and Greiner, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {active learning, supervised learning, structured prediction},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.31},\n\ttitle = {Learning to Segment from a Few Well-Selected Training Images},\n\turl_paper = {LearningToSegment-ICML09.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n We address the task of actively learning a segmentation system: given a large number of unsegmented images, and access to an oracle that can segment a given image, decide which images to provide, to quickly produce a segmenter (here, a discriminative random field) that is accurate over this distribution of images. We extend the standard models for active learning to define a system for this task that first selects the image whose expected label will reduce the uncertainty of the other unlabeled images the most, and then greedily selects, from the pool of unsegmented images, the most informative image. The results of our experiments, over two real-world datasets (segmenting brain tumors within magnetic resonance images; and segmenting the sky in real images) show that training on very few informative images (here, as few as 2) can produce a segmenter that is as good as training on the entire dataset.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Exploration-exploitation Tradeoff using Variance Estimates in Multi-armed Bandits.\n \n \n \n \n\n\n \n Audibert, J.; Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n Theoretical Computer Science, 410(19): 1876–1902. 2009.\n \n\n\n\n
\n\n\n\n \n \n \"Exploration-exploitationLink\n  \n \n \n \"Exploration-exploitation paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{audibert2009,\n\tabstract = {Algorithms based on upper confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. This paper considers a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. We provide the first analysis of the expected regret for such algorithms. As expected, our results show that the algorithm that uses the variance estimates has a major advantage over its alternatives that do not use such estimates provided that the variances of the payoffs of the suboptimal arms are low. We also prove that the regret concentrates only at a polynomial rate. This holds for all the upper confidence bound based algorithms and for all bandit problems except those special ones where with probability one the payoff obtained by pulling the optimal arm is larger than the expected payoff for the second best arm. Hence, although upper confidence bound bandit algorithms achieve logarithmic expected regret rates, they might not be suitable for a risk-averse decision maker. We illustrate some of the results by computer simulations.},\n\tauthor = {Audibert, J.-Y. and Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tee = {http://dx.doi.org/10.1016/j.tcs.2009.01.016},\n\tjournal = {Theoretical Computer Science},\n\tkeywords = {multi-armed bandits, sequential algorithms, stochastic bandits, Bernstein's inequality, theory},\n\tnumber = {19},\n\tpages = {1876--1902},\n\ttitle = {Exploration-exploitation Tradeoff using Variance Estimates in Multi-armed Bandits},\n\turl_paper = {ucbtuned-journal.pdf},\n\tvolume = {410},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n Algorithms based on upper confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. This paper considers a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. We provide the first analysis of the expected regret for such algorithms. As expected, our results show that the algorithm that uses the variance estimates has a major advantage over its alternatives that do not use such estimates provided that the variances of the payoffs of the suboptimal arms are low. We also prove that the regret concentrates only at a polynomial rate. This holds for all the upper confidence bound based algorithms and for all bandit problems except those special ones where with probability one the payoff obtained by pulling the optimal arm is larger than the expected payoff for the second best arm. Hence, although upper confidence bound bandit algorithms achieve logarithmic expected regret rates, they might not be suitable for a risk-averse decision maker. We illustrate some of the results by computer simulations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regularized Fitted Q-iteration for Planning in Continuous-Space Markovian Decision Problems.\n \n \n \n \n\n\n \n Farahmand, A.; Ghavamzadeh, M.; Szepesvári, C.; and Mannor, S.\n\n\n \n\n\n\n In ACC, pages 725–730, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Regularized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{farahmand2009,\n\tabstract = {Reinforcement learning with linear and non-linear function approximation has been studied extensively in the last decade. However, as opposed to other fields of machine learning such as supervised learning, the effect of finite sample has not been thoroughly addressed within the reinforcement learning framework. In this paper we propose to use $L^2$ regularization to control the complexity of the value function in reinforcement learning and planning problems. We consider the Regularized Fitted Q-Iteration algorithm and provide generalization bounds that account for small sample sizes. Finally, a realistic visual-servoing problem is used to illustrate the benefits of using the regularization procedure.},\n\tauthor = {Farahmand, A.m. and Ghavamzadeh, M. and Szepesv{\\'a}ri, Cs. and Mannor, S.},\n\tbooktitle = {ACC},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2015-02-01 09:06:29 +0000},\n\tkeywords = {reinforcement learning, planning, regularization, nonparametrics, theory, function approximation, value iteration},\n\tpages = {725--730},\n\ttitle = {Regularized Fitted Q-iteration for Planning in Continuous-Space Markovian Decision Problems},\n\turl_paper = {RegPlan-ACC09.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n Reinforcement learning with linear and non-linear function approximation has been studied extensively in the last decade. However, as opposed to other fields of machine learning such as supervised learning, the effect of finite sample has not been thoroughly addressed within the reinforcement learning framework. In this paper we propose to use $L^2$ regularization to control the complexity of the value function in reinforcement learning and planning problems. We consider the Regularized Fitted Q-Iteration algorithm and provide generalization bounds that account for small sample sizes. Finally, a realistic visual-servoing problem is used to illustrate the benefits of using the regularization procedure.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Model-based and Model-free Reinforcement Learning for Visual Servoing.\n \n \n \n \n\n\n \n Farahmand, A.; Shademan, A.; Jägersand, M.; and Szepesvári, C.\n\n\n \n\n\n\n In ICRA, pages 2917–2924, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Model-based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{farahmand2009a,\n\tabstract = {To address the difficulty of designing a controller for complex visual-servoing tasks, two learning-based uncalibrated approaches are introduced. The first method starts by building an estimated model for the visual-motor forward kinematic of the vision-robot system by a locally linear regression method. Afterwards, it uses a reinforcement learning method named Regularized Fitted Q-Iteration to find a controller (i.e. policy) for the system (model-based RL). The second method directly uses samples coming from the robot without building any intermediate model (model-free RL). The simulation results show that both methods perform comparably well despite not having any a priori knowledge about the robot.},\n\tacceptrate = {43\\%},\n\tauthor = {Farahmand, A.m. and Shademan, A. and J{\\"a}gersand, M. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICRA},\n\tkeywords = {robotics, application, vision, reinforcement learning},\n\tpages = {2917--2924},\n\ttitle = {Model-based and Model-free Reinforcement Learning for Visual Servoing},\n\turl_paper = {MBRL4VisualServoing.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n To address the difficulty of designing a controller for complex visual-servoing tasks, two learning-based uncalibrated approaches are introduced. The first method starts by building an estimated model for the visual-motor forward kinematic of the vision-robot system by a locally linear regression method. Afterwards, it uses a reinforcement learning method named Regularized Fitted Q-Iteration to find a controller (i.e. policy) for the system (model-based RL). The second method directly uses samples coming from the robot without building any intermediate model (model-free RL). The simulation results show that both methods perform comparably well despite not having any a priori knowledge about the robot.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Exercise Policies for American Options.\n \n \n \n \n\n\n \n Li, Y.; Szepesvári, C.; and Schuurmans, D.\n\n\n \n\n\n\n In AISTATS, volume 5, pages 352–359, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Learning link\n  \n \n \n \"Learning pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{li2009,\n\tabstract = {Options are important instruments in modern finance. In this paper, we investigate reinforcement learning (RL) methods---in particular, least-squares policy iteration (LSPI)---for the problem of learning exercise policies for American options. We develop finite-time bounds on the performance of the policy obtained with LSPI and compare LSPI and the fitted Q-iteration algorithm (FQI) with the Longstaff-Schwartz method (LSM), the standard least-squares Monte Carlo algorithm from the finance community. Our empirical results show that the exercise policies discovered by LSPI and FQI gain larger payoffs than those discovered by LSM, on both real and synthetic data. Furthermore, we find that for all methods the policies learned from real data generally gain similar payoffs to the policies learned from simulated data. Our work shows that solution methods developed in machine learning can advance the state-of-the-art in an important and challenging application area, while demonstrating that computational finance remains a promising area for future applications of machine learning methods.},\n\tacceptrate = {oral presantation 10\\%},\n\tauthor = {Li, Y. and Szepesv{\\'a}ri, Cs. and Schuurmans, D.},\n\tbooktitle = {AISTATS},\n\tkeywords = {finance, reinforcement learning, theory, application},\n\tpages = {352--359},\n\ttitle = {Learning Exercise Policies for American Options},\n\turl_link = {http://www.ics.uci.edu/~aistats/},\n\turl_pdf = {http://jmlr.csail.mit.edu/proceedings/papers/v5/li09d/li09d.pdf},\n\tvolume = {5},\n\tyear = {2009},\n\tBdsk-Url-1 = {http://www.ics.uci.edu/~aistats/}}\n\n
\n
\n\n\n
\n Options are important instruments in modern finance. In this paper, we investigate reinforcement learning (RL) methods—in particular, least-squares policy iteration (LSPI)—for the problem of learning exercise policies for American options. We develop finite-time bounds on the performance of the policy obtained with LSPI and compare LSPI and the fitted Q-iteration algorithm (FQI) with the Longstaff-Schwartz method (LSM), the standard least-squares Monte Carlo algorithm from the finance community. Our empirical results show that the exercise policies discovered by LSPI and FQI gain larger payoffs than those discovered by LSM, on both real and synthetic data. Furthermore, we find that for all methods the policies learned from real data generally gain similar payoffs to the policies learned from simulated data. Our work shows that solution methods developed in machine learning can advance the state-of-the-art in an important and challenging application area, while demonstrating that computational finance remains a promising area for future applications of machine learning methods.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Convergent Temporal-Difference Learning with Arbitrary Smooth Function Approximation.\n \n \n \n \n\n\n \n Maei, H.; Szepesvári, C.; Bhatnagar, S.; Silver, D.; Precup, D.; and Sutton, R.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1204–1212, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Convergent paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{maei2009,\n\tabstract = {We introduce the first temporal-difference learning algorithms that converge with smooth value function approximators, such as neural networks. Conventional temporal-difference (TD) methods, such as TD(lambda), Q-learning and Sarsa have been used successfully with function approximation in many applications. However, it is well known that off-policy sampling, as well as non-linear function approximation, can cause these algorithms to become unstable (i.e., the parameters of the approximator may diverge). Sutton et al (2009a,b) solved the problem of off-policy learning with linear TD algorithms by introducing a new objective function, related to the Bellman-error, and algorithms that perform stochastic gradient-descent on this function. In this paper, we generalize their work to non-linear function approximation. We present a Bellman error objective function and two gradient-descent TD algorithms that optimize it. We prove the asymptotic almost-sure convergence of both algorithms for any finite Markov decision process and any smooth value function approximator, under usual stochastic approximation conditions. The computational complexity per iteration scales linearly with the number of parameters of the approximator. The algorithms are incremental and are guaranteed to converge to locally optimal solutions.},\n\tacceptrate = {oral presentation 87 out of 1105=7.9\\%},\n\tauthor = {Maei, H.R. and Szepesv{\\'a}ri, Cs. and Bhatnagar, S. and Silver, D. and Precup, D. and Sutton, R.S.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, neural networks, nonlinear function approximation, GTD},\n\tpages = {1204--1212},\n\ttitle = {Convergent Temporal-Difference Learning with Arbitrary Smooth Function Approximation},\n\turl_paper = {nonlin_gtdNeurIPS09-2.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n We introduce the first temporal-difference learning algorithms that converge with smooth value function approximators, such as neural networks. Conventional temporal-difference (TD) methods, such as TD(lambda), Q-learning and Sarsa have been used successfully with function approximation in many applications. However, it is well known that off-policy sampling, as well as non-linear function approximation, can cause these algorithms to become unstable (i.e., the parameters of the approximator may diverge). Sutton et al (2009a,b) solved the problem of off-policy learning with linear TD algorithms by introducing a new objective function, related to the Bellman-error, and algorithms that perform stochastic gradient-descent on this function. In this paper, we generalize their work to non-linear function approximation. We present a Bellman error objective function and two gradient-descent TD algorithms that optimize it. We prove the asymptotic almost-sure convergence of both algorithms for any finite Markov decision process and any smooth value function approximator, under usual stochastic approximation conditions. The computational complexity per iteration scales linearly with the number of parameters of the approximator. The algorithms are incremental and are guaranteed to converge to locally optimal solutions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Training Parsers by Inverse Reinforcement Learning.\n \n \n \n \n\n\n \n Neu, G.; and Szepesvári, C.\n\n\n \n\n\n\n Machine Learning, 77: 303–337. 2009.\n \n\n\n\n
\n\n\n\n \n \n \"Training paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{neu2009,\n\tabstract = {One major idea in structured prediction is to assume that the predictor computes its output by finding the maximum of a score function. The training of such a predictor can then be cast as the problem of finding weights of the score function so that the output of the predictor on the inputs matches the corresponding structured labels on the training set. A similar problem is studied in inverse reinforcement learning (IRL) where one is given an environment and a set of trajectories and the problem is to find a reward function such that an agent acting optimally with respect to the reward function would follow trajectories that match those in the training set. In this paper we show how IRL algorithms can be applied to structured prediction, in particular to parser training. We present a number of recent incremental IRL algorithms in a unified framework and map them to parser training algorithms. This allows us to recover some existing parser training algorithms, as well as to obtain a new one. The resulting algorithms are compared in terms of their sensitivity to the choice of various parameters and generalization ability on the Penn Treebank WSJ corpus.},\n\tauthor = {Neu, G. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Machine Learning},\n\tkeywords = {theory, application, reinforcement learning, apprenticeship learning, natural gradient, structured prediction, inverse reinforcement learning, survey},\n\tpages = {303--337},\n\tread = {1},\n\ttitle = {Training Parsers by Inverse Reinforcement Learning},\n\turl_paper = {MLJ-SISP-09.pdf},\n\tvolume = {77},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n One major idea in structured prediction is to assume that the predictor computes its output by finding the maximum of a score function. The training of such a predictor can then be cast as the problem of finding weights of the score function so that the output of the predictor on the inputs matches the corresponding structured labels on the training set. A similar problem is studied in inverse reinforcement learning (IRL) where one is given an environment and a set of trajectories and the problem is to find a reward function such that an agent acting optimally with respect to the reward function would follow trajectories that match those in the training set. In this paper we show how IRL algorithms can be applied to structured prediction, in particular to parser training. We present a number of recent incremental IRL algorithms in a unified framework and map them to parser training algorithms. This allows us to recover some existing parser training algorithms, as well as to obtain a new one. The resulting algorithms are compared in terms of their sensitivity to the choice of various parameters and generalization ability on the Penn Treebank WSJ corpus.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning When to Stop Thinking and Do Something!.\n \n \n \n \n\n\n \n Póczos, B.; Abbasi-Yadkori, Y.; Szepesvári, C.; Greiner, R.; and Sturtevant, N.\n\n\n \n\n\n\n In ICML, pages 825–832, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{poczos2009,\n\tabstract = {An anytime algorithm is capable of returning a response to the given task at essentially any time; typically the quality of the response improves as the time increases. Here, we consider the challenge of learning when we should terminate such algorithms on each of a sequence of iid tasks, to optimize the expected average reward per unit time. We provide a system for addressing this challenge, which combines the global optimizer Cross- Entropy method with local gradient ascent. This paper theoretically investigates how far the estimated gradient is from the true gradient, then empirically demonstrates that this system is effective by applying it to a toy problem, as well as on a real-world face detection task.},\n\tacceptrate = {27\\%},\n\tauthor = {P{\\'o}czos, B. and Abbasi-Yadkori, Y. and Szepesv{\\'a}ri, Cs. and Greiner, R. and Sturtevant, N.},\n\tbooktitle = {ICML},\n\tkeywords = {reinforcement learning, application, pondering, gradient algorithm, REINFORCE, Cross-Entropy search},\n\tpages = {825--832},\n\ttitle = {Learning When to Stop Thinking and Do Something!},\n\turl_paper = {time_is_money-ICML09.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n An anytime algorithm is capable of returning a response to the given task at essentially any time; typically the quality of the response improves as the time increases. Here, we consider the challenge of learning when we should terminate such algorithms on each of a sequence of iid tasks, to optimize the expected average reward per unit time. We provide a system for addressing this challenge, which combines the global optimizer Cross-Entropy method with local gradient ascent. This paper theoretically investigates how far the estimated gradient is from the true gradient, then empirically demonstrates that this system is effective by applying it to a toy problem, as well as on a real-world face detection task.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation.\n \n \n \n \n\n\n \n Sutton, R.; Maei, H.; Precup, D.; Bhatnagar, S.; Silver, D.; Szepesvári, C.; and Wiewiora, E.\n\n\n \n\n\n\n In ICML, pages 993–1000, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"Fast paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{sutton2009,\n\tabstract = {Sutton, {Sz}epesv{\\'a}ri and Maei (2009) recently introduced the first temporal-difference learning algorithm compatible with both linear function approximation and off-policy training, and whose complexity scales only linearly in the size of the function approximator. Although their gradient temporal difference (GTD) algorithm converges reliably, it can be very slow compared to conventional linear TD (on on-policy problems where TD is convergent), calling into question its practical utility. In this paper we introduce two new related algorithms with better convergence rates. The first algorithm, GTD2, is derived and proved convergent just as GTD was, but uses a different objective function and converges significantly faster (but still not as fast as conventional TD). The second new algorithm, linear TD with gradient correction, or TDC, uses the same update rule as conventional TD except for an additional term which is initially zero. In our experiments on small test problems and in a Computer Go application with a million features, the learning rate of this algorithm was comparable to that of conventional TD. This algorithm appears to extend linear TD to off-policy learning with no penalty in performance while only doubling computational requirements.},\n\tauthor = {Sutton, R.S. and Maei, H.R. and Precup, D. and Bhatnagar, S. and Silver, D. and Szepesv{\\'a}ri, Cs. and Wiewiora, E.},\n\tbooktitle = {ICML},\n\tkeywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, function approximation, GTD2, TDC},\n\tpages = {993--1000},\n\ttitle = {Fast Gradient-Descent Methods for Temporal-Difference Learning with Linear Function Approximation},\n\turl_paper = {GTD-ICML09.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n Sutton, Szepesvári and Maei (2009) recently introduced the first temporal-difference learning algorithm compatible with both linear function approximation and off-policy training, and whose complexity scales only linearly in the size of the function approximator. Although their gradient temporal difference (GTD) algorithm converges reliably, it can be very slow compared to conventional linear TD (on on-policy problems where TD is convergent), calling into question its practical utility. In this paper we introduce two new related algorithms with better convergence rates. The first algorithm, GTD2, is derived and proved convergent just as GTD was, but uses a different objective function and converges significantly faster (but still not as fast as conventional TD). The second new algorithm, linear TD with gradient correction, or TDC, uses the same update rule as conventional TD except for an additional term which is initially zero. In our experiments on small test problems and in a Computer Go application with a million features, the learning rate of this algorithm was comparable to that of conventional TD. This algorithm appears to extend linear TD to off-policy learning with no penalty in performance while only doubling computational requirements.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Reinforcement Learning Algorithms for MDPs – A Survey.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Technical Report TR09-13, Department of Computing Science, University of Alberta, 2009.\n \n\n\n\n
\n\n\n\n \n \n \"Reinforcement pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{Szepesvari2009,\n\tabstract = {This article presents a survey of reinforcement learning algorithms for Markov Decision Processes (MDP). In the first half of the article, the problem of value estimation is considered. Here we start by describing the idea of bootstrapping and temporal difference learning. Next, we compare incremental and batch algorithmic variants and discuss the impact of the choice of the function approximation method on the success of learning. In the second half, we describe methods that target the problem of learning to control an MDP. Here online and active learning are discussed first, followed by a description of direct and actor-critic methods.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-03 00:43:23 -0600},\n\tinstitution = {Department of Computing Science, University of Alberta},\n\tkeywords = {reinforcement learning, temporal difference learning, stochastic approximation, two-timescale stochastic approximation, Monte-Carlo methods, simulation optimization, function approximation, stochastic gradient methods, least-squares methods, overfitting, bias-variance tradeoff, online learning, active learning, planning, simulation, PAC-learning, Q-learning, actor-critic methods, policy gradient, natural gradient},\n\tnumber = {TR09-13},\n\ttitle = {Reinforcement Learning Algorithms for MDPs -- A Survey},\n\turl_pdf = {http://www.cs.ualberta.ca/system/files/tech_report/2009/TR09-13.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n This article presents a survey of reinforcement learning algorithms for Markov Decision Processes (MDP). In the first half of the article, the problem of value estimation is considered. Here we start by describing the idea of bootstrapping and temporal difference learning. Next, we compare incremental and batch algorithmic variants and discuss the impact of the choice of the function approximation method on the success of learning. In the second half, we describe methods that target the problem of learning to control an MDP. Here online and active learning are discussed first, followed by a description of direct and actor-critic methods.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n LMS-2: Towards an Algorithm that is as Cheap as LMS and Almost as Efficient as RLS.\n \n \n \n \n\n\n \n Yao, H.; Bhatnagar, S.; and Szepesvári, C.\n\n\n \n\n\n\n In CDC, pages 1181–1188, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"LMS-2: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{yao2009,\n\tabstract = {We consider linear prediction problems in a stochastic environment. The least mean square (LMS) algorithm is a well-known, easy to implement and computationally cheap solution to this problem. However, as it is well known, the LMS algorithm, being a stochastic gradient descent rule, may converge slowly. The recursive least squares (RLS) algorithm overcomes this problem, but its computational cost is quadratic in the problem dimension. In this paper we propose a two timescale stochastic approximation algorithm which, as far as its slower timescale is considered, behaves the same way as the RLS algorithm, while it is as cheap as the LMS algorithm. In addition, the algorithm is easy to implement. The algorithm is shown to give estimates that converge to the best possible estimate with probability one. The performance of the algorithm is tested in two examples and it is found that it may indeed offer some performance gain over the LMS algorithm.},\n\tauthor = {Yao, H. and Bhatnagar, S. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {CDC},\n\tkeywords = {stochastic approximation, two-timescale stochastic approximation, linear prediction},\n\tpages = {1181--1188},\n\ttitle = {LMS-2: Towards an Algorithm that is as Cheap as LMS and Almost as Efficient as RLS},\n\turl_paper = {cdc09_final.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n We consider linear prediction problems in a stochastic environment. The least mean square (LMS) algorithm is a well-known, easy to implement and computationally cheap solution to this problem. However, as it is well known, the LMS algorithm, being a stochastic gradient descent rule, may converge slowly. The recursive least squares (RLS) algorithm overcomes this problem, but its computational cost is quadratic in the problem dimension. In this paper we propose a two timescale stochastic approximation algorithm which, as far as its slower timescale is considered, behaves the same way as the RLS algorithm, while it is as cheap as the LMS algorithm. In addition, the algorithm is easy to implement. The algorithm is shown to give estimates that converge to the best possible estimate with probability one. The performance of the algorithm is tested in two examples and it is found that it may indeed offer some performance gain over the LMS algorithm.\n
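The LMS-2 update itself is not reproduced here. To make the trade-off discussed above concrete, the sketch below shows the two standard baselines the algorithm sits between: an O(d) LMS step and an O(d^2) recursive least squares step in Sherman-Morrison form; the step size and the ridge-style initialization P = I/lam are illustrative assumptions.

```python
import numpy as np

def lms_step(w, x, y, alpha=0.01):
    """Least mean squares: O(d) per step, but possibly slow to converge."""
    return w + alpha * (y - w @ x) * x

def rls_step(w, P, x, y):
    """Recursive least squares via the Sherman-Morrison identity: O(d^2) per step.
    P is the running inverse of the (regularized) design matrix."""
    Px = P @ x
    k = Px / (1.0 + x @ Px)          # gain vector
    w = w + k * (y - w @ x)
    P = P - np.outer(k, Px)          # rank-one downdate of the inverse
    return w, P
```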
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A General Projection Property for Distribution Families.\n \n \n \n \n\n\n \n Yu, Y.; Li, Y.; Schuurmans, D.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, 2009. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{yu2009,\n\tabstract = {Surjectivity of linear projections between distribution families with fixed mean and covariance (regardless of dimension) is re-derived by a new proof. We further extend this property to distribution families that respect additional constraints, such as symmetry, unimodality and log-concavity. By combining our results with classic univariate inequalities, we provide new worst-case analyses for natural risk criteria arising in classification, optimization, portfolio selection and Markov decision processes.},\n\tacceptrate = {263 out of 1105=24\\%},\n\tauthor = {Yu, Y.-L. and Li, Y. and Schuurmans, D. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {theory, value at risk, stochastic programming},\n\ttitle = {A General Projection Property for Distribution Families},\n\turl_paper = {CVAR-NeurIPS09.pdf},\n\tyear = {2009}}\n\n
\n
\n\n\n
\n Surjectivity of linear projections between distribution families with fixed mean and covariance (regardless of dimension) is re-derived by a new proof. We further extend this property to distribution families that respect additional constraints, such as symmetry, unimodality and log-concavity. By combining our results with classic univariate inequalities, we provide new worst-case analyses for natural risk criteria arising in classification, optimization, portfolio selection and Markov decision processes.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2008\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Learning near-optimal policies with Bellman-residual minimization based fitted policy iteration and a single sample path.\n \n \n \n \n\n\n \n Antos, A.; Szepesvári, C.; and Munos, R.\n\n\n \n\n\n\n Machine Learning, 71(1): 89–129. April 2008.\n Published Online First: 14 Nov, 2007\n\n\n\n
\n\n\n\n \n \n \"Learning2\n  \n \n \n \"Learning3\n  \n \n \n \"Learning link\n  \n \n \n \"Learning paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 9 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{anszemu:mlj07,\n\tabstract = {\tIn this paper we consider the problem of finding a near-optimal policy in a continuous space, discounted Markovian Decision Problem (MDP) by employing value-function-based methods when only a single trajectory of a fixed policy is available as the input. We study a policy-iteration algo- rithm where the iterates are obtained via empirical risk minimization with a risk function that penalizes high magnitudes of the Bellman-residual. Our main result is a finite-sample, high-probability bound on the performance of the computed policy that depends on the mixing rate of the trajectory, the capacity of the function set as measured by a novel capacity concept (the VC-crossing dimension), the approximation power of the function set and the controllability properties of the MDP. Moreover, we prove that when a linear parameterization is used the new algorithm is equivalent to Least-Squares Policy Iteration. To the best of our knowledge this is the first theoretical result for off-policy control learning over continuous state-spaces using a single trajectory.},\n\tauthor = {Antos, A. and Szepesv{\\'a}ri, Cs. and Munos, R.},\n\tdate-added = {2011-07-26 09:54:32 -0600},\n\tdate-modified = {2011-07-26 10:03:59 -0600},\n\tdoi = {10.1007/s10994-007-5038-2},\n\teditor = {H.U. Simon, G. Lugosi, A. Blum},\n\tjournal = {Machine Learning},\n\tkeywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration},\n\tmonth = apr,\n\tnote = {Published Online First: 14 Nov, 2007},\n\tnumber = {1},\n\tpages = {89--129},\n\ttitle = {Learning near-optimal policies with Bellman-residual minimization based fitted policy iteration and a single sample path},\n\turl2 = {http://springer.r.delivery.net/r/r?2.1.Ee.2Tp.1gRdFJ.BwJPwi..N.ErIo.2yDO.LBOEfA00},\n\turl3 = {http://www.szit.bme.hu/~antos/ps/anszmu_sapi_mlj.ps.gz},\n\turl_link = {http://www.springerlink.com/content/h46m1152288681jt/},\n\turl_paper = {sapi_mlj.pdf},\n\tvolume = {71},\n\tyear = {2008},\n\tBdsk-Url-1 = {http://www.springerlink.com/content/h46m1152288681jt/}}\n\n
\n
\n\n\n
\n In this paper we consider the problem of finding a near-optimal policy in a continuous space, discounted Markovian Decision Problem (MDP) by employing value-function-based methods when only a single trajectory of a fixed policy is available as the input. We study a policy-iteration algorithm where the iterates are obtained via empirical risk minimization with a risk function that penalizes high magnitudes of the Bellman-residual. Our main result is a finite-sample, high-probability bound on the performance of the computed policy that depends on the mixing rate of the trajectory, the capacity of the function set as measured by a novel capacity concept (the VC-crossing dimension), the approximation power of the function set and the controllability properties of the MDP. Moreover, we prove that when a linear parameterization is used the new algorithm is equivalent to Least-Squares Policy Iteration. To the best of our knowledge this is the first theoretical result for off-policy control learning over continuous state-spaces using a single trajectory.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online Optimization in X-armed Bandits.\n \n \n \n \n\n\n \n Bubeck, S.; Munos, R.; Stoltz, G.; and Szepesvári, C.\n\n\n \n\n\n\n In Koller, D.; Schuurmans, D.; Bengio, Y.; and Bottou, L., editor(s), Advances in Neural Information Processing Systems, pages 201–208, 2008. MIT Press\n \n\n\n\n
\n\n\n\n \n \n \"OnlineLink\n  \n \n \n \"Online paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{bubeck2008,\n\tabstract = {We consider a generalization of stochastic bandit problems where the set of arms, X, is allowed to be a generic topological space and the mean-payoff function is ``locally Lipschitz'' with respect to a dissimilarity function that is known to the decision maker. Under this condition we construct an arm selection policy whose regret improves upon previous results for a large class of problems. In particular, our results imply that if X is the unit hypercube in a Euclidean space and the mean-payoff function has a finite number of global maxima around which the behavior of the function is locally H{\\"o}lder with a known exponent, then the expected regret is bounded up to a logarithmic factor by sqrt(n), i.e., the rate of the growth of the regret is independent of the dimension of the space. We also prove the minimax optimality of our algorithm for the class of problems considered.},\n\tacceptrate = {24\\%},\n\tauthor = {Bubeck, S. and Munos, R. and Stoltz, G. and Szepesv{\\'a}ri, Cs.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\teditor = {Koller, D. and Schuurmans, D. and Bengio, Y. and Bottou, L.},\n\tee = {https://papers.neurips.cc/paper/3605-online-optimization-in-x-armed-bandits.pdf},\n\tkeywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds},\n\tpages = {201--208},\n\tpublisher = {MIT Press},\n\ttitle = {Online Optimization in X-armed Bandits},\n\turl_paper = {HOO-NeurIPS08.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n We consider a generalization of stochastic bandit problems where the set of arms, X, is allowed to be a generic topological space and the mean-payoff function is ``locally Lipschitz'' with respect to a dissimilarity function that is known to the decision maker. Under this condition we construct an arm selection policy whose regret improves upon previous results for a large class of problems. In particular, our results imply that if X is the unit hypercube in a Euclidean space and the mean-payoff function has a finite number of global maxima around which the behavior of the function is locally Hölder with a known exponent, then the expected regret is bounded up to a logarithmic factor by sqrt(n), i.e., the rate of the growth of the regret is independent of the dimension of the space. We also prove the minimax optimality of our algorithm for the class of problems considered.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regularized Fitted Q-Iteration: Application to Planning.\n \n \n \n \n\n\n \n Farahmand, A.; Ghavamzadeh, M.; Szepesvári, C.; and Mannor, S.\n\n\n \n\n\n\n In EWRL, pages 55–68, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"Regularized paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{farahmand2008,\n\tabstract = {We consider planning in a Markovian decision problem, i.e., the problem of finding a good policy given access to a generative model of the environment. We propose to use fitted Q-iteration with penalized (or regularized) least-squares regression as the regression subroutine to address the problem of controlling model-complexity. The algorithm is presented in detail for the case when the function space is a reproducing kernel Hilbert space underlying a user-chosen kernel function. We derive bounds on the quality of the solution and argue that data-dependent penalties can lead to almost optimal performance. A simple example is used to illustrate the benefits of using a penalized procedure.},\n\tauthor = {Farahmand, A.m. and Ghavamzadeh, M. and Szepesv{\\'a}ri, Cs. and Mannor, S.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {EWRL},\n\tdoi = {10.1007/978-3-540-89722-4_5},\n\tentrysubtype = {unrefereed},\n\tkeywords = {reinforcement learning, planning, regularization, nonparametrics, theory, function approximation, value iteration},\n\tpages = {55--68},\n\ttitle = {Regularized Fitted Q-Iteration: Application to Planning},\n\turl_paper = {RegFQI-Plan-EWRL08.pdf},\n\tyear = {2008},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-89722-4_5}}\n\n
\n
\n\n\n
\n We consider planning in a Markovian decision problem, i.e., the problem of finding a good policy given access to a generative model of the environment. We propose to use fitted Q-iteration with penalized (or regularized) least-squares regression as the regression subroutine to address the problem of controlling model-complexity. The algorithm is presented in detail for the case when the function space is a reproducing kernel Hilbert space underlying a user-chosen kernel function. We derive bounds on the quality of the solution and argue that data-dependent penalties can lead to almost optimal performance. A simple example is used to illustrate the benefits of using a penalized procedure.\n
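A compact sketch of the idea above: fitted Q-iteration whose regression subroutine is an L2-penalized (kernel ridge) regressor. The (state, action, reward, next_state) data format, the RBF kernel, and all hyper-parameters are placeholders for illustration, not the paper's construction or its data-dependent penalty choice.

```python
import numpy as np
from sklearn.kernel_ridge import KernelRidge

def regularized_fqi(transitions, actions, n_iter=30, gamma=0.95, lam=1e-2):
    """Fitted Q-iteration with an L2-penalized (kernel ridge) regressor.
    `transitions` is a list of (state, action, reward, next_state) tuples,
    with states as 1-D arrays and actions as scalars (assumed data format)."""
    X = np.array([np.append(s, a) for s, a, _, _ in transitions])
    R = np.array([r for _, _, r, _ in transitions])
    next_states = [s2 for _, _, _, s2 in transitions]
    q = None
    for _ in range(n_iter):
        if q is None:
            targets = R                       # first fit regresses on immediate rewards
        else:
            targets = R + gamma * np.array(
                [max(q.predict(np.append(s2, a)[None, :])[0] for a in actions)
                 for s2 in next_states])
        q = KernelRidge(alpha=lam, kernel="rbf").fit(X, targets)
    return q                                  # greedy policy: argmax_a q([s, a])
```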
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Regularized Policy Iteration.\n \n \n \n \n\n\n \n Farahmand, A.; Ghavamzadeh, M.; Szepesvári, C.; and Mannor, S.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 441–448, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"RegularizedLink\n  \n \n \n \"Regularized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{farahmand2008a,\n\tabstract = {In this paper we consider approximate policy-iteration-based reinforcement learning algorithms. In order to implement a flexible function approximation scheme we propose the use of non-parametric methods with regularization, providing a convenient way to control the complexity of the function approximator. We propose two novel regularized policy iteration algorithms by adding L2-regularization to two widely-used policy evaluation methods: Bellman residual minimization (BRM) and least-squares temporal difference learning (LSTD). We derive efficient implementation for our algorithms when the approximate value-functions belong to a reproducing kernel Hilbert space. We also provide finite-sample performance bounds for our algorithms and show that they are able to achieve optimal rates of convergence under the studied conditions.},\n\tacceptrate = {24\\%},\n\tauthor = {Farahmand, A.m. and Ghavamzadeh, M. and Szepesv{\\'a}ri, Cs. and Mannor, S.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tee = {https://papers.neurips.cc/paper/3445-regularized-policy-iteration.pdf},\n\tkeywords = {reinforcement learning, regularization, nonparametrics, theory, function approximation, policy iteration},\n\tpages = {441--448},\n\ttitle = {Regularized Policy Iteration},\n\turl_paper = {NeurIPS08-regrl.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n In this paper we consider approximate policy-iteration-based reinforcement learning algorithms. In order to implement a flexible function approximation scheme we propose the use of non-parametric methods with regularization, providing a convenient way to control the complexity of the function approximator. We propose two novel regularized policy iteration algorithms by adding L2-regularization to two widely-used policy evaluation methods: Bellman residual minimization (BRM) and least-squares temporal difference learning (LSTD). We derive efficient implementation for our algorithms when the approximate value-functions belong to a reproducing kernel Hilbert space. We also provide finite-sample performance bounds for our algorithms and show that they are able to achieve optimal rates of convergence under the studied conditions.\n
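For intuition, the parametric core of one of the two estimators above, L2-regularized LSTD with a fixed finite feature map, is shown below; the kernelized version studied in the paper generalizes this, and the exact placement of the regularizer here is an assumption of the sketch.

```python
import numpy as np

def regularized_lstd(Phi, Phi_next, rewards, gamma=0.95, lam=1e-3):
    """L2-regularized LSTD for policy evaluation with linear features.
    Phi, Phi_next: (n, d) feature matrices of visited states and successor states."""
    d = Phi.shape[1]
    A = Phi.T @ (Phi - gamma * Phi_next) + lam * np.eye(d)
    b = Phi.T @ rewards
    return np.linalg.solve(A, b)       # value-function weights
```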
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Empirical Bernstein stopping.\n \n \n \n \n\n\n \n Mnih, V.; Szepesvári, C.; and Audibert, J.\n\n\n \n\n\n\n In ICML, pages 672–679, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"Empirical paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 11 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{mnih2008,\n\tabstract = {Sampling is a popular way of scaling up machine learning algorithms to large datasets. The question often is how many samples are needed. Adaptive stopping algorithms monitor the performance in an online fashion and they can stop early, saving valuable resources. We consider problems where probabilistic guarantees are desired and demonstrate how recently-introduced empirical Bernstein bounds can be used to design stopping rules that are efficient. We provide upper bounds on the sample complexity of the new rules, as well as empirical results on model selection and boosting in the filtering setting.},\n\tacceptrate = {27\\%},\n\tauthor = {Mnih, V. and Szepesv{\\'a}ri, Cs. and Audibert, J.-Y.},\n\tbooktitle = {ICML},\n\tdoi = {10.1145/1390156.1390241},\n\tkeywords = {Bernstein's inequality, sequential algorithms, stopping problem, racing problem, pick the winner, theory},\n\tpages = {672--679},\n\ttitle = {Empirical Bernstein stopping},\n\turl_paper = {bernstein-stopping.pdf},\n\tyear = {2008},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1145/1390156.1390241}}\n\n
\n
\n\n\n
\n Sampling is a popular way of scaling up machine learning algorithms to large datasets. The question often is how many samples are needed. Adaptive stopping algorithms monitor the performance in an online fashion and they can stop early, saving valuable resources. We consider problems where probabilistic guarantees are desired and demonstrate how recently-introduced empirical Bernstein bounds can be used to design stopping rules that are efficient. We provide upper bounds on the sample complexity of the new rules, as well as empirical results on model selection and boosting in the filtering setting.\n
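A much-simplified stopping rule in the spirit of the abstract, assuming observations bounded in a known range: keep sampling until an empirical Bernstein confidence radius certifies the desired relative accuracy, with a crude 1/t^2 union bound over the (per-step) checks. The constants and the checking schedule are simplifications, not the paper's optimized rules.

```python
import numpy as np

def eb_stop(sample, eps=0.1, delta=0.05, value_range=1.0, max_n=10**6):
    """Keep sampling until an empirical-Bernstein confidence radius certifies
    relative error eps for the running mean, with probability >= 1 - delta."""
    xs = []
    for t in range(1, max_n + 1):
        xs.append(sample())
        if t < 2:
            continue
        mean = float(np.mean(xs))
        var = float(np.var(xs))                  # empirical variance
        d_t = 6 * delta / (np.pi ** 2 * t ** 2)  # union bound over all check times
        ct = (np.sqrt(2 * var * np.log(3 / d_t) / t)
              + 3 * value_range * np.log(3 / d_t) / t)
        if ct <= eps * abs(mean):
            return mean, t
    return float(np.mean(xs)), max_n

# usage sketch: eb_stop(lambda: np.random.rand(), eps=0.05)
```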
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Dyna-Style Planning with Linear Function Approximation and Prioritized Sweeping.\n \n \n \n \n\n\n \n Sutton, R.; Szepesvári, C.; Geramifard, A.; and Bowling, M. H.\n\n\n \n\n\n\n In UAI, pages 528–536, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"Dyna-StyleLink\n  \n \n \n \"Dyna-Style paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{sutton2008a,\n\tabstract = {We consider the problem of efficiently learning optimal control policies and value functions over large state spaces in an online setting in which estimates must be available after each interaction with the world. This paper develops an explicitly model-based approach extending the Dyna architecture to linear function approximation. Dyna-style planning proceeds by generating imaginary experience from the world model and then applying model-free reinforcement learning algorithms to the imagined state transitions. Our main results are to prove that linear Dyna-style planning converges to a unique solution independent of the generating distribution, under natural conditions. In the policy evaluation setting, we prove that the limit point is the least-squares (LSTD) solution. An implication of our results is that prioritized-sweeping can be soundly extended to the linear approximation case, backing up to preceding features rather than to preceding states. We introduce two versions of prioritized sweeping with linear Dyna and briefly illustrate their performance empirically on the Mountain Car and Boyan Chain problems.},\n\tacceptrate = {28\\%},\n\tauthor = {Sutton, R.S. and Szepesv{\\'a}ri, Cs. and Geramifard, A. and Bowling, M. H.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {UAI},\n\tee = {http://uai2008.cs.helsinki.fi/UAI_camera_ready/sutton.pdf},\n\tkeywords = {reinforcement learning, planning, theory, stochastic approximation, asymptotic convergence, function approximation},\n\tpages = {528--536},\n\ttitle = {Dyna-Style Planning with Linear Function Approximation and Prioritized Sweeping},\n\turl_paper = {linearDyna.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n We consider the problem of efficiently learning optimal control policies and value functions over large state spaces in an online setting in which estimates must be available after each interaction with the world. This paper develops an explicitly model-based approach extending the Dyna architecture to linear function approximation. Dyna-style planning proceeds by generating imaginary experience from the world model and then applying model-free reinforcement learning algorithms to the imagined state transitions. Our main results are to prove that linear Dyna-style planning converges to a unique solution independent of the generating distribution, under natural conditions. In the policy evaluation setting, we prove that the limit point is the least-squares (LSTD) solution. An implication of our results is that prioritized-sweeping can be soundly extended to the linear approximation case, backing up to preceding features rather than to preceding states. We introduce two versions of prioritized sweeping with linear Dyna and briefly illustrate their performance empirically on the Mountain Car and Boyan Chain problems.\n
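A schematic rendering of the linear Dyna idea summarized above: learn a linear model (F, b) of expected next features and expected reward, and interleave TD(0) updates on real and on model-generated ("imagined") transitions. The feature-sampling routine, the shared step size and the absence of prioritized sweeping are assumptions of this sketch.

```python
import numpy as np

def linear_dyna_step(theta, F, b, phi, r, phi_next,
                     sample_features, gamma=0.99, alpha=0.05, n_plan=10):
    """One real-experience update of the linear model and value weights,
    followed by n_plan planning updates on imagined feature vectors."""
    # model learning: expected next features and expected reward, both linear in phi
    F = F + alpha * np.outer(phi_next - F @ phi, phi)
    b = b + alpha * (r - b @ phi) * phi
    # direct TD(0) update from the real transition
    delta = r + gamma * theta @ phi_next - theta @ phi
    theta = theta + alpha * delta * phi
    # planning: TD(0) updates on model-generated (imagined) transitions
    for _ in range(n_plan):
        x = sample_features()        # user-supplied distribution over feature vectors
        delta = b @ x + gamma * theta @ (F @ x) - theta @ x
        theta = theta + alpha * delta * x
    return theta, F, b
```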
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Convergent O(n) Algorithm for Off-policy Temporal-difference Learning with Linear Function Approximation.\n \n \n \n \n\n\n \n Sutton, R.; Szepesvári, C.; and Maei, H.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1609–1616, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"ALink\n  \n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{sutton2008,\n\tabstract = {We introduce the first temporal-difference learning algorithm that is stable with linear function approximation and off-policy training, for any finite Markov decision process, behavior policy, and target policy, and whose complexity scales linearly in the number of parameters. We consider an i.i.d. policy-evaluation setting in which the data need not come from on-policy experience. The gradient temporal-difference (GTD) algorithm estimates the expected update vector of the TD(0) algorithm and performs stochastic gradient descent on its L2 norm. We prove that this algorithm is stable and convergent under the usual stochastic approximation conditions to the same least-squares solution as found by the LSTD, but without LSTD's quadratic computational complexity. GTD is online and incremental, and does not involve multiplying by products of likelihood ratios as in importance-sampling methods.},\n\tacceptrate = {24\\%},\n\tauthor = {Sutton, R.S. and Szepesv{\\'a}ri, Cs. and Maei, H.R.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tee = {https://papers.neurips.cc/paper/3626-a-convergent-on-temporal-difference-algorithm-for-off-policy-learning-with-linear-function-approximation.pdf},\n\tkeywords = {reinforcement learning, prediction, online learning, gradient algorithm, stochastic approximation, theory, function approximation, GTD},\n\tpages = {1609--1616},\n\ttitle = {A Convergent O(n) Algorithm for Off-policy Temporal-difference Learning with Linear Function Approximation},\n\turl_paper = {gtdNeurIPS08.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n We introduce the first temporal-difference learning algorithm that is stable with linear function approximation and off-policy training, for any finite Markov decision process, behavior policy, and target policy, and whose complexity scales linearly in the number of parameters. We consider an i.i.d. policy-evaluation setting in which the data need not come from on-policy experience. The gradient temporal-difference (GTD) algorithm estimates the expected update vector of the TD(0) algorithm and performs stochastic gradient descent on its L2 norm. We prove that this algorithm is stable and convergent under the usual stochastic approximation conditions to the same least-squares solution as found by the LSTD, but without LSTD's quadratic computational complexity. GTD is online and incremental, and does not involve multiplying by products of likelihood ratios as in importance-sampling methods.\n
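The O(n) structure described above can be sketched as follows: a secondary weight vector tracks the expected TD update, and the main weights perform stochastic gradient descent on its squared norm. Importance weights for off-policy data and the paper's step-size conditions are omitted for brevity; this is an illustration, not the full algorithm.

```python
import numpy as np

def gtd_update(theta, u, phi, r, phi_next, gamma=0.99, alpha=0.01, beta=0.05):
    """One GTD step: u tracks the expected TD update E[delta * phi];
    theta descends the gradient of ||E[delta * phi]||^2."""
    delta = r + gamma * theta @ phi_next - theta @ phi
    u = u + beta * (delta * phi - u)
    theta = theta + alpha * (phi - gamma * phi_next) * (phi @ u)
    return theta, u
```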
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Speeding Up Planning in Markov Decision Processes via Automatically Constructed Abstractions.\n \n \n \n \n\n\n \n Isaza, A.; Szepesvári, C.; Bulitko, V.; and Greiner, R.\n\n\n \n\n\n\n In UAI, pages 306–314, 2008. \n \n\n\n\n
\n\n\n\n \n \n \"Speeding paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{a.isaza2008,\n\tabstract = {In this paper, we consider planning in stochastic shortest path problems, a subclass of Markov Decision Problems (MDP). We focus on medium-size problems whose state space can be fully enumerated. This problem has numerous important applications, such as navigation and planning under uncertainty. We propose a new approach for constructing a multi-level hierarchy of progressively simpler abstractions of the original problem. Once computed, the hierarchy can be used to speed up planning by first finding a policy for the most abstract level and then recursively refining it into a solution to the original problem. This approach is fully automated and delivers a speed-up of two orders of magnitude over a state-of-the-art MDP solver on sample problems while returning near-optimal solutions.},\n\tacceptrate = {28\\%},\n\tauthor = {Isaza, A. and Szepesv{\\'a}ri, Cs. and Bulitko, V. and Greiner, R.},\n\tbooktitle = {UAI},\n\tkeywords = {planning, finite MDPs, abstraction, macro learning, shortest path problem, options},\n\towner = {Beata},\n\tpages = {306--314},\n\ttimestamp = {2010.08.29},\n\ttitle = {Speeding Up Planning in Markov Decision Processes via Automatically Constructed Abstractions},\n\turl_paper = {prmdp.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n In this paper, we consider planning in stochastic shortest path problems, a subclass of Markov Decision Problems (MDP). We focus on medium-size problems whose state space can be fully enumerated. This problem has numerous important applications, such as navigation and planning under uncertainty. We propose a new approach for constructing a multi-level hierarchy of progressively simpler abstractions of the original problem. Once computed, the hierarchy can be used to speed up planning by first finding a policy for the most abstract level and then recursively refining it into a solution to the original problem. This approach is fully automated and delivers a speed-up of two orders of magnitude over a state-of-the-art MDP solver on sample problems while returning near-optimal solutions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Active learning in Multi-armed bandits.\n \n \n \n \n\n\n \n Antos, A.; Grover, V.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, of Lecture Notes in Computer Science 5254, pages 287–302, 2008. Springer-Verlag\n See [a.antos2010] for an extended version\n\n\n\n
\n\n\n\n \n \n \"Active paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{antos2008,\n\tabstract = {In this paper we consider the problem of actively learning the mean values of distributions associated with a finite number of options (arms). The algorithms can select which option to generate the next sample from in order to produce estimates with equally good precision for all the distributions. When an algorithm uses sample means to estimate the unknown values then the optimal solution, assuming full knowledge of the distributions, is to sample each option proportional to its variance. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss suffered by this algorithm, apart from logarithmic factors, scales as $1/n^{(3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated in a simple problem.},\n\tauthor = {Antos, A. and Grover, V. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {active learning, regression, sequential algorithms, theory},\n\tnote = {See \\cite{a.antos2010} for an extended version},\n\tpages = {287--302},\n\tpublisher = {Springer-Verlag},\n\tseries = {Lecture Notes in Computer Science 5254},\n\ttitle = {Active learning in Multi-armed bandits},\n\turl_paper = {Allocation.pdf},\n\tyear = {2008}}\n\n
\n
\n\n\n
\n In this paper we consider the problem of actively learning the mean values of distributions associated with a finite number of options (arms). The algorithms can select which option to generate the next sample from in order to produce estimates with equally good precision for all the distributions. When an algorithm uses sample means to estimate the unknown values then the optimal solution, assuming full knowledge of the distributions, is to sample each option proportional to its variance. In this paper we propose an incremental algorithm that asymptotically achieves the same loss as an optimal rule. We prove that the excess loss suffered by this algorithm, apart from logarithmic factors, scales as $1/n^{(3/2)}$, which we conjecture to be the optimal rate. The performance of the algorithm is illustrated in a simple problem.\n
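As a toy illustration of the allocation target above (sampling each arm in proportion to its variance), the sketch below greedily pulls the arm whose share of pulls most lags its share of empirical variance. This is not the paper's incremental algorithm or its forced-exploration schedule; the names and the initialization length are made up.

```python
import numpy as np

def allocate_by_variance(arms, budget, n_init=5):
    """Toy allocation rule for estimating all arm means: after a short
    initialization phase, give the next sample to the arm whose share of
    pulls lags its share of empirical variance the most."""
    samples = [[arm() for _ in range(n_init)] for arm in arms]
    for _ in range(budget - n_init * len(arms)):
        counts = np.array([len(s) for s in samples], dtype=float)
        variances = np.array([np.var(s) for s in samples]) + 1e-12
        deficit = variances / variances.sum() - counts / counts.sum()
        i = int(np.argmax(deficit))
        samples[i].append(arms[i]())
    return [float(np.mean(s)) for s in samples]

# usage sketch:
# allocate_by_variance([lambda: np.random.normal(0, 1), lambda: np.random.normal(1, 3)], 500)
```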
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Active learning of group-structured environments.\n \n \n \n \n\n\n \n Bartók, G.; Szepesvári, C.; and Zilles, S.\n\n\n \n\n\n\n In ALT, of Lecture Notes in Computer Science 5254, pages 329–343, 2008. Springer-Verlag\n See [bartok2010a] for a longer, updated version\n\n\n\n
\n\n\n\n \n \n \"Active paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{bartok2008,\n\tabstract = {The question investigated in this paper is to what extent an input representation influences the success of learning, in particular from the point of view of analyzing agents that can interact with their environment. We investigate learning environments that have a group structure. We introduce a learning model in different variants and study under which circumstances group structures can be learned efficiently from experimenting with group generators (actions). Negative results are presented, even without efficiency constraints, for rather general classes of groups showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.},\n\tauthor = {Bart{\\'o}k, G. and Szepesv{\\'a}ri, Cs. and Zilles, S.},\n\tbooktitle = {ALT},\n\tdoi = {10.1007/978-3-540-87987-9_28},\n\tkeywords = {active learning, sequential algorithms, theory},\n\tnote = {See \\cite{bartok2010a} for a longer, updated version},\n\tpages = {329--343},\n\tpublisher = {Springer-Verlag},\n\tseries = {Lecture Notes in Computer Science 5254},\n\ttitle = {Active learning of group-structured environments},\n\turl_paper = {bartokSZ08group.pdf},\n\tyear = {2008},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/978-3-540-87987-9_28}}\n\n
\n
\n\n\n
\n The question investigated in this paper is to what extent an input representation influences the success of learning, in particular from the point of view of analyzing agents that can interact with their environment. We investigate learning environments that have a group structure. We introduce a learning model in different variants and study under which circumstances group structures can be learned efficiently from experimenting with group generators (actions). Negative results are presented, even without efficiency constraints, for rather general classes of groups showing that even with group structure, learning an environment from partial information is far from trivial. However, positive results for special subclasses of Abelian groups turn out to be a good starting point for the design of efficient learning algorithms based on structured representations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Finite Time Bounds for Fitted Value Iteration.\n \n \n \n \n\n\n \n Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n JMLR, 9: 815–857. 2008.\n \n\n\n\n
\n\n\n\n \n \n \"Finite link\n  \n \n \n \"Finite paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{munos2008,\n\tabstract = {This is the longer version of the ICML'2005 paper with all the proofs and some extra material. In this paper we develop a theoretical analysis of the performance of sampling-based fitted value iteration (FVI) to solve infinite state-space, discounted-reward Markovian decision processes (MDPs) under the assumption that a generative model of the environment is available. Our main results come in the form of finite-time bounds on the performance of two versions of sampling-based FVI.The convergence rate results obtained allow us to show that both versions of FVI are well behaving in the sense that by using a sufficiently large number of samples for a large class of MDPs, arbitrary good performance can be achieved with high probability.An important feature of our proof technique is that it permits the study of weighted $L^p$-norm performance bounds. As a result, our technique applies to a large class of function-approximation methods (e.g., neural networks, adaptive regression trees, kernel machines, locally weighted learning), and our bounds scale well with the effective horizon of the MDP. The bounds show a dependence on the stochastic stability properties of the MDP: they scale with the discounted-average concentrability of the future-state distributions. They also depend on a new measure of the approximation power of the function space, the inherent Bellman residual, which reflects how well the function space is ``aligned'' with the dynamics and rewards of the MDP.The conditions of the main result, as well as the concepts introduced in the analysis, are extensively discussed and compared to previous theoretical results.Numerical experiments are used to substantiate the theoretical findings.},\n\tauthor = {Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {JMLR},\n\tkeywords = {batch learning, reinforcement learning, theory, performance bounds, nonparametrics},\n\towner = {Beata},\n\tpages = {815--857},\n\ttimestamp = {2010.08.30},\n\ttitle = {Finite Time Bounds for Fitted Value Iteration},\n\turl_link = {http://www.jmlr.org/papers/volume9/munos08a/munos08a.pdf},\n\turl_paper = {munos08a.pdf},\n\tvolume = {9},\n\tyear = {2008},\n\tBdsk-Url-1 = {http://www.jmlr.org/papers/volume9/munos08a/munos08a.pdf}}\n\n
\n
\n\n\n
\n This is the longer version of the ICML'2005 paper with all the proofs and some extra material. In this paper we develop a theoretical analysis of the performance of sampling-based fitted value iteration (FVI) to solve infinite state-space, discounted-reward Markovian decision processes (MDPs) under the assumption that a generative model of the environment is available. Our main results come in the form of finite-time bounds on the performance of two versions of sampling-based FVI. The convergence rate results obtained allow us to show that both versions of FVI are well-behaved in the sense that by using a sufficiently large number of samples for a large class of MDPs, arbitrarily good performance can be achieved with high probability. An important feature of our proof technique is that it permits the study of weighted $L^p$-norm performance bounds. As a result, our technique applies to a large class of function-approximation methods (e.g., neural networks, adaptive regression trees, kernel machines, locally weighted learning), and our bounds scale well with the effective horizon of the MDP. The bounds show a dependence on the stochastic stability properties of the MDP: they scale with the discounted-average concentrability of the future-state distributions. They also depend on a new measure of the approximation power of the function space, the inherent Bellman residual, which reflects how well the function space is ``aligned'' with the dynamics and rewards of the MDP. The conditions of the main result, as well as the concepts introduced in the analysis, are extensively discussed and compared to previous theoretical results. Numerical experiments are used to substantiate the theoretical findings.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2007\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Sequence prediction exploiting similarity information.\n \n \n \n \n\n\n \n Bíró, I.; Szamonek, Z.; and Szepesvári, C.\n\n\n \n\n\n\n In IJCAI, pages 1576–1581, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Sequence paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BiSzaSze07,\n\tabstract = {When data is scarce or the alphabet is large, smoothing the probability estimates becomes inescapable when estimating n-gram models. In this paper we propose a method that implements a form of smoothing by exploiting similarity information of the alphabet elements. The idea is to view the log-conditional probability function as a smooth function defined over the similarity graph. The algorithm that we propose uses the eigenvectors of the similarity graph as the basis of the expansion of the log conditional probability function whose coefficients are found by solving a regularized logistic regression problem. The experimental results demonstrate the superiority of the method when the similarity graph contains relevant information, whilst the method still remains competitive with state-of-the-art smoothing methods even in the lack of such information.},\n\tauthor = {B\\'\\ir{\\'o}, I. and Szamonek, Z. and and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {IJCAI},\n\tkeywords = {sequence prediction, natural language processing, spectral graph theory, smoothing, learning basis functions},\n\tpages = {1576--1581},\n\ttitle = {Sequence prediction exploiting similarity information},\n\turl_paper = {sequence-ijcai07.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n When data is scarce or the alphabet is large, smoothing the probability estimates becomes inescapable when estimating n-gram models. In this paper we propose a method that implements a form of smoothing by exploiting similarity information of the alphabet elements. The idea is to view the log-conditional probability function as a smooth function defined over the similarity graph. The algorithm that we propose uses the eigenvectors of the similarity graph as the basis of the expansion of the log conditional probability function whose coefficients are found by solving a regularized logistic regression problem. The experimental results demonstrate the superiority of the method when the similarity graph contains relevant information, whilst the method still remains competitive with state-of-the-art smoothing methods even in the lack of such information.\n
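A schematic of the recipe in the abstract: take eigenvectors of the similarity graph's (unnormalized) Laplacian corresponding to the smallest eigenvalues as a smooth basis over the alphabet, and fit a regularized logistic-regression bigram model on those features. The Laplacian variant, basis size and regression settings are illustrative assumptions.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def spectral_basis(similarity, n_basis=10):
    """Smooth basis over the alphabet: Laplacian eigenvectors with the
    smallest eigenvalues (one row of features per symbol)."""
    W = np.asarray(similarity, dtype=float)
    L = np.diag(W.sum(axis=1)) - W            # unnormalized graph Laplacian
    eigvals, eigvecs = np.linalg.eigh(L)      # eigenvalues in ascending order
    return eigvecs[:, :n_basis]

def fit_bigram_model(contexts, targets, basis, C=1.0):
    """Regularized multinomial logistic regression predicting the next symbol
    index from the spectral features of the previous symbol index."""
    X = basis[np.asarray(contexts)]
    return LogisticRegression(C=C, max_iter=1000).fit(X, targets)
```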
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Fitted Q-iteration in Continuous Action-space MDPs.\n \n \n \n \n\n\n \n Antos, A.; Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 9–16, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Fitted paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{antos2007,\n\tabstract = {We consider continuous state, continuous action batch reinforcement learning where the goal is to learn a good policy from a sufficiently rich trajectory generated by some policy. We study a variant of fitted Q-iteration, where the greedy action selection is replaced by searching for a policy in a restricted set of candidate policies by maximizing the average action values. We provide a rigorous analysis of this algorithm, proving what we believe is the first finite-time bound for value-function based algorithms for continuous state and action problems.\n\n\t\t  Note: In retrospect, it would have been better to call this algorithm an actor-critic algorithm. The algorithm that we considers updates a policy and a value function (action-value function in this case).},\n\tauthor = {Antos, A. and Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {batch learning, reinforcement learning, function approximation, performance bounds, actor-critic methods, nonparametrics},\n\tpages = {9--16},\n\ttitle = {Fitted Q-iteration in Continuous Action-space MDPs},\n\turl_paper = {rlca.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n We consider continuous state, continuous action batch reinforcement learning where the goal is to learn a good policy from a sufficiently rich trajectory generated by some policy. We study a variant of fitted Q-iteration, where the greedy action selection is replaced by searching for a policy in a restricted set of candidate policies by maximizing the average action values. We provide a rigorous analysis of this algorithm, proving what we believe is the first finite-time bound for value-function based algorithms for continuous state and action problems. Note: In retrospect, it would have been better to call this algorithm an actor-critic algorithm. The algorithm that we consider updates a policy and a value function (an action-value function in this case).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Manifold-adaptive Dimension Estimation.\n \n \n \n \n\n\n \n Farahmand, A.; Szepesvári, C.; and Audibert, J.\n\n\n \n\n\n\n In ICML, pages 265–272, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Manifold-adaptiveLink\n  \n \n \n \"Manifold-adaptive paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{farahmand2007,\n\tabstract = {Intuitively, learning should be easier when the data points lie on a low-dimensional submanifold of the input space. Recently there has been a growing interest in algorithms that aim to exploit such geometrical properties of the data. Oftentimes these algorithms require estimating the dimension of the manifold first. In this paper we propose an algorithm for dimension estimation and study its finite-sample behaviour. The algorithm estimates the dimension locally around the data points using nearest neighbor techniques and then combines these local estimates. We show that the rate of convergence of the resulting estimate is independent of the dimension of the input space and hence the algorithm is ``manifold-adaptive''. Thus, when the manifold supporting the data is low dimensional, the algorithm can be exponentially more efficient than its counterparts that are not exploiting this property. Our computer experiments confirm the obtained theoretical results.},\n\tacceptrate = {29\\%},\n\tauthor = {Farahmand, A.m. and Szepesv{\\'a}ri, Cs. and Audibert, J.-Y.},\n\tbibsource = {DBLP, http://dblp.uni-trier.de},\n\tbooktitle = {ICML},\n\tee = {http://doi.acm.org/10.1145/1273496.1273530},\n\tkeywords = {unsupervised learning, dimension estimation, theory, manifold learning},\n\tpages = {265--272},\n\ttitle = {Manifold-adaptive Dimension Estimation},\n\turl_paper = {dimicml.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n Intuitively, learning should be easier when the data points lie on a low-dimensional submanifold of the input space. Recently there has been a growing interest in algorithms that aim to exploit such geometrical properties of the data. Oftentimes these algorithms require estimating the dimension of the manifold first. In this paper we propose an algorithm for dimension estimation and study its finite-sample behaviour. The algorithm estimates the dimension locally around the data points using nearest neighbor techniques and then combines these local estimates. We show that the rate of convergence of the resulting estimate is independent of the dimension of the input space and hence the algorithm is ``manifold-adaptive''. Thus, when the manifold supporting the data is low dimensional, the algorithm can be exponentially more efficient than its counterparts that are not exploiting this property. Our computer experiments confirm the obtained theoretical results.\n
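A sketch of a nearest-neighbour local dimension estimate of the kind described above: at each point, compare the distance to the k-th and the floor(k/2)-th nearest neighbour and combine the local estimates across points. The precise estimator, its corrections and the combination rule used in the paper are not reproduced; the median combination and the brute-force distance computation below are assumptions.

```python
import numpy as np

def estimate_dimension(X, k=10):
    """Combine local nearest-neighbour dimension estimates over the data set.
    Each local estimate compares the distances to the k-th and floor(k/2)-th
    nearest neighbour of a point."""
    X = np.asarray(X, dtype=float)
    dists = np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)
    dists.sort(axis=1)                        # column 0 is the point itself
    r_k = dists[:, k]
    r_half = dists[:, k // 2]
    ratio = np.clip(r_k / r_half, 1.0 + 1e-9, None)
    local = np.log(2.0) / np.log(ratio)       # local dimension estimates
    return float(np.median(local))            # robust combination
```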
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Value-iteration Based Fitted Policy Iteration: Learning with a Single Trajectory.\n \n \n \n \n\n\n \n Antos, A.; Szepesvári, C.; and Munos, R.\n\n\n \n\n\n\n In 2007 IEEE Symposium on Approximate Dynamic Programming and Reinforcement Learning (ADPRL 2007), pages 330–337, 2007. IEEE\n (Honolulu, Hawaii, Apr 1–5, 2007.)\n\n\n\n
\n\n\n\n \n \n \"Value-iteration paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{antos2007a,\n\tabstract = {We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems when the training data is composed of the trajectory of some fixed behaviour policy. The algorithm studied is policy iteration where in successive iterations the action-value functions of the intermediate policies are obtained by means of approximate value iteration. PAC-style polynomial bounds are derived on the number of samples needed to guarantee near optimal performance. The bounds depend on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used. One of the main novelties of the paper is that new smoothness constraints are introduced thereby significantly extending the scope of previous results.},\n\tauthor = {Antos, A. and Szepesv{\\'a}ri, Cs. and Munos, R.},\n\tbooktitle = {2007 IEEE Symposium on Approximate Dynamic Programming and Reinforcement Learning (ADPRL 2007)},\n\tkeywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration},\n\tnote = {(Honolulu, Hawaii, Apr 1--5, 2007.)},\n\tpages = {330--337},\n\tpublisher = {IEEE},\n\ttitle = {Value-iteration Based Fitted Policy Iteration: Learning with a Single Trajectory},\n\turl_paper = {sapi_adprl4aa.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems when the training data is composed of the trajectory of some fixed behaviour policy. The algorithm studied is policy iteration where in successive iterations the action-value functions of the intermediate policies are obtained by means of approximate value iteration. PAC-style polynomial bounds are derived on the number of samples needed to guarantee near optimal performance. The bounds depend on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used. One of the main novelties of the paper is that new smoothness constraints are introduced thereby significantly extending the scope of previous results.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Tuning Bandit Algorithms in Stochastic Environments.\n \n \n \n \n\n\n \n Audibert, J.; Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n In ALT, pages 150–165, 2007. Springer\n See [audibert2009] for a longer, updated version\n\n\n\n
\n\n\n\n \n \n \"Tuning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{audibert2007,\n\tabstract = {Algorithms based on upper-confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. In this paper we consider a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. The purpose of this paper is to provide a theoretical explanation of these findings and provide theoretical guidelines for the tuning of the parameters of these algorithms. For this we analyze the expected regret and for the first time the concentration of the regret. The analysis of the expected regret shows that variance estimates can be especially advantageous when the payoffs of suboptimal arms have low variance. The risk analysis, rather unexpectedly, reveals that except some very special bandit problems, for upper confidence bound based algorithms with standard bias sequences, the regret concentrates only at a polynomial rate. Hence, although these algorithms achieve logarithmic expected regret rates, they seem less attractive when the risk of achieving much worse than logarithmic cumulative regret is also taken into account.},\n\tacceptrate = {50\\%},\n\tauthor = {Audibert, J.-Y. and Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ALT},\n\tkeywords = {multi-armed bandits, sequential algorithms, stochastic bandits, Bernstein's inequality, theory},\n\tnote = {See \\cite{audibert2009} for a longer, updated version},\n\tpages = {150--165},\n\tppt = {talks/ALT07-UCBTuned-Talk.ppt},\n\tpublisher = {Springer},\n\ttitle = {Tuning Bandit Algorithms in Stochastic Environments},\n\turl_paper = {ucb_alt.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n Algorithms based on upper-confidence bounds for balancing exploration and exploitation are gaining popularity since they are easy to implement, efficient and effective. In this paper we consider a variant of the basic algorithm for the stochastic, multi-armed bandit problem that takes into account the empirical variance of the different arms. In earlier experimental works, such algorithms were found to outperform the competing algorithms. The purpose of this paper is to provide a theoretical explanation of these findings and provide theoretical guidelines for the tuning of the parameters of these algorithms. For this we analyze the expected regret and for the first time the concentration of the regret. The analysis of the expected regret shows that variance estimates can be especially advantageous when the payoffs of suboptimal arms have low variance. The risk analysis, rather unexpectedly, reveals that except some very special bandit problems, for upper confidence bound based algorithms with standard bias sequences, the regret concentrates only at a polynomial rate. Hence, although these algorithms achieve logarithmic expected regret rates, they seem less attractive when the risk of achieving much worse than logarithmic cumulative regret is also taken into account.\n
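A sketch of the variance-aware index analyzed above (often referred to as UCB-V): the exploration bonus has a Bernstein-style term using the empirical variance plus a range-dependent correction. The exploration function log t and the constants below are the simplest choices, stated as assumptions rather than the tuned parameters discussed in the paper.

```python
import numpy as np

def ucb_v(arms, horizon, value_range=1.0):
    """Variance-aware UCB: play each arm once, then repeatedly pull the arm
    maximizing empirical mean + Bernstein-style exploration bonus."""
    k = len(arms)
    rewards = [[arms[i]()] for i in range(k)]
    for t in range(k + 1, horizon + 1):
        means = np.array([np.mean(r) for r in rewards])
        variances = np.array([np.var(r) for r in rewards])
        counts = np.array([len(r) for r in rewards])
        bonus = (np.sqrt(2 * variances * np.log(t) / counts)
                 + 3 * value_range * np.log(t) / counts)
        i = int(np.argmax(means + bonus))
        rewards[i].append(arms[i]())
    return [float(np.mean(r)) for r in rewards]
```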
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Improved Rates for the Stochastic Continuum-Armed Bandit Problem.\n \n \n \n \n\n\n \n Auer, P.; Ortner, R.; and Szepesvári, C.\n\n\n \n\n\n\n In COLT, pages 454–468, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Improved paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{auer2007,\n\tabstract = {Considering one-dimensional continuum-armed bandit problems, we propose an improvement of an algorithm of Kleinberg and a new set of conditions which give rise to improved rates. In particular, we introduce a novel assumption that is complementary to the previous smoothness conditions, while at the same time smoothness of the mean payoff function is required only at the maxima. Under these new assumptions new bounds on the expected regret are derived. In particular, we show that apart from logarithmic factors, the expected regret scales with the square-root of the number of trials, provided that the mean payoff function has finitely many maxima and its second derivatives are continuous and non-vanishing at the maxima. This improves a previous result of Cope by weakening the assumptions on the function. We also derive matching lower bounds. To complement the bounds on the expected regret, we provide high probability bounds which exhibit similar scaling.},\n\tacceptrate = {43\\%},\n\tauthor = {Auer, P. and Ortner, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {COLT},\n\tkeywords = {bandits, multi-armed bandits, large action space, stochastic bandits, theory, minimax bounds},\n\tpages = {454--468},\n\ttitle = {Improved Rates for the Stochastic Continuum-Armed Bandit Problem},\n\turl_paper = {ContinuousBandits.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n Considering one-dimensional continuum-armed bandit problems, we propose an improvement of an algorithm of Kleinberg and a new set of conditions which give rise to improved rates. In particular, we introduce a novel assumption that is complementary to the previous smoothness conditions, while at the same time smoothness of the mean payoff function is required only at the maxima. Under these new assumptions new bounds on the expected regret are derived. In particular, we show that apart from logarithmic factors, the expected regret scales with the square-root of the number of trials, provided that the mean payoff function has finitely many maxima and its second derivatives are continuous and non-vanishing at the maxima. This improves a previous result of Cope by weakening the assumptions on the function. We also derive matching lower bounds. To complement the bounds on the expected regret, we provide high probability bounds which exhibit similar scaling.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Continuous Time Associative Bandit Problems.\n \n \n \n \n\n\n \n György, A.; Kocsis, L.; Szabó, I.; and Szepesvári, C.\n\n\n \n\n\n\n In IJCAI, pages 830–835, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Continuous paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{gyorgy2007,\n\tabstract = {In this paper we consider an extension of the multi-armed bandit problem. In this generalized setting, the decision maker receives some side information, performs an action chosen from a finite set and then receives a reward. Unlike in the standard bandit settings, performing an action takes a random period of time. The environment is assumed to be stationary, stochastic and memoryless. The goal is to maximize the average reward received in one unit time, that is, to maximize the average rate of return. We consider the on-line learning problem where the decision maker initially does not know anything about the environment but must learn about it by trial and error. We propose an ``upper confidence bound''-style algorithm that exploits the structure of the problem. We show that the regret of this algorithm relative to the optimal algorithm that has perfect knowledge about the problem grows at the optimal logarithmic rate in the number of decisions and scales polynomially with the parameters of the problem.},\n\tacceptrate = {35\\%},\n\tauthor = {Gy{\\"o}rgy, A. and Kocsis, L. and Szab{\\'o}, I. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {IJCAI},\n\tkeywords = {bandits, multi-armed bandits, semi-Markov decision processes, average payoff, theory},\n\tpages = {830--835},\n\ttitle = {Continuous Time Associative Bandit Problems},\n\turl_paper = {cbandit-ijcai07.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n In this paper we consider an extension of the multi-armed bandit problem. In this generalized setting, the decision maker receives some side information, performs an action chosen from a finite set and then receives a reward. Unlike in the standard bandit settings, performing an action takes a random period of time. The environment is assumed to be stationary, stochastic and memoryless. The goal is to maximize the average reward received in one unit time, that is, to maximize the average rate of return. We consider the on-line learning problem where the decision maker initially does not know anything about the environment but must learn about it by trial and error. We propose an "upper confidence bound"-style algorithm that exploits the structure of the problem. We show that the regret of this algorithm relative to the optimal algorithm that has perfect knowledge about the problem grows at the optimal logarithmic rate in the number of decisions and scales polynomially with the parameters of the problem.\n
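To make the "maximize reward per unit time" objective concrete, the sketch below runs a UCB-style index over arms that return a (reward, duration) pair, ranking arms by empirical reward rate plus a generic confidence bonus. The index form, constants and the `rate_ucb` interface are illustrative assumptions, not the index or the bounds derived in the paper.

```python
import math
import random

def rate_ucb(arms, horizon, seed=0):
    """UCB-style play when pulling arm i returns a (reward, duration) pair.

    The objective is reward per unit time; the index below (empirical rate plus
    a generic bonus) is a placeholder, not the index analysed in the paper.
    """
    rng = random.Random(seed)
    k = len(arms)
    pulls, rew, dur = [0] * k, [0.0] * k, [0.0] * k
    for t in range(1, horizon + 1):
        if t <= k:                                  # try every arm once first
            i = t - 1
        else:
            i = max(range(k),
                    key=lambda j: rew[j] / dur[j] + math.sqrt(2 * math.log(t) / pulls[j]))
        r, d = arms[i](rng)
        pulls[i] += 1
        rew[i] += r
        dur[i] += d
    return rew, dur

# Example arms: Bernoulli reward with mean p, exponential duration with mean m.
arms = [lambda rng, p=p, m=m: (float(rng.random() < p), rng.expovariate(1.0 / m))
        for p, m in [(0.3, 1.0), (0.8, 2.5), (0.5, 0.7)]]
rewards, durations = rate_ucb(arms, horizon=2000)
```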
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Apprenticeship Learning using Inverse Reinforcement Learning and Gradient Methods.\n \n \n \n \n\n\n \n Neu, G.; and Szepesvári, C.\n\n\n \n\n\n\n In UAI, pages 295–302, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Apprenticeship paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{neu2007,\n\tabstract = {In this paper we propose a novel gradient algorithm to learn a policy from an expert's observed behavior assuming that the expert behaves optimally with respect to some unknown reward function of a Markovian Decision Problem. The algorithm's aim is to find a reward function such that the resulting optimal policy matches well the expert's observed behavior. The main difficulty is that the mapping from the parameters to policies is both nonsmooth and highly redundant. Resorting to subdifferentials solves the first difficulty, while the second one is overcome by computing natural gradients. We tested the proposed method in two artificial domains and found it to be more reliable and efficient than some previous methods.},\n\tacceptrate = {32\\%},\n\tauthor = {Neu, G. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {UAI},\n\tkeywords = {theory, application, reinforcement learning, apprenticeship learning, natural gradient, inverse reinforcement learning},\n\tpages = {295--302},\n\ttitle = {Apprenticeship Learning using Inverse Reinforcement Learning and Gradient Methods},\n\turl_paper = {uai2007-irl.pdf},\n\tyear = {2007}}\n\n
\n
\n\n\n
\n In this paper we propose a novel gradient algorithm to learn a policy from an expert's observed behavior assuming that the expert behaves optimally with respect to some unknown reward function of a Markovian Decision Problem. The algorithm's aim is to find a reward function such that the resulting optimal policy matches well the expert's observed behavior. The main difficulty is that the mapping from the parameters to policies is both nonsmooth and highly redundant. Resorting to subdifferentials solves the first difficulty, while the second one is overcome by computing natural gradients. We tested the proposed method in two artificial domains and found it to be more reliable and efficient than some previous methods.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2006\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Bandit based Monte-Carlo Planning.\n \n \n \n \n\n\n \n Kocsis, L.; and Szepesvári, C.\n\n\n \n\n\n\n In ECML, pages 282–293, 2006. \n \n\n\n\n
\n\n\n\n \n \n \"Bandit paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 13 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{kocsis2006a,\n\tabstract = {For large state-space Markovian Decision Problems Monte-Carlo planning is one of the few viable approaches to find near-optimal solutions. In this paper we introduce a new algorithm, UCT, that applies bandit ideas to guide Monte-Carlo planning. In finite-horizon or discounted MDPs the algorithm is shown to be consistent and finite sample bounds are derived on the estimation error due to sampling. Experimental results show that in several domains, UCT is significantly more efficient than its alternatives.},\n\tauthor = {Kocsis, L. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ECML},\n\tkeywords = {reinforcement learning, learning in games, Monte-Carlo methods, Monte-Carlo tree search, UCT, bandits},\n\tpages = {282--293},\n\ttitle = {Bandit based Monte-Carlo Planning},\n\turl_paper = {ecml06.pdf},\n\tyear = {2006}}\n\n
\n
\n\n\n
\n For large state-space Markovian Decision Problems Monte-Carlo planning is one of the few viable approaches to find near-optimal solutions. In this paper we introduce a new algorithm, UCT, that applies bandit ideas to guide Monte-Carlo planning. In finite-horizon or discounted MDPs the algorithm is shown to be consistent and finite sample bounds are derived on the estimation error due to sampling. Experimental results show that in several domains, UCT is significantly more efficient than its alternatives.\n
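A compact, UCT-flavoured planning sketch under a generative model: the tree policy picks actions by an upper-confidence index on (state, depth, action) statistics and returns are backed up along the simulated trajectory. The exploration constant, the handling of unvisited actions and the absence of a separate rollout policy are simplifications of this sketch, not the paper's exact algorithm or analysis.

```python
import math
import random

def uct_plan(root, actions, step, horizon, n_sims=2000, c=1.4, seed=0):
    """UCT-flavoured Monte-Carlo planning from `root` with a generative model.

    step(state, action, rng) returns (next_state, reward); states must be hashable.
    """
    rng = random.Random(seed)
    counts, values = {}, {}                      # (state, depth, action) -> visits / mean return

    def select(state, depth, t):
        def score(a):
            key = (state, depth, a)
            if counts.get(key, 0) == 0:
                return float("inf")              # try unvisited actions first
            return values[key] + c * math.sqrt(math.log(t) / counts[key])
        return max(actions, key=score)

    def simulate(state, depth, t):
        if depth == horizon:
            return 0.0
        a = select(state, depth, t)
        nxt, r = step(state, a, rng)
        ret = r + simulate(nxt, depth + 1, t)    # back the return up the sampled path
        key = (state, depth, a)
        counts[key] = counts.get(key, 0) + 1
        values[key] = values.get(key, 0.0) + (ret - values.get(key, 0.0)) / counts[key]
        return ret

    for t in range(1, n_sims + 1):
        simulate(root, 0, t)
    return max(actions, key=lambda a: values.get((root, 0, a), float("-inf")))

# Example: a 6-step chain where action 1 moves right and reaching state 5 pays 1.
best = uct_plan(0, actions=[0, 1], horizon=6, n_sims=500,
                step=lambda s, a, rng: (min(s + a, 5), 1.0 if s + a >= 5 else 0.0))
```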
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Near-optimal Policies with Bellman-residual Minimization based Fitted Policy Iteration and a Single Sample Path.\n \n \n \n \n\n\n \n Antos, A.; Szepesvári, C.; and Munos, R.\n\n\n \n\n\n\n In COLT, pages 574–588, 2006. \n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{antos2006,\n\tabstract = {We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems. As opposed to previous theoretical work, we consider the case when the training data consists of a single sample path (trajectory) of some behaviour policy.In particular, we do not assume access to a generative model of the environment.The algorithm studied is fitted Q-iteration where in successive iterations the $Q$-functions of the intermediate policies are obtained by means of minimizing a novel Bellman-residual type error.PAC-style polynomial bounds are derived on the number of samples needed to guarantee near-optimal performance where the bound depends on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used.},\n\tauthor = {Antos, A. and Szepesv{\\'a}ri, Cs. and Munos, R.},\n\tbooktitle = {COLT},\n\tkeywords = {reinforcement learning, nonparametrics, theory, function approximation, policy iteration},\n\tpages = {574--588},\n\ttitle = {Learning Near-optimal Policies with Bellman-residual Minimization based Fitted Policy Iteration and a Single Sample Path},\n\turl_paper = {colt2006.pdf},\n\tyear = {2006}}\n\n
\n
\n\n\n
\n We consider batch reinforcement learning problems in continuous space, expected total discounted-reward Markovian Decision Problems. As opposed to previous theoretical work, we consider the case when the training data consists of a single sample path (trajectory) of some behaviour policy. In particular, we do not assume access to a generative model of the environment. The algorithm studied is fitted Q-iteration where in successive iterations the $Q$-functions of the intermediate policies are obtained by means of minimizing a novel Bellman-residual type error. PAC-style polynomial bounds are derived on the number of samples needed to guarantee near-optimal performance where the bound depends on the mixing rate of the trajectory, the smoothness properties of the underlying Markovian Decision Problem, the approximation power and capacity of the function set used.\n
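For orientation, here is the generic fitted Q-iteration loop with a linear function class fitted by least squares, i.e. the plain regression-based projection rather than the Bellman-residual-type error minimized in the paper. The `features` interface and the assumption that every action occurs in the batch are illustrative.

```python
import numpy as np

def fitted_q_iteration(transitions, n_actions, features, gamma=0.95, n_iter=50):
    """Fitted Q-iteration on a batch of (s, a, r, s') tuples with a linear function class.

    `features(states)` maps an array of states to a feature matrix; one weight
    vector per action is refit by least squares at every iteration.  Assumes
    every action occurs somewhere in the batch.
    """
    s = np.array([t[0] for t in transitions], dtype=float)
    a = np.array([t[1] for t in transitions])
    r = np.array([t[2] for t in transitions], dtype=float)
    s2 = np.array([t[3] for t in transitions], dtype=float)
    phi, phi2 = features(s), features(s2)

    weights = [np.zeros(phi.shape[1]) for _ in range(n_actions)]
    for _ in range(n_iter):
        q_next = np.max(np.column_stack([phi2 @ w for w in weights]), axis=1)
        targets = r + gamma * q_next                       # one-step Bellman targets
        for act in range(n_actions):
            mask = a == act
            weights[act], *_ = np.linalg.lstsq(phi[mask], targets[mask], rcond=None)
    return weights   # greedy action at state x: argmax_a features(x) @ weights[a]

# Example feature map for scalar states: [1, s, s^2].
poly = lambda states: np.column_stack([np.ones_like(states), states, states ** 2])
```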
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Universal Parameter Optimisation in Games Based on SPSA.\n \n \n \n \n\n\n \n Kocsis, L.; and Szepesvári, C.\n\n\n \n\n\n\n Machine Learning Journal, 63: 249–286. 2006.\n \n\n\n\n
\n\n\n\n \n \n \"Universal paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{kocsis2006,\n\tabstract = {[..] The goal of this paper is twofold: (i) to introduce SPSA for the game programming community by putting it into a game-programming perspective, and (ii) to propose and discuss several methods that can be used to enhance the performance of SPSA. These methods include using common random numbers and antithetic variables, a combination of SPSA with RPROP, and the reuse of samples of previous performance evaluations. SPSA with the proposed enhancements was tested in some large-scale experiments on tuning the parameters of an opponent model, a policy and an evaluation function in our poker program, McRAISE. Whilst SPSA with no enhancements failed to make progress using the allocated resources, SPSA with the enhancements proved to be competitive with other methods, including TD-learning; increasing the average payor per game by as large as 0.19 times the size of the amount of the small bet. From the experimental study, we conclude that the use of an appropriately enhanced variant of SPSA for the optimisation of game program parameters is a viable approach, especially if no good alternative exist for the types of parameters considered.},\n\tauthor = {Kocsis, L. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tdoi = {10.1007/s10994-006-6888-8},\n\tjournal = {Machine Learning Journal},\n\tkeywords = {SPSA, game playing, poker, Monte-Carlo methods, application, theory},\n\tpages = {249--286},\n\ttitle = {Universal Parameter Optimisation in Games Based on SPSA},\n\turl_paper = {mlj2005.pdf},\n\tvolume = {63},\n\tyear = {2006},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/s10994-006-6888-8}}\n\n
\n
\n\n\n
\n [..] The goal of this paper is twofold: (i) to introduce SPSA for the game programming community by putting it into a game-programming perspective, and (ii) to propose and discuss several methods that can be used to enhance the performance of SPSA. These methods include using common random numbers and antithetic variables, a combination of SPSA with RPROP, and the reuse of samples of previous performance evaluations. SPSA with the proposed enhancements was tested in some large-scale experiments on tuning the parameters of an opponent model, a policy and an evaluation function in our poker program, McRAISE. Whilst SPSA with no enhancements failed to make progress using the allocated resources, SPSA with the enhancements proved to be competitive with other methods, including TD-learning, increasing the average payoff per game by as much as 0.19 times the size of the small bet. From the experimental study, we conclude that the use of an appropriately enhanced variant of SPSA for the optimisation of game program parameters is a viable approach, especially if no good alternative exists for the types of parameters considered.\n
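A basic SPSA iteration, for readers unfamiliar with the method: two noisy loss evaluations per step yield a simultaneous-perturbation gradient estimate, whatever the dimension. The gain schedules below are common textbook defaults and the quadratic example is synthetic; none of the paper's enhancements (common random numbers, antithetic variables, RPROP coupling, sample reuse) are shown.

```python
import numpy as np

def spsa(loss, theta0, n_iter=500, a=0.1, c=0.1, alpha=0.602, gamma=0.101, seed=0):
    """Basic simultaneous perturbation stochastic approximation.

    `loss(theta)` may be a noisy evaluation (e.g. the outcome of a few games);
    only two evaluations are needed per step, regardless of the dimension.
    """
    rng = np.random.default_rng(seed)
    theta = np.array(theta0, dtype=float)
    for k in range(1, n_iter + 1):
        ak = a / k ** alpha                                  # step-size schedule
        ck = c / k ** gamma                                  # perturbation-size schedule
        delta = rng.choice([-1.0, 1.0], size=theta.shape)    # Rademacher perturbation
        g_hat = (loss(theta + ck * delta) - loss(theta - ck * delta)) / (2 * ck) / delta
        theta -= ak * g_hat
    return theta

# Example: minimize a noisy 5-dimensional quadratic centred at 1.
noisy = lambda th: float(np.sum((th - 1.0) ** 2) + np.random.normal(scale=0.1))
theta_hat = spsa(noisy, np.zeros(5))
```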
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n RSPSA: Enhanced Parameter Optimisation in Games.\n \n \n \n \n\n\n \n Kocsis, L.; Szepesvári, C.; and Winands, M.\n\n\n \n\n\n\n In ACG, pages 1– 2, 2006. \n \n\n\n\n
\n\n\n\n \n \n \"RSPSA: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{l.kocsis2006,\n\tabstract = {Most game programs have a large number of parameters that are crucial for their performance. While tuning these parameters by hand is rather difficult, successful applications of automatic optimisation algorithms in game programs are known only for parameters that belong to certain components (e.g. evaluation-function parameters). The SPSA (Simultaneous Perturbation Stochastic Approximation) algorithm is an attractive choice for optimising any kind of parameters of a game program, both for its generality and its simplicity. It's disadvantage is that it can be very slow. In this article we propose several methods to speed up SPSA, in particular, the combination with RPROP, using common random numbers, antithetic variables and averaging. We test the resulting algorithm for tuning various types of parameters in two domains, poker and LOA. From the experimental study, we conclude that using SPSA is a viable approach for optimisation in game programs, especially if no good alternative exists for the types of parameters considered.},\n\tauthor = {Kocsis, L. and Szepesv{\\'a}ri, Cs. and Winands, M.H.M.},\n\tbooktitle = {ACG},\n\tkeywords = {SPSA, game playing, poker, Monte-Carlo methods, application, theory},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.30},\n\ttitle = {RSPSA: Enhanced Parameter Optimisation in Games},\n\turl_paper = {rspsa_acg.pdf},\n\tyear = {2006}}\n\n
\n
\n\n\n
\n Most game programs have a large number of parameters that are crucial for their performance. While tuning these parameters by hand is rather difficult, successful applications of automatic optimisation algorithms in game programs are known only for parameters that belong to certain components (e.g. evaluation-function parameters). The SPSA (Simultaneous Perturbation Stochastic Approximation) algorithm is an attractive choice for optimising any kind of parameters of a game program, both for its generality and its simplicity. Its disadvantage is that it can be very slow. In this article we propose several methods to speed up SPSA, in particular, the combination with RPROP, using common random numbers, antithetic variables and averaging. We test the resulting algorithm for tuning various types of parameters in two domains, poker and LOA. From the experimental study, we conclude that using SPSA is a viable approach for optimisation in game programs, especially if no good alternative exists for the types of parameters considered.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Local Importance Sampling: A Novel Technique to Enhance Particle Filtering.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n Journal of Multimedia, 1: 32–43. 2006.\n \n\n\n\n
\n\n\n\n \n \n \"Local paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{torma2006,\n\tabstract = {In the low observation noise limit particle filters become inefficient. In this paper a simple-to-implement particle filter is suggested as a solution to this well-known problem. The proposed Local Importance Sampling based particle filters draw the particles' positions in a two-step process that makes use of both the dynamics of the system and the most recent observation. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed very successful when observations are reliable. Experiments with a high-dimensional variant of this problem further show that the advantage of the new filter grows with the increasing dimensionality of the system.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Journal of Multimedia},\n\tkeywords = {vision, particle filtering, theory, application},\n\tpages = {32--43},\n\ttimestamp = {2010.08.31},\n\ttitle = {Local Importance Sampling: A Novel Technique to Enhance Particle Filtering},\n\turl_paper = {jmm01013243.pdf},\n\tvolume = {1},\n\tyear = {2006}}\n\n
\n
\n\n\n
\n In the low observation noise limit particle filters become inefficient. In this paper a simple-to-implement particle filter is suggested as a solution to this well-known problem. The proposed Local Importance Sampling based particle filters draw the particles' positions in a two-step process that makes use of both the dynamics of the system and the most recent observation. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed very successful when observations are reliable. Experiments with a high-dimensional variant of this problem further show that the advantage of the new filter grows with the increasing dimensionality of the system.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2005\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Maximum Margin Discriminant Analysis based Face Recognition.\n \n \n \n \n\n\n \n Kornél, K.; Kocsor, A.; and Szepesvári, C.\n\n\n \n\n\n\n In HACIPPR-2005 (Joint Hungarian-Austrian Conference on Image Processing and Pattern Recognition), pages 1– 2, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"Maximum paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{k.kornel2005,\n\tabstract = {Face recognition is a highly non-trivial classification problem since the input is high-dimensional and there are many classes with just a few examples per class. In this paper we propose using a recent algorithm -- Maximum Margin Discriminant Analysis (MMDA) -- to solve face recognition problems. MMDA is a feature extraction method that is derived from a set of sound principles: (i) each feature should maximize information transmission about the classification labels, (ii) only the decision boundary should determine the features and (iii) features should reveal independent information about the class labels. Previously, MMDA was shown to yield good performance scores on a number of standard benchmark problems. Here we show that MMDA is capable of finding good features in face recognition and performs very well provided it is preceded by an appropriate preprocessing phase. },\n\tauthor = {Korn{\\'e}l, K. and Kocsor, A. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {HACIPPR-2005 (Joint Hungarian-Austrian Conference on Image Processing and Pattern Recognition)},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tentrysubtype = {unrefereed},\n\tkeywords = {vision, application, supervised feature extraction, supervised learning, classification},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.31},\n\ttitle = {Maximum Margin Discriminant Analysis based Face Recognition},\n\turl_paper = {mmda-hacippr2005.pdf},\n\tyear = {2005}}\n\n
\n
\n\n\n
\n Face recognition is a highly non-trivial classification problem since the input is high-dimensional and there are many classes with just a few examples per class. In this paper we propose using a recent algorithm – Maximum Margin Discriminant Analysis (MMDA) – to solve face recognition problems. MMDA is a feature extraction method that is derived from a set of sound principles: (i) each feature should maximize information transmission about the classification labels, (ii) only the decision boundary should determine the features and (iii) features should reveal independent information about the class labels. Previously, MMDA was shown to yield good performance scores on a number of standard benchmark problems. Here we show that MMDA is capable of finding good features in face recognition and performs very well provided it is preceded by an appropriate preprocessing phase. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Reduced-Variance Payoff Estimation in Adversarial Bandit Problems.\n \n \n \n \n\n\n \n Kocsis, L.; and Szepesvári, C.\n\n\n \n\n\n\n In Proceedings of the ECML-2005 Workshop on Reinforcement Learning in Non-Stationary Environments, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"Reduced-Variance paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{kocsis2005,\n\tabstract = {A natural way to compare learning methods in non-stationary environments is to compare their regret. In this paper we consider the regret of algorithms in adversarial multi-armed bandit problems. We propose several methods to improve the performance of the baseline exponentially weighted average forecaster by changing the payoff-estimation methods. We argue that improved performance can be achieved by constructing payoff estimation methods that produce estimates with low variance. Our arguments are backed up by both theoretical and empirical results. In fact, our empirical results show that significant performance gains are possible over the baseline algorithm.},\n\tauthor = {Kocsis, L. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proceedings of the ECML-2005 Workshop on Reinforcement Learning in Non-Stationary Environments},\n\tentrysubtype = {unrefereed},\n\tkeywords = {online learning, adversarial setting, bandits},\n\tslide-handout = {talks/ecml2005-rlw-handout.pdf},\n\tslides = {talks/ECML2005-rlw-slides.pdf},\n\ttitle = {Reduced-Variance Payoff Estimation in Adversarial Bandit Problems},\n\turl_paper = {kocsis-ecml2005-ext.pdf},\n\tyear = {2005}}\n\n
\n
\n\n\n
\n A natural way to compare learning methods in non-stationary environments is to compare their regret. In this paper we consider the regret of algorithms in adversarial multi-armed bandit problems. We propose several methods to improve the performance of the baseline exponentially weighted average forecaster by changing the payoff-estimation methods. We argue that improved performance can be achieved by constructing payoff estimation methods that produce estimates with low variance. Our arguments are backed up by both theoretical and empirical results. In fact, our empirical results show that significant performance gains are possible over the baseline algorithm.\n
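For context, this is the baseline exponentially weighted forecaster (Exp3) with the standard importance-weighted payoff estimate whose variance the paper aims to reduce; the learning-rate and exploration constants below are illustrative.

```python
import math
import random

def exp3(payoff, n_arms, horizon, eta=0.05, gamma=0.05, seed=0):
    """Baseline Exp3: exponentially weighted forecaster with importance-weighted
    payoff estimates (rewards assumed to lie in [0, 1]).

    The paper is about replacing the high-variance estimate `x_hat` below with
    lower-variance ones; this sketch only shows the baseline forecaster.
    """
    rng = random.Random(seed)
    weights = [1.0] * n_arms
    total = 0.0
    for _ in range(horizon):
        wsum = sum(weights)
        probs = [(1 - gamma) * w / wsum + gamma / n_arms for w in weights]
        i = rng.choices(range(n_arms), weights=probs)[0]
        x = payoff(i)
        total += x
        x_hat = x / probs[i]                 # unbiased importance-weighted estimate
        weights[i] *= math.exp(eta * x_hat)
        top = max(weights)                   # rescale to avoid numerical overflow
        weights = [w / top for w in weights]
    return total

# Example: arm i pays a Bernoulli reward with mean 0.3 + 0.1 * i.
rng = random.Random(2)
reward = exp3(lambda i: float(rng.random() < 0.3 + 0.1 * i), n_arms=5, horizon=3000)
```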
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Log-optimal Currency Portfolios and Control Lyapunov Exponents.\n \n \n \n \n\n\n \n Gerencsér, L.; Rásonyi, M.; Szepesvári, C.; and Vágó, Z.\n\n\n \n\n\n\n In CDC, pages 1764–1769, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"Log-optimal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{l.gerencser2005,\n\tabstract = {P. Algoet and T. Cover characterized log-optimal portfolios in a stationary market without friction. There is no analogous result for markets with friction, of which a currency market is a typical example. In this paper we restrict ourselves to simple static strategies. The problem is then reduced to the analysis of products of random matrices, the top-Lyapunov exponent giving the growth rate. New insights to products of random matrices will be given and an algorithm for optimizing top-Lyapunov exponents will be presented together with some key steps of its analysis. Simulation results will also be given. [..]},\n\tauthor = {Gerencs{\\'e}r, L. and R{\\'a}sonyi, M. and Szepesv{\\'a}ri, Cs. and V{\\'a}g{\\'o}, Zs.},\n\tbooktitle = {CDC},\n\tkeywords = {finance, control, gradient algorithm},\n\towner = {Beata},\n\tpages = {1764--1769},\n\ttimestamp = {2010.08.30},\n\ttitle = {Log-optimal Currency Portfolios and Control Lyapunov Exponents},\n\turl_paper = {cdc2005.pdf},\n\tyear = {2005}}\n\n
\n
\n\n\n
\n P. Algoet and T. Cover characterized log-optimal portfolios in a stationary market without friction. There is no analogous result for markets with friction, of which a currency market is a typical example. In this paper we restrict ourselves to simple static strategies. The problem is then reduced to the analysis of products of random matrices, the top-Lyapunov exponent giving the growth rate. New insights to products of random matrices will be given and an algorithm for optimizing top-Lyapunov exponents will be presented together with some key steps of its analysis. Simulation results will also be given. [..]\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Finite Time Bounds for Sampling Based Fitted Value Iteration.\n \n \n \n \n\n\n \n Munos, R.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 881—886, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"Finite paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{munos2005,\n\tabstract = {In this paper we consider sampling based fitted value iteration for discounted, large (possibly infinite) state space, finite action Markovian Decision Problems where only a generative model of the transition probabilities and rewards is available. At each step the image of the current estimate of the optimal value function under a Monte-Carlo approximation to the Bellman-operator is projected onto some function space. PAC-style bounds on the weighted $L^p$-norm approximation error are obtained as a function of the covering number and the approximation power of the function space, the iteration number and the sample size.},\n\tauthor = {Munos, R. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {batch learning, reinforcement learning, theory, performance bounds, nonparametrics},\n\tpages = {881---886},\n\tpresentation = {talks/icml2005_talk.pdf},\n\ttitle = {Finite Time Bounds for Sampling Based Fitted Value Iteration},\n\turl_paper = {savi_icml2005.pdf},\n\tyear = {2005}}\n\n
\n
\n\n\n
\n In this paper we consider sampling based fitted value iteration for discounted, large (possibly infinite) state space, finite action Markovian Decision Problems where only a generative model of the transition probabilities and rewards is available. At each step the image of the current estimate of the optimal value function under a Monte-Carlo approximation to the Bellman-operator is projected onto some function space. PAC-style bounds on the weighted $L^p$-norm approximation error are obtained as a function of the covering number and the approximation power of the function space, the iteration number and the sample size.\n
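A sketch of the sampling-based fitted value iteration loop: at a set of base points the Bellman optimality backup is approximated by Monte-Carlo samples from a generative model, and the result is projected onto a function space by regression. The `gen_model`/`fit` interfaces and the quadratic example fit are assumptions made for illustration, not the paper's formal construction.

```python
import numpy as np

def sampled_fitted_vi(points, actions, gen_model, fit, gamma=0.95, n_iter=40, n_mc=20, seed=0):
    """Sampling-based fitted value iteration with a generative model.

    gen_model(x, a, rng) returns (next_state, reward); fit(X, y) returns a
    callable approximation of the value function on states.
    """
    rng = np.random.default_rng(seed)
    V = lambda X: np.zeros(len(X))               # start from the zero value function
    for _ in range(n_iter):
        targets = []
        for x in points:
            backups = []
            for a in actions:
                samples = [gen_model(x, a, rng) for _ in range(n_mc)]
                nxt = np.array([s for s, _ in samples])
                rew = np.array([r for _, r in samples])
                backups.append(np.mean(rew + gamma * V(nxt)))   # Monte-Carlo Bellman backup
            targets.append(max(backups))
        V = fit(np.array(points), np.array(targets))            # project onto the function space
    return V

# Example `fit` for scalar states: least squares onto the features [1, x, x^2].
def quad_fit(X, y):
    A = np.column_stack([np.ones_like(X), X, X ** 2])
    w, *_ = np.linalg.lstsq(A, y, rcond=None)
    return lambda Z: np.column_stack([np.ones_like(Z), Z, Z ** 2]) @ w
```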
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On using Likelihood-adjusted Proposals in Particle Filtering: Local Importance Sampling.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In 4th International Symposium on Image and Signal Processing and Analysis, pages 1–2, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2005,\n\tabstract = {An unsatisfactory property of particle filters is that they may become inefficient when the observation noise is low. In this paper we consider a simple-to-implement particle filter, called `LIS-based particle filter', whose aim is to overcome the above mentioned weakness. LIS-based particle filters sample the particles in a two-stage process that uses information of the most recent observation, too. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed a viable alternative to other methods.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {4th International Symposium on Image and Signal Processing and Analysis},\n\tkeywords = {vision, particle filtering, theory, application},\n\towner = {Beata},\n\tpages = {1--2},\n\ttimestamp = {2010.08.31},\n\ttitle = {On using Likelihood-adjusted Proposals in Particle Filtering: Local Importance Sampling},\n\turl_paper = {torma-ispa2005.pdf},\n\tyear = {2005}}\n\n
\n
\n\n\n
\n An unsatisfactory property of particle filters is that they may become inefficient when the observation noise is low. In this paper we consider a simple-to-implement particle filter, called 'LIS-based particle filter', whose aim is to overcome the above-mentioned weakness. LIS-based particle filters sample the particles in a two-stage process that uses information of the most recent observation, too. Experiments with the standard bearings-only tracking problem indicate that the proposed new particle filter method is indeed a viable alternative to other methods.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2004\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Interpolation-based Q-learning.\n \n \n \n \n\n\n \n Szepesvári, C.; and Smart, W. D.\n\n\n \n\n\n\n In ICML, pages 791–798, 2004. \n \n\n\n\n
\n\n\n\n \n \n \"Interpolation-based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari2004,\n\tabstract = {We consider a variant of Q-learning in continuous state spaces under the total expected discounted cost criterion combined with local function approximation methods. Provided that the function approximator satisfies certain interpolation properties, the resulting algorithm is shown to converge with probability one. The limit function is shown to satisfy a fixed point equation of the Bellman type, where the fixed point operator depends on the stationary distribution of the exploration policy and approximation properties of the function approximation method. The basic algorithm is extended in several ways. In particular, a variant of the algorithm is obtained that is shown to converge in probability to the optimal Q function. Preliminary computer simulations confirm the validity of the approach.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and Smart, W. D.},\n\tbooktitle = {ICML},\n\tkeywords = {reinforcement learning, control learning, online learning, stochastic approximation, theory, function approximation},\n\tpages = {791--798},\n\ttitle = {Interpolation-based Q-learning},\n\turl_paper = {szws_icml2004_rlfapp.pdf},\n\tyear = {2004}}\n\n
\n
\n\n\n
\n We consider a variant of Q-learning in continuous state spaces under the total expected discounted cost criterion combined with local function approximation methods. Provided that the function approximator satisfies certain interpolation properties, the resulting algorithm is shown to converge with probability one. The limit function is shown to satisfy a fixed point equation of the Bellman type, where the fixed point operator depends on the stationary distribution of the exploration policy and approximation properties of the function approximation method. The basic algorithm is extended in several ways. In particular, a variant of the algorithm is obtained that is shown to converge in probability to the optimal Q function. Preliminary computer simulations confirm the validity of the approach.\n
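To illustrate the interpolation idea in the simplest case, the sketch below runs Q-learning on a one-dimensional continuous state, storing Q-values at grid points and spreading each TD update over the two neighbouring grid points with linear-interpolation weights. The toy environment and step-size choices are this sketch's own; the paper's setting and convergence conditions are more general.

```python
import numpy as np

def interp_q_learning(env_step, reset, n_actions, grid, episodes=200, max_steps=200,
                      gamma=0.99, alpha=0.2, eps=0.1, seed=0):
    """Q-learning on a 1-d continuous state with linear interpolation on `grid`.

    Q(x, a) is interpolated from values stored at the grid points, and each TD
    update is spread over the two bracketing grid points with the interpolation weights.
    """
    rng = np.random.default_rng(seed)
    grid = np.asarray(grid, dtype=float)
    Q = np.zeros((len(grid), n_actions))

    def coeffs(x):
        i = int(np.clip(np.searchsorted(grid, x) - 1, 0, len(grid) - 2))
        w = float(np.clip((x - grid[i]) / (grid[i + 1] - grid[i]), 0.0, 1.0))
        return i, w

    def q_of(x):
        i, w = coeffs(x)
        return (1 - w) * Q[i] + w * Q[i + 1]

    for _ in range(episodes):
        x = reset(rng)
        for _ in range(max_steps):
            a = int(rng.integers(n_actions)) if rng.random() < eps else int(np.argmax(q_of(x)))
            x2, r, done = env_step(x, a, rng)
            target = r + (0.0 if done else gamma * np.max(q_of(x2)))
            i, w = coeffs(x)
            td = target - q_of(x)[a]
            Q[i, a] += alpha * (1 - w) * td       # split the update between the two
            Q[i + 1, a] += alpha * w * td         # grid points bracketing x
            if done:
                break
            x = x2
    return Q

# Toy example: drift left/right on [0, 1]; being at x >= 0.9 pays 1 and ends the episode.
step = lambda x, a, rng: (float(np.clip(x + (0.1 if a else -0.1) + 0.01 * rng.standard_normal(),
                                        0.0, 1.0)),
                          float(x >= 0.9), x >= 0.9)
Q = interp_q_learning(step, lambda rng: 0.5, n_actions=2, grid=np.linspace(0, 1, 21))
```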
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Margin Maximizing Discriminant Analysis.\n \n \n \n \n\n\n \n Kocsor, A.; Kornél, K.; and Szepesvári, C.\n\n\n \n\n\n\n In ECML/PKDD-2004, pages 1– 2, 2004. \n \n\n\n\n
\n\n\n\n \n \n \"Margin paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{a.kocsor2004,\n\tabstract = {We propose a new feature extraction method called Margin Maximizing Discriminant Analysis (MMDA) which seeks to extract features suitable for classification tasks. MMDA is based on the principle that an ideal feature should convey the maximum information about the class labels and it should depend only on the geometry of the optimal decision boundary and not on those parts of the distribution of the input data that do not participate in shaping this boundary. Further, distinct feature components should convey unrelated information about the data. Two feature extraction methods are proposed for calculating the parameters of such a projection that are shown to yield equivalent results. The kernel mapping idea is used to derive non-linear versions. Experiments with several real-world, publicly available data sets demonstrate that the new method yields competitive results. },\n\tauthor = {Kocsor, A. and Korn{\\'e}l, K. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ECML/PKDD-2004},\n\tkeywords = {classification, supervised learning, supervised feature extraction, kernels},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.31},\n\ttitle = {Margin Maximizing Discriminant Analysis},\n\turl_paper = {mmda-ecml2004.pdf},\n\tyear = {2004}}\n\n
\n
\n\n\n
\n We propose a new feature extraction method called Margin Maximizing Discriminant Analysis (MMDA) which seeks to extract features suitable for classification tasks. MMDA is based on the principle that an ideal feature should convey the maximum information about the class labels and it should depend only on the geometry of the optimal decision boundary and not on those parts of the distribution of the input data that do not participate in shaping this boundary. Further, distinct feature components should convey unrelated information about the data. Two feature extraction methods are proposed for calculating the parameters of such a projection that are shown to yield equivalent results. The kernel mapping idea is used to derive non-linear versions. Experiments with several real-world, publicly available data sets demonstrate that the new method yields competitive results. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Kernel Machine Based Feature Extraction Algorithm for Regression Problems.\n \n \n \n \n\n\n \n Szepesvári, C.; Kornél, K.; and Kocsor, A.\n\n\n \n\n\n\n In ECAI, pages 1– 2, 2004. \n \n\n\n\n
\n\n\n\n \n \n \"Kernel paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{cs.Szepesvari2004,\n\tabstract = {In this paper we consider two novel kernel machine based feature extraction algorithms in a regression settings. The first method is derived based on the principles underlying the recently introduced Maximum Margin Discrimination Analysis (MMDA) algorithm. However, here it is shown that the orthogonalization principle employed by the original MMDA algorithm can be motivated using the well-known ambiguity decomposition, thus providing a firm ground for the good performance of the algorithm. The second algorithm combines kernel machines with average derivative estimation and is derived from the assumption that the true regressor function depends only on a subspace of the original input space. The proposed algorithms are evaluated in preliminary experiments conducted with artificial and real datasets. },\n\tauthor = {Szepesv{\\'a}ri, Cs. and Korn{\\'e}l, K. and Kocsor, A.},\n\tbooktitle = {ECAI},\n\tkeywords = {supervised feature extraction, supervised learning, regression},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.31},\n\ttitle = {Kernel Machine Based Feature Extraction Algorithm for Regression Problems},\n\turl_paper = {ecai-mmda.pdf},\n\tyear = {2004}}\n\n
\n
\n\n\n
\n In this paper we consider two novel kernel machine based feature extraction algorithms in a regression setting. The first method is derived based on the principles underlying the recently introduced Maximum Margin Discrimination Analysis (MMDA) algorithm. However, here it is shown that the orthogonalization principle employed by the original MMDA algorithm can be motivated using the well-known ambiguity decomposition, thus providing a firm ground for the good performance of the algorithm. The second algorithm combines kernel machines with average derivative estimation and is derived from the assumption that the true regressor function depends only on a subspace of the original input space. The proposed algorithms are evaluated in preliminary experiments conducted with artificial and real datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Shortest Path Discovery Problems: A Framework, Algorithms and Experimental Results.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In AAAI, pages 550–555, 2004. \n \n\n\n\n
\n\n\n\n \n \n \"Shortest paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari2004a,\n\tabstract = {In this paper we introduce and study Shortest Path Discovery (SPD) problems, a generalization of shortest path problems: In SPD one is given a directed edge-weighted graph and the task is to find a the shortest path for fixed source and target nodes such that initially the edge-weights are unknown, but they can be queried. Querying the cost of an edge is expensive and hence the goal is to minimize the total number of edge cost queries executed. In this article we characterize some common properties of sound SPD algorithms, propose a particular algorithm that is shown to be sound and effective. Experimental results on real-world OCR task demonstrate the usefulness of the approach whereas the proposed algorithm is shown to yield a substantial speed-up of the recognition process.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AAAI},\n\tkeywords = {online learning, theory, search, application, image processing, shortest path problem},\n\tpages = {550--555},\n\ttitle = {Shortest Path Discovery Problems: A Framework, Algorithms and Experimental Results},\n\turl_paper = {szepes-aaai2004.pdf},\n\tyear = {2004}}\n\n
\n
\n\n\n
\n In this paper we introduce and study Shortest Path Discovery (SPD) problems, a generalization of shortest path problems: In SPD one is given a directed edge-weighted graph and the task is to find the shortest path for fixed source and target nodes such that initially the edge-weights are unknown, but they can be queried. Querying the cost of an edge is expensive and hence the goal is to minimize the total number of edge cost queries executed. In this article we characterize some common properties of sound SPD algorithms, and propose a particular algorithm that is shown to be sound and effective. Experimental results on a real-world OCR task demonstrate the usefulness of the approach, and the proposed algorithm is shown to yield a substantial speed-up of the recognition process.\n
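A naive baseline for the query-counting setting described above: Dijkstra's algorithm that queries (and caches) an edge cost only when the edge is first relaxed, returning the number of oracle calls alongside the distance. This is only a reference point for the problem interface, not the SPD algorithm proposed in the paper.

```python
import heapq

def lazy_dijkstra(adj, query_cost, source, target):
    """Dijkstra that queries edge costs only when an edge is first relaxed.

    `adj[u]` lists the neighbours of u and `query_cost(u, v)` is the expensive
    cost oracle. Returns (shortest distance, number of cost queries).
    """
    cache, queries = {}, 0

    def cost(u, v):
        nonlocal queries
        if (u, v) not in cache:
            cache[(u, v)] = query_cost(u, v)
            queries += 1
        return cache[(u, v)]

    dist = {source: 0.0}
    heap = [(0.0, source)]
    done = set()
    while heap:
        d, u = heapq.heappop(heap)
        if u in done:
            continue
        if u == target:
            return d, queries
        done.add(u)
        for v in adj[u]:
            nd = d + cost(u, v)
            if nd < dist.get(v, float("inf")):
                dist[v] = nd
                heapq.heappush(heap, (nd, v))
    return float("inf"), queries

# Example: a four-node graph with a hypothetical cost oracle.
adj = {"s": ["a", "b"], "a": ["t"], "b": ["t"], "t": []}
true_cost = {("s", "a"): 1, ("s", "b"): 4, ("a", "t"): 2, ("b", "t"): 1}
dist, n_queries = lazy_dijkstra(adj, lambda u, v: true_cost[(u, v)], "s", "t")
```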
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Enhancing Particle Filters using Local Likelihood Sampling.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In ECCV, pages 16–27, 2004. \n \n\n\n\n
\n\n\n\n \n \n \"Enhancing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2004,\n\tabstract = {Particle filters provide a means to track the state of an object even when the dynamics and the observations are non-linear/non-Gaussian. However, they can be very inefficient when the observation noise is low as compared to the system noise, as it is often the case in visual tracking applications. In this paper we propose a new two-stage sampling procedure to boost the performance of particle filters under this condition. The new procedure is shown to reduce the variance of the weights by means of a theoretical analysis. This result is confirmed in a series of synthetic and real-world visual tracking experiments.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ECCV},\n\tkeywords = {vision, particle filtering, theory, application},\n\towner = {Beata},\n\tpages = {16--27},\n\ttimestamp = {2010.08.31},\n\ttitle = {Enhancing Particle Filters using Local Likelihood Sampling},\n\turl_paper = {lls-short.pdf},\n\tyear = {2004}}\n\n
\n
\n\n\n
\n Particle filters provide a means to track the state of an object even when the dynamics and the observations are non-linear/non-Gaussian. However, they can be very inefficient when the observation noise is low as compared to the system noise, as it is often the case in visual tracking applications. In this paper we propose a new two-stage sampling procedure to boost the performance of particle filters under this condition. The new procedure is shown to reduce the variance of the weights by means of a theoretical analysis. This result is confirmed in a series of synthetic and real-world visual tracking experiments.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2003\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n Performance of Nonlinear Approximate Adaptive Controllers.\n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n Wiley, 2003.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@book{french2003,\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2018-11-24 20:17:54 -0800},\n\tkeywords = {control, theory, adaptive control, Lyapunov design, function approximation},\n\tpublisher = {Wiley},\n\ttitle = {Performance of Nonlinear Approximate Adaptive Controllers},\n\tyear = {2003}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Sequential Importance Sampling for Visual Tracking Reconsidered.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In Bishop, C. M.; and Frey, B. J., editor(s), AISTATS, pages 271–278, 2003. \n \n\n\n\n
\n\n\n\n \n \n \"Sequential paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2003,\n\tabstract = {We consider the task of filtering dynamical systems observed in noise by means of sequential importance sampling when the proposal is restricted to the innovation components of the state. It is argued that the unmodified sequential importance sampling/resampling (SIR) algorithm may yield high variance estimates of the posterior in this case, resulting in poor performance when e.g. in visual tracking one tries to build a SIR algorithm on the top of the output of a color blob detector. A new method that associates the innovations sampled from the proposal and the particles in a separate computational step is proposed. The method is shown to outperform the unmodified SIR algorithm in a series of vision based object tracking experiments, both in terms of accuracy and robustness.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {AISTATS},\n\teditor = {C. M. Bishop, B. J. Frey},\n\tkeywords = {vision, particle filtering, theory, application},\n\towner = {Beata},\n\tpages = {271--278},\n\ttimestamp = {2010.08.31},\n\ttitle = {Sequential Importance Sampling for Visual Tracking Reconsidered},\n\turl_paper = {sisrc.pdf},\n\tyear = {2003}}\n\n
\n
\n\n\n
\n We consider the task of filtering dynamical systems observed in noise by means of sequential importance sampling when the proposal is restricted to the innovation components of the state. It is argued that the unmodified sequential importance sampling/resampling (SIR) algorithm may yield high-variance estimates of the posterior in this case, resulting in poor performance when e.g. in visual tracking one tries to build a SIR algorithm on top of the output of a color blob detector. A new method that associates the innovations sampled from the proposal and the particles in a separate computational step is proposed. The method is shown to outperform the unmodified SIR algorithm in a series of vision-based object tracking experiments, both in terms of accuracy and robustness.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Combining Local Search, Neural Networks and Particle Filters to Achieve Fast and Reliable Contour Tracking.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In 2003 IEEE International Symposium on Intelligent Signal Processing, 2003. \n \n\n\n\n
\n\n\n\n \n \n \"Combining paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2003a,\n\tabstract = {LS-N-IPS is an extension of the standard N-IPS particle filter (also known as CONDENSATION in the image processing literature). The modified algorithm adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. A critical choice in the design of LS-N-IPS is the way the local search is implemented. Here, we introduce a method based on training artificial neural networks for implementing the local search. In experiments with real-life data (visual tracking) the method is shown to improve robustness and performance significantly, surpassing the performance of previous state-of-the-art algorithms.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {2003 IEEE International Symposium on Intelligent Signal Processing},\n\tkeywords = {vision, particle filtering, theory, application},\n\towner = {Beata},\n\ttimestamp = {2010.08.31},\n\ttitle = {Combining Local Search, Neural Networks and Particle Filters to Achieve Fast and Reliable Contour Tracking},\n\turl_paper = {lsn-ipsneuro.pdf},\n\tyear = {2003}}\n\n
\n
\n\n\n
\n LS-N-IPS is an extension of the standard N-IPS particle filter (also known as CONDENSATION in the image processing literature). The modified algorithm adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. A critical choice in the design of LS-N-IPS is the way the local search is implemented. Here, we introduce a method based on training artificial neural networks for implementing the local search. In experiments with real-life data (visual tracking) the method is shown to improve robustness and performance significantly, surpassing the performance of previous state-of-the-art algorithms.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2002\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Towards Facial Pose Tracking.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In Proc. First Hungarian Computer Graphics and Geometry Conference, pages 10–16, Budapest, Hungary, 2002. \n \n\n\n\n
\n\n\n\n \n \n \"Towards paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2002,\n\tabstract = {This paper presents a novel facial-pose tracking algorithm using LS-N-IPS (Local Search N-Interacting Particle System), an algorithm that has been introduced recently by the authors. LS-N-IPS is a probabilistic tracking algorithm that keeps track of a number of alternative hypotheses at any time, the particles. LS-N-IPS has three components: a dynamical model, an observation model, and a local-search operator that has to be chosen by the algorithm designer. The main novelty of the algorithm presented here is that it relies on shading information to guide the local search procedure, the idea of the search being to apply a sort-of Hough-transformation to the mapping that renders poses to images. Here we introduce this algorithm and report results on the task of tracking of synthetic facial masks using grey-scale image sequences.},\n\taddress = {Budapest, Hungary},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proc. First Hungarian Computer Graphics and Geometry Conference},\n\tkeywords = {vision, particle filtering, application},\n\towner = {Beata},\n\tpages = {10--16},\n\ttimestamp = {2010.08.31},\n\ttitle = {Towards Facial Pose Tracking},\n\turl_paper = {MaskShadeLS.pdf},\n\tyear = {2002}}\n\n
\n
\n\n\n
\n This paper presents a novel facial-pose tracking algorithm using LS-N-IPS (Local Search N-Interacting Particle System), an algorithm that has been introduced recently by the authors. LS-N-IPS is a probabilistic tracking algorithm that keeps track of a number of alternative hypotheses at any time, the particles. LS-N-IPS has three components: a dynamical model, an observation model, and a local-search operator that has to be chosen by the algorithm designer. The main novelty of the algorithm presented here is that it relies on shading information to guide the local search procedure, the idea of the search being to apply a sort-of Hough-transformation to the mapping that renders poses to images. Here we introduce this algorithm and report results on the task of tracking of synthetic facial masks using grey-scale image sequences.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2001\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Ockham's Razor Modeling of the Matrisome Channels of the Basal Ganglia Thalamocortical Loop.\n \n \n \n \n\n\n \n Lörincz, A.; Hévizi, G.; and Szepesvári, C.\n\n\n \n\n\n\n International Journal of Neural Systems, 11: 1– 2. 2001.\n \n\n\n\n
\n\n\n\n \n \n \"Ockham's link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{a.lorincz2001,\n\tabstract = {A functional model of the basal ganglia-thalamocortical (BTC) loops is described. In our modeling effort, we try to minimize the complexity of our starting hypotheses. For that reason, we call this type of modeling Ockham's razor modeling. We have the additional constraint that the starting assumptions should not contradict experimental findings about the brain. First assumption: The brain lacks direct representation of paths but represents directions (called speed fields in control theory). Then control should be concerned with speed-field tracking (SFT). Second assumption: Control signals are delivered upon differencing in competing parallel channels of the BTC loops. This is modeled by extending SFT with differencing that gives rise to the robust Static and Dynamic State (SDS) feedback-controlling scheme. Third assumption: Control signals are expressed in terms of a gelatinous medium surrounding the limbs. This is modeled by expressing parameters of motion in parameters of the external space. We show that corollaries of the model fit properties of the BTC loops. The SDS provides proper identification of motion related neuronal groups of the putamen. Local minima arise during the controlling process that works in external space. The model explains the presence of parallel channels as the means to avoiding such local minima. Stability conditions of the SDS predict that the initial phase of learning is mostly concerned with selection of sign for the inverse dynamics. The model provides a scalable controller. State description in external space instead of configurational space reduces the dimensionality problem. Falsifying experiment is suggested. Computer experiments demonstrate the feasibility of the approach. We argue that the resulting scheme has a straightforward connectionist representation exhibiting population coding and Hebbian learning properties. },\n\tauthor = {L{\\"o}rincz, A. and H{\\'e}vizi, Gy. and Szepesv{\\'a}ri, Cs.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {International Journal of Neural Systems},\n\tkeywords = {biological modeling, neural networks},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.31},\n\ttitle = {Ockham's Razor Modeling of the Matrisome Channels of the Basal Ganglia Thalamocortical Loop},\n\turl_link = {http://ejournals.wspc.com.sg/ijns/11/1102/S0129065701000412.html},\n\tvolume = {11},\n\tyear = {2001},\n\tBdsk-Url-1 = {http://ejournals.wspc.com.sg/ijns/11/1102/S0129065701000412.html}}\n\n
\n
\n\n\n
\n A functional model of the basal ganglia-thalamocortical (BTC) loops is described. In our modeling effort, we try to minimize the complexity of our starting hypotheses. For that reason, we call this type of modeling Ockham's razor modeling. We have the additional constraint that the starting assumptions should not contradict experimental findings about the brain. First assumption: The brain lacks direct representation of paths but represents directions (called speed fields in control theory). Then control should be concerned with speed-field tracking (SFT). Second assumption: Control signals are delivered upon differencing in competing parallel channels of the BTC loops. This is modeled by extending SFT with differencing that gives rise to the robust Static and Dynamic State (SDS) feedback-controlling scheme. Third assumption: Control signals are expressed in terms of a gelatinous medium surrounding the limbs. This is modeled by expressing parameters of motion in parameters of the external space. We show that corollaries of the model fit properties of the BTC loops. The SDS provides proper identification of motion related neuronal groups of the putamen. Local minima arise during the controlling process that works in external space. The model explains the presence of parallel channels as the means to avoiding such local minima. Stability conditions of the SDS predict that the initial phase of learning is mostly concerned with selection of sign for the inverse dynamics. The model provides a scalable controller. State description in external space instead of configurational space reduces the dimensionality problem. Falsifying experiment is suggested. Computer experiments demonstrate the feasibility of the approach. We argue that the resulting scheme has a straightforward connectionist representation exhibiting population coding and Hebbian learning properties. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Prediction of Protein Functional Domains from Sequences Using Artificial Neural Networks.\n \n \n \n \n\n\n \n Murvai, J.; Vlahovicek, K.; Szepesvári, C.; and Pongor, S.\n\n\n \n\n\n\n Genome Research, 11(8): 1410–1417. 2001.\n \n\n\n\n
\n\n\n\n \n \n \"Prediction link\n  \n \n \n \"Prediction paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{j.murvai2001,\n\tabstract = {An artificial neural network (ANN) solution is described for the recognition of domains in protein sequences. A query sequence is first compared to a reference database of domain sequences by use of and the output data, encoded in the form of six parameters, are forwarded to feed-forward artificial neural networks with six input and six hidden units with sigmoidal transfer function. The recognition is based on the distribution of scores precomputed for the known domain groups in a database versus database comparison. Applications to the prediction of function are discussed.},\n\tauthor = {Murvai, J. and Vlahovicek, K. and Szepesv{\\'a}ri, Cs. and Pongor, S.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\thtml = {http://www.genome.org/cgi/reprint/11/8/1410},\n\tjournal = {Genome Research},\n\tkeywords = {bioinformatics, domain prediction, neural networks, application},\n\tnumber = {8},\n\towner = {Beata},\n\tpages = {1410--1417},\n\ttimestamp = {2010.08.31},\n\ttitle = {Prediction of Protein Functional Domains from Sequences Using Artificial Neural Networks},\n\turl_link = {http://www.genome.org/cgi/reprint/11/8/1410.pdf},\n\turl_paper = {protdompred.pdf},\n\tvolume = {11},\n\tyear = {2001},\n\tBdsk-Url-1 = {http://www.genome.org/cgi/reprint/11/8/1410.pdf}}\n\n
\n
\n\n\n
\n An artificial neural network (ANN) solution is described for the recognition of domains in protein sequences. A query sequence is first compared to a reference database of domain sequences, and the output data, encoded in the form of six parameters, are forwarded to feed-forward artificial neural networks with six input and six hidden units with a sigmoidal transfer function. The recognition is based on the distribution of scores precomputed for the known domain groups in a database versus database comparison. Applications to the prediction of function are discussed.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficient Approximate Planning in Continuous Space Markovian Decision Problems.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n AI Communications, 13(3): 163–176. 2001.\n \n\n\n\n
\n\n\n\n \n \n \"Efficient paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari2001,\n\tabstract = {Monte-Carlo planning algorithms for planning in continuous state-space, discounted Markovian Decision Problems (MDPs) having a smooth transition law and a finite action space are considered. We prove various polynomial complexity results for the considered algorithms, improving upon several known bounds.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {AI Communications},\n\tkeywords = {continuous-space MDPs, Lipschitz-continuous MDPs, Monte-Carlo methods, performance bounds, theory, curse of dimensionality, complexity analysis},\n\tnumber = {3},\n\tpages = {163--176},\n\ttitle = {Efficient Approximate Planning in Continuous Space Markovian Decision Problems},\n\turl_paper = {aicom.pdf},\n\tvolume = {13},\n\tyear = {2001}}\n\n
\n
\n\n\n
\n Monte-Carlo planning algorithms for planning in continuous state-space, discounted Markovian Decision Problems (MDPs) having a smooth transition law and a finite action space are considered. We prove various polynomial complexity results for the considered algorithms, improving upon several known bounds.\n
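A minimal sketch of the kind of Monte-Carlo planner considered in such settings: one-step lookahead over a finite action set with sampled transitions of a continuous-state, discounted MDP. The simulator interface, the downstream value estimate and the sample count are assumptions for illustration, not the paper's specific algorithm.

import numpy as np

def mc_lookahead_action(state, actions, simulate, value_estimate,
                        gamma=0.95, n_samples=100, seed=0):
    # Pick the action maximising a Monte-Carlo estimate of
    #   E[ r(state, a) + gamma * V(next_state) ].
    # `simulate(state, a, rng)` returns a (reward, next_state) sample and
    # `value_estimate(s)` approximates the value function; both are user-supplied.
    rng = np.random.default_rng(seed)
    best_a, best_q = None, -np.inf
    for a in actions:
        total = 0.0
        for _ in range(n_samples):
            r, s_next = simulate(state, a, rng)
            total += r + gamma * value_estimate(s_next)
        q = total / n_samples
        if q > best_q:
            best_a, best_q = a, q
    return best_a, best_q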
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n LS-N-IPS: an Improvement of Particle Filters by Means of Local Search.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In 5th IFAC Symposium on Nonlinear Control Systems (NOLCOS'01), pages 715–719, 2001. \n \n\n\n\n
\n\n\n\n \n \n \"LS-N-IPS: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2001,\n\tabstract = {A modification of N-IPS, a well known particle filter method is proposed and is shown to be more efficient than the baseline algorithm in the small sample size limit and when the observations are ``reliable''. The algorithm called LS-N-IPS adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. The uniform stability of LS-N-IPS is studied and results of experiments are reported both for a simulated and a real-world (visual) tracking problem.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {5th IFAC Symposium on Nonlinear Control Systems (NOLCOS'01)},\n\tkeywords = {vision, particle filtering, theory, application},\n\tpages = {715--719},\n\ttimestamp = {2010.08.31},\n\ttitle = {LS-N-IPS: an Improvement of Particle Filters by Means of Local Search},\n\turl_paper = {LSN-IPS.ps.pdf},\n\tyear = {2001}}\n\n
\n
\n\n\n
\n A modification of N-IPS, a well-known particle filter method, is proposed and is shown to be more efficient than the baseline algorithm in the small sample size limit and when the observations are ``reliable''. The algorithm, called LS-N-IPS, adds local search to the baseline algorithm: in each time step the predictions are refined in a local search procedure that utilizes the most recent observation. The uniform stability of LS-N-IPS is studied and results of experiments are reported both for a simulated and a real-world (visual) tracking problem.\n
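A minimal sketch of one filtering step in the spirit of LS-N-IPS: predict each particle, refine the prediction by a local search around the latest observation, then weight and resample. The dynamics, the local-search routine and the observation likelihood are user-supplied placeholders, and no claim is made that this matches the paper's exact algorithm or its stability analysis.

import numpy as np

def ls_n_ips_step(particles, observation, propagate, local_search, likelihood, rng):
    # propagate(p, rng)      -- sample the system dynamics
    # local_search(p, obs)   -- refine a predicted particle using the newest observation
    # likelihood(obs, p)     -- observation model evaluated at particle p
    n = len(particles)
    predicted = [local_search(propagate(p, rng), observation) for p in particles]
    weights = np.array([likelihood(observation, p) for p in predicted], dtype=float)
    weights /= weights.sum()
    idx = rng.choice(n, size=n, p=weights)          # resampling step
    return [predicted[i] for i in idx]

# usage: particles = ls_n_ips_step(particles, y_t, propagate, local_search, likelihood,
#                                  np.random.default_rng(0))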
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficient Object Tracking in Video Sequences by means of LS-N-IPS.\n \n \n \n \n\n\n \n Torma, P.; and Szepesvári, C.\n\n\n \n\n\n\n In Proc. Second International Symposium on Image and Signal Processing and Analysis (ISAP'01), pages 277–282, 2001. \n \n\n\n\n
\n\n\n\n \n \n \"Efficient paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{torma2001a,\n\tabstract = {A recently introduced particle filtering method, called LS-N-IPS, is considered for tracking objects on video sequences. LS-N-IPS is a computationally efficient particle filter that performs better than the standard N-IPS particle filter, when observations are highly peaky, as it is the case of visual object tracking problems with good observation models. An implementation of LS-N-IPS that uses B-spline based contour models is proposed and is shown to perform very well as compared to similar state-of-the art tracking algorithms.},\n\tauthor = {Torma, P. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proc. Second International Symposium on Image and Signal Processing and Analysis (ISAP'01)},\n\tkeywords = {vision, particle filtering, theory, application},\n\tpages = {277--282},\n\ttimestamp = {2010.08.31},\n\ttitle = {Efficient Object Tracking in Video Sequences by means of LS-N-IPS},\n\turl_paper = {VisTrLSN-IPS.ps.pdf},\n\tyear = {2001}}\n\n
\n
\n\n\n
\n A recently introduced particle filtering method, called LS-N-IPS, is considered for tracking objects in video sequences. LS-N-IPS is a computationally efficient particle filter that performs better than the standard N-IPS particle filter when observations are highly peaky, as is the case in visual object tracking problems with good observation models. An implementation of LS-N-IPS that uses B-spline-based contour models is proposed and is shown to perform very well compared to similar state-of-the-art tracking algorithms.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2000\n \n \n (9)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n FlexVoice: a Parametric Approach to High-quality Speech-synthesis.\n \n \n \n \n\n\n \n Balogh, G.; Dobler, E.; Grőbler, T.; Smodics, B.; and Szepesvári, C.\n\n\n \n\n\n\n In Text, Speech and Dialogue, volume 1902, pages 119–172, 2000. Springer\n \n\n\n\n
\n\n\n\n \n \n \"FlexVoice: paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{balogh2000a,\n\tabstract = {FlexVoice, an integrated text-to-speech (TTS) system is presented in this paper. Its most distinctive feature is its low memory and CPU load while preserving the high quality of leading TTS systems. FlexVoice uses a hybrid approach that combines diphone concatenation with LPC-based parametric synthesis. Major improvements of speech quality are achieved by the careful design of each module at all synthesis levels (such as selection of training data for the various machine learning methods and that of the basic synthesis units for the parametric synthesiser). FlexVoice currently supports US English with two male and two female voices. },\n\tauthor = {Balogh, Gy. and Dobler, E. and Gr\\H obler, T. and Smodics, B. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Text, Speech and Dialogue},\n\tdate-added = {2010-09-04 12:17:25 -0600},\n\tdate-modified = {2010-09-04 12:19:26 -0600},\n\tdoi = {10.1007/3-540-45323-7_32},\n\tkeywords = {application, text-to-speech synthesis},\n\tpages = {119--172},\n\tpublisher = {Springer},\n\ttitle = {FlexVoice: a Parametric Approach to High-quality Speech-synthesis},\n\turl_paper = {flexvoice-tsd.pdf},\n\tvolume = {1902},\n\tyear = {2000},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1007/3-540-45323-7_32}}\n\n
\n
\n\n\n
\n FlexVoice, an integrated text-to-speech (TTS) system is presented in this paper. Its most distinctive feature is its low memory and CPU load while preserving the high quality of leading TTS systems. FlexVoice uses a hybrid approach that combines diphone concatenation with LPC-based parametric synthesis. Major improvements of speech quality are achieved by the careful design of each module at all synthesis levels (such as selection of training data for the various machine learning methods and that of the basic synthesis units for the parametric synthesiser). FlexVoice currently supports US English with two male and two female voices. \n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Computer Aided Diagnosis of Clustered Microcalcifications Using Artificial Neural Nets.\n \n \n \n \n\n\n \n Sorantin, E.; Schmidt, F.; Mayer, H.; Becker, M.; Szepesvári, C.; Graif, E.; and Winkler, P.\n\n\n \n\n\n\n Journal of Computing and Information Technology, 8(2). 2000.\n \n\n\n\n
\n\n\n\n \n \n \"Computer link\n  \n \n \n \"Computer paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{sor2000,\n\tabstract = {Objective: Development of a fully automated computer application for detection and classification of clustered microcalcifications using neural nets. Material and Methods: Mammographic films with clustered microcalcifications of known histology were digitized. All clusters were rated by two radiologists on a 3 point scale: benign, indeterminate and malignant. Automated detected clustered microcalcifications were clustered. Features derived from those clusters were used as input to 2 artificial neural nets: one was trained to identify the indeterminate clusters, whereas the second ANN classified the remaining clusters in benign or malignant ones. Performance evaluation followed the patient-based receiver operator characteristic analysis. Results: For identification of patients with indeterminate clusters a an Az-value of 0.8741 could be achieved. For the remaining patients their clusters could be classified as benign or malignant at an Az-value of 0.8749, a sensitivity of 0.977 and specificity of 0.471. Conclusions: A fully automated computer system for detection and classification of clustered microcalcifications was developed. The system is able to identify patients with indeterminate clusters, where additional investigations are recommended, and produces a reliable estimation of the biologic dignity for the remaining ones.},\n\tauthor = {Sorantin, E. and Schmidt, F. and Mayer, H. and Becker, M. and Szepesv{\\'a}ri, Cs. and Graif, E. and Winkler, P.},\n\tdate-added = {2010-09-02 10:15:27 -0600},\n\tdate-modified = {2010-09-02 13:09:59 -0600},\n\tjournal = {Journal of Computing and Information Technology},\n\tkeywords = {application, neural networks, image processing, health informatics, clinical decision support},\n\tnumber = {2},\n\ttitle = {Computer Aided Diagnosis of Clustered Microcalcifications Using Artificial Neural Nets},\n\turl_link = {http://cit.srce.hr/index.php/CIT/article/view/1415},\n\turl_paper = {JCIT2000.pdf},\n\tvolume = {8},\n\tyear = {2000},\n\tBdsk-Url-1 = {http://cit.srce.hr/index.php/CIT/article/view/1415}}\n\n
\n
\n\n\n
\n Objective: Development of a fully automated computer application for detection and classification of clustered microcalcifications using neural nets. Material and Methods: Mammographic films with clustered microcalcifications of known histology were digitized. All clusters were rated by two radiologists on a 3-point scale: benign, indeterminate and malignant. Automatically detected microcalcifications were clustered. Features derived from those clusters were used as input to two artificial neural nets: one was trained to identify the indeterminate clusters, whereas the second ANN classified the remaining clusters as benign or malignant. Performance evaluation followed a patient-based receiver operating characteristic (ROC) analysis. Results: For identification of patients with indeterminate clusters an Az-value of 0.8741 could be achieved. For the remaining patients, their clusters could be classified as benign or malignant at an Az-value of 0.8749, a sensitivity of 0.977 and a specificity of 0.471. Conclusions: A fully automated computer system for detection and classification of clustered microcalcifications was developed. The system is able to identify patients with indeterminate clusters, where additional investigations are recommended, and produces a reliable estimation of the biologic dignity for the remaining ones.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n FlexVoice: a Parametric Approach to High-quality Speech-synthesis.\n \n \n \n \n\n\n \n Balogh, G.; Dobler, E.; Grőbler, T.; Smodics, B.; and Szepesvári, C.\n\n\n \n\n\n\n In pages 15/1–15/6, London, 04 2000. IEE Electronics and Communications\n \n\n\n\n
\n\n\n\n \n \n \"FlexVoice: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{balogh2000,\n\tabstract = {The TTS system described in this paper is based on the analysis and resynthesis of a given speaker's voice. FlexVoice provides large flexibility in changing voice properties independently from the vocal tract parameters. This flexibility can be demonstrated by a number of voice conversions including female-to-male and female-to-child conversions. FlexVoice only uses a fraction of the resources of a PC and its quality is comparable to that of the leading TTS systems.},\n\taddress = {London},\n\tauthor = {Balogh, Gy. and Dobler, E. and Gr\\H obler, T. and Smodics, B. and Szepesv{\\'a}ri, Cs.},\n\tkeywords = {application, text-to-speech synthesis},\n\tmonth = {04},\n\tpages = {15/1--15/6},\n\tpublisher = {IEE Electronics and Communications},\n\ttitle = {FlexVoice: a Parametric Approach to High-quality Speech-synthesis},\n\turl_paper = {flexvoice-iee.ps.pdf},\n\tyear = {2000}}\n\n
\n
\n\n\n
\n The TTS system described in this paper is based on the analysis and resynthesis of a given speaker's voice. FlexVoice provides large flexibility in changing voice properties independently from the vocal tract parameters. This flexibility can be demonstrated by a number of voice conversions including female-to-male and female-to-child conversions. FlexVoice only uses a fraction of the resources of a PC and its quality is comparable to that of the leading TTS systems.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Uncertainty, Performance, and Model Dependency in Approximate Adaptive Nonlinear Control.\n \n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n IEEE Transactions on Automatic Control, 45(2): 353–358. 2000.\n \n\n\n\n
\n\n\n\n \n \n \"Uncertainty, paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{french2000,\n\tabstract = {We consider systems satisfying a matching condition which are functionally known up to weighted L2 and L1 measures of uncertainty. A modified LQ measure of control and state transient performance is given, and the performance of a class of approximate model based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty models (stability and the state transient bounds require only the L2 uncertainty model; control effort bounds require both L2 and L1 uncertainty models), and various structural properties of the model basis. Sufficient conditions are given to ensure that the performance is bounded independently of the model basis size.},\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {IEEE Transactions on Automatic Control},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, strict feedback, chain of integrators, matched uncertainty, theory, control, nonparametrics, stabilization},\n\tnumber = {2},\n\tpages = {353--358},\n\ttitle = {Uncertainty, Performance, and Model Dependency in Approximate Adaptive Nonlinear Control},\n\turl_paper = {fsr97.ps.pdf},\n\tvolume = {45},\n\tyear = {2000}}\n\n
\n
\n\n\n
\n We consider systems satisfying a matching condition which are functionally known up to weighted L2 and L1 measures of uncertainty. A modified LQ measure of control and state transient performance is given, and the performance of a class of approximate model based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty models (stability and the state transient bounds require only the L2 uncertainty model; control effort bounds require both L2 and L1 uncertainty models), and various structural properties of the model basis. Sufficient conditions are given to ensure that the performance is bounded independently of the model basis size.\n
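To fix ideas, the performance measure and uncertainty model discussed above have roughly the following shape (a sketch only; the exact weights, norms and functionals used in the paper may differ):

\[
J \;=\; \int_0^{\infty} \bigl( \lVert x(t) \rVert_Q^2 + \lVert u(t) \rVert_R^2 \bigr)\, dt,
\qquad
\lVert f - \hat f \rVert_{2,W} \le \delta_2, \quad \lVert f - \hat f \rVert_{1,W'} \le \delta_1,
\]

where $\hat f$ is the approximate model the adaptive controller is built on; the bound on $J$ is then expressed in terms of $\delta_2$ (for stability and the state transient), additionally $\delta_1$ (for the control effort), and structural properties of the model basis.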
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n LQ performance Bounds for Adaptive Output Feedback Controllers for Functionally Uncertain Systems.\n \n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n Automatica. 2000.\n \n\n\n\n
\n\n\n\n \n \n \"LQ pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{french2000a,\n\tabstract = {We consider functionally uncertain systems which can be written in an output feedback form, where the nonlinearities are functions of the output only. The uncertainty is described by a weighted L 2 norm about a nominal system, and an approximate adaptive design is given which ensures output practical stability. The main result requires knowledge of the weighted L 2 uncertainty level. An upper bound on the LQ performance of the output transient and the control input is derived, where the cost penalises the output transient and the control effort on the time interval where the output lies outside the prescribed neighbourhood of zero to which we achieve convergence.},\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Automatica},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, output feedback, theory, control, nonparametrics, output tracking, output stabilization},\n\ttitle = {LQ performance Bounds for Adaptive Output Feedback Controllers for Functionally Uncertain Systems},\n\turl_pdf = {http://eprints.ecs.soton.ac.uk/9066/1/mcfcsetar_aut2002.pdf},\n\tyear = {2000}}\n\n
\n
\n\n\n
\n We consider functionally uncertain systems which can be written in an output feedback form, where the nonlinearities are functions of the output only. The uncertainty is described by a weighted L2 norm about a nominal system, and an approximate adaptive design is given which ensures output practical stability. The main result requires knowledge of the weighted L2 uncertainty level. An upper bound on the LQ performance of the output transient and the control input is derived, where the cost penalises the output transient and the control effort on the time interval where the output lies outside the prescribed neighbourhood of zero to which we achieve convergence.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An Asymptotic Scaling Analysis of LQ performance for an Approximate Adaptive Control Design.\n \n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n Mathematics of Control, Signals and Systems, 15: 1–2. 2000.\n \n\n\n\n
\n\n\n\n \n \n \"An pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{french2000b,\n\tabstract = {We consider the adaptive tracking problem for a chain of integrators, where the uncertainty is static and functional. The uncertainty is specified by L2/L1 or weighted L2/L1 norm bounds. We analyse a standard Lyapunov based adaptive design which utilizes a function approximator to induce a parametric uncertainty, on which the adaptive design is completed. Performance is measured by a modified LQ cost functional, penalising both the tracking error transient and the control effort. With such a cost functional, it is shown that a standard control design has divergent performance when the resolution of a `mono-resolution' approximator is increased. The class of `mono-resolution' approximators includes models popular in applications. A general construction of a class of approximators and their associated controllers which have a uniformly bounded performance independent of the resolution of the approximator is given.},\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-05 01:18:16 -0600},\n\tjournal = {Mathematics of Control, Signals and Systems},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, chain of integrators, tracking, multiresolution function approximation, theory, nonparametrics},\n\tpages = {1--2},\n\ttitle = {An Asymptotic Scaling Analysis of LQ performance for an Approximate Adaptive Control Design},\n\turl_pdf = {http://eprints.ecs.soton.ac.uk/6643/1/csmcfetar_mcss2002.pdf},\n\tvolume = {15},\n\tyear = {2000}}\n\n
\n
\n\n\n
\n We consider the adaptive tracking problem for a chain of integrators, where the uncertainty is static and functional. The uncertainty is specified by L2/L1 or weighted L2/L1 norm bounds. We analyse a standard Lyapunov based adaptive design which utilizes a function approximator to induce a parametric uncertainty, on which the adaptive design is completed. Performance is measured by a modified LQ cost functional, penalising both the tracking error transient and the control effort. With such a cost functional, it is shown that a standard control design has divergent performance when the resolution of a `mono-resolution' approximator is increased. The class of `mono-resolution' approximators includes models popular in applications. A general construction of a class of approximators and their associated controllers which have a uniformly bounded performance independent of the resolution of the approximator is given.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Scaling of LQ Performance in Approximate Adaptive Designs.\n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n In Proceedings of the International Symposium on the Mathematical Theory of Networks and Systems (MTNS 2000), 2000. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{french2000c,\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tbooktitle = {Proceedings of the International Symposium on the Mathematical Theory of Networks and Systems (MTNS 2000)},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, chain of integrators, tracking, multiresolution function approximation, theory, nonparametrics},\n\ttitle = {Scaling of LQ Performance in Approximate Adaptive Designs},\n\tyear = {2000}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Convergence Results for Single-Step On-Policy Reinforcement-Learning Algorithms.\n \n \n \n \n\n\n \n Singh, S. P.; Jaakkola, T.; Littman, M.; and Szepesvári, C.\n\n\n \n\n\n\n Machine Learning, 38(3): 287–308. 2000.\n \n\n\n\n
\n\n\n\n \n \n \"Convergence paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{singh2000,\n\tabstract = {An important application of reinforcement learning (RL) is to finite-state control problems and one of the most difficult problems in learning for control is balancing the exploration/exploitation tradeoff. Existing theoretical results for RL give very little guidance on reasonable ways to perform exploration. In this paper, we examine the convergence of single-step on-policy RL algorithms for control. On-policy algorithms cannot separate exploration from learning and therefore must confront the exploration problem directly. We prove convergence results for several related on-policy algorithms with both decaying exploration and persistent exploration. We also provide examples of exploration strategies that can be followed during learning that result in convergence to both optimal values and optimal policies.},\n\tauthor = {Singh, S. P. and Jaakkola, T. and Littman, M.L. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tdoi = {10.1023/A:1007678930559},\n\tjournal = {Machine Learning},\n\tkeywords = {theory, reinforcement learning, asymptotic convergence, finite MDPs, stochastic approximation},\n\tnumber = {3},\n\tpages = {287--308},\n\ttitle = {Convergence Results for Single-Step On-Policy Reinforcement-Learning Algorithms},\n\turl_paper = {singh98convergence.pdf},\n\tvolume = {38},\n\tyear = {2000},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1023/A:1007678930559}}\n\n
\n
\n\n\n
\n An important application of reinforcement learning (RL) is to finite-state control problems and one of the most difficult problems in learning for control is balancing the exploration/exploitation tradeoff. Existing theoretical results for RL give very little guidance on reasonable ways to perform exploration. In this paper, we examine the convergence of single-step on-policy RL algorithms for control. On-policy algorithms cannot separate exploration from learning and therefore must confront the exploration problem directly. We prove convergence results for several related on-policy algorithms with both decaying exploration and persistent exploration. We also provide examples of exploration strategies that can be followed during learning that result in convergence to both optimal values and optimal policies.\n
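A minimal sketch of the kind of single-step on-policy algorithm the paper analyses: tabular SARSA(0) with decaying epsilon-greedy exploration and visit-count step sizes. The environment interface and the particular decay schedules are illustrative assumptions.

import numpy as np

def sarsa(env, n_states, n_actions, episodes=500, gamma=0.95, seed=0):
    # env.reset() -> state; env.step(a) -> (next_state, reward, done)
    rng = np.random.default_rng(seed)
    Q = np.zeros((n_states, n_actions))
    visits = np.zeros((n_states, n_actions))

    def policy(s, eps):
        return int(rng.integers(n_actions)) if rng.random() < eps else int(np.argmax(Q[s]))

    for ep in range(episodes):
        eps = 1.0 / (1 + ep)                  # decaying ("greedy in the limit") exploration
        s, done = env.reset(), False
        a = policy(s, eps)
        while not done:
            s2, r, done = env.step(a)
            a2 = policy(s2, eps)
            visits[s, a] += 1
            alpha = 1.0 / visits[s, a]        # step size decaying with the visit count
            target = r + (0.0 if done else gamma * Q[s2, a2])
            Q[s, a] += alpha * (target - Q[s, a])
            s, a = s2, a2
    return Q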
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Convergent Reinforcement Learning with Value Function Interpolation.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Technical Report TR-2001-02, Mindmaker Ltd., Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY, 2000.\n \n\n\n\n
\n\n\n\n \n \n \"Convergent paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{Szepesvari2000,\n\tabstract = {We consider the convergence of a class of reinforcement learning algorithms combined with value function interpolation methods using the methods developed in (Littman and {Sz}epesv{\\'a}ri, 1996). As a special case of the obtained general results, for the first time, we prove the (almost sure) convergence of Q-learning when combined with value function interpolation in uncountable spaces.},\n\taddress = {Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-modified = {2010-09-04 14:48:33 -0600},\n\tinstitution = {Mindmaker Ltd.},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence, function approximation, stochastic approximation},\n\tnumber = {TR-2001-02},\n\ttimestamp = {2010.08.30},\n\ttitle = {Convergent Reinforcement Learning with Value Function Interpolation},\n\turl_paper = {rlfapp.pdf},\n\tyear = {2000}}\n\n
\n
\n\n\n
\n We consider the convergence of a class of reinforcement learning algorithms combined with value function interpolation methods using the methods developed in (Littman and Szepesvári, 1996). As a special case of the obtained general results, for the first time, we prove the (almost sure) convergence of Q-learning when combined with value function interpolation in uncountable spaces.\n
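A minimal sketch of Q-learning combined with value-function interpolation: the action values are represented at a finite set of grid points and each temporal-difference update is spread over the grid in proportion to interpolation weights. The kernel-based weights and all constants are illustrative assumptions, not the scheme of the report.

import numpy as np

def interpolation_weights(state, grid, bandwidth=0.25):
    # Normalised kernel weights of `state` w.r.t. the grid points (placeholder scheme).
    w = np.exp(-np.sum((grid - state) ** 2, axis=1) / (2 * bandwidth ** 2))
    return w / w.sum()

def interpolated_q_update(theta, grid, s, a, r, s_next, gamma=0.95, alpha=0.1):
    # One Q-learning step when Q(s, b) = sum_i w_i(s) * theta[i, b].
    # theta has shape (num_grid_points, num_actions).
    w, w_next = interpolation_weights(s, grid), interpolation_weights(s_next, grid)
    q_sa = w @ theta[:, a]
    target = r + gamma * np.max(w_next @ theta)   # bootstrapped target
    theta[:, a] += alpha * (target - q_sa) * w    # spread the TD error over the grid
    return theta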
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1999\n \n \n (6)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n An Automatic Method for the Identification and Interpretation of Clustered Microcalcifications in Mammograms.\n \n \n \n \n\n\n \n Schmidt, F.; Sorantin, E.; Szepesvári, C.; Graif, E.; Becker, M.; Mayer, H.; and Hartwagner, K.\n\n\n \n\n\n\n Physics in Medicine and Biology, 44(5): 1231–1243. 1999.\n \n\n\n\n
\n\n\n\n \n \n \"An link\n  \n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Schmidt99,\n\tabstract = {We investigated a method for a fully automatic identification and interpretation process for clustered microcalcifications in mammograms. Mammographic films of 100 patients containing microcalcifications with known histology were digitized and preprocessed using standard techniques. Microcalcifications detected by an artificial neural network (ANN) were clustered and some cluster features served as the input of another ANN trained to differentiate between typical and atypical clusters, while others were fed into an ANN trained on typical clusters to evaluate these lesions. The measured sensitivity for the detection of grouped microcalcifications was 0.98. For the task of differentiation between typical and atypical clusters an Az value of 0.87 was computed, while for the diagnosis an Az value of 0.87 with a sensitivity of 0.97 and a specificity of 0.47 was obtained. The results show that a fully automatic computer system was developed for the identification and interpretation of clustered microcalcifications in mammograms with the ability to differentiate most benign lesions from malignant ones in an automatically selected subset of cases.},\n\tauthor = {Schmidt, F. and Sorantin, E. and Szepesv{\\'a}ri, Cs. and Graif, E. and Becker, M. and Mayer, H. and Hartwagner, K.},\n\tdate-added = {2010-09-02 09:29:13 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tjournal = {Physics in Medicine and Biology},\n\tkeywords = {application, neural networks, image processing, health informatics, clinical decision support},\n\tnumber = {5},\n\tpages = {1231--1243},\n\ttitle = {An Automatic Method for the Identification and Interpretation of Clustered Microcalcifications in Mammograms},\n\turl_link = {http://stacks.iop.org/0031-9155/44/i=5/a=011},\n\turl_paper = {schmidt-99.pdf},\n\tvolume = {44},\n\tyear = {1999},\n\tBdsk-Url-1 = {http://stacks.iop.org/0031-9155/44/i=5/a=011}}\n\n
\n
\n\n\n
\n We investigated a method for a fully automatic identification and interpretation process for clustered microcalcifications in mammograms. Mammographic films of 100 patients containing microcalcifications with known histology were digitized and preprocessed using standard techniques. Microcalcifications detected by an artificial neural network (ANN) were clustered and some cluster features served as the input of another ANN trained to differentiate between typical and atypical clusters, while others were fed into an ANN trained on typical clusters to evaluate these lesions. The measured sensitivity for the detection of grouped microcalcifications was 0.98. For the task of differentiation between typical and atypical clusters an Az value of 0.87 was computed, while for the diagnosis an Az value of 0.87 with a sensitivity of 0.97 and a specificity of 0.47 was obtained. The results show that a fully automatic computer system was developed for the identification and interpretation of clustered microcalcifications in mammograms with the ability to differentiate most benign lesions from malignant ones in an automatically selected subset of cases.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Comparing Value-Function Estimation Algorithms in Undiscounted Problems.\n \n \n \n \n\n\n \n Beleznay, F.; Grőbler, T.; and Szepesvári, C.\n\n\n \n\n\n\n Technical Report TR-99-02, Mindmaker Ltd., Budapest 1121, Konkoly Th. M. u. 29-33, Hungary, 1999.\n \n\n\n\n
\n\n\n\n \n \n \"Comparing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{beleznay1999,\n\tabstract = {We compare scaling properties of several value-function estimation algorithms. In particular, we prove that Q-learning can scale exponentially slowly with the number of states. We identify the reasons of the slow convergence and show that both TD($\\lambda$) and learning with a fixed learning-rate enjoy rather fast convergence, just like the model-based method.},\n\taddress = {Budapest 1121, Konkoly Th. M. u. 29-33, Hungary},\n\tauthor = {Beleznay, F. and Gr\\H obler, T. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tinstitution = {Mindmaker Ltd.},\n\tkeywords = {theory, rate of convergence, reinforcement learning},\n\tnumber = {TR-99-02},\n\ttitle = {Comparing Value-Function Estimation Algorithms in Undiscounted Problems},\n\turl_paper = {slowql-tr99-02.ps.pdf},\n\tyear = {1999}}\n\n
\n
\n\n\n
\n We compare scaling properties of several value-function estimation algorithms. In particular, we prove that Q-learning can scale exponentially slowly with the number of states. We identify the reasons for the slow convergence and show that both TD(λ) and learning with a fixed learning rate enjoy rather fast convergence, just like the model-based method.\n
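For contrast with slow Q-learning-style averaging, here is a minimal sketch of one of the faster alternatives the report compares against: tabular TD(λ) value estimation with accumulating traces and a fixed learning rate (the environment interface and constants are illustrative; γ = 1 corresponds to the undiscounted case).

import numpy as np

def td_lambda(env, n_states, episodes=200, lam=0.9, alpha=0.1, gamma=1.0):
    # env.reset() -> state; env.step() -> (next_state, reward, done),
    # with actions chosen by the (fixed) policy being evaluated.
    V = np.zeros(n_states)
    for _ in range(episodes):
        e = np.zeros(n_states)              # eligibility traces
        s, done = env.reset(), False
        while not done:
            s2, r, done = env.step()
            delta = r + (0.0 if done else gamma * V[s2]) - V[s]
            e[s] += 1.0                     # accumulating trace
            V += alpha * delta * e          # every visited state updated through its trace
            e *= gamma * lam
            s = s2
    return V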
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n The SBASE Protein Domain Sequence Library Release 6.0.\n \n \n \n\n\n \n Murvai, J.; Barta, E.; Vlahovicek, K.; Szepesvári, C.; Acatrinei, C.; and Pongor, S.\n\n\n \n\n\n\n Nucleic Acids Research, 27(1): 257–259. 1999.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{j.murvai1999,\n\tauthor = {Murvai, J. and Barta, E. and Vlahovicek, K. and Szepesv{\\'a}ri, Cs. and Acatrinei, C. and Pongor, S.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Nucleic Acids Research},\n\tkeywords = {bioinformatics, application},\n\tnumber = {1},\n\towner = {Beata},\n\tpages = {257--259},\n\ttimestamp = {2010.08.31},\n\ttitle = {The SBASE Protein Domain Sequence Library Release 6.0.},\n\tvolume = {27},\n\tyear = {1999}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Parallel and Robust Skeletonization Built on Self-organizing Elements.\n \n \n \n \n\n\n \n Kalmár, Z.; Marczell, Z.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Neural Networks, 12: 163–173. 1999.\n \n\n\n\n
\n\n\n\n \n \n \"Parallel paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{kalmar1999,\n\tabstract = {A massively parallel neural architecture is suggested for the approximate computation of the skeleton of a planar shape. Numerical examples demonstrate the robustness of the method. The architecture is constructed from self-organizing elements that allow the extension of the concept of skeletonization to areas remote to image processing.},\n\tauthor = {Kalm{\\'a}r, Zs. and Marczell, Zs. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-04 12:23:43 -0600},\n\tdoi = {10.1016/S0893-6080(98)00119-1},\n\tjournal = {Neural Networks},\n\tkeywords = {neural networks, image processing, application},\n\tpages = {163--173},\n\ttitle = {Parallel and Robust Skeletonization Built on Self-organizing Elements},\n\turl_paper = {marczell_SKELETON.pdf},\n\tvolume = {12},\n\tyear = {1999},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/S0893-6080(98)00119-1}}\n\n
\n
\n\n\n
\n A massively parallel neural architecture is suggested for the approximate computation of the skeleton of a planar shape. Numerical examples demonstrate the robustness of the method. The architecture is constructed from self-organizing elements that allow the extension of the concept of skeletonization to areas remote to image processing.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An Evaluation Criterion for Macro-Learning and Some Results.\n \n \n \n \n\n\n \n Kalmár, Z.; and Szepesvári, C.\n\n\n \n\n\n\n Technical Report TR-99-01, Mindmaker Ltd., Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY, 1999.\n \n\n\n\n
\n\n\n\n \n \n \"An paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{kalmarsze1999,\n\tabstract = {It is known that a well-chosen set of macros makes it possible to considerably speed-up the solution of planning problems. Recently, macros have been considered in the planning framework, built on Markovian decision problem. However, so far no systematic approach was put forth to investigate the utility of macros within this framework. In this article we begin to systematically study this problem by introducing the concept of multi-task MDPs defined with a distribution over the tasks. We propose an evaluation criterion for macro-sets that is based on the expected planning speed-up due to the usage of a macro-set, where the expectation is taken over the set of tasks. The consistency of the empirical speed-up maximization algorithm is shown in the finite case. For acyclic systems, the expected planning speed-up is shown to be proportional to the amount of ``time-compression'' due to the macros. Based on these observations a heuristic algorithm for learning of macros is proposed. The algorithm is shown to return macros identical with those that one would like to design by hand in the case of a particular navigation like multi-task MDP. Some related questions, in particular the problem of breaking up MDPs into multiple tasks, factorizing MDPs and learning generalizations over actions to enhance the amount of transfer are also considered in brief at the end of the paper.},\n\taddress = {Budapest 1121, Konkoly Th. M. u. 29-33, HUNGARY},\n\tauthor = {Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs.},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tinstitution = {Mindmaker Ltd.},\n\tkeywords = {macro learning, reinforcement learning, lifelong learning, multitask learning},\n\tnumber = {TR-99-01},\n\towner = {Beata},\n\ttimestamp = {2010.08.30},\n\ttitle = {An Evaluation Criterion for Macro-Learning and Some Results},\n\turl_paper = {macro-tr99-01.ps.pdf},\n\tyear = {1999}}\n\n
\n
\n\n\n
\n It is known that a well-chosen set of macros makes it possible to considerably speed up the solution of planning problems. Recently, macros have been considered in the planning framework built on Markovian decision problems. However, so far no systematic approach has been put forth to investigate the utility of macros within this framework. In this article we begin to systematically study this problem by introducing the concept of multi-task MDPs defined with a distribution over the tasks. We propose an evaluation criterion for macro-sets that is based on the expected planning speed-up due to the usage of a macro-set, where the expectation is taken over the set of tasks. The consistency of the empirical speed-up maximization algorithm is shown in the finite case. For acyclic systems, the expected planning speed-up is shown to be proportional to the amount of ``time-compression'' due to the macros. Based on these observations, a heuristic algorithm for learning macros is proposed. The algorithm is shown to return macros identical to those that one would like to design by hand in the case of a particular navigation-like multi-task MDP. Some related questions, in particular the problem of breaking up MDPs into multiple tasks, factorizing MDPs and learning generalizations over actions to enhance the amount of transfer, are also considered in brief at the end of the paper.\n
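One way to write down an evaluation criterion of the kind described above, purely as an illustration (the notation and normalisation are not taken from the report): for a macro-set $M$ and a task distribution $P$,

\[
U(M) \;=\; \mathbb{E}_{\mathrm{task} \sim P}\!\left[ \frac{T_{\mathrm{plan}}(\mathrm{task};\, \emptyset)}{T_{\mathrm{plan}}(\mathrm{task};\, M)} \right],
\]

where $T_{\mathrm{plan}}(\mathrm{task}; M)$ is the planning time for the task when the macros in $M$ are available; the empirical criterion replaces the expectation by an average over sampled tasks, and macro learning amounts to (approximately) maximising $U$.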
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Unified Analysis of Value-Function-Based Reinforcement-Learning Algorithms.\n \n \n \n \n\n\n \n Szepesvári, C.; and Littman, M.\n\n\n \n\n\n\n Neural Computation, 11: 2017–2059. 1999.\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 6 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1999,\n\tabstract = {Reinforcement learning is the problem of generating optimal behavior in a sequential decision-making environment given the opportunity of interacting with it. Many algorithms for solving reinforcement-learning problems work by computing improved estimates of the optimal value function. We extend prior analyses of reinforcement-learning algorithms and present a powerful new theorem that can provide a unified analysis of value-function-based reinforcement-learning algorithms. The usefulness of the theorem lies in how it allows the asynchronous convergence of a complex reinforcement-learning algorithm to be proven by verifying that a simpler synchronous algorithm converges. We illustrate the application of the theorem by analyzing the convergence of Q-learning, model-based reinforcement learning, Q-learning with multi-state updates, Q-learning for Markov games, and risk-sensitive reinforcement learning.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and Littman, M.L.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tjournal = {Neural Computation},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation},\n\tpages = {2017--2059},\n\ttitle = {A Unified Analysis of Value-Function-Based Reinforcement-Learning Algorithms},\n\turl_paper = {nc-97-gmdp.ps.pdf},\n\tvolume = {11},\n\tyear = {1999}}\n\n
\n
\n\n\n
\n Reinforcement learning is the problem of generating optimal behavior in a sequential decision-making environment given the opportunity of interacting with it. Many algorithms for solving reinforcement-learning problems work by computing improved estimates of the optimal value function. We extend prior analyses of reinforcement-learning algorithms and present a powerful new theorem that can provide a unified analysis of value-function-based reinforcement-learning algorithms. The usefulness of the theorem lies in how it allows the asynchronous convergence of a complex reinforcement-learning algorithm to be proven by verifying that a simpler synchronous algorithm converges. We illustrate the application of the theorem by analyzing the convergence of Q-learning, model-based reinforcement learning, Q-learning with multi-state updates, Q-learning for Markov games, and risk-sensitive reinforcement learning.\n
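As a concrete instance of the asynchronous/synchronous pairing mentioned in the abstract (an illustration only; the paper's theorem covers a much more general class of algorithms), Q-learning performs the asynchronous update

\[
Q_{t+1}(x_t,a_t) \;=\; (1-\alpha_t)\,Q_t(x_t,a_t) + \alpha_t \Bigl( r_t + \gamma \max_{b} Q_t(x_{t+1},b) \Bigr),
\]

while its synchronous counterpart is value iteration with the operator $(TQ)(x,a) = \mathbb{E}\bigl[ r(x,a) + \gamma \max_b Q(Y_{x,a}, b) \bigr]$; the theorem lets the convergence of the former be deduced from the contraction and convergence properties of the latter.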
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1998\n \n \n (8)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Uncertainty and Performance of Adaptive Controllers for Functionally Uncertain Output Feedback Systems.\n \n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n In CDC, pages 4515–4520, Tampa, Florida, 12 1998. IEEE\n \n\n\n\n
\n\n\n\n \n \n \"Uncertainty paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{french1998,\n\tabstract = {We consider nonlinear systems in an output feedback form which are functionally known up to a L2 measure of uncertainty. The control task is to drive the output of the system to some neighbourhood of the origin. A modified L2 measure of transient performance (penalising both state and control effort) is given, and the performance of a class of model based adaptive controllers is studied. An upper performance bound is derived.},\n\taddress = {Tampa, Florida},\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tbooktitle = {CDC},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-11-25 00:56:19 -0700},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, output feedback, theory, control, nonparametrics, output tracking, output stabilization},\n\tmonth = {12},\n\tpages = {4515--4520},\n\tpublisher = {IEEE},\n\ttitle = {Uncertainty and Performance of Adaptive Controllers for Functionally Uncertain Output Feedback Systems},\n\turl_paper = {cdc98.ps.pdf},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n We consider nonlinear systems in an output feedback form which are functionally known up to an L2 measure of uncertainty. The control task is to drive the output of the system to some neighbourhood of the origin. A modified L2 measure of transient performance (penalising both state and control effort) is given, and the performance of a class of model-based adaptive controllers is studied. An upper performance bound is derived.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Multi-criteria Reinforcement Learning.\n \n \n \n \n\n\n \n Gábor, Z.; Kalmár, Z.; and Szepesvári, C.\n\n\n \n\n\n\n Technical Report TR-98-115, ``Attila József'' University, Research Group on Artificial Intelligence, JATE-MTA, Szeged, HU-6700, 1998.\n Revised on 05/04/2004\n\n\n\n
\n\n\n\n \n \n \"Multi-criteria paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{gabor1998,\n\tabstract = {We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special, but important class of problems when the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These type of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose the one among these which is optimal according to another fixed criterion. Possible application in robotics and repeated games are outlined.},\n\taddress = {Szeged, HU-6700},\n\tauthor = {G{\\'a}bor, Z. and Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tinstitution = {``Attila J{\\'o}zsef'' University, Research Group on Artificial Intelligence, JATE-MTA},\n\tkeywords = {game playing, constrained MDPs, reinforcement learning, theory},\n\tnote = {Revised on 05/04/2004},\n\tnumber = {TR-98-115},\n\ttitle = {Multi-criteria Reinforcement Learning},\n\turl_paper = {multi-rep97.ps.pdf},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special but important class of problems in which the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These types of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose the one among these which is optimal according to another fixed criterion. Possible applications in robotics and repeated games are outlined.\n
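A minimal sketch of how a fixed total ordering over vector-valued evaluations can drive greedy action selection; the lexicographic order used here is just one example of such an ordering and is not necessarily the one analysed in the report.

import numpy as np

def lex_greater(u, v):
    # Lexicographic comparison: the first criterion dominates, the second breaks ties, etc.
    for ui, vi in zip(u, v):
        if ui != vi:
            return ui > vi
    return False

def greedy_action(Q, state):
    # Q has shape (states, actions, criteria); pick the action whose value vector
    # is maximal under the ordering above.
    best = 0
    for a in range(1, Q.shape[1]):
        if lex_greater(Q[state, a], Q[state, best]):
            best = a
    return best

# Toy example: criterion 0 (e.g. a constraint) dominates criterion 1 (e.g. reward).
Q = np.array([[[1.0, 0.2], [1.0, 0.7], [0.5, 2.0]]])
print(greedy_action(Q, 0))   # -> 1: ties on the first criterion, better on the second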
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Automated Detection and Classification of Microcalcifications in Mammograms using Artificial Neural Nets.\n \n \n \n \n\n\n \n Sorantin, E.; Schmidt, F.; Mayer, H.; Winkler, P.; Szepesvári, C.; Graif, E.; and Schuetz, E.\n\n\n \n\n\n\n In 4th International Workshop on Digital Mammography, 1998. \n \n\n\n\n
\n\n\n\n \n \n \"Automated link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{sorantin1998,\n\tabstract = {The goal of this study was to develop a fully automated computer application for detection and classification of clustered mc using artificial neural nets (ANN's). Cases where additional investigations are necessary should be identified automatically, too.},\n\tauthor = {Sorantin, E. and Schmidt, F. and Mayer, H. and Winkler, P. and Szepesv{\\'a}ri, Cs. and Graif, E. and Schuetz, E.},\n\tbooktitle = {4th International Workshop on Digital Mammography},\n\tentrysubtype = {unrefereed},\n\tkeywords = {application, neural networks, image processing, health informatics, clinical decision support},\n\ttitle = {Automated Detection and Classification of Microcalcifications in Mammograms using Artificial Neural Nets},\n\turl_link = {http://www.azn.nl/rrng/xray/digmam/iwdm98},\n\tyear = {1998},\n\tBdsk-Url-1 = {http://www.azn.nl/rrng/xray/digmam/iwdm98}}\n\n
\n
\n\n\n
\n The goal of this study was to develop a fully automated computer application for detection and classification of clustered microcalcifications using artificial neural nets (ANNs). Cases where additional investigations are necessary should be identified automatically, too.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Reinforcement learning: Theory and practice.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Hrehus, M., editor(s), Proceedings of the 2nd Slovak Conference on Artificial Neural Networks (SCANN'98), pages 29–39, 1998. \n \n\n\n\n
\n\n\n\n \n \n \"Reinforcement paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1998c,\n\tabstract = {We consider reinforcement learning methods for the solution of complex sequential optimization problems. In particular, the soundness of two methods proposed for the solution of partially observable problems will be shown. The first method suggests a state-estimation scheme and requires mild {\\em a priori} knowledge, while the second method assumes that a significant amount of abstract knowledge is available about the decision problem and uses this knowledge to setup a macro-hierarchy to turn the partially observable problem into another one which can already be handled using methods worked out for observable problems. This second method is also illustrated with some experiments on a real-robot.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proceedings of the 2nd Slovak Conference on Artificial Neural Networks (SCANN'98)},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2013-10-20 21:25:10 +0300},\n\teditor = {Hrehus, M.},\n\tkeywords = {reinforcement learning, partial information, theory, MDP transformations},\n\tpages = {29--39},\n\ttitle = {Reinforcement learning: Theory and practice},\n\turl_paper = {scann98.ps.pdf},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n We consider reinforcement learning methods for the solution of complex sequential optimization problems. In particular, the soundness of two methods proposed for the solution of partially observable problems will be shown. The first method suggests a state-estimation scheme and requires mild a priori knowledge, while the second method assumes that a significant amount of abstract knowledge is available about the decision problem and uses this knowledge to set up a macro-hierarchy to turn the partially observable problem into another one which can already be handled using methods worked out for observable problems. This second method is also illustrated with some experiments on a real robot.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Static and Dynamic Aspects of Optimal Sequential Decision Making.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Ph.D. Thesis, Bolyai Institute of Mathematics, University of Szeged, Szeged, Aradi vrt. tere 1, HUNGARY, 6720, 09 1998.\n \n\n\n\n
\n\n\n\n \n \n \"Static paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 4 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@phdthesis{Szepesvari1998,\n\tabstract = {Foreword: In this thesis the theory of optimal sequential decisions having a general recursive structure is investigated via an operator theoretical approach, the recursive structure (of both of the dynamics and the optimality criterion) being encoded into the so-called cost propagation operator. Decision problems like Markovian Decision Problems with expected or worst-case total discounted/undiscounted cost criterion; repeated zero-sum games such as Markov-games; or alternating Markov-games all admit such a recursive structure. Our setup has the advantage that it emphasizes their common properties as well as it points out some differences.\n\n\t\t  The thesis consists of two parts, in the first part the model is assumed to be known while in the second one the models are to be explored. The setup of Part I is rather abstract but enables a unified treatment of a large class of sequential decision problems, namely the class when the total cost of decision policies is defined recursively by a so called cost propagation operator. Under natural monotonicity and continuity conditions the greedy policies w.r.t. the optimal cost-to-go function turn out to be optimal, due to the recursive structure.\n\n\t\t  Part II considers the case when the models are unknown, and have to be explored and learnt. The price of considering unknown models is that here we have to restrict ourselves to models with an additive cost structure in order to obtain tractable learning situations. The almost sure convergence of the most frequently used algorithms proposed in the reinforcement learning community is proved. These algorithms are treated as multidimensional asynchronous stochastic approximation schemes and their convergence is deduced from the main theorem of the second part. The key of the method here is the so called rescaling property of certain homogeneous processes. A practical and verifiable sufficient condition for the convergence of on-line learning policies to an optimal policy is formulated and a convergence rate is established.},\n\taddress = {Szeged, Aradi vrt. tere 1, HUNGARY, 6720},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence},\n\tmonth = {09},\n\tschool = {Bolyai Institute of Mathematics, University of Szeged},\n\ttitle = {Static and Dynamic Aspects of Optimal Sequential Decision Making},\n\turl_paper = {thesis.ps.pdf},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n Foreword: In this thesis the theory of optimal sequential decisions having a general recursive structure is investigated via an operator theoretical approach, the recursive structure (of both of the dynamics and the optimality criterion) being encoded into the so-called cost propagation operator. Decision problems like Markovian Decision Problems with expected or worst-case total discounted/undiscounted cost criterion; repeated zero-sum games such as Markov-games; or alternating Markov-games all admit such a recursive structure. Our setup has the advantage that it emphasizes their common properties as well as it points out some differences. The thesis consists of two parts, in the first part the model is assumed to be known while in the second one the models are to be explored. The setup of Part I is rather abstract but enables a unified treatment of a large class of sequential decision problems, namely the class when the total cost of decision policies is defined recursively by a so called cost propagation operator. Under natural monotonicity and continuity conditions the greedy policies w.r.t. the optimal cost-to-go function turn out to be optimal, due to the recursive structure. Part II considers the case when the models are unknown, and have to be explored and learnt. The price of considering unknown models is that here we have to restrict ourselves to models with an additive cost structure in order to obtain tractable learning situations. The almost sure convergence of the most frequently used algorithms proposed in the reinforcement learning community is proved. These algorithms are treated as multidimensional asynchronous stochastic approximation schemes and their convergence is deduced from the main theorem of the second part. The key of the method here is the so called rescaling property of certain homogeneous processes. A practical and verifiable sufficient condition for the convergence of on-line learning policies to an optimal policy is formulated and a convergence rate is established.\n
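A schematic way to write the recursive structure referred to in the foreword (the notation is illustrative and not taken from the thesis): the optimal cost-to-go satisfies

\[
V^*(x) \;=\; \inf_{a \in A(x)} (Q V^*)(x,a),
\qquad\text{e.g.}\qquad
(QV)(x,a) \;=\; c(x,a) + \gamma\, \mathbb{E}\bigl[ V(Y_{x,a}) \bigr]
\]

for the expected total discounted cost, or $(QV)(x,a) = \sup_{y} \bigl\{ c(x,a,y) + \gamma V(y) \bigr\}$ for a worst-case criterion, with $Q$ playing the role of the cost propagation operator; under the monotonicity and continuity conditions mentioned above, policies that are greedy with respect to $V^*$ are optimal.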
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Non-Markovian Policies in Sequential Decision Problems.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Acta Cybernetica, 13(3): 305–318. 1998.\n \n\n\n\n
\n\n\n\n \n \n \"Non-Markovian paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1998a,\n\tabstract = {In this article we prove the validity of the Bellman Optimality Equation and related results for sequential decision problems with a general recursive structure. The characteristic feature of our approach is that also non-Markovian policies are taken into account. The theory is motivated by some experiments with a learning robot.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Acta Cybernetica},\n\tkeywords = {sequential decision making, theory},\n\tnumber = {3},\n\tpages = {305--318},\n\ttitle = {Non-Markovian Policies in Sequential Decision Problems},\n\turl_paper = {accyb97.ps.pdf},\n\tvolume = {13},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n In this article we prove the validity of the Bellman Optimality Equation and related results for sequential decision problems with a general recursive structure. The characteristic feature of our approach is that non-Markovian policies are also taken into account. The theory is motivated by some experiments with a learning robot.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Multi-criteria Reinforcement Learning.\n \n \n \n \n\n\n \n Gábor, Z.; Kalmár, Z.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, 1998. \n Revised on 05/04/2004\n\n\n\n
\n\n\n\n \n \n \"Multi-criteria paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{z.gabor1998,\n\tabstract = {We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special, but important class of problems when the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These type of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose the one among these which is optimal according to another fixed criterion. Possible application in robotics and repeated games are outlined.},\n\tauthor = {G{\\'a}bor, Z. and Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {reinforcement learning, theory, constrained MDPs},\n\tnote = {Revised on 05/04/2004},\n\towner = {Beata},\n\ttimestamp = {2010.08.30},\n\ttitle = {Multi-criteria Reinforcement Learning},\n\turl_paper = {multi98.ps.pdf},\n\tyear = {1998}}\n\n
\n
\n\n\n
\n We consider multi-criteria sequential decision making problems where the vector-valued evaluations are compared by a given, fixed total ordering. Conditions for the optimality of stationary policies and the Bellman optimality equation are given for a special, but important, class of problems in which the evaluation of policies can be computed for the criteria independently of each other. The analysis requires special care as the topology introduced by pointwise convergence and the order-topology introduced by the preference order are in general incompatible. Reinforcement learning algorithms are proposed and analyzed. Preliminary computer experiments confirm the validity of the derived algorithms. These types of multi-criteria problems are most useful when there are several optimal solutions to a problem and one wants to choose, among these, the one that is optimal according to another fixed criterion. Possible applications in robotics and repeated games are outlined.\n
\n\n\n
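\n As a rough illustration of the lexicographic comparison of vector-valued evaluations described in the abstract above, the following sketch (hypothetical names, not code from the paper) selects a greedy action under a fixed total order in which a later criterion only breaks ties among actions that are optimal for the earlier ones:\n <pre>
# Illustrative sketch only: lexicographic greedy action selection over
# vector-valued action values; later criteria break ties among earlier ones.
# All names (q_values, select_action) are hypothetical.

def lex_better(u, v, tol=1e-9):
    """Return True if value vector u beats v in the lexicographic order."""
    for a, b in zip(u, v):
        if a > b + tol:
            return True
        if a < b - tol:
            return False
    return False  # equal up to tolerance

def select_action(q_values):
    """q_values: dict mapping action -> tuple of per-criterion values."""
    best = None
    for action, value in q_values.items():
        if best is None or lex_better(value, q_values[best]):
            best = action
    return best

# 'a' and 'b' tie on the first criterion; 'b' wins on the second.
print(select_action({'a': (1.0, 0.2), 'b': (1.0, 0.7), 'c': (0.5, 1.0)}))  # -> b
</pre>\n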
\n\n\n
\n \n\n \n \n \n \n \n \n Module-Based Reinforcement Learning: Experiments with a Real Robot.\n \n \n \n \n\n\n \n Kalmár, Z.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Machine Learning, 31: 1– 2. 1998.\n Also appeared as: Z. Kalmár, C. Szepesvári, and A. Lorincz. Module-based reinforcement learning: Experiments with a real robot. Autonomous Robots, 5:273–295, 1998.\n\n\n\n
\n\n\n\n \n \n \"Module-Based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{zs.kalmar1998a,\n\tabstract = {The behavior of reinforcement learning (RL) algorithms is best understood in completely observable, discrete-time controlled Markov chains with finite state and action spaces. In contrast, robot-learning domains are inherently continuous both in time and space, and moreover are partially observable. Here we suggest a systematic approach to solve such problems in which the available qualitative and quantitative knowledge is used to reduce the complexity of learning task. The steps of the design process are to: i) decompose the task into subtasks using the qualitative knowledge at hand; ii) design local controllers to solve the subtasks using the available quantitative knowledge and iii) learn a coordination of these controllers by means of reinforcement learning. It is argued that the approach enables fast, semi-automatic, but still high quality robot-control as no fine-tuning of the local controllers is needed. The approach was verified on a non-trivial real-life robot task. Several RL algorithms were compared by ANOVA and it was found that the model-based approach worked significantly better than the model-free approach. The learnt switching strategy performed comparably to a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which were not foreseen in advance, thus supporting the view that adaptive algorithms are advantageous to non-adaptive ones in complex environments.},\n\tauthor = {Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Machine Learning},\n\tkeywords = {robotics, application, hierarchical reinforcement learning, reinforcement learning, macro learning, theory},\n\tnote = {Also appeared as: Z. Kalm{\\'a}r, C. Szepesv{\\'a}ri, and A. Lorincz. Module-based reinforcement learning: Experiments with a real robot. Autonomous Robots, 5:273--295, 1998.},\n\towner = {Beata},\n\tpages = {1-- 2},\n\ttimestamp = {2010.08.30},\n\ttitle = {Module-Based Reinforcement Learning: Experiments with a Real Robot},\n\turl_paper = {ml-98.ps.pdf},\n\tvolume = {31},\n\tyear = {1998}}\n\t\t\n\n    
\n
\n\n\n
\n The behavior of reinforcement learning (RL) algorithms is best understood in completely observable, discrete-time controlled Markov chains with finite state and action spaces. In contrast, robot-learning domains are inherently continuous both in time and space, and moreover are partially observable. Here we suggest a systematic approach to solve such problems in which the available qualitative and quantitative knowledge is used to reduce the complexity of the learning task. The steps of the design process are to: i) decompose the task into subtasks using the qualitative knowledge at hand; ii) design local controllers to solve the subtasks using the available quantitative knowledge; and iii) learn a coordination of these controllers by means of reinforcement learning. It is argued that the approach enables fast, semi-automatic, but still high-quality robot control as no fine-tuning of the local controllers is needed. The approach was verified on a non-trivial real-life robot task. Several RL algorithms were compared by ANOVA and it was found that the model-based approach worked significantly better than the model-free approach. The learnt switching strategy performed comparably to a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which were not foreseen in advance, thus supporting the view that adaptive algorithms are advantageous over non-adaptive ones in complex environments.\n
\n\n\n
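\n The design process sketched in the abstract above (decompose into subtasks, design local controllers, learn their coordination) can be illustrated, under heavy simplification, by module-level tabular Q-learning; env, modules and features below are hypothetical placeholders rather than the authors' implementation:\n <pre>
# Illustrative sketch, not the authors' code: a switching strategy over
# hand-designed modules learned with tabular Q-learning. The module-level
# clock "ticks" only when the observed feature vector changes.
import random
from collections import defaultdict

def learn_switching(env, modules, features, episodes=100,
                    alpha=0.1, gamma=0.95, eps=0.1):
    Q = defaultdict(float)                       # Q[(feature_tuple, module_index)]
    for _ in range(episodes):
        obs = env.reset()
        f, done = features(obs), False
        while not done:
            m = (random.randrange(len(modules)) if random.random() < eps
                 else max(range(len(modules)), key=lambda i: Q[(f, i)]))
            reward = 0.0
            while True:                          # run the module until a feature changes
                obs, r, done = env.step(modules[m](obs))
                reward += r
                f_next = features(obs)
                if f_next != f or done:
                    break
            target = reward if done else reward + gamma * max(
                Q[(f_next, i)] for i in range(len(modules)))
            Q[(f, m)] += alpha * (target - Q[(f, m)])
            f = f_next
    return Q
</pre>\n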
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1997\n \n \n (9)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n The Asymptotic Convergence-Rate of Q-learning.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems, pages 1064–1070, 1997. \n \n\n\n\n
\n\n\n\n \n \n \"The paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1997b,\n\tabstract = {In this paper we show that for discounted MDPs with discount factor $\\gamma>1/2$ the asymptotic rate of convergence of Q-learning is O($1/t^{R(1-\\gamma$)}) if $R(1-\\gamma)<1/2$ and O($\\sqrt{\\log\\log t/ t}$) otherwise provided that the state-action pairs are sampled from a fixed probability distribution. Here $R=p_{min}/p_{max}$ is the ratio of the minimum and maximum state-action occupation frequencies. The results extend to convergent on-line learning provided that $p_{min}>0$, where $p_{min}$ and $p_{max}$ now become the minimum and maximum state-action occupation frequencies corresponding to the stationary distribution.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Advances in Neural Information Processing Systems},\n\tkeywords = {reinforcement learning, control learning, finite MDPs, online learning, stochastic approximation, theory},\n\tpages = {1064--1070},\n\ttitle = {The Asymptotic Convergence-Rate of Q-learning},\n\turl_paper = {NeurIPS97.ps.pdf},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n In this paper we show that for discounted MDPs with discount factor $γ>1/2$ the asymptotic rate of convergence of Q-learning is $O(1/t^{R(1-γ)})$ if $R(1-γ)<1/2$ and $O(\sqrt{\log\log t/t})$ otherwise, provided that the state-action pairs are sampled from a fixed probability distribution. Here $R=p_{min}/p_{max}$ is the ratio of the minimum and maximum state-action occupation frequencies. The results extend to convergent on-line learning provided that $p_{min}>0$, where $p_{min}$ and $p_{max}$ now become the minimum and maximum state-action occupation frequencies corresponding to the stationary distribution.\n
\n\n\n
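\n A minimal numerical sketch of the rate stated in the abstract above, purely as an illustration of the formula (the parameter values below are made up):\n <pre>
# Illustration of the stated rate: with R = p_min / p_max, the error decays
# roughly as t**(-R*(1-gamma)) when R*(1-gamma) < 1/2, and as
# sqrt(log(log(t)) / t) otherwise.
import math

def predicted_rate(t, gamma, p_min, p_max):
    R = p_min / p_max
    exponent = R * (1.0 - gamma)
    if exponent < 0.5:
        return t ** (-exponent)
    return math.sqrt(math.log(math.log(t)) / t)

# Uneven sampling (R = 0.1) slows the predicted decay compared to uniform sampling (R = 1).
print(predicted_rate(1e6, gamma=0.9, p_min=0.01, p_max=0.1))  # ~ t**(-0.01)
print(predicted_rate(1e6, gamma=0.9, p_min=0.1, p_max=0.1))   # ~ t**(-0.1)
</pre>\n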
\n\n\n
\n \n\n \n \n \n \n \n \n Prediction of Protein Domain-Types by Backpropagation.\n \n \n \n \n\n\n \n Murvai, J.; Szepesvári, C.; Bachrati, C.; and Pongor, S.\n\n\n \n\n\n\n Technical Report TR-98-117, ``Attila József'' University, Research Group on Artificial Intelligence, Szeged, HU-6700, 1997.\n \n\n\n\n
\n\n\n\n \n \n \"Prediction paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{j.murvai1997,\n\tabstract = {We propose a neural net solution for the recognition of the domain types of proteins, which is a hard and important problem in biology. We have found that using a clever preprocessing technique relatively small neural networks perform surprisingly well. The performances of the neural nets were measured by cross-validation and Hoeffding's inequality was utilized for the estimation of a confidence interval of the estimates.},\n\taddress = {Szeged, HU-6700},\n\tauthor = {Murvai, J. and Szepesv{\\'a}ri, Cs. and Bachrati, Cs. and Pongor, S.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tinstitution = {``Attila J{\\'o}zsef'' University, Research Group on Artificial Intelligence},\n\tkeywords = {bioinformatics, domain prediction, neural networks, application},\n\tnumber = {TR-98-117},\n\towner = {Beata},\n\ttimestamp = {2010.08.31},\n\ttitle = {Prediction of Protein Domain-Types by Backpropagation},\n\turl_paper = {RGAI-98-117.ps.pdf},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n We propose a neural net solution for the recognition of the domain types of proteins, which is a hard and important problem in biology. We have found that, using a clever preprocessing technique, relatively small neural networks perform surprisingly well. The performance of the neural nets was measured by cross-validation, and Hoeffding's inequality was utilized to estimate a confidence interval for the estimates.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Module Based Reinforcement Learning for a Real Robot.\n \n \n \n \n\n\n \n Kalmár, Z.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Proceedings of the 6th European Workshop on Learning Robots, pages 22–32, 1997. \n \n\n\n\n
\n\n\n\n \n \n \"Module paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{kalmar1997,\n\tabstract = {The behaviour of reinforcement learning (RL) algorithms is best understood in completely observable, finite state- and action-space, discrete-time controlled Markov-chains. Robot-learning domains, on the other hand, are inherently infinite both in time and space, and moreover they are only partially observable. In this article we suggest a systematic method whose motivation comes from the desire to transform the task-to-be-solved into a finite-state, discrete-time, ``approximately'' Markovian task, which is completely observable too. The key idea is to break up the problem into subtasks and design controllers for each of the subtasks. Then operating conditions are attached to the controllers (together the controllers and their operating conditions which are called modules) and possible additional features are designed to facilitate observability. A new discrete time-counter is introduced at the ``module-level'' that clicks only when a change in the value of one of the features is observed. The approach was tried out on a real-life robot. Several RL algorithms were compared and it was found that a model-based approach worked best. The learnt switching strategy performed equally well as a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which could not have been seen in advance, which predicted the promising possibility that a learnt controller might outperform a handcrafted switching strategy in the future.},\n\tauthor = {Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {Proceedings of the 6th European Workshop on Learning Robots},\n\tentrysubtype = {unrefereed},\n\tkeywords = {robotics, application, hierarchical reinforcement learning, reinforcement learning, macro learning, theory},\n\tpages = {22--32},\n\ttitle = {Module Based Reinforcement Learning for a Real Robot},\n\turl_paper = {ewlr97.ps.pdf},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n The behaviour of reinforcement learning (RL) algorithms is best understood in completely observable, finite state- and action-space, discrete-time controlled Markov-chains. Robot-learning domains, on the other hand, are inherently infinite both in time and space, and moreover they are only partially observable. In this article we suggest a systematic method whose motivation comes from the desire to transform the task-to-be-solved into a finite-state, discrete-time, ``approximately'' Markovian task, which is completely observable too. The key idea is to break up the problem into subtasks and design controllers for each of the subtasks. Then operating conditions are attached to the controllers (together, the controllers and their operating conditions are called modules) and possible additional features are designed to facilitate observability. A new discrete time-counter is introduced at the ``module-level'' that clicks only when a change in the value of one of the features is observed. The approach was tried out on a real-life robot. Several RL algorithms were compared and it was found that a model-based approach worked best. The learnt switching strategy performed as well as a handcrafted version. Moreover, the learnt strategy seemed to exploit certain properties of the environment which could not have been seen in advance, pointing to the promising possibility that a learnt controller might outperform a handcrafted switching strategy in the future.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Uncertainty, Performance and Model Dependency in Approximate Adaptive Nonlinear Control.\n \n \n \n \n\n\n \n French, M.; Szepesvári, C.; and Rogers, E.\n\n\n \n\n\n\n In CDC, volume 3, pages 3046 - 3051, San Diego, California, 1997. \n \n\n\n\n
\n\n\n\n \n \n \"Uncertainty, paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{m.french1997a,\n\tabstract = {We consider systems satisfying a matching condition which are functionally known up to a L2 measure of uncertainty. A modified L2 performance measure is given, and the performance of a class of model based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty measure and measures of the approximation error of the model. Asymptotic analyses of the bounds under increasing model size are undertaken, and sufficient conditions are given on the model that ensure the performance bounds are bounded independent of the model size.},\n\taddress = {San Diego, California},\n\tauthor = {French, M.C. and Szepesv{\\'a}ri, Cs. and Rogers, E.},\n\tbooktitle = {CDC},\n\tkeywords = {adaptive control, performance bounds, Lyapunov design, strict feedback, chain of integrators, matched uncertainty, theory, control, nonparametrics, stabilization},\n\towner = {Beata},\n\tpages = {3046 - 3051},\n\ttimestamp = {2010.08.31},\n\ttitle = {Uncertainty, Performance and Model Dependency in Approximate Adaptive Nonlinear Control},\n\turl_paper = {cdc97.ps.pdf},\n\tvolume = {3},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n We consider systems satisfying a matching condition which are functionally known up to an L2 measure of uncertainty. A modified L2 performance measure is given, and the performance of a class of model-based adaptive controllers is studied. An upper performance bound is derived in terms of the uncertainty measure and measures of the approximation error of the model. Asymptotic analyses of the bounds under increasing model size are undertaken, and sufficient conditions are given on the model that ensure the performance bounds are bounded independent of the model size.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Neurocontroller using Dynamic State Feedback for Compensatory Control.\n \n \n \n \n\n\n \n Szepesvári, C.; Cimmer, S.; and Lörincz, A.\n\n\n \n\n\n\n Neural Networks, 10: 1691–1708. 1997.\n \n\n\n\n
\n\n\n\n \n \n \"Neurocontroller paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1997c,\n\tabstract = {A common technique in neurocontrol is that of controlling a plant by static state feedback using the plant's inverse dynamics, which is approximated through a learning process. It is well known that in this control mode even small approximation errors or, which is the same, small perturbations of the plant may lead to instability. Here, a novel approach is proposed to overcome the problem of instability by using the inverse dynamics both for the Static and for the error compensating Dynamic State feedback control. This scheme is termed SDS Feedback Control. It is shown that as long as the error of the inverse dynamics model is ``signproper'' the SDS Feedback Control is stable, i.e., the error of tracking may be kept small. The proof is based on a modification of Liapunov's second method. The problem of on-line learning of the inverse dynamics when using the controller simultaneously for both forward control and for dynamic feedback is dealt with, as are questions related to noise sensitivity and robust control of robotic manipulators. Simulations of a simplified sensorimotor loop serve to illustrate the approach.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and Cimmer, Sz. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Neural Networks},\n\tkeywords = {neural networks, theory, Lyapunov stability, adaptive control, control},\n\tpages = {1691--1708},\n\ttitle = {Neurocontroller using Dynamic State Feedback for Compensatory Control},\n\turl_paper = {sds-nn98.ps.pdf},\n\tvolume = {10},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n A common technique in neurocontrol is that of controlling a plant by static state feedback using the plant's inverse dynamics, which is approximated through a learning process. It is well known that in this control mode even small approximation errors or, which is the same, small perturbations of the plant may lead to instability. Here, a novel approach is proposed to overcome the problem of instability by using the inverse dynamics both for the Static and for the error compensating Dynamic State feedback control. This scheme is termed SDS Feedback Control. It is shown that as long as the error of the inverse dynamics model is ``signproper'' the SDS Feedback Control is stable, i.e., the error of tracking may be kept small. The proof is based on a modification of Liapunov's second method. The problem of on-line learning of the inverse dynamics when using the controller simultaneously for both forward control and for dynamic feedback is dealt with, as are questions related to noise sensitivity and robust control of robotic manipulators. Simulations of a simplified sensorimotor loop serve to illustrate the approach.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Approximate Inverse-Dynamics based Robust Control using Static and Dynamic State Feedback.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Kalkkuhl, J.; Hunt, K.; Zbikowski, R.; and Dzielińsky, A., editor(s), Applications of Neural Adaptive Control Technology, pages 151–197. World Scientific, Singapore, 1997.\n \n\n\n\n
\n\n\n\n \n \n \"Approximate paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@incollection{Szepesvari1997,\n\tabstract = {It is rigorously shown that inverse-dynamics models can be used to stabilize plants of any order provided that the inverse-dynamic model is used in a mixed mode fashion, in that of a `Static and Dynamic' State-feedback (SDS) mode. When the resulting controller is used for tracking increasing the gain of the dynamic feedback decreases the tracking error. Yet another attractive feature of the SDS scheme is that the inverse-dynamics model can be tuned on-line by {\\em any} adaptation mechanism without cancelling stability if the conditions of the non-adaptive stability theorem hold at any time instant. Computer simulations of the control of a chaotic bioreactor and a `realistic' robotic manipulator demonstrate the robustness of the approach. It is shown that SDS control will yield zero asymptotic error when controlling the bioreactor using an inverse-dynamics model which when used in a traditional mode would yield intolerably large errors. In the case of the robotic arm simulations the effects of perturbation and sampling frequency are investigated and the SDS control is compared with the non-adaptive computed torque method. A fully self-organizing associative neural network architecture that can be used to approximate the inverse-dynamics in the form of a Position-and-Direction-to-Action (PDA) map is also described. Similarities between the basal ganglia - thalamocortical loops and the SDS scheme are discussed and it is argued that the SDS scheme could be viewed as a model of higher order motor functions of these areas.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {Applications of Neural Adaptive Control Technology},\n\teditor = {Kalkkuhl, J. and Hunt, K.J. and Zbikowski, R. and Dzieli{\\'n}sky, A.},\n\tkeywords = {control, theory, adaptive control, manipulator control, bioreactor control, neural networks},\n\tpages = {151--197},\n\tpublisher = {World Scientific, Singapore},\n\ttitle = {Approximate Inverse-Dynamics based Robust Control using Static and Dynamic State Feedback},\n\turl_paper = {nact97.ps.pdf},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n It is rigorously shown that inverse-dynamics models can be used to stabilize plants of any order provided that the inverse-dynamic model is used in a mixed mode fashion, in that of a `Static and Dynamic' State-feedback (SDS) mode. When the resulting controller is used for tracking, increasing the gain of the dynamic feedback decreases the tracking error. Yet another attractive feature of the SDS scheme is that the inverse-dynamics model can be tuned on-line by any adaptation mechanism without cancelling stability if the conditions of the non-adaptive stability theorem hold at any time instant. Computer simulations of the control of a chaotic bioreactor and a `realistic' robotic manipulator demonstrate the robustness of the approach. It is shown that SDS control will yield zero asymptotic error when controlling the bioreactor using an inverse-dynamics model which when used in a traditional mode would yield intolerably large errors. In the case of the robotic arm simulations the effects of perturbation and sampling frequency are investigated and the SDS control is compared with the non-adaptive computed torque method. A fully self-organizing associative neural network architecture that can be used to approximate the inverse-dynamics in the form of a Position-and-Direction-to-Action (PDA) map is also described. Similarities between the basal ganglia - thalamocortical loops and the SDS scheme are discussed and it is argued that the SDS scheme could be viewed as a model of higher order motor functions of these areas.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning and Exploitation do not Conflict Under Minimax Optimality.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Someren, M.; and Widmer, G., editor(s), ECML, volume 1224, of Lecture Notes in Artificial Intelligence, pages 242–249, 1997. Springer, Berlin\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1997h,\n\tabstract = {We show that adaptive real time dynamic programming extended with the action selection strategy which chooses the best action according to the latest estimate of the cost function yields asymptotically optimal policies within finite time under the minimax optimality criterion. From this it follows that learning and exploitation do not conflict under this special optimality criterion. We relate this result to learning optimal strategies in repeated two-player zero-sum deterministic games.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ECML},\n\teditor = {Someren, M.van and Widmer, G.},\n\tkeywords = {exploration vs. exploitation, reinforcement learning, theory},\n\tpages = {242--249},\n\tpublisher = {Springer, Berlin},\n\tseries = {Lecture Notes in Artificial Intelligence},\n\ttitle = {Learning and Exploitation do not Conflict Under Minimax Optimality},\n\turl_paper = {ecml97.ps.pdf},\n\tvolume = {1224},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n We show that adaptive real time dynamic programming extended with the action selection strategy which chooses the best action according to the latest estimate of the cost function yields asymptotically optimal policies within finite time under the minimax optimality criterion. From this it follows that learning and exploitation do not conflict under this special optimality criterion. We relate this result to learning optimal strategies in repeated two-player zero-sum deterministic games.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Integrated Architecture for Motion-control and Path-planning.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Journal of Robotic Systems, 15(1): 1–15. 1997.\n \n\n\n\n
\n\n\n\n \n \n \"Integrated paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1997d,\n\tabstract = {We consider the problem of learning to control a plant with non-linear control characteristics and solving the path planning problem at the same time. The solution is based on a path planning model that designates a speed field to be tracked, the speed field being the gradient of the stationary solution of a diffusion process. Diffusion is simulated on an artificial neural network by spreading activation. Interneurons between neighboring discretizing neurons detect the strength of the activity flow and emit control signals to control neurons via modifiable connections. The proposed method may be used for learning redundant control problems. The architecture integrates reactive path planning and continuous motion control in a natural fashion.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Journal of Robotic Systems},\n\tkeywords = {control, neural networks},\n\tnumber = {1},\n\tpages = {1--15},\n\ttitle = {Integrated Architecture for Motion-control and Path-planning},\n\turl_paper = {jrs98.ps.pdf},\n\tvolume = {15},\n\tyear = {1997}}\n\n
\n
\n\n\n
\n We consider the problem of learning to control a plant with non-linear control characteristics and solving the path planning problem at the same time. The solution is based on a path planning model that designates a speed field to be tracked, the speed field being the gradient of the stationary solution of a diffusion process. Diffusion is simulated on an artificial neural network by spreading activation. Interneurons between neighboring discretizing neurons detect the strength of the activity flow and emit control signals to control neurons via modifiable connections. The proposed method may be used for learning redundant control problems. The architecture integrates reactive path planning and continuous motion control in a natural fashion.\n
\n\n\n
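\n The speed-field idea described above, following the gradient of the stationary solution of a diffusion process, can be sketched on a plain grid as follows; this is an assumption-laden toy version, not the neural architecture of the paper:\n <pre>
# Toy sketch: the goal acts as a constant source, obstacles absorb activation,
# repeated neighbour averaging approximates the stationary diffusion solution,
# and moving uphill on the result yields a collision-avoiding path.
import numpy as np

def diffusion_speed_field(free, goal, iters=500):
    """free: boolean grid of traversable cells; goal: (row, col)."""
    u = np.zeros(free.shape)
    for _ in range(iters):
        padded = np.pad(u, 1, mode='edge')
        u = 0.25 * (padded[:-2, 1:-1] + padded[2:, 1:-1] +
                    padded[1:-1, :-2] + padded[1:-1, 2:])
        u[~free] = 0.0          # obstacles absorb activation
        u[goal] = 1.0           # goal is a constant source
    return u

def greedy_step(u, pos):
    """Move to the neighbouring cell with the highest activation (obstacles stay at zero)."""
    r, c = pos
    candidates = [(r + dr, c + dc) for dr, dc in ((1, 0), (-1, 0), (0, 1), (0, -1))
                  if 0 <= r + dr < u.shape[0] and 0 <= c + dc < u.shape[1]]
    return max(candidates, key=lambda p: u[p])

free = np.ones((8, 8), dtype=bool)
free[3, 1:6] = False                     # a wall with gaps at both ends
u = diffusion_speed_field(free, goal=(7, 7))
print(greedy_step(u, (0, 0)))            # first move of the planned path
</pre>\n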
\n\n\n
\n \n\n \n \n \n \n \n High Precision Neurocontrol of a Chaotic Bioreactor.\n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Nonlinear Analysis, 30(3): 1669–1676. 1997.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1997f,\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Nonlinear Analysis},\n\tkeywords = {application, control, neural networks},\n\tnumber = {3},\n\tpages = {1669--1676},\n\ttitle = {High Precision Neurocontrol of a Chaotic Bioreactor},\n\tvolume = {30},\n\tyear = {1997}}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1996\n \n \n (9)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n A Generalized Reinforcement Learning Model: Convergence and Applications.\n \n \n \n \n\n\n \n Littman, M.; and Szepesvári, C.\n\n\n \n\n\n\n In ICML, pages 310–318, 1996. \n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n  \n \n 9 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{littman1996,\n\tauthor = {Littman, M.L. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICML},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation},\n\tpages = {310--318},\n\ttitle = {A Generalized Reinforcement Learning Model: Convergence and Applications},\n\turl_paper = {ml96.ps.pdf},\n\tyear = {1996}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Q-learning Combined with Spreading: Convergence and Results.\n \n \n \n\n\n \n Ribeiro, C. H. C.; and Szepesvári, C.\n\n\n \n\n\n\n In Proceedings of ISRF-IEE International Conference: Intelligent and Cognitive Systems, Neural Networks Symposium, pages 32–36, Tehran, Iran, 1996. \n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ribeiro1996,\n\taddress = {Tehran, Iran},\n\tauthor = {Ribeiro, C. H. C. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proceedings of ISRF-IEE International Conference: Intelligent and Cognitive Systems, Neural Networks Symposium},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence},\n\tpages = {32--36},\n\ttitle = {Q-learning Combined with Spreading: Convergence and Results},\n\tyear = {1996}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Generalized Markov Decision Processes: Dynamic-programming and reinforcement-learning algorithms.\n \n \n \n \n\n\n \n Szepesvári, C.; and Littman, M.\n\n\n \n\n\n\n Technical Report CS-96-11, Brown University, Department of Computer Science, Providence, RI, 11 1996.\n \n\n\n\n
\n\n\n\n \n \n \"Generalized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{Szepesvari1996h,\n\tabstract = {Reinforcement learning is the process by which an autonomous agent uses its experience interacting with an environment to improve its behavior. The Markov decision process (MDP) model is a popular way of formalizing the reinforcement-learning problem, but it is by no means the only way. In this paper, we show how many of the important theoretical results concerning reinforcement learning in MDPs extend to a generalized MDP model that includes MDPs, two-player games and MDPs under a worst-case optimality criterion as special cases. The basis of this extension is a stochastic-approximation theorem that reduces asynchronous convergence to synchronous convergence.},\n\taddress = {Providence, RI},\n\tauthor = {Szepesv{\\'a}ri, Cs. and Littman, M.L.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tinstitution = {Brown University, Department of Computer Science},\n\tkeywords = {reinforcement learning, theory, asymptotic convergence, finite MDPs, stochastic approximation},\n\tmonth = {11},\n\tnumber = {CS-96-11},\n\ttitle = {Generalized Markov Decision Processes: Dynamic-programming and reinforcement-learning algorithms},\n\turl_paper = {gmdp.ps.pdf},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n Reinforcement learning is the process by which an autonomous agent uses its experience interacting with an environment to improve its behavior. The Markov decision process (MDP) model is a popular way of formalizing the reinforcement-learning problem, but it is by no means the only way. In this paper, we show how many of the important theoretical results concerning reinforcement learning in MDPs extend to a generalized MDP model that includes MDPs, two-player games and MDPs under a worst-case optimality criterion as special cases. The basis of this extension is a stochastic-approximation theorem that reduces asynchronous convergence to synchronous convergence.\n
\n\n\n
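\n The generalized MDP viewpoint described above can be illustrated, in much simplified form, by a value-iteration loop whose backup is assembled from two pluggable summary operators; the operators and the tiny model below are illustrative assumptions, not the report's definitions:\n <pre>
# Generalized value iteration sketch: one operator summarizes over next states
# (expectation, worst case, ...), another over actions (max for MDPs, etc.).
import numpy as np

def generalized_value_iteration(P, R, gamma, next_state_op, action_op, iters=200):
    """P[a][s, s'] transition probabilities, R[a][s] immediate rewards."""
    n_states = next(iter(P.values())).shape[0]
    V = np.zeros(n_states)
    for _ in range(iters):
        Q = {a: R[a] + gamma * next_state_op(P[a], V) for a in P}
        V = action_op(Q)
    return V

expectation = lambda Pa, V: Pa @ V                     # expected-value criterion
worst_case = lambda Pa, V: np.array(                   # worst reachable next state
    [V[Pa[s] > 0].min() for s in range(Pa.shape[0])])
maximize = lambda Q: np.max(np.stack(list(Q.values())), axis=0)

P = {0: np.array([[0.9, 0.1], [0.0, 1.0]]), 1: np.array([[0.2, 0.8], [0.0, 1.0]])}
R = {0: np.array([1.0, 0.0]), 1: np.array([0.0, 0.0])}
print(generalized_value_iteration(P, R, 0.9, expectation, maximize))  # expected total reward
print(generalized_value_iteration(P, R, 0.9, worst_case, maximize))   # worst-case criterion
</pre>\n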
\n\n\n
\n \n\n \n \n \n \n \n \n Inverse Dynamics Controllers for Robust Control: Consequences for Neurocontrollers.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In ICANN, pages 697–702, 1996. \n \n\n\n\n
\n\n\n\n \n \n \"Inverse paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1996b,\n\tabstract = {It is proposed that controllers that approximate the inverse dynamics of the controlled plant can be used for on-line compensation of changes in the plant's dynamics. The idea is to use the very same controller in two modes at the same time: both for static and dynamic feedback. Implications for the learning of neurocontrollers are discussed. The proposed control mode relaxes the demand of precision and as a consequence, controllers that utilise direct associative learning by means of local function approximators may become more tractable in higher dimensional spaces.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {ICANN},\n\tkeywords = {control, theory, adaptive control, manipulator control, bioreactor control, neural networks},\n\tpages = {697--702},\n\ttitle = {Inverse Dynamics Controllers for Robust Control: Consequences for Neurocontrollers},\n\turl_paper = {szepes.icann96-fbc.ps.pdf},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n It is proposed that controllers that approximate the inverse dynamics of the controlled plant can be used for on-line compensation of changes in the plant's dynamics. The idea is to use the very same controller in two modes at the same time: both for static and dynamic feedback. Implications for the learning of neurocontrollers are discussed. The proposed control mode relaxes the demand of precision and as a consequence, controllers that utilise direct associative learning by means of local function approximators may become more tractable in higher dimensional spaces.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Synthesis of Neural Networks: the Case of Cascaded Hebbians.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Technical Report 96-102, Research Group on Artificial Intelligence, JATE-MTA, Szeged 6720, Aradi vrt tere 1., HUNGARY, 08 1996.\n e-mail: szepes@math.u-szeged.hu\n\n\n\n
\n\n\n\n \n \n \"Synthesis paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@techreport{Szepesvari1996e,\n\tabstract = {We show that cascading Hebbian learning with any other convergent algorithm (called the forward algorithm) results in the convergence of the Hebbian weights to a stationary point where the Hebbian algorithm would converge if the weights of the forward algorithm had already converged. Further, it is shown that the convergence rate of the composite algorithm does not deteriorate because of the cascading. This result is a consequence of a more general theorem which is also stated and proved here, the proofs being based on a global Lipschitzian assumption. The theory is illustrated by a composite PCA-Hebbian architecture introduced by Micheals (Michaels, 1995).},\n\taddress = {Szeged 6720, Aradi vrt tere 1., HUNGARY},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tinstitution = {Research Group on Artificial Intelligence, JATE-MTA},\n\tkeywords = {stochastic approximation, two-timescale stochastic approximation, neural networks, PCA},\n\tmonth = {08},\n\tnote = {e-mail: szepes@math.u-szeged.hu},\n\tnumber = {96-102},\n\ttitle = {Synthesis of Neural Networks: the Case of Cascaded Hebbians},\n\turl_paper = {TR96-102.pdf},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n We show that cascading Hebbian learning with any other convergent algorithm (called the forward algorithm) results in the convergence of the Hebbian weights to a stationary point where the Hebbian algorithm would converge if the weights of the forward algorithm had already converged. Further, it is shown that the convergence rate of the composite algorithm does not deteriorate because of the cascading. This result is a consequence of a more general theorem which is also stated and proved here, the proofs being based on a global Lipschitzian assumption. The theory is illustrated by a composite PCA-Hebbian architecture introduced by Michaels (Michaels, 1995).\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Neurocontrol I: Self-organizing Speed-field Tracking.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Neural Network World, 6: 875–896. 1996.\n \n\n\n\n
\n\n\n\n \n \n \"Neurocontrol paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1996c,\n\tabstract = {The problems of controlling a plant while avoiding obstacles and experiencing perturbations in the plants dynamics are considered. It is assumed that the plant's dynamics is not known in advance. To solve this problem a self-organizing artificial neural network (ANN) solution is advanced here. The ANN consists of various parts. The first part discretizes the state space of the plant and also learns the geometry of the state space. The learnt geometrical relations are represented by lateral connections. These connections are utilized for planning a speed field, allowing collision free motion. The speed field is defined over the neural represention of the state space and is transformed into control signals with the help of interneurons associated with the lateral connections: connections between interneurons and control neurons encode the inverse dynamics of the plant. These connections are learnt during a direct system inverse identification process by Hebbian learning. Theoretical results and computer experiments show the robustness of approach.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Neural Network World},\n\tkeywords = {neural networks, control},\n\tpages = {875--896},\n\ttitle = {Neurocontrol I: Self-organizing Speed-field Tracking},\n\turl_paper = {szepes.nnw1.ps.pdf},\n\tvolume = {6},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n The problems of controlling a plant while avoiding obstacles and experiencing perturbations in the plant's dynamics are considered. It is assumed that the plant's dynamics is not known in advance. To solve this problem a self-organizing artificial neural network (ANN) solution is advanced here. The ANN consists of various parts. The first part discretizes the state space of the plant and also learns the geometry of the state space. The learnt geometrical relations are represented by lateral connections. These connections are utilized for planning a speed field, allowing collision free motion. The speed field is defined over the neural representation of the state space and is transformed into control signals with the help of interneurons associated with the lateral connections: connections between interneurons and control neurons encode the inverse dynamics of the plant. These connections are learnt during a direct system inverse identification process by Hebbian learning. Theoretical results and computer experiments show the robustness of the approach.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Neurocontrol II: High Precision Control Achieved using Approximate Inverse Dynamics Models.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Neural Network World, 6: 897–920. 1996.\n \n\n\n\n
\n\n\n\n \n \n \"Neurocontrol paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1996d,\n\tabstract = {It is common that artificial neural networks (ANNs) are used for approximating the inverse dynamics of a plant. In the accompanying paper a self-organising ANN model for associative identification of the inverse dynamics was introduced. Here we propose the use of approximate inverse dynamic models for both Static and Dynamic State (SDS) feedback control. This compound controller is capable of high-precision control even when the inverse dynamics is just qualitatively modeled or the plant's dynamics is perturbed. Properties of the SDS Feedback Controller in learning the inverse dynamics as well as comparisons with other methods are discussed. An example is presented when a chaotic plant, a bioreactor, is controlled using the SDS Controller. We found that the SDS Controller can compensate model mismatches that otherwise would lead to an untolerably large error if a traditional controller were used.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Neural Network World},\n\tkeywords = {neural networks, control, theory, manipulator control},\n\tpages = {897--920},\n\ttitle = {Neurocontrol II: High Precision Control Achieved using Approximate Inverse Dynamics Models},\n\turl_paper = {szepes.nnw2.ps.pdf},\n\tvolume = {6},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n It is common that artificial neural networks (ANNs) are used for approximating the inverse dynamics of a plant. In the accompanying paper a self-organising ANN model for associative identification of the inverse dynamics was introduced. Here we propose the use of approximate inverse dynamic models for both Static and Dynamic State (SDS) feedback control. This compound controller is capable of high-precision control even when the inverse dynamics is just qualitatively modeled or the plant's dynamics is perturbed. Properties of the SDS Feedback Controller in learning the inverse dynamics as well as comparisons with other methods are discussed. An example is presented when a chaotic plant, a bioreactor, is controlled using the SDS Controller. We found that the SDS Controller can compensate for model mismatches that otherwise would lead to an intolerably large error if a traditional controller were used.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Approximate Geometry Representations and Sensory Fusion.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Neurocomputing, 12: 267–287. 1996.\n \n\n\n\n
\n\n\n\n \n \n \"Approximate paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1996g,\n\tabstract = {This paper summarizes the recent advances in the theory of self-organizing development of approximate geometry representations based on the use of neural networks. Part of this work is based on the theoretical approach of ({Sz}epesv{\\'a}ri, 1993), which is different from that of (Martinetz, 1993) and also is somewhat more general. The Martinetz approach treats signals provided by artificial neuron-like entities whereas the present work uses the entities of the external world as its starting point. The relationship between the present work and the Martinetz approach will be detailed. We approach the problem of approximate geometry representations by first examining the problem of sensory fusion, i.e., the problem of fusing information from different transductors. A straightforward solution is the simultaneous discretization of the output of all transductors, which means the discretization of a space defined as the product of the individual transductor output spaces. However, the geometry relations are defined for the external world only, so it is still an open question how to define the metrics on the product of output spaces. It will be shown that simple Hebbian learning can result in the formation of a correct geometry representation. Some topological considerations will be presented to help us clarify the underlying concepts and assumptions. The mathematical framework gives rise to a corollary on the ``topographical mappings'' realized by Kohonen networks. In fact, the present work as well as (Martinetz, 1993) may be considered as a generalization of Kohonen's topographic maps. We develop topographic maps with self-organizing interneural connections.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tdoi = {10.1016/0925-2312(95)00116-6},\n\tjournal = {Neurocomputing},\n\tkeywords = {manifold learning, theory, neural networks},\n\tpages = {267--287},\n\ttitle = {Approximate Geometry Representations and Sensory Fusion},\n\turl_paper = {fusion-neucing.pdf},\n\tvolume = {12},\n\tyear = {1996},\n\tBdsk-Url-1 = {http://dx.doi.org/10.1016/0925-2312(95)00116-6}}\n\n
\n
\n\n\n
\n This paper summarizes the recent advances in the theory of self-organizing development of approximate geometry representations based on the use of neural networks. Part of this work is based on the theoretical approach of (Szepesvári, 1993), which is different from that of (Martinetz, 1993) and also is somewhat more general. The Martinetz approach treats signals provided by artificial neuron-like entities whereas the present work uses the entities of the external world as its starting point. The relationship between the present work and the Martinetz approach will be detailed. We approach the problem of approximate geometry representations by first examining the problem of sensory fusion, i.e., the problem of fusing information from different transductors. A straightforward solution is the simultaneous discretization of the output of all transductors, which means the discretization of a space defined as the product of the individual transductor output spaces. However, the geometry relations are defined for the external world only, so it is still an open question how to define the metrics on the product of output spaces. It will be shown that simple Hebbian learning can result in the formation of a correct geometry representation. Some topological considerations will be presented to help us clarify the underlying concepts and assumptions. The mathematical framework gives rise to a corollary on the ``topographical mappings'' realized by Kohonen networks. In fact, the present work as well as (Martinetz, 1993) may be considered as a generalization of Kohonen's topographic maps. We develop topographic maps with self-organizing interneural connections.\n
\n\n\n
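\n The abstract above argues that simple Hebbian learning can build an approximate geometry representation on top of fused sensory signals. A toy sketch in the spirit of competitive Hebbian learning (all names below are made up) connects the two units that respond most strongly to each external event:\n <pre>
# Illustrative sketch: units discretize the fused sensory space; an edge is
# created between the two best-matching units for every observed event, so the
# resulting graph approximates the geometry of the external world.
import numpy as np

def learn_geometry(samples, units):
    """samples: (n, d) external events; units: (k, d) discretizing unit centres."""
    edges = set()
    for x in samples:
        d = np.linalg.norm(units - x, axis=1)
        first, second = np.argsort(d)[:2]       # the two co-activated units
        edges.add((int(min(first, second)), int(max(first, second))))
    return edges

rng = np.random.default_rng(0)
units = rng.uniform(size=(10, 2))               # discretization of a 2-D world
samples = rng.uniform(size=(500, 2))            # external events
print(sorted(learn_geometry(samples, units)))
</pre>\n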
\n\n\n
\n \n\n \n \n \n \n \n \n Self-organizing Multi-resolution Grid for Motion Planning and Control.\n \n \n \n \n\n\n \n Fomin, T.; Rozgonyi, T.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n International Journal of Neural Systems, 7: 757–776. 1996.\n \n\n\n\n
\n\n\n\n \n \n \"Self-organizing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{t.fomin1996,\n\tabstract = {A fully self-organizing neural network approach to low-dimensional control problems is described. We consider the problem of learning to control an object and solving the path planning problem at the same time. Control is based on the path planning model that follows the gradient of the stationary solution of a diffusion process working in the state space. Previous works are extended by introducing a self-organizing multigrid-like discretizing structure to represent the external world. Diffusion is simulated within a recurrent neural network built on this multigrid system. The novelty of the approach is that the diffusion on the multigrid is fast. Moreover, the diffusion process on the multigrid fits well the requirements of the path planning: it accelerates the diffusion in large free space regions while still keeps the resolution in small bottleneck-like labyrinths along the path. Control is achieved in the usual way: associative learning identifies the inverse dynamics of the system in a direct fashion. To this end there are introduced interneurons between neighbouring discretizing units that detect the strength of the steady-state diffusion and forward control commands to the control neurons via modifiable connections. This architecture forms the Multigrid Position-and-Direction-to-Action (MPDA) map. The architecture integrates reactive path planning and continuous motion control. It is also shown that the scheme leads to population coding for the actual command vector.},\n\tauthor = {Fomin, T. and Rozgonyi, T. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {International Journal of Neural Systems},\n\tkeywords = {control, neural networks},\n\tpages = {757--776},\n\ttimestamp = {2010.08.31},\n\ttitle = {Self-organizing Multi-resolution Grid for Motion Planning and Control},\n\turl_paper = {fomin.mo.ps.pdf},\n\tvolume = {7},\n\tyear = {1996}}\n\n
\n
\n\n\n
\n A fully self-organizing neural network approach to low-dimensional control problems is described. We consider the problem of learning to control an object and solving the path planning problem at the same time. Control is based on the path planning model that follows the gradient of the stationary solution of a diffusion process working in the state space. Previous works are extended by introducing a self-organizing multigrid-like discretizing structure to represent the external world. Diffusion is simulated within a recurrent neural network built on this multigrid system. The novelty of the approach is that the diffusion on the multigrid is fast. Moreover, the diffusion process on the multigrid fits well the requirements of the path planning: it accelerates the diffusion in large free space regions while still keeping the resolution in small bottleneck-like labyrinths along the path. Control is achieved in the usual way: associative learning identifies the inverse dynamics of the system in a direct fashion. To this end, interneurons are introduced between neighbouring discretizing units that detect the strength of the steady-state diffusion and forward control commands to the control neurons via modifiable connections. This architecture forms the Multigrid Position-and-Direction-to-Action (MPDA) map. The architecture integrates reactive path planning and continuous motion control. It is also shown that the scheme leads to population coding for the actual command vector.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1995\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Generalized Dynamic Concept Model as a Route to Construct Adaptive Autonomous Agents.\n \n \n \n \n\n\n \n Kalmár, Z.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Neural Network World, 3: 353–360. 1995.\n \n\n\n\n
\n\n\n\n \n \n \"Generalized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{kalmar1995a,\n\tabstract = {A model of adaptive autonomous agents, that (i) builds internal representation of events an d event relations, (ii) utilizes activation spreading for building dynamic concepts and (iii) makes use of the winner-take-all paradigm to come to a decision is extended by introducing generalization into the model. The generalization reduces memory requirements and improves performance in unseen scenes as it is indicated by computer simulations.},\n\tauthor = {Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Neural Network World},\n\tkeywords = {reinforcement learning, theory},\n\tpages = {353--360},\n\ttitle = {Generalized Dynamic Concept Model as a Route to Construct Adaptive Autonomous Agents},\n\turl_paper = {kalmar.dcmg.ps.pdf},\n\tvolume = {3},\n\tyear = {1995}}\n\n
\n
\n\n\n
\n A model of adaptive autonomous agents that (i) builds an internal representation of events and event relations, (ii) utilizes activation spreading for building dynamic concepts and (iii) makes use of the winner-take-all paradigm to come to a decision is extended by introducing generalization into the model. The generalization reduces memory requirements and improves performance in unseen scenes, as indicated by computer simulations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n General Framework for Reinforcement Learning.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In ICANN, volume 2, pages 165–170, 1995. \n \n\n\n\n
\n\n\n\n \n \n \"General paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1995b,\n\tabstract = {In this article we propose a general framework for sequential decision making. The framework is based on the observation that the derivation of the optimal behaviour under various decision criteria follows the same pattern: the cost of policies can be decomposed into the successive application of an operator that defines the related dynamic programming algorithm and this operator describes completely the structure of the decision problem. We take this mapping (the so called one step lookahead (OLA) cost mapping) as our starting point. This enables the unified treatment of various decision criteria (e.g. the expected value criterion or the worst-case criterion). The main result of this article says that under minimal conditions optimal stationary policies are greedy w.r.t. the optimal cost function and vice versa. Based on this result we feel that former results on reinforcement learning can be transferred to other decision criteria provided that the decision criterion is decomposable by an appropriate mapping.},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {ICANN},\n\tkeywords = {reinforcement learning, theory},\n\tpages = {165--170},\n\ttitle = {General Framework for Reinforcement Learning},\n\turl_paper = {szepes.greinf.ps.pdf},\n\tvolume = {2},\n\tyear = {1995}}\n\n
\n
\n\n\n
\n In this article we propose a general framework for sequential decision making. The framework is based on the observation that the derivation of the optimal behaviour under various decision criteria follows the same pattern: the cost of policies can be decomposed into the successive application of an operator that defines the related dynamic programming algorithm and this operator describes completely the structure of the decision problem. We take this mapping (the so called one step lookahead (OLA) cost mapping) as our starting point. This enables the unified treatment of various decision criteria (e.g. the expected value criterion or the worst-case criterion). The main result of this article says that under minimal conditions optimal stationary policies are greedy w.r.t. the optimal cost function and vice versa. Based on this result we feel that former results on reinforcement learning can be transferred to other decision criteria provided that the decision criterion is decomposable by an appropriate mapping.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1994\n \n \n (7)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Self-Organizing Neurocontrol.\n \n \n \n \n\n\n \n Fomin, T.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Proc. of IEEE WCCI ICNN'94, volume 5, pages 2777–2780, Orlando, Florida, 1994. IEEE Inc.\n \n\n\n\n
\n\n\n\n \n \n \"Self-Organizing paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{fomin1994,\n\tabstract = {Self-organizing neural network solutions to control problems are described. Competitive networks create spatial filters and geometry connections in a self-organizing fashion. The goal position, the obstacle and the object under control all create neural activities through the filters. Spreading activation that discriminates between the controlled object, the goal position and the obstacles is utilized on the internal representation. A local self-training method and Hebbian learning develop the self-organizing control connections. The algorithm provides maneuvering capability in unseen scenes.},\n\taddress = {Orlando, Florida},\n\tauthor = {Fomin, T. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {Proc. of IEEE WCCI ICNN'94},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tkeywords = {neural networks, control},\n\tpages = {2777--2780},\n\tpublisher = {IEEE Inc.},\n\ttitle = {Self-Organizing Neurocontrol},\n\turl_paper = {fomin.neucont.ps.pdf},\n\tvolume = {5},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n Self-organizing neural network solutions to control problems are described. Competitive networks create spatial filters and geometry connections in a self-organizing fashion. The goal position, the obstacle and the object under control all create neural activities through the filters. Spreading activation that discriminates between the controlled object, the goal position and the obstacles is utilized on the internal representation. A local self-training method and Hebbian learning develop the self-organizing control connections. The algorithm provides maneuvering capability in unseen scenes.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Generalization in an Autonomous Agent.\n \n \n \n \n\n\n \n Kalmár, Z.; Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Proc. of IEEE WCCI ICNN'94, volume 3, pages 1815–1817, Orlando, Florida, 06 1994. IEEE Inc.\n \n\n\n\n
\n\n\n\n \n \n \"Generalization link\n  \n \n \n \"Generalization paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{kalmar1994,\n\tabstract = {In this article we present an extension of a previously defined model [8]. This model was introduced to govern an agent in a goal-oriented fashion in a previously unknown environment. The extension allows generalization in the input space, which reduces memory requirements as well as the time requirements of the algorithm.},\n\taddress = {Orlando, Florida},\n\tauthor = {Kalm{\\'a}r, Zs. and Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {Proc. of IEEE WCCI ICNN'94},\n\tkeywords = {agent architecture, reinforcement learning},\n\tmonth = {06},\n\tpages = {1815--1817},\n\tpublisher = {IEEE Inc.},\n\ttitle = {Generalization in an Autonomous Agent},\n\turl_link = {http://ieeexplore.ieee.org/iel2/3013/8558/00374432.pdf?arnumber=374432},\n\turl_paper = {kalmar.gen.pdf},\n\tvolume = {3},\n\tyear = {1994},\n\tBdsk-Url-1 = {http://ieeexplore.ieee.org/iel2/3013/8558/00374432.pdf?arnumber=374432}}\n\n
\n
\n\n\n
\n In this article we present an extension of a previously defined model [8]. This model was introduced to govern an agent in a goal-oriented fashion in a previously unknown environment. The extension allows generalization in the input space, which reduces memory requirements as well as the time requirements of the algorithm.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Complexity of Learning: the Case of Everyday Neural Networks.\n \n \n \n \n\n\n \n Oláh, B.; and Szepesvári, C.\n\n\n \n\n\n\n In Proceedings of IEEE WCCI ICNN'94, volume 1, pages 61–65, Orlando, Florida, 06 1994. \n \n\n\n\n
\n\n\n\n \n \n \"Complexity paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{olah1994,\n\tabstract = {Nowadays artificial neural networks (ANNs) receive increasing attention. However, current computer architectures do not yet allow the implementation of large ANNs. Thus it is an important question to examine how the learning time of ANNs scales with respect to their size (and/or with the size of the tasks). Judd introduced a computational framework for the learning problem (J. Judd, ``Neural Network Design and the Complexity of Learning.'' A Bradford Book, MIT Press, Cambridge, 1990) and proved that learning in neural networks is in general too hard, i.e. in the worst case learning in neural networks is NP-complete. However, in his proof he restricts the domain of neural network architectures and tasks in such a way that ``everyday'' neural network architectures, such as that of the back-propagation algorithm, are excluded. Consequently, Judd's proof says nothing about these types of networks.\n\n\t\t  First we outline a thorough framework for loading. The framework makes it possible to differentiate between loading problems at a finer level. Two theorems are presented about the complexity of learning for ``everyday'' ANN architectures. The first theorem says that for extended binary tasks and in the worst case, the loading problem is NP-complete, while the second says that for binary tasks and basis LUF there exists a polynomial-time algorithm. These results suggest that the loading problem for ``everyday'' neural networks is interesting from the mathematical point of view, as it lies on the boundary of efficiently solvable problems.},\n\taddress = {Orlando, Florida},\n\tauthor = {Ol{\\'a}h, B. and Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proceedings of IEEE WCCI ICNN'94},\n\tkeywords = {complexity analysis, NP-completeness, neural networks, loading problem, theory},\n\tmonth = {06},\n\tpages = {61--65},\n\ttitle = {Complexity of Learning: the Case of Everyday Neural Networks},\n\turl_paper = {icnn94.ps.pdf},\n\tvolume = {1},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n Nowadays artificial neural networks (ANNs) receive increasing attention. However, current computer architectures do not yet allow the implementation of large ANNs. Thus it is an important question to examine how the learning time of ANNs scales with respect to their size (and/or with the size of the tasks). Judd introduced a computational framework for the learning problem (J. Judd, ``Neural Network Design and the Complexity of Learning.'' A Bradford Book, MIT Press, Cambridge, 1990) and proved that learning in neural networks is in general too hard, i.e. in the worst case learning in neural networks is NP-complete. However, in his proof he restricts the domain of neural network architectures and tasks in such a way that ``everyday'' neural network architectures, such as that of the back-propagation algorithm, are excluded. Consequently, Judd's proof says nothing about these types of networks. First we outline a thorough framework for loading. The framework makes it possible to differentiate between loading problems at a finer level. Two theorems are presented about the complexity of learning for ``everyday'' ANN architectures. The first theorem says that for extended binary tasks and in the worst case, the loading problem is NP-complete, while the second says that for binary tasks and basis LUF there exists a polynomial-time algorithm. These results suggest that the loading problem for ``everyday'' neural networks is interesting from the mathematical point of view, as it lies on the boundary of efficiently solvable problems.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Dynamic Concept Model Learns Optimal Policies.\n \n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n In Proc. of IEEE WCCI ICNN'94, volume 3, pages 1738–1742, Orlando, Florida, 1994. IEEE Inc.\n \n\n\n\n
\n\n\n\n \n \n \"Dynamic paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1994c,\n\tabstract = {Reinforcement learning is a flourishing field of neural methods. It has a firm theoretical basis and has been proven powerful in many applications. A brain-model-based alternative to RL has been introduced in the literature: it integrates artificial neural networks (ANN) and knowledge-based (KB) systems into one unit or agent for goal-oriented problem solving. The agent may possess inherited and learnt ANN and KB subsystems. The agent has and develops ANN cues to the environment for dimensionality reduction in order to ease the problem of combinatorial explosion. A dynamic concept model was put forward that builds cue-models of the phenomena in the world, designs action sets (concepts) and makes them compete in a neural stage to come to a decision. The competition was implemented in the form of activation spreading (AS) and a winner-take-all mechanism. The efficiency of the algorithm has been demonstrated on several examples; however, the optimality of the algorithm has not yet been proven in general. Here, a restriction to Markov decision problems (MDPs) is treated, making it possible to show the equivalence of a special AS and RL. The equivalence in this special case means that DCM has all the advantages of RL; moreover, it keeps track of more distinctions, allowing faster convergence and generalization.},\n\taddress = {Orlando, Florida},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tbooktitle = {Proc. of IEEE WCCI ICNN'94},\n\tkeywords = {reinforcement learning, theory},\n\tpages = {1738--1742},\n\tpublisher = {IEEE Inc.},\n\ttitle = {Dynamic Concept Model Learns Optimal Policies},\n\turl_paper = {szepes.dcmopt.ps.pdf},\n\tvolume = {3},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n Reinforcement learning is a flourishing field of neural methods. It has a firm theoretical basis and has been proven powerful in many applications. A brain-model-based alternative to RL has been introduced in the literature: it integrates artificial neural networks (ANN) and knowledge-based (KB) systems into one unit or agent for goal-oriented problem solving. The agent may possess inherited and learnt ANN and KB subsystems. The agent has and develops ANN cues to the environment for dimensionality reduction in order to ease the problem of combinatorial explosion. A dynamic concept model was put forward that builds cue-models of the phenomena in the world, designs action sets (concepts) and makes them compete in a neural stage to come to a decision. The competition was implemented in the form of activation spreading (AS) and a winner-take-all mechanism. The efficiency of the algorithm has been demonstrated on several examples; however, the optimality of the algorithm has not yet been proven in general. Here, a restriction to Markov decision problems (MDPs) is treated, making it possible to show the equivalence of a special AS and RL. The equivalence in this special case means that DCM has all the advantages of RL; moreover, it keeps track of more distinctions, allowing faster convergence and generalization.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Topology Learning Solved by Extended Objects: a Neural Network Model.\n \n \n \n \n\n\n \n Szepesvári, C.; Balázs, L.; and Lörincz, A.\n\n\n \n\n\n\n Neural Computation, 6(3): 441–458. 1994.\n \n\n\n\n
\n\n\n\n \n \n \"Topology paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1994d,\n\tabstract = {It is shown that local, extended objects of a metrical topological space shape the receptive fields of competitive neurons to local filters. Self-organized topology learning is then solved with the help of Hebbian learning together with extended objects that provide unique information about neighborhood relations. A topographical map is deduced and is used to speed up further adaptation in a changing environment with the help of Kohonen type learning that teaches the neighbors of winning neurons as well.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and Bal{\\'a}zs, L. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tjournal = {Neural Computation},\n\tkeywords = {manifold learning, neural networks},\n\tnumber = {3},\n\tpages = {441--458},\n\ttitle = {Topology Learning Solved by Extended Objects: a Neural Network Model},\n\turl_paper = {toplearn94.ps.pdf},\n\tvolume = {6},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n It is shown that local, extended objects of a metrical topological space shape the receptive fields of competitive neurons to local filters. Self-organized topology learning is then solved with the help of Hebbian learning together with extended objects that provide unique information about neighborhood relations. A topographical map is deduced and is used to speed up further adaptation in a changing environment with the help of Kohonen type learning that teaches the neighbors of winning neurons as well.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Self-organized Learning of 3 Dimensions.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Marinaro, M.; and Morasso, P., editor(s), ICANN, volume 2, pages 671–674, Sorrento, Italy, 1994. IEEE\n \n\n\n\n
\n\n\n\n \n \n \"Self-organized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1994a,\n\tabstract = {The geometric learning capabilities of a competitive neural network are studied. It is shown that the appropriate selection of a neural activity function enables the learning of the 3D geometry of a world, from two of the 2D projections of 3D extended objects},\n\taddress = {Sorrento, Italy},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {ICANN},\n\teditor = {Marinaro, M. and Morasso, P.G.},\n\tkeywords = {manifold learning, theory, neural networks},\n\tpages = {671--674},\n\tpublisher = {IEEE},\n\ttitle = {Self-organized Learning of 3 Dimensions},\n\turl_paper = {icann94.ps.pdf},\n\tvolume = {2},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n The geometric learning capabilities of a competitive neural network are studied. It is shown that the appropriate selection of a neural activity function enables the learning of the 3D geometry of a world, from two of the 2D projections of 3D extended objects\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Behavior of an Adaptive Self-organizing Autonomous Agent Working with Cues and Competing Concepts.\n \n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n Adaptive Behavior, 2(2): 131–160. 1994.\n \n\n\n\n
\n\n\n\n \n \n \"Behavior paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{Szepesvari1994,\n\tabstract = {The concepts are presented of a neural model based shell that integrates artificial neural networks (ANN) and artificial intelligence (AI) for problem solving. The shell may possess inherited and learnt ANN and AI subsystems. The shell has and develops (i) cues to the environment for dimensionality reduction, (ii) rules between elements of the reduced dimensional internal representation, (iii) `concepts' for achieving goals, i.e. for solving existing problems, (iv) the shell then causes the concepts to compete in order to come to a decision. The shell is designed for control problems, e.g. robotic tasks, control of plants, investment advisory systems, and may have very different ANN and AI parts. Here, we consider a simple robotic-like object in two dimensional space.},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:15 -0600},\n\tjournal = {Adaptive Behavior},\n\tkeywords = {agent architecture, reinforcement learning},\n\tnumber = {2},\n\tpages = {131--160},\n\ttitle = {Behavior of an Adaptive Self-organizing Autonomous Agent Working with Cues and Competing Concepts},\n\turl_paper = {annai94.pdf},\n\tvolume = {2},\n\tyear = {1994}}\n\n
\n
\n\n\n
\n The concepts are presented of a neural model based shell that integrates artificial neural networks (ANN) and artificial intelligence (AI) for problem solving. The shell may possess inherited and learnt ANN and AI subsystems. The shell has and develops (i) cues to the environment for dimensionality reduction, (ii) rules between elements of the reduced dimensional internal representation, (iii) `concepts' for achieving goals, i.e. for solving existing problems, (iv) the shell then causes the concepts to compete in order to come to a decision. The shell is designed for control problems, e.g. robotic tasks, control of plants, investment advisory systems, and may have very different ANN and AI parts. Here, we consider a simple robotic-like object in two dimensional space.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 1993\n \n \n (4)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n The Role of Local Connections in Competitive Neural Networks: Collective Learning and Representation of Geometry.\n \n \n \n\n\n \n Szepesvári, C.\n\n\n \n\n\n\n Master's thesis, Attila József University of Szeged, Szeged, Hungary, 1993.\n in Hungarian\n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@mastersthesis{Szepesvari1993d,\n\taddress = {Szeged, Hungary},\n\tauthor = {Szepesv{\\'a}ri, Cs.},\n\tdate-added = {2010-08-28 17:38:14 -0600},\n\tdate-modified = {2010-09-02 13:09:16 -0600},\n\tkeywords = {manifold learning, neural networks, theory},\n\tnote = {in Hungarian},\n\tschool = {Attila J{\\'o}zsef University of Szeged},\n\ttitle = {The Role of Local Connections in Competitive Neural Networks: Collective Learning and Representation of Geometry},\n\tyear = {1993}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Integration of Artificial Neural Networks and Dynamic Concepts to an Adaptive and Self-organizing Agent.\n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Gielen, S.; and Kappen, B., editor(s), ICANN, Amsterdam, The Netherlands, 09 1993. Springer-Verlag, London\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1993a,\n\taddress = {Amsterdam, The Netherlands},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {ICANN},\n\teditor = {Gielen, S. and Kappen, B.},\n\tkeywords = {agent architecture, reinforcement learning},\n\tmonth = {09},\n\tpublisher = {Springer-Verlag, London},\n\ttitle = {Integration of Artificial Neural Networks and Dynamic Concepts to an Adaptive and Self-organizing Agent},\n\tyear = {1993}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Integration of ANN Cues, Dynamic AI Concepts and ANN Decision System into an Adaptive Self-Organizing Agent.\n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Koch, P., editor(s), 3rd Conf. on Artificial Intelligence, pages 231–237, Budapest, Hungary, April 6–8 1993. John von Neumann Society for Computer Sciences\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1993c,\n\taddress = {Budapest, Hungary},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {3rd Conf. on Artificial Intelligence},\n\teditor = {Koch, P.},\n\tkeywords = {agent architecture, reinforcement learning},\n\tmonth = {April 6--8},\n\tpages = {231--237},\n\tpublisher = {John von Neumann Society for Computer Sciences},\n\ttitle = {Integration of ANN Cues, Dynamic AI Concepts and ANN Decision System into an Adaptive Self-Organizing Agent},\n\tyear = {1993}}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n Integration of Artificial Neural Networks and Dynamic Concepts to an Adaptive and Self-organizing Agent.\n \n \n \n\n\n \n Szepesvári, C.; and Lörincz, A.\n\n\n \n\n\n\n In Proc. of WCNN'93, volume 1, pages 524–527, Portland, Oregon, USA, 07 1993. Lawrence Erlbaum Associates, Inc. Publishers, New Jersey\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{Szepesvari1993,\n\taddress = {Portland, Oregon, USA},\n\tauthor = {Szepesv{\\'a}ri, Cs. and L{\\"o}rincz, A.},\n\tbooktitle = {Proc. of WCNN'93},\n\tday = {11--15},\n\tkeywords = {agent architecture, reinforcement learning},\n\tmonth = {07},\n\tpages = {524--527},\n\tpublisher = {Lawrence Erlbaum Associates, Inc. Publishers, New Jersey},\n\ttitle = {Integration of Artificial Neural Networks and Dynamic Concepts to an Adaptive and Self-organizing Agent},\n\tvolume = {1},\n\tyear = {1993}}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);