2024 (6)
Discord in the voter model for complex networks.
Vendeville, A.; Zhou, S.; and Guedj, B.
Physical Review E, 109. 2024.

@article{vendeville2022depolarising,
  title = {Discord in the voter model for complex networks},
  author = {Antoine Vendeville and Shi Zhou and Benjamin Guedj},
  year = {2024},
  journal = {Physical Review E},
  volume = {109},
  issue = {2},
  abstract = {Online social networks have become primary means of communication. As they often exhibit undesirable effects such as hostility, polarisation or echo chambers, it is crucial to develop analytical tools that help us better understand them. In this paper, we are interested in the evolution of discord in social networks. Formally, we introduce a method to calculate the probability of discord between any two agents in the multi-state voter model with and without zealots. Our work applies to any directed, weighted graph with any finite number of possible opinions, allows for various update rates across agents, and does not imply any approximation. Under certain topological conditions, their opinions are independent and the joint distribution can be decoupled. Otherwise, the evolution of discord probabilities is described by a linear system of ordinary differential equations. We prove the existence of a unique equilibrium solution, which can be computed via an iterative algorithm. The classical definition of active links density is generalized to take into account long-range, weighted interactions. We illustrate our findings on real-life and synthetic networks. In particular, we investigate the impact of clustering on discord, and uncover a rich landscape of varied behaviors in polarised networks. This sheds lights on the evolution of discord between, and within, antagonistic communities.},
  url = {https://link.aps.org/doi/10.1103/PhysRevE.109.024312},
  url_PDF = {https://journals.aps.org/pre/pdf/10.1103/PhysRevE.109.024312},
  url_arXiv = {https://arxiv.org/abs/2203.02002},
  url_Code = {https://github.com/antoinevendeville/howopinionscrystallise},
  eprint = {2203.02002},
  doi = {10.1103/PhysRevE.109.024312},
  archivePrefix = {arXiv},
  primaryClass = {cs.SI},
  keywords = {mine}
}
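For a concrete feel of the model above, the following is a minimal Monte Carlo sketch of discord in a multi-state voter model with zealots. It is an illustrative toy (the random graph, parameter values and variable names are all chosen here for the example) and not the authors' implementation, which is linked in the entry (url_Code).

# Illustration only: estimate the discord probability P(x_i != x_j) in a
# multi-state voter model with zealots by Monte Carlo simulation on a small
# random weighted directed graph.
import numpy as np

rng = np.random.default_rng(0)
n, k, steps, runs = 30, 3, 2000, 200              # agents, opinions, updates, repetitions
W = rng.random((n, n)) * (rng.random((n, n)) < 0.2)   # weighted directed adjacency
np.fill_diagonal(W, 0.0)
zealots = {0: 0, 1: 2}                            # agent -> fixed opinion

def run_once():
    x = rng.integers(0, k, size=n)
    for agent, opinion in zealots.items():
        x[agent] = opinion
    for _ in range(steps):
        i = rng.integers(n)                       # agent updating at this step
        if i in zealots or W[i].sum() == 0:
            continue
        p = W[i] / W[i].sum()                     # copy a neighbour chosen by edge weight
        x[i] = x[rng.choice(n, p=p)]
    return x

samples = np.array([run_once() for _ in range(runs)])
discord = (samples[:, :, None] != samples[:, None, :]).mean(axis=0)
print("estimated discord probability between agents 2 and 3:", discord[2, 3])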
Comparing Comparators in Generalization Bounds.
Hellström, F.; and Guedj, B.
In Proceedings of The 27th International Conference on Artificial Intelligence and Statistics [AISTATS], 2024.

@inproceedings{hellstrom2023comparing,
  title = {Comparing Comparators in Generalization Bounds},
  author = {Fredrik Hellström and Benjamin Guedj},
  year = {2024},
  booktitle = {Proceedings of The 27th International Conference on Artificial Intelligence and Statistics [AISTATS]},
  abstract = {We derive generic information-theoretic and PAC-Bayesian generalization bounds involving an arbitrary convex \emph{comparator} function, which measures the discrepancy between the training loss and the population loss. The bounds hold under the assumption that the cumulant-generating function (CGF) of the comparator is upper-bounded by the corresponding CGF within a family of bounding distributions. We show that the tightest possible bound is obtained with the comparator being the convex conjugate of the CGF of the bounding distribution, also known as the Cram\'er function. This conclusion applies more broadly to generalization bounds with a similar structure. This confirms the near-optimality of known bounds for bounded and sub-Gaussian losses and leads to novel bounds under other bounding distributions.},
  url = {https://arxiv.org/abs/2310.10534},
  url_PDF = {https://arxiv.org/pdf/2310.10534.pdf},
  doi = {10.48550/arXiv.2310.10534},
  eprint = {2310.10534},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
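As background for the entry above: the Cramér function mentioned in the abstract is the convex conjugate (Legendre-Fenchel transform) of the cumulant-generating function. The standard definitions are recalled here for reference only; the paper's precise bound is not reproduced.

% Standard definitions (background, not the paper's theorem).
% CGF of a bounding distribution with law \mu:
\Lambda(\lambda) \;=\; \log \mathbb{E}_{X \sim \mu}\!\left[ e^{\lambda X} \right],
% Cram\'er function = convex conjugate of the CGF:
\Lambda^{*}(x) \;=\; \sup_{\lambda \in \mathbb{R}} \bigl\{ \lambda x - \Lambda(\lambda) \bigr\}.
% Example: for a centred Gaussian bounding distribution with variance \sigma^2,
% \Lambda(\lambda) = \lambda^2 \sigma^2 / 2 and \Lambda^{*}(x) = x^2 / (2\sigma^2),
% which recovers the familiar sub-Gaussian comparator.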
Closed-form filtering for non-linear systems.
Cantelobre, T.; Ciliberto, C.; Guedj, B.; and Rudi, A.
2024. Submitted.

@unpublished{cantelobre2024hmm,
  title = {Closed-form filtering for non-linear systems},
  author = {Cantelobre, Th{\'e}ophile and Ciliberto, Carlo and Guedj, Benjamin and Rudi, Alessandro},
  year = {2024},
  note = "Submitted.",
  abstract = {Sequential Bayesian Filtering aims to estimate the current state distribution of a Hidden Markov Model, given the past observations. The problem is well-known to be intractable for most application domains, except in notable cases such as the tabular setting or for linear dynamical systems with gaussian noise. In this work, we propose a new class of filters based on Gaussian PSD Models, which offer several advantages in terms of density approximation and computational efficiency. We show that filtering can be efficiently performed in closed form when transitions and observations are Gaussian PSD Models. When the transition and observations are approximated by Gaussian PSD Models, we show that our proposed estimator enjoys strong theoretical guarantees, with estimation error that depends on the quality of the approximation and is adaptive to the regularity of the transition probabilities. In particular, we identify regimes in which our proposed filter attains a TV $\epsilon$-error with memory and computational complexity of $O(\epsilon^{-1})$ and $O(\epsilon^{-3/2})$ respectively, including the offline learning step, in contrast to the $O(\epsilon^{-2})$ complexity of sampling methods such as particle filtering.},
  url = {https://arxiv.org/abs/2402.09796},
  url_PDF = {https://arxiv.org/pdf/2402.09796.pdf},
  doi = {10.48550/ARXIV.2402.09796},
  eprint = {2402.09796},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution 4.0 International},
  keywords = {mine}
}
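For context, the generic Bayesian filtering recursion that the paper above renders closed-form for Gaussian PSD models is the standard predict/update pair, stated here in its textbook form rather than in the paper's notation.

% Standard Bayesian filtering recursion for a hidden Markov model (generic background).
% Predict step:
p(x_t \mid y_{1:t-1}) \;=\; \int p(x_t \mid x_{t-1})\, p(x_{t-1} \mid y_{1:t-1}) \,\mathrm{d}x_{t-1},
% Update step:
p(x_t \mid y_{1:t}) \;=\; \frac{p(y_t \mid x_t)\, p(x_t \mid y_{1:t-1})}{\int p(y_t \mid x)\, p(x \mid y_{1:t-1}) \,\mathrm{d}x}.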
Tighter Generalisation Bounds via Interpolation.
Viallard, P.; Haddouche, M.; Şimşekli, U.; and Guedj, B.
2024. Submitted.

@unpublished{viallard2024interpolation,
  title = {Tighter Generalisation Bounds via Interpolation},
  author = {Viallard, Paul and Haddouche, Maxime and Şimşekli, Umut and Guedj, Benjamin},
  year = {2024},
  note = "Submitted.",
  abstract = {This paper contains a recipe for deriving new PAC-Bayes generalisation bounds based on the $(f, \Gamma)$-divergence, and, in addition, presents PAC-Bayes generalisation bounds where we interpolate between a series of probability divergences (including but not limited to KL, Wasserstein, and total variation), making the best out of many worlds depending on the posterior distributions properties. We explore the tightness of these bounds and connect them to earlier results from statistical learning, which are specific cases. We also instantiate our bounds as training objectives, yielding non-trivial guarantees and practical performances.},
  url = {https://arxiv.org/abs/2402.05101},
  url_PDF = {https://arxiv.org/pdf/2402.05101.pdf},
  doi = {10.48550/ARXIV.2402.05101},
  eprint = {2402.05101},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution 4.0 International},
  keywords = {mine}
}
A PAC-Bayesian Link Between Generalisation and Flat Minima.
Haddouche, M.; Viallard, P.; Şimşekli, U.; and Guedj, B.
2024. Submitted.

@unpublished{haddouche2024pls,
  title = {A PAC-Bayesian Link Between Generalisation and Flat Minima},
  author = {Haddouche, Maxime and Viallard, Paul and Şimşekli, Umut and Guedj, Benjamin},
  year = {2024},
  note = "Submitted.",
  abstract = {Modern machine learning usually involves predictors in the overparametrised setting (number of trained parameters greater than dataset size), and their training yield not only good performances on training data, but also good generalisation capacity. This phenomenon challenges many theoretical results, and remains an open problem. To reach a better understanding, we provide novel generalisation bounds involving gradient terms. To do so, we combine the PAC-Bayes toolbox with Poincaré and Log-Sobolev inequalities, avoiding an explicit dependency on dimension of the predictor space. Our results highlight the positive influence of \emph{flat minima} (being minima with a neighbourhood nearly minimising the learning problem as well) on generalisation performances, involving directly the benefits of the optimisation phase.},
  url = {https://arxiv.org/abs/2402.08508},
  url_PDF = {https://arxiv.org/pdf/2402.08508.pdf},
  doi = {10.48550/ARXIV.2402.08508},
  eprint = {2402.08508},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution 4.0 International},
  keywords = {mine}
}
On Generalisation and Learning: Contributions to Statistical Learning Theory.
Guedj, B.
Ph.D. Thesis, 2024. Habilitation à diriger des recherches.

@phdthesis{guedj2024hdr,
  title = {On Generalisation and Learning: Contributions to Statistical Learning Theory},
  author = {Guedj, Benjamin},
  note = {Habilitation à diriger des recherches},
  url = {},
  school = {},
  year = {2024},
  month = {},
  abstract = {},
  TYPE = {Theses},
  url_PDF = {}
}
2023 (14)
Model Validation Using Mutated Training Labels: An Exploratory Study.
Zhang, J. M.; Harman, M.; Guedj, B.; Barr, E. T.; and Shawe-Taylor, J.
Neurocomputing, 539. 2023.

@article{zhang2019perturbation,
  title = {Model Validation Using Mutated Training Labels: An Exploratory Study},
  author = {Jie M. Zhang and Mark Harman and Benjamin Guedj and Earl T. Barr and John Shawe-Taylor},
  year = {2023},
  journal = {Neurocomputing},
  volume = {539},
  doi = {10.1016/j.neucom.2023.02.042},
  abstract = {We introduce an exploratory study on Mutation Validation (MV), a model validation method using mutated training labels for supervised learning. MV mutates training data labels, retrains the model against the mutated data, and then uses the metamorphic relation that captures the consequent training performance changes to assess model fit. It does not use a validation set or test set. The intuition underpinning MV is that overfitting models tend to fit noise in the training data. MV does not aim to replace out-of-sample validation. Instead, we provide the first exploratory study on the possibility of using MV as a complement of out-of-sample validation. We explore 8 different learning algorithms, 18 datasets, and 5 types of hyperparameter tuning tasks. Our results demonstrate that MV complements well cross-validation and test accuracy in model selection and hyperparameter tuning tasks. MV deserves more attention from developers when simplicity, sustainaiblity, security (e.g., defending training data attack), and interpretability of the built models are required.},
  url = "https://www.sciencedirect.com/science/article/abs/pii/S0925231223001911",
  url_arXiv = "https://arxiv.org/abs/1905.10201",
  url_PDF = "https://arxiv.org/pdf/1905.10201.pdf",
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  keywords = {mine}
}
Cluster-Specific Predictions with Multi-Task Gaussian Processes.
Leroy, A.; Latouche, P.; Guedj, B.; and Gey, S.
Journal of Machine Learning Research [JMLR], 24(5): 1–49. 2023.

@article{leroy2020clusterspecific,
  title = {Cluster-Specific Predictions with Multi-Task {Gaussian} Processes},
  author = {Arthur Leroy and Pierre Latouche and Benjamin Guedj and Servane Gey},
  year = {2023},
  journal = {Journal of Machine Learning Research [JMLR]},
  abstract = {A model involving Gaussian processes (GPs) is introduced to simultaneously handle multitask learning, clustering, and prediction for multiple functional data. This procedure acts as a model-based clustering method for functional data as well as a learning step for subsequent predictions for new tasks. The model is instantiated as a mixture of multi-task GPs with common mean processes. A variational EM algorithm is derived for dealing with the optimisation of the hyper-parameters along with the hyper-posteriors’ estimation of latent variables and processes. We establish explicit formulas for integrating the mean processes and the latent clustering variables within a predictive distribution, accounting for uncertainty in both aspects. This distribution is defined as a mixture of cluster-specific GP predictions, which enhances the performance when dealing with group-structured data. The model handles irregular grids of observations and offers different hypotheses on the covariance structure for sharing additional information across tasks. The performances on both clustering and prediction tasks are assessed through various simulated scenarios and real data sets. The overall algorithm, called MagmaClust, is publicly available as an R package.},
  volume = {24},
  number = {5},
  pages = {1--49},
  url = "https://jmlr.org/papers/v24/20-1321.html",
  url_arXiv = "https://arxiv.org/abs/2011.07866",
  url_PDF = "https://jmlr.org/papers/volume24/20-1321/20-1321.pdf",
  url_Code = "https://github.com/ArthurLeroy/MagmaClustR",
  eprint = {2011.07866},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  keywords = {mine}
}
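As generic background for the entry above (not the MagmaClust predictive distribution itself, which is a mixture of cluster-specific predictions), the standard single-task GP regression posterior on which such constructions build is recalled below.

% Standard single-task GP regression posterior (generic background only).
% Given training inputs X, outputs y, kernel k, noise variance \sigma^2 and test input x_*:
\mu_*(x_*) \;=\; k(x_*, X)\,\bigl[ k(X, X) + \sigma^2 I \bigr]^{-1} y,
\qquad
\sigma_*^2(x_*) \;=\; k(x_*, x_*) - k(x_*, X)\,\bigl[ k(X, X) + \sigma^2 I \bigr]^{-1} k(X, x_*).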
MMD Aggregated Two-Sample Test.
Schrab, A.; Kim, I.; Albert, M.; Laurent, B.; Guedj, B.; and Gretton, A.
Journal of Machine Learning Research [JMLR], 24(194): 1–81. 2023.

@article{schrab2021mmd,
  title = {{MMD} Aggregated Two-Sample Test},
  author = {Antonin Schrab and Ilmun Kim and Mélisande Albert and Béatrice Laurent and Benjamin Guedj and Arthur Gretton},
  year = {2023},
  journal = {Journal of Machine Learning Research [JMLR]},
  abstract = {We propose two novel nonparametric two-sample kernel tests based on the Maximum Mean Discrepancy (MMD). First, for a fixed kernel, we construct an MMD test using either permutations or a wild bootstrap, two popular numerical procedures to determine the test threshold. We prove that this test controls the probability of type I error non-asymptotically. Hence, it can be used reliably even in settings with small sample sizes as it remains well-calibrated, which differs from previous MMD tests which only guarantee correct test level asymptotically. When the difference in densities lies in a Sobolev ball, we prove minimax optimality of our MMD test with a specific kernel depending on the smoothness parameter of the Sobolev ball. In practice, this parameter is unknown and, hence, the optimal MMD test with this particular kernel cannot be used. To overcome this issue, we construct an aggregated test, called MMDAgg, which is adaptive to the smoothness parameter. The test power is maximised over the collection of kernels used, without requiring held-out data for kernel selection (which results in a loss of test power), or arbitrary kernel choices such as the median heuristic. We prove that MMDAgg still controls the level non-asymptotically, and achieves the minimax rate over Sobolev balls, up to an iterated logarithmic term. Our guarantees are not restricted to a specific type of kernel, but hold for any product of one-dimensional translation invariant characteristic kernels. We provide a user-friendly parameter-free implementation of MMDAgg using an adaptive collection of bandwidths. We demonstrate that MMDAgg significantly outperforms alternative state-of-the-art MMD-based two-sample tests on synthetic data satisfying the Sobolev smoothness assumption, and that, on real-world image data, MMDAgg closely matches the power of tests leveraging the use of models such as neural networks.},
  url = {https://jmlr.org/papers/v24/21-1289.html},
  url_PDF = {https://jmlr.org/papers/volume24/21-1289/21-1289.pdf},
  url_arXiv = {https://arxiv.org/abs/2110.15073},
  url_Code = {https://github.com/antoninschrab/mmdagg-paper},
  url_Slides = {https://antoninschrab.github.io/files/Slides-31-05-22.pdf},
  url_Poster = {https://antoninschrab.github.io/files/Poster_MMDAgg_KSDAgg.pdf},
  url_Posterbis = {https://antoninschrab.github.io/files/Simplified_Poster.pdf},
  url_Video = {https://youtu.be/OWh6Hj10wsY},
  eprint = {2110.15073},
  volume = {24},
  number = {194},
  pages = {1--81},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  keywords = {mine}
}
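To illustrate the basic quantity aggregated by MMDAgg, here is a generic quadratic-time unbiased MMD estimate with a single Gaussian kernel and a permutation threshold. This is a toy sketch (kernel choice, bandwidth and sample sizes are arbitrary values picked for the example), not the MMDAgg procedure or the authors' package, which is linked in the entry (url_Code).

# Generic two-sample MMD test with one Gaussian kernel and a permutation
# threshold (illustration only; MMDAgg aggregates over a collection of kernels).
import numpy as np

def gaussian_kernel(a, b, bandwidth):
    d2 = ((a[:, None, :] - b[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2 / (2.0 * bandwidth ** 2))

def mmd2_unbiased(x, y, bandwidth):
    kxx = gaussian_kernel(x, x, bandwidth)
    kyy = gaussian_kernel(y, y, bandwidth)
    kxy = gaussian_kernel(x, y, bandwidth)
    n, m = len(x), len(y)
    np.fill_diagonal(kxx, 0.0)        # drop diagonal terms for the unbiased estimate
    np.fill_diagonal(kyy, 0.0)
    return kxx.sum() / (n * (n - 1)) + kyy.sum() / (m * (m - 1)) - 2.0 * kxy.mean()

def permutation_test(x, y, bandwidth=1.0, level=0.05, n_perm=500, seed=0):
    rng = np.random.default_rng(seed)
    observed = mmd2_unbiased(x, y, bandwidth)
    pooled = np.vstack([x, y])
    stats = []
    for _ in range(n_perm):
        idx = rng.permutation(len(pooled))
        stats.append(mmd2_unbiased(pooled[idx[:len(x)]], pooled[idx[len(x):]], bandwidth))
    p_value = (1 + sum(s >= observed for s in stats)) / (1 + n_perm)
    return observed, p_value, p_value <= level

rng = np.random.default_rng(1)
x = rng.normal(0.0, 1.0, size=(100, 2))
y = rng.normal(0.5, 1.0, size=(100, 2))   # shifted mean: the test should reject
print(permutation_test(x, y))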
Opening up echo chambers via optimal content recommendation.
Vendeville, A.; Giovanidis, A.; Papanastasiou, E.; and Guedj, B.
In Cherifi, H.; Mantegna, R. N.; Rocha, L. M.; Cherifi, C.; and Miccichè, S., editors, Complex Networks and Their Applications XI, pages 74–85, Cham, 2023. Springer International Publishing.

@inproceedings{vendeville2022online,
  title = {Opening up echo chambers via optimal content recommendation},
  author = {Antoine Vendeville and Anastasios Giovanidis and Effrosyni Papanastasiou and Benjamin Guedj},
  year = {2023},
  editor = "Cherifi, Hocine and Mantegna, Rosario Nunzio and Rocha, Luis M. and Cherifi, Chantal and Miccich{\`e}, Salvatore",
  booktitle = {Complex Networks and Their Applications XI},
  abstract = {Online social platforms have become central in the political debate. In this context, the existence of echo chambers is a problem of primary relevance. These clusters of like-minded individuals tend to reinforce prior beliefs, elicit animosity towards others and aggravate the spread of misinformation. We study this phenomenon on a Twitter dataset related to the 2017 French presidential elections and propose a method to tackle it with content recommendations. We use a quadratic program to find optimal recommendations that maximise the diversity of content users are exposed to, while still accounting for their preferences. Our method relies on a theoretical model that can sufficiently describe how content flows through the platform. We show that the model provides good approximations of empirical measures and demonstrate the effectiveness of the optimisation algorithm at mitigating the echo chamber effect on this dataset, even with limited budget for recommendations.},
  url = {https://link.springer.com/chapter/10.1007/978-3-031-21127-0_7},
  url_arXiv = {https://arxiv.org/abs/2206.03859},
  url_PDF = {https://arxiv.org/pdf/2206.03859.pdf},
  eprint = {2206.03859},
  doi = {10.1007/978-3-031-21127-0_7},
  archivePrefix = {arXiv},
  primaryClass = {cs.SI},
  isbn = "978-3-031-21127-0",
  publisher = "Springer International Publishing",
  address = "Cham",
  pages = "74--85",
  keywords = {mine}
}
PAC-Bayes Generalisation Bounds for Heavy-Tailed Losses through Supermartingales.
Haddouche, M.; and Guedj, B.
Transactions on Machine Learning Research [TMLR]. 2023.

@article{haddouche2022supermartingales,
  title = {{PAC}-Bayes Generalisation Bounds for Heavy-Tailed Losses through Supermartingales},
  author = {Haddouche, Maxime and Guedj, Benjamin},
  year = {2023},
  journal = {Transactions on Machine Learning Research [TMLR]},
  abstract = {While PAC-Bayes is now an established learning framework for bounded losses, its extension to the case of unbounded losses (as simple as the squared loss on an unbounded space) remains largely uncharted and has attracted a growing interest in recent years. We contribute to this line of work by developing an extention of Markov's inequality for supermartingales, which we use to establish a novel PAC-Bayesian generalisation bound holding for unbounded losses. We show that this bound extends, unifies and even improves on existing PAC-Bayesian bounds.},
  issn = {2835-8856},
  url = {https://openreview.net/forum?id=qxrwt6F3sf},
  url_arXiv = {https://arxiv.org/abs/2210.00928},
  url_PDF = {https://openreview.net/pdf?id=qxrwt6F3sf},
  doi = {10.48550/ARXIV.2210.00928},
  eprint = {2210.00928},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
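For orientation on what "PAC-Bayes for bounded losses" refers to in the abstract above, a classical McAllester-type bound is stated below. This is a standard textbook result given only for context; it is not the supermartingale bound of the paper.

% Classical PAC-Bayesian bound for losses in [0,1] (standard result, for context only).
% With probability at least 1-\delta over an i.i.d. sample S of size n, simultaneously for all
% posteriors Q over predictors (P a fixed prior, R the population risk, \hat{R}_S the empirical risk):
R(Q) \;\le\; \hat{R}_S(Q) \;+\; \sqrt{\frac{\mathrm{KL}(Q \,\|\, P) + \ln\!\frac{2\sqrt{n}}{\delta}}{2n}}.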
Tighter PAC-Bayes Generalisation Bounds by Leveraging Example Difficulty.
Biggs, F.; and Guedj, B.
In Proceedings of The 26th International Conference on Artificial Intelligence and Statistics [AISTATS], volume 206, pages 8165–8182, 2023. PMLR.

@inproceedings{biggs2022examples,
  title = {Tighter {PAC-Bayes} Generalisation Bounds by Leveraging Example Difficulty},
  author = {Biggs, Felix and Guedj, Benjamin},
  year = {2023},
  booktitle = {Proceedings of The 26th International Conference on Artificial Intelligence and Statistics [AISTATS]},
  volume = {206},
  publisher = {PMLR},
  pages = {8165-8182},
  abstract = {We introduce a modified version of the excess risk, which can be used to obtain tighter, fast-rate PAC-Bayesian generalisation bounds. This modified excess risk leverages information about the relative hardness of data examples to reduce the variance of its empirical counterpart, tightening the bound. We combine this with a new bound for $[-1,1]$-valued (and potentially non-independent) signed losses, which is more favourable when they empirically have low variance around $0$. The primary new technical tool is a novel result for sequences of interdependent random vectors which may be of independent interest. We empirically evaluate these new bounds on a number of real-world datasets.},
  url = {https://proceedings.mlr.press/v206/biggs23a.html},
  url_PDF = {https://proceedings.mlr.press/v206/biggs23a/biggs23a.pdf},
  url_arXiv = {https://arxiv.org/abs/2210.11289},
  url_Code = {https://github.com/biggs/tighter-pac-bayes-difficulty},
  doi = {10.48550/arXiv.2210.11289},
  eprint = {2210.11289},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
Optimistically Tempered Online Learning.
Haddouche, M.; Guedj, B.; and Wintenberger, O.
2023. Submitted.

@unpublished{haddouche2023optimistic,
  title = {Optimistically Tempered Online Learning},
  author = {Haddouche, Maxime and Guedj, Benjamin and Wintenberger, Olivier},
  year = {2023},
  note = "Submitted.",
  abstract = {Optimistic Online Learning algorithms have been developed to exploit expert advices, assumed optimistically to be always useful. However, it is legitimate to question the relevance of such advices \emph{w.r.t.} the learning information provided by gradient-based online algorithms. In this work, we challenge the confidence assumption on the expert and develop the \emph{optimistically tempered} (OT) online learning framework as well as OT adaptations of online algorithms. Our algorithms come with sound theoretical guarantees in the form of dynamic regret bounds, and we eventually provide experimental validation of the usefulness of the OT approach.},
  url = {https://arxiv.org/abs/2301.07530},
  url_PDF = {https://arxiv.org/pdf/2301.07530.pdf},
  doi = {10.48550/arXiv.2301.07530},
  eprint = {2301.07530},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
Wasserstein PAC-Bayes Learning: Exploiting Optimisation Guarantees to Explain Generalisation.
Haddouche, M.; and Guedj, B.
2023. Submitted.

@unpublished{haddouche2023wasserstein,
  title = {Wasserstein PAC-Bayes Learning: Exploiting Optimisation Guarantees to Explain Generalisation},
  author = {Haddouche, Maxime and Guedj, Benjamin},
  year = {2023},
  note = "Submitted.",
  abstract = {PAC-Bayes learning is an established framework to both assess the generalisation ability of learning algorithms, and design new learning algorithm by exploiting generalisation bounds as training objectives. Most of the exisiting bounds involve a \emph{Kullback-Leibler} (KL) divergence, which fails to capture the geometric properties of the loss function which are often useful in optimisation. We address this by extending the emerging \emph{Wasserstein PAC-Bayes} theory. We develop new PAC-Bayes bounds with Wasserstein distances replacing the usual KL, and demonstrate that sound optimisation guarantees translate to good generalisation abilities. In particular we provide generalisation bounds for the \emph{Bures-Wasserstein SGD} by exploiting its optimisation properties.},
  url = {https://arxiv.org/abs/2304.07048},
  url_PDF = {https://arxiv.org/pdf/2304.07048.pdf},
  doi = {10.48550/arXiv.2304.07048},
  eprint = {2304.07048},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
Learning via Wasserstein-Based High Probability Generalisation Bounds.
Viallard, P.; Haddouche, M.; Şimşekli, U.; and Guedj, B.
In Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems [NeurIPS] 2023, 2023.

@inproceedings{viallard2023wasserstein,
  title = {Learning via {Wasserstein}-Based High Probability Generalisation Bounds},
  author = {Viallard, Paul and Haddouche, Maxime and Şimşekli, Umut and Guedj, Benjamin},
  year = {2023},
  booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference on Neural Information Processing Systems [NeurIPS] 2023},
  abstract = {Minimising upper bounds on the population risk or the generalisation gap has been widely used in structural risk minimisation (SRM) - this is in particular at the core of PAC-Bayesian learning. Despite its successes and unfailing surge of interest in recent years, a limitation of the PAC-Bayesian framework is that most bounds involve a Kullback-Leibler (KL) divergence term (or its variations), which might exhibit erratic behavior and fail to capture the underlying geometric structure of the learning problem - hence restricting its use in practical applications. As a remedy, recent studies have attempted to replace the KL divergence in the PAC-Bayesian bounds with the Wasserstein distance. Even though these bounds alleviated the aforementioned issues to a certain extent, they either hold in expectation, are for bounded losses, or are nontrivial to minimize in an SRM framework. In this work, we contribute to this line of research and prove novel Wasserstein distance-based PAC-Bayesian generalisation bounds for both batch learning with independent and identically distributed (i.i.d.) data, and online learning with potentially non-i.i.d. data. Contrary to previous art, our bounds are stronger in the sense that (i) they hold with high probability, (ii) they apply to unbounded (potentially heavy-tailed) losses, and (iii) they lead to optimizable training objectives that can be used in SRM. As a result we derive novel Wasserstein-based PAC-Bayesian learning algorithms and we illustrate their empirical advantage on a variety of experiments.},
  url = {https://arxiv.org/abs/2306.04375},
  url_PDF = {https://arxiv.org/pdf/2306.04375.pdf},
  doi = {10.48550/arXiv.2306.04375},
  eprint = {2306.04375},
  archivePrefix = {arXiv},
  primaryClass = {stat.ML},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
Generalization Bounds: Perspectives from Information Theory and PAC-Bayes.
Hellström, F.; Durisi, G.; Guedj, B.; and Raginsky, M.
2023. Submitted.

@unpublished{hellstrom2023generalisation,
  title = {Generalization Bounds: Perspectives from Information Theory and {PAC-Bayes}},
  author = {Fredrik Hellström and Giuseppe Durisi and Benjamin Guedj and Maxim Raginsky},
  year = {2023},
  note = "Submitted.",
  abstract = {A fundamental question in theoretical machine learning is generalization. Over the past decades, the PAC-Bayesian approach has been established as a flexible framework to address the generalization capabilities of machine learning algorithms, and design new ones. Recently, it has garnered increased interest due to its potential applicability for a variety of learning algorithms, including deep neural networks. In parallel, an information-theoretic view of generalization has developed, wherein the relation between generalization and various information measures has been established. This framework is intimately connected to the PAC-Bayesian approach, and a number of results have been independently discovered in both strands. In this monograph, we highlight this strong connection and present a unified treatment of generalization. We present techniques and results that the two perspectives have in common, and discuss the approaches and interpretations that differ. In particular, we demonstrate how many proofs in the area share a modular structure, through which the underlying ideas can be intuited. We pay special attention to the conditional mutual information (CMI) framework; analytical studies of the information complexity of learning algorithms; and the application of the proposed methods to deep learning. This monograph is intended to provide a comprehensive introduction to information-theoretic generalization bounds and their connection to PAC-Bayes, serving as a foundation from which the most recent developments are accessible. It is aimed broadly towards researchers with an interest in generalization and theoretical machine learning.},
  url = {https://arxiv.org/abs/2309.04381},
  url_PDF = {https://arxiv.org/pdf/2309.04381.pdf},
  doi = {10.48550/arXiv.2309.04381},
  eprint = {2309.04381},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
Federated Learning with Nonvacuous Generalisation Bounds.
Jobic, P.; Haddouche, M.; and Guedj, B.
2023. Submitted.

@unpublished{jobic2023federated,
  title = {Federated Learning with Nonvacuous Generalisation Bounds},
  author = {Pierre Jobic and Maxime Haddouche and Benjamin Guedj},
  year = {2023},
  note = "Submitted.",
  abstract = {We introduce a novel strategy to train randomised predictors in federated learning, where each node of the network aims at preserving its privacy by releasing a local predictor but keeping secret its training dataset with respect to the other nodes. We then build a global randomised predictor which inherits the properties of the local private predictors in the sense of a PAC-Bayesian generalisation bound. We consider the synchronous case where all nodes share the same training objective (derived from a generalisation bound), and the asynchronous case where each node may have its own personalised training objective. We show through a series of numerical experiments that our approach achieves a comparable predictive performance to that of the batch approach where all datasets are shared across nodes. Moreover the predictors are supported by numerically nonvacuous generalisation bounds while preserving privacy for each node. We explicitly compute the increment on predictive performance and generalisation bounds between batch and federated settings, highlighting the price to pay to preserve privacy.},
  url = {https://arxiv.org/abs/2310.11203},
  url_PDF = {https://arxiv.org/pdf/2310.11203.pdf},
  doi = {10.48550/arXiv.2310.11203},
  eprint = {2310.11203},
  archivePrefix = {arXiv},
  primaryClass = {cs.LG},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},
  keywords = {mine}
}
\n \n\n \n \n \n \n \n \n Bayesian Uncertainty Quantification for Anaerobic Digestion models.\n \n \n \n \n\n\n \n Picard-Weibel, A.; Capson-Tojo, G.; Guedj, B.; and Moscoviz, R.\n\n\n \n\n\n\n Bioresource Technology, 394: 130-147. 2023.\n \n\n\n\n
\n\n\n\n \n \n \"BayesianPaper\n  \n \n \n \"Bayesian arxiv\n  \n \n \n \"Bayesian pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{picard2023bayesian,\ntitle = {{B}ayesian Uncertainty Quantification for Anaerobic Digestion models},\nauthor={Antoine Picard-Weibel and Gabriel Capson-Tojo and Benjamin Guedj and Roman Moscoviz},\nyear={2023},\njournal = {Bioresource Technology},\npages = {130-147},\nvolume = {394},\nissn = {0960-8524},\nabstract = {Uncertainty quantification is critical for ensuring adequate predictive power of computational models used in biology. Focusing on two anaerobic digestion models, this article introduces a novel generalized Bayesian procedure, called VarBUQ, ensuring a correct tradeoff between flexibility and computational cost. A benchmark against three existing methods (Fisher’s information, bootstrapping and Beale’s criteria) was conducted using synthetic data. This Bayesian procedure offered a good compromise between fitting ability and confidence estimation, while the other methods proved to be repeatedly overconfident. The method’s performances notably benefitted from inductive bias brought by the prior distribution, although it requires careful construction. This article advocates for more systematic consideration of uncertainty for anaerobic digestion models and showcases a new, computationally efficient Bayesian method. To facilitate future implementations, a Python package called ‘aduq’ is made available.},\nurl = {https://www.sciencedirect.com/science/article/pii/S0960852423015754},\nurl_arXiv = {https://arxiv.org/abs/2312.xxxxx},\nurl_PDF = {https://arxiv.org/pdf/2312.xxxxx.pdf},\ndoi = {10.1016/j.biortech.2023.130147},\neprint={2312.xxxxx},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\ncopyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Uncertainty quantification is critical for ensuring adequate predictive power of computational models used in biology. Focusing on two anaerobic digestion models, this article introduces a novel generalized Bayesian procedure, called VarBUQ, ensuring a correct tradeoff between flexibility and computational cost. A benchmark against three existing methods (Fisher’s information, bootstrapping and Beale’s criteria) was conducted using synthetic data. This Bayesian procedure offered a good compromise between fitting ability and confidence estimation, while the other methods proved to be repeatedly overconfident. The method’s performances notably benefitted from inductive bias brought by the prior distribution, although it requires careful construction. This article advocates for more systematic consideration of uncertainty for anaerobic digestion models and showcases a new, computationally efficient Bayesian method. To facilitate future implementations, a Python package called ‘aduq’ is made available.\n
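The abstract above refers to a generalised Bayesian procedure (VarBUQ) and to a Python package called 'aduq'; neither is reproduced here. Purely to illustrate the generalised-Bayes ingredient it builds on, the sketch below computes a tempered posterior on a one-dimensional parameter grid, replacing the likelihood by exp(-lambda * n * loss); the toy model, loss, prior and temperature are all placeholder choices.

```python
import numpy as np

# Toy data: noisy observations of a single scalar parameter (placeholder model).
rng = np.random.default_rng(0)
theta_true = 1.7
y = theta_true + 0.3 * rng.standard_normal(50)

# Parameter grid, unnormalised Gaussian-shaped prior, and a squared-error loss.
grid = np.linspace(0.0, 4.0, 401)
prior = np.exp(-0.5 * (grid - 2.0) ** 2)
prior /= prior.sum()
loss = np.array([np.mean((y - t) ** 2) for t in grid])

# Generalised (tempered) posterior: prior * exp(-lambda * n * loss), renormalised.
lam, n = 1.0, len(y)
log_post = np.log(prior) - lam * n * loss
post = np.exp(log_post - log_post.max())
post /= post.sum()

mean = float(np.sum(grid * post))
sd = float(np.sqrt(np.sum((grid - mean) ** 2 * post)))
print(f"posterior mean ~ {mean:.3f}, posterior sd ~ {sd:.3f}")
```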
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Nurturing Knowledge: A Virtue Epistemic Approach to Explainable AI.\n \n \n \n \n\n\n \n Candiotto, L.; Cayco-Gajic, N. A.; de Soarez, P. C.; Luca, M. D.; Frassinelli, D.; Fujita, A.; Growiec, J.; Guedj, B.; Kasturi, S. N.; Kawanishi, Y.; Kellmeyer, P.; Livermore, M. A.; Moodley, D.; Rabinowitch, I.; Ralitera, T.; Stalnov, O.; Taylor, H.; and Wevers, M.\n\n\n \n\n\n\n 2023.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"NurturingPaper\n  \n \n \n \"Nurturing pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{ica2023,\ntitle = {{Nurturing Knowledge: A Virtue Epistemic Approach to Explainable AI}},\nauthor={Laura Candiotto and N. Alex Cayco-Gajic and Patricia Coelho de Soarez and Massimiliano Di Luca and Diego Frassinelli and André Fujita and Jakub Growiec and Benjamin Guedj and Suranga N. Kasturi and Yasutomo Kawanishi and Philipp Kellmeyer and Michael A. Livermore and Deshendran Moodley and Ithai Rabinowitch and Tahina Ralitera and Oksana Stalnov and Henry Taylor and Melvin Wevers},\nyear={2023},\nnote = "Submitted.",\nabstract = {The rapid integration of artificial intelligence (AI) into diverse areas of knowledge production has raised concerns about the explainability of complex computational systems. The requirements for what constitutes a satisfactory explanation vary extensively across disciplines and contexts. This paper introduces virtue epistemology as a framework for evaluating AI systems based on how they shape communal practices of knowledge development. By examining several illustrations in social science, neuroscience, medicine, and the humanities, we argue that progress in explainable AI in the context of knowledge production relies on aligning computational reasoning with the priorities of user communities and their specialized epistemic cultures.},\nurl = {https://arxiv.org/abs/xxxx.xxxxx},\nurl_PDF = {https://arxiv.org/pdf/xxxx.xxxxx.pdf},\ndoi = {10.48550/arXiv.xxxx.xxxxx},\neprint={xxxx.xxxxx},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\ncopyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n The rapid integration of artificial intelligence (AI) into diverse areas of knowledge production has raised concerns about the explainability of complex computational systems. The requirements for what constitutes a satisfactory explanation vary extensively across disciplines and contexts. This paper introduces virtue epistemology as a framework for evaluating AI systems based on how they shape communal practices of knowledge development. By examining several illustrations in social science, neuroscience, medicine, and the humanities, we argue that progress in explainable AI in the context of knowledge production relies on aligning computational reasoning with the priorities of user communities and their specialized epistemic cultures.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A note on regularised NTK dynamics with an application to PAC-Bayesian training.\n \n \n \n \n\n\n \n Clerico, E.; and Guedj, B.\n\n\n \n\n\n\n Transactions on Machine Learning Research [TMLR]. 2023.\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n \n \"A pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{clerico2023ntk,\ntitle = {A note on regularised {NTK} dynamics with an application to {PAC-Bayesian} training},\nauthor = {Clerico, Eugenio and Guedj, Benjamin},\nyear = {2023},\njournal = {Transactions on Machine Learning Research [TMLR]},\nissn={2835-8856},\nabstract = {We establish explicit dynamics for neural networks whose training objective has a regularising term that constrains the parameters to remain close to their initial value. This keeps the network in a lazy training regime, where the dynamics can be linearised around the initialisation. The standard neural tangent kernel (NTK) governs the evolution during the training in the infinite-width limit, although the regularisation yields an additional term appears in the differential equation describing the dynamics. This setting provides an appropriate framework to study the evolution of wide networks trained to optimise generalisation objectives such as PAC-Bayes bounds, and hence potentially contribute to a deeper theoretical understanding of such networks.},\nurl = {https://arxiv.org/abs/2312.13259},\nurl_PDF = {https://arxiv.org/pdf/2312.13259.pdf},\ndoi = {10.48550/ARXIV.2312.13259},\neprint={2312.13259},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\ncopyright = {Creative Commons Attribution 4.0 International},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We establish explicit dynamics for neural networks whose training objective has a regularising term that constrains the parameters to remain close to their initial value. This keeps the network in a lazy training regime, where the dynamics can be linearised around the initialisation. The standard neural tangent kernel (NTK) governs the evolution during the training in the infinite-width limit, although the regularisation yields an additional term in the differential equation describing the dynamics. This setting provides an appropriate framework to study the evolution of wide networks trained to optimise generalisation objectives such as PAC-Bayes bounds, and hence potentially contribute to a deeper theoretical understanding of such networks.\n
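To make the "additional term" mentioned above concrete, here is a schematic version of regularised lazy-training dynamics, assuming gradient flow on a squared loss with an L2 penalty pulling the parameters back to their initialisation; this is a sketch consistent with the abstract rather than the paper's exact statement, and the symbols $\Theta_0$, $f_0$, $\lambda$ are generic notation introduced here.

```latex
% Gradient flow on the regularised objective L(\theta) + (\lambda/2)\,\|\theta - \theta_0\|^2:
\[
  \dot{\theta}_t = -\nabla_\theta L(\theta_t) - \lambda\,(\theta_t - \theta_0).
\]
% In the linearised (lazy) regime, with empirical NTK \Theta_0 and squared loss on targets y,
% the outputs f_t on the training set satisfy
\[
  \dot{f}_t = -\Theta_0\,(f_t - y) - \lambda\,(f_t - f_0),
\]
% i.e. the usual NTK flow plus an extra term proportional to f_t - f_0 coming from the regulariser.
```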
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2022\n \n \n (15)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Towards control of opinion diversity by introducing zealots into a polarised social group.\n \n \n \n \n\n\n \n Vendeville, A.; Guedj, B.; and Zhou, S.\n\n\n \n\n\n\n In Benito, R. M.; Cherifi, C.; Cherifi, H.; Moro, E.; Rocha, L. M.; and Sales-Pardo, M., editor(s), Complex Networks & Their Applications X, pages 341–352, 2022. Springer International Publishing\n \n\n\n\n
\n\n\n\n \n \n \"TowardsPaper\n  \n \n \n \"Towards arxiv\n  \n \n \n \"Towards pdf\n  \n \n \n \"Towards code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{vendeville2020opinions,\ntitle={Towards control of opinion diversity by introducing zealots into a polarised social group},\nauthor={Antoine Vendeville and Benjamin Guedj and Shi Zhou},\nyear={2022},\neditor="Benito, Rosa Maria\nand Cherifi, Chantal\nand Cherifi, Hocine\nand Moro, Esteban\nand Rocha, Luis M.\nand Sales-Pardo, Marta",\npublisher="Springer International Publishing",\npages="341--352",\nbooktitle="Complex Networks {\\&} Their Applications X",\nabstract = {We explore a method to influence or even control the diversity of opinions within a polarised social group. We leverage the voter model in which users hold binary opinions and repeatedly update their beliefs based on others they connect with. Stubborn agents who never change their minds ("zealots") are also disseminated through the network, which is modelled by a connected graph. Building on earlier results, we provide a closed-form expression for the average opinion of the group at equilibrium. This leads us to a strategy to inject zealots into a polarised network in order to shift the average opinion towards any target value. We account for the possible presence of a backfire effect, which may lead the group to react negatively and reinforce its level of polarisation in response. Our results are supported by numerical experiments on synthetic data.},\nisbn="978-3-030-93413-2",\nurl = "https://link.springer.com/chapter/10.1007%2F978-3-030-93413-2_29",\nurl_arXiv = "https://arxiv.org/abs/2006.07265",\nurl_PDF = "https://arxiv.org/pdf/2006.07265.pdf",\nurl_Code = "https://github.com/AntoineVendeville/HowOpinionsCrystallise",\ndoi = {10.1007/978-3-030-93413-2_29},\neprint={2006.07265},\narchivePrefix={arXiv},\nprimaryClass={cs.SI},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We explore a method to influence or even control the diversity of opinions within a polarised social group. We leverage the voter model in which users hold binary opinions and repeatedly update their beliefs based on others they connect with. Stubborn agents who never change their minds (\"zealots\") are also disseminated through the network, which is modelled by a connected graph. Building on earlier results, we provide a closed-form expression for the average opinion of the group at equilibrium. This leads us to a strategy to inject zealots into a polarised network in order to shift the average opinion towards any target value. We account for the possible presence of a backfire effect, which may lead the group to react negatively and reinforce its level of polarisation in response. Our results are supported by numerical experiments on synthetic data.\n
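As a back-of-the-envelope companion to the abstract above (a plain simulation, not the authors' closed-form analysis), the sketch below runs a binary voter model on a complete graph with z1 zealots holding opinion 1 and z0 holding opinion 0; in this setting the time-averaged opinion of the free agents is expected to settle near z1 / (z1 + z0). All sizes and step counts are arbitrary.

```python
import numpy as np

rng = np.random.default_rng(1)

n_free, z1, z0 = 200, 30, 10             # free agents and zealots of each opinion
opinions = rng.integers(0, 2, n_free)     # random initial binary opinions
ones = int(opinions.sum())

burn_in, steps = 100_000, 500_000
acc = 0.0
for t in range(steps):
    i = int(rng.integers(n_free))                       # free agent who updates
    j = int(rng.integers(n_free + z1 + z0 - 1))         # someone else, chosen uniformly
    if j < n_free - 1:                                  # copy another free agent
        j += int(j >= i)                                # skip self
        new = int(opinions[j])
    elif j < n_free - 1 + z1:                           # copy a zealot holding opinion 1
        new = 1
    else:                                               # copy a zealot holding opinion 0
        new = 0
    ones += new - int(opinions[i])
    opinions[i] = new
    if t >= burn_in:
        acc += ones / n_free

print("time-averaged opinion of free agents:", acc / (steps - burn_in))
print("zealot-ratio prediction z1/(z1+z0)  :", z1 / (z1 + z0))
```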
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n MAGMA: Inference and Prediction with Multi-Task Gaussian Processes.\n \n \n \n \n\n\n \n Leroy, A.; Latouche, P.; Guedj, B.; and Gey, S.\n\n\n \n\n\n\n Machine Learning. 2022.\n \n\n\n\n
\n\n\n\n \n \n \"MAGMA:Paper\n  \n \n \n \"MAGMA: pdf\n  \n \n \n \"MAGMA: code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{leroy2020magma,\ntitle={{MAGMA}: Inference and Prediction with Multi-Task {Gaussian} Processes},\nauthor={Arthur Leroy and Pierre Latouche and Benjamin Guedj and Servane Gey},\nyear={2022},\njournal = {Machine Learning},\nabstract = {A novel multi-task Gaussian process (GP) framework is proposed, by using a common mean process for sharing information across tasks. In particular, we investigate the problem of time series forecasting, with the objective to improve multiple-step-ahead predictions. The common mean process is defined as a GP for which the hyper-posterior distribution is tractable. Therefore an EM algorithm is derived for handling both hyper-parameters optimisation and hyper-posterior computation. Unlike previous approaches in the literature, the model fully accounts for uncertainty and can handle irregular grids of observations while maintaining explicit formulations, by modelling the mean process in a unified GP framework. Predictive analytical equations are provided, integrating information shared across tasks through a relevant prior mean. This approach greatly improves the predictive performances, even far from observations, and may reduce significantly the computational complexity compared to traditional multi-task GP models. Our overall algorithm is called MAGMA (standing for Multi tAsk GPs with common MeAn). The quality of the mean process estimation, predictive performances, and comparisons to alternatives are assessed in various simulated scenarios and on real datasets.},\neprint={2007.10731},\narchivePrefix={arXiv},\nprimaryClass={stat.CO},\nurl = {https://arxiv.org/abs/2007.10731},\nurl_PDF = {https://arxiv.org/pdf/2007.10731.pdf},\nurl_Code = {https://github.com/ArthurLeroy/MAGMA},\nDOI = "10.1007/s10994-022-06172-1",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n A novel multi-task Gaussian process (GP) framework is proposed, by using a common mean process for sharing information across tasks. In particular, we investigate the problem of time series forecasting, with the objective to improve multiple-step-ahead predictions. The common mean process is defined as a GP for which the hyper-posterior distribution is tractable. Therefore an EM algorithm is derived for handling both hyper-parameters optimisation and hyper-posterior computation. Unlike previous approaches in the literature, the model fully accounts for uncertainty and can handle irregular grids of observations while maintaining explicit formulations, by modelling the mean process in a unified GP framework. Predictive analytical equations are provided, integrating information shared across tasks through a relevant prior mean. This approach greatly improves the predictive performances, even far from observations, and may reduce significantly the computational complexity compared to traditional multi-task GP models. Our overall algorithm is called MAGMA (standing for Multi tAsk GPs with common MeAn). The quality of the mean process estimation, predictive performances, and comparisons to alternatives are assessed in various simulated scenarios and on real datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n An end-to-end data-driven optimisation framework for constrained trajectories.\n \n \n \n \n\n\n \n Dewez, F.; Guedj, B.; Talpaert, A.; and Vandewalle, V.\n\n\n \n\n\n\n Data-Centric Engineering, 3. 2022.\n \n\n\n\n
\n\n\n\n \n \n \"AnPaper\n  \n \n \n \"An arxiv\n  \n \n \n \"An pdf\n  \n \n \n \"An code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{dewez2020endtoend,\ntitle={An end-to-end data-driven optimisation framework for constrained trajectories}, \nauthor={Florent Dewez and Benjamin Guedj and Arthur Talpaert and Vincent Vandewalle},\nyear={2022},\njournal={Data-Centric Engineering},\npublisher={Cambridge University Press},\nvolume = "3",\nabstract = {Many real-world problems require to optimize trajectories under constraints. Classical approaches are often based on optimal control methods but require an exact knowledge of the underlying dynamics and constraints, which could be challenging or even out of reach. In view of this, we leverage data-driven approaches to design a new end-to-end framework which is dynamics-free for optimized and realistic trajectories. Trajectories are here decomposed on function basis, trading the initial infinite dimension problem on a multivariate functional space for a parameter optimization problem. Then a maximum a posteriori approach which incorporates information from data is used to obtain a new penalized optimization problem. The penalized term narrows the search on a region centered on data and includes estimated features of the problem. We apply our data-driven approach to two settings in aeronautics and sailing routes optimization. The developed approach is implemented in the Python library PyRotor.},\nurl = "https://www.cambridge.org/core/journals/data-centric-engineering/article/an-endtoend-datadriven-optimization-framework-for-constrained-trajectories/B4BF729E1681F795FFE79A1F4B544CE5",\nurl_arXiv = "https://arxiv.org/abs/2011.11820",\nurl_PDF = "https://www.cambridge.org/core/services/aop-cambridge-core/content/view/B4BF729E1681F795FFE79A1F4B544CE5/S2632673622000065a.pdf/an-end-to-end-data-driven-optimization-framework-for-constrained-trajectories.pdf",\nurl_Code = "https://github.com/bguedj/pyrotor",\neprint={2011.11820},\narchivePrefix={arXiv},\nDOI = "10.1017/dce.2022.6",\nprimaryClass={stat.AP},\nkeywords={mine}\n}\n\n\n\n
\n
\n\n\n
\n Many real-world problems require optimizing trajectories under constraints. Classical approaches are often based on optimal control methods but require an exact knowledge of the underlying dynamics and constraints, which could be challenging or even out of reach. In view of this, we leverage data-driven approaches to design a new end-to-end framework which is dynamics-free for optimized and realistic trajectories. Trajectories are here decomposed on a function basis, trading the initial infinite-dimensional problem on a multivariate functional space for a parameter optimization problem. Then a maximum a posteriori approach which incorporates information from data is used to obtain a new penalized optimization problem. The penalty term narrows the search to a region centered on data and includes estimated features of the problem. We apply our data-driven approach to two settings in aeronautics and sailing routes optimization. The developed approach is implemented in the Python library PyRotor.\n
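The abstract describes projecting trajectories onto a function basis and then solving a penalised, maximum-a-posteriori-flavoured problem over the basis coefficients. The sketch below is a heavily simplified, hypothetical rendering of that idea, not the PyRotor API: a one-dimensional trajectory is represented by Legendre coefficients, a quadratic smoothness cost is minimised, and a penalty keeps the solution close to coefficients estimated from reference data.

```python
import numpy as np
from numpy.polynomial import legendre

rng = np.random.default_rng(2)
t = np.linspace(0.0, 1.0, 200)
degree = 6
# Design matrix of Legendre polynomials on [0, 1] (mapped to [-1, 1]).
B = legendre.legvander(2.0 * t - 1.0, degree)           # shape (200, degree + 1)

# A few noisy reference trajectories (placeholder data) and their mean coefficients.
ref = np.sin(np.pi * t) + 0.5 * t
refs = ref + 0.05 * rng.standard_normal((10, t.size))
c_ref = np.linalg.lstsq(B, refs.mean(axis=0), rcond=None)[0]

# Quadratic "cost": discretised squared second derivative (favours smooth curves).
D2 = np.diff(B, n=2, axis=0) / (t[1] - t[0]) ** 2
Q = D2.T @ D2 / t.size

# Penalised problem: minimise c^T Q c + mu * ||c - c_ref||^2, solved in closed form.
mu = 5.0
c_opt = np.linalg.solve(Q + mu * np.eye(degree + 1), mu * c_ref)

print("cost of reference coefficients:", float(c_ref @ Q @ c_ref))
print("cost of optimised coefficients:", float(c_opt @ Q @ c_opt))
```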
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Margins and Derandomisation in PAC-Bayes.\n \n \n \n \n\n\n \n Biggs, F.; and Guedj, B.\n\n\n \n\n\n\n In Camps-Valls, G.; Ruiz, F. J. R.; and Valera, I., editor(s), Proceedings of The 25th International Conference on Artificial Intelligence and Statistics [AISTATS], volume 151, of Proceedings of Machine Learning Research, pages 3709–3731, 28–30 Mar 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"OnPaper\n  \n \n \n \"On arxiv\n  \n \n \n \"On pdf\n  \n \n \n \"On code\n  \n \n \n \"On video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{biggs2021margins,\ntitle={On Margins and Derandomisation in {PAC-Bayes}},\nauthor={Felix Biggs and Benjamin Guedj},\nyear={2022},\npages = \t {3709--3731},\neditor = \t {Camps-Valls, Gustau and Ruiz, Francisco J. R. and Valera, Isabel},\nvolume = \t {151},\nseries = \t {Proceedings of Machine Learning Research},\nmonth = \t {28--30 Mar},\npublisher =    {PMLR},\nbooktitle={Proceedings of The 25th International Conference on Artificial Intelligence and Statistics [AISTATS]},\nabstract = {We give a general recipe for derandomising PAC-Bayesian bounds using margins, with the critical ingredient being that our randomised predictions concentrate around some value. The tools we develop straightforwardly lead to margin bounds for various classifiers, including linear prediction—a class that includes boosting and the support vector machine—single-hidden-layer neural networks with an unusual erf activation function, and deep ReLU networks. Further we extend to partially-derandomised predictors where only some of the randomness of our estimators is removed, letting us extend bounds to cases where the concentration properties of our estimators are otherwise poor.},\nurl = {https://proceedings.mlr.press/v151/biggs22a.html},\nurl_arXiv = {https://arxiv.org/abs/2107.03955},\nurl_PDF = {https://proceedings.mlr.press/v151/biggs22a/biggs22a.pdf},\nurl_Code = {https://github.com/biggs/margins-and-derandomisation-code},\nurl_Video = {https://virtual.aistats.org/virtual/2022/poster/3295},\neprint={2107.03955},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We give a general recipe for derandomising PAC-Bayesian bounds using margins, with the critical ingredient being that our randomised predictions concentrate around some value. The tools we develop straightforwardly lead to margin bounds for various classifiers, including linear prediction—a class that includes boosting and the support vector machine—single-hidden-layer neural networks with an unusual erf activation function, and deep ReLU networks. Further we extend to partially-derandomised predictors where only some of the randomness of our estimators is removed, letting us extend bounds to cases where the concentration properties of our estimators are otherwise poor.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On PAC-Bayesian reconstruction guarantees for VAEs.\n \n \n \n \n\n\n \n Chérief-Abdellatif, B.; Shi, Y.; Doucet, A.; and Guedj, B.\n\n\n \n\n\n\n In Camps-Valls, G.; Ruiz, F. J. R.; and Valera, I., editor(s), Proceedings of The 25th International Conference on Artificial Intelligence and Statistics [AISTATS], volume 151, of Proceedings of Machine Learning Research, pages 3066–3079, 28–30 Mar 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"OnPaper\n  \n \n \n \"On arxiv\n  \n \n \n \"On pdf\n  \n \n \n \"On code\n  \n \n \n \"On video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{cherief2021vae,\ntitle={On {PAC-Bayesian} reconstruction guarantees for {VAEs}},\nauthor={Ch\\'erief-Abdellatif, Badr-Eddine and Shi, Yuyang and Doucet, Arnaud and Guedj, Benjamin},\nyear={2022},\npages = \t {3066--3079},\neditor = \t {Camps-Valls, Gustau and Ruiz, Francisco J. R. and Valera, Isabel},\nvolume = \t {151},\nseries = \t {Proceedings of Machine Learning Research},\nmonth = \t {28--30 Mar},\npublisher =    {PMLR},\nbooktitle={Proceedings of The 25th International Conference on Artificial Intelligence and Statistics [AISTATS]},\nabstract = {Despite its wide use and empirical successes, the theoretical understanding and study of the behaviour and performance of the variational autoencoder (VAE) have only emerged in the past few years. We contribute to this recent line of work by analysing the VAE’s reconstruction ability for unseen test data, leveraging arguments from the PAC-Bayes theory. We provide generalisation bounds on the theoretical reconstruction error, and provide insights on the regularisation effect of VAE objectives. We illustrate our theoretical results with supporting experiments on classical benchmark datasets.},\nurl = {https://proceedings.mlr.press/v151/cherief-abdellatif22a.html},\nurl_arXiv = {https://arxiv.org/abs/2202.11455},\nurl_PDF = {https://proceedings.mlr.press/v151/cherief-abdellatif22a/cherief-abdellatif22a.pdf},\nurl_Code = {https://github.com/yuyang-shi/PBVAE},\nurl_Video = {https://virtual.aistats.org/virtual/2022/poster/3265},\neprint={2202.11455},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Despite its wide use and empirical successes, the theoretical understanding and study of the behaviour and performance of the variational autoencoder (VAE) have only emerged in the past few years. We contribute to this recent line of work by analysing the VAE’s reconstruction ability for unseen test data, leveraging arguments from the PAC-Bayes theory. We provide generalisation bounds on the theoretical reconstruction error, and provide insights on the regularisation effect of VAE objectives. We illustrate our theoretical results with supporting experiments on classical benchmark datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n KSD Aggregated Goodness-of-fit Test.\n \n \n \n \n\n\n \n Schrab, A.; Guedj, B.; and Gretton, A.\n\n\n \n\n\n\n In Koyejo, S.; Mohamed, S.; Agarwal, A.; Belgrave, D.; Cho, K.; and Oh, A., editor(s), Advances in Neural Information Processing Systems [NeurIPS], volume 35, pages 32624–32638, 2022. Curran Associates, Inc.\n \n\n\n\n
\n\n\n\n \n \n \"KSDPaper\n  \n \n \n \"KSD pdf\n  \n \n \n \"KSD supplementary\n  \n \n \n \"KSD arxiv\n  \n \n \n \"KSD code\n  \n \n \n \"KSD slides\n  \n \n \n \"KSD poster\n  \n \n \n \"KSD posterbis\n  \n \n \n \"KSD video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 58 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{schrab2022ksd,\ntitle={{KSD} Aggregated Goodness-of-fit Test},\nauthor={Antonin Schrab and Benjamin Guedj and Arthur Gretton},\nyear={2022},\neditor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},\npages = {32624--32638},\npublisher = {Curran Associates, Inc.},\nvolume = {35},\nbooktitle = {Advances in Neural Information Processing Systems [NeurIPS]},\nabstract = {We investigate properties of goodness-of-fit tests based on the Kernel Stein Discrepancy (KSD). We introduce a strategy to construct a test, called KSDAgg, which aggregates multiple tests with different kernels. KSDAgg avoids splitting the data to perform kernel selection (which leads to a loss in test power), and rather maximises the test power over a collection of kernels. We provide theoretical guarantees on the power of KSDAgg: we show it achieves the smallest uniform separation rate of the collection, up to a logarithmic term. KSDAgg can be computed exactly in practice as it relies either on a parametric bootstrap or on a wild bootstrap to estimate the quantiles and the level corrections. In particular, for the crucial choice of bandwidth of a fixed kernel, it avoids resorting to arbitrary heuristics (such as median or standard deviation) or to data splitting. We find on both synthetic and real-world data that KSDAgg outperforms other state-of-the-art adaptive KSD-based goodness-of-fit testing procedures.},\nurl = {https://papers.nips.cc/paper_files/paper/2022/hash/d241a7b1499cee1bf40769ceade2444d-Abstract-Conference.html},\nurl_PDF = {https://papers.nips.cc/paper_files/paper/2022/file/d241a7b1499cee1bf40769ceade2444d-Paper-Conference.pdf},\nurl_Supplementary = {https://papers.nips.cc/paper_files/paper/2022/file/d241a7b1499cee1bf40769ceade2444d-Supplemental-Conference.pdf},\nurl_arXiv = {https://arxiv.org/abs/2202.00824},\nurl_Code = {https://github.com/antoninschrab/ksdagg-paper},\nurl_Slides = {https://antoninschrab.github.io/files/Slides-31-05-22.pdf},\nurl_Poster = {https://antoninschrab.github.io/files/Poster_MMDAgg_KSDAgg.pdf},\nurl_Posterbis = {https://antoninschrab.github.io/files/Simplified_Poster.pdf},\nurl_Video = {https://www.youtube.com/watch?v=OWh6Hj10wsY},\neprint={2202.00824},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We investigate properties of goodness-of-fit tests based on the Kernel Stein Discrepancy (KSD). We introduce a strategy to construct a test, called KSDAgg, which aggregates multiple tests with different kernels. KSDAgg avoids splitting the data to perform kernel selection (which leads to a loss in test power), and rather maximises the test power over a collection of kernels. We provide theoretical guarantees on the power of KSDAgg: we show it achieves the smallest uniform separation rate of the collection, up to a logarithmic term. KSDAgg can be computed exactly in practice as it relies either on a parametric bootstrap or on a wild bootstrap to estimate the quantiles and the level corrections. In particular, for the crucial choice of bandwidth of a fixed kernel, it avoids resorting to arbitrary heuristics (such as median or standard deviation) or to data splitting. We find on both synthetic and real-world data that KSDAgg outperforms other state-of-the-art adaptive KSD-based goodness-of-fit testing procedures.\n
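The abstract above is about aggregating kernel Stein discrepancy (KSD) statistics computed with several kernels or bandwidths. The sketch below only computes the raw KSD U-statistic for a one-dimensional standard normal target with a Gaussian kernel at a few bandwidths; there is no bootstrap calibration, so this is not the KSDAgg test itself, merely the statistic it aggregates, and the score s(x) = -x is specific to this toy target.

```python
import numpy as np

def ksd_ustat(x: np.ndarray, sigma: float) -> float:
    """KSD^2 U-statistic for a standard normal target (score s(x) = -x),
    with the Gaussian kernel k(x, y) = exp(-(x - y)^2 / (2 sigma^2)) in 1-D."""
    d = x[:, None] - x[None, :]
    k = np.exp(-d**2 / (2.0 * sigma**2))
    # Stein kernel: u_p(x, y) = k * (x*y - d^2/sigma^2 - d^2/sigma^4 + 1/sigma^2).
    u = k * (np.outer(x, x) - d**2 / sigma**2 - d**2 / sigma**4 + 1.0 / sigma**2)
    n = x.size
    return float((u.sum() - np.trace(u)) / (n * (n - 1)))

rng = np.random.default_rng(3)
sample_h0 = rng.standard_normal(300)            # drawn from the target
sample_h1 = 0.5 + rng.standard_normal(300)      # shifted, should give a larger KSD

for sigma in (0.5, 1.0, 2.0):
    print(sigma, ksd_ustat(sample_h0, sigma), ksd_ustat(sample_h1, sigma))
```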
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Non-Vacuous Generalisation Bounds for Shallow Neural Networks.\n \n \n \n \n\n\n \n Biggs, F.; and Guedj, B.\n\n\n \n\n\n\n In Chaudhuri, K.; Jegelka, S.; Song, L.; Szepesvari, C.; Niu, G.; and Sabato, S., editor(s), Proceedings of the 39th International Conference on Machine Learning [ICML], volume 162, of Proceedings of Machine Learning Research, pages 1963–1981, July 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Non-VacuousPaper\n  \n \n \n \"Non-Vacuous arxiv\n  \n \n \n \"Non-Vacuous pdf\n  \n \n \n \"Non-Vacuous code\n  \n \n \n \"Non-Vacuous video\n  \n \n \n \"Non-Vacuous slides\n  \n \n \n \"Non-Vacuous poster\n  \n \n \n \"Non-Vacuous slideslive\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{biggs2022shallow,\nbooktitle={Proceedings of the 39th International Conference on Machine Learning [ICML]},\ntitle={Non-Vacuous Generalisation Bounds for Shallow Neural Networks},\nauthor={Felix Biggs and Benjamin Guedj},\nyear={2022},\npages = \t {1963--1981},\neditor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\nvolume = \t {162},\nseries = \t {Proceedings of Machine Learning Research},\nmonth = \t {July},\npublisher =  {PMLR},\nabstract = {We focus on a specific class of shallow neural networks with a single hidden layer, namely those with $L_2$-normalised data and either a sigmoid-shaped Gaussian error function (“erf”) activation or a Gaussian Error Linear Unit (GELU) activation. For these networks, we derive new generalisation bounds through the PAC-Bayesian theory; unlike most existing such bounds they apply to neural networks with deterministic rather than randomised parameters. Our bounds are empirically non-vacuous when the network is trained with vanilla stochastic gradient descent on MNIST and Fashion-MNIST.},\nurl = {https://proceedings.mlr.press/v162/biggs22a.html},\nurl_arXiv = {https://arxiv.org/abs/2202.01627},\nurl_PDF = {https://proceedings.mlr.press/v162/biggs22a/biggs22a.pdf},\nurl_Code = {},\nurl_Video = {https://icml.cc/virtual/2022/spotlight/17948},\nurl_Slides = {https://icml.cc/media/icml-2022/Slides/17948.pdf},\nurl_Poster = {https://icml.cc/media/PosterPDFs/ICML%202022/194cf6c2de8e00c05fcf16c498adc7bf.png},\nurl_SlidesLive = {https://slideslive.com/38983983/nonvacuous-generalisation-bounds-for-shallow-neural-networks},\neprint={2202.01627},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We focus on a specific class of shallow neural networks with a single hidden layer, namely those with $L_2$-normalised data and either a sigmoid-shaped Gaussian error function (“erf”) activation or a Gaussian Error Linear Unit (GELU) activation. For these networks, we derive new generalisation bounds through the PAC-Bayesian theory; unlike most existing such bounds they apply to neural networks with deterministic rather than randomised parameters. Our bounds are empirically non-vacuous when the network is trained with vanilla stochastic gradient descent on MNIST and Fashion-MNIST.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Measuring dissimilarity with diffeomorphism invariance.\n \n \n \n \n\n\n \n Cantelobre, T.; Ciliberto, C.; Guedj, B.; and Rudi, A.\n\n\n \n\n\n\n In Chaudhuri, K.; Jegelka, S.; Song, L.; Szepesvari, C.; Niu, G.; and Sabato, S., editor(s), Proceedings of the 39th International Conference on Machine Learning [ICML], volume 162, of Proceedings of Machine Learning Research, pages 2572–2596, 17–23 Jul 2022. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"MeasuringPaper\n  \n \n \n \"Measuring arxiv\n  \n \n \n \"Measuring pdf\n  \n \n \n \"Measuring code\n  \n \n \n \"Measuring video\n  \n \n \n \"Measuring videobis\n  \n \n \n \"Measuring slides\n  \n \n \n \"Measuring poster\n  \n \n \n \"Measuring slidesposter\n  \n \n \n \"Measuring slideslive\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{cantelobre2022dissimilarity,\nbooktitle={Proceedings of the 39th International Conference on Machine Learning [ICML]},\ntitle={Measuring dissimilarity with diffeomorphism invariance},\nauthor={Cantelobre, Th{\\'e}ophile and Ciliberto, Carlo and Guedj, Benjamin and Rudi, Alessandro},\nyear={2022},\npages = \t {2572--2596},\neditor = \t {Chaudhuri, Kamalika and Jegelka, Stefanie and Song, Le and Szepesvari, Csaba and Niu, Gang and Sabato, Sivan},\nvolume = \t {162},\nseries = \t {Proceedings of Machine Learning Research},\nmonth = \t {17--23 Jul},\npublisher =  {PMLR},\nabstract = {Measures of similarity (or dissimilarity) are a key ingredient to many machine learning algorithms. We introduce DID, a pairwise dissimilarity measure applicable to a wide range of data spaces, which leverages the data’s internal structure to be invariant to diffeomorphisms. We prove that DID enjoys properties which make it relevant for theoretical study and practical use. By representing each datum as a function, DID is defined as the solution to an optimization problem in a Reproducing Kernel Hilbert Space and can be expressed in closed-form. In practice, it can be efficiently approximated via Nystr{ö}m sampling. Empirical experiments support the merits of DID.},\nurl = \t {https://proceedings.mlr.press/v162/cantelobre22a.html},\nurl_arXiv = {https://arxiv.org/abs/2202.05614},\nurl_PDF = {https://proceedings.mlr.press/v162/cantelobre22a/cantelobre22a.pdf},\nurl_Code = {https://github.com/theophilec/diffy},\nurl_Video = {https://youtu.be/WKADjgoTBBs},\nurl_Videobis = {https://icml.cc/virtual/2022/spotlight/17588},\nurl_Slides = {https://icml.cc/media/icml-2022/Slides/17588_MqtuoR9.pdf},\nurl_Poster = {https://icml.cc/media/PosterPDFs/ICML%202022/9ff0525c64bf3d4c9957a1d4397f1b40.png},\nurl_SlidesPoster = {https://icml.cc/media/icml-2022/Slides/17587.pdf},\nurl_SlidesLive = {https://slideslive.com/38983838/measuring-dissimilarity-with-diffeomorphism-invariance},\neprint={2202.05614},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Measures of similarity (or dissimilarity) are a key ingredient to many machine learning algorithms. We introduce DID, a pairwise dissimilarity measure applicable to a wide range of data spaces, which leverages the data’s internal structure to be invariant to diffeomorphisms. We prove that DID enjoys properties which make it relevant for theoretical study and practical use. By representing each datum as a function, DID is defined as the solution to an optimization problem in a Reproducing Kernel Hilbert Space and can be expressed in closed-form. In practice, it can be efficiently approximated via Nyström sampling. Empirical experiments support the merits of DID.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Controlling Multiple Errors Simultaneously with a PAC-Bayes Bound.\n \n \n \n \n\n\n \n Adams, R.; Shawe-Taylor, J.; and Guedj, B.\n\n\n \n\n\n\n 2022.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"ControllingPaper\n  \n \n \n \"Controlling pdf\n  \n \n \n \"Controlling video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{adams2022confusion,\ntitle={Controlling Multiple Errors Simultaneously with a {PAC-Bayes} Bound},\nauthor={Reuben Adams and John Shawe-Taylor and Benjamin Guedj},\nyear={2022},\nnote = "Submitted.",\nabstract = {Current PAC-Bayes generalisation bounds are restricted to scalar metrics of performance, such as the loss or error rate. However, one ideally wants more information-rich certificates that control the entire distribution of possible outcomes, such as the distribution of the test loss in regression, or the probabilities of different mis-classifications. We provide the first PAC-Bayes bound capable of providing such rich information by bounding the Kullback-Leibler divergence between the empirical and true probabilities of a set of $M$ error types, which can either be discretized loss values for regression, or the elements of the confusion matrix (or a partition thereof) for classification. We transform our bound into a differentiable training objective. Our bound is especially useful in cases where the severity of different mis-classifications may change over time; existing PAC-Bayes bounds can only bound a particular pre-decided weighting of the error types. In contrast our bound implicitly controls all uncountably many weightings simultaneously.},\nurl = {https://arxiv.org/abs/2202.05560},\nurl_PDF = {https://arxiv.org/pdf/2202.05560.pdf},\nurl_Video = {https://youtu.be/yMBv89kP42w},\neprint={2202.05560},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Current PAC-Bayes generalisation bounds are restricted to scalar metrics of performance, such as the loss or error rate. However, one ideally wants more information-rich certificates that control the entire distribution of possible outcomes, such as the distribution of the test loss in regression, or the probabilities of different mis-classifications. We provide the first PAC-Bayes bound capable of providing such rich information by bounding the Kullback-Leibler divergence between the empirical and true probabilities of a set of $M$ error types, which can either be discretized loss values for regression, or the elements of the confusion matrix (or a partition thereof) for classification. We transform our bound into a differentiable training objective. Our bound is especially useful in cases where the severity of different mis-classifications may change over time; existing PAC-Bayes bounds can only bound a particular pre-decided weighting of the error types. In contrast our bound implicitly controls all uncountably many weightings simultaneously.\n
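The bound described above controls the KL divergence between the empirical and true distributions over M error types, such as confusion-matrix cells. The snippet below only shows the two data-side ingredients, the empirical distribution over error types and a discrete KL divergence; the bound itself and its right-hand side are not reproduced, and all labels are synthetic.

```python
import numpy as np

def error_type_distribution(y_true, y_pred, n_classes: int) -> np.ndarray:
    """Empirical probability of each (true class, predicted class) cell,
    flattened into a vector over M = n_classes**2 error/correctness types."""
    counts = np.zeros((n_classes, n_classes))
    for t, p in zip(y_true, y_pred):
        counts[t, p] += 1
    return (counts / counts.sum()).ravel()

def kl_divergence(p: np.ndarray, q: np.ndarray) -> float:
    """KL(p || q) for discrete distributions, skipping cells where p is zero."""
    mask = p > 0
    return float(np.sum(p[mask] * np.log(p[mask] / q[mask])))

# Hypothetical labels and predictions for a 3-class problem.
rng = np.random.default_rng(4)
y_true = rng.integers(0, 3, 1000)
y_pred = np.where(rng.random(1000) < 0.8, y_true, rng.integers(0, 3, 1000))

emp = error_type_distribution(y_true, y_pred, n_classes=3)
uniform = np.full(9, 1.0 / 9)
print("empirical cell probabilities:", np.round(emp, 3))
print("KL(empirical || uniform):", kl_divergence(emp, uniform))
```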
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On change of measure inequalities for $f$-divergences.\n \n \n \n \n\n\n \n Picard-Weibel, A.; and Guedj, B.\n\n\n \n\n\n\n 2022.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"OnPaper\n  \n \n \n \"On pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{picard2022divergence,\ntitle={On change of measure inequalities for $f$-divergences},\nauthor={Antoine Picard-Weibel and Benjamin Guedj},\nyear={2022},\nnote = "Submitted.",\nabstract = {We propose new change of measure inequalities based on $f$-divergences (of which the Kullback-Leibler divergence is a particular case). Our strategy relies on combining the Legendre transform of $f$-divergences and the Young-Fenchel inequality. By exploiting these new change of measure inequalities, we derive new PAC-Bayesian generalisation bounds with a complexity involving $f$-divergences, and holding in mostly unchartered settings (such as heavy-tailed losses). We instantiate our results for the most popular $f$-divergences.},\nurl = {https://arxiv.org/abs/2202.05568},\nurl_PDF = {https://arxiv.org/pdf/2202.05568.pdf},\neprint={2202.05568},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We propose new change of measure inequalities based on $f$-divergences (of which the Kullback-Leibler divergence is a particular case). Our strategy relies on combining the Legendre transform of $f$-divergences and the Young-Fenchel inequality. By exploiting these new change of measure inequalities, we derive new PAC-Bayesian generalisation bounds with a complexity involving $f$-divergences, and holding in mostly uncharted settings (such as heavy-tailed losses). We instantiate our results for the most popular $f$-divergences.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Reprint: a randomized extrapolation based on principal components for data augmentation.\n \n \n \n \n\n\n \n Wei, J.; Chen, Q.; Peng, P.; Guedj, B.; and Li, L.\n\n\n \n\n\n\n 2022.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"Reprint:Paper\n  \n \n \n \"Reprint: pdf\n  \n \n \n \"Reprint: code\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{wei2022reprint,\ntitle={Reprint: a randomized extrapolation based on principal components for data augmentation},\nauthor={Jiale Wei and Qiyuan Chen and Pai Peng and Benjamin Guedj and Le Li},\nyear={2022},\nnote = "Submitted.",\nabstract = {Data scarcity and data imbalance have attracted a lot of attention in many fields. Data augmentation, explored as an effective approach to tackle them, can improve the robustness and efficiency of classification models by generating new samples. This paper presents REPRINT, a simple and effective hidden-space data augmentation method for imbalanced data classification. Given hidden-space representations of samples in each class, REPRINT extrapolates, in a randomized fashion, augmented examples for target class by using subspaces spanned by principal components to summarize distribution structure of both source and target class. Consequently, the examples generated would diversify the target while maintaining the original geometry of target distribution. Besides, this method involves a label refinement component which allows to synthesize new soft labels for augmented examples. Compared with different NLP data augmentation approaches under a range of data imbalanced scenarios on four text classification benchmark, REPRINT shows prominent improvements. Moreover, through comprehensive ablation studies, we show that label refinement is better than label-preserving for augmented examples, and that our method suggests stable and consistent improvements in terms of suitable choices of principal components. Moreover, REPRINT is appealing for its easy-to-use since it contains only one hyperparameter determining the dimension of subspace and requires low computational resource.},\nurl = {https://arxiv.org/abs/2204.12024},\nurl_PDF = {https://arxiv.org/pdf/2204.12024.pdf},\nurl_Code = {https://github.com/bigdata-ccnu/REPRINT},\neprint={2204.12024},\narchivePrefix={arXiv},\nprimaryClass={cs.CL},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Data scarcity and data imbalance have attracted a lot of attention in many fields. Data augmentation, explored as an effective approach to tackle them, can improve the robustness and efficiency of classification models by generating new samples. This paper presents REPRINT, a simple and effective hidden-space data augmentation method for imbalanced data classification. Given hidden-space representations of samples in each class, REPRINT extrapolates, in a randomized fashion, augmented examples for the target class by using subspaces spanned by principal components to summarize the distribution structure of both the source and target classes. Consequently, the examples generated would diversify the target while maintaining the original geometry of the target distribution. Besides, this method involves a label refinement component which allows the synthesis of new soft labels for augmented examples. Compared with different NLP data augmentation approaches under a range of data-imbalanced scenarios on four text classification benchmarks, REPRINT shows prominent improvements. Moreover, through comprehensive ablation studies, we show that label refinement is better than label-preserving for augmented examples, and that our method yields stable and consistent improvements for suitable choices of principal components. Finally, REPRINT is appealing for its ease of use, since it contains only one hyperparameter, determining the dimension of the subspace, and has a low computational cost.\n
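A rough, hypothetical rendering of the extrapolation idea summarised above (not the REPRINT implementation, which lives in the linked repository): vectors of a minority class are perturbed along random combinations of the class's top principal components, scaled by the corresponding singular values.

```python
import numpy as np

def pca_extrapolate(hidden: np.ndarray, n_new: int, n_components: int = 5,
                    scale: float = 0.5, seed: int = 0) -> np.ndarray:
    """Generate n_new augmented vectors for a (minority) class by extrapolating
    real examples along random combinations of the class's top principal components."""
    rng = np.random.default_rng(seed)
    mean = hidden.mean(axis=0)
    # Principal directions of the class via SVD of the centred matrix.
    _, s, vt = np.linalg.svd(hidden - mean, full_matrices=False)
    comps, sing = vt[:n_components], s[:n_components]
    # Random coefficients, scaled by the (normalised) singular values.
    coeffs = scale * rng.standard_normal((n_new, n_components)) * (sing / np.sqrt(len(hidden)))
    base = hidden[rng.integers(len(hidden), size=n_new)]   # start from real examples
    return base + coeffs @ comps

# Hypothetical 64-dimensional hidden representations of a minority class.
rng = np.random.default_rng(5)
minority = rng.standard_normal((40, 64))
augmented = pca_extrapolate(minority, n_new=100)
print(augmented.shape)   # (100, 64)
```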
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online PAC-Bayesian Learning.\n \n \n \n \n\n\n \n Haddouche, M.; and Guedj, B.\n\n\n \n\n\n\n In Koyejo, S.; Mohamed, S.; Agarwal, A.; Belgrave, D.; Cho, K.; and Oh, A., editor(s), Advances in Neural Information Processing Systems [NeurIPS], volume 35, pages 25725–25738, 2022. Curran Associates, Inc.\n \n\n\n\n
\n\n\n\n \n \n \"OnlinePaper\n  \n \n \n \"Online pdf\n  \n \n \n \"Online supplementary\n  \n \n \n \"Online arxiv\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{haddouche2022online,\ntitle={Online {PAC-Bayesian} Learning},\nauthor={Maxime Haddouche and Benjamin Guedj},\nyear={2022},\nbooktitle = {Advances in Neural Information Processing Systems [NeurIPS]},\neditor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},\nabstract = {Most PAC-Bayesian bounds hold in the batch learning setting where data is collected at once, prior to inference or prediction. This somewhat departs from many contemporary learning problems where data streams are collected and the algorithms must dynamically adjust. We prove new PAC-Bayesian bounds in this online learning framework, leveraging an updated definition of regret, and we revisit classical PAC-Bayesian results with a batch-to-online conversion, extending their remit to the case of dependent data. Our results hold for bounded losses, potentially \\emph{non-convex}, paving the way to promising developments in online learning.},\npages = {25725--25738},\npublisher = {Curran Associates, Inc.},\nvolume = {35},\nurl = {https://papers.nips.cc/paper_files/paper/2022/hash/a4d991d581accd2955a1e1928f4e6965-Abstract-Conference.html},\nurl_PDF = {https://papers.nips.cc/paper_files/paper/2022/file/a4d991d581accd2955a1e1928f4e6965-Paper-Conference.pdf},\nurl_Supplementary = {https://papers.nips.cc/paper_files/paper/2022/file/a4d991d581accd2955a1e1928f4e6965-Supplemental-Conference.pdf},\nurl_arXiv = {https://arxiv.org/abs/2206.00024},\neprint={2206.00024},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Most PAC-Bayesian bounds hold in the batch learning setting where data is collected at once, prior to inference or prediction. This somewhat departs from many contemporary learning problems where data streams are collected and the algorithms must dynamically adjust. We prove new PAC-Bayesian bounds in this online learning framework, leveraging an updated definition of regret, and we revisit classical PAC-Bayesian results with a batch-to-online conversion, extending their remit to the case of dependent data. Our results hold for bounded losses, potentially non-convex, paving the way to promising developments in online learning.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Margins and Generalisation for Voting Classifiers.\n \n \n \n \n\n\n \n Biggs, F.; Zantedeschi, V.; and Guedj, B.\n\n\n \n\n\n\n In Koyejo, S.; Mohamed, S.; Agarwal, A.; Belgrave, D.; Cho, K.; and Oh, A., editor(s), Advances in Neural Information Processing Systems [NeurIPS], volume 35, pages 9713–9726, 2022. Curran Associates, Inc.\n \n\n\n\n
\n\n\n\n \n \n \"OnPaper\n  \n \n \n \"On pdf\n  \n \n \n \"On supplementary\n  \n \n \n \"On arxiv\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{biggs2022majority,\ntitle={On Margins and Generalisation for Voting Classifiers},\nauthor={Felix Biggs and Valentina Zantedeschi and Benjamin Guedj},\nyear={2022},\neditor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},\npages = {9713--9726},\nvolume = {35},\npublisher = {Curran Associates, Inc.},\nbooktitle = {Advances in Neural Information Processing Systems [NeurIPS]},\nabstract = {We study the generalisation properties of majority voting on finite ensembles of classifiers, proving margin-based generalisation bounds via the PAC-Bayes theory. These provide state-of-the-art guarantees on a number of classification tasks. Our central results leverage the Dirichlet posteriors studied recently by Zantedeschi et al. [2021] for training voting classifiers; in contrast to that work our bounds apply to non-randomised votes via the use of margins. Our contributions add perspective to the debate on the "margins theory" proposed by Schapire et al. [1998] for the generalisation of ensemble classifiers.},\nurl = {https://papers.nips.cc/paper_files/paper/2022/hash/3f8675af3da6da231c9e75b889b7f047-Abstract-Conference.html},\nurl_PDF = {https://papers.nips.cc/paper_files/paper/2022/file/3f8675af3da6da231c9e75b889b7f047-Paper-Conference.pdf},\nurl_Supplementary = {https://papers.nips.cc/paper_files/paper/2022/file/3f8675af3da6da231c9e75b889b7f047-Supplemental-Conference.pdf},\nurl_arXiv = {https://arxiv.org/abs/2206.04607},\neprint={2206.04607},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We study the generalisation properties of majority voting on finite ensembles of classifiers, proving margin-based generalisation bounds via the PAC-Bayes theory. These provide state-of-the-art guarantees on a number of classification tasks. Our central results leverage the Dirichlet posteriors studied recently by Zantedeschi et al. [2021] for training voting classifiers; in contrast to that work our bounds apply to non-randomised votes via the use of margins. Our contributions add perspective to the debate on the \"margins theory\" proposed by Schapire et al. [1998] for the generalisation of ensemble classifiers.\n
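The margins discussed above can be computed directly for a finite weighted vote. The snippet below is a generic illustration, not the paper's Dirichlet-posterior construction: for binary labels in {-1, +1}, the margin on an example is the weighted average of y times the voters' predictions, and the vote errs exactly when the margin is non-positive (ties counted as errors here).

```python
import numpy as np

def vote_margins(preds: np.ndarray, weights: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Margins of a weighted majority vote for binary labels in {-1, +1}.
    preds has shape (n_voters, n_examples) with entries in {-1, +1}."""
    w = weights / weights.sum()
    return y * (w @ preds)            # values in [-1, 1]; positive means a correct vote

# Hypothetical ensemble of 5 weak classifiers on 8 examples.
rng = np.random.default_rng(7)
y = rng.choice([-1, 1], size=8)
preds = np.where(rng.random((5, 8)) < 0.7, y, -y)   # each voter is right ~70% of the time
margins = vote_margins(preds, weights=np.ones(5), y=y)
print(margins)
print("majority-vote error rate:", float(np.mean(margins <= 0)))
```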
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficient Aggregated Kernel Tests using Incomplete $U$-statistics.\n \n \n \n \n\n\n \n Schrab, A.; Kim, I.; Guedj, B.; and Gretton, A.\n\n\n \n\n\n\n In Koyejo, S.; Mohamed, S.; Agarwal, A.; Belgrave, D.; Cho, K.; and Oh, A., editor(s), Advances in Neural Information Processing Systems [NeurIPS], volume 35, pages 18793–18807, 2022. Curran Associates, Inc.\n \n\n\n\n
\n\n\n\n \n \n \"EfficientPaper\n  \n \n \n \"Efficient pdf\n  \n \n \n \"Efficient supplementary\n  \n \n \n \"Efficient arxiv\n  \n \n \n \"Efficient slides\n  \n \n \n \"Efficient poster\n  \n \n \n \"Efficient posterbis\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 9 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{schrab2022ustats,\ntitle={Efficient Aggregated Kernel Tests using Incomplete $U$-statistics},\nauthor={Antonin Schrab and Ilmun Kim and Benjamin Guedj and Arthur Gretton},\nyear={2022},\neditor = {S. Koyejo and S. Mohamed and A. Agarwal and D. Belgrave and K. Cho and A. Oh},\npages = {18793--18807},\npublisher = {Curran Associates, Inc.},\nvolume = {35},\nbooktitle = {Advances in Neural Information Processing Systems [NeurIPS]},\nabstract = {We propose a series of computationally efficient, nonparametric tests for the two-sample, independence and goodness-of-fit problems, using the Maximum Mean Discrepancy (MMD), Hilbert Schmidt Independence Criterion (HSIC), and Kernel Stein Discrepancy (KSD), respectively. Our test statistics are incomplete $U$-statistics, with a computational cost that interpolates between linear time in the number of samples, and quadratic time, as associated with classical $U$-statistic tests. The three proposed tests aggregate over several kernel bandwidths to detect departures from the null on various scales: we call the resulting tests MMDAggInc, HSICAggInc and KSDAggInc. For the test thresholds, we derive a quantile bound for wild bootstrapped incomplete $U$-statistics, which is of independent interest. We derive uniform separation rates for MMDAggInc and HSICAggInc, and quantify exactly the trade-off between computational efficiency and the attainable rates: this result is novel for tests based on incomplete $U$-statistics, to our knowledge. We further show that in the quadratic-time case, the wild bootstrap incurs no penalty to test power over more widespread permutation-based approaches, since both attain the same minimax optimal rates (which in turn match the rates that use oracle quantiles). We support our claims with numerical experiments on the trade-off between computational efficiency and test power. In the three testing frameworks, we observe that our proposed linear-time aggregated tests obtain higher power than current state-of-the-art linear-time kernel tests.},\nurl = {https://papers.nips.cc/paper_files/paper/2022/hash/774164b966cc277c82a960934445140d-Abstract-Conference.html},\nurl_PDF = {https://papers.nips.cc/paper_files/paper/2022/file/774164b966cc277c82a960934445140d-Paper-Conference.pdf},\nurl_Supplementary = {https://papers.nips.cc/paper_files/paper/2022/file/774164b966cc277c82a960934445140d-Supplemental-Conference.pdf},\nurl_arXiv = {https://arxiv.org/abs/2206.09194},\nurl_Slides = {https://antoninschrab.github.io/files/Slides-31-05-22.pdf},\nurl_Poster = {},\nurl_Posterbis = {https://antoninschrab.github.io/files/Simplified_Poster.pdf},\neprint={2206.09194},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We propose a series of computationally efficient, nonparametric tests for the two-sample, independence and goodness-of-fit problems, using the Maximum Mean Discrepancy (MMD), Hilbert Schmidt Independence Criterion (HSIC), and Kernel Stein Discrepancy (KSD), respectively. Our test statistics are incomplete $U$-statistics, with a computational cost that interpolates between linear time in the number of samples, and quadratic time, as associated with classical $U$-statistic tests. The three proposed tests aggregate over several kernel bandwidths to detect departures from the null on various scales: we call the resulting tests MMDAggInc, HSICAggInc and KSDAggInc. For the test thresholds, we derive a quantile bound for wild bootstrapped incomplete $U$-statistics, which is of independent interest. We derive uniform separation rates for MMDAggInc and HSICAggInc, and quantify exactly the trade-off between computational efficiency and the attainable rates: this result is novel for tests based on incomplete $U$-statistics, to our knowledge. We further show that in the quadratic-time case, the wild bootstrap incurs no penalty to test power over more widespread permutation-based approaches, since both attain the same minimax optimal rates (which in turn match the rates that use oracle quantiles). We support our claims with numerical experiments on the trade-off between computational efficiency and test power. In the three testing frameworks, we observe that our proposed linear-time aggregated tests obtain higher power than current state-of-the-art linear-time kernel tests.\n
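To unpack what "incomplete U-statistic" means in the abstract above (a generic sketch, not the paper's MMDAggInc, HSICAggInc or KSDAggInc procedures): rather than averaging the MMD kernel term h over all ordered pairs of indices, one averages over a random subset of pairs, trading statistical precision for computation. The kernel, bandwidth and sample sizes below are arbitrary.

```python
import numpy as np

def gauss_k(a, b, sigma=1.0):
    return np.exp(-(a - b) ** 2 / (2.0 * sigma**2))

def h_term(xi, xj, yi, yj, sigma=1.0):
    """Core MMD^2 term h((x_i, y_i), (x_j, y_j)) for paired samples."""
    return (gauss_k(xi, xj, sigma) + gauss_k(yi, yj, sigma)
            - gauss_k(xi, yj, sigma) - gauss_k(xj, yi, sigma))

def mmd2_incomplete(x, y, n_pairs, seed=0):
    """Incomplete U-statistic for MMD^2: average h over n_pairs random index pairs
    (with i != j) instead of all n * (n - 1) ordered pairs."""
    rng = np.random.default_rng(seed)
    n = len(x)
    i = rng.integers(n, size=n_pairs)
    j = (i + 1 + rng.integers(n - 1, size=n_pairs)) % n    # guarantees j != i
    return float(np.mean(h_term(x[i], x[j], y[i], y[j])))

rng = np.random.default_rng(6)
x = rng.standard_normal(2000)
y = 0.3 + rng.standard_normal(2000)          # shifted distribution
print(mmd2_incomplete(x, y, n_pairs=5000))   # positive in expectation under this shift
```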
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Generalisation under gradient descent via deterministic PAC-Bayes.\n \n \n \n \n\n\n \n Clerico, E.; Farghly, T.; Deligiannidis, G.; Guedj, B.; and Doucet, A.\n\n\n \n\n\n\n 2022.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"GeneralisationPaper\n  \n \n \n \"Generalisation pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{clerico2022pacbayes,\ntitle = {Generalisation under gradient descent via deterministic {PAC-Bayes}},\nauthor = {Clerico, Eugenio and Farghly, Tyler and Deligiannidis, George and Guedj, Benjamin and Doucet, Arnaud},\nyear = {2022},\nnote = "Submitted.",\nabstract = {We establish disintegrated PAC-Bayesian generalisation bounds for models trained with gradient descent methods or continuous gradient flows. Contrary to standard practice in the PAC-Bayesian setting, our result applies to optimisation algorithms that are deterministic, without requiring any \\textit{de-randomisation} step. Our bounds are fully computable, depending on the density of the initial distribution and the Hessian of the training objective over the trajectory. We show that our framework can be applied to a variety of iterative optimisation algorithms, including stochastic gradient descent (SGD), momentum-based schemes, and damped Hamiltonian dynamics.},\nurl = {https://arxiv.org/abs/2209.02525},\nurl_PDF = {https://arxiv.org/pdf/2209.02525.pdf},\ndoi = {10.48550/ARXIV.2209.02525},\neprint={2209.02525},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\ncopyright = {Creative Commons Attribution 4.0 International},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We establish disintegrated PAC-Bayesian generalisation bounds for models trained with gradient descent methods or continuous gradient flows. Contrary to standard practice in the PAC-Bayesian setting, our result applies to optimisation algorithms that are deterministic, without requiring any de-randomisation step. Our bounds are fully computable, depending on the density of the initial distribution and the Hessian of the training objective over the trajectory. We show that our framework can be applied to a variety of iterative optimisation algorithms, including stochastic gradient descent (SGD), momentum-based schemes, and damped Hamiltonian dynamics.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2021\n \n \n (10)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Sequential Learning of Principal Curves: Summarizing Data Streams on the Fly.\n \n \n \n \n\n\n \n Li, L.; and Guedj, B.\n\n\n \n\n\n\n Entropy, 23(11). 2021.\n \n\n\n\n
\n\n\n\n \n \n \"SequentialPaper\n  \n \n \n \"Sequential arxiv\n  \n \n \n \"Sequential pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 2 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{guedj2018sequential,\ntitle={Sequential Learning of Principal Curves: Summarizing Data Streams on the Fly},\nauthor={Li, Le and Guedj, Benjamin},\njournal={Entropy},\nvolume = {23},\nnumber = {11},\nARTICLE-NUMBER = {1534},\nISSN = {1099-4300},\ndoi = {10.3390/e23111534},\nissuetitle = {Approximate Bayesian Inference},\nurl = {https://www.mdpi.com/1099-4300/23/11/1534},\nurl_arXiv = "https://arxiv.org/abs/1805.07418",\nurl_PDF = "https://www.mdpi.com/1099-4300/23/11/1534/pdf",\nabstract = {When confronted with massive data streams, summarizing data with dimension reduction methods such as PCA raises theoretical and algorithmic pitfalls. A principal curve acts as a nonlinear generalization of PCA, and the present paper proposes a novel algorithm to automatically and sequentially learn principal curves from data streams. We show that our procedure is supported by regret bounds with optimal sublinear remainder terms. A greedy local search implementation (called slpc, for sequential learning principal curves) that incorporates both sleeping experts and multi-armed bandit ingredients is presented, along with its regret computation and performance on synthetic and real-life data.},\nyear={2021},\neprint={1805.07418},\narchivePrefix={arXiv},\nprimaryClass={stat.ML},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n When confronted with massive data streams, summarizing data with dimension reduction methods such as PCA raises theoretical and algorithmic pitfalls. A principal curve acts as a nonlinear generalization of PCA, and the present paper proposes a novel algorithm to automatically and sequentially learn principal curves from data streams. We show that our procedure is supported by regret bounds with optimal sublinear remainder terms. A greedy local search implementation (called slpc, for sequential learning principal curves) that incorporates both sleeping experts and multi-armed bandit ingredients is presented, along with its regret computation and performance on synthetic and real-life data.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Online k-means Clustering.\n \n \n \n \n\n\n \n Cohen-Addad, V.; Guedj, B.; Kanade, V.; and Rom, G.\n\n\n \n\n\n\n In Banerjee, A.; and Fukumizu, K., editor(s), Proceedings of The 24th International Conference on Artificial Intelligence and Statistics [AISTATS], volume 130, of Proceedings of Machine Learning Research, pages 1126–1134, April 2021. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"OnlinePaper\n  \n \n \n \"Online arxiv\n  \n \n \n \"Online pdf\n  \n \n \n \"Online supplementary\n  \n \n \n \"Online video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@InProceedings{pmlr-v130-cohen-addad21a,\ntitle = \t {Online k-means Clustering},\nauthor =       {Cohen-Addad, Vincent and Guedj, Benjamin and Kanade, Varun and Rom, Guy},\nbooktitle = \t {Proceedings of The 24th International Conference on Artificial Intelligence and Statistics [AISTATS]},\npages = \t {1126--1134},\nyear = \t {2021},\neditor = \t {Banerjee, Arindam and Fukumizu, Kenji},\nvolume = \t {130},\nseries = \t {Proceedings of Machine Learning Research},\nmonth = \t {April},\npublisher =    {PMLR},\nurl = \t {http://proceedings.mlr.press/v130/cohen-addad21a.html},\nurl_arXiv = "https://arxiv.org/abs/1909.06861",\nurl_PDF = \t {http://proceedings.mlr.press/v130/cohen-addad21a/cohen-addad21a.pdf},\nurl_Supplementary = {http://proceedings.mlr.press/v130/cohen-addad21a/cohen-addad21a-supp.pdf},\nurl_Video = {https://slideslive.com/38953036},\nabstract = \t {We study the problem of learning a clustering of an online set of points. The specific formulation we use is the k-means objective: At each time step the algorithm has to maintain a set of k candidate centers and the loss incurred by the algorithm is the squared distance between the new point and the closest center. The goal is to minimize regret with respect to the best solution to the k-means objective in hindsight. We show that provided the data lies in a bounded region, learning is possible, namely an implementation of the Multiplicative Weights Update Algorithm (MWUA) using a discretized grid achieves a regret bound of $\\tilde{O}(\\sqrt{T})$ in expectation. We also present an online-to-offline reduction that shows that an efficient no-regret online algorithm (despite being allowed to choose a different set of candidate centres at each round) implies an offline efficient algorithm for the k-means problem, which is known to be NP-hard. In light of this hardness, we consider the slightly weaker requirement of comparing regret with respect to $(1 + \\epsilon)OPT$ and present a no-regret algorithm with runtime $O\\left(T \\mathrm{poly}(\\log(T),k,d,1/\\epsilon)^{O(kd)}\\right)$. Our algorithm is based on maintaining a set of points of bounded size which is a coreset that helps identifying the \\emph{relevant} regions of the space for running an adaptive, more efficient, variant of the MWUA. We show that simpler online algorithms, such as \\emph{Follow The Leader} (FTL), fail to produce sublinear regret in the worst case. We also report preliminary experiments with synthetic and real-world data. Our theoretical results answer an open question of Dasgupta (2008).},\nkeywords={mine}\n}\n\n\n\n
\n
\n\n\n
\n We study the problem of learning a clustering of an online set of points. The specific formulation we use is the k-means objective: At each time step the algorithm has to maintain a set of k candidate centers and the loss incurred by the algorithm is the squared distance between the new point and the closest center. The goal is to minimize regret with respect to the best solution to the k-means objective in hindsight. We show that provided the data lies in a bounded region, learning is possible, namely an implementation of the Multiplicative Weights Update Algorithm (MWUA) using a discretized grid achieves a regret bound of $\tilde{O}(\sqrt{T})$ in expectation. We also present an online-to-offline reduction that shows that an efficient no-regret online algorithm (despite being allowed to choose a different set of candidate centres at each round) implies an offline efficient algorithm for the k-means problem, which is known to be NP-hard. In light of this hardness, we consider the slightly weaker requirement of comparing regret with respect to $(1 + \epsilon)OPT$ and present a no-regret algorithm with runtime $O\left(T \, \mathrm{poly}(\log(T),k,d,1/\epsilon)^{O(kd)}\right)$. Our algorithm is based on maintaining a set of points of bounded size which is a coreset that helps identifying the relevant regions of the space for running an adaptive, more efficient, variant of the MWUA. We show that simpler online algorithms, such as Follow The Leader (FTL), fail to produce sublinear regret in the worst case. We also report preliminary experiments with synthetic and real-world data. Our theoretical results answer an open question of Dasgupta (2008).\n
\n\n\n
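For a feel of the discretised multiplicative-weights construction mentioned in the abstract, here is a minimal sketch, assuming a coarse fixed grid, a hand-picked learning rate eta and synthetic data (none of which are the paper's actual choices): every k-subset of the grid is treated as an expert and receives an exponential-weights update based on the squared distance from the new point to its closest centre.

# Toy sketch of multiplicative weights over a discretised grid for online k-means.
# Illustrative only: grid, learning rate and data are arbitrary choices, not the paper's settings.
import itertools
import numpy as np

rng = np.random.default_rng(0)
k, grid_step = 2, 0.5
grid = [np.array(p) for p in itertools.product(np.arange(0.0, 1.0 + 1e-9, grid_step), repeat=2)]
experts = list(itertools.combinations(range(len(grid)), k))   # one expert per set of k candidate centres
weights = np.ones(len(experts))
eta = 0.5                                                      # learning rate (an arbitrary choice)

stream = rng.random((200, 2))                                  # synthetic data stream in [0, 1]^2
cumulative_loss = 0.0
for x in stream:
    probs = weights / weights.sum()
    chosen = experts[rng.choice(len(experts), p=probs)]        # sample a candidate centre set
    cumulative_loss += min(np.sum((x - grid[c]) ** 2) for c in chosen)
    # full-information update: each expert pays the squared distance to its closest centre
    losses = np.array([min(np.sum((x - grid[c]) ** 2) for c in e) for e in experts])
    weights *= np.exp(-eta * losses)

print(f"cumulative loss of the multiplicative-weights sketch: {cumulative_loss:.2f}")

The paper additionally analyses an adaptive, coreset-based refinement of the grid; the sketch only shows the basic exponential-weights mechanics.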
\n\n\n
\n \n\n \n \n \n \n \n \n Still no free lunches: the price to pay for tighter PAC-Bayes bounds.\n \n \n \n \n\n\n \n Guedj, B.; and Pujol, L.\n\n\n \n\n\n\n Entropy, 23(11). 2021.\n \n\n\n\n
\n\n\n\n \n \n \"StillPaper\n  \n \n \n \"Still arxiv\n  \n \n \n \"Still pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{guedj2019free,\ntitle={Still no free lunches: the price to pay for tighter {PAC-Bayes} bounds},\nauthor={Benjamin Guedj and Louis Pujol},\nyear={2021},\njournal = {Entropy},\nvolume = {23},\nnumber = {11},\nARTICLE-NUMBER = {1529},\nISSN = {1099-4300},\ndoi = {10.3390/e23111529},\nissuetitle = {Approximate Bayesian Inference},\nabstract = {“No free lunch” results state the impossibility of obtaining meaningful bounds on the error of a learning algorithm without prior assumptions and modelling, which is more or less realistic for a given problem. Some models are “expensive” (strong assumptions, such as sub-Gaussian tails), others are “cheap” (simply finite variance). As it is well known, the more you pay, the more you get: in other words, the most expensive models yield the more interesting bounds. Recent advances in robust statistics have investigated procedures to obtain tight bounds while keeping the cost of assumptions minimal. The present paper explores and exhibits what the limits are for obtaining tight probably approximately correct (PAC)-Bayes bounds in a robust setting for cheap models.},\nurl = {https://www.mdpi.com/1099-4300/23/11/1529},\nurl_arXiv = "https://arxiv.org/abs/1910.04460",\nurl_PDF = "https://www.mdpi.com/1099-4300/23/11/1529/pdf",\neprint={1910.04460},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n “No free lunch” results state the impossibility of obtaining meaningful bounds on the error of a learning algorithm without prior assumptions and modelling, which is more or less realistic for a given problem. Some models are “expensive” (strong assumptions, such as sub-Gaussian tails), others are “cheap” (simply finite variance). As it is well known, the more you pay, the more you get: in other words, the most expensive models yield the more interesting bounds. Recent advances in robust statistics have investigated procedures to obtain tight bounds while keeping the cost of assumptions minimal. The present paper explores and exhibits what the limits are for obtaining tight probably approximately correct (PAC)-Bayes bounds in a robust setting for cheap models.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayes unleashed: generalisation bounds with unbounded losses.\n \n \n \n \n\n\n \n Haddouche, M.; Guedj, B.; Rivasplata, O.; and Shawe-Taylor, J.\n\n\n \n\n\n\n Entropy, 23(10). 2021.\n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesPaper\n  \n \n \n \"PAC-Bayes arxiv\n  \n \n \n \"PAC-Bayes pdf\n  \n \n \n \"PAC-Bayes code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{haddouche2020pacbayes,\ntitle={{PAC-Bayes} unleashed: generalisation bounds with unbounded losses},\nauthor={Maxime Haddouche and Benjamin Guedj and Omar Rivasplata and John Shawe-Taylor},\nyear={2021},\neprint={2006.07279},\narchivePrefix={arXiv},\njournal = {Entropy},\nvolume = {23},\nnumber = {10},\nARTICLE-NUMBER = {1330},\nISSN = {1099-4300},\nDOI = {10.3390/e23101330},\nissuetitle = {Approximate Bayesian Inference},\nabstract = "We present new PAC-Bayesian generalisation bounds for learning problems with unbounded loss functions. This extends the relevance and applicability of the PAC-Bayes learning framework, where most of the existing literature focuses on supervised learning problems with a bounded loss function (typically assumed to take values in the interval [0;1]). In order to relax this assumption, we propose a new notion called HYPE (standing for \\emph{HYPothesis-dependent rangE}), which effectively allows the range of the loss to depend on each predictor. Based on this new notion we derive a novel PAC-Bayesian generalisation bound for unbounded loss functions, and we instantiate it on a linear regression problem. To make our theory usable by the largest audience possible, we include discussions on actual computation, practicality and limitations of our assumptions.",\nprimaryClass={stat.ML},\nurl = "https://www.mdpi.com/1099-4300/23/10/1330",\nurl_arXiv = "https://arxiv.org/abs/2006.07279",\nurl_PDF = "https://arxiv.org/pdf/2006.07279.pdf",\nurl_Code = "https://github.com/bguedj/pac-bayes-self-bounding",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We present new PAC-Bayesian generalisation bounds for learning problems with unbounded loss functions. This extends the relevance and applicability of the PAC-Bayes learning framework, where most of the existing literature focuses on supervised learning problems with a bounded loss function (typically assumed to take values in the interval [0;1]). In order to relax this assumption, we propose a new notion called HYPE (standing for HYPothesis-dependent rangE), which effectively allows the range of the loss to depend on each predictor. Based on this new notion we derive a novel PAC-Bayesian generalisation bound for unbounded loss functions, and we instantiate it on a linear regression problem. To make our theory usable by the largest audience possible, we include discussions on actual computation, practicality and limitations of our assumptions.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Differentiable PAC-Bayes Objectives with Partially Aggregated Neural Networks.\n \n \n \n \n\n\n \n Biggs, F.; and Guedj, B.\n\n\n \n\n\n\n Entropy, 23(10). 2021.\n \n\n\n\n
\n\n\n\n \n \n \"DifferentiablePaper\n  \n \n \n \"Differentiable arxiv\n  \n \n \n \"Differentiable pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{biggs2020differentiable,\ntitle={Differentiable {PAC-Bayes} Objectives with Partially Aggregated Neural Networks},\nauthor={Felix Biggs and Benjamin Guedj},\nyear={2021},\njournal = {Entropy},\nvolume = {23},\nnumber = {10},\nARTICLE-NUMBER = {1280},\nISSN = {1099-4300},\nissuetitle = {Approximate Bayesian Inference},\neprint={2006.12228},\nabstract = "We make two related contributions motivated by the challenge of training stochastic neural networks, particularly in a PAC–Bayesian setting: (1) we show how averaging over an ensemble of stochastic neural networks enables a new class of partially-aggregated estimators, proving that these lead to unbiased lower-variance output and gradient estimators; (2) we reformulate a PAC–Bayesian bound for signed-output networks to derive in combination with the above a directly optimisable, differentiable objective and a generalisation guarantee, without using a surrogate loss or loosening the bound. We show empirically that this leads to competitive generalisation guarantees and compares favourably to other methods for training such networks. Finally, we note that the above leads to a simpler PAC–Bayesian training scheme for sign-activation networks than previous work.",\nurl = "https://www.mdpi.com/1099-4300/23/10/1280",\nurl_arXiv = "https://arxiv.org/abs/2006.12228",\nurl_PDF = "https://arxiv.org/pdf/2006.12228.pdf",\narchivePrefix={arXiv},\ndoi = "10.3390/e23101280",\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We make two related contributions motivated by the challenge of training stochastic neural networks, particularly in a PAC–Bayesian setting: (1) we show how averaging over an ensemble of stochastic neural networks enables a new class of partially-aggregated estimators, proving that these lead to unbiased lower-variance output and gradient estimators; (2) we reformulate a PAC–Bayesian bound for signed-output networks to derive in combination with the above a directly optimisable, differentiable objective and a generalisation guarantee, without using a surrogate loss or loosening the bound. We show empirically that this leads to competitive generalisation guarantees and compares favourably to other methods for training such networks. Finally, we note that the above leads to a simpler PAC–Bayesian training scheme for sign-activation networks than previous work.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Forecasting elections results via the voter model with stubborn nodes.\n \n \n \n \n\n\n \n Vendeville, A.; Guedj, B.; and Zhou, S.\n\n\n \n\n\n\n Applied Network Science, 6. 2021.\n \n\n\n\n
\n\n\n\n \n \n \"ForecastingPaper\n  \n \n \n \"Forecasting arxiv\n  \n \n \n \"Forecasting pdf\n  \n \n \n \"Forecasting code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{vendeville2020elections,\ntitle={Forecasting elections results via the voter model with stubborn nodes},\nauthor={Antoine Vendeville and Benjamin Guedj and Shi Zhou},\nyear={2021},\nvolume = {6},\njournal = {Applied Network Science},\nabstract = {In this paper we propose a novel method to forecast the result of elections using only official results of previous ones. It is based on the voter model with stubborn nodes and uses theoretical results developed in a previous work of ours. We look at popular vote shares for the Conservative and Labour parties in the UK and the Republican and Democrat parties in the US. We are able to perform time-evolving estimates of the model parameters and use these to forecast the vote shares for each party in any election. We obtain a mean absolute error of 4.74\\%. As a side product, our parameters estimates provide meaningful insight on the political landscape, informing us on the proportion of voters that are strong supporters of each of the considered parties.},\nurl = "https://appliednetsci.springeropen.com/articles/10.1007/s41109-020-00342-7",\nurl_arXiv = "https://arxiv.org/abs/2009.10627",\nurl_PDF = "https://arxiv.org/pdf/2009.10627.pdf",\nurl_Code = "https://github.com/AntoineVendeville/HowOpinionsCrystallise",\nDOI = {10.1007/s41109-020-00342-7},\neprint={2009.10627},\narchivePrefix={arXiv},\nprimaryClass={cs.SI},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n In this paper we propose a novel method to forecast the result of elections using only official results of previous ones. It is based on the voter model with stubborn nodes and uses theoretical results developed in a previous work of ours. We look at popular vote shares for the Conservative and Labour parties in the UK and the Republican and Democrat parties in the US. We are able to perform time-evolving estimates of the model parameters and use these to forecast the vote shares for each party in any election. We obtain a mean absolute error of 4.74%. As a side product, our parameter estimates provide meaningful insight on the political landscape, informing us on the proportion of voters that are strong supporters of each of the considered parties.\n
\n\n\n
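As a rough illustration of the mechanism (and emphatically not the authors' estimation procedure), the sketch below simulates a two-party voter model with stubborn nodes on a complete graph: free agents repeatedly copy the opinion of a uniformly chosen other agent, stubborn agents never update, and the long-run average opinion acts as a naive vote-share forecast. The network size, the numbers of stubborn nodes and the horizon are made-up values.

# Minimal voter-model-with-stubborn-nodes simulation on a complete graph (illustration only).
import numpy as np

rng = np.random.default_rng(1)
n_free, stubborn_a, stubborn_b = 100, 12, 8        # hypothetical counts, not fitted to any election
opinions = np.concatenate([
    rng.integers(0, 2, n_free),                    # free agents start at random
    np.ones(stubborn_a, dtype=int),                # zealots for party A (opinion 1)
    np.zeros(stubborn_b, dtype=int),               # zealots for party B (opinion 0)
])
n = len(opinions)
free_idx = np.arange(n_free)                       # only free agents ever update

shares = []
for _ in range(20_000):
    i = rng.choice(free_idx)                       # pick a free agent
    j = rng.integers(n - 1)                        # pick another agent uniformly (complete graph)
    j += j >= i
    opinions[i] = opinions[j]                      # copy that agent's opinion
    shares.append(opinions.mean())

print(f"naive forecast of party A's vote share: {np.mean(shares[-5000:]):.3f}")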
\n\n\n
\n \n\n \n \n \n \n \n \n Covid-19 and AI: Unexpected Challenges and Lessons.\n \n \n \n \n\n\n \n Guedj, B.\n\n\n \n\n\n\n Technical Report July 2021.\n \n\n\n\n
\n\n\n\n \n \n \"Covid-19Paper\n  \n \n \n \"Covid-19 pdf\n  \n \n \n \"Covid-19 video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@techreport{guedj-hal-03277494,\nTITLE = {{Covid-19} and {AI}: Unexpected Challenges and Lessons},\nAUTHOR = {Guedj, Benjamin},\nurl = {https://hal.inria.fr/hal-03277494},\nYEAR = {2021},\nMONTH = {July},\ntype = {Research Report},\nabstract = {On May 21st, 2021, we held the webinar "Covid-19 and AI: unexpected challenges and lessons". This short note presents its highlights.},\nurl_PDF = {https://hal.inria.fr/hal-03277494v2/file/main.pdf},\nurl_Video = {https://youtu.be/5AnQ35tBnv0},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n On May 21st, 2021, we held the webinar \"Covid-19 and AI: unexpected challenges and lessons\". This short note presents its highlights.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Stochastic Majority Votes by Minimizing a PAC-Bayes Generalization Bound.\n \n \n \n \n\n\n \n Zantedeschi, V.; Viallard, P.; Morvant, E.; Emonet, R.; Habrard, A.; Germain, P.; and Guedj, B.\n\n\n \n\n\n\n In Beygelzimer, A.; Liang, P.; Vaughan, J. W.; and Dauphin, Y., editor(s), Advances in Neural Information Processing Systems 34: Annual Conference on Neural Information Processing Systems [NeurIPS] 2021, 2021. \n \n\n\n\n
\n\n\n\n \n \n \"LearningPaper\n  \n \n \n \"Learning arxiv\n  \n \n \n \"Learning pdf\n  \n \n \n \"Learning supplementary\n  \n \n \n \"Learning code\n  \n \n \n \"Learning video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{zantedeschi2021learning,\ntitle={Learning Stochastic Majority Votes by Minimizing a {PAC-Bayes} Generalization Bound},\nauthor={Valentina Zantedeschi and Paul Viallard and Emilie Morvant and Rémi Emonet and Amaury Habrard and Pascal Germain and Benjamin Guedj},\nyear={2021},\neditor    = {Alina Beygelzimer and Percy Liang and Jenn Wortman Vaughan and Yann Dauphin},\nbooktitle = {Advances in Neural Information Processing Systems 34: Annual Conference\non Neural Information Processing Systems [NeurIPS] 2021},\npages     = {},\nabstract = {We investigate a stochastic counterpart of majority votes over finite ensembles of classifiers, and study its generalization properties. While our approach holds for arbitrary distributions, we instantiate it with Dirichlet distributions: this allows for a closed-form and differentiable expression for the expected risk, which then turns the generalization bound into a tractable training objective.The resulting stochastic majority vote learning algorithm achieves state-of-the-art accuracy and benefits from (non-vacuous) tight generalization bounds, in a series of numerical experiments when compared to competing algorithms which also minimize PAC-Bayes objectives -- both with uninformed (data-independent) and informed (data-dependent) priors.},\nurl = {https://proceedings.neurips.cc/paper/2021/hash/0415740eaa4d9decbc8da001d3fd805f-Abstract.html},\nurl_arXiv = {https://arxiv.org/abs/2106.12535},\nurl_PDF = {https://proceedings.neurips.cc/paper/2021/file/0415740eaa4d9decbc8da001d3fd805f-Paper.pdf},\nurl_Supplementary = {https://proceedings.neurips.cc/paper/2021/file/0415740eaa4d9decbc8da001d3fd805f-Supplemental.pdf},\nurl_Code = {https://github.com/vzantedeschi/StocMV},\nurl_Video = {https://nips.cc/virtual/2021/poster/28787},\neprint={2106.12535},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We investigate a stochastic counterpart of majority votes over finite ensembles of classifiers, and study its generalization properties. While our approach holds for arbitrary distributions, we instantiate it with Dirichlet distributions: this allows for a closed-form and differentiable expression for the expected risk, which then turns the generalization bound into a tractable training objective. The resulting stochastic majority vote learning algorithm achieves state-of-the-art accuracy and benefits from (non-vacuous) tight generalization bounds, in a series of numerical experiments when compared to competing algorithms which also minimize PAC-Bayes objectives – both with uninformed (data-independent) and informed (data-dependent) priors.\n
\n\n\n
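The sketch below is only meant to make the object concrete: it Monte Carlo-estimates the risk of a Dirichlet-weighted majority vote over a handful of synthetic base classifiers. The paper's point is precisely that, with Dirichlet weights, this expected risk admits a closed-form, differentiable expression that can be optimised against a PAC-Bayes bound; the naive sampling below, the base-classifier accuracies and the concentration parameters are all illustrative assumptions.

# Monte Carlo illustration of a stochastic (Dirichlet-weighted) majority vote.
# Not the paper's closed-form risk: just the naive sampling estimate it avoids.
import numpy as np

rng = np.random.default_rng(2)
n_voters, n_points = 5, 400
y = rng.integers(0, 2, n_points) * 2 - 1                   # labels in {-1, +1}
# hypothetical base classifiers: each agrees with the label with its own accuracy
accs = np.array([0.55, 0.60, 0.62, 0.58, 0.70])
votes = np.where(rng.random((n_voters, n_points)) < accs[:, None], y, -y)

alpha = np.ones(n_voters)                                   # Dirichlet concentration (assumption)
errors = []
for _ in range(2000):
    theta = rng.dirichlet(alpha)                            # sample a weight vector
    prediction = np.sign(theta @ votes)                     # weighted majority vote
    errors.append(np.mean(prediction != y))

print(f"Monte Carlo risk of the stochastic majority vote: {np.mean(errors):.3f}")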
\n\n\n
\n \n\n \n \n \n \n \n \n Learning PAC-Bayes Priors for Probabilistic Neural Networks.\n \n \n \n \n\n\n \n Pérez-Ortiz, M.; Rivasplata, O.; Guedj, B.; Gleeson, M.; Zhang, J.; Shawe-Taylor, J.; Bober, M.; and Kittler, J.\n\n\n \n\n\n\n 2021.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"LearningPaper\n  \n \n \n \"Learning pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{perezortiz2021learning,\ntitle={Learning {PAC-Bayes} Priors for Probabilistic Neural Networks},\nauthor={Mar\\'ia P\\'erez-Ortiz and Omar Rivasplata and Benjamin Guedj and Matthew Gleeson and Jingyu Zhang and John Shawe-Taylor and Miroslaw Bober and Josef Kittler},\nyear={2021},\nnote = "Submitted.",\nabstract = {Recent works have investigated deep learning models trained by optimising PAC-Bayes bounds, with priors that are learnt on subsets of the data. This combination has been shown to lead not only to accurate classifiers, but also to remarkably tight risk certificates, bearing promise towards self-certified learning (i.e. use all the data to learn a predictor and certify its quality). In this work, we empirically investigate the role of the prior. We experiment on 6 datasets with different strategies and amounts of data to learn data-dependent PAC-Bayes priors, and we compare them in terms of their effect on test performance of the learnt predictors and tightness of their risk certificate. We ask what is the optimal amount of data which should be allocated for building the prior and show that the optimum may be dataset dependent. We demonstrate that using a small percentage of the prior-building data for validation of the prior leads to promising results. We include a comparison of underparameterised and overparameterised models, along with an empirical study of different training objectives and regularisation strategies to learn the prior distribution.},\nurl = {https://arxiv.org/abs/2109.10304},\nurl_PDF = {https://arxiv.org/pdf/2109.10304.pdf},\neprint={2109.10304},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Recent works have investigated deep learning models trained by optimising PAC-Bayes bounds, with priors that are learnt on subsets of the data. This combination has been shown to lead not only to accurate classifiers, but also to remarkably tight risk certificates, bearing promise towards self-certified learning (i.e. use all the data to learn a predictor and certify its quality). In this work, we empirically investigate the role of the prior. We experiment on 6 datasets with different strategies and amounts of data to learn data-dependent PAC-Bayes priors, and we compare them in terms of their effect on test performance of the learnt predictors and tightness of their risk certificate. We ask what is the optimal amount of data which should be allocated for building the prior and show that the optimum may be dataset dependent. We demonstrate that using a small percentage of the prior-building data for validation of the prior leads to promising results. We include a comparison of underparameterised and overparameterised models, along with an empirical study of different training objectives and regularisation strategies to learn the prior distribution.\n
\n\n\n
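To make the data-allocation question above concrete, here is a minimal sketch of the kind of split involved, with made-up fractions (the paper's finding is precisely that the right amounts are dataset-dependent): part of the sample learns the prior, a small slice of that part validates it, and the data never touched by the prior remains available for evaluating the risk certificate.

# Illustrative split for data-dependent PAC-Bayes priors (fractions are assumptions, not recommendations).
import numpy as np

def split_for_prior(n, prior_frac=0.5, val_frac_of_prior=0.1, seed=0):
    rng = np.random.default_rng(seed)
    idx = rng.permutation(n)
    n_prior = int(prior_frac * n)
    n_val = int(val_frac_of_prior * n_prior)
    prior_idx = idx[: n_prior - n_val]        # learn the prior on these points
    val_idx = idx[n_prior - n_val : n_prior]  # validate / select the prior on these
    cert_idx = idx[n_prior:]                  # never seen by the prior: usable for the risk certificate
    return prior_idx, val_idx, cert_idx

prior_idx, val_idx, cert_idx = split_for_prior(n=60_000)
print(len(prior_idx), len(val_idx), len(cert_idx))   # 27000 3000 30000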
\n\n\n
\n \n\n \n \n \n \n \n \n Progress in self-certified neural networks.\n \n \n \n \n\n\n \n Pérez-Ortiz, M.; Rivasplata, O.; Parrado-Hernandez, E.; Guedj, B.; and Shawe-Taylor, J.\n\n\n \n\n\n\n In NeurIPS 2021 workshop Bayesian Deep Learning [BDL], 2021. \n \n\n\n\n
\n\n\n\n \n \n \"ProgressPaper\n  \n \n \n \"Progress arxiv\n  \n \n \n \"Progress pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{perezortiz2021progress,\ntitle={Progress in self-certified neural networks},\nauthor={Mar\\'ia P\\'erez-Ortiz and Omar Rivasplata and Emilio Parrado-Hernandez and Benjamin Guedj and John Shawe-Taylor},\nyear={2021},\nbooktitle={{NeurIPS 2021 workshop Bayesian Deep Learning [BDL]}},\nabstract = {A learning method is self-certified if it uses all available data to simultaneously learn a predictor and certify its quality with a statistical certificate that is valid on unseen data. Recent work has shown that neural network models trained by optimising PAC-Bayes bounds lead not only to accurate predictors, but also to tight risk certificates, bearing promise towards achieving self-certified learning. In this context, learning and certification strategies based on PAC-Bayes bounds are especially attractive due to their ability to leverage all data to learn a posterior and simultaneously certify its risk. In this paper, we assess the progress towards self-certification in probabilistic neural networks learnt by PAC-Bayes inspired objectives. We empirically compare (on 4 classification datasets) classical test set bounds for deterministic predictors and a PAC-Bayes bound for randomised self-certified predictors. We first show that both of these generalisation bounds are not too far from out-of-sample test set errors. We then show that in data starvation regimes, holding out data for the test set bounds adversely affects generalisation performance, while self-certified strategies based on PAC-Bayes bounds do not suffer from this drawback, proving that they might be a suitable choice for the small data regime. We also find that probabilistic neural networks learnt by PAC-Bayes inspired objectives lead to certificates that can be surprisingly competitive with commonly used test set bounds.},\nurl = "http://bayesiandeeplearning.org/2021/papers/38.pdf",\nurl_arXiv = {https://arxiv.org/abs/2111.07737},\nurl_PDF = {https://arxiv.org/pdf/2111.07737.pdf},\neprint={2111.07737},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n A learning method is self-certified if it uses all available data to simultaneously learn a predictor and certify its quality with a statistical certificate that is valid on unseen data. Recent work has shown that neural network models trained by optimising PAC-Bayes bounds lead not only to accurate predictors, but also to tight risk certificates, bearing promise towards achieving self-certified learning. In this context, learning and certification strategies based on PAC-Bayes bounds are especially attractive due to their ability to leverage all data to learn a posterior and simultaneously certify its risk. In this paper, we assess the progress towards self-certification in probabilistic neural networks learnt by PAC-Bayes inspired objectives. We empirically compare (on 4 classification datasets) classical test set bounds for deterministic predictors and a PAC-Bayes bound for randomised self-certified predictors. We first show that both of these generalisation bounds are not too far from out-of-sample test set errors. We then show that in data starvation regimes, holding out data for the test set bounds adversely affects generalisation performance, while self-certified strategies based on PAC-Bayes bounds do not suffer from this drawback, proving that they might be a suitable choice for the small data regime. We also find that probabilistic neural networks learnt by PAC-Bayes inspired objectives lead to certificates that can be surprisingly competitive with commonly used test set bounds.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2020\n \n \n (11)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Decentralized Learning with Budgeted Network Load Using Gaussian Copulas and Classifier Ensembles.\n \n \n \n \n\n\n \n Klein, J.; Albardan, M.; Guedj, B.; and Colot, O.\n\n\n \n\n\n\n In Cellier, P.; and Driessens, K., editor(s), ECML-PKDD 2019: Machine Learning and Knowledge Discovery in Databases, pages 301–316, 2020. Springer International Publishing\n \n\n\n\n
\n\n\n\n \n \n \"DecentralizedPaper\n  \n \n \n \"Decentralized arxiv\n  \n \n \n \"Decentralized pdf\n  \n \n \n \"Decentralized code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{klein2018,\nauthor={Klein, John and Albardan, Mahmoud and Guedj, Benjamin and Colot, Olivier},\neditor="Cellier, Peggy\nand Driessens, Kurt",\ntitle={Decentralized Learning with Budgeted Network Load Using {Gaussian} Copulas and Classifier Ensembles},\nbooktitle="ECML-PKDD 2019: Machine Learning and Knowledge Discovery in Databases",\nyear="2020",\npublisher="Springer International Publishing",\npages="301--316",\nabstract="We examine a network of learners which address the same classification task but must learn from different data sets. The learners cannot share data but instead share their models. Models are shared only one time so as to preserve the network load. We introduce DELCO (standing for Decentralized Ensemble Learning with COpulas), a new approach allowing to aggregate the predictions of the classifiers trained by each learner. The proposed method aggregates the base classifiers using a probabilistic model relying on Gaussian copulas. Experiments on logistic regressor ensembles demonstrate competing accuracy and increased robustness in case of dependent classifiers. A companion python implementation can be downloaded at https://github.com/john-klein/DELCO.",\nisbn="978-3-030-43823-4",\ndoi = "10.1007/978-3-030-43823-4_26",\nurl = "https://link.springer.com/chapter/10.1007%2F978-3-030-43823-4_26",\nurl_arXiv = "https://arxiv.org/abs/1804.10028",\nurl_PDF = "https://arxiv.org/pdf/1804.10028.pdf",\nurl_Code = "https://github.com/john-klein/DELCO",\nkeywords={mine}\n}\n\n\n
\n
\n\n\n
\n We examine a network of learners which address the same classification task but must learn from different data sets. The learners cannot share data but instead share their models. Models are shared only one time so as to preserve the network load. We introduce DELCO (standing for Decentralized Ensemble Learning with COpulas), a new approach allowing to aggregate the predictions of the classifiers trained by each learner. The proposed method aggregates the base classifiers using a probabilistic model relying on Gaussian copulas. Experiments on logistic regressor ensembles demonstrate competing accuracy and increased robustness in case of dependent classifiers. A companion python implementation can be downloaded at https://github.com/john-klein/DELCO.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Revisiting clustering as matrix factorisation on the Stiefel manifold.\n \n \n \n \n\n\n \n Chrétien, S.; and Guedj, B.\n\n\n \n\n\n\n In Nicosia, G.; Ojha, V.; La Malfa, E.; Jansen, G.; Sciacca, V.; Pardalos, P.; Giuffrida, G.; and Umeton, R., editor(s), LOD – The Sixth International Conference on Machine Learning, Optimization, and Data Science, pages 1–12, 2020. Springer International Publishing\n \n\n\n\n
\n\n\n\n \n \n \"RevisitingPaper\n  \n \n \n \"Revisiting arxiv\n  \n \n \n \"Revisiting pdf\n  \n \n \n \"Revisiting video\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{chretien2019revisiting,\ntitle={Revisiting clustering as matrix factorisation on the {Stiefel} manifold},\nauthor={Chr{\\'e}tien, St{\\'e}phane and Guedj, Benjamin},\nyear={2020},\neditor="Nicosia, Giuseppe\nand Ojha, Varun\nand La Malfa, Emanuele\nand Jansen, Giorgio\nand Sciacca, Vincenzo\nand Pardalos, Panos\nand Giuffrida, Giovanni\nand Umeton, Renato",\npublisher="Springer International Publishing",\npages="1--12",\nabstract = "This paper studies clustering for possibly high dimensional data (e.g. images, time series, gene expression data, and many other settings), and rephrase it as low rank matrix estimation in the PAC-Bayesian framework. Our approach leverages the well known Burer-Monteiro factorisation strategy from large scale optimisation, in the context of low rank estimation. Moreover, our Burer-Monteiro factors are shown to lie on a Stiefel manifold. We propose a new generalized Bayesian estimator for this problem and prove novel prediction bounds for clustering. We also devise a componentwise Langevin sampler on the Stiefel manifold to compute this estimator.",\nbooktitle="LOD -- The Sixth International Conference on Machine Learning, Optimization, and Data Science",\nisbn="978-3-030-64583-0",\nurl = "https://link.springer.com/chapter/10.1007%2F978-3-030-64583-0_1",\nurl_arXiv = "https://arxiv.org/abs/1903.04479",\nurl_PDF = "https://arxiv.org/pdf/1903.04479.pdf",\nurl_Video = "https://youtu.be/Gz2euWW8kyA",\nDOI = "10.1007/978-3-030-64583-0_1",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n This paper studies clustering for possibly high dimensional data (e.g. images, time series, gene expression data, and many other settings), and rephrase it as low rank matrix estimation in the PAC-Bayesian framework. Our approach leverages the well known Burer-Monteiro factorisation strategy from large scale optimisation, in the context of low rank estimation. Moreover, our Burer-Monteiro factors are shown to lie on a Stiefel manifold. We propose a new generalized Bayesian estimator for this problem and prove novel prediction bounds for clustering. We also devise a componentwise Langevin sampler on the Stiefel manifold to compute this estimator.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Non-linear aggregation of filters to improve image denoising.\n \n \n \n \n\n\n \n Guedj, B.; and Rengot, J.\n\n\n \n\n\n\n In Arai, K.; Kapoor, S.; and Bhatia, R., editor(s), SAI: Intelligent Computing, pages 314–327, 2020. Springer International Publishing\n \n\n\n\n
\n\n\n\n \n \n \"Non-linearPaper\n  \n \n \n \"Non-linear arxiv\n  \n \n \n \"Non-linear pdf\n  \n \n \n \"Non-linear code\n  \n \n \n \"Non-linear video\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{guedj2019nonlinear,\ntitle={Non-linear aggregation of filters to improve image denoising},\nauthor={Guedj, Benjamin and Rengot, Juliette},\nbooktitle="SAI: Intelligent Computing",\neditor="Arai, Kohei\nand Kapoor, Supriya\nand Bhatia, Rahul",\npublisher="Springer International Publishing",\npages="314--327",\nabstract="We introduce a novel aggregation method to efficiently perform image denoising. Preliminary filters are aggregated in a non-linear fashion, using a new metric of pixel proximity based on how the pool of filters reaches a consensus. We provide a theoretical bound to support our aggregation scheme, its numerical performance is illustrated and we show that the aggregate significantly outperforms each of the preliminary filters.",\nisbn="978-3-030-52246-9",\ndoi = "10.1007/978-3-030-52246-9_22",\nyear={2020},\neprint={1904.00865},\narchivePrefix={arXiv},\nurl = "https://link.springer.com/chapter/10.1007%2F978-3-030-52246-9_22",\nurl_arXiv = "https://arxiv.org/abs/1904.00865",\nurl_PDF = "https://arxiv.org/pdf/1904.00865.pdf",\nurl_Code = "https://github.com/rengotj/cobra_denoising",\nurl_Video = "https://youtu.be/cX1DErrxzOk",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We introduce a novel aggregation method to efficiently perform image denoising. Preliminary filters are aggregated in a non-linear fashion, using a new metric of pixel proximity based on how the pool of filters reaches a consensus. We provide a theoretical bound to support our aggregation scheme, its numerical performance is illustrated and we show that the aggregate significantly outperforms each of the preliminary filters.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Attributing and Referencing (Research) Software: Best Practices and Outlook From Inria.\n \n \n \n \n\n\n \n Alliez, P.; Di Cosmo, R.; Guedj, B.; Girault, A.; Hacid, M.; Legrand, A.; and Rougier, N.\n\n\n \n\n\n\n Computing in Science & Engineering, 22(1): 39–52. Jan 2020.\n \n\n\n\n
\n\n\n\n \n \n \"AttributingPaper\n  \n \n \n \"Attributing arxiv\n  \n \n \n \"Attributing pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{Alliez_2020,\ntitle={Attributing and Referencing (Research) Software: Best Practices and Outlook From {Inria}},\nvolume={22},\nISSN={1558-366X},\nurl={http://dx.doi.org/10.1109/MCSE.2019.2949413},\nurl_arXiv = "https://arxiv.org/abs/1905.11123",\nurl_PDF = "https://arxiv.org/pdf/1905.11123.pdf",\nDOI={10.1109/mcse.2019.2949413},\nnumber={1},\nabstract = "Software is a fundamental pillar of modern scientific research, across all fields and disciplines. However, there is a lack of adequate means to cite and reference software due to the complexity of the problem in terms of authorship, roles, and credits. This complexity is further increased when it is considered over the lifetime of a software that can span up to several decades. Building upon the internal experience of Inria, the French research institute for digital sciences, we provide in this article a contribution to the ongoing efforts in order to develop proper guidelines and recommendations for software citation and reference. Namely, we recommend: first, a richer taxonomy for software contributions with a qualitative scale; second, to put humans at the heart of the evaluation; and third, to distinguish citation from reference.",\njournal={Computing in Science \\& Engineering},\npublisher={Institute of Electrical and Electronics Engineers (IEEE)},\nauthor={Alliez, Pierre and Di Cosmo, Roberto and Guedj, Benjamin and Girault, Alain and Hacid, Mohand-Said and Legrand, Arnaud and Rougier, Nicolas},\nyear={2020},\nmonth={Jan},\npages={39–52},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Software is a fundamental pillar of modern scientific research, across all fields and disciplines. However, there is a lack of adequate means to cite and reference software due to the complexity of the problem in terms of authorship, roles, and credits. This complexity is further increased when it is considered over the lifetime of a software that can span up to several decades. Building upon the internal experience of Inria, the French research institute for digital sciences, we provide in this article a contribution to the ongoing efforts in order to develop proper guidelines and recommendations for software citation and reference. Namely, we recommend: first, a richer taxonomy for software contributions with a qualitative scale; second, to put humans at the heart of the evaluation; and third, to distinguish citation from reference.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayesian Contrastive Unsupervised Representation Learning.\n \n \n \n \n\n\n \n Nozawa, K.; Germain, P.; and Guedj, B.\n\n\n \n\n\n\n In Conference on Uncertainty in Artificial Intelligence [UAI], 2020. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesianPaper\n  \n \n \n \"PAC-Bayesian arxiv\n  \n \n \n \"PAC-Bayesian pdf\n  \n \n \n \"PAC-Bayesian supplementary\n  \n \n \n \"PAC-Bayesian code\n  \n \n \n \"PAC-Bayesian video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{nozawa2019pacbayesian,\ntitle={{PAC-Bayesian} Contrastive Unsupervised Representation Learning},\nauthor={Kento Nozawa and Pascal Germain and Benjamin Guedj},\nyear={2020},\nabstract = "Contrastive unsupervised representation learning (CURL) is the state-of-the-art technique to learn representations (as a set of features) from unlabelled data. While CURL has collected several empirical successes recently, theoretical understanding of its performance was still missing. In a recent work, Arora et al. (2019) provide the first generalisation bounds for CURL, relying on a Rademacher complexity. We extend their framework to the flexible PAC-Bayes setting, allowing us to deal with the non-iid setting. We present PAC-Bayesian generalisation bounds for CURL, which are then used to derive a new representation learning algorithm. Numerical experiments on real-life datasets illustrate that our algorithm achieves competitive accuracy, and yields non-vacuous generalisation bounds.",\nbooktitle = {{Conference on Uncertainty in Artificial Intelligence [UAI]}},\nurl = "https://proceedings.mlr.press/v124/nozawa20a.html",\nurl_arXiv = "https://arxiv.org/abs/1910.04464",\nurl_PDF = "http://proceedings.mlr.press/v124/nozawa20a/nozawa20a.pdf",\nurl_Supplementary = "http://proceedings.mlr.press/v124/nozawa20a/nozawa20a-supp.pdf",\nurl_Code = "https://github.com/nzw0301/pb-contrastive",\nurl_Video = "https://youtu.be/WUh3Fgo5nhY",\neprint={1910.04464},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Contrastive unsupervised representation learning (CURL) is the state-of-the-art technique to learn representations (as a set of features) from unlabelled data. While CURL has collected several empirical successes recently, theoretical understanding of its performance was still missing. In a recent work, Arora et al. (2019) provide the first generalisation bounds for CURL, relying on a Rademacher complexity. We extend their framework to the flexible PAC-Bayes setting, allowing us to deal with the non-iid setting. We present PAC-Bayesian generalisation bounds for CURL, which are then used to derive a new representation learning algorithm. Numerical experiments on real-life datasets illustrate that our algorithm achieves competitive accuracy, and yields non-vacuous generalisation bounds.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n From industry-wide parameters to aircraft-centric on-flight inference: improving aeronautics performance prediction with machine learning.\n \n \n \n \n\n\n \n Dewez, F.; Guedj, B.; and Vandewalle, V.\n\n\n \n\n\n\n Data-Centric Engineering, 1. 2020.\n \n\n\n\n
\n\n\n\n \n \n \"FromPaper\n  \n \n \n \"From arxiv\n  \n \n \n \"From pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 25 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{dewez2020industrywide,\ntitle={From industry-wide parameters to aircraft-centric on-flight inference: improving aeronautics performance prediction with machine learning},\nauthor={Florent Dewez and Benjamin Guedj and Vincent Vandewalle},\nyear={2020},\njournal={Data-Centric Engineering},\npublisher={Cambridge University Press},\nabstract = "Aircraft performance models play a key role in airline operations, especially in planning a fuel-efficient flight. In practice, manufacturers provide guidelines calibrated on one single aircraft, with performance modelling for all similar aircrafts (\\emph{i.e.} same model) relying solely on that. In particular, it may poorly reflect on the current performance of a given aircraft. However, for each aircraft, flight data are continuously recorded and as such, not used to improve on the existing models. The key contribution of the present article is to foster the use of machine learning to leverage the massive amounts of collected data and update the models to reflect the actual performance of the aircraft. We illustrate our approach by focusing on the estimation of the drag and lift coefficients from recorded flight data. As these coefficients are not directly recorded, we resort to aerodynamics approximations. As a safety check, we provide bounds to assess the accuracy of both the aerodynamics approximation and the statistical performance of our approach. We provide numerical results on a collection of machine learning algorithms. We report excellent accuracy on real-life data and exhibit empirical evidence to support our modelling, in coherence with aerodynamics principles.",\neprint={2005.05286},\nvolume = "1",\ndoi = {10.1017/dce.2020.12},\nurl = "https://www.cambridge.org/core/journals/data-centric-engineering/article/from-industrywide-parameters-to-aircraftcentric-onflight-inference-improving-aeronautics-performance-prediction-with-machine-learning/7A5662351D23A3D855E7FBC58B45AB6D",\nurl_arXiv = "https://arxiv.org/abs/2005.05286",\nurl_PDF = "https://arxiv.org/pdf/2005.05286.pdf",\narchivePrefix={arXiv},\nprimaryClass={stat.AP},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Aircraft performance models play a key role in airline operations, especially in planning a fuel-efficient flight. In practice, manufacturers provide guidelines calibrated on one single aircraft, with performance modelling for all similar aircrafts (i.e. same model) relying solely on that. In particular, it may poorly reflect on the current performance of a given aircraft. However, for each aircraft, flight data are continuously recorded and as such, not used to improve on the existing models. The key contribution of the present article is to foster the use of machine learning to leverage the massive amounts of collected data and update the models to reflect the actual performance of the aircraft. We illustrate our approach by focusing on the estimation of the drag and lift coefficients from recorded flight data. As these coefficients are not directly recorded, we resort to aerodynamics approximations. As a safety check, we provide bounds to assess the accuracy of both the aerodynamics approximation and the statistical performance of our approach. We provide numerical results on a collection of machine learning algorithms. We report excellent accuracy on real-life data and exhibit empirical evidence to support our modelling, in coherence with aerodynamics principles.\n
\n\n\n
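As a toy example of the kind of aerodynamics approximation alluded to above (not necessarily the one used in the paper), under a steady level-flight assumption lift balances weight, so a lift coefficient can be backed out of recorded mass, true airspeed, air density and wing area; all numbers below are rough orders of magnitude for a medium-haul airliner.

# Back-of-the-envelope lift coefficient from recorded flight data, under a steady
# level-flight approximation (lift equals weight). Values and the chosen
# approximation are illustrative assumptions, not the paper's pipeline.
import numpy as np

def lift_coefficient(mass_kg, tas_ms, air_density, wing_area_m2, g=9.81):
    """C_L = 2 m g / (rho V^2 S) when lift balances weight."""
    return 2.0 * mass_kg * g / (air_density * tas_ms**2 * wing_area_m2)

# toy "recorded" values at cruise (rough orders of magnitude only)
print(lift_coefficient(mass_kg=70_000, tas_ms=230.0, air_density=0.41, wing_area_m2=124.0))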
\n\n\n
\n \n\n \n \n \n \n \n \n Kernel-Based Ensemble Learning in Python.\n \n \n \n \n\n\n \n Guedj, B.; and Srinivasa Desikan, B.\n\n\n \n\n\n\n Information, 11(2): 63. Jan 2020.\n \n\n\n\n
\n\n\n\n \n \n \"Kernel-BasedPaper\n  \n \n \n \"Kernel-Based arxiv\n  \n \n \n \"Kernel-Based pdf\n  \n \n \n \"Kernel-Based code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{Guedj_2020,\ntitle={Kernel-Based Ensemble Learning in {Python}},\nvolume={11},\nISSN={2078-2489},\nDOI={10.3390/info11020063},\nurl={http://dx.doi.org/10.3390/info11020063},\nurl_arXiv = "https://arxiv.org/abs/1912.08311",\nurl_PDF = "https://www.mdpi.com/2078-2489/11/2/63/pdf",\nurl_Code = "https://github.com/bhargavvader/pycobra",\nnumber={2},\njournal={Information},\npublisher={MDPI AG},\nauthor={Guedj, Benjamin and Srinivasa Desikan, Bhargav},\nyear={2020},\nabstract = {We propose a new supervised learning algorithm for classification and regression problems where two or more preliminary predictors are available. We introduce KernelCobra, a non-linear learning strategy for combining an arbitrary number of initial predictors. KernelCobra builds on the COBRA algorithm introduced by Biau et al. (2016), which combined estimators based on a notion of proximity of predictions on the training data. While the COBRA algorithm used a binary threshold to declare which training data were close and to be used, we generalise this idea by using a kernel to better encapsulate the proximity information. Such a smoothing kernel provides more representative weights to each of the training points which are used to build the aggregate and final predictor, and KernelCobra systematically outperforms the COBRA algorithm. While COBRA is intended for regression, KernelCobra deals with classification and regression. KernelCobra is included as part of the open source Python package Pycobra (0.2.4 and onward), introduced by Srinivasa Desikan (2018). Numerical experiments were undertaken to assess the performance (in terms of pure prediction and computational complexity) of KernelCobra on real-life and synthetic datasets.},\nmonth={Jan},\npages={63},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We propose a new supervised learning algorithm for classification and regression problems where two or more preliminary predictors are available. We introduce KernelCobra, a non-linear learning strategy for combining an arbitrary number of initial predictors. KernelCobra builds on the COBRA algorithm introduced by Biau et al. (2016), which combined estimators based on a notion of proximity of predictions on the training data. While the COBRA algorithm used a binary threshold to declare which training data were close and to be used, we generalise this idea by using a kernel to better encapsulate the proximity information. Such a smoothing kernel provides more representative weights to each of the training points which are used to build the aggregate and final predictor, and KernelCobra systematically outperforms the COBRA algorithm. While COBRA is intended for regression, KernelCobra deals with classification and regression. KernelCobra is included as part of the open source Python package Pycobra (0.2.4 and onward), introduced by Srinivasa Desikan (2018). Numerical experiments were undertaken to assess the performance (in terms of pure prediction and computational complexity) of KernelCobra on real-life and synthetic datasets.\n
\n\n\n
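A minimal numpy sketch of the kernel-weighted consensus idea described above, assuming two crude preliminary predictors, a Gaussian kernel and a hand-picked bandwidth; it is not the pycobra implementation. Each training point is weighted according to how close the preliminary predictions at the query are to the preliminary predictions at that point, and the aggregate is the corresponding weighted average of training labels.

# Sketch of kernel-based aggregation of preliminary predictors (in the spirit of KernelCobra).
# Illustrative numpy version, not the pycobra package's implementation.
import numpy as np

rng = np.random.default_rng(3)
x_train = rng.uniform(-3, 3, 200)
y_train = np.sin(x_train) + 0.1 * rng.standard_normal(200)

# Two crude preliminary predictors (assumptions): a linear fit and a coarse binned mean.
coef = np.polyfit(x_train, y_train, 1)
bins = np.linspace(-3, 3, 7)
bin_means = np.array([y_train[np.digitize(x_train, bins) == b].mean() for b in range(1, 7)])

def preliminary(x):
    linear = np.polyval(coef, x)
    binned = bin_means[np.clip(np.digitize(x, bins) - 1, 0, 5)]
    return np.stack([linear, binned], axis=-1)             # shape (..., 2)

P_train = preliminary(x_train)                              # preliminary predictions on the training sample

def aggregate(x, bandwidth=0.3):
    """Kernel-weighted average of training labels, weights based on prediction proximity."""
    p = preliminary(np.atleast_1d(x))                       # (m, 2)
    d2 = ((P_train[None, :, :] - p[:, None, :]) ** 2).sum(-1)   # squared prediction distances
    w = np.exp(-d2 / (2 * bandwidth ** 2))
    return (w @ y_train) / w.sum(axis=1)

print(aggregate([0.0, 1.5]))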
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayesian Bound for the Conditional Value at Risk.\n \n \n \n \n\n\n \n Mhammedi, Z.; Guedj, B.; and Williamson, R. C.\n\n\n \n\n\n\n In Larochelle, H.; Ranzato, M.; Hadsell, R.; Balcan, M.; and Lin, H., editor(s), Advances in Neural Information Processing Systems 33: Annual Conference on Neural Information Processing Systems [NeurIPS] 2020, December 6-12, 2020, virtual, 2020. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesianPaper\n  \n \n \n \"PAC-Bayesian arxiv\n  \n \n \n \"PAC-Bayesian pdf\n  \n \n \n \"PAC-Bayesian supplementary\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{mhammedi2020pacbayesian,\nauthor    = {Zakaria Mhammedi and\nBenjamin Guedj and\nRobert C. Williamson},\neditor    = {Hugo Larochelle and\nMarc'Aurelio Ranzato and\nRaia Hadsell and\nMaria{-}Florina Balcan and\nHsuan{-}Tien Lin},\ntitle = {{PAC-Bayesian} Bound for the Conditional Value at Risk},\nbooktitle = {Advances in Neural Information Processing Systems 33: Annual Conference\non Neural Information Processing Systems [NeurIPS] 2020, December\n6-12, 2020, virtual},\nyear      = {2020},\nurl       = {https://proceedings.neurips.cc/paper/2020/hash/d02e9bdc27a894e882fa0c9055c99722-Abstract.html},\ntimestamp = {Tue, 19 Jan 2021 15:57:31 +0100},\nbiburl    = {https://dblp.org/rec/conf/nips/MhammediGW20.bib},\nbibsource = {dblp computer science bibliography, https://dblp.org},\neprint={2006.14763},\nabstract = {Conditional Value at Risk (CVaR) is a family of "coherent risk measures" which generalize the traditional mathematical expectation. Widely used in mathematical finance, it is garnering increasing interest in machine learning, e.g., as an alternate approach to regularization, and as a means for ensuring fairness. This paper presents a generalization bound for learning algorithms that minimize the CVaR of the empirical loss. The bound is of PAC-Bayesian type and is guaranteed to be small when the empirical CVaR is small. We achieve this by reducing the problem of estimating CVaR to that of merely estimating an expectation. This then enables us, as a by-product, to obtain concentration inequalities for CVaR even when the random variable in question is unbounded.},\narchivePrefix={arXiv},\nurl = "https://proceedings.neurips.cc/paper/2020/hash/d02e9bdc27a894e882fa0c9055c99722-Abstract.html",\nurl_arXiv = "https://arxiv.org/abs/2006.14763",\nurl_PDF = "https://proceedings.neurips.cc/paper/2020/file/d02e9bdc27a894e882fa0c9055c99722-Paper.pdf",\nurl_Supplementary = "https://proceedings.neurips.cc/paper/2020/file/d02e9bdc27a894e882fa0c9055c99722-Supplemental.pdf",\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Conditional Value at Risk (CVaR) is a family of \"coherent risk measures\" which generalize the traditional mathematical expectation. Widely used in mathematical finance, it is garnering increasing interest in machine learning, e.g., as an alternate approach to regularization, and as a means for ensuring fairness. This paper presents a generalization bound for learning algorithms that minimize the CVaR of the empirical loss. The bound is of PAC-Bayesian type and is guaranteed to be small when the empirical CVaR is small. We achieve this by reducing the problem of estimating CVaR to that of merely estimating an expectation. This then enables us, as a by-product, to obtain concentration inequalities for CVaR even when the random variable in question is unbounded.\n
\n\n\n
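To fix ideas on the risk measure being bounded, here is the textbook empirical estimator of CVaR at level alpha (conventions differ; here alpha is the tail mass): average the worst alpha-fraction of observed losses. The loss sample is a placeholder and this is not the paper's PAC-Bayesian bound.

# Empirical Conditional Value at Risk (CVaR): mean of the worst alpha-fraction of losses.
# Placeholder data; this illustrates the risk measure, not the paper's bound.
import numpy as np

def empirical_cvar(losses, alpha=0.1):
    losses = np.sort(np.asarray(losses, dtype=float))
    k = max(1, int(np.ceil(alpha * len(losses))))       # size of the worst alpha-fraction
    return losses[-k:].mean()

rng = np.random.default_rng(4)
losses = rng.lognormal(mean=0.0, sigma=1.0, size=10_000)     # heavy-ish tailed toy losses
print(f"mean loss:  {losses.mean():.3f}")
print(f"CVaR(0.1):  {empirical_cvar(losses, alpha=0.1):.3f}")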
\n\n\n
\n \n\n \n \n \n \n \n \n Indicateurs de suivi de l'activité scientifique de l'Inria.\n \n \n \n \n\n\n \n Brotcorne, L.; Canteaut, A.; Viana, A. C.; Grandmont, C.; Guedj, B.; Huot, S.; Issarny, V.; Pallez, G.; Perrier, V.; Quema, V.; Pomet, J.; Rival, X.; Salvati, S.; and Thomé, E.\n\n\n \n\n\n\n Technical Report Inria, December 2020.\n \n\n\n\n
\n\n\n\n \n \n \"IndicateursPaper\n  \n \n \n \"Indicateurs pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@techreport{brotcorne-hal-03033764,\ntitle = {Indicateurs de suivi de l'activit{\\'e} scientifique de l'{Inria}},\nauthor = {Brotcorne, Luce and Canteaut, Anne and Viana, Aline Carneiro and Grandmont, C{\\'e}line and Guedj, Benjamin and Huot, St{\\'e}phane and Issarny, Val{\\'e}rie and Pallez, Guillaume and Perrier, Val{\\'e}rie and Quema, Vivien and Pomet, Jean-Baptiste and Rival, Xavier and Salvati, Sylvain and Thom{\\'e}, Emmanuel},\nurl = {https://hal.inria.fr/hal-03033764},\ntype = {Research Report},\ninstitution = {{Inria}},\nabstract = {La Commission d'Evaluation de l'Inria a mis en place début 2020, à la demande de la Direction Générale, un groupe de travail visant à réfléchir à des indicateurs pour l'analyse qualitative de l'activité scientifique de l'Institut. Ce document est le fruit des discussions de ce groupe, enrichies par les échanges avec la Direction Générale d'Inria et avec l'ensemble de la Commision d'Evaluation. Au-delà de cette « commande », il nous a semblé important de saisir cette opportunité pour mener une réflexion collective, qui dépasse le cadre de notre Institut, sur les indicateurs, et sur ce qu'ils peuvent ou ne peuvent pas dire de nos pratiques scientifiques.},\nyear = {2020},\nmonth = Dec,\nurl_PDF = {https://hal.inria.fr/hal-03033764/file/CE_GT-Suivi.pdf},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n In early 2020, at the request of the Direction Générale, Inria's Commission d'Evaluation set up a working group tasked with thinking about indicators for the qualitative analysis of the Institute's scientific activity. This document is the outcome of the group's discussions, enriched by exchanges with Inria's Direction Générale and with the Commission d'Evaluation as a whole. Beyond this remit, it seemed important to us to seize the opportunity to carry out a collective reflection, reaching beyond our Institute, on indicators and on what they can and cannot say about our scientific practices.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A PAC-Bayesian Perspective on Structured Prediction with Implicit Loss Embeddings.\n \n \n \n \n\n\n \n Cantelobre, T.; Guedj, B.; Pérez-Ortiz, M.; and Shawe-Taylor, J.\n\n\n \n\n\n\n 2020.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n \n \"A pdf\n  \n \n \n \"A code\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{cantelobre2020pacbayesian,\ntitle={A {PAC-Bayesian} Perspective on Structured Prediction with Implicit Loss Embeddings}, \nauthor={Théophile Cantelobre and Benjamin Guedj and Mar\\'ia P\\'erez-Ortiz and John Shawe-Taylor},\nyear={2020},\nnote = "Submitted.",\nabstract = {Many practical machine learning tasks can be framed as Structured prediction problems, where several output variables are predicted and considered interdependent. Recent theoretical advances in structured prediction have focused on obtaining fast rates convergence guarantees, especially in the Implicit Loss Embedding (ILE) framework. PAC-Bayes has gained interest recently for its capacity of producing tight risk bounds for predictor distributions. This work proposes a novel PAC-Bayes perspective on the ILE Structured prediction framework. We present two generalization bounds, on the risk and excess risk, which yield insights into the behavior of ILE predictors. Two learning algorithms are derived from these bounds. The algorithms are implemented and their behavior analyzed, with source code available at \\url{https://github.com/theophilec/PAC-Bayes-ILE-Structured-Prediction}},\nurl = {https://arxiv.org/abs/2012.03780},\nurl_PDF = {https://arxiv.org/pdf/2012.03780.pdf},\nurl_Code = {https://github.com/theophilec/PAC-Bayes-ILE-Structured-Prediction},\neprint={2012.03780},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Many practical machine learning tasks can be framed as structured prediction problems, where several output variables are predicted and considered interdependent. Recent theoretical advances in structured prediction have focused on obtaining fast-rate convergence guarantees, especially in the Implicit Loss Embedding (ILE) framework. PAC-Bayes has gained interest recently for its capacity of producing tight risk bounds for predictor distributions. This work proposes a novel PAC-Bayes perspective on the ILE structured prediction framework. We present two generalization bounds, on the risk and excess risk, which yield insights into the behavior of ILE predictors. Two learning algorithms are derived from these bounds. The algorithms are implemented and their behavior analyzed, with source code available at https://github.com/theophilec/PAC-Bayes-ILE-Structured-Prediction.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Upper and Lower Bounds on the Performance of Kernel PCA.\n \n \n \n \n\n\n \n Haddouche, M.; Guedj, B.; and Shawe-Taylor, J.\n\n\n \n\n\n\n 2020.\n Submitted.\n\n\n\n
\n\n\n\n \n \n \"UpperPaper\n  \n \n \n \"Upper pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{haddouche2020upper,\ntitle={Upper and Lower Bounds on the Performance of {Kernel PCA}}, \nauthor={Maxime Haddouche and Benjamin Guedj and John Shawe-Taylor},\nyear={2020},\nnote = "Submitted.",\nabstract = {Principal Component Analysis (PCA) is a popular method for dimension reduction and has attracted an unfailing interest for decades. Recently, kernel PCA has emerged as an extension of PCA but, despite its use in practice, a sound theoretical understanding of kernel PCA is missing. In this paper, we contribute lower and upper bounds on the efficiency of kernel PCA, involving the empirical eigenvalues of the kernel Gram matrix. Two bounds are for fixed estimators, and two are for randomized estimators through the PAC-Bayes theory. We control how much information is captured by kernel PCA on average, and we dissect the bounds to highlight strengths and limitations of the kernel PCA algorithm. Therefore, we contribute to the better understanding of kernel PCA. Our bounds are briefly illustrated on a toy numerical example.},\nurl = {https://arxiv.org/abs/2012.10369},\nurl_PDF = {https://arxiv.org/pdf/2012.10369.pdf},\neprint={2012.10369},\narchivePrefix={arXiv},\nprimaryClass={cs.LG},\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Principal Component Analysis (PCA) is a popular method for dimension reduction and has attracted an unfailing interest for decades. Recently, kernel PCA has emerged as an extension of PCA but, despite its use in practice, a sound theoretical understanding of kernel PCA is missing. In this paper, we contribute lower and upper bounds on the efficiency of kernel PCA, involving the empirical eigenvalues of the kernel Gram matrix. Two bounds are for fixed estimators, and two are for randomized estimators through the PAC-Bayes theory. We control how much information is captured by kernel PCA on average, and we dissect the bounds to highlight strengths and limitations of the kernel PCA algorithm. Therefore, we contribute to the better understanding of kernel PCA. Our bounds are briefly illustrated on a toy numerical example.\n
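\n To make the objects in the abstract concrete, here is a small, self-contained numpy sketch (not taken from the paper) of how the empirical eigenvalues of the kernel Gram matrix determine the fraction of feature-space variance captured by the top-d kernel principal components; the RBF kernel and the toy data are arbitrary choices:\n
import numpy as np

def rbf_gram(X, gamma=1.0):
    # Gaussian (RBF) Gram matrix K_ij = exp(-gamma * ||x_i - x_j||^2).
    sq = ((X[:, None, :] - X[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

def kpca_captured_variance(K, d):
    # Fraction of empirical variance in feature space captured by the top-d kernel
    # principal components, from the eigenvalues of the doubly centred Gram matrix.
    n = K.shape[0]
    one = np.full((n, n), 1.0 / n)
    Kc = K - one @ K - K @ one + one @ K @ one    # centring in feature space
    evals = np.clip(np.linalg.eigvalsh(Kc)[::-1], 0.0, None)
    return float(evals[:d].sum() / evals.sum())

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
print(kpca_captured_variance(rbf_gram(X, gamma=0.5), d=3))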
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2019\n \n \n (3)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n A Primer on PAC-Bayesian Learning.\n \n \n \n \n\n\n \n Guedj, B.\n\n\n \n\n\n\n In Proceedings of the second congress of the French Mathematical Society, volume 33, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n \n \"A arxiv\n  \n \n \n \"A pdf\n  \n \n \n \"A video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{guedj2019primer,\ntitle={A Primer on {PAC-Bayesian} Learning},\nauthor={Guedj, Benjamin},\nbooktitle={Proceedings of the second congress of the French Mathematical Society},\nyear={2019},\nvolume = {33},\nabstract = "Generalised Bayesian learning algorithms are increasingly popular in machine learning, due to their PAC generalisation properties and flexibility. The present paper aims at providing a self-contained survey on the resulting PAC-Bayes framework and some of its main theoretical and algorithmic developments.",\nurl = "https://arxiv.org/abs/1901.05353",\nurl_arXiv = "https://arxiv.org/abs/1901.05353",\nurl_PDF = "https://arxiv.org/pdf/1901.05353.pdf",\nurl_Video = "https://youtu.be/6t_vojO8jLI",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n Generalised Bayesian learning algorithms are increasingly popular in machine learning, due to their PAC generalisation properties and flexibility. The present paper aims at providing a self-contained survey on the resulting PAC-Bayes framework and some of its main theoretical and algorithmic developments.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Dichotomize and Generalize: PAC-Bayesian Binary Activated Deep Neural Networks.\n \n \n \n \n\n\n \n Letarte, G.; Germain, P.; Guedj, B.; and Laviolette, F.\n\n\n \n\n\n\n In Wallach, H. M.; Larochelle, H.; Beygelzimer, A.; d'Alché-Buc , F.; Fox, E. B.; and Garnett, R., editor(s), Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems [NeurIPS] 2019, 8-14 December 2019, Vancouver, BC, Canada, pages 6869–6879, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"DichotomizePaper\n  \n \n \n \"Dichotomize arxiv\n  \n \n \n \"Dichotomize pdf\n  \n \n \n \"Dichotomize code\n  \n \n \n \"Dichotomize video\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{letarte19dichotomize,\nauthor    = {Ga{\\"{e}}l Letarte and\nPascal Germain and\nBenjamin Guedj and\nFran{\\c{c}}ois Laviolette},\neditor    = {Hanna M. Wallach and\nHugo Larochelle and\nAlina Beygelzimer and\nFlorence d'Alch{\\'{e}}{-}Buc and\nEmily B. Fox and\nRoman Garnett},\ntitle     = {Dichotomize and Generalize: {PAC-Bayesian} Binary Activated Deep Neural\nNetworks},\nbooktitle = {Advances in Neural Information Processing Systems 32: Annual Conference\non Neural Information Processing Systems [NeurIPS] 2019, 8-14\nDecember 2019, Vancouver, BC, Canada},\npages     = {6869--6879},\nyear      = {2019},\neprint = "https://arxiv.org/abs/1905.10259",\ntimestamp = {Fri, 06 Mar 2020 16:59:22 +0100},\nbiburl    = {https://dblp.org/rec/conf/nips/LetarteGGL19.bib},\nbibsource = {dblp computer science bibliography, https://dblp.org},\nabstract = "We present a comprehensive study of multilayer neural networks with binary activation, relying on the PAC-Bayesian theory. Our contributions are twofold: (i) we develop an end-to-end framework to train a binary activated deep neural network, (ii) we provide nonvacuous PAC-Bayesian generalization bounds for binary activated deep neural networks. Our results are obtained by minimizing the expected loss of an architecture-dependent aggregation of binary activated deep neural networks. Our analysis inherently overcomes the fact that binary activation function is non-differentiable. The performance of our approach is assessed on a thorough numerical experiment protocol on real-life datasets.",\nurl = "https://papers.nips.cc/paper/8911-dichotomize-and-generalize-pac-bayesian-binary-activated-deep-neural-networks",\nurl_arXiv = "https://arxiv.org/abs/1905.10259",\nurl_PDF = "https://papers.nips.cc/paper/8911-dichotomize-and-generalize-pac-bayesian-binary-activated-deep-neural-networks.pdf",\nurl_Code = "https://github.com/gletarte/dichotomize-and-generalize",\nurl_Video = "https://youtu.be/FcIE0AVrTDg",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We present a comprehensive study of multilayer neural networks with binary activation, relying on the PAC-Bayesian theory. Our contributions are twofold: (i) we develop an end-to-end framework to train a binary activated deep neural network, (ii) we provide nonvacuous PAC-Bayesian generalization bounds for binary activated deep neural networks. Our results are obtained by minimizing the expected loss of an architecture-dependent aggregation of binary activated deep neural networks. Our analysis inherently overcomes the fact that the binary activation function is non-differentiable. The performance of our approach is assessed on a thorough numerical experiment protocol on real-life datasets.\n
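\n For readers unfamiliar with the term, the sketch below shows what a binary activated network computes; this is only a forward pass with sign activations (all sizes arbitrary), not the paper's PAC-Bayesian training scheme, which aggregates such networks to cope with the non-differentiability:\n
import numpy as np

def binary_activated_forward(x, weights):
    # Forward pass of a fully connected network whose hidden activations take
    # values in {-1, +1} (a "binary activation"), the non-differentiable setting
    # discussed in the abstract. The output layer is kept linear.
    h = np.asarray(x, dtype=float)
    for W in weights[:-1]:
        h = np.where(W @ h >= 0, 1.0, -1.0)   # binary activation
    return weights[-1] @ h

rng = np.random.default_rng(0)
weights = [rng.normal(size=(16, 10)), rng.normal(size=(16, 16)), rng.normal(size=(1, 16))]
print(binary_activated_forward(rng.normal(size=10), weights))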
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n PAC-Bayes Un-Expected Bernstein Inequality.\n \n \n \n \n\n\n \n Mhammedi, Z.; Grünwald, P.; and Guedj, B.\n\n\n \n\n\n\n In Wallach, H. M.; Larochelle, H.; Beygelzimer, A.; d'Alché-Buc , F.; Fox, E. B.; and Garnett, R., editor(s), Advances in Neural Information Processing Systems 32: Annual Conference on Neural Information Processing Systems [NeurIPS] 2019, 8-14 December 2019, Vancouver, BC, Canada, pages 12180–12191, 2019. \n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesPaper\n  \n \n \n \"PAC-Bayes arxiv\n  \n \n \n \"PAC-Bayes pdf\n  \n \n \n \"PAC-Bayes code\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{mhammedi19pac,\nauthor    = {Zakaria Mhammedi and\nPeter Gr{\\"{u}}nwald and\nBenjamin Guedj},\neditor    = {Hanna M. Wallach and\nHugo Larochelle and\nAlina Beygelzimer and\nFlorence d'Alch{\\'{e}}{-}Buc and\nEmily B. Fox and\nRoman Garnett},\ntitle     = {{PAC-Bayes} Un-Expected {Bernstein} Inequality},\nbooktitle = {Advances in Neural Information Processing Systems 32: Annual Conference\non Neural Information Processing Systems [NeurIPS] 2019, 8-14\nDecember 2019, Vancouver, BC, Canada},\npages     = {12180--12191},\nyear      = {2019},\nurl       = {http://papers.nips.cc/paper/9387-pac-bayes-un-expected-bernstein-inequality},\ntimestamp = {Fri, 06 Mar 2020 16:59:25 +0100},\nbiburl    = {https://dblp.org/rec/conf/nips/MhammediGG19.bib},\nabstract = "We present a new PAC-Bayesian generalization bound. Standard bounds contain a $\\sqrt{L_n \\cdot \\mathrm{KL}/n}$ complexity term which dominates unless $L_n$, the empirical error of the learning algorithm's randomized predictions, vanishes. We manage to replace $L_n$ by a term which vanishes in many more situations, essentially whenever the employed learning algorithm is sufficiently stable on the dataset at hand. Our new bound consistently beats state-of-the-art bounds both on a toy example and on UCI datasets (with large enough $n$). Theoretically, unlike existing bounds, our new bound can be expected to converge to $0$ faster whenever a Bernstein/Tsybakov condition holds, thus connecting PAC-Bayesian generalization and {\\em excess risk\\/} bounds---for the latter it has long been known that faster convergence can be obtained under Bernstein conditions. Our main technical tool is a new concentration inequality which is like Bernstein's but with $X^2$ taken outside its expectation.",\nbibsource = {dblp computer science bibliography, https://dblp.org},\nurl = "https://papers.nips.cc/paper/9387-pac-bayes-un-expected-bernstein-inequality",\nurl_arXiv = "https://arxiv.org/abs/1905.13367",\nurl_PDF = "https://papers.nips.cc/paper/9387-pac-bayes-un-expected-bernstein-inequality.pdf",\nurl_Code = "https://github.com/bguedj/PAC-Bayesian-Un-Expected-Bernstein-Inequality",\nkeywords={mine}\n}\n\n\n
\n
\n\n\n
\n We present a new PAC-Bayesian generalization bound. Standard bounds contain a $\\sqrt{L_n · \\mathrm{KL}/n}$ complexity term which dominates unless $L_n$, the empirical error of the learning algorithm's randomized predictions, vanishes. We manage to replace $L_n$ by a term which vanishes in many more situations, essentially whenever the employed learning algorithm is sufficiently stable on the dataset at hand. Our new bound consistently beats state-of-the-art bounds both on a toy example and on UCI datasets (with large enough $n$). Theoretically, unlike existing bounds, our new bound can be expected to converge to $0$ faster whenever a Bernstein/Tsybakov condition holds, thus connecting PAC-Bayesian generalization and excess risk bounds—for the latter it has long been known that faster convergence can be obtained under Bernstein conditions. Our main technical tool is a new concentration inequality which is like Bernstein's but with $X^2$ taken outside its expectation.\n
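\n The claim that the square-root complexity term dominates the fast term KL/n unless the empirical error $L_n$ vanishes is simple arithmetic; the toy numbers below (chosen for illustration only, not taken from the paper) make the comparison explicit:\n
import numpy as np

# Illustrative comparison only: the slow term sqrt(L_n * KL / n) versus the fast
# term KL / n. Unless L_n shrinks to roughly KL / n, the square-root term dominates.
n, KL = 10_000, 20.0
for L_n in (0.25, 0.05, KL / n):
    print(f"L_n={L_n:.4f}  sqrt(L_n*KL/n)={np.sqrt(L_n * KL / n):.4f}  KL/n={KL / n:.4f}")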
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2018\n \n \n (4)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n PAC-Bayesian high dimensional bipartite ranking.\n \n \n \n \n\n\n \n Guedj, B.; and Robbiano, S.\n\n\n \n\n\n\n Journal of Statistical Planning and Inference, 196: 70 - 86. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesianPaper\n  \n \n \n \"PAC-Bayesian arxiv\n  \n \n \n \"PAC-Bayesian pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{guedj2018pac,\ntitle={{PAC-Bayesian} high dimensional bipartite ranking},\nauthor={Guedj, Benjamin and Robbiano, Sylvain},\njournal={Journal of Statistical Planning and Inference},\nyear={2018},\npublisher={Elsevier},\nvolume = "196",\npages = "70 - 86",\nabstract = "This paper is devoted to the bipartite ranking problem, a classical statistical learning task, in a high dimensional setting. We propose a scoring and ranking strategy based on the PAC-Bayesian approach. We consider nonlinear additive scoring functions, and we derive non-asymptotic risk bounds under a sparsity assumption. In particular, oracle inequalities in probability holding under a margin condition assess the performance of our procedure, and prove its minimax optimality. An MCMC-flavored algorithm is proposed to implement our method, along with its behavior on synthetic and real-life datasets.",\nissn = "0378-3758",\ndoi = "https://doi.org/10.1016/j.jspi.2017.10.010",\nurl = "http://www.sciencedirect.com/science/article/pii/S0378375817301945",\nurl_arXiv = "https://arxiv.org/abs/1511.02729",\nurl_PDF = "https://arxiv.org/pdf/1511.02729.pdf",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n This paper is devoted to the bipartite ranking problem, a classical statistical learning task, in a high dimensional setting. We propose a scoring and ranking strategy based on the PAC-Bayesian approach. We consider nonlinear additive scoring functions, and we derive non-asymptotic risk bounds under a sparsity assumption. In particular, oracle inequalities in probability holding under a margin condition assess the performance of our procedure, and prove its minimax optimality. An MCMC-flavored algorithm is proposed to implement our method, along with its behavior on synthetic and real-life datasets.\n
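\n As a point of reference (standard material, not the paper's PAC-Bayesian scoring procedure), the bipartite ranking risk of a scoring function is the probability that a negative instance is scored above a positive one; its empirical counterpart is a pairwise average, computed below on toy data with an arbitrary noisy score:\n
import numpy as np

def empirical_ranking_risk(scores, labels):
    # Empirical bipartite ranking risk of a scoring rule: the fraction of
    # (positive, negative) pairs that are mis-ranked, ties counted as one half.
    # This equals 1 - AUC.
    s = np.asarray(scores, dtype=float)
    y = np.asarray(labels)
    pos, neg = s[y == 1], s[y == 0]
    diff = pos[:, None] - neg[None, :]
    return float(((diff < 0) + 0.5 * (diff == 0)).mean())

rng = np.random.default_rng(0)
y = rng.integers(0, 2, size=500)
scores = y + rng.normal(scale=1.0, size=500)   # noisy but informative scores
print(empirical_ranking_risk(scores, y))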
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Simpler PAC-Bayesian bounds for hostile data.\n \n \n \n \n\n\n \n Alquier, P.; and Guedj, B.\n\n\n \n\n\n\n Machine Learning, 107(5): 887–902. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"SimplerPaper\n  \n \n \n \"Simpler arxiv\n  \n \n \n \"Simpler pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{alquier2018simpler,\ntitle={Simpler {PAC-Bayesian} bounds for hostile data},\nauthor={Alquier, Pierre and Guedj, Benjamin},\njournal={Machine Learning},\nvolume={107},\nnumber={5},\npages={887--902},\nyear={2018},\npublisher={Springer},\nabstract = "PAC-Bayesian learning bounds are of the utmost interest to the learning community. Their role is to connect the generalization ability of an aggregation distribution $\\rho$ to its empirical risk and to its Kullback-Leibler divergence with respect to some prior distribution $\\pi$. Unfortunately, most of the available bounds typically rely on heavy assumptions such as boundedness and independence of the observations. This paper aims at relaxing these constraints and provides PAC-Bayesian learning bounds that hold for dependent, heavy-tailed observations (hereafter referred to as \\emph{hostile data}). In these bounds the Kullack-Leibler divergence is replaced with a general version of Csiszár’s f-divergence. We prove a general PAC-Bayesian bound, and show how to use it in various hostile settings.",\nissn="1573-0565",\ndoi="10.1007/s10994-017-5690-0",\nurl="https://doi.org/10.1007/s10994-017-5690-0",\nurl_arXiv = "https://arxiv.org/abs/1610.07193",\nurl_PDF = "https://arxiv.org/pdf/1610.07193.pdf",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n PAC-Bayesian learning bounds are of the utmost interest to the learning community. Their role is to connect the generalization ability of an aggregation distribution $ρ$ to its empirical risk and to its Kullback-Leibler divergence with respect to some prior distribution $π$. Unfortunately, most of the available bounds typically rely on heavy assumptions such as boundedness and independence of the observations. This paper aims at relaxing these constraints and provides PAC-Bayesian learning bounds that hold for dependent, heavy-tailed observations (hereafter referred to as hostile data). In these bounds the Kullback-Leibler divergence is replaced with a general version of Csiszár's f-divergence. We prove a general PAC-Bayesian bound, and show how to use it in various hostile settings.\n
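\n For concreteness (a generic textbook definition, not the paper's bound), the Csiszár f-divergence that replaces the Kullback-Leibler term is D_f(ρ||π) = Σ_i π_i f(ρ_i/π_i) for a convex f with f(1) = 0; the toy discrete computation below recovers KL and chi-square as special cases:\n
import numpy as np

def f_divergence(rho, pi, f):
    # Csiszar f-divergence D_f(rho || pi) = sum_i pi_i * f(rho_i / pi_i)
    # for discrete distributions with pi_i > 0 and rho_i > 0.
    rho, pi = np.asarray(rho, dtype=float), np.asarray(pi, dtype=float)
    return float(np.sum(pi * f(rho / pi)))

rho = np.array([0.5, 0.3, 0.2])
pi = np.array([1 / 3, 1 / 3, 1 / 3])
kl = f_divergence(rho, pi, lambda x: x * np.log(x))     # Kullback-Leibler
chi2 = f_divergence(rho, pi, lambda x: x ** 2 - 1)      # chi-square
print(kl, chi2)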
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Pycobra: A Python Toolbox for Ensemble Learning and Visualisation.\n \n \n \n \n\n\n \n Guedj, B.; and Srinivasa Desikan, B.\n\n\n \n\n\n\n Journal of Machine Learning Research, 18(190): 1-5. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Pycobra:Paper\n  \n \n \n \"Pycobra: arxiv\n  \n \n \n \"Pycobra: pdf\n  \n \n \n \"Pycobra: code\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{guedj2018pycobra,\nauthor  = {Guedj, Benjamin and Srinivasa Desikan, Bhargav},\ntitle   = {Pycobra: A {Python} Toolbox for Ensemble Learning and Visualisation},\njournal = {Journal of Machine Learning Research},\nyear    = {2018},\nvolume  = {18},\nnumber  = {190},\npages   = {1-5},\nabstract = "We introduce pycobra, a Python library devoted to ensemble learning (regression and classification) and visualisation. Its main assets are the implementation of several ensemble learning algorithms, a flexible and generic interface to compare and blend any existing machine learning algorithm available in Python libraries (as long as a predict method is given), and visualisation tools such as Voronoi tessellations. pycobra is fully scikit-learn compatible and is released under the MIT open-source license. pycobra can be downloaded from the Python Package Index (PyPi) and Machine Learning Open Source Software (MLOSS). The current version (along with Jupyter notebooks, extensive documentation, and continuous integration tests) is available at https://github.com/bhargavvader/pycobra and official documentation website is https://modal.lille.inria.fr/pycobra.",\nurl     = {http://jmlr.org/beta/papers/v18/17-228.html},\nurl_arXiv = {https://arxiv.org/abs/1707.00558},\nurl_PDF = "http://jmlr.org/papers/volume18/17-228/17-228.pdf",\nurl_Code = "https://github.com/bhargavvader/pycobra",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n We introduce pycobra, a Python library devoted to ensemble learning (regression and classification) and visualisation. Its main assets are the implementation of several ensemble learning algorithms, a flexible and generic interface to compare and blend any existing machine learning algorithm available in Python libraries (as long as a predict method is given), and visualisation tools such as Voronoi tessellations. pycobra is fully scikit-learn compatible and is released under the MIT open-source license. pycobra can be downloaded from the Python Package Index (PyPi) and Machine Learning Open Source Software (MLOSS). The current version (along with Jupyter notebooks, extensive documentation, and continuous integration tests) is available at https://github.com/bhargavvader/pycobra and the official documentation website is https://modal.lille.inria.fr/pycobra.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A quasi-Bayesian perspective to online clustering.\n \n \n \n \n\n\n \n Li, L.; Guedj, B.; and Loustau, S.\n\n\n \n\n\n\n Electron. J. Statist., 12(2): 3071–3113. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"APaper\n  \n \n \n \"A arxiv\n  \n \n \n \"A pdf\n  \n \n \n \"A code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{li2018,\nauthor = "Li, Le and Guedj, Benjamin and Loustau, Sébastien",\ndoi = "10.1214/18-EJS1479",\nfjournal = "Electronic Journal of Statistics",\njournal = "Electron. J. Statist.",\nnumber = "2",\npages = "3071--3113",\nabstract = "When faced with high frequency streams of data, clustering raises theoretical and algorithmic pitfalls. We introduce a new and adaptive online clustering algorithm relying on a quasi-Bayesian approach, with a dynamic (i.e., time-dependent) estimation of the (unknown and changing) number of clusters. We prove that our approach is supported by minimax regret bounds. We also provide an RJMCMC-flavored implementation (called PACBO, see https://cran.r-project.org/web/packages/PACBO/index.html) for which we give a convergence guarantee. Finally, numerical experiments illustrate the potential of our procedure.",\npublisher = "The Institute of Mathematical Statistics and the Bernoulli Society",\ntitle = {A quasi-{Bayesian} perspective to online clustering},\nurl = "https://doi.org/10.1214/18-EJS1479",\nurl_arXiv = "https://arxiv.org/abs/1602.00522",\nurl_PDF = "https://arxiv.org/pdf/1602.00522.pdf",\nurl_Code = "https://cran.r-project.org/src/contrib/Archive/PACBO/",\nvolume = "12",\nyear = "2018",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n When faced with high frequency streams of data, clustering raises theoretical and algorithmic pitfalls. We introduce a new and adaptive online clustering algorithm relying on a quasi-Bayesian approach, with a dynamic (i.e., time-dependent) estimation of the (unknown and changing) number of clusters. We prove that our approach is supported by minimax regret bounds. We also provide an RJMCMC-flavored implementation (called PACBO, see https://cran.r-project.org/web/packages/PACBO/index.html) for which we give a convergence guarantee. Finally, numerical experiments illustrate the potential of our procedure.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2017\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n An oracle inequality for quasi-Bayesian nonnegative matrix factorization.\n \n \n \n \n\n\n \n Alquier, P.; and Guedj, B.\n\n\n \n\n\n\n Mathematical Methods of Statistics, 26(1): 55–67. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"AnPaper\n  \n \n \n \"An arxiv\n  \n \n \n \"An pdf\n  \n \n \n \"An code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{alquier2017oracle,\ntitle={An oracle inequality for quasi-{Bayesian} nonnegative matrix factorization},\nauthor={Alquier, Pierre and Guedj, Benjamin},\njournal={Mathematical Methods of Statistics},\nvolume={26},\nnumber={1},\npages={55--67},\nyear={2017},\npublisher={Springer},\nabstract = "The aim of this paper is to provide some theoretical understanding of quasi-Bayesian aggregation methods non-negative matrix factorization. We derive an oracle inequality for an aggregated estimator. This result holds for a very general class of prior distributions and shows how the prior affects the rate of convergence.",\nissn="1934-8045",\ndoi="10.3103/S1066530717010045",\nurl = "https://link.springer.com/article/10.3103%2FS1066530717010045",\nurl_arXiv="https://arxiv.org/abs/1601.01345",\nurl_PDF = "https://arxiv.org/pdf/1601.01345.pdf",\nurl_Code = "https://github.com/astha736/PACbayesianNMF",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n The aim of this paper is to provide some theoretical understanding of quasi-Bayesian aggregation methods for non-negative matrix factorization. We derive an oracle inequality for an aggregated estimator. This result holds for a very general class of prior distributions and shows how the prior affects the rate of convergence.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2016\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n COBRA: A combined regression strategy.\n \n \n \n \n\n\n \n Biau, G.; Fischer, A.; Guedj, B.; and Malley, J. D.\n\n\n \n\n\n\n Journal of Multivariate Analysis, 146: 18–28. 2016.\n Special Issue on Statistical Models and Methods for High or Infinite Dimensional Spaces\n\n\n\n
\n\n\n\n \n \n \"COBRA:Paper\n  \n \n \n \"COBRA: arxiv\n  \n \n \n \"COBRA: pdf\n  \n \n \n \"COBRA: code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{biau2016cobra,\ntitle={{COBRA}: A combined regression strategy},\nauthor={Biau, G{\\'e}rard and Fischer, Aur{\\'e}lie and Guedj, Benjamin and Malley, James D.},\njournal={Journal of Multivariate Analysis},\nvolume={146},\npages={18--28},\nyear={2016},\nabstract = "A new method for combining several initial estimators of the regression function is introduced. Instead of building a linear or convex optimized combination over a collection of basic estimators $r_1,\\dots,r_M$, we use them as a collective indicator of the proximity between the training data and a test observation. This local distance approach is model-free and very fast. More specifically, the resulting nonparametric/nonlinear combined estimator is shown to perform asymptotically at least as well in the $L^2$\nsense as the best combination of the basic estimators in the collective. A companion R package called COBRA (standing for COmBined Regression Alternative) is presented (downloadable on http://cran.r-project.org/web/packages/COBRA/index.html). Substantial numerical evidence is provided on both synthetic and real data sets to assess the excellent performance and velocity of our method in a large variety of prediction problems.",\npublisher={Elsevier},\nnote = "Special Issue on Statistical Models and Methods for High or Infinite Dimensional Spaces",\nissn = "0047-259X",\ndoi = "https://doi.org/10.1016/j.jmva.2015.04.007",\nurl = "http://www.sciencedirect.com/science/article/pii/S0047259X15000950",\nurl_arxiv = "https://arxiv.org/abs/1303.2236",\nurl_PDF = "https://arxiv.org/pdf/1303.2236.pdf",\nurl_Code = "https://cran.r-project.org/package=COBRA",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n A new method for combining several initial estimators of the regression function is introduced. Instead of building a linear or convex optimized combination over a collection of basic estimators $r_1, \dots, r_M$, we use them as a collective indicator of the proximity between the training data and a test observation. This local distance approach is model-free and very fast. More specifically, the resulting nonparametric/nonlinear combined estimator is shown to perform asymptotically at least as well in the $L^2$ sense as the best combination of the basic estimators in the collective. A companion R package called COBRA (standing for COmBined Regression Alternative) is presented (downloadable on http://cran.r-project.org/web/packages/COBRA/index.html). Substantial numerical evidence is provided on both synthetic and real data sets to assess the excellent performance and velocity of our method in a large variety of prediction problems.\n
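\n A minimal sketch of the combination rule described above, stripped of the data-splitting, the relaxed consensus fraction and the calibration of the proximity threshold used in the paper and in the R package: the prediction at a query point is the average of held-out responses whose basic-machine predictions all lie within eps of the machines' predictions at the query point. The two "machines" below are arbitrary toy predictors:\n
import numpy as np

def cobra_predict(machine_preds_train, y_train, machine_preds_query, eps=0.1):
    # COBRA-style combination: average the held-out responses whose basic-machine
    # predictions all lie within eps of the machines' predictions at the query point.
    # machine_preds_train: (n, M) machine predictions on held-out data
    # machine_preds_query: (M,) machine predictions at the query point
    close = np.all(np.abs(machine_preds_train - machine_preds_query[None, :]) <= eps, axis=1)
    if not close.any():                      # no consensual point: fall back to the global mean
        return float(y_train.mean())
    return float(y_train[close].mean())

# Toy usage with two hypothetical "machines" (a crude linear rule and a constant predictor).
rng = np.random.default_rng(0)
X = rng.uniform(-1, 1, size=(300, 1))
y = np.sin(3 * X[:, 0]) + 0.1 * rng.normal(size=300)
machines = [lambda x: 2.0 * x[:, 0], lambda x: np.full(len(x), y.mean())]
P_train = np.column_stack([m(X) for m in machines])
x_query = np.array([[0.3]])
P_query = np.array([m(x_query)[0] for m in machines])
print(cobra_predict(P_train, y, P_query, eps=0.2))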
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Stability revisited: new generalisation bounds for the Leave-one-Out.\n \n \n \n \n\n\n \n Celisse, A.; and Guedj, B.\n\n\n \n\n\n\n 2016.\n Preprint.\n\n\n\n
\n\n\n\n \n \n \"StabilityPaper\n  \n \n \n \"Stability pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@unpublished{celisse2016stability,\ntitle={Stability revisited: new generalisation bounds for the {Leave-one-Out}},\nauthor={Celisse, Alain and Guedj, Benjamin},\nnote={Preprint.},\nurl = "https://arxiv.org/abs/1608.06412",\nurl_PDF = "https://arxiv.org/pdf/1608.06412.pdf",\nyear={2016},\nabstract = "The present paper provides a new generic strategy leading to non-asymptotic theoretical guarantees on the Leave-one-Out procedure applied to a broad class of learning algorithms. This strategy relies on two main ingredients: the new notion of $L^q$ stability, and the strong use of moment inequalities. $L^q$ stability extends the ongoing notion of hypothesis stability while remaining weaker than the uniform stability. It leads to new PAC exponential generalisation bounds for Leave-one-Out under mild assumptions. In the literature, such bounds are available only for uniform stable algorithms under boundedness for instance. Our generic strategy is applied to the Ridge regression algorithm as a first step.",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n The present paper provides a new generic strategy leading to non-asymptotic theoretical guarantees on the Leave-one-Out procedure applied to a broad class of learning algorithms. This strategy relies on two main ingredients: the new notion of $L^q$ stability, and the strong use of moment inequalities. $L^q$ stability extends the ongoing notion of hypothesis stability while remaining weaker than the uniform stability. It leads to new PAC exponential generalisation bounds for Leave-one-Out under mild assumptions. In the literature, such bounds are available only for uniformly stable algorithms under boundedness, for instance. Our generic strategy is applied to the Ridge regression algorithm as a first step.\n
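\n Since the abstract applies the strategy to Ridge regression, here is a short reminder (standard linear-model algebra, not the paper's $L^q$-stability argument) that the Leave-one-Out errors of Ridge can be computed exactly without refitting, via the shortcut e_loo_i = (y_i - yhat_i) / (1 - H_ii) with H = X (X'X + lambda I)^{-1} X':\n
import numpy as np

def ridge_loo_errors(X, y, lam=1.0):
    # Exact leave-one-out squared errors for ridge regression, using the identity
    # e_{-i} = (y_i - yhat_i) / (1 - H_ii) with H = X (X'X + lam I)^{-1} X'.
    n, d = X.shape
    H = X @ np.linalg.solve(X.T @ X + lam * np.eye(d), X.T)
    residuals = y - H @ y
    return (residuals / (1.0 - np.diag(H))) ** 2

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = X @ rng.normal(size=5) + 0.1 * rng.normal(size=100)
print(ridge_loo_errors(X, y, lam=1.0).mean())   # average leave-one-out squared error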
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2015\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n On some recent advances on high dimensional Bayesian statistics.\n \n \n \n \n\n\n \n Chopin, N.; Gadat, S.; Guedj, B.; Guyader, A.; and Vernet, E.\n\n\n \n\n\n\n ESAIM: Proceedings and Surveys, 51: 293–319. 2015.\n \n\n\n\n
\n\n\n\n \n \n \"OnPaper\n  \n \n \n \"On pdf\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{chopin2015survey,\ntitle={On some recent advances on high dimensional {Bayesian} statistics},\nauthor={Chopin, Nicolas and Gadat, S{\\'e}bastien and Guedj, Benjamin and Guyader, Arnaud and Vernet, Elodie},\njournal={ESAIM: Proceedings and Surveys},\nvolume={51},\npages={293--319},\nyear={2015},\nabstract = "This paper proposes to review some recent developments in Bayesian statistics for high dimensional data. After giving some brief motivations in a short introduction, we describe new advances in the understanding of Bayes posterior computation as well as theoretical contributions in non parametric and high dimensional Bayesian approaches. From an applied point of view, we describe the so-called SQMC particle method to compute posterior Bayesian law, and provide a nonparametric analysis of the widespread ABC method. On the theoretical side, we describe some recent advances in Bayesian consistency for a nonparametric hidden Markov model as well as new PAC-Bayesian results for different models of high dimensional regression.",\npublisher={EDP Sciences},\nDOI= "10.1051/proc/201551016",\nurl= "https://doi.org/10.1051/proc/201551016",\nurl_PDF="https://www.esaim-proc.org/articles/proc/pdf/2015/04/proc145116.pdf",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n This paper proposes to review some recent developments in Bayesian statistics for high dimensional data. After giving some brief motivations in a short introduction, we describe new advances in the understanding of Bayes posterior computation as well as theoretical contributions in non parametric and high dimensional Bayesian approaches. From an applied point of view, we describe the so-called SQMC particle method to compute posterior Bayesian law, and provide a nonparametric analysis of the widespread ABC method. On the theoretical side, we describe some recent advances in Bayesian consistency for a nonparametric hidden Markov model as well as new PAC-Bayesian results for different models of high dimensional regression.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2013\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n PAC-Bayesian estimation and prediction in sparse additive models.\n \n \n \n \n\n\n \n Guedj, B.; and Alquier, P.\n\n\n \n\n\n\n Electron. J. Statist., 7: 264–291. 2013.\n \n\n\n\n
\n\n\n\n \n \n \"PAC-BayesianPaper\n  \n \n \n \"PAC-Bayesian arxiv\n  \n \n \n \"PAC-Bayesian pdf\n  \n \n \n \"PAC-Bayesian code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@article{guedj2013,\nauthor = "Guedj, Benjamin and Alquier, Pierre",\ndoi = "10.1214/13-EJS771",\nfjournal = "Electronic Journal of Statistics",\njournal = "Electron. J. Statist.",\npages = "264--291",\nabstract = "The present paper is about estimation and prediction in high-dimensional additive models under a sparsity assumption ($p\\gg n$ paradigm). A PAC-Bayesian strategy is investigated, delivering oracle inequalities in probability. The implementation is performed through recent outcomes in high-dimensional MCMC algorithms, and the performance of our method is assessed on simulated data.",\npublisher = "The Institute of Mathematical Statistics and the Bernoulli Society",\ntitle = {{PAC-Bayesian} estimation and prediction in sparse additive models},\nurl = "https://doi.org/10.1214/13-EJS771",\nvolume = "7",\nyear = "2013",\nurl_arXiv = "https://arxiv.org/abs/1208.1211",\nurl_PDF = "https://arxiv.org/pdf/1208.1211.pdf",\nurl_Code = "https://cran.r-project.org/package=pacbpred",\nkeywords={mine}\n}\n\n
\n
\n\n\n
\n The present paper is about estimation and prediction in high-dimensional additive models under a sparsity assumption ($p ≫ n$ paradigm). A PAC-Bayesian strategy is investigated, delivering oracle inequalities in probability. The implementation is performed through recent outcomes in high-dimensional MCMC algorithms, and the performance of our method is assessed on simulated data.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Aggregation of estimators and classifiers: theory and methods.\n \n \n \n \n\n\n \n Guedj, B.\n\n\n \n\n\n\n Ph.D. Thesis, Université Pierre et Marie Curie - Paris VI, December 2013.\n \n\n\n\n
\n\n\n\n \n \n \"AggregationPaper\n  \n \n \n \"Aggregation pdf\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@phdthesis{guedj2013phd,\nTITLE = {Aggregation of estimators and classifiers: theory and methods},\nAUTHOR = {Guedj, Benjamin},\nURL = {https://tel.archives-ouvertes.fr/tel-00922353},\nSCHOOL = {{Universit{\\'e} Pierre et Marie Curie - Paris VI}},\nYEAR = {2013},\nMONTH = Dec,\nabstract = "This thesis is devoted to the study of both theoretical and practical properties of various aggregation techniques. We first extend the PAC-Bayesian theory to the high dimensional paradigm in the additive and logistic regression settings. We prove that our estimators are nearly minimax optimal, and we provide an MCMC implementation, backed up by numerical simulations. Next, we introduce an original nonlinear aggregation strategy. Its theoretical merits are presented, and we benchmark the method—called COBRA—on a lengthy series of numerical experiments. Finally, a Bayesian approach to model admixture in population genetics is presented, along with its MCMC implementation. All approaches introduced in this thesis are freely available on the author’s website.",\nKEYWORDS = {Aggregation ; oracle inequalities ; PAC-Bayesian theory ; sparsity ; Agr{\\'e}gation ; r{\\'e}gression ; classification ; in{\\'e}galit{\\'e}s oracles ; th{\\'e}orie PAC-bay{\\'e}sienne ; COBRA ; MCMC ; parcimonie, mine},\nTYPE = {Theses},\nurl_PDF = "https://bguedj.github.io/files/bguedj-phd.pdf"\n}\n\n
\n
\n\n\n
\n This thesis is devoted to the study of both theoretical and practical properties of various aggregation techniques. We first extend the PAC-Bayesian theory to the high dimensional paradigm in the additive and logistic regression settings. We prove that our estimators are nearly minimax optimal, and we provide an MCMC implementation, backed up by numerical simulations. Next, we introduce an original nonlinear aggregation strategy. Its theoretical merits are presented, and we benchmark the method—called COBRA—on a lengthy series of numerical experiments. Finally, a Bayesian approach to model admixture in population genetics is presented, along with its MCMC implementation. All approaches introduced in this thesis are freely available on the author’s website.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2011\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Estimating the location and shape of hybrid zones.\n \n \n \n \n\n\n \n Guedj, B.; and Guillot, G.\n\n\n \n\n\n\n Molecular Ecology Resources, 11(6): 1119-1123. 2011.\n \n\n\n\n
\n\n\n\n \n \n \"EstimatingPaper\n  \n \n \n \"Estimating code\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{guedj2011,\nauthor = {Guedj, Benjamin and Guillot, Gilles},\ntitle = {Estimating the location and shape of hybrid zones},\njournal = {Molecular Ecology Resources},\nvolume = {11},\nnumber = {6},\npages = {1119-1123},\nyear = {2011},\nabstract="We propose a new model to make use of georeferenced genetic data for inferring the location and shape of a hybrid zone. The model output includes the posterior distribution of a parameter that quantifies the width of the hybrid zone. The model proposed is implemented in the GUI and command‐line versions of the Geneland program versions $\\geq$3.3.0. Information about the program can be found on http://www2.imm.dtu.dk/gigu/Geneland/.",\nkeywords = {Population structure, hybridization, admixture, selection pressure, Markov chain Monte Carlo, Geneland, mine},\ndoi = {10.1111/j.1755-0998.2011.03045.x},\nurl = {https://onlinelibrary.wiley.com/doi/abs/10.1111/j.1755-0998.2011.03045.x},\nurl_Code = {https://i-pri.org/special/Biostatistics/Software/Geneland/}\n}\n\n
\n
\n\n\n
\n We propose a new model to make use of georeferenced genetic data for inferring the location and shape of a hybrid zone. The model output includes the posterior distribution of a parameter that quantifies the width of the hybrid zone. The model proposed is implemented in the GUI and command‐line versions of the Geneland program (versions ≥ 3.3.0). Information about the program can be found on http://www2.imm.dtu.dk/gigu/Geneland/.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n 2010\n \n \n (1)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n A Bayesian modelling of the hybridization mechanism.\n \n \n \n\n\n \n Guedj, B.\n\n\n \n\n\n\n Master's thesis, Sorbonne Université and Danmarks Tekniske Universitet, 2010.\n \n\n\n\n
\n\n\n\n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@mastersthesis{guedj2010,\nauthor={Guedj, Benjamin},\ntitle = {A {Bayesian} modelling of the hybridization mechanism},\nyear = {2010},\nschool = {Sorbonne Université and Danmarks Tekniske Universitet},\nkeywords={mine}\n}\n\n
\n
\n\n\n\n
\n\n\n\n\n\n
\n
\n\n\n\n\n
\n\n\n \n\n \n \n \n \n\n
\n"}; document.write(bibbase_data.data);