Approximate inference in graphical models (20)
Max-margin learning with the Bayes Factor. Krishnan, R. G.; Khandelwal, A.; Ranganath, R.; and Sontag, D. In Proceedings of the Conference on Uncertainty in Artificial Intelligence (UAI), 2018.

@inproceedings{KrishnanEtAl_uai18,
  author = {Rahul G. Krishnan and Arjun Khandelwal and Rajesh Ranganath and David Sontag},
  title = {Max-margin learning with the Bayes Factor},
  booktitle = {Proceedings of the Conference on Uncertainty in Artificial Intelligence ({UAI})},
  year = {2018},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},
  abstract = {We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_UAI18.pdf}
}

Semi-Amortized Variational Autoencoders. Kim, Y.; Wiseman, S.; Miller, A. C.; Sontag, D.; and Rush, A. M. In Proceedings of the 35th International Conference on Machine Learning (ICML), 2018.

@inproceedings{KimEtAl_icml18,
  author = {Yoon Kim and Sam Wiseman and Andrew C. Miller and David Sontag and Alexander M. Rush},
  title = {Semi-Amortized Variational Autoencoders},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning ({ICML})},
  year = {2018},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1802.02550.pdf},
  abstract = {Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach: use AVI to initialize the variational parameters and run stochastic variational inference (SVI) to refine them. Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.}
}

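To make the semi-amortized recipe concrete, here is a minimal sketch (not the authors' implementation; encoder, decoder, and the Gaussian-posterior/Bernoulli-likelihood assumptions are illustrative) of the inference-time half of the idea: the inference network supplies the initial variational parameters, and a few SVI gradient steps on the per-instance ELBO refine them. The paper additionally backpropagates through the SVI steps during training, which this sketch omits.

import torch
import torch.nn.functional as F

def elbo(x, mu, logvar, decoder):
    # Single-sample Monte Carlo ELBO with q(z|x) = N(mu, diag(exp(logvar))),
    # a standard-normal prior, and a decoder that returns Bernoulli logits.
    std = torch.exp(0.5 * logvar)
    z = mu + std * torch.randn_like(std)  # reparameterized sample
    log_px_z = -F.binary_cross_entropy_with_logits(decoder(z), x, reduction="sum")
    kl = -0.5 * torch.sum(1.0 + logvar - mu.pow(2) - logvar.exp())
    return log_px_z - kl

def semi_amortized_inference(x, encoder, decoder, svi_steps=5, lr=1e-2):
    mu, logvar = encoder(x)               # amortized (AVI) initialization
    mu = mu.detach().clone().requires_grad_(True)
    logvar = logvar.detach().clone().requires_grad_(True)
    opt = torch.optim.SGD([mu, logvar], lr=lr)
    for _ in range(svi_steps):            # instance-specific SVI refinement
        opt.zero_grad()
        (-elbo(x, mu, logvar, decoder)).backward()
        opt.step()
    return mu, logvar
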
Optimality of Approximate Inference Algorithms on Stable Instances. Lang, H.; Sontag, D.; and Vijayaraghavan, A. In Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS), 2018. JMLR: W&CP.

@inproceedings{LangEtAl_aistats18,
  title = {Optimality of Approximate Inference Algorithms on Stable Instances},
  author = {Hunter Lang and David Sontag and Aravindan Vijayaraghavan},
  booktitle = {Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS)},
  publisher = {JMLR: W\&CP},
  year = {2018},
  keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},
  url_Paper = {http://proceedings.mlr.press/v84/lang18a.html},
  abstract = {Approximate algorithms for structured prediction problems -- such as LP relaxations and the popular alpha-expansion algorithm (Boykov et al. 2001) -- typically far exceed their theoretical performance guarantees on real-world instances. These algorithms often find solutions that are very close to optimal. The goal of this paper is to partially explain the performance of alpha-expansion and an LP relaxation algorithm on MAP inference in Ferromagnetic Potts models (FPMs). Our main results give stability conditions under which these two algorithms provably recover the optimal MAP solution. These theoretical results complement numerous empirical observations of good performance.}
}

Structured Inference Networks for Nonlinear State Space Models. Krishnan, R. G.; Shalit, U.; and Sontag, D. In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, pages 2101–2109, 2017.

@inproceedings{KrishnanEtAl_aaai17,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Structured Inference Networks for Nonlinear State Space Models},
  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence},
  pages = {2101--2109},
  year = {2017},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Health care, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1609.09869.pdf},
  abstract = {Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.}
}

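As a rough sketch of what a structured inference network can look like (illustrative PyTorch, not the authors' code): an RNN summarizes the observation sequence, and each variational factor for z_t conditions on the previous latent state, mirroring the Markov factorization of the true posterior. The paper's preferred variants condition on current and future observations, e.g. via a backward RNN; for brevity this sketch conditions on past observations only.

import torch
import torch.nn as nn

class StructuredInferenceNet(nn.Module):
    def __init__(self, x_dim, z_dim, h_dim=64):
        super().__init__()
        self.rnn = nn.GRU(x_dim, h_dim, batch_first=True)
        self.combine = nn.Linear(z_dim + h_dim, h_dim)
        self.mu = nn.Linear(h_dim, z_dim)
        self.logvar = nn.Linear(h_dim, z_dim)

    def forward(self, x):                     # x: (batch, T, x_dim)
        h, _ = self.rnn(x)                    # per-step summaries of the observations
        z_prev = torch.zeros(x.size(0), self.mu.out_features, device=x.device)
        zs = []
        for t in range(x.size(1)):
            # q(z_t | z_{t-1}, x): combine previous latent state with the RNN summary
            c = torch.tanh(self.combine(torch.cat([z_prev, h[:, t]], dim=-1)))
            mu, logvar = self.mu(c), self.logvar(c)
            z_prev = mu + torch.exp(0.5 * logvar) * torch.randn_like(mu)
            zs.append(z_prev)
        return torch.stack(zs, dim=1)         # sampled latent trajectory
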
Tightness of LP Relaxations for Almost Balanced Models. Weller, A.; Rowland, M.; and Sontag, D. In Gretton, A.; and Robert, C. C., editors, Proceedings of the 19th International Conference on Artificial Intelligence and Statistics, volume 51 of Proceedings of Machine Learning Research, pages 47–55, Cadiz, Spain, 09–11 May 2016. PMLR.

@inproceedings{WellerEtAl_aistats16,
  title = {Tightness of LP Relaxations for Almost Balanced Models},
  author = {Adrian Weller and Mark Rowland and David Sontag},
  booktitle = {Proceedings of the 19th International Conference on Artificial Intelligence and Statistics},
  pages = {47--55},
  year = {2016},
  editor = {Arthur Gretton and Christian C. Robert},
  volume = {51},
  series = {Proceedings of Machine Learning Research},
  address = {Cadiz, Spain},
  month = {09--11 May},
  publisher = {PMLR},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/WellerEtAl_uai16.pdf},
  abstract = {Linear programming (LP) relaxations are widely used to attempt to identify a most likely configuration of a discrete graphical model. In some cases, the LP relaxation attains an optimum vertex at an integral location and thus guarantees an exact solution to the original optimization problem. When this occurs, we say that the LP relaxation is tight. Here we consider binary pairwise models and derive sufficient conditions for guaranteed tightness of (i) the standard LP relaxation on the local polytope LP+LOC, and (ii) the LP relaxation on the triplet-consistent polytope LP+TRI (the next level in the Sherali-Adams hierarchy). We provide simple new proofs of earlier results and derive significant novel results including that LP+TRI is tight for any model where each block is balanced or almost balanced, and a decomposition theorem that may be used to break apart complex models into smaller pieces. An almost balanced (sub-)model is one that contains no frustrated cycles except through one privileged variable.}
}

Barrier Frank-Wolfe for Marginal Inference. Krishnan, R. G.; Lacoste-Julien, S.; and Sontag, D. In Proceedings of the 28th International Conference on Neural Information Processing Systems (NIPS'15), pages 532–540, Cambridge, MA, USA, 2015. MIT Press.

@inproceedings{KrishnanEtAl_nips15,
  author = {Krishnan, Rahul G. and Lacoste-Julien, Simon and Sontag, David},
  title = {Barrier Frank-Wolfe for Marginal Inference},
  booktitle = {Proceedings of the 28th International Conference on Neural Information Processing Systems},
  series = {NIPS'15},
  year = {2015},
  location = {Montreal, Canada},
  pages = {532--540},
  numpages = {9},
  publisher = {MIT Press},
  address = {Cambridge, MA, USA},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_nips15.pdf},
  abstract = {We introduce a globally-convergent algorithm for optimizing the tree-reweighted (TRW) variational objective over the marginal polytope. The algorithm is based on the conditional gradient method (Frank-Wolfe) and moves pseudomarginals within the marginal polytope through repeated maximum a posteriori (MAP) calls. This modular structure enables us to leverage black-box MAP solvers (both exact and approximate) for variational inference, and obtains more accurate results than tree-reweighted algorithms that optimize over the local consistency relaxation. Theoretically, we bound the sub-optimality for the proposed algorithm despite the TRW objective having unbounded gradients at the boundary of the marginal polytope. Empirically, we demonstrate the increased quality of results found by tightening the relaxation over the marginal polytope as well as the spanning tree polytope on synthetic and real-world instances.}
}

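The core loop here is standard conditional gradient with a MAP solver as the linear-minimization oracle. The sketch below shows that structure under illustrative names (grad_trw, map_oracle); it omits the barrier and the contraction of the polytope that the paper introduces to cope with the TRW objective's unbounded gradients at the boundary.

import numpy as np

def frank_wolfe_trw(grad_trw, map_oracle, mu0, iters=100):
    # grad_trw(mu): gradient of the TRW objective (to be minimized) at pseudomarginals mu.
    # map_oracle(theta): MAP call returning the vertex of the marginal polytope that
    # maximizes <theta, .> -- this serves as Frank-Wolfe's linear-minimization oracle.
    mu = mu0.copy()
    for t in range(iters):
        g = grad_trw(mu)
        s = map_oracle(-g)            # minimizes <g, s> over the marginal polytope
        gamma = 2.0 / (t + 2.0)       # standard open-loop step size
        mu = mu + gamma * (s - mu)    # convex combination stays inside the polytope
    return mu
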
How Hard is Inference for Structured Prediction? Globerson, A.; Roughgarden, T.; Sontag, D.; and Yildirim, C. In Proceedings of the 32nd International Conference on Machine Learning (ICML), volume 37, pages 2181–2190, 2015. JMLR: W&CP.

@inproceedings{GlobersonEtAl_icml15,
  author = {Amir Globerson and Tim Roughgarden and David Sontag and Cafer Yildirim},
  title = {How Hard is Inference for Structured Prediction?},
  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},
  year = {2015},
  publisher = {JMLR: W\&CP},
  volume = {37},
  pages = {2181--2190},
  keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/GloRouSonYil_icml15.pdf},
  abstract = {Structured prediction tasks in machine learning involve the simultaneous prediction of multiple labels. This is often done by maximizing a score function on the space of labels, which decomposes as a sum of pairwise elements, each depending on two specific labels. The goal of this paper is to develop a theoretical explanation of the empirical effectiveness of heuristic inference algorithms for solving such structured prediction problems. We study the minimum-achievable expected Hamming error in such problems, highlighting the case of 2D grid graphs, which are common in machine vision applications. Our main theorems provide tight upper and lower bounds on this error, as well as a polynomial-time algorithm that achieves the bound.}
}

Lifted Tree-Reweighted Variational Inference. Bui, H. H.; Huynh, T. N.; and Sontag, D. In Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI-14), 2014.

@inproceedings{BuiHuySon_uai14,
  author = {Hung Hai Bui and Tuyen N. Huynh and David Sontag},
  title = {Lifted Tree-Reweighted Variational Inference},
  booktitle = {Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence ({UAI}-14)},
  year = {2014},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/BuiHuySon_uai14.pdf},
  abstract = {We analyze variational inference for highly symmetric graphical models such as those arising from first-order probabilistic models. We first show that for these graphical models, the tree-reweighted variational objective lends itself to a compact lifted formulation which can be solved much more efficiently than the standard TRW formulation for the ground graphical model. Compared to earlier work on lifted belief propagation, our formulation leads to a convex optimization problem for lifted marginal inference and provides an upper bound on the partition function. We provide two approaches for improving the lifted TRW upper bound. The first is a method for efficiently computing maximum spanning trees in highly symmetric graphs, which can be used to optimize the TRW edge appearance probabilities. The second is a method for tightening the relaxation of the marginal polytope using lifted cycle inequalities and novel exchangeable cluster consistency constraints.}
}

Understanding the Bethe Approximation: When and How can it go Wrong? Weller, A.; Tang, K.; Sontag, D.; and Jebara, T. In Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI-14), 2014.

@inproceedings{WellerEtAl_uai14,
  author = {Adrian Weller and Kui Tang and David Sontag and Tony Jebara},
  title = {Understanding the {B}ethe Approximation: When and How can it go Wrong?},
  booktitle = {Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence ({UAI}-14)},
  year = {2014},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/WellerEtAl_uai14.pdf},
  abstract = {Belief propagation is a remarkably effective tool for inference, even when applied to networks with cycles. It may be viewed as a way to seek the minimum of the Bethe free energy, though with no convergence guarantee in general. A variational perspective shows that, compared to exact inference, this minimization employs two forms of approximation: (i) the true entropy is approximated by the Bethe entropy, and (ii) the minimization is performed over a relaxation of the marginal polytope termed the local polytope. Here we explore when and how the Bethe approximation can fail for binary pairwise models by examining each aspect of the approximation, deriving results both analytically and with new experimental methods.}
}

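For reference, the objective in question: fixed points of loopy belief propagation are stationary points of the Bethe free energy, which for a pairwise model with beliefs b, potentials \theta, and variable degrees d_i takes the standard form

\[
F_{\mathrm{Bethe}}(b) = -\sum_{i \in V}\sum_{x_i} b_i(x_i)\,\theta_i(x_i) - \sum_{(i,j) \in E}\sum_{x_i,x_j} b_{ij}(x_i,x_j)\,\theta_{ij}(x_i,x_j) - H_{\mathrm{Bethe}}(b),
\]
\[
H_{\mathrm{Bethe}}(b) = \sum_{(i,j) \in E} H(b_{ij}) - \sum_{i \in V} (d_i - 1)\,H(b_i),
\]

where the Bethe entropy replaces the true entropy and b ranges over the local polytope rather than the marginal polytope, the two approximations the paper dissects.
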
Efficiently Searching for Frustrated Cycles in MAP Inference. Sontag, D.; Choe, D. K.; and Li, Y. In Proceedings of the Twenty-Eighth Conference on Uncertainty in Artificial Intelligence (UAI-12), pages 795–804, Corvallis, Oregon, 2012. AUAI Press.

@inproceedings{SontagChoeLi_uai12,
  author = {David Sontag and Do Kook Choe and Yitao Li},
  title = {Efficiently Searching for Frustrated Cycles in {MAP} Inference},
  booktitle = {Proceedings of the Twenty-Eighth Conference on Uncertainty in Artificial Intelligence ({UAI}-12)},
  publisher = {AUAI Press},
  address = {Corvallis, Oregon},
  pages = {795--804},
  year = {2012},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_uai12.pdf},
  abstract = {Dual decomposition provides a tractable framework for designing algorithms for finding the most probable (MAP) configuration in graphical models. However, for many real-world inference problems, the typical decomposition has a large integrality gap, due to frustrated cycles. One way to tighten the relaxation is to introduce additional constraints that explicitly enforce cycle consistency. Earlier work showed that cluster-pursuit algorithms, which iteratively introduce cycle and other higher-order consistency constraints, allow one to exactly solve many hard inference problems. However, these algorithms explicitly enumerate a candidate set of clusters, limiting them to triplets or other short cycles. We solve the search problem for cycle constraints, giving a nearly linear time algorithm for finding the most frustrated cycle of arbitrary length. We show how to use this search algorithm together with the dual decomposition framework and cluster-pursuit. The new algorithm exactly solves MAP inference problems arising from relational classification and stereo vision.}
}

Introduction to Dual Decomposition for Inference. Sontag, D.; Globerson, A.; and Jaakkola, T. In Sra, S.; Nowozin, S.; and Wright, S. J., editors, Optimization for Machine Learning, pages 219–254. MIT Press, 2012.

@incollection{SonGloJaa_optbook,
  author = {David Sontag and Amir Globerson and Tommi Jaakkola},
  title = {Introduction to Dual Decomposition for Inference},
  booktitle = {Optimization for Machine Learning},
  editor = {Suvrit Sra and Sebastian Nowozin and Stephen J. Wright},
  pages = {219--254},
  publisher = {MIT Press},
  year = {2012},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/SonGloJaa_optbook.pdf},
  abstract = {Many inference problems with discrete variables result in a difficult combinatorial optimization problem. In recent years, the technique of dual decomposition, also called Lagrangian relaxation, has proven to be a powerful means of solving these inference problems by decomposing them into simpler components that are repeatedly solved independently and combined into a global solution. In this chapter, we introduce the general technique of dual decomposition through its application to the problem of finding the most likely (MAP) assignment in Markov random fields. We discuss both subgradient and block coordinate descent approaches to solving the dual problem. The resulting message-passing algorithms are similar to max-product, but can be shown to solve a linear programming relaxation of the MAP problem. We show how many of the MAP algorithms are related to each other, and also quantify when the MAP solution can and cannot be decoded directly from the dual solution.}
}

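A minimal sketch of the subgradient variant the chapter describes, assuming two tractable subproblems over shared binary variables (solve_f and solve_g are placeholder oracles, each maximizing its local score shifted by the Lagrangian term):

import numpy as np

def dual_decomposition(solve_f, solve_g, n_vars, iters=100, step=1.0):
    # solve_f(lam): argmax_x f(x) + lam . x   (tractable; returns a 0/1 vector)
    # solve_g(lam): argmax_x g(x) - lam . x   (tractable; returns a 0/1 vector)
    lam = np.zeros(n_vars)
    for t in range(1, iters + 1):
        xf = solve_f(lam)
        xg = solve_g(lam)
        if np.array_equal(xf, xg):
            return xf                      # agreement certifies an exact MAP solution
        lam -= (step / t) * (xf - xg)      # subgradient step on the dual objective
    return xf                              # no agreement: decode heuristically

When the two maximizers agree, the Lagrangian terms cancel and the common assignment is provably optimal for the original problem; otherwise the multipliers keep pushing the subproblems toward agreement.
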
Complexity of Inference in Latent Dirichlet Allocation. Sontag, D.; and Roy, D. In Shawe-Taylor, J.; Zemel, R.; Bartlett, P.; Pereira, F.; and Weinberger, K., editors, Advances in Neural Information Processing Systems 24, pages 1008–1016. MIT Press, 2011.

@incollection{SontagRoy_nips11,
  author = {David Sontag and Dan Roy},
  title = {Complexity of Inference in Latent Dirichlet Allocation},
  booktitle = {Advances in Neural Information Processing Systems 24},
  editor = {J. Shawe-Taylor and R.S. Zemel and P. Bartlett and F.C.N. Pereira and K.Q. Weinberger},
  pages = {1008--1016},
  publisher = {MIT Press},
  year = {2011},
  keywords = {Machine learning, Approximate inference in graphical models, Topic models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/SontagRoy_nips11.pdf},
  abstract = {We consider the computational complexity of probabilistic inference in Latent Dirichlet Allocation (LDA). First, we study the problem of finding the maximum a posteriori (MAP) assignment of topics to words, where the document's topic distribution is integrated out. We show that, when the effective number of topics per document is small, exact inference takes polynomial time. In contrast, we show that, when a document has a large number of topics, finding the MAP assignment of topics to words in LDA is NP-hard. Next, we consider the problem of finding the MAP topic distribution for a document, where the topic-word assignments are integrated out. We show that this problem is also NP-hard. Finally, we briefly discuss the problem of sampling from the posterior, showing that this is NP-hard in one restricted setting, but leaving open the general question.}
}

On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing. Rush, A. M.; Sontag, D.; Collins, M.; and Jaakkola, T. In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1–11, 2010.

@inproceedings{RusSonColJaa_emnlp10,
  author = {Alexander M. Rush and David Sontag and Michael Collins and Tommi Jaakkola},
  title = {On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing},
  booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  pages = {1--11},
  year = {2010},
  keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/RusSonColJaa_emnlp10.pdf},
  abstract = {This paper introduces dual decomposition as a framework for deriving inference algorithms for NLP problems. The approach relies on standard dynamic-programming algorithms as oracle solvers for sub-problems, together with a simple method for forcing agreement between the different oracles. The approach provably solves a linear programming (LP) relaxation of the global inference problem. It leads to algorithms that are simple, in that they use existing decoding algorithms; efficient, in that they avoid exact algorithms for the full model; and often exact, in that empirically they often recover the correct solution in spite of using an LP relaxation. We give experimental results on two problems: 1) the combination of two lexicalized parsing models; and 2) the combination of a lexicalized parsing model and a trigram part-of-speech tagger.}
}

Dual Decomposition for Parsing with Non-Projective Head Automata. Koo, T.; Rush, A. M.; Collins, M.; Jaakkola, T.; and Sontag, D. In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1288–1298, 2010.

@inproceedings{KooEtAl_emnlp10,
  author = {Terry Koo and Alexander M. Rush and Michael Collins and Tommi Jaakkola and David Sontag},
  title = {Dual Decomposition for Parsing with Non-Projective Head Automata},
  booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},
  pages = {1288--1298},
  year = {2010},
  keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KooEtAl_emnlp10.pdf},
  abstract = {This paper introduces algorithms for non-projective parsing based on dual decomposition. We focus on parsing algorithms for non-projective head automata, a generalization of head-automata models to non-projective structures. The dual decomposition algorithms are simple and efficient, relying on standard dynamic programming and minimum spanning tree algorithms. They provably solve an LP relaxation of the non-projective parsing problem. Empirically the LP relaxation is very often tight: for many languages, exact solutions are achieved on over 98\% of test sentences. The accuracy of our models is higher than previous work on a broad range of datasets.}
}

Approximate Inference in Graphical Models using LP Relaxations. Sontag, D. Ph.D. thesis, Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science, 2010.

@phdthesis{Sontag_thesis10,
  title = {Approximate Inference in Graphical Models using LP Relaxations},
  author = {David Sontag},
  school = {Massachusetts Institute of Technology},
  address = {Department of Electrical Engineering and Computer Science},
  year = {2010},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_phd_thesis.pdf},
  abstract = {Graphical models such as Markov random fields have been successfully applied to a wide variety of fields, from computer vision and natural language processing, to computational biology. Exact probabilistic inference is generally intractable in complex models having many dependencies between the variables. We present new approaches to approximate inference based on linear programming (LP) relaxations. Our algorithms optimize over the cycle relaxation of the marginal polytope, which we show to be closely related to the first lifting of the Sherali-Adams hierarchy, and is significantly tighter than the pairwise LP relaxation. We show how to efficiently optimize over the cycle relaxation using a cutting-plane algorithm that iteratively introduces constraints into the relaxation. We provide a criterion to determine which constraints would be most helpful in tightening the relaxation, and give efficient algorithms for solving the search problem of finding the best cycle constraint to add according to this criterion. By solving the LP relaxations in the dual, we obtain efficient message-passing algorithms that, when the relaxations are tight, can provably find the most likely (MAP) configuration. Our algorithms succeed at finding the MAP configuration in protein side-chain placement, protein design, and stereo vision problems.}
}

Tree Block Coordinate Descent for MAP in Graphical Models. Sontag, D.; and Jaakkola, T. In Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AI-STATS), volume 8, pages 544–551, 2009. JMLR: W&CP.

@inproceedings{SonJaa_aistats09,
  title = {Tree Block Coordinate Descent for {MAP} in Graphical Models},
  author = {David Sontag and Tommi Jaakkola},
  booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AI-STATS)},
  publisher = {JMLR: W\&CP},
  volume = {8},
  pages = {544--551},
  year = {2009},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_aistats09.pdf},
  abstract = {A number of linear programming relaxations have been proposed for finding most likely settings of the variables (MAP) in large probabilistic models. The relaxations are often succinctly expressed in the dual and reduce to different types of reparameterizations of the original model. The dual objectives are typically solved by performing local block coordinate descent steps. In this work, we show how to perform block coordinate descent on spanning trees of the graphical model. We also show how all of the earlier dual algorithms are related to each other, giving transformations from one type of reparameterization to another while maintaining monotonicity relative to a common objective function. Finally, we quantify when the MAP solution can and cannot be decoded directly from the dual LP relaxation.}
}

Clusters and Coarse Partitions in LP Relaxations. Sontag, D.; Globerson, A.; and Jaakkola, T. In Koller, D.; Schuurmans, D.; Bengio, Y.; and Bottou, L., editors, Advances in Neural Information Processing Systems 21, pages 1537–1544, 2009. MIT Press.

@inproceedings{SonGloJaa_nips08,
  title = {Clusters and Coarse Partitions in {LP} Relaxations},
  author = {David Sontag and Amir Globerson and Tommi Jaakkola},
  booktitle = {Advances in Neural Information Processing Systems 21},
  editor = {D. Koller and D. Schuurmans and Y. Bengio and L. Bottou},
  pages = {1537--1544},
  publisher = {MIT Press},
  year = {2009},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_nips08.pdf},
  abstract = {We propose a new class of consistency constraints for Linear Programming (LP) relaxations for finding the most probable (MAP) configuration in graphical models. Usual cluster-based LP relaxations enforce joint consistency on the beliefs of a cluster of variables, with computational cost increasing exponentially with the size of the clusters. By partitioning the state space of a cluster and enforcing consistency only across partitions, we obtain a class of constraints which, although less tight, are computationally feasible for large clusters. We show how to solve the cluster selection and partitioning problem monotonically in the dual LP, using the current beliefs to guide these choices. We obtain a dual message passing algorithm and apply it to protein design problems where the variables have large state spaces and the usual cluster-based relaxations are very costly. The resulting method solves many of these problems exactly, and significantly faster than a method that does not use partitioning.}
}

Tightening LP Relaxations for MAP using Message-Passing. Sontag, D.; Meltzer, T.; Globerson, A.; Weiss, Y.; and Jaakkola, T. In 24th Conference on Uncertainty in Artificial Intelligence, pages 503–510, 2008. AUAI Press.

@inproceedings{SontagEtAl_uai08,
  title = {Tightening {LP} Relaxations for {MAP} using Message-Passing},
  author = {David Sontag and Talya Meltzer and Amir Globerson and Yair Weiss and Tommi Jaakkola},
  booktitle = {24th Conference on Uncertainty in Artificial Intelligence},
  pages = {503--510},
  publisher = {AUAI Press},
  year = {2008},
  keywords = {Machine learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_uai08.pdf},
  abstract = {Linear Programming (LP) relaxations have become powerful tools for finding the most probable (MAP) configuration in graphical models. These relaxations can be solved efficiently using message-passing algorithms such as belief propagation and, when the relaxation is tight, provably find the MAP configuration. The standard LP relaxation is not tight enough in many real-world problems, however, and this has led to the use of higher order cluster-based LP relaxations. The computational cost increases exponentially with the size of the clusters and limits the number and type of clusters we can use. We propose to solve the cluster selection problem monotonically in the dual LP, iteratively selecting clusters with guaranteed improvement, and quickly re-solving with the added clusters by reusing the existing solution. Our dual message-passing algorithm finds the MAP configuration in protein side-chain placement, protein design, and stereo problems, in cases where the standard LP relaxation fails.}
}

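For reference, the standard pairwise relaxation that these cluster-based constraints tighten: MAP inference is posed as a linear program over the local consistency polytope,

\[
\max_{\mu \ge 0}\; \sum_{i \in V}\sum_{x_i} \theta_i(x_i)\,\mu_i(x_i) + \sum_{(i,j) \in E}\sum_{x_i,x_j} \theta_{ij}(x_i,x_j)\,\mu_{ij}(x_i,x_j)
\]
\[
\text{s.t.}\quad \sum_{x_j} \mu_{ij}(x_i,x_j) = \mu_i(x_i) \;\;\forall (i,j) \in E,\, x_i, \qquad \sum_{x_i} \mu_i(x_i) = 1 \;\;\forall i \in V.
\]

The relaxation is tight when this LP has an integral optimum; cluster constraints enforce joint consistency over larger groups of variables at exponentially growing cost, which is why the paper selects clusters incrementally in the dual.
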
New Outer Bounds on the Marginal Polytope. Sontag, D.; and Jaakkola, T. In Platt, J.; Koller, D.; Singer, Y.; and Roweis, S., editors, Advances in Neural Information Processing Systems 20, pages 1393–1400, Cambridge, MA, 2008. MIT Press.

@inproceedings{SonJaa_nips08,
  title = {New Outer Bounds on the Marginal Polytope},
  author = {David Sontag and Tommi Jaakkola},
  booktitle = {Advances in Neural Information Processing Systems 20},
  editor = {J.C. Platt and D. Koller and Y. Singer and S. Roweis},
  publisher = {MIT Press},
  address = {Cambridge, MA},
  pages = {1393--1400},
  year = {2008},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_nips2007.pdf},
  url_Link = {http://people.csail.mit.edu/dsontag/papers/sontag_nips2007_addendum.txt},
  keywords = {Machine learning, Approximate inference in graphical models},
  abstract = {We give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints on the marginal polytope are derived through a series of projections onto the cut polytope. As a result, we obtain tighter upper bounds on the log-partition function. We also show empirically that the approximations of the marginals are significantly more accurate when using the tighter outer bounds. Finally, we demonstrate the advantage of the new constraints for finding the MAP assignment in protein structure prediction.}
}

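One standard family of outer bounds arising from the projection onto the cut polytope, stated here for reference: writing x_{ij} in [0,1] for the (pseudo)probability that the endpoints of edge (i,j) disagree, every cycle C and every subset F of C with |F| odd gives a cycle inequality

\[
\sum_{(i,j) \in F} x_{ij} \;-\; \sum_{(i,j) \in C \setminus F} x_{ij} \;\le\; |F| - 1,
\]

which is valid because any integral assignment cuts an even number of edges of a cycle.
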
\n \n\n \n \n \n \n \n \n Cutting Plane Algorithms for Variational Inference in Graphical Models.\n \n \n \n \n\n\n \n Sontag, D.\n\n\n \n\n\n\n Master's thesis, Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science, 2007.\n \n\n\n\n
\n\n\n\n \n \n \"Cutting paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@mastersthesis{Sontag_thesis07,
  title  = {Cutting Plane Algorithms for Variational Inference in Graphical Models},
  author = {David Sontag},
  school = {Massachusetts Institute of Technology},
  address = {Department of Electrical Engineering and Computer Science},
  year   = {2007},
  abstract = {In this thesis, we give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints are derived for the marginal polytope through a series of projections onto the cut polytope. Projecting onto a larger model gives an efficient separation algorithm for a large class of valid inequalities arising from each of the original projections. As a result, we obtain tighter upper bounds on the log-partition function than possible with previous variational inference algorithms. We also show empirically that our approximations of the marginals are significantly more accurate. This algorithm can also be applied to the problem of finding the Maximum a Posteriori assignment in an MRF, which corresponds to a linear program over the marginal polytope. One of the main contributions of the thesis is to bring together two seemingly different fields, polyhedral combinatorics and probabilistic inference, showing how certain results in either field can carry over to the other.},
  url_Paper = {http://people.csail.mit.edu/dsontag/masters_thesis.pdf},
  keywords = {Machine learning, Approximate inference in graphical models}
}
In this thesis, we give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints are derived for the marginal polytope through a series of projections onto the cut polytope. Projecting onto a larger model gives an efficient separation algorithm for a large class of valid inequalities arising from each of the original projections. As a result, we obtain tighter upper bounds on the log-partition function than possible with previous variational inference algorithms. We also show empirically that our approximations of the marginals are significantly more accurate. This algorithm can also be applied to the problem of finding the Maximum a Posteriori assignment in an MRF, which corresponds to a linear program over the marginal polytope. One of the main contributions of the thesis is to bring together two seemingly different fields, polyhedral combinatorics and probabilistic inference, showing how certain results in either field can carry over to the other.
Bayesian network structure learning (2)
SparsityBoost: A New Scoring Function for Learning Bayesian Network Structure. Brenner, E.; and Sontag, D. In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence (UAI-13), pages 112–121, Corvallis, Oregon, 2013. AUAI Press.
\n\n\n\n \n \n \"SparsityBoost: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BrennerSontag_uai13,\n author = {Eliot Brenner and David Sontag},\n title = {SparsityBoost: A New Scoring Function for Learning Bayesian Network Structure},\n booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence ({UAI}-13)},\n publisher = {AUAI Press},\n address = {Corvallis, Oregon},\n pages = {112--121},\n year = {2013},\n keywords = {Machine learning, Bayesian network structure learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/BrennerSontag_uai13.pdf},\n abstract = {We give a new consistent scoring function for structure learning of Bayesian networks. In contrast to traditional approaches to score-based structure learning, such as BDeu or MDL, the complexity penalty that we propose is data-dependent and is given by the probability that a conditional independence test correctly shows that an edge cannot exist. What really distinguishes this new scoring function from earlier work is that it has the property of becoming computationally easier to maximize as the amount of data increases. We prove a polynomial sample complexity result, showing that maximizing this score is guaranteed to correctly learn a structure with no false edges and a distribution close to the generating distribution, whenever there exists a Bayesian network which is a perfect map for the data generating distribution. Although the new score can be used with any search algorithm, we give empirical results showing that it is particularly effective when used together with a linear programming relaxation approach to Bayesian network structure learning.}\n}\n\n
\n We give a new consistent scoring function for structure learning of Bayesian networks. In contrast to traditional approaches to score-based structure learning, such as BDeu or MDL, the complexity penalty that we propose is data-dependent and is given by the probability that a conditional independence test correctly shows that an edge cannot exist. What really distinguishes this new scoring function from earlier work is that it has the property of becoming computationally easier to maximize as the amount of data increases. We prove a polynomial sample complexity result, showing that maximizing this score is guaranteed to correctly learn a structure with no false edges and a distribution close to the generating distribution, whenever there exists a Bayesian network which is a perfect map for the data generating distribution. Although the new score can be used with any search algorithm, we give empirical results showing that it is particularly effective when used together with a linear programming relaxation approach to Bayesian network structure learning.\n
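The following is a caricature of the score's key ingredient, not the paper's exact objective: a complexity term keyed to the confidence of an independence test, which grows linearly in the sample size exactly when the data look independent. The datasets, the threshold eps, and the crude large-deviation bound are all invented for illustration.

import numpy as np

rng = np.random.default_rng(0)
n = 5000
x = rng.integers(0, 2, n)
y = np.where(rng.random(n) < 0.1, 1 - x, x)        # y strongly depends on x
z = rng.integers(0, 2, n)                          # z independent of x

def empirical_mi(a, b):
    """Plug-in mutual information (in nats) between two discrete arrays."""
    joint = np.zeros((a.max() + 1, b.max() + 1))
    np.add.at(joint, (a, b), 1.0)
    joint /= joint.sum()
    pa, pb = joint.sum(1, keepdims=True), joint.sum(0, keepdims=True)
    nz = joint > 0
    return float((joint[nz] * np.log(joint[nz] / (pa * pb)[nz])).sum())

def edge_penalty(a, b, eps=0.02):
    """Caricature of a data-dependent complexity term: by a Sanov-style
    large-deviation argument, seeing empirical MI this far below eps when the
    true dependence is at least eps has probability roughly exp(-n * gap), so
    evidence against the edge accumulates linearly in n for independent pairs.
    The paper derives its score more carefully; this only shows the behavior
    that makes the objective easier to optimize as data grow."""
    return len(a) * max(eps - empirical_mi(a, b), 0.0)

print("x-y (dependent):   penalty = %.1f" % edge_penalty(x, y))
print("x-z (independent): penalty = %.1f" % edge_penalty(x, z))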
Learning Bayesian Network Structure using LP Relaxations. Jaakkola, T.; Sontag, D.; Globerson, A.; and Meila, M. In Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AI-STATS), volume 9, pages 358–365, 2010. JMLR: W&CP.
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JaaSonGloMei_aistats10,\n title  = {Learning {B}ayesian Network Structure using {LP} Relaxations},\n author = {Tommi Jaakkola and David Sontag and Amir Globerson and Marina Meila},\n booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AI-STATS)},\n publisher = {JMLR: W\\&CP},\n volume = {9},\n pages  = {358-365},\n year = {2010},\n keywords = {Machine learning, Bayesian network structure learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/structure_aistats10.pdf},\n abstract = {We propose to solve the combinatorial problem of finding the highest scoring Bayesian network structure from data. This structure learning problem can be viewed as an inference problem where the variables specify the choice of parents for each node in the graph. The key combinatorial difficulty arises from the global constraint that the graph structure has to be acyclic. We cast the structure learning problem as a linear program over the polytope defined by valid acyclic structures. In relaxing this problem, we maintain an outer bound approximation to the polytope and iteratively tighten it by searching over a new class of valid constraints. If an integral solution is found, it is guaranteed to be the optimal Bayesian network. When the relaxation is not tight, the fast dual algorithms we develop remain useful in combination with a branch and bound method. Empirical results suggest that the method is competitive or faster than alternative exact methods based on dynamic programming.}\n}\n\n
\n We propose to solve the combinatorial problem of finding the highest scoring Bayesian network structure from data. This structure learning problem can be viewed as an inference problem where the variables specify the choice of parents for each node in the graph. The key combinatorial difficulty arises from the global constraint that the graph structure has to be acyclic. We cast the structure learning problem as a linear program over the polytope defined by valid acyclic structures. In relaxing this problem, we maintain an outer bound approximation to the polytope and iteratively tighten it by searching over a new class of valid constraints. If an integral solution is found, it is guaranteed to be the optimal Bayesian network. When the relaxation is not tight, the fast dual algorithms we develop remain useful in combination with a branch and bound method. Empirical results suggest that the method is competitive or faster than alternative exact methods based on dynamic programming.\n
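A full cutting-plane LP solver is too long for a snippet, so here is a sketch of the exact dynamic-programming baseline the abstract compares against, which also shows where the acyclicity constraint bites: some variable must be a sink whose parents lie among the rest. The random local scores are stand-ins for any decomposable score.

import random
from functools import lru_cache
from itertools import combinations

random.seed(0)
V = tuple(range(5))
# Local scores for (child, parent-set) families, parent sets of size <= 2 here.
score = {(i, ps): random.random()
         for i in V for r in range(3)
         for ps in combinations([j for j in V if j != i], r)}

def best_family(i, allowed):
    """Highest-scoring parent set for variable i drawn from the allowed set."""
    cands = [k for k in score if k[0] == i and set(k[1]) <= allowed]
    return max(cands, key=score.get)

@lru_cache(maxsize=None)
def best_dag(subset):
    """Best DAG over frozenset `subset`: acyclicity is enforced because some
    variable must be a sink whose parents all lie in the remaining variables."""
    if not subset:
        return 0.0, ()
    options = []
    for sink in subset:
        rest = subset - {sink}
        sub_score, sub_fams = best_dag(rest)
        fam = best_family(sink, set(rest))
        options.append((sub_score + score[fam], sub_fams + (fam,)))
    return max(options)

total, families = best_dag(frozenset(V))
print(total, families)   # optimal structure as (child, parent-tuple) pairs

This enumerates all 2^n subsets, which is exactly the exponential cost the LP relaxation with iteratively tightened acyclicity constraints is designed to avoid.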
Causal inference (4)
Learning Weighted Representations for Generalization Across Designs. Johansson, F. D.; Kallus, N.; Shalit, U.; and Sontag, D. ArXiv e-prints arXiv:1802.08598. 2018.
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JohanssonEtAl_arxiv18,\n  author    = {Fredrik D. Johansson and Nathan Kallus and Uri Shalit and David Sontag},\n  title = {Learning Weighted Representations for Generalization Across Designs},\n  journal = {ArXiv e-prints arXiv:1802.08598},\narchivePrefix = "arXiv",\n   eprint = {1802.08598},\n primaryClass = "stat.ML",\n     year = 2018,\n keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1802.08598.pdf},\n  abstract = {Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.}\n}\n\n
\n Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.\n
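For orientation, here is only the re-weighting half of the recipe, using the standard density-ratio trick (train a classifier to distinguish designs, weight by its odds); the paper's contribution is to learn such weights jointly with a representation so as to minimize its generalization bound. The data and the shift below are synthetic.

import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
# Source ("observed design") and target ("desired design") samples with a shift.
Xs = rng.normal(0.0, 1.0, size=(2000, 2))
Xt = rng.normal(0.7, 1.0, size=(2000, 2))

# Classic importance weighting: a probabilistic classifier separating source
# from target gives w(x) = p_target(x) / p_source(x) up to a constant.
clf = LogisticRegression().fit(np.vstack([Xs, Xt]),
                               np.r_[np.zeros(len(Xs)), np.ones(len(Xt))])
p = clf.predict_proba(Xs)[:, 1]
w = p / (1 - p)
w *= len(w) / w.sum()                      # normalize to mean 1

# The re-weighted source mean now tracks the target mean.
print(Xs.mean(0), (w[:, None] * Xs).mean(0), Xt.mean(0))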
Causal Effect Inference with Deep Latent-Variable Models. Louizos, C.; Shalit, U.; Mooij, J.; Sontag, D.; Zemel, R. S.; and Welling, M. In Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS'17), 2017.
\n\n\n\n \n \n \"Causal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 12 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LouizosEtAl_arxiv17,\n  author    = {Christos Louizos and\n               Uri Shalit and\n               Joris Mooij and\n               David Sontag and\n               Richard S. Zemel and\n               Max Welling},\n  title     = {Causal Effect Inference with Deep Latent-Variable Models},\n booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n series = {NIPS'17},\n year = {2017},\n keywords = {Machine learning, Causal inference, Deep learning},\n url_Paper = {https://arxiv.org/pdf/1705.08821.pdf},\n abstract = {Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.}\n}\n\n
\n Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.\n
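A compressed sketch of the model family, not the authors' implementation: a VAE whose decoder follows the causal graph z -> x (proxies), z -> t (treatment), (z, t) -> y (outcome). The layer sizes, the unit-variance Gaussian likelihoods, and the toy data are invented; the paper's networks are deeper and its inference network includes extra auxiliary distributions.

import torch
import torch.nn as nn

d_x, d_z, h = 10, 5, 32
enc = nn.Sequential(nn.Linear(d_x + 2, h), nn.ELU(), nn.Linear(h, 2 * d_z))  # q(z|x,t,y)
dec_x = nn.Sequential(nn.Linear(d_z, h), nn.ELU(), nn.Linear(h, d_x))        # p(x|z) mean
dec_t = nn.Sequential(nn.Linear(d_z, h), nn.ELU(), nn.Linear(h, 1))          # p(t|z) logit
dec_y = nn.Sequential(nn.Linear(d_z + 1, h), nn.ELU(), nn.Linear(h, 1))      # p(y|z,t) mean

def elbo(x, t, y):
    mu, logvar = enc(torch.cat([x, t, y], dim=1)).chunk(2, dim=1)
    z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()          # reparameterization
    rec_x = -((x - dec_x(z)) ** 2).sum(1)                         # Gaussian log-lik, up to constants
    rec_t = -nn.functional.binary_cross_entropy_with_logits(
        dec_t(z), t, reduction="none").sum(1)
    rec_y = -((y - dec_y(torch.cat([z, t], dim=1))) ** 2).sum(1)
    kl = 0.5 * (mu ** 2 + logvar.exp() - logvar - 1).sum(1)       # KL(q || N(0, I))
    return (rec_x + rec_t + rec_y - kl).mean()

x = torch.randn(64, d_x)
t = torch.bernoulli(torch.full((64, 1), 0.5))
y = torch.randn(64, 1)
(-elbo(x, t, y)).backward()   # train with any optimizer; individual effects are then
                              # estimated by contrasting E[y|z, t=1] and E[y|z, t=0]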
Estimating individual treatment effect: generalization bounds and algorithms. Shalit, U.; Johansson, F. D.; and Sontag, D. In Proceedings of the 34th International Conference on Machine Learning, pages 3076–3085, 2017.
\n\n\n\n \n \n \"Estimating paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ShalitEtAl_icml17,\n  author    = {Uri Shalit and\n               Fredrik D. Johansson and\n               David Sontag},\n  title     = {Estimating individual treatment effect: generalization bounds and\n               algorithms},\n  booktitle = {Proceedings of the 34th International Conference on Machine Learning},\n  pages     = {3076-3085},\n  year      = {2017},\n  keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {http://arxiv.org/pdf/1606.03976.pdf},\n  abstract = {There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a "balanced" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.}\n}\n\n
\n There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a \"balanced\" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.\n
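A minimal sketch of the balancing objective, with sizes, data, and the trade-off weight alpha invented: a shared representation, one outcome head per treatment arm, and a linear-kernel MMD (difference of representation means) as the simplest IPM stand-in. The paper derives its bound for Wasserstein and general MMD distances; swapping in those penalties changes only the ipm term.

import torch
import torch.nn as nn

phi = nn.Sequential(nn.Linear(25, 64), nn.ELU(), nn.Linear(64, 32))
heads = nn.ModuleList([nn.Linear(32, 1), nn.Linear(32, 1)])   # [control, treated]

def cfr_loss(x, t, y, alpha=1.0):
    r = phi(x)
    treated = t.squeeze(1).bool()
    y_hat = torch.where(t.bool(), heads[1](r), heads[0](r))
    factual = ((y - y_hat) ** 2).mean()                       # loss on observed outcomes
    # Linear-kernel MMD: squared distance between arm-wise representation means.
    ipm = ((r[treated].mean(0) - r[~treated].mean(0)) ** 2).sum()
    return factual + alpha * ipm

x = torch.randn(128, 25)
t = torch.bernoulli(torch.full((128, 1), 0.4))
y = torch.randn(128, 1)
cfr_loss(x, t, y).backward()
# Estimated ITE for a new unit x0: heads[1](phi(x0)) - heads[0](phi(x0)).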
Learning Representations for Counterfactual Inference. Johansson, F.; Shalit, U.; and Sontag, D. In Balcan, M. F.; and Weinberger, K. Q., editors, Proceedings of The 33rd International Conference on Machine Learning, volume 48 of Proceedings of Machine Learning Research, pages 3020–3029, New York, New York, USA, 20–22 Jun 2016. PMLR.
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{JohanssonEtAl_icml16,\n  title = \t {Learning Representations for Counterfactual Inference},\n  author = \t {Fredrik Johansson and Uri Shalit and David Sontag},\n  booktitle = \t {Proceedings of The 33rd International Conference on Machine Learning},\n  pages = \t {3020--3029},\n  year = \t {2016},\n  editor = \t {Maria Florina Balcan and Kilian Q. Weinberger},\n  volume = \t {48},\n  series = \t {Proceedings of Machine Learning Research},\n  address = \t {New York, New York, USA},\n  month = \t {20--22 Jun},\n  publisher = \t {PMLR},\n  keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JohanssonShalitSontag_icml16.pdf},\n  abstract = {Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, "Would this patient have lower blood sugar had she received a different medication?". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.}\n}\n\n
\n Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, \"Would this patient have lower blood sugar had she received a different medication?\". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.\n
Computational biology (2)
Cell-specific prediction and application of drug-induced gene expression profiles. Hodos, R.; Zhang, P.; Lee, H. C.; Duan, Q.; Wang, Z.; Clark, N. R.; Ma'ayan, A.; Wang, F.; Kidd, B.; Hu, J.; Sontag, D.; and Dudley, J. In Proceedings of the Pacific Symposium on Biocomputing (PSB), 2018.
\n\n\n\n \n \n \"Cell-specific paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HodosEtAl_PSB17,\n author = {Rachel Hodos and Ping Zhang and Hao Chih Lee and Qiaonan Duan and Zichen Wang and Neil R. Clark and Avi Ma'ayan and Fei Wang and Brian Kidd and Jianying Hu and David Sontag and Joel Dudley},\n title = {Cell-specific prediction and application of drug-induced gene expression profiles},\n booktitle = {Proceedings of the Pacific Symposium on Biocomputing (PSB)},\n year = {2018},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/HodosEtAl_PSB18.pdf},\n keywords = {Computational biology, Health care},\n abstract = {Gene expression profiling of in vitro drug perturbations is useful for many biomedical discovery applications including drug repurposing and elucidation of drug mechanisms. However, limited data availability across cell types has hindered our capacity to leverage or explore the cell specificity of these perturbations. While recent efforts have generated a large number of drug perturbation profiles across a variety of human cell types, many gaps remain in this combinatorial drug-cell space. Hence, we asked whether it is possible to fill these gaps by predicting cell-specific drug perturbation profiles using available expression data from related conditions -- i.e. from other drugs and cell types. We developed a computational framework that first arranges existing profiles into a three-dimensional array (or tensor) indexed by drugs, genes, and cell types, and then uses either local (nearest-neighbors) or global (tensor completion) information to predict unmeasured profiles. We evaluate prediction accuracy using a variety of metrics, and find that the two methods have complementary performance, each superior in different regions in the drug-cell space. Predictions achieve correlations of 0.68 with true values, and maintain accurate differentially expressed genes (AUC 0.81). Finally, we demonstrate that the predicted profiles add value for making downstream associations with drug targets and therapeutic classes.}\n}\n\n\n
\n Gene expression profiling of in vitro drug perturbations is useful for many biomedical discovery applications including drug repurposing and elucidation of drug mechanisms. However, limited data availability across cell types has hindered our capacity to leverage or explore the cell specificity of these perturbations. While recent efforts have generated a large number of drug perturbation profiles across a variety of human cell types, many gaps remain in this combinatorial drug-cell space. Hence, we asked whether it is possible to fill these gaps by predicting cell-specific drug perturbation profiles using available expression data from related conditions – i.e. from other drugs and cell types. We developed a computational framework that first arranges existing profiles into a three-dimensional array (or tensor) indexed by drugs, genes, and cell types, and then uses either local (nearest-neighbors) or global (tensor completion) information to predict unmeasured profiles. We evaluate prediction accuracy using a variety of metrics, and find that the two methods have complementary performance, each superior in different regions in the drug-cell space. Predictions achieve correlations of 0.68 with true values, and maintain accurate differentially expressed genes (AUC 0.81). Finally, we demonstrate that the predicted profiles add value for making downstream associations with drug targets and therapeutic classes.\n
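A toy sketch of the setup and of a simple local predictor, with all sizes, noise levels, and the two-way-average rule invented: profiles live in a drugs x cell-types x genes tensor with missing (drug, cell) slices, and a missing profile is predicted from the same drug in other cells and other drugs in the same cell. The paper's nearest-neighbor variant is more careful, and its global variant fits a low-rank tensor-completion model instead.

import numpy as np

rng = np.random.default_rng(0)
D, C, G = 8, 5, 100                            # drugs x cell types x genes (toy sizes)
truth = (rng.normal(size=(D, 1, G))            # drug effect
         + 0.5 * rng.normal(size=(1, C, G)))   # cell-specific modulation
mask = rng.random((D, C)) < 0.7                # which (drug, cell) profiles were measured
tensor = np.where(mask[:, :, None], truth + 0.1 * rng.normal(size=(D, C, G)), np.nan)

def predict_profile(d, c):
    """Two-way neighbor average for the missing profile of drug d in cell c:
    the drug's profiles in other cells, plus the cell's profiles of other drugs,
    minus the grand mean. A simple stand-in for the paper's local method."""
    drug_mean = np.nanmean(tensor[d], axis=0)       # over the drug's measured cells
    cell_mean = np.nanmean(tensor[:, c], axis=0)    # over the cell's measured drugs
    grand = np.nanmean(tensor, axis=(0, 1))
    return drug_mean + cell_mean - grand

d, c = np.argwhere(~mask)[0]                        # some unmeasured condition
pred = predict_profile(d, c)
print("corr with truth:", np.corrcoef(pred, truth[d, c])[0, 1])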
Probabilistic Modeling of Systematic Errors in Two-Hybrid Experiments. Sontag, D.; Singh, R.; and Berger, B. In Pacific Symposium on Biocomputing, volume 12, pages 445–457, 2007.
\n\n\n\n \n \n \"Probabilistic paper\n  \n \n \n \"Probabilistic link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SonSinBer_psb07,\n title  = {Probabilistic Modeling of Systematic Errors in Two-Hybrid Experiments},\n author = {David Sontag and Rohit Singh and Bonnie Berger},\n booktitle = {Pacific Symposium on Biocomputing},\n volume  = {12},\n year   = {2007},\n pages  = {445-457},\n keywords = {Computational biology, Health care},\n url_Paper = {http://psb.stanford.edu/psb-online/proceedings/psb07/sontag.pdf},\n url_Link = {http://groups.csail.mit.edu/cb/probmod2H/},\n abstract = {We describe a novel probabilistic approach to estimating errors in two-hybrid (2H) experiments. Such experiments are frequently used to elucidate protein-protein interaction networks in a high-throughput fashion; however, a significant challenge with these is their relatively high error rate, specifically, a high false-positive rate. We describe a comprehensive error model for 2H data, accounting for both random\nand systematic errors. The latter arise from limitations of the 2H experimental protocol: in theory, the reporting mechanism of a 2H experiment should be activated if and only if the two proteins being tested truly interact; in practice, even in the absence of a true interaction, it may be activated by some proteins -- either by themselves or through promiscuous interaction with other proteins. We describe a probabilistic relational model that explicitly models the above phenomenon and use Markov Chain Monte Carlo (MCMC) algorithms to compute both the probability of an observed 2H interaction being true as well as the probability of individual proteins being self-activating/promiscuous. This is the first approach that explicitly models systematic errors in protein-protein interaction data; in contrast, previous work on this topic has modeled errors as being independent and random. By explicitly modeling the sources of noise in 2H systems, we find that we are better able to make use of the available experimental data. In comparison with Bader et al.’s method for estimating confidence in 2H predicted interactions, the proposed method performed 5-10\\% better overall, and in particular regimes improved prediction accuracy by as much as 76\\%.}\n}\n\n
\n We describe a novel probabilistic approach to estimating errors in two-hybrid (2H) experiments. Such experiments are frequently used to elucidate protein-protein interaction networks in a high-throughput fashion; however, a significant challenge with these is their relatively high error rate, specifically, a high false-positive rate. We describe a comprehensive error model for 2H data, accounting for both random and systematic errors. The latter arise from limitations of the 2H experimental protocol: in theory, the reporting mechanism of a 2H experiment should be activated if and only if the two proteins being tested truly interact; in practice, even in the absence of a true interaction, it may be activated by some proteins – either by themselves or through promiscuous interaction with other proteins. We describe a probabilistic relational model that explicitly models the above phenomenon and use Markov Chain Monte Carlo (MCMC) algorithms to compute both the probability of an observed 2H interaction being true as well as the probability of individual proteins being self-activating/promiscuous. This is the first approach that explicitly models systematic errors in protein-protein interaction data; in contrast, previous work on this topic has modeled errors as being independent and random. By explicitly modeling the sources of noise in 2H systems, we find that we are better able to make use of the available experimental data. In comparison with Bader et al.’s method for estimating confidence in 2H predicted interactions, the proposed method performed 5-10% better overall, and in particular regimes improved prediction accuracy by as much as 76%.\n
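To illustrate the flavor of the inference, here is a toy Metropolis sampler over self-activator indicators under an invented noisy-OR observation model. Everything here is a simplification: the rates, priors, and graph sizes are made up, and the sketch conditions on the true interaction graph to stay short, whereas the paper samples the unknown graph jointly with the activator indicators.

import numpy as np

rng = np.random.default_rng(1)
P = 30
true_edge = np.triu(rng.random((P, P)) < 0.05, 1)
true_edge = true_edge | true_edge.T                  # sparse symmetric interactions
true_self = rng.random(P) < 0.1                      # promiscuous / self-activating

# Noisy-OR: a positive 2H readout fires from a true edge, from either protein
# being a self-activator, or from a small leak probability.
p_edge, p_self, leak, prior_self = 0.8, 0.6, 0.01, 0.1

def p_pos(edge, si, sj):
    return 1 - (1 - leak) * (1 - p_edge) ** edge * (1 - p_self) ** (si + sj)

obs = rng.random((P, P)) < p_pos(true_edge, true_self[:, None], true_self[None, :])
obs = np.triu(obs, 1); obs = obs | obs.T
iu = np.triu_indices(P, 1)

def logpost(s):
    pp = p_pos(true_edge, s[:, None], s[None, :])    # graph held fixed, for brevity
    like = np.log(np.where(obs, pp, 1 - pp))[iu].sum()
    prior = (s * np.log(prior_self) + (1 - s) * np.log(1 - prior_self)).sum()
    return like + prior

s = np.zeros(P, dtype=int)
cur, freq = logpost(s), np.zeros(P)
for _ in range(4000):                                # Metropolis over indicators
    i = rng.integers(P)
    s[i] ^= 1
    prop = logpost(s)
    if np.log(rng.random()) < prop - cur:
        cur = prop                                   # accept the flip
    else:
        s[i] ^= 1                                    # reject: undo
    freq += s

print(np.round(freq / 4000, 2))                      # posterior P(self-activator)
print(true_self.astype(int))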
Computer networking (1)
Scaling All-Pairs Overlay Routing. Sontag, D.; Zhang, Y.; Phanishayee, A.; Andersen, D.; and Karger, D. In CoNEXT '09: Proceedings of the 5th International Conference on Emerging Networking Experiments and Technologies, pages 145–156, New York, NY, USA, 2009. ACM.
\n\n\n\n \n \n \"Scaling paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{SontagEtAl_conext09,
  title  = {Scaling All-Pairs Overlay Routing},
  author = {David Sontag and Yang Zhang and Amar Phanishayee and David Andersen and David Karger},
  booktitle = {CoNEXT '09: Proceedings of the 5th International Conference on Emerging Networking Experiments and Technologies},
  isbn = {978-1-60558-636-6},
  pages = {145--156},
  location = {Rome, Italy},
  publisher = {ACM},
  address = {New York, NY, USA},
  doi = {http://doi.acm.org/10.1145/1658939.1658956},
  year = {2009},
  keywords = {Computer networking},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_conext09.pdf},
  abstract = {This paper presents and experimentally evaluates a new algorithm for efficient one-hop link-state routing in full-mesh networks. Prior techniques for this setting scale poorly, as each node incurs quadratic (n^2) communication overhead to broadcast its link state to all other nodes. In contrast, in our algorithm each node exchanges routing state with only a small subset of overlay nodes determined by using a quorum system. Using a two-round protocol, each node can find an optimal one-hop path to any other node using only n^1.5 per-node communication. Our algorithm can also be used to find the optimal shortest path of arbitrary length using only n^1.5 log(n) per-node communication. The algorithm is designed to be resilient to both node and link failures. We apply this algorithm to a Resilient Overlay Network (RON) system, and evaluate the results using a large-scale, globally distributed set of Internet hosts. The reduced communication overhead from using our improved full-mesh algorithm allows the creation of all-pairs routing overlays that scale to hundreds of nodes, without reducing the system's ability to rapidly find optimal routes.}
}
This paper presents and experimentally evaluates a new algorithm for efficient one-hop link-state routing in full-mesh networks. Prior techniques for this setting scale poorly, as each node incurs quadratic (n^2) communication overhead to broadcast its link state to all other nodes. In contrast, in our algorithm each node exchanges routing state with only a small subset of overlay nodes determined by using a quorum system. Using a two-round protocol, each node can find an optimal one-hop path to any other node using only n^1.5 per-node communication. Our algorithm can also be used to find the optimal shortest path of arbitrary length using only n^1.5 log(n) per-node communication. The algorithm is designed to be resilient to both node and link failures. We apply this algorithm to a Resilient Overlay Network (RON) system, and evaluate the results using a large-scale, globally distributed set of Internet hosts. The reduced communication overhead from using our improved full-mesh algorithm allows the creation of all-pairs routing overlays that scale to hundreds of nodes, without reducing the system's ability to rapidly find optimal routes.
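A runnable sketch of the grid-quorum rendezvous idea under invented simplifications (symmetric latencies, a perfect-square n, and no failures); the real protocol also handles failures and the multi-hop extension. Each node's quorum is its row plus its column, so any two quorums intersect, and a rendezvous node holding both endpoints' link-state vectors can score every relay.

import numpy as np

rng = np.random.default_rng(0)
k = 5
n = k * k                                     # n nodes on a sqrt(n) x sqrt(n) grid
lat = rng.random((n, n)) * 100
lat = (lat + lat.T) / 2                       # symmetric link latencies (assumption)
np.fill_diagonal(lat, 0)

def quorum(u):
    """Row plus column of u: O(sqrt(n)) nodes, and any two quorums intersect."""
    r, c = divmod(u, k)
    return set(range(r * k, r * k + k)) | set(range(c, n, k))

# Round 1: every node pushes its link-state vector only to its quorum, so each
# node sends O(n^1.5) state in total rather than broadcasting to all n nodes.
state_at = {w: {} for w in range(n)}
for u in range(n):
    for w in quorum(u):
        state_at[w][u] = lat[u, :]

def best_one_hop(u, v):
    """Round 2: a rendezvous node in quorum(u) & quorum(v) holds both vectors
    and evaluates lat[u, r] + lat[r, v] for every relay r locally."""
    w = min(quorum(u) & quorum(v))            # intersection is guaranteed non-empty
    costs = state_at[w][u] + state_at[w][v]
    r = int(costs.argmin())
    return float(costs[r]), r

u, v = 3, 17
print(best_one_hop(u, v))
print(float((lat[u, :] + lat[:, v]).min()))   # matches the brute-force optimum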
Computer vision (1)
Instance Segmentation of Indoor Scenes using a Coverage Loss. Silberman, N.; Sontag, D.; and Fergus, R. In Fleet, D. J.; Pajdla, T.; Schiele, B.; and Tuytelaars, T., editors, Proceedings of the 13th European Conference on Computer Vision (ECCV), volume 8689 of Lecture Notes in Computer Science, pages 616–631, 2014. Springer.
\n\n\n\n \n \n \"Instance paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SilSonFer_ECCV14,\n  author    = {Nathan Silberman and David Sontag and Rob Fergus},\n  title     = {Instance Segmentation of Indoor Scenes using a Coverage Loss},\n  booktitle = {Proceedings of the 13th European Conference on Computer Vision (ECCV)},\n  series    = {Lecture Notes in Computer Science},\n  volume    = {8689},\n  publisher = {Springer},\n  editor    = {David J. Fleet and\n               Tom{\\'{a}}s Pajdla and\n               Bernt Schiele and\n               Tinne Tuytelaars},\n  pages     = {616--631},\n  year      = {2014},\n  keywords = {Computer vision, Machine learning},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/SilSonFer_ECCV14.pdf},\n  abstract = {A major limitation of existing models for semantic segmentation is the inability to identify individual instances of the same class: when labeling pixels with only semantic classes, a set of pixels with the same label could represent a single object or ten. In this work, we introduce a model to perform both semantic and instance segmentation simultaneously. We introduce a new higher-order loss function that directly minimizes the coverage metric and evaluate a variety of region features, including those from a convolutional network. We apply our model to the NYU Depth V2 dataset, obtaining state of the art results.}\n}\n\n
\n A major limitation of existing models for semantic segmentation is the inability to identify individual instances of the same class: when labeling pixels with only semantic classes, a set of pixels with the same label could represent a single object or ten. In this work, we introduce a model to perform both semantic and instance segmentation simultaneously. We introduce a new higher-order loss function that directly minimizes the coverage metric and evaluate a variety of region features, including those from a convolutional network. We apply our model to the NYU Depth V2 dataset, obtaining state of the art results.\n
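For reference, here is the coverage metric itself, assuming integer instance-label maps with 0 reserved for background (a convention chosen here, not taken from the paper): each ground-truth instance contributes its best overlap with any predicted instance, weighted by size. The paper's contribution is a higher-order loss that directly targets this quantity during structured learning.

import numpy as np

def coverage(gt, pred):
    """Size-weighted mean, over GT instances, of the best IoU with any
    predicted instance. gt and pred are integer label maps; 0 = background."""
    total, covered = 0, 0.0
    for g in np.unique(gt):
        if g == 0:
            continue
        gmask = gt == g
        best = max(((gmask & (pred == p)).sum() / (gmask | (pred == p)).sum()
                    for p in np.unique(pred[gmask])), default=0.0)
        covered += gmask.sum() * best
        total += gmask.sum()
    return covered / max(total, 1)

gt = np.zeros((8, 8), int); gt[:4, :4] = 1; gt[4:, 4:] = 2
pred = np.zeros((8, 8), int); pred[:4, :5] = 7; pred[5:, 4:] = 9
print(round(coverage(gt, pred), 3))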
Deep learning (15)
Max-margin learning with the Bayes Factor. Krishnan, R. G.; Khandelwal, A.; Ranganath, R.; and Sontag, D. In Proceedings of the Conference on Uncertainty in Artificial Intelligence (UAI), 2018.
\n\n\n\n \n \n \"Max-margin paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KrishnanEtAl_uai18,\n  author = {Rahul G. Krishnan and Arjun Khandelwal and Rajesh Ranganath and David Sontag},\n  title = {Max-margin learning with the Bayes Factor},\n  booktitle = {Proceedings of the Conference on Uncertainty in Artificial Intelligence ({UAI})},\n  year = {2018},\n  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},\n  abstract = {We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network, that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_UAI18.pdf}\n}\n\n
\n We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network, that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.\n
Semi-Amortized Variational Autoencoders. Kim, Y.; Wiseman, S.; Miller, A. C.; Sontag, D.; and Rush, A. M. In Proceedings of the 35th International Conference on Machine Learning (ICML), 2018.
\n\n\n\n \n \n \"Semi-Amortized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KimEtAl_icml18,\n  author    = {Yoon Kim and Sam Wiseman and Andrew C. Miller and David Sontag and Alexander M. Rush},\n  title = {Semi-Amortized Variational Autoencoders},\n  booktitle = {Proceedings of the 35th International Conference on Machine Learning ({ICML})},\n  year = 2018,\n  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},\n  url_Paper = {https://arxiv.org/pdf/1802.02550.pdf},\n  abstract = {Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach, to use AVI to initialize the variational parameters and run stochastic variational inference (SVI) to refine them. Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.}\n}\n\n
Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach: AVI is used to initialize the variational parameters, which are then refined with stochastic variational inference (SVI). Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.
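A minimal sketch of the semi-amortized step on a toy Gaussian VAE (layer sizes, step size, and the number of refinement steps are invented): the encoder proposes variational parameters, a few gradient steps on the per-instance ELBO refine them, and create_graph=True keeps the refinement differentiable so the encoder and decoder receive end-to-end gradients.

import torch
import torch.nn as nn

d_x, d_z = 20, 4
enc = nn.Linear(d_x, 2 * d_z)                               # inference network
dec = nn.Linear(d_z, d_x)                                   # generative model

def elbo(x, mu, logvar):
    z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()    # reparameterization
    rec = -((x - dec(z)) ** 2).sum(1)                       # Gaussian likelihood, unit variance
    kl = 0.5 * (mu ** 2 + logvar.exp() - logvar - 1).sum(1)
    return (rec - kl).mean()

x = torch.randn(16, d_x)
mu, logvar = enc(x).chunk(2, dim=1)                         # AVI: amortized initialization
for _ in range(4):                                          # SVI: local refinement
    loss = -elbo(x, mu, logvar)
    g_mu, g_lv = torch.autograd.grad(loss, (mu, logvar), create_graph=True)
    mu, logvar = mu - 0.1 * g_mu, logvar - 0.1 * g_lv

(-elbo(x, mu, logvar)).backward()   # gradients flow to enc and dec through the refinement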
Learning Weighted Representations for Generalization Across Designs. Johansson, F. D.; Kallus, N.; Shalit, U.; and Sontag, D. ArXiv e-prints arXiv:1802.08598. 2018.
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JohanssonEtAl_arxiv18,\n  author    = {Fredrik D. Johansson and Nathan Kallus and Uri Shalit and David Sontag},\n  title = {Learning Weighted Representations for Generalization Across Designs},\n  journal = {ArXiv e-prints arXiv:1802.08598},\narchivePrefix = "arXiv",\n   eprint = {1802.08598},\n primaryClass = "stat.ML",\n     year = 2018,\n keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1802.08598.pdf},\n  abstract = {Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.}\n}\n\n
\n Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.\n
Recurrent Neural Networks for Multivariate Time Series with Missing Values. Che, Z.; Purushotham, S.; Cho, K.; Sontag, D.; and Liu, Y. Nature Scientific Reports, 8(1): 6085. 2018.
\n\n\n\n \n \n \"Recurrent paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{CheEtAl_nature_sr18,\n\tAuthor = {Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},\n\tJournal = {Nature Scientific Reports},\n\tNumber = {1},\n\tPages = {6085},\n\tTitle = {Recurrent Neural Networks for Multivariate Time Series with Missing Values},\n\tVolume = {8},\n\tYear = {2018},\n        keywords = {Health care, Machine learning, Deep learning},\n        url_Paper = {https://www.nature.com/articles/s41598-018-24271-9},\n\tabstract = {Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.},\n}\n\n
\n Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.\n
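The core GRU-D ingredient is easy to state in code: a per-feature input decay, learned from the time gap since each feature was last observed, that pulls missing values from the last observation toward the empirical mean. Sizes and data below are invented; this shows only the input-decay step, and the full model applies a similar decay to the hidden state.

import torch
import torch.nn as nn

d = 6
w = nn.Parameter(torch.zeros(d))                 # learned per-feature decay rates
b = nn.Parameter(torch.zeros(d))
x_mean = torch.zeros(d)                          # training-set feature means

def decay_impute(x_t, m_t, x_last, delta_t):
    """m_t marks observed entries; delta_t is time since each was last observed."""
    gamma = torch.exp(-torch.relu(w * delta_t + b))      # decay in (0, 1]
    x_hat = gamma * x_last + (1 - gamma) * x_mean        # decayed imputation
    return m_t * x_t + (1 - m_t) * x_hat                 # feed into a standard GRU cell

x_t = torch.randn(d)
m_t = (torch.rand(d) < 0.5).float()
x_last = torch.randn(d)
delta_t = torch.rand(d) * 5
print(decay_impute(x_t, m_t, x_last, delta_t))
# The mask m_t is also concatenated to the GRU input, so that missingness itself
# ("informative missingness") is available as a predictive signal.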
Causal Effect Inference with Deep Latent-Variable Models. Louizos, C.; Shalit, U.; Mooij, J.; Sontag, D.; Zemel, R. S.; and Welling, M. In Proceedings of the 31st International Conference on Neural Information Processing Systems (NIPS'17), 2017.
\n\n\n\n \n \n \"Causal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 12 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LouizosEtAl_arxiv17,\n  author    = {Christos Louizos and\n               Uri Shalit and\n               Joris Mooij and\n               David Sontag and\n               Richard S. Zemel and\n               Max Welling},\n  title     = {Causal Effect Inference with Deep Latent-Variable Models},\n booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n series = {NIPS'17},\n year = {2017},\n keywords = {Machine learning, Causal inference, Deep learning},\n url_Paper = {https://arxiv.org/pdf/1705.08821.pdf},\n abstract = {Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.}\n}\n\n
\n Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.\n
Grounded Recurrent Neural Networks. Vani, A.; Jernite, Y.; and Sontag, D. ArXiv e-prints arXiv:1705.08557. 2017.
\n\n\n\n \n \n \"Grounded paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{VaniEtAl_arxiv17,\n   author = {{Vani}, A. and {Jernite}, Y. and {Sontag}, D.},\n    title = "{Grounded Recurrent Neural Networks}",\n  journal = {ArXiv e-prints arXiv:1705.08557},\narchivePrefix = "arXiv",\n   eprint = {1705.08557},\n primaryClass = "stat.ML",\n     year = 2017,\n  keywords = {Machine learning, Health care, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.08557.pdf},\n  abstract = {In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process "grounding"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.}\n}\n\n
\n In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process \"grounding\"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.\n
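A sketch of the grounding idea only, with all sizes and pooling choices invented: reserve one coordinate of the recurrent state per label and read it out directly through a sigmoid, so each label's evidence must accumulate in its own dimension while the remaining dimensions carry general context.

import torch
import torch.nn as nn

vocab, emb, n_labels, extra = 1000, 32, 10, 54
hidden = n_labels + extra
embed = nn.Embedding(vocab, emb)
rnn = nn.GRU(emb, hidden, batch_first=True)

def predict(tokens):
    h, _ = rnn(embed(tokens))                        # (batch, time, hidden)
    grounded = h[:, :, :n_labels]                    # label-tied coordinates
    return torch.sigmoid(grounded.max(dim=1).values) # pool evidence over time

tokens = torch.randint(0, vocab, (4, 50))
print(predict(tokens).shape)                         # (4, n_labels) multi-label scores
# Training uses per-label binary cross-entropy against the document's label set.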
Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning. Jernite, Y.; Bowman, S. R.; and Sontag, D. arXiv preprint arXiv:1705.00557. 2017.
\n\n\n\n \n \n \"Discourse-Based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JerniteEtAl_arxiv17,\n  title={Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning},\n  author={Jernite, Yacine and Bowman, Samuel R and Sontag, David},\n  journal={arXiv preprint arXiv:1705.00557},\n  year={2017},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.00557.pdf},\n  abstract = {This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.}\n}\n\n\n
\n This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.\n
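One such discriminative objective, sketched with invented names, sizes, and a deliberately crude bag-of-words encoder: encode two sentences and classify whether they were adjacent in the source paragraph. Because no word-by-word decoding is needed, each training step is cheap compared to generation-based objectives.

import torch
import torch.nn as nn

vocab, emb, h = 5000, 64, 128
embed = nn.EmbeddingBag(vocab, emb)              # mean-of-embeddings sentence encoder
encoder = nn.Sequential(nn.Linear(emb, h), nn.Tanh())
clf = nn.Linear(2 * h, 1)                        # "are these sentences consecutive?"

def order_logit(sent_a, sent_b):
    ra, rb = encoder(embed(sent_a)), encoder(embed(sent_b))
    return clf(torch.cat([ra, rb], dim=1))

a = torch.randint(0, vocab, (32, 12))            # fake token ids: 32 sentence pairs
b = torch.randint(0, vocab, (32, 12))
labels = torch.randint(0, 2, (32, 1)).float()    # 1 = truly consecutive in the corpus
loss = nn.functional.binary_cross_entropy_with_logits(order_logit(a, b), labels)
loss.backward()                                  # the trained `encoder` is the product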
Estimating individual treatment effect: generalization bounds and algorithms. Shalit, U.; Johansson, F. D.; and Sontag, D. In Proceedings of the 34th International Conference on Machine Learning, pages 3076–3085, 2017.
@inproceedings{ShalitEtAl_icml17,
  author = {Uri Shalit and Fredrik D. Johansson and David Sontag},
  title = {Estimating individual treatment effect: generalization bounds and algorithms},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  pages = {3076-3085},
  year = {2017},
  keywords = {Machine learning, Causal inference, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1606.03976.pdf},
  abstract = {There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a "balanced" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.}
}
There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a "balanced" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.
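The bound above suggests a practical training objective: factual prediction error plus a weighted distributional distance between treated and control representations. A minimal sketch using the linear-kernel special case of MMD (the squared distance between representation means); the squared outcome loss and the weight alpha are our illustrative choices, not prescribed by the paper.

import torch

def linear_mmd(phi_t, phi_c):
    # linear-kernel MMD^2: squared distance between group means in representation space
    return torch.sum((phi_t.mean(dim=0) - phi_c.mean(dim=0)) ** 2)

def ite_loss(phi, y_hat, y, t, alpha=1.0):
    # phi: learned representations; y_hat: factual outcome predictions; t in {0, 1}
    factual = torch.mean((y_hat - y) ** 2)
    imbalance = linear_mmd(phi[t == 1], phi[t == 0])
    return factual + alpha * imbalance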
Simultaneous Learning of Trees and Representations for Extreme Classification and Density Estimation. Jernite, Y.; Choromanska, A.; and Sontag, D. In Proceedings of the 34th International Conference on Machine Learning, pages 1665-1674, 2017.
@inproceedings{JerniteEtAl_icml17,
  author = {Yacine Jernite and Anna Choromanska and David Sontag},
  title = {Simultaneous Learning of Trees and Representations for Extreme Classification and Density Estimation},
  booktitle = {Proceedings of the 34th International Conference on Machine Learning},
  pages = {1665-1674},
  year = {2017},
  keywords = {Machine learning, Natural language processing, Deep learning},
  url_Paper = {https://arxiv.org/pdf/1610.04658.pdf},
  abstract = {We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it expected that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.}
}
We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it expected that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.
Structured Inference Networks for Nonlinear State Space Models. Krishnan, R. G.; Shalit, U.; and Sontag, D. In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, pages 2101-2109, 2017.
@inproceedings{KrishnanEtAl_aaai17,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Structured Inference Networks for Nonlinear State Space Models},
  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence},
  pages = {2101-2109},
  year = {2017},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Health care, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1609.09869.pdf},
  abstract = {Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.}
}
Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.
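A minimal sketch of the overall recipe: an RNN summarizes the observations, the variational posterior over each latent state conditions on the previous latent state and that summary, and the whole model is trained by maximizing an evidence lower bound. This simplifies the paper considerably (e.g., we run the RNN forward rather than using the paper's smoothing variants, and all networks are single linear layers); it is an illustration, not the authors' implementation.

import torch
import torch.nn as nn
from torch.distributions import Normal, kl_divergence

class DeepStateSpace(nn.Module):
    def __init__(self, x_dim, z_dim, h_dim):
        super().__init__()
        self.rnn = nn.GRU(x_dim, h_dim, batch_first=True)
        self.q_net = nn.Linear(z_dim + h_dim, 2 * z_dim)  # q(z_t | z_{t-1}, x): mu, log-sigma
        self.trans = nn.Linear(z_dim, 2 * z_dim)          # p(z_t | z_{t-1}): mu, log-sigma
        self.emit = nn.Linear(z_dim, x_dim)               # mean of p(x_t | z_t)

    def forward(self, x):                                 # x: (batch, time, x_dim)
        h, _ = self.rnn(x)
        B, T, _ = x.shape
        z = torch.zeros(B, self.trans.in_features, device=x.device)
        elbo = 0.0
        for t in range(T):
            q_mu, q_ls = self.q_net(torch.cat([z, h[:, t]], dim=-1)).chunk(2, dim=-1)
            p_mu, p_ls = self.trans(z).chunk(2, dim=-1)
            q, p = Normal(q_mu, q_ls.exp()), Normal(p_mu, p_ls.exp())
            z = q.rsample()                               # reparameterized sample
            elbo += Normal(self.emit(z), 1.0).log_prob(x[:, t]).sum(-1)
            elbo -= kl_divergence(q, p).sum(-1)
        return -elbo.mean()                               # minimize the negative ELBO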
Multi-task Prediction of Disease Onsets from Longitudinal Laboratory Tests. Razavian, N.; Marcus, J.; and Sontag, D. In Doshi-Velez, F.; Fackler, J.; Kale, D.; Wallace, B.; and Wiens, J., editors, Proceedings of the 1st Machine Learning for Healthcare Conference, volume 56 of Proceedings of Machine Learning Research, pages 73-100, 2016. PMLR.
@inproceedings{RazavianEtAl_mlhc16,
  title = {Multi-task Prediction of Disease Onsets from Longitudinal Laboratory Tests},
  author = {Narges Razavian and Jake Marcus and David Sontag},
  booktitle = {Proceedings of the 1st Machine Learning for Healthcare Conference},
  pages = {73-100},
  year = {2016},
  editor = {Finale Doshi-Velez and Jim Fackler and David Kale and Byron Wallace and Jenna Wiens},
  volume = {56},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  keywords = {Health care, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1608.00647.pdf},
  abstract = {Disparate areas of machine learning have benefited from models that can take raw data with little preprocessing as input and learn rich representations of that raw data in order to perform well on a given prediction task. We evaluate this approach in healthcare by using longitudinal measurements of lab tests, one of the more raw signals of a patient's health state widely available in clinical data, to predict disease onsets. In particular, we train a Long Short-Term Memory (LSTM) recurrent neural network and two novel convolutional neural networks for multi-task prediction of disease onset for 133 conditions based on 18 common lab tests measured over time in a cohort of 298K patients derived from 8 years of administrative claims data. We compare the neural networks to a logistic regression with several hand-engineered, clinically relevant features. We find that the representation-based learning approaches significantly outperform this baseline. We believe that our work suggests a new avenue for patient risk stratification based solely on lab results.}
}
Disparate areas of machine learning have benefited from models that can take raw data with little preprocessing as input and learn rich representations of that raw data in order to perform well on a given prediction task. We evaluate this approach in healthcare by using longitudinal measurements of lab tests, one of the more raw signals of a patient's health state widely available in clinical data, to predict disease onsets. In particular, we train a Long Short-Term Memory (LSTM) recurrent neural network and two novel convolutional neural networks for multi-task prediction of disease onset for 133 conditions based on 18 common lab tests measured over time in a cohort of 298K patients derived from 8 years of administrative claims data. We compare the neural networks to a logistic regression with several hand-engineered, clinically relevant features. We find that the representation-based learning approaches significantly outperform this baseline. We believe that our work suggests a new avenue for patient risk stratification based solely on lab results.
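The multi-task setup is simple to state in code. A sketch of the LSTM variant, with the input/output dimensions taken from the abstract (18 labs, 133 conditions) but the hidden size and single-layer design chosen by us for illustration:

import torch.nn as nn

class MultiTaskOnset(nn.Module):
    def __init__(self, n_labs=18, hidden=256, n_diseases=133):
        super().__init__()
        self.lstm = nn.LSTM(n_labs, hidden, batch_first=True)
        self.out = nn.Linear(hidden, n_diseases)  # shared trunk, one logit per disease

    def forward(self, x):             # x: (batch, time, n_labs)
        h, _ = self.lstm(x)
        return self.out(h[:, -1])     # train with nn.BCEWithLogitsLoss

Sharing one recurrent trunk across all 133 targets is what makes this "multi-task": rare conditions benefit from representations shaped by the more common ones.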
Character-Aware Neural Language Models. Kim, Y.; Jernite, Y.; Sontag, D.; and Rush, A. M. In Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence, pages 2741-2749, 2016.
@inproceedings{KimEtAl_aaai16,
  author = {Yoon Kim and Yacine Jernite and David Sontag and Alexander M. Rush},
  title = {Character-Aware Neural Language Models},
  booktitle = {Proceedings of the Thirtieth {AAAI} Conference on Artificial Intelligence},
  pages = {2741-2749},
  year = {2016},
  keywords = {Machine learning, Natural language processing, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1508.06615.pdf},
  abstract = {We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60\% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.}
}
We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.
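The highway network is the glue between the character CNN and the LSTM. A sketch of one highway layer following the standard highway formulation (the ReLU nonlinearity here is one common choice; the paper's exact configuration may differ):

import torch
import torch.nn as nn

class Highway(nn.Module):
    def __init__(self, dim):
        super().__init__()
        self.transform = nn.Linear(dim, dim)
        self.gate = nn.Linear(dim, dim)

    def forward(self, y):                  # y: max-pooled character-CNN features
        t = torch.sigmoid(self.gate(y))    # transform gate in (0, 1)
        # gated mix of a transformed path and an untouched carry path
        return t * torch.relu(self.transform(y)) + (1 - t) * y

The carry path lets character n-gram features pass through unchanged when that is what the word-level LSTM needs, which stabilizes training of the composed model.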
Learning Representations for Counterfactual Inference. Johansson, F.; Shalit, U.; and Sontag, D. In Balcan, M. F.; and Weinberger, K. Q., editors, Proceedings of The 33rd International Conference on Machine Learning, volume 48 of Proceedings of Machine Learning Research, pages 3020–3029, New York, New York, USA, 20–22 Jun 2016. PMLR.
@inproceedings{JohanssonEtAl_icml16,
  title = {Learning Representations for Counterfactual Inference},
  author = {Fredrik Johansson and Uri Shalit and David Sontag},
  booktitle = {Proceedings of The 33rd International Conference on Machine Learning},
  pages = {3020--3029},
  year = {2016},
  editor = {Maria Florina Balcan and Kilian Q. Weinberger},
  volume = {48},
  series = {Proceedings of Machine Learning Research},
  address = {New York, New York, USA},
  month = {20--22 Jun},
  publisher = {PMLR},
  keywords = {Machine learning, Causal inference, Deep learning},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JohanssonShalitSontag_icml16.pdf},
  abstract = {Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, "Would this patient have lower blood sugar had she received a different medication?". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.}
}
Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, "Would this patient have lower blood sugar had she received a different medication?". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.
Deep Kalman Filters. Krishnan, R. G.; Shalit, U.; and Sontag, D. arXiv preprint arXiv:1511.05121. 2015.
@article{KriShaSon_arxiv15,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Deep Kalman Filters},
  journal = {arXiv preprint arXiv:1511.05121},
  year = {2015},
  keywords = {Machine learning, Unsupervised learning, Health care, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1511.05121.pdf},
  abstract = {Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the "Healing MNIST" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.}
}
Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the "Healing MNIST" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.
Temporal Convolutional Neural Networks for Diagnosis from Lab Tests. Razavian, N.; and Sontag, D. arXiv preprint arXiv:1511.07938. 2015.
@article{RazavianSontag_arxiv15,
  author = {Narges Razavian and David Sontag},
  title = {Temporal Convolutional Neural Networks for Diagnosis from Lab Tests},
  journal = {arXiv preprint arXiv:1511.07938},
  year = {2015},
  keywords = {Health care, Machine learning, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1511.07938.pdf},
  abstract = {Early diagnosis of treatable diseases is essential for improving healthcare, and many diseases' onsets are predictable from annual lab tests and their temporal trends. We introduce a multi-resolution convolutional neural network for early detection of multiple diseases from irregularly measured sparse lab values. Our novel architecture takes as input both an imputed version of the data and a binary observation matrix. For imputing the temporal sparse observations, we develop a flexible, fast-to-train method for differentiable multivariate kernel regression. Our experiments on data from 298K individuals over 8 years, 18 common lab measurements, and 171 diseases show that the temporal signatures learned via convolution are significantly more predictive than baselines commonly used for early disease diagnosis.}
}
Early diagnosis of treatable diseases is essential for improving healthcare, and many diseases' onsets are predictable from annual lab tests and their temporal trends. We introduce a multi-resolution convolutional neural network for early detection of multiple diseases from irregularly measured sparse lab values. Our novel architecture takes as input both an imputed version of the data and a binary observation matrix. For imputing the temporal sparse observations, we develop a flexible, fast-to-train method for differentiable multivariate kernel regression. Our experiments on data from 298K individuals over 8 years, 18 common lab measurements, and 171 diseases show that the temporal signatures learned via convolution are significantly more predictive than baselines commonly used for early disease diagnosis.
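The "imputed values plus binary observation matrix" input is easy to sketch: stack the two along the channel dimension so the network can learn to distrust imputed entries. This reduces the paper's multi-resolution architecture to a single Conv1d for clarity; dimensions follow the abstract, the rest is our simplification.

import torch
import torch.nn as nn

class LabConvNet(nn.Module):
    def __init__(self, n_labs=18, n_diseases=171, channels=64):
        super().__init__()
        # imputed values and the observed/missing mask share the channel axis
        self.conv = nn.Conv1d(2 * n_labs, channels, kernel_size=3, padding=1)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.out = nn.Linear(channels, n_diseases)

    def forward(self, values, mask):       # each: (batch, n_labs, time)
        x = self.conv(torch.cat([values, mask], dim=1)).relu()
        return self.out(self.pool(x).squeeze(-1))   # one logit per disease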
Health care (31)
Why Is My Classifier Discriminatory? Chen, I.; Johansson, F. D.; and Sontag, D. ArXiv e-prints arXiv:1805.12002. 2018.
@article{ChenJohanssonSontag_arxiv18,
  author = {Irene Chen and Fredrik D. Johansson and David Sontag},
  title = {Why Is My Classifier Discriminatory?},
  journal = {ArXiv e-prints arXiv:1805.12002},
  archivePrefix = {arXiv},
  eprint = {1805.12002},
  primaryClass = {stat.ML},
  year = {2018},
  keywords = {Machine learning, Health care},
  url_Paper = {https://arxiv.org/pdf/1805.12002.pdf},
  abstract = {Recent attempts to achieve fairness in predictive models focus on the balance between fairness and accuracy. In sensitive applications such as healthcare or criminal justice, this trade-off is often undesirable as any increase in prediction error could have devastating consequences. In this work, we argue that the fairness of predictions should be evaluated in the context of the data, and that unfairness induced by inadequate sample sizes or unmeasured predictive variables should be addressed through data collection, rather than by constraining the model. We decompose cost-based metrics of discrimination into bias, variance, and noise, and propose actions aimed at estimating and reducing each term. Finally, we perform case studies on prediction of income, mortality, and review ratings, confirming the value of this analysis. We find that data collection is often a means to reduce discrimination without sacrificing accuracy.}
}
Recent attempts to achieve fairness in predictive models focus on the balance between fairness and accuracy. In sensitive applications such as healthcare or criminal justice, this trade-off is often undesirable as any increase in prediction error could have devastating consequences. In this work, we argue that the fairness of predictions should be evaluated in the context of the data, and that unfairness induced by inadequate sample sizes or unmeasured predictive variables should be addressed through data collection, rather than by constraining the model. We decompose cost-based metrics of discrimination into bias, variance, and noise, and propose actions aimed at estimating and reducing each term. Finally, we perform case studies on prediction of income, mortality, and review ratings, confirming the value of this analysis. We find that data collection is often a means to reduce discrimination without sacrificing accuracy.
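A rough probe in the spirit of this decomposition, not the paper's estimator: fit the same model on training subsamples of increasing size and track each group's held-out error. If a group's error is still falling as data grows, variance (and hence insufficient data) likely dominates that group's share of the discrimination gap; a flat curve points instead toward noise or bias. Function and variable names here are ours.

import numpy as np
from sklearn.base import clone

def group_learning_curve(model, X_tr, y_tr, X_te, y_te, group_te, sizes, seed=0):
    rng = np.random.RandomState(seed)
    curves = {g: [] for g in np.unique(group_te)}
    for n in sizes:
        idx = rng.choice(len(X_tr), size=n, replace=False)
        fitted = clone(model).fit(X_tr[idx], y_tr[idx])
        err = fitted.predict(X_te) != y_te          # zero-one loss per example
        for g in curves:
            curves[g].append(err[group_te == g].mean())
    return curves   # per-group held-out error vs. training-set size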
Recurrent Neural Networks for Multivariate Time Series with Missing Values. Che, Z.; Purushotham, S.; Cho, K.; Sontag, D.; and Liu, Y. Nature Scientific Reports, 8(1): 6085. 2018.
@article{CheEtAl_nature_sr18,
  author = {Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},
  title = {Recurrent Neural Networks for Multivariate Time Series with Missing Values},
  journal = {Nature Scientific Reports},
  volume = {8},
  number = {1},
  pages = {6085},
  year = {2018},
  keywords = {Health care, Machine learning, Deep learning},
  url_Paper = {https://www.nature.com/articles/s41598-018-24271-9},
  abstract = {Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.}
}
Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.
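The "masking and time interval" idea can be illustrated with GRU-D's trainable input decay: an unobserved variable decays from its last measured value toward the empirical mean as the time gap delta grows. This sketches only the input-decay component (the model also decays the hidden state), with our own module boundaries.

import torch
import torch.nn as nn

class InputDecay(nn.Module):
    def __init__(self, n_vars, empirical_mean):
        super().__init__()
        self.w = nn.Parameter(torch.zeros(n_vars))
        self.b = nn.Parameter(torch.zeros(n_vars))
        self.register_buffer("x_mean", empirical_mean)  # per-variable training mean

    def forward(self, x_obs, x_last, mask, delta):
        # gamma in (0, 1]: decay weight from time since the last observation
        gamma = torch.exp(-torch.relu(self.w * delta + self.b))
        x_hat = gamma * x_last + (1 - gamma) * self.x_mean
        return mask * x_obs + (1 - mask) * x_hat     # use the real value when observed

Because gamma is learned per variable, the model can decide which labs stay informative for hours and which go stale almost immediately.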
Cell-specific prediction and application of drug-induced gene expression profiles. Hodos, R.; Zhang, P.; Lee, H. C.; Duan, Q.; Wang, Z.; Clark, N. R.; Ma'ayan, A.; Wang, F.; Kidd, B.; Hu, J.; Sontag, D.; and Dudley, J. In Proceedings of the Pacific Symposium on Biocomputing (PSB), 2018.
@inproceedings{HodosEtAl_PSB17,
  author = {Rachel Hodos and Ping Zhang and Hao Chih Lee and Qiaonan Duan and Zichen Wang and Neil R. Clark and Avi Ma'ayan and Fei Wang and Brian Kidd and Jianying Hu and David Sontag and Joel Dudley},
  title = {Cell-specific prediction and application of drug-induced gene expression profiles},
  booktitle = {Proceedings of the Pacific Symposium on Biocomputing (PSB)},
  year = {2018},
  keywords = {Computational biology, Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HodosEtAl_PSB18.pdf},
  abstract = {Gene expression profiling of in vitro drug perturbations is useful for many biomedical discovery applications including drug repurposing and elucidation of drug mechanisms. However, limited data availability across cell types has hindered our capacity to leverage or explore the cell specificity of these perturbations. While recent efforts have generated a large number of drug perturbation profiles across a variety of human cell types, many gaps remain in this combinatorial drug-cell space. Hence, we asked whether it is possible to fill these gaps by predicting cell-specific drug perturbation profiles using available expression data from related conditions -- i.e. from other drugs and cell types. We developed a computational framework that first arranges existing profiles into a three-dimensional array (or tensor) indexed by drugs, genes, and cell types, and then uses either local (nearest-neighbors) or global (tensor completion) information to predict unmeasured profiles. We evaluate prediction accuracy using a variety of metrics, and find that the two methods have complementary performance, each superior in different regions in the drug-cell space. Predictions achieve correlations of 0.68 with true values, and maintain accurate differentially expressed genes (AUC 0.81). Finally, we demonstrate that the predicted profiles add value for making downstream associations with drug targets and therapeutic classes.}
}
Gene expression profiling of in vitro drug perturbations is useful for many biomedical discovery applications including drug repurposing and elucidation of drug mechanisms. However, limited data availability across cell types has hindered our capacity to leverage or explore the cell specificity of these perturbations. While recent efforts have generated a large number of drug perturbation profiles across a variety of human cell types, many gaps remain in this combinatorial drug-cell space. Hence, we asked whether it is possible to fill these gaps by predicting cell-specific drug perturbation profiles using available expression data from related conditions, i.e. from other drugs and cell types. We developed a computational framework that first arranges existing profiles into a three-dimensional array (or tensor) indexed by drugs, genes, and cell types, and then uses either local (nearest-neighbors) or global (tensor completion) information to predict unmeasured profiles. We evaluate prediction accuracy using a variety of metrics, and find that the two methods have complementary performance, each superior in different regions in the drug-cell space. Predictions achieve correlations of 0.68 with true values, and maintain accurate differentially expressed genes (AUC 0.81). Finally, we demonstrate that the predicted profiles add value for making downstream associations with drug targets and therapeutic classes.
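A toy version of the local (nearest-neighbors) strategy on such a tensor: predict a missing (drug, cell) profile from the same drug's profiles in other cell types, weighted by cell-cell similarity computed over co-observed drugs. The similarity weighting and all names below are our simplifications, not the paper's exact method.

import numpy as np

def predict_profile(T, obs, d, c):
    # T: (drugs, cells, genes) tensor of profiles; obs: boolean (drugs, cells) mask
    sims, profiles = [], []
    for c2 in range(T.shape[1]):
        if c2 == c or not obs[d, c2]:
            continue
        shared = obs[:, c] & obs[:, c2]          # drugs measured in both cell types
        if shared.sum() < 2:
            continue
        a, b = T[shared, c].ravel(), T[shared, c2].ravel()
        sims.append(np.corrcoef(a, b)[0, 1])     # cell-cell similarity
        profiles.append(T[d, c2])
    w = np.maximum(np.array(sims), 0)            # ignore anti-correlated cells
    return np.average(profiles, axis=0, weights=w + 1e-8)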
Learning a Health Knowledge Graph from Electronic Medical Records. Rotmensch, M.; Halpern, Y.; Tlimat, A.; Horng, S.; and Sontag, D. Nature Scientific Reports, 7(1): 5994. 2017.
@article{rotmensch_nature_sr17,
  author = {Rotmensch, Maya and Halpern, Yoni and Tlimat, Abdulhakim and Horng, Steven and Sontag, David},
  title = {Learning a Health Knowledge Graph from Electronic Medical Records},
  journal = {Nature Scientific Reports},
  volume = {7},
  number = {1},
  pages = {5994},
  year = {2017},
  keywords = {Health care},
  url_Paper = {https://www.nature.com/articles/s41598-017-05778-z.pdf},
  abstract = {Demand for clinical decision support systems in medicine and self-diagnostic symptom checkers has substantially increased in recent years. Existing platforms rely on knowledge bases manually compiled through a labor-intensive process or automatically derived using simple pairwise statistics. This study explored an automated process to learn high quality knowledge bases linking diseases and symptoms directly from electronic medical records. Medical concepts were extracted from 273,174 deidentified patient records and maximum likelihood estimation of three probabilistic models was used to automatically construct knowledge graphs: logistic regression, naive Bayes classifier and a Bayesian network using noisy OR gates. A graph of disease-symptom relationships was elicited from the learned parameters and the constructed knowledge graphs were evaluated and validated, with permission, against Google's manually-constructed knowledge graph and against expert physician opinions. Our study shows that direct and automated construction of high quality health knowledge graphs from medical records using rudimentary concept extraction is feasible. The noisy OR model produces a high quality knowledge graph reaching precision of 0.85 for a recall of 0.6 in the clinical evaluation. Noisy OR significantly outperforms all tested models across evaluation frameworks (p<0.01).}
}
Demand for clinical decision support systems in medicine and self-diagnostic symptom checkers has substantially increased in recent years. Existing platforms rely on knowledge bases manually compiled through a labor-intensive process or automatically derived using simple pairwise statistics. This study explored an automated process to learn high quality knowledge bases linking diseases and symptoms directly from electronic medical records. Medical concepts were extracted from 273,174 deidentified patient records and maximum likelihood estimation of three probabilistic models was used to automatically construct knowledge graphs: logistic regression, naive Bayes classifier and a Bayesian network using noisy OR gates. A graph of disease-symptom relationships was elicited from the learned parameters and the constructed knowledge graphs were evaluated and validated, with permission, against Google's manually-constructed knowledge graph and against expert physician opinions. Our study shows that direct and automated construction of high quality health knowledge graphs from medical records using rudimentary concept extraction is feasible. The noisy OR model produces a high quality knowledge graph reaching precision of 0.85 for a recall of 0.6 in the clinical evaluation. Noisy OR significantly outperforms all tested models across evaluation frameworks (p<0.01).
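For readers unfamiliar with the winning model: a noisy-OR gate assumes each present disease independently fails to cause the symptom with its own failure probability, plus a leak term for causes outside the model. A minimal sketch (parameter names are ours):

import numpy as np

def p_symptom(diseases_present, fail_probs, leak=0.01):
    # P(symptom = 1 | diseases) = 1 - (1 - leak) * prod of failure probs
    # over the diseases that are present; with no disease present it equals leak.
    return 1.0 - (1.0 - leak) * np.prod(fail_probs[diseases_present])

The learned failure probabilities directly yield edge weights for the disease-symptom graph, which is why this model lends itself to knowledge-graph extraction.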
Grounded Recurrent Neural Networks. Vani, A.; Jernite, Y.; and Sontag, D. ArXiv e-prints arXiv:1705.08557. 2017.
@article{VaniEtAl_arxiv17,
  author = {Vani, A. and Jernite, Y. and Sontag, D.},
  title = {Grounded Recurrent Neural Networks},
  journal = {ArXiv e-prints arXiv:1705.08557},
  archivePrefix = {arXiv},
  eprint = {1705.08557},
  primaryClass = {stat.ML},
  year = {2017},
  keywords = {Machine learning, Health care, Natural language processing, Deep learning},
  url_Paper = {https://arxiv.org/pdf/1705.08557.pdf},
  abstract = {In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process "grounding"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.}
}
In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process "grounding"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.
Structured Inference Networks for Nonlinear State Space Models. Krishnan, R. G.; Shalit, U.; and Sontag, D. In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, pages 2101-2109, 2017.
@inproceedings{KrishnanEtAl_aaai17,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Structured Inference Networks for Nonlinear State Space Models},
  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence},
  pages = {2101-2109},
  year = {2017},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Health care, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1609.09869.pdf},
  abstract = {Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.}
}
Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.
Electronic phenotyping with APHRODITE and the Observational Health Sciences and Informatics (OHDSI) data network. Banda, J. M.; Halpern, Y.; Sontag, D.; and Shah, N. H. In Proceedings of the AMIA Summit on Clinical Research Informatics (CRI), 2017.
@inproceedings{banda_cri17,
  author = {Juan M. Banda and Yoni Halpern and David Sontag and Nigam H. Shah},
  title = {Electronic phenotyping with APHRODITE and the Observational Health Sciences and Informatics ({OHDSI}) data network},
  booktitle = {Proceedings of the AMIA Summit on Clinical Research Informatics (CRI)},
  year = {2017},
  keywords = {Health care},
  url_Paper = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5543379/pdf/2611061.pdf},
  abstract = {The widespread usage of electronic health records (EHRs) for clinical research has produced multiple electronic phenotyping approaches. Methods for electronic phenotyping range from those needing extensive specialized medical expert supervision to those based on semi-supervised learning techniques. We present Automated PHenotype Routine for Observational Definition, Identification, Training and Evaluation (APHRODITE), an R package phenotyping framework that combines noisy labeling and anchor learning. APHRODITE makes these cutting-edge phenotyping approaches available for use with the Observational Health Data Sciences and Informatics (OHDSI) data model for standardized and scalable deployment. APHRODITE uses EHR data available in the OHDSI Common Data Model to build classification models for electronic phenotyping. We demonstrate the utility of APHRODITE by comparing its performance versus traditional rule-based phenotyping approaches. Finally, the resulting phenotype models and model construction workflows built with APHRODITE can be shared between multiple OHDSI sites. Such sharing allows their application on large and diverse patient populations.}
}
The widespread usage of electronic health records (EHRs) for clinical research has produced multiple electronic phenotyping approaches. Methods for electronic phenotyping range from those needing extensive specialized medical expert supervision to those based on semi-supervised learning techniques. We present Automated PHenotype Routine for Observational Definition, Identification, Training and Evaluation (APHRODITE), an R package phenotyping framework that combines noisy labeling and anchor learning. APHRODITE makes these cutting-edge phenotyping approaches available for use with the Observational Health Data Sciences and Informatics (OHDSI) data model for standardized and scalable deployment. APHRODITE uses EHR data available in the OHDSI Common Data Model to build classification models for electronic phenotyping. We demonstrate the utility of APHRODITE by comparing its performance versus traditional rule-based phenotyping approaches. Finally, the resulting phenotype models and model construction workflows built with APHRODITE can be shared between multiple OHDSI sites. Such sharing allows their application on large and diverse patient populations.
Objective Assessment of Depressive Symptoms with Machine Learning and Wearable Sensors Data. Ghandeharioun, A.; Fedor, S.; Sangermano, L.; Ionescu, D.; Alpert, J.; Dale, C.; Sontag, D.; and Picard, R. In Proceedings of the Seventh International Conference on Affective Computing and Intelligent Interaction (ACII), 2017.
@inproceedings{GhandehariounEtAl_ACII17,
  author = {Asma Ghandeharioun and Szymon Fedor and Lisa Sangermano and Dawn Ionescu and Jonathan Alpert and Chelsea Dale and David Sontag and Rosalind Picard},
  title = {Objective Assessment of Depressive Symptoms with Machine Learning and Wearable Sensors Data},
  booktitle = {Proceedings of the Seventh International Conference on Affective Computing and Intelligent Interaction (ACII)},
  year = {2017},
  keywords = {Health care},
  url_Paper = {http://affect.media.mit.edu/pdfs/17.ghandeharioun_etal_objective_ACII.pdf},
  abstract = {Depression is the major cause of years lived with disability worldwide; however, its diagnosis and tracking methods still rely mainly on assessing self-reported depressive symptoms, methods that originated more than fifty years ago. These methods, which usually involve filling out surveys or engaging in face-to-face interviews, provide limited accuracy and reliability and are costly to track and scale. In this paper, we develop and test the efficacy of machine learning techniques applied to objective data captured passively and continuously from E4 wearable wristbands and from sensors in an Android phone for predicting the Hamilton Depression Rating Scale (HDRS). Input data include electrodermal activity (EDA), sleep behavior, motion, phone-based communication, location changes, and phone usage patterns. We introduce our feature generation and transformation process, imputing missing clinical scores from self-reported measures, and predicting depression severity from continuous sensor measurements. While HDRS ranges between 0 and 52, we were able to impute it with 2.8 RMSE and predict it with 4.5 RMSE which are low relative errors. Analyzing the features and their relation to depressive symptoms, we found that poor mental health was accompanied by more irregular sleep, less motion, fewer incoming messages, less variability in location patterns, and higher asymmetry of EDA between the right and the left wrists.}
}
Depression is the major cause of years lived with disability worldwide; however, its diagnosis and tracking methods still rely mainly on assessing self-reported depressive symptoms, methods that originated more than fifty years ago. These methods, which usually involve filling out surveys or engaging in face-to-face interviews, provide limited accuracy and reliability and are costly to track and scale. In this paper, we develop and test the efficacy of machine learning techniques applied to objective data captured passively and continuously from E4 wearable wristbands and from sensors in an Android phone for predicting the Hamilton Depression Rating Scale (HDRS). Input data include electrodermal activity (EDA), sleep behavior, motion, phone-based communication, location changes, and phone usage patterns. We introduce our feature generation and transformation process, imputing missing clinical scores from self-reported measures, and predicting depression severity from continuous sensor measurements. While HDRS ranges between 0 and 52, we were able to impute it with 2.8 RMSE and predict it with 4.5 RMSE which are low relative errors. Analyzing the features and their relation to depressive symptoms, we found that poor mental health was accompanied by more irregular sleep, less motion, fewer incoming messages, less variability in location patterns, and higher asymmetry of EDA between the right and the left wrists.
Early Identification of Patients with Acute Decompensated Heart Failure. Blecker, S.; Sontag, D.; Horwitz, L.; Kuperman, G.; Park, H.; Reyentovich, A.; and Katz, S. Journal of Cardiac Failure. 2017.
@article{BleckerEtAl_JCF17,
  author = {Blecker, Saul and Sontag, David and Horwitz, Leora and Kuperman, Gilad and Park, Hannah and Reyentovich, Alex and Katz, Stuart},
  title = {Early Identification of Patients with Acute Decompensated Heart Failure},
  journal = {Journal of Cardiac Failure},
  year = {2017},
  publisher = {Elsevier},
  doi = {10.1016/j.cardfail.2017.08.458},
  issn = {1071-9164},
  keywords = {Health care},
  url_Paper = {http://www.onlinejcf.com/article/S1071-9164(17)31161-2/pdf},
  abstract = {Interventions to reduce readmissions following acute heart failure hospitalization require early identification of patients. The purpose of this study was to develop and test accuracies of various approaches to identify patients with acute decompensated heart failure (ADHF) using data derived from the electronic health record. We included 37,229 hospitalizations of adult patients at a single hospital in 2013-2015. We developed four algorithms to identify hospitalizations with a principal discharge diagnosis of ADHF: 1) presence of one of three clinical characteristics; 2) logistic regression of 31 structured data elements; 3) machine learning with unstructured data; 4) machine learning with both structured and unstructured data. In data validation, Algorithm 1 had a sensitivity of 0.98 and positive predictive value (PPV) of 0.14 for ADHF. Algorithm 2 had an area under the receiver operating characteristic curve (AUC) of 0.96, while both machine learning algorithms had AUCs of 0.99. Based on a brief survey of three providers who perform chart review for ADHF, we estimated providers spent 8.6 minutes per chart review; using this parameter, we estimated providers would spend 61.4, 57.3, 28.7, and 25.3 minutes on secondary chart review for each case of ADHF if initial screening was done with algorithms 1, 2, 3, and 4, respectively. In conclusion, machine learning algorithms with unstructured notes had best performance for identification of ADHF and can improve provider efficiency for delivery of quality improvement interventions.}
}
Interventions to reduce readmissions following acute heart failure hospitalization require early identification of patients. The purpose of this study was to develop and test accuracies of various approaches to identify patients with acute decompensated heart failure (ADHF) using data derived from the electronic health record. We included 37,229 hospitalizations of adult patients at a single hospital in 2013-2015. We developed four algorithms to identify hospitalizations with a principal discharge diagnosis of ADHF: 1) presence of one of three clinical characteristics; 2) logistic regression of 31 structured data elements; 3) machine learning with unstructured data; 4) machine learning with both structured and unstructured data. In data validation, Algorithm 1 had a sensitivity of 0.98 and positive predictive value (PPV) of 0.14 for ADHF. Algorithm 2 had an area under the receiver operating characteristic curve (AUC) of 0.96, while both machine learning algorithms had AUCs of 0.99. Based on a brief survey of three providers who perform chart review for ADHF, we estimated providers spent 8.6 minutes per chart review; using this parameter, we estimated providers would spend 61.4, 57.3, 28.7, and 25.3 minutes on secondary chart review for each case of ADHF if initial screening was done with algorithms 1, 2, 3, and 4, respectively. In conclusion, machine learning algorithms with unstructured notes had best performance for identification of ADHF and can improve provider efficiency for delivery of quality improvement interventions.
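The reported review burden appears consistent with a simple identity: expected chart reviews per confirmed case is 1/PPV, so minutes per confirmed case is (minutes per review) / PPV. For example, with 8.6 minutes per review and Algorithm 1's PPV of 0.14, 8.6 / 0.14 is approximately 61.4 minutes, matching the figure above. A one-line sketch of this arithmetic (our reading of the reported numbers):

def minutes_per_confirmed_case(minutes_per_review=8.6, ppv=0.14):
    # expected reviews needed per true positive = 1 / PPV
    return minutes_per_review / ppv   # e.g. 8.6 / 0.14 ~= 61.4 minutes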
Using Machine Learning to Recommend Oncology Clinical Trials. Das, A.; Thorbergsson, L.; Grigorenko, A.; Sontag, D.; and Huerga, I. In Machine Learning for Health Care (Clinical abstract), 2017.
@inproceedings{DasEtAl_mlhc17,
  author = {Anasuya Das and Leifur Thorbergsson and Aleksandr Grigorenko and David Sontag and Iker Huerga},
  title = {Using Machine Learning to Recommend Oncology Clinical Trials},
  booktitle = {Machine Learning for Health Care (Clinical abstract)},
  year = {2017},
  keywords = {Health care},
  url_Paper = {http://mucmd.org/CameraReadySubmissions/21\%5Cclinical_abstracts\%201.pdf},
  abstract = {Clinical trials serve an important role in oncology, not only advancing medical science but also offering patients promising therapy before it is widely available. Memorial Sloan Kettering Cancer Center (MSK) conducts over 500 therapeutic trials at one time; most are focused on a single type of cancer (e.g. breast, lung), reflecting the subspecialized nature of care. However, clinical trial accrual is a challenge as patient-trial matching is a slow and manual process. We address this challenge via a machine learning-powered clinical trial recommendation engine designed to be deployed at the point of care.}
}
Clinical trials serve an important role in oncology, not only advancing medical science but also offering patients promising therapy before it is widely available. Memorial Sloan Kettering Cancer Center (MSK) conducts over 500 therapeutic trials at one time; most are focused on a single type of cancer (e.g. breast, lung), reflecting the subspecialized nature of care. However, clinical trial accrual is a challenge as patient-trial matching is a slow and manual process. We address this challenge via a machine learning-powered clinical trial recommendation engine designed to be deployed at the point of care.
Creating an Automated Trigger for Sepsis Clinical Decision Support at Emergency Department Triage using Machine Learning. Horng, S.; Sontag, D.; Halpern, Y.; Jernite, Y.; Shapiro, N. I.; and Nathanson, L. A. PLoS ONE, 12(4): e0174708. 2017.
@article{HorngEtAl_plos17,
  author = {Steven Horng and David Sontag and Yoni Halpern and Yacine Jernite and Nathan I. Shapiro and Larry A. Nathanson},
  title = {Creating an Automated Trigger for Sepsis Clinical Decision Support at Emergency Department Triage using Machine Learning},
  journal = {PLoS ONE},
  volume = {12},
  number = {4},
  pages = {e0174708},
  year = {2017},
  keywords = {Health care},
  url_Paper = {http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0174708},
  abstract = {Our objective is to demonstrate the incremental benefit of using free text data in addition to vital sign and demographic data to identify patients with suspected infection in the emergency department. Compared to previous work that only used structured data such as vital signs and demographic information, utilizing free text drastically improves the discriminatory ability (increase in AUC from 0.67 to 0.86) of identifying infection.}
}
\n
\n\n\n
\n Our objective is to demonstrate the incremental benefit of using free text data in addition to vital sign and demographic data to identify patients with suspected infection in the emergency department. Compared to previous work that only used structured data such as vital signs and demographic information, utilizing free text drastically improves the discriminatory ability (increase in AUC from 0.67 to 0.86) of identifying infection.\n
\n\n\n
\n\n\n
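As a concrete illustration of the structured-plus-text setup this abstract describes, the sketch below concatenates triage vitals with bag-of-words text features and fits a single classifier. The field names and toy data are hypothetical; this is a minimal sketch, not the paper's actual pipeline.

# Minimal sketch: combine structured triage data with free-text features
# and compare discriminative ability via AUC. Toy data is hypothetical.
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

notes = ["fever and productive cough", "twisted ankle playing soccer"]  # triage free text
vitals = np.array([[38.9, 102.0], [36.8, 74.0]])                        # e.g., temperature, heart rate
labels = np.array([1, 0])                                               # suspected infection

text_features = TfidfVectorizer(ngram_range=(1, 2)).fit_transform(notes)
X_combined = hstack([csr_matrix(vitals), text_features])                # structured + text

clf = LogisticRegression(max_iter=1000).fit(X_combined, labels)
# On real data, evaluate on a held-out set rather than the training set:
auc = roc_auc_score(labels, clf.predict_proba(X_combined)[:, 1])
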
Contextual Autocomplete: A Novel User Interface Using Machine Learning to Improve Ontology Usage and Structured Data Capture for Presenting Problems in the Emergency Department.
Greenbaum, N. R.; Jernite, Y.; Halpern, Y.; Calder, S.; Nathanson, L. A.; Sontag, D.; and Horng, S.
bioRxiv:10.1101/127092. 2017.

@article{GreenbaumEtAl17,
  author = {Greenbaum, Nathaniel R. and Jernite, Yacine and Halpern, Yoni and Calder, Shelley and Nathanson, Larry A. and Sontag, David and Horng, Steven},
  title = {Contextual Autocomplete: A Novel User Interface Using Machine Learning to Improve Ontology Usage and Structured Data Capture for Presenting Problems in the Emergency Department},
  journal = {bioRxiv:10.1101/127092},
  year = {2017},
  keywords = {Health care},
  url_Paper = {https://www.biorxiv.org/content/early/2017/04/12/127092.full.pdf}
}

Our objective is to determine the effect of contextual autocomplete, a user interface that uses machine learning, on the efficiency and quality of documentation of presenting problems (chief complaints) in the emergency department (ED). We used contextual autocomplete, a user interface that ranks concepts by their predicted probability, to help nurses enter data about a patient's reason for visiting the ED. Predicted probabilities were calculated using a previously derived model based on triage vital signs and a brief free text note. We evaluated the percentage and quality of structured data captured using a prospective before-and-after study design. A total of 279,231 patient encounters were analyzed. Structured data capture improved from 26.2% to 97.2% (p<0.0001). During the post-implementation period, presenting problems were more complete (3.35 vs. 3.66; p=0.0004), as precise (3.59 vs. 3.74; p=0.1), and higher in overall quality (3.38 vs. 3.72; p=0.0002). Our system reduced the mean number of keystrokes required to document a presenting problem from 11.6 to 0.6 (p<0.0001), a 95% improvement. We have thus demonstrated a technique that captures structured data on nearly all patients. We estimate that our system reduces the number of man-hours required annually to type presenting problems at our institution from 92.5 hours to 4.8 hours. In conclusion, implementation of a contextual autocomplete system resulted in improved structured data capture, ontology usage compliance, and data quality.

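To make the core interaction concrete: a contextual autocomplete ranks ontology concepts by a context-conditioned probability rather than alphabetically, then filters by the prefix typed so far. The toy sketch below assumes such per-concept probabilities are already available from a model over vitals and the free-text note; the interface and data are illustrative, not the deployed system.

# Illustrative sketch of contextual autocomplete: filter ontology terms by
# the typed prefix, then rank by model-predicted probability (hypothetical).
def contextual_autocomplete(prefix, concepts, context_probs):
    """concepts: list of ontology terms; context_probs: term -> P(term | triage context)."""
    matches = [c for c in concepts if c.lower().startswith(prefix.lower())]
    return sorted(matches, key=lambda c: context_probs.get(c, 0.0), reverse=True)

concepts = ["Chest pain", "Cough", "Chills"]
probs = {"Chest pain": 0.40, "Cough": 0.15, "Chills": 0.05}  # hypothetical model outputs
print(contextual_autocomplete("c", concepts, probs))          # ['Chest pain', 'Cough', 'Chills']
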
Multi-task Prediction of Disease Onsets from Longitudinal Laboratory Tests.
Razavian, N.; Marcus, J.; and Sontag, D.
In Doshi-Velez, F.; Fackler, J.; Kale, D.; Wallace, B.; and Wiens, J., editors, Proceedings of the 1st Machine Learning for Healthcare Conference, volume 56 of Proceedings of Machine Learning Research, pages 73-100. PMLR, 2016.

@inproceedings{RazavianEtAl_mlhc16,
  title = {Multi-task Prediction of Disease Onsets from Longitudinal Laboratory Tests},
  author = {Narges Razavian and Jake Marcus and David Sontag},
  booktitle = {Proceedings of the 1st Machine Learning for Healthcare Conference},
  pages = {73-100},
  year = {2016},
  editor = {Finale Doshi-Velez and Jim Fackler and David Kale and Byron Wallace and Jenna Wiens},
  volume = {56},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  keywords = {Health care, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1608.00647.pdf}
}

Disparate areas of machine learning have benefited from models that can take raw data with little preprocessing as input and learn rich representations of that raw data in order to perform well on a given prediction task. We evaluate this approach in healthcare by using longitudinal measurements of lab tests, one of the more raw signals of a patient's health state widely available in clinical data, to predict disease onsets. In particular, we train a Long Short-Term Memory (LSTM) recurrent neural network and two novel convolutional neural networks for multi-task prediction of disease onset for 133 conditions based on 18 common lab tests measured over time in a cohort of 298K patients derived from 8 years of administrative claims data. We compare the neural networks to a logistic regression with several hand-engineered, clinically relevant features. We find that the representation-based learning approaches significantly outperform this baseline. We believe that our work suggests a new avenue for patient risk stratification based solely on lab results.

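The multi-task setup in this abstract amounts to one recurrent encoder with a shared output layer producing a logit per condition. The PyTorch sketch below illustrates that general shape under the dimensions quoted above (18 labs, 133 conditions); it is a minimal stand-in, not the paper's exact architecture or training procedure.

# Minimal sketch of multi-task disease-onset prediction from lab sequences.
import torch
import torch.nn as nn

class MultiTaskLSTM(nn.Module):
    def __init__(self, n_labs=18, hidden=64, n_diseases=133):
        super().__init__()
        self.lstm = nn.LSTM(input_size=n_labs, hidden_size=hidden, batch_first=True)
        self.head = nn.Linear(hidden, n_diseases)   # one logit per disease (multi-task)

    def forward(self, x):             # x: (batch, time, n_labs)
        _, (h, _) = self.lstm(x)      # h: (num_layers, batch, hidden)
        return self.head(h[-1])       # logits: (batch, n_diseases)

model = MultiTaskLSTM()
x = torch.randn(4, 24, 18)            # 4 patients, 24 time steps, 18 lab values
targets = torch.randint(0, 2, (4, 133)).float()
loss = nn.BCEWithLogitsLoss()(model(x), targets)   # joint loss over all conditions
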
Clinical Tagging with Joint Probabilistic Models.
Halpern, Y.; Horng, S.; and Sontag, D.
In Doshi-Velez, F.; Fackler, J.; Kale, D.; Wallace, B.; and Wiens, J., editors, Proceedings of the 1st Machine Learning for Healthcare Conference, volume 56 of Proceedings of Machine Learning Research, pages 209-225, 2016.

@inproceedings{HalpernEtAl_mlhc16,
  title = {Clinical Tagging with Joint Probabilistic Models},
  author = {Yoni Halpern and Steven Horng and David Sontag},
  booktitle = {Proceedings of the 1st Machine Learning for Healthcare Conference},
  pages = {209-225},
  year = {2016},
  editor = {Finale Doshi-Velez and Jim Fackler and David Kale and Byron Wallace and Jenna Wiens},
  volume = {56},
  series = {Proceedings of Machine Learning Research},
  keywords = {Health care, Unsupervised learning},
  url_Paper = {https://arxiv.org/pdf/1608.00686.pdf}
}

We describe a method for parameter estimation in bipartite probabilistic graphical models for joint prediction of clinical conditions from the electronic medical record. The method does not rely on the availability of gold-standard labels, but rather uses noisy labels, called anchors, for learning. We provide a likelihood-based objective and a moments-based initialization that are effective at learning the model parameters. The learned model is evaluated in a task of assigning a held-out clinical condition to patients based on retrospective analysis of the records, and outperforms baselines which do not account for the noisiness in the labels or do not model the conditions jointly.

Comparison of approaches for heart failure case identification from electronic health record data.
Blecker, S.; Katz, S.; Horwitz, L. I.; Kuperman, G.; Park, H.; Gold, A.; and Sontag, D.
JAMA Cardiology, 1(9): 1014-1020. 2016.

@article{BleckerEtAl_jama16,
  author = {Blecker, Saul and Katz, Stuart and Horwitz, Leora I. and Kuperman, Gilad and Park, H. and Gold, A. and Sontag, David},
  title = {Comparison of approaches for heart failure case identification from electronic health record data},
  journal = {JAMA Cardiology},
  volume = {1},
  number = {9},
  pages = {1014-1020},
  year = {2016},
  keywords = {Health care},
  url_Paper = {http://jamanetwork.com/journals/jamacardiology/article-abstract/2557840}
}

Accurate, real-time case identification is needed to target interventions to improve quality and outcomes for hospitalized patients with heart failure. Problem lists may be useful for case identification but are often inaccurate or incomplete. Machine-learning approaches may improve accuracy of identification but can be limited by complexity of implementation. Our objective was to develop algorithms that use readily available clinical data to identify patients with heart failure while in the hospital. In this study of 47,119 hospitalizations, inclusion of heart failure on the problem list had a sensitivity of 0.40 and a positive predictive value (PPV) of 0.96. A logistic regression model with clinical data was associated with a sensitivity of 0.68 and PPV of 0.90, whereas a machine-learning algorithm that used free text had a sensitivity of 0.83 and a PPV of 0.90. The high predictive accuracy of machine learning using free text demonstrates that support of such analytics in future electronic health record systems can improve cohort identification.

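The comparison above hinges on two metrics computed for each case-identification strategy: sensitivity and positive predictive value against an adjudicated reference standard. The sketch below shows that evaluation for a binary flag; the arrays are hypothetical stand-ins for per-hospitalization data.

# Sketch of the evaluation comparing case-identification strategies:
# sensitivity and PPV of a binary flag against adjudicated truth.
import numpy as np

def sensitivity_ppv(flag, truth):
    tp = np.sum((flag == 1) & (truth == 1))
    fn = np.sum((flag == 0) & (truth == 1))
    fp = np.sum((flag == 1) & (truth == 0))
    return tp / (tp + fn), tp / (tp + fp)

flag = np.array([1, 0, 1, 1, 0])    # e.g., "heart failure on problem list"
truth = np.array([1, 1, 1, 0, 0])   # adjudicated heart-failure status
sens, ppv = sensitivity_ppv(flag, truth)
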
Identifiable Phenotyping using Constrained Non-Negative Matrix Factorization.
Joshi, S.; Gunasekar, S.; Sontag, D.; and Ghosh, J.
In Doshi-Velez, F.; Fackler, J.; Kale, D.; Wallace, B.; and Wiens, J., editors, Proceedings of the 1st Machine Learning for Healthcare Conference, volume 56 of Proceedings of Machine Learning Research, pages 17-41. PMLR, 2016.

@inproceedings{JoshiEtAl_MLHC16,
  title = {Identifiable Phenotyping using Constrained Non-Negative Matrix Factorization},
  author = {Shalmali Joshi and Suriya Gunasekar and David Sontag and Joydeep Ghosh},
  booktitle = {Proceedings of the 1st Machine Learning for Healthcare Conference},
  pages = {17--41},
  year = {2016},
  editor = {Finale Doshi-Velez and Jim Fackler and David Kale and Byron Wallace and Jenna Wiens},
  volume = {56},
  series = {Proceedings of Machine Learning Research},
  publisher = {PMLR},
  keywords = {Health care},
  url_Paper = {http://proceedings.mlr.press/v56/Joshi16.pdf}
}

This work proposes a new algorithm for automated and simultaneous phenotyping of multiple co-occurring medical conditions, also referred to as comorbidities, using clinical notes from electronic health records (EHRs). A latent factor estimation technique, non-negative matrix factorization (NMF), is augmented with domain constraints from weak supervision to obtain sparse latent factors that are grounded to a fixed set of chronic conditions. The proposed grounding mechanism ensures a one-to-one identifiable and interpretable mapping between the latent factors and the target comorbidities. Qualitative assessment of the empirical results by clinical experts shows that the proposed model learns clinically interpretable phenotypes, which are also shown to have competitive performance on a 30-day mortality prediction task. The proposed method can be readily adapted to any non-negative EHR data across various healthcare institutions.

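One simple way to picture "grounded" NMF is a factorization where each latent factor may only load on a condition-specific set of terms. The toy sketch below enforces such a support mask inside standard Lee-Seung multiplicative updates; the masking scheme and data are my own illustration of the idea, not the paper's algorithm.

# Toy sketch of support-constrained ("grounded") NMF: X ~ W @ H with a fixed
# zero pattern on H so each factor loads only on allowed terms (assumption mine).
import numpy as np

rng = np.random.default_rng(0)
X = rng.random((20, 10))             # patients x terms (counts from notes)
k = 3
mask = rng.random((k, 10)) > 0.5     # domain constraint: allowed terms per factor

W = rng.random((20, k)) + 1e-3
H = (rng.random((k, 10)) + 1e-3) * mask

for _ in range(200):                 # Lee-Seung updates, re-masking H each step
    H *= (W.T @ X) / (W.T @ W @ H + 1e-9)
    H *= mask                        # grounding: zero out disallowed loadings
    W *= (X @ H.T) / (W @ H @ H.T + 1e-9)
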
Electronic Medical Record Phenotyping using the Anchor & Learn Framework.
Halpern, Y.; Horng, S.; Choi, Y.; and Sontag, D.
Journal of the American Medical Informatics Association (JAMIA), 2016.

@article{HalpernEtAl_jamia16,
  author = {Yoni Halpern and Steven Horng and Youngduck Choi and David Sontag},
  title = {Electronic Medical Record Phenotyping using the Anchor \& Learn Framework},
  journal = {Journal of the American Medical Informatics Association (JAMIA)},
  year = {2016},
  keywords = {Health care, Unsupervised learning},
  doi = {10.1093/jamia/ocw011},
  url_Paper = {http://jamia.oxfordjournals.org/content/early/2016/04/26/jamia.ocw011.full.pdf}
}

Electronic medical records (EMRs) hold a tremendous amount of information about patients that is relevant to determining the optimal approach to patient care. As medicine becomes increasingly precise, a patient's electronic medical record phenotype will play an important role in triggering clinical decision support systems that can deliver personalized recommendations in real time. Learning with anchors presents a method of efficiently learning statistically driven phenotypes with minimal manual intervention. We developed a phenotype library that uses both structured and unstructured data from the EMR to represent patients for real-time clinical decision support. Eight of the phenotypes were evaluated using retrospective EMR data on emergency department patients using a set of prospectively gathered gold standard labels. We built the phenotype library with 42 publicly available phenotype definitions. Using information from triage time, the phenotype classifiers have an area under the ROC curve (AUC) of infection 0.89, cancer 0.88, immunosuppressed 0.85, septic shock 0.93, nursing home 0.87, anticoagulated 0.83, cardiac etiology 0.89, and pneumonia 0.90. Using information available at the time of disposition from the emergency department, the AUC values are infection 0.91, cancer 0.95, immunosuppressed 0.90, septic shock 0.97, nursing home 0.91, anticoagulated 0.94, cardiac etiology 0.92, and pneumonia 0.97. The resulting phenotypes are interpretable and fast to build, and perform comparably to statistically learned phenotypes developed with 5000 manually labeled patients. Learning with anchors is an attractive option for building a large public repository of phenotype definitions that can be used for a range of health IT applications, including real-time decision support.

Population-Level Prediction of Type 2 Diabetes using Claims Data and Analysis of Risk Factors.
Razavian, N.; Blecker, S.; Schmidt, A. M.; Smith-McLallen, A.; Nigam, S.; and Sontag, D.
Big Data, Data and Healthcare Special Issue. 2016.

@article{RazavianEtAl_bigdata16,
  author = {Narges Razavian and Saul Blecker and Ann Marie Schmidt and Aaron Smith-McLallen and Somesh Nigam and David Sontag},
  title = {Population-Level Prediction of Type 2 Diabetes using Claims Data and Analysis of Risk Factors},
  journal = {Big Data},
  year = {2016},
  volume = {Data and Healthcare Special Issue},
  keywords = {Health care},
  url_Paper = {http://online.liebertpub.com/doi/pdf/10.1089/big.2015.0020}
}

We present a new approach to population health, in which data-driven predictive models are learned for outcomes such as type 2 diabetes. Our approach enables risk assessment from readily available electronic claims data on large populations, without additional screening cost. The proposed model uncovers early- and late-stage risk factors. Using administrative claims, pharmacy records, healthcare utilization, and laboratory results of 4.1 million individuals between 2005 and 2009, an initial set of 42,000 variables was derived that together describe the full health status and history of every individual. Machine learning was then used to methodically enhance the predictive variable set and fit models predicting onset of type 2 diabetes in 2009-2011, 2010-2012, and 2011-2013. We compared the enhanced model with a parsimonious model consisting of known diabetes risk factors in a real-world environment, where missing values are common. Furthermore, we analyzed novel and known risk factors emerging from the model at different age groups and at different stages before the onset. A parsimonious model using 21 classic diabetes risk factors resulted in an area under the ROC curve (AUC) of 0.75 for diabetes prediction within a 2-year window following the baseline. The enhanced model increased the AUC to 0.80, with about 900 variables selected as predictive (p < 0.0001 for differences between AUCs). Similar improvements were observed for models predicting diabetes onset 1-3 years and 2-4 years after baseline. The enhanced model improved positive predictive value by at least 50% and identified novel surrogate risk factors for type 2 diabetes, such as chronic liver disease (odds ratio [OR] 3.71), high alanine aminotransferase (OR 2.26), esophageal reflux (OR 1.85), and history of acute bronchitis (OR 1.45). Liver risk factors emerge later in the process of diabetes development compared with obesity-related factors such as hypertension and high hemoglobin A1c. In conclusion, population-level risk prediction for type 2 diabetes using readily available administrative data is feasible and has better prediction performance than classical diabetes risk prediction algorithms on very large populations with missing data. The new model enables intervention allocation at national scale quickly and accurately and recovers potentially novel risk factors at different stages before the disease onset.

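The "42,000 variables, about 900 selected" pattern is characteristic of sparse regularized models over claims features. The sketch below shows one standard recipe under that interpretation (L1-penalized logistic regression on a sparse matrix, which performs variable selection as a side effect); the data are random placeholders, and the paper's exact estimator may differ.

# Sketch of high-dimensional risk prediction from sparse claims features,
# with L1 regularization doing the variable selection (toy data).
import numpy as np
from scipy.sparse import random as sparse_random
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = sparse_random(1000, 5000, density=0.01, format="csr", random_state=0)  # patient x feature
y = rng.integers(0, 2, 1000)                                               # diabetes onset

clf = LogisticRegression(penalty="l1", solver="liblinear", C=0.1).fit(X, y)
n_selected = int(np.sum(clf.coef_ != 0))   # how many risk factors survive the penalty
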
\n \n\n \n \n \n \n \n \n Learning Low-Dimensional Representations of Medical Concepts.\n \n \n \n \n\n\n \n Choi, Y.; Chiu, Y.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the AMIA Summit on Clinical Research Informatics (CRI), 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{ChoiChiuSon_amia16,\n  author    = {Youngduck Choi and Yi-I Chiu and David Sontag},\n  title     = {Learning Low-Dimensional Representations of Medical Concepts},\n booktitle = {Proceedings of the AMIA Summit on Clinical Research Informatics (CRI)},\n year = {2016},\n keywords = {Health care},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/ChoiChiuSontag_AMIA_CRI16.pdf},\n abstract = {We show how to learn low-dimensional representations (embeddings) of a wide range of concepts in medicine, including diseases (e.g., ICD9 codes), medications, procedures, and laboratory tests. We expect that these embeddings will be useful across medical informatics for tasks such as cohort selection and patient summarization. These embeddings are learned using a technique called neural language modeling from the natural language processing community. However, rather than learning the embeddings solely from text, we show how to learn the embeddings from claims data, which is widely available both to providers and to payers. We also show that with a simple algorithmic adjustment, it is possible to learn medical concept embeddings in a privacy preserving manner from co-occurrence counts derived from clinical narratives. Finally, we establish a methodological framework, arising from standard medical ontologies such as UMLS, NDF-RT, and CCS, to further investigate the embeddings and precisely characterize their quantitative properties.}\n}\n\n
\n
\n\n\n
\n We show how to learn low-dimensional representations (embeddings) of a wide range of concepts in medicine, including diseases (e.g., ICD9 codes), medications, procedures, and laboratory tests. We expect that these embeddings will be useful across medical informatics for tasks such as cohort selection and patient summarization. These embeddings are learned using a technique called neural language modeling from the natural language processing community. However, rather than learning the embeddings solely from text, we show how to learn the embeddings from claims data, which is widely available both to providers and to payers. We also show that with a simple algorithmic adjustment, it is possible to learn medical concept embeddings in a privacy preserving manner from co-occurrence counts derived from clinical narratives. Finally, we establish a methodological framework, arising from standard medical ontologies such as UMLS, NDF-RT, and CCS, to further investigate the embeddings and precisely characterize their quantitative properties.\n
\n\n\n
\n\n\n
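Learning embeddings from co-occurrence counts alone (as the privacy-preserving variant above requires) has a well-known recipe: build a positive pointwise mutual information matrix from the counts and take a truncated SVD. The numpy sketch below shows that recipe on a tiny hypothetical count matrix; the paper's exact estimator may differ.

# Sketch: concept embeddings from a co-occurrence count matrix via PPMI + SVD.
import numpy as np

C = np.array([[0, 8, 2],
              [8, 0, 5],
              [2, 5, 0]], dtype=float)    # co-occurrence counts among 3 concepts
total = C.sum()
p_ij = C / total
p_i = C.sum(axis=1) / total

with np.errstate(divide="ignore", invalid="ignore"):
    pmi = np.log(p_ij / np.outer(p_i, p_i))
ppmi = np.maximum(pmi, 0.0)                # clamp negatives and -inf to zero
ppmi[~np.isfinite(ppmi)] = 0.0

U, S, _ = np.linalg.svd(ppmi)
embeddings = U[:, :2] * np.sqrt(S[:2])     # 2-dimensional concept vectors
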
Visual Exploration of Temporal Data in Electronic Medical Records.
Krause, J.; Razavian, N.; Bertini, E.; and Sontag, D.
In Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium (Abstract), page 1538, 2015.

@inproceedings{KrauseEtAl_amia15,
  author = {Josua Krause and Narges Razavian and Enrico Bertini and David Sontag},
  title = {Visual Exploration of Temporal Data in Electronic Medical Records},
  booktitle = {Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium (Abstract)},
  pages = {1538},
  year = {2015},
  keywords = {Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrauseEtAl_PatientViz_AMIA15_abstract.pdf}
}

Predicting Chronic Comorbid Conditions of Type 2 Diabetes in Newly-Diagnosed Diabetic Patients.
Razavian, N.; Smith-McLallen, A.; Nigam, S.; Blecker, S.; Schmidt, A. M.; and Sontag, D.
Value in Health (Abstract), 18(3): A53. 2015.

@article{RazavianEtAl_ispor15,
  author = {Razavian, N. and Smith-McLallen, A. and Nigam, S. and Blecker, S. and Schmidt, A. M. and Sontag, D.},
  title = {Predicting Chronic Comorbid Conditions of Type 2 Diabetes in Newly-Diagnosed Diabetic Patients},
  journal = {Value in Health (Abstract)},
  volume = {18},
  number = {3},
  pages = {A53},
  year = {2015},
  keywords = {Health care},
  url_Paper = {https://www.ispor.org/awards/20Meet/prediction_posterID_PDB5.pdf}
}

Deep Kalman Filters.
Krishnan, R. G.; Shalit, U.; and Sontag, D.
arXiv:1511.05121, 2015.

@inproceedings{KriShaSon_arxiv15,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Deep Kalman Filters},
  booktitle = {arXiv:1511.05121},
  year = {2015},
  keywords = {Machine learning, Unsupervised learning, Health care, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1511.05121.pdf}
}

Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the "Healing MNIST" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.

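The generative skeleton behind a deep Kalman filter replaces the linear-Gaussian transition and emission of a classical Kalman filter with neural networks. The PyTorch sketch below is my paraphrase of that idea from the abstract (nonlinear Gaussian transition conditioned on the previous state and action, plus a neural emission); it is not the authors' code, and the inference network and variational objective are omitted.

# Minimal sketch of a deep-Kalman-filter-style generative model.
import torch
import torch.nn as nn

class DeepKalmanPrior(nn.Module):
    def __init__(self, z_dim=8, u_dim=2, x_dim=16, hidden=32):
        super().__init__()
        self.trans = nn.Sequential(nn.Linear(z_dim + u_dim, hidden), nn.ReLU(),
                                   nn.Linear(hidden, 2 * z_dim))   # mean and log-variance
        self.emit = nn.Sequential(nn.Linear(z_dim, hidden), nn.ReLU(),
                                  nn.Linear(hidden, x_dim))

    def step(self, z_prev, u_prev):
        stats = self.trans(torch.cat([z_prev, u_prev], dim=-1))
        mu, logvar = stats.chunk(2, dim=-1)
        z_t = mu + torch.randn_like(mu) * (0.5 * logvar).exp()     # sample next latent state
        return z_t, self.emit(z_t)                                  # latent state, observation mean

m = DeepKalmanPrior()
z, u = torch.zeros(1, 8), torch.zeros(1, 2)   # initial state and action (placeholders)
z, x_mean = m.step(z, u)
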
Anchored Discrete Factor Analysis.
Halpern, Y.; Horng, S.; and Sontag, D.
arXiv:1511.03299, 2015.

@inproceedings{HalpernEtAl_arxiv15,
  author = {Yoni Halpern and Steven Horng and David Sontag},
  title = {Anchored Discrete Factor Analysis},
  booktitle = {arXiv:1511.03299},
  year = {2015},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://arxiv.org/pdf/1511.03299.pdf}
}

We present a semi-supervised learning algorithm for learning discrete factor analysis models with arbitrary structure on the latent variables. Our algorithm assumes that every latent variable has an "anchor", an observed variable with only that latent variable as its parent. Given such anchors, we show that it is possible to consistently recover moments of the latent variables and use these moments to learn complete models. We also introduce a new technique for improving the robustness of method-of-moment algorithms by optimizing over the marginal polytope or its relaxations. We evaluate our algorithm using two real-world tasks, tag prediction on questions from the Stack Overflow website and medical diagnosis in an emergency department.

Temporal Convolutional Neural Networks for Diagnosis from Lab Tests.
Razavian, N.; and Sontag, D.
arXiv:1511.07938, 2015.

@inproceedings{RazavianSontag_arxiv15,
  author = {Narges Razavian and David Sontag},
  title = {Temporal Convolutional Neural Networks for Diagnosis from Lab Tests},
  booktitle = {arXiv:1511.07938},
  year = {2015},
  keywords = {Health care, Machine learning, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1511.07938.pdf}
}

Early diagnosis of treatable diseases is essential for improving healthcare, and many diseases' onsets are predictable from annual lab tests and their temporal trends. We introduce a multi-resolution convolutional neural network for early detection of multiple diseases from irregularly measured sparse lab values. Our novel architecture takes as input both an imputed version of the data and a binary observation matrix. For imputing the temporal sparse observations, we develop a flexible, fast to train method for differentiable multivariate kernel regression. Our experiments on data from 298K individuals over 8 years, 18 common lab measurements, and 171 diseases show that the temporal signatures learned via convolution are significantly more predictive than baselines commonly used for early disease diagnosis.

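The dual-input idea above (imputed values plus a binary observation matrix) can be realized by stacking the two as channels of a 1-D convolution, so the network can distinguish "measured normal" from "never measured". The sketch below shows only that input arrangement with placeholder layers; it is not the paper's multi-resolution architecture.

# Sketch: imputed lab series and observation mask stacked as conv channels.
import torch
import torch.nn as nn

n_labs, T = 18, 96
imputed = torch.randn(4, n_labs, T)                        # kernel-regression-imputed series
observed = torch.randint(0, 2, (4, n_labs, T)).float()     # 1 where a lab was actually measured
x = torch.cat([imputed, observed], dim=1)                  # (batch, 2 * n_labs, T)

net = nn.Sequential(
    nn.Conv1d(2 * n_labs, 64, kernel_size=5, padding=2), nn.ReLU(),
    nn.AdaptiveMaxPool1d(1), nn.Flatten(),
    nn.Linear(64, 171),                                    # one logit per disease
)
logits = net(x)
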
Using Anchors to Estimate Clinical State without Labeled Data.
Halpern, Y.; Choi, Y.; Horng, S.; and Sontag, D.
In Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium, pages 606-615, 2014.

@inproceedings{HalpernEtAl_amia14,
  author = {Yoni Halpern and Youngduck Choi and Steven Horng and David Sontag},
  title = {Using Anchors to Estimate Clinical State without Labeled Data},
  booktitle = {Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium},
  pages = {606--615},
  year = {2014},
  keywords = {Health care, Machine learning, Unsupervised learning},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernEtAl_amia14.pdf}
}

We present a novel framework for learning to estimate and predict clinical state variables without labeled data. The resulting models can be used for electronic phenotyping, triggering clinical decision support, and cohort selection. The framework relies on key observations which we characterize and term "anchor variables". By specifying anchor variables, an expert encodes a certain amount of domain knowledge about the problem while the rest of learning proceeds in an unsupervised manner. The ability to build anchors upon standardized ontologies and the framework's ability to learn from unlabeled data promote generalizability across institutions. We additionally develop a user interface to enable experts to choose anchor variables in an informed manner. The framework is applied to electronic medical record-based phenotyping to enable real-time decision support in the emergency department. We validate the learned models using a prospectively gathered set of gold-standard responses from emergency physicians for nine clinically relevant variables.

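One common way to operationalize an anchor is as a noisy positive label in the positive-unlabeled sense: train a classifier to predict the anchor from the remaining features, then rescale by the probability that a true positive exhibits the anchor (an Elkan-and-Noto-style calibration). The sketch below shows that reading of the framework on hypothetical data; it is a paraphrase of the idea, not the paper's full system.

# Sketch of anchor-based phenotyping as positive-unlabeled learning.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.random((500, 30))              # EHR features, with the anchor itself removed
anchor = rng.integers(0, 2, 500)       # e.g., "insulin on medication list" for diabetes

clf = LogisticRegression(max_iter=1000).fit(X, anchor)
# Calibration constant c ~ P(anchor = 1 | condition = 1), estimated on anchored cases:
c = clf.predict_proba(X[anchor == 1])[:, 1].mean()
p_condition = np.minimum(clf.predict_proba(X)[:, 1] / c, 1.0)   # estimated P(condition | x)
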
Unsupervised Learning of Disease Progression Models.
Wang, X.; Sontag, D.; and Wang, F.
In Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '14), pages 85-94, New York, NY, USA, 2014. ACM.

@inproceedings{WanSonWan_kdd14,
  author = {Xiang Wang and David Sontag and Fei Wang},
  title = {Unsupervised Learning of Disease Progression Models},
  booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series = {KDD '14},
  isbn = {978-1-4503-2956-9},
  pages = {85--94},
  numpages = {10},
  publisher = {ACM},
  address = {New York, NY, USA},
  keywords = {Health care, Unsupervised learning},
  year = {2014},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/WanSonWan_kdd14.pdf}
}

Chronic diseases, such as Alzheimer's Disease, Diabetes, and Chronic Obstructive Pulmonary Disease, usually progress slowly over a long period of time, causing increasing burden to the patients, their families, and the healthcare system. A better understanding of their progression is instrumental in early diagnosis and personalized care. Modeling disease progression based on real-world evidence is a very challenging task due to the incompleteness and irregularity of the observations, as well as the heterogeneity of the patient conditions. In this paper, we propose a probabilistic disease progression model that addresses these challenges. Compared to existing disease progression models, the advantage of our model is three-fold: 1) it learns a continuous-time progression model from discrete-time observations with non-equal intervals; 2) it learns the full progression trajectory from a set of incomplete records that only cover short segments of the progression; 3) it learns a compact set of medical concepts as the bridge between the hidden progression process and the observed medical evidence, which are usually extremely sparse and noisy. We demonstrate the capabilities of our model by applying it to a real-world COPD patient cohort and deriving some interesting clinical insights.

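The continuous-time ingredient in point 1) is worth spelling out: with a rate matrix Q over hidden progression stages, the transition matrix for an arbitrary observation gap dt is the matrix exponential expm(Q * dt), so visits at non-equal intervals pose no problem. The sketch below shows that computation with a hypothetical three-stage rate matrix; it illustrates the mechanism, not the paper's full model.

# Sketch: continuous-time Markov transitions over progression stages.
import numpy as np
from scipy.linalg import expm

Q = np.array([[-0.10,  0.10,  0.00],    # hypothetical 3-stage progression rates
              [ 0.00, -0.05,  0.05],    # (rows sum to zero, off-diagonals >= 0)
              [ 0.00,  0.00,  0.00]])   # absorbing final stage

P_6months = expm(Q * 0.5)   # stage-transition probabilities over half a year
P_2years  = expm(Q * 2.0)   # same model handles a two-year visit gap
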
Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests.
Jernite, Y.; Halpern, Y.; and Sontag, D.
In Advances in Neural Information Processing Systems 26, pages 2355-2363. MIT Press, 2013.

@incollection{JerHalSon_nips13,
  author = {Yacine Jernite and Yoni Halpern and David Sontag},
  title = {Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests},
  booktitle = {Advances in Neural Information Processing Systems 26},
  pages = {2355--2363},
  publisher = {MIT Press},
  year = {2013},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerHalSon_nips13.pdf}
}

We give a polynomial-time algorithm for provably learning the structure and parameters of bipartite noisy-or Bayesian networks of binary variables where the top layer is completely hidden. Unsupervised learning of these models is a form of discrete factor analysis, enabling the discovery of hidden variables and their causal relationships with observed data. We obtain an efficient learning algorithm for a family of Bayesian networks that we call quartet-learnable. For each latent variable, the existence of a singly-coupled quartet allows us to uniquely identify and learn all parameters involving that latent variable. We give a proof of the polynomial sample complexity of our learning algorithm, and experimentally compare it to variational EM.

Predicting Chief Complaints at Triage Time in the Emergency Department.
Jernite, Y.; Halpern, Y.; Horng, S.; and Sontag, D.
NIPS Workshop on Machine Learning for Clinical Data Analysis and Healthcare, 2013.

@article{JerniteEtAl_nips13health,
  author = {Yacine Jernite and Yoni Halpern and Steven Horng and David Sontag},
  title = {Predicting Chief Complaints at Triage Time in the Emergency Department},
  journal = {NIPS Workshop on Machine Learning for Clinical Data Analysis and Healthcare},
  year = {2013},
  keywords = {Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerniteEtAl_nips13health.pdf}
}

As hospitals increasingly use electronic medical records for research and quality improvement, it is important to provide ways to structure medical data without losing either expressiveness or time. We present a system that helps achieve this goal by building an extended ontology of chief complaints and automatically predicting a patient's chief complaint, based on their vitals and the nurses' description of their state at arrival.

Unsupervised Learning of Noisy-Or Bayesian Networks.
Halpern, Y.; and Sontag, D.
In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence (UAI-13), pages 272-281, Corvallis, Oregon, 2013. AUAI Press.

@inproceedings{HalpernSontag_uai13,
  author = {Yoni Halpern and David Sontag},
  title = {Unsupervised Learning of Noisy-Or Bayesian Networks},
  booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence ({UAI}-13)},
  publisher = {AUAI Press},
  address = {Corvallis, Oregon},
  pages = {272--281},
  year = {2013},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernSontag_uai13.pdf}
}

This paper considers the problem of learning the parameters in Bayesian networks of discrete variables with known structure and hidden variables. Previous approaches in these settings typically use expectation maximization; when the network has high treewidth, the required expectations might be approximated using Monte Carlo or variational methods. We show how to avoid inference altogether during learning by giving a polynomial-time algorithm based on the method-of-moments, building upon recent work on learning discrete-valued mixture models. In particular, we show how to learn the parameters for a family of bipartite noisy-or Bayesian networks. In our experimental results, we demonstrate an application of our algorithm to learning QMR-DT, a large Bayesian network used for medical diagnosis. We show that it is possible to fully learn the parameters of QMR-DT even when only the findings are observed in the training data (ground truth diseases unknown).

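For readers unfamiliar with the model family in these two noisy-or papers, the conditional distribution being learned is P(finding = 1 | diseases d) = 1 - (1 - leak) * prod_i (1 - p_i)^(d_i), where p_i is the probability that disease i alone activates the finding. The sketch below evaluates it with hypothetical parameters.

# The standard noisy-or conditional used in bipartite diagnosis networks.
import numpy as np

def noisy_or(d, p, leak=0.01):
    """P(finding = 1 | binary disease indicators d), per-disease strengths p."""
    return 1.0 - (1.0 - leak) * np.prod((1.0 - p) ** d)

p = np.array([0.8, 0.3, 0.5])                 # hypothetical disease->finding strengths
print(noisy_or(np.array([1, 0, 1]), p))       # two active diseases
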
A Comparison of Dimensionality Reduction Techniques for Unstructured Clinical Text.
Halpern, Y.; Horng, S.; Nathanson, L. A.; Shapiro, N. I.; and Sontag, D.
ICML 2012 Workshop on Clinical Data Analysis, 2012.

@article{HalpernEtAl_ICML_clinical_workshop12,
  author = {Yoni Halpern and Steven Horng and Larry A. Nathanson and Nathan I. Shapiro and David Sontag},
  title = {A Comparison of Dimensionality Reduction Techniques for Unstructured Clinical Text},
  journal = {ICML 2012 Workshop on Clinical Data Analysis},
  year = {2012},
  keywords = {Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernEtAl_icml12_workshop.pdf}
}

Much clinical data is free text, which is challenging to use together with machine learning, visualization tools, and clinical decision rules. In this paper, we compare supervised and unsupervised dimensionality reduction techniques, including the recently proposed sLDA and MedLDA algorithms, on clinical texts. We evaluate each dimensionality reduction method by using them as features for two important prediction problems that arise in emergency departments: predicting whether a patient has an infection, which can progress to sepsis, and predicting the likelihood of a patient being admitted to the Intensive Care Unit (used for risk stratification). We find that, on this data, existing supervised dimensionality reduction techniques perform better than unsupervised techniques only for very low-dimensional representations.

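The evaluation protocol here is "reduce, then predict": topic proportions (or other low-dimensional features) become the input to a downstream classifier. The sketch below shows that protocol with plain LDA on toy notes; the supervised variants the paper evaluates (sLDA, MedLDA) are not in scikit-learn, so this is an illustration of the pipeline only.

# Sketch: topic proportions as low-dimensional features for clinical prediction.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression

notes = ["fever cough hypoxia", "ankle swelling after fall",
         "fever chills rigors", "knee pain twisting injury"]
labels = [1, 0, 1, 0]                       # e.g., infection vs. not (hypothetical)

counts = CountVectorizer().fit_transform(notes)
theta = LatentDirichletAllocation(n_components=2, random_state=0).fit_transform(counts)
clf = LogisticRegression().fit(theta, labels)   # downstream predictor on topic features
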
\n \n\n \n \n \n \n \n \n Probabilistic Modeling of Systematic Errors in Two-Hybrid Experiments.\n \n \n \n \n\n\n \n Sontag, D.; Singh, R.; and Berger, B.\n\n\n \n\n\n\n In Pacific Symposium on Biocomputing, volume 12, pages 445-457, 2007. \n \n\n\n\n
\n\n\n\n \n \n \"Probabilistic paper\n  \n \n \n \"Probabilistic link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SonSinBer_psb07,\n title  = {Probabilistic Modeling of Systematic Errors in Two-Hybrid Experiments},\n author = {David Sontag and Rohit Singh and Bonnie Berger},\n booktitle = {Pacific Symposium on Biocomputing},\n volume  = {12},\n year   = {2007},\n pages  = {445-457},\n keywords = {Computational biology, Health care},\n url_Paper = {http://psb.stanford.edu/psb-online/proceedings/psb07/sontag.pdf},\n url_Link = {http://groups.csail.mit.edu/cb/probmod2H/},\n abstract = {We describe a novel probabilistic approach to estimating errors in two-hybrid (2H) experiments. Such experiments are frequently used to elucidate protein-protein interaction networks in a high-throughput fashion; however, a significant challenge with these is their relatively high error rate, specifically, a high false-positive rate. We describe a comprehensive error model for 2H data, accounting for both random\nand systematic errors. The latter arise from limitations of the 2H experimental protocol: in theory, the reporting mechanism of a 2H experiment should be activated if and only if the two proteins being tested truly interact; in practice, even in the absence of a true interaction, it may be activated by some proteins -- either by themselves or through promiscuous interaction with other proteins. We describe a probabilistic relational model that explicitly models the above phenomenon and use Markov Chain Monte Carlo (MCMC) algorithms to compute both the probability of an observed 2H interaction being true as well as the probability of individual proteins being self-activating/promiscuous. This is the first approach that explicitly models systematic errors in protein-protein interaction data; in contrast, previous work on this topic has modeled errors as being independent and random. By explicitly modeling the sources of noise in 2H systems, we find that we are better able to make use of the available experimental data. In comparison with Bader et al.’s method for estimating confidence in 2H predicted interactions, the proposed method performed 5-10\\% better overall, and in particular regimes improved prediction accuracy by as much as 76\\%.}\n}\n\n
\n
\n\n\n
\n We describe a novel probabilistic approach to estimating errors in two-hybrid (2H) experiments. Such experiments are frequently used to elucidate protein-protein interaction networks in a high-throughput fashion; however, a significant challenge with these is their relatively high error rate, specifically, a high false-positive rate. We describe a comprehensive error model for 2H data, accounting for both random and systematic errors. The latter arise from limitations of the 2H experimental protocol: in theory, the reporting mechanism of a 2H experiment should be activated if and only if the two proteins being tested truly interact; in practice, even in the absence of a true interaction, it may be activated by some proteins – either by themselves or through promiscuous interaction with other proteins. We describe a probabilistic relational model that explicitly models the above phenomenon and use Markov Chain Monte Carlo (MCMC) algorithms to compute both the probability of an observed 2H interaction being true as well as the probability of individual proteins being self-activating/promiscuous. This is the first approach that explicitly models systematic errors in protein-protein interaction data; in contrast, previous work on this topic has modeled errors as being independent and random. By explicitly modeling the sources of noise in 2H systems, we find that we are better able to make use of the available experimental data. In comparison with Bader et al.’s method for estimating confidence in 2H predicted interactions, the proposed method performed 5-10% better overall, and in particular regimes improved prediction accuracy by as much as 76%.\n
\n\n\n
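As a back-of-the-envelope companion to the abstract above, the following sketch shows why explicitly modeling self-activating baits changes the posterior belief in an observed 2H interaction. All rates below are hypothetical placeholders; the paper infers such quantities jointly with MCMC over a relational model rather than fixing them by hand.

# Hypothetical illustration of why modeling self-activation matters in 2H data.
# The numbers are made up for the example; they are not estimates from the paper.

prior_true = 0.05          # prior probability that a candidate pair truly interacts
p_report_given_true = 0.80 # reporter activates when a true interaction exists
p_report_random = 0.05     # random (independent) false-positive rate
p_self_activating = 0.10   # probability a bait protein is self-activating

# If the bait self-activates, the reporter fires regardless of any true interaction.
p_report_given_false = p_self_activating + (1 - p_self_activating) * p_report_random

# Bayes' rule: P(interaction is true | reporter fired)
evidence = prior_true * p_report_given_true + (1 - prior_true) * p_report_given_false
posterior_true = prior_true * p_report_given_true / evidence
print(f"P(true | 2H positive) = {posterior_true:.3f}")   # roughly 0.23 here

Even a positive readout leaves the interaction more likely false than true under these rates, which is why pooling evidence across the network, as the paper does, pays off.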
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n Information retrieval\n \n \n (2)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Probabilistic models for personalizing web search.\n \n \n \n \n\n\n \n Sontag, D.; Collins-Thompson, K.; Bennett, P. N.; White, R. W.; Dumais, S.; and Billerbeck, B.\n\n\n \n\n\n\n In Proceedings of the Fifth ACM International Conference on Web Search and Data Mining, of WSDM '12, pages 433–442, New York, NY, USA, 2012. ACM\n \n\n\n\n
\n\n\n\n \n \n \"Probabilistic paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{Sontag_wsdm12,\n author = {David Sontag and Kevyn Collins-Thompson and Paul N. Bennett and Ryen W. White and Susan Dumais and Bodo Billerbeck},\n title = {Probabilistic models for personalizing web search},\n booktitle = {Proceedings of the Fifth {ACM} International Conference on Web Search and Data Mining},\n series = {WSDM '12},\n year = {2012},\n isbn = {978-1-4503-0747-5},\n location = {Seattle, Washington, USA},\n pages = {433--442},\n numpages = {10},\n doi = {http://doi.acm.org/10.1145/2124295.2124348},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {Information retrieval},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_wsdm12.pdf},\n abstract = {We present a new approach for personalizing Web search results to a specific user. Ranking functions for Web search engines are typically trained by machine learning algorithms using either direct human relevance judgments or indirect judgments obtained from click-through data from millions of users. The rankings are thus optimized to this generic population of users, not to any specific user. We propose a generative model of relevance which can be used to infer the relevance of a document to a specific user for a search query. The user-specific parameters of this generative model constitute a compact user profile. We show how to learn these profiles from a user's long-term search history. Our algorithm for computing the personalized ranking is simple and has little computational overhead. We evaluate our personalization approach using historical search data from thousands of users of a major Web search engine. Our findings demonstrate gains in retrieval performance for queries with high ambiguity, with particularly large improvements for acronym queries.}\n}\n\n
\n
\n\n\n
\n We present a new approach for personalizing Web search results to a specific user. Ranking functions for Web search engines are typically trained by machine learning algorithms using either direct human relevance judgments or indirect judgments obtained from click-through data from millions of users. The rankings are thus optimized to this generic population of users, not to any specific user. We propose a generative model of relevance which can be used to infer the relevance of a document to a specific user for a search query. The user-specific parameters of this generative model constitute a compact user profile. We show how to learn these profiles from a user's long-term search history. Our algorithm for computing the personalized ranking is simple and has little computational overhead. We evaluate our personalization approach using historical search data from thousands of users of a major Web search engine. Our findings demonstrate gains in retrieval performance for queries with high ambiguity, with particularly large improvements for acronym queries.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Personalizing web search results by reading level.\n \n \n \n \n\n\n \n Collins-Thompson, K.; Bennett, P. N.; White, R. W.; de la Chica, S.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 20th ACM International Conference on Information and Knowledge Management, of CIKM '11, pages 403–412, New York, NY, USA, 2011. ACM\n \n\n\n\n
\n\n\n\n \n \n \"Personalizing paper\n  \n \n\n \n \n doi\n  \n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{CollinsThompsonCIKM2011,\n author = {Kevyn Collins-Thompson and Paul N. Bennett and Ryen W. White and Sebastian de la Chica and David Sontag},\n title = {Personalizing web search results by reading level},\n booktitle = {Proceedings of the 20th {ACM} International Conference on Information and Knowledge Management},\n series = {CIKM '11},\n year = {2011},\n isbn = {978-1-4503-0717-8},\n location = {Glasgow, Scotland, UK},\n pages = {403--412},\n numpages = {10},\n doi = {http://doi.acm.org/10.1145/2063576.2063639},\n publisher = {ACM},\n address = {New York, NY, USA},\n keywords = {Information retrieval},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/CollinsThompsonCIKM2011.pdf},\n abstract = {Traditionally, search engines have ignored the reading difficulty of documents and the reading proficiency of users in computing a document ranking. This is one reason why Web search engines do a poor job of serving an important segment of the population: children. While there are many important problems in interface design, content filtering, and results presentation related to addressing children's search needs, perhaps the most fundamental challenge is simply that of providing relevant results at the right level of reading difficulty. At the opposite end of the proficiency spectrum, it may also be valuable for technical users to find more advanced material or to filter out material at lower levels of difficulty, such as tutorials and introductory texts. We show how reading level can provide a valuable new relevance signal for both general and personalized Web search. We describe models and algorithms to address the three key problems in improving relevance for search using reading difficulty: estimating user proficiency, estimating result difficulty, and re-ranking based on the difference between user and result reading level profiles. We evaluate our methods on a large volume of Web query traffic and provide a large-scale log analysis that highlights the importance of finding results at an appropriate reading level for the user.}\n}\n\n
\n
\n\n\n
\n Traditionally, search engines have ignored the reading difficulty of documents and the reading proficiency of users in computing a document ranking. This is one reason why Web search engines do a poor job of serving an important segment of the population: children. While there are many important problems in interface design, content filtering, and results presentation related to addressing children's search needs, perhaps the most fundamental challenge is simply that of providing relevant results at the right level of reading difficulty. At the opposite end of the proficiency spectrum, it may also be valuable for technical users to find more advanced material or to filter out material at lower levels of difficulty, such as tutorials and introductory texts. We show how reading level can provide a valuable new relevance signal for both general and personalized Web search. We describe models and algorithms to address the three key problems in improving relevance for search using reading difficulty: estimating user proficiency, estimating result difficulty, and re-ranking based on the difference between user and result reading level profiles. We evaluate our methods on a large volume of Web query traffic and provide a large-scale log analysis that highlights the importance of finding results at an appropriate reading level for the user.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n Machine learning\n \n \n (47)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Max-margin learning with the Bayes Factor.\n \n \n \n \n\n\n \n Krishnan, R. G.; Khandelwal, A.; Ranganath, R.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the Conference on Uncertainty in Artificial Intelligence (UAI), 2018. \n \n\n\n\n
\n\n\n\n \n \n \"Max-margin paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KrishnanEtAl_uai18,\n  author = {Rahul G. Krishnan and Arjun Khandelwal and Rajesh Ranganath and David Sontag},\n  title = {Max-margin learning with the Bayes Factor},\n  booktitle = {Proceedings of the Conference on Uncertainty in Artificial Intelligence ({UAI})},\n  year = {2018},\n  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},\n  abstract = {We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network, that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_UAI18.pdf}\n}\n\n
\n
\n\n\n
\n We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network, that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.\n
\n\n\n
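To give a concrete feel for "similarity as a Bayes Factor", here is a toy analogue in which the hierarchical model is a one-dimensional Gaussian, so the factor is available in closed form. The variances tau2 and sigma2 are assumptions of this sketch; the paper's contribution is an amortized, max-margin-trained estimator of this quantity inside a deep generative model, which the sketch does not attempt.

# Toy analogue of judging similarity of two datapoints via the Bayes Factor.
# Hierarchical model: z ~ N(0, tau2), x_i | z ~ N(z, sigma2).
import numpy as np
from scipy.stats import multivariate_normal

tau2, sigma2 = 1.0, 0.25   # prior and observation variances (assumed)

def log_bayes_factor(x1, x2):
    # H1: x1 and x2 share one z  ->  correlated Gaussian marginal
    cov_shared = np.array([[tau2 + sigma2, tau2],
                           [tau2, tau2 + sigma2]])
    log_p_h1 = multivariate_normal(mean=[0, 0], cov=cov_shared).logpdf([x1, x2])
    # H0: independent z's  ->  independent Gaussian marginals
    cov_indep = np.diag([tau2 + sigma2, tau2 + sigma2])
    log_p_h0 = multivariate_normal(mean=[0, 0], cov=cov_indep).logpdf([x1, x2])
    return log_p_h1 - log_p_h0

print(log_bayes_factor(0.9, 1.1))   # similar points  -> positive log-BF
print(log_bayes_factor(0.9, -1.5))  # dissimilar pair -> negative log-BF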
\n\n\n
\n \n\n \n \n \n \n \n \n Semi-Amortized Variational Autoencoders.\n \n \n \n \n\n\n \n Kim, Y.; Wiseman, S.; Miller, A. C.; Sontag, D.; and Rush, A. M.\n\n\n \n\n\n\n In Proceedings of the 35th International Conference on Machine Learning (ICML), 2018. \n \n\n\n\n
\n\n\n\n \n \n \"Semi-Amortized paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KimEtAl_icml18,\n  author    = {Yoon Kim and Sam Wiseman and Andrew C. Miller and David Sontag and Alexander M. Rush},\n  title = {Semi-Amortized Variational Autoencoders},\n  booktitle = {Proceedings of the 35th International Conference on Machine Learning ({ICML})},\n  year = 2018,\n  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},\n  url_Paper = {https://arxiv.org/pdf/1802.02550.pdf},\n  abstract = {Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach that uses AVI to initialize the variational parameters and runs stochastic variational inference (SVI) to refine them. Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.}\n}\n\n
\n
\n\n\n
\n Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach that uses AVI to initialize the variational parameters and runs stochastic variational inference (SVI) to refine them. Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.\n
\n\n\n
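The core loop is easy to sketch. Below is a minimal PyTorch rendition of the hybrid scheme on a Gaussian-latent VAE with made-up linear encoder/decoder, step size, and step count; unlike the paper, it does not backpropagate through the SVI refinement, which is the step that makes the full method end-to-end trainable.

# Minimal semi-amortized sketch: amortized initialization + per-instance SVI steps.
import torch

D, Z = 20, 5
enc = torch.nn.Linear(D, 2 * Z)               # toy encoder standing in for the paper's
dec = torch.nn.Linear(Z, D)                   # toy decoder
encoder = lambda x: enc(x).chunk(2, dim=-1)   # returns (mu, logvar)

def elbo(x, mu, logvar):
    # Single-sample reparameterized ELBO with a Gaussian-style likelihood.
    z = mu + torch.randn_like(mu) * (0.5 * logvar).exp()
    log_px = -((x - dec(z)) ** 2).sum(-1)
    kl = 0.5 * (mu ** 2 + logvar.exp() - 1 - logvar).sum(-1)
    return (log_px - kl).mean()

def refine(x, steps=10, lr=0.1):
    mu, logvar = encoder(x)                        # AVI: amortized initialization...
    mu = mu.detach().requires_grad_(True)          # ...then SVI refines the variational
    logvar = logvar.detach().requires_grad_(True)  # parameters for this instance only
    opt = torch.optim.SGD([mu, logvar], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        (-elbo(x, mu, logvar)).backward()
        opt.step()
    return mu, logvar

mu, logvar = refine(torch.randn(8, D))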
\n\n\n
\n \n\n \n \n \n \n \n \n Why Is My Classifier Discriminatory?.\n \n \n \n \n\n\n \n Chen, I.; Johansson, F. D.; and Sontag, D.\n\n\n \n\n\n\n ArXiv e-prints arXiv:1805.12002. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Why paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 8 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@article{ChenJohanssonSontag_arxiv18,\n  author = {Irene Chen and Fredrik D. Johansson and David Sontag},\n  title = {Why Is My Classifier Discriminatory?},\n  journal = {ArXiv e-prints arXiv:1805.12002},\n  archivePrefix = "arXiv",\n  eprint = {1805.12002},\n  primaryClass = "stat.ML",\n  year = 2018,\n  keywords = {Machine learning, Health care},\n  url_Paper = {https://arxiv.org/pdf/1805.12002.pdf},\n  abstract = {Recent attempts to achieve fairness in predictive models focus on the balance between fairness and accuracy. In sensitive applications such as healthcare or criminal justice, this trade-off is often undesirable as any increase in prediction error could have devastating consequences. In this work, we argue that the fairness of predictions should be evaluated in the context of the data, and that unfairness induced by inadequate sample sizes or unmeasured predictive variables should be addressed through data collection, rather than by constraining the model. We decompose cost-based metrics of discrimination into bias, variance, and noise, and propose actions aimed at estimating and reducing each term. Finally, we perform case studies on the prediction of income, mortality, and review ratings, confirming the value of this analysis. We find that data collection is often a means to reduce discrimination without sacrificing accuracy.}\n}\n\n\n
\n
\n\n\n
\n Recent attempts to achieve fairness in predictive models focus on the balance between fairness and accuracy. In sensitive applications such as healthcare or criminal justice, this trade-off is often undesirable as any increase in prediction error could have devastating consequences. In this work, we argue that the fairness of predictions should be evaluated in the context of the data, and that unfairness induced by inadequate sample sizes or unmeasured predictive variables should be addressed through data collection, rather than by constraining the model. We decompose cost-based metrics of discrimination into bias, variance, and noise, and propose actions aimed at estimating and reducing each term. Finally, we perform case studies on the prediction of income, mortality, and review ratings, confirming the value of this analysis. We find that data collection is often a means to reduce discrimination without sacrificing accuracy.\n
\n\n\n
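The bias-variance-noise viewpoint is easy to reproduce on synthetic data. The sketch below uses squared loss, an invented true function, and two groups that differ only in label noise; it estimates each group's bias and variance by refitting a small model on many resampled training sets, in the spirit of (but much simpler than) the paper's case studies.

# Per-group decomposition of expected squared error into noise + bias^2 + variance.
import numpy as np

rng = np.random.default_rng(0)
f = lambda x: np.sin(3 * x)                  # invented true regression function
noise_sd = {"A": 0.1, "B": 0.5}              # group B has noisier labels (assumed)
x_test = np.linspace(0, 1, 50)

for group, sd in noise_sd.items():
    preds = []
    for _ in range(200):                     # many training resamples
        x_tr = rng.uniform(0, 1, 30)         # a group could also get fewer samples
        y_tr = f(x_tr) + rng.normal(0, sd, x_tr.shape)
        coef = np.polyfit(x_tr, y_tr, deg=3) # a simple cubic model
        preds.append(np.polyval(coef, x_test))
    preds = np.array(preds)
    bias2 = ((preds.mean(0) - f(x_test)) ** 2).mean()
    variance = preds.var(0).mean()
    noise = sd ** 2                          # irreducible error for this group
    print(f"group {group}: noise={noise:.3f}  bias^2={bias2:.3f}  var={variance:.3f}")

The printout makes the paper's point directly: group B's extra error is dominated by noise and variance, which call for more or better data, not for constraining the model.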
\n\n\n
\n \n\n \n \n \n \n \n \n Optimality of Approximate Inference Algorithms on Stable Instances.\n \n \n \n \n\n\n \n Lang, H.; Sontag, D.; and Vijayaraghavan, A.\n\n\n \n\n\n\n In Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS), 2018. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"Optimality paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LangEtAl_aistats18,\n title = {Optimality of Approximate Inference Algorithms on Stable Instances},\n author = {Hunter Lang and David Sontag and Aravindan Vijayaraghavan},\n booktitle = {Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS)},\n publisher = {JMLR: W\\&CP},\n year = {2018},\n keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},\n url_Paper = {http://proceedings.mlr.press/v84/lang18a.html},\n abstract = {Approximate algorithms for structured prediction problems -- such as LP relaxations and the popular alpha-expansion algorithm (Boykov et al. 2001) -- typically far exceed their theoretical performance guarantees on real-world instances. These algorithms often find solutions that are very close to optimal. The goal of this paper is to partially explain the performance of alpha-expansion and an LP relaxation algorithm on MAP inference in Ferromagnetic Potts models (FPMs). Our main results give stability conditions under which these two algorithms provably recover the optimal MAP solution. These theoretical results complement numerous empirical observations of good performance.}\n}\n\n
\n
\n\n\n
\n Approximate algorithms for structured prediction problems – such as LP relaxations and the popular alpha-expansion algorithm (Boykov et al. 2001) – typically far exceed their theoretical performance guarantees on real-world instances. These algorithms often find solutions that are very close to optimal. The goal of this paper is to partially explain the performance of alpha-expansion and an LP relaxation algorithm on MAP inference in Ferromagnetic Potts models (FPMs). Our main results give stability conditions under which these two algorithms provably recover the optimal MAP solution. These theoretical results complement numerous empirical observations of good performance.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Weighted Representations for Generalization Across Designs.\n \n \n \n \n\n\n \n Johansson, F. D.; Kallus, N.; Shalit, U.; and Sontag, D.\n\n\n \n\n\n\n ArXiv e-prints arXiv:1802.08598. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JohanssonEtAl_arxiv18,\n  author    = {Fredrik D. Johansson and Nathan Kallus and Uri Shalit and David Sontag},\n  title = {Learning Weighted Representations for Generalization Across Designs},\n  journal = {ArXiv e-prints arXiv:1802.08598},\narchivePrefix = "arXiv",\n   eprint = {1802.08598},\n primaryClass = "stat.ML",\n     year = 2018,\n keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1802.08598.pdf},\n  abstract = {Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.}\n}\n\n
\n
\n\n\n
\n Predictive models that generalize well under distributional shift are often desirable and sometimes crucial to building robust and reliable machine learning applications. We focus on distributional shift that arises in causal inference from observational data and in unsupervised domain adaptation. We pose both of these problems as prediction under a shift in design. Popular methods for overcoming distributional shift make unrealistic assumptions such as having a well-specified model or knowing the policy that gave rise to the observed data. Other methods are hindered by their need for a pre-specified metric for comparing observations, or by poor asymptotic properties. We devise a bound on the generalization error under design shift, incorporating both representation learning and sample re-weighting. Based on the bound, we propose an algorithmic framework that does not require any of the above assumptions and which is asymptotically consistent. We empirically study the new framework using two synthetic datasets, and demonstrate its effectiveness compared to previous methods.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Topic Models - Provably and Efficiently.\n \n \n \n \n\n\n \n Arora, S.; Ge, R.; Halpern, Y.; Mimno, D.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M.\n\n\n \n\n\n\n Communications of the ACM, 61(4): 85-93. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{AroraEtAl_CACM18,\n  author    = {Sanjeev Arora and Rong Ge and Yoni Halpern and David Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},\n  title     = {Learning Topic Models - Provably and Efficiently},\n  journal = {Communications of the {ACM}},\n  year = {2018},\n  volume = {61},\n  number = {4},\n  pages  = {85-93},\n  keywords = {Machine learning, Unsupervised learning, Topic models},\n  url_Paper = {https://cacm.acm.org/magazines/2018/4/226373-learning-topic-models-provably-and-efficiently/fulltext},\n}\n\n
\n
\n\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Recurrent Neural Networks for Multivariate Time Series with Missing Values.\n \n \n \n \n\n\n \n Che, Z.; Purushotham, S.; Cho, K.; Sontag, D.; and Liu, Y.\n\n\n \n\n\n\n Nature Scientific Reports, 8(1): 6085. 2018.\n \n\n\n\n
\n\n\n\n \n \n \"Recurrent paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 5 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{CheEtAl_nature_sr18,\n\tAuthor = {Che, Zhengping and Purushotham, Sanjay and Cho, Kyunghyun and Sontag, David and Liu, Yan},\n\tJournal = {Nature Scientific Reports},\n\tNumber = {1},\n\tPages = {6085},\n\tTitle = {Recurrent Neural Networks for Multivariate Time Series with Missing Values},\n\tVolume = {8},\n\tYear = {2018},\n        keywords = {Health care, Machine learning, Deep learning},\n        url_Paper = {https://www.nature.com/articles/s41598-018-24271-9},\n\tabstract = {Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.},\n}\n\n
\n
\n\n\n
\n Multivariate time series data in practical applications, such as health care, geoscience, and biology, are characterized by a variety of missing values. In time series prediction and other related tasks, it has been noted that missing values and their missing patterns are often correlated with the target labels, a.k.a., informative missingness. There is very limited work on exploiting the missing patterns for effective imputation and improving prediction performance. In this paper, we develop novel deep learning models, namely GRU-D, as one of the early attempts. GRU-D is based on Gated Recurrent Unit (GRU), a state-of-the-art recurrent neural network. It takes two representations of missing patterns, i.e., masking and time interval, and effectively incorporates them into a deep model architecture so that it not only captures the long-term temporal dependencies in time series, but also utilizes the missing patterns to achieve better prediction results. Experiments of time series classification tasks on real-world clinical datasets (MIMIC-III, PhysioNet) and synthetic datasets demonstrate that our models achieve state-of-the-art performance and provide useful insights for better understanding and utilization of missing values in time series analysis.\n
\n\n\n
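The paper's trainable input-decay mechanism is compact enough to sketch directly. The NumPy forward pass below shows how GRU-D imputes a missing input as an interpolation between the last observed value and the feature's empirical mean, with a decay rate driven by the time gap; parameter values are placeholders, and the surrounding GRU (which also consumes the mask and applies a similar decay to its hidden state) is omitted.

# Forward-pass sketch of GRU-D's input decay (parameters w_gamma, b_gamma are learned
# in the real model; here they are just arrays of the right shape).
import numpy as np

def decay_impute(x, mask, delta, x_mean, w_gamma, b_gamma):
    """x, mask, delta: (T, D) arrays of values, observed-indicators, and time since
    each feature was last observed; x_mean: (D,) empirical feature means."""
    x_hat = np.empty_like(x)
    x_last = x_mean.copy()                  # before any observation, fall back to the mean
    for t in range(x.shape[0]):
        # gamma decays from 1 toward 0 as the gap since the last observation grows
        gamma = np.exp(-np.maximum(0.0, w_gamma * delta[t] + b_gamma))
        imputed = gamma * x_last + (1 - gamma) * x_mean
        x_hat[t] = mask[t] * x[t] + (1 - mask[t]) * imputed
        x_last = np.where(mask[t] == 1, x[t], x_last)
    return x_hat                            # fed to the GRU together with the mask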
\n\n\n
\n \n\n \n \n \n \n \n \n Causal Effect Inference with Deep Latent-Variable Models.\n \n \n \n \n\n\n \n Louizos, C.; Shalit, U.; Mooij, J.; Sontag, D.; Zemel, R. S.; and Welling, M.\n\n\n \n\n\n\n In Proceedings of the 31st International Conference on Neural Information Processing Systems, of NIPS'17, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Causal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 12 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LouizosEtAl_arxiv17,\n  author    = {Christos Louizos and\n               Uri Shalit and\n               Joris Mooij and\n               David Sontag and\n               Richard S. Zemel and\n               Max Welling},\n  title     = {Causal Effect Inference with Deep Latent-Variable Models},\n booktitle = {Proceedings of the 31st International Conference on Neural Information Processing Systems},\n series = {NIPS'17},\n year = {2017},\n keywords = {Machine learning, Causal inference, Deep learning},\n url_Paper = {https://arxiv.org/pdf/1705.08821.pdf},\n abstract = {Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.}\n}\n\n
\n
\n\n\n
\n Learning individual-level causal effects from observational data, such as inferring the most effective medication for a specific patient, is a problem of growing importance for policy makers. The most important aspect of inferring causal effects from observational data is the handling of confounders, factors that affect both an intervention and its outcome. A carefully designed observational study attempts to measure all important confounders. However, even if one does not have direct access to all confounders, there may exist noisy and uncertain measurement of proxies for confounders. We build on recent advances in latent variable modelling to simultaneously estimate the unknown latent space summarizing the confounders and the causal effect. Our method is based on Variational Autoencoders (VAE) which follow the causal structure of inference with proxies. We show our method is significantly more robust than existing methods, and matches the state-of-the-art on previous benchmarks focused on individual treatment effects.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Grounded Recurrent Neural Networks.\n \n \n \n \n\n\n \n Vani, A.; Jernite, Y.; and Sontag, D.\n\n\n \n\n\n\n ArXiv e-prints arXiv:1705.08557. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"Grounded paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{VaniEtAl_arxiv17,\n   author = {{Vani}, A. and {Jernite}, Y. and {Sontag}, D.},\n    title = "{Grounded Recurrent Neural Networks}",\n  journal = {ArXiv e-prints arXiv:1705.08557},\narchivePrefix = "arXiv",\n   eprint = {1705.08557},\n primaryClass = "stat.ML",\n     year = 2017,\n  keywords = {Machine learning, Health care, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.08557.pdf},\n  abstract = {In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process "grounding"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.}\n}\n\n
\n
\n\n\n
\n In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process \"grounding\"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning.\n \n \n \n \n\n\n \n Jernite, Y.; Bowman, S. R; and Sontag, D.\n\n\n \n\n\n\n arXiv preprint arXiv:1705.00557. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"Discourse-Based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JerniteEtAl_arxiv17,\n  title={Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning},\n  author={Jernite, Yacine and Bowman, Samuel R and Sontag, David},\n  journal={arXiv preprint arXiv:1705.00557},\n  year={2017},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.00557.pdf},\n  abstract = {This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.}\n}\n\n\n
\n
\n\n\n
\n This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Estimating individual treatment effect: generalization bounds and algorithms.\n \n \n \n \n\n\n \n Shalit, U.; Johansson, F. D.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 34th International Conference on Machine Learning, pages 3076-3085, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Estimating paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 7 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{ShalitEtAl_icml17,\n  author    = {Uri Shalit and Fredrik D. Johansson and David Sontag},\n  title     = {Estimating individual treatment effect: generalization bounds and algorithms},\n  booktitle = {Proceedings of the 34th International Conference on Machine Learning},\n  pages     = {3076-3085},\n  year      = {2017},\n  keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {http://arxiv.org/pdf/1606.03976.pdf},\n  abstract = {There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a "balanced" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.}\n}\n\n
\n
\n\n\n
\n There is intense interest in applying machine learning to problems of causal inference in fields such as healthcare, economics and education. In particular, individual-level causal inference has important applications such as precision medicine. We give a new theoretical analysis and family of algorithms for predicting individual treatment effect (ITE) from observational data, under the assumption known as strong ignorability. The algorithms learn a \"balanced\" representation such that the induced treated and control distributions look similar. We give a novel, simple and intuitive generalization-error bound showing that the expected ITE estimation error of a representation is bounded by a sum of the standard generalization-error of that representation and the distance between the treated and control distributions induced by the representation. We use Integral Probability Metrics to measure distances between distributions, deriving explicit bounds for the Wasserstein and Maximum Mean Discrepancy (MMD) distances. Experiments on real and simulated data show the new algorithms match or outperform the state-of-the-art.\n
\n\n\n
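The algorithmic skeleton shared by this line of work fits in a few lines of PyTorch. The sketch below trains nothing; it only assembles the objective: a factual prediction loss plus alpha times a distributional penalty between treated and control representations. For simplicity the penalty is a linear-kernel MMD (squared distance between representation means), one special case of the IPMs analyzed in the paper; layer sizes and alpha are arbitrary.

# Balanced-representation ITE objective: factual loss + alpha * IPM(treated, control).
import torch
import torch.nn as nn

phi = nn.Sequential(nn.Linear(10, 32), nn.ReLU())   # shared representation
h0, h1 = nn.Linear(32, 1), nn.Linear(32, 1)         # per-treatment outcome heads

def ite_loss(x, t, y, alpha=1.0):
    r = phi(x)
    y_hat = torch.where(t == 1, h1(r).squeeze(-1), h0(r).squeeze(-1))
    factual = ((y_hat - y) ** 2).mean()
    # Linear-kernel MMD between group representations; assumes the batch
    # contains both treated and control units.
    mmd = ((r[t == 1].mean(0) - r[t == 0].mean(0)) ** 2).sum()
    return factual + alpha * mmd

x = torch.randn(64, 10); t = torch.randint(0, 2, (64,)); y = torch.randn(64)
print(ite_loss(x, t, y))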
\n\n\n
\n \n\n \n \n \n \n \n \n Simultaneous Learning of Trees and Representations for Extreme Classification and Density Estimation.\n \n \n \n \n\n\n \n Jernite, Y.; Choromanska, A.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 34th International Conference on Machine Learning, pages 1665-1674, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Simultaneous paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JerniteEtAl_icml17,\n  author    = {Yacine Jernite and\n               Anna Choromanska and\n               David Sontag},\n  title     = {Simultaneous Learning of Trees and Representations for Extreme Classification\n               and Density Estimation},\n  booktitle = {Proceedings of the 34th International Conference on Machine Learning},\n  pages     = {1665-1674},\n  year      = {2017},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1610.04658.pdf},\n  abstract = {We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it assumed that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.}\n}\n\n
\n
\n\n\n
\n We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it assumed that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Structured Inference Networks for Nonlinear State Space Models.\n \n \n \n \n\n\n \n Krishnan, R. G.; Shalit, U.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, pages 2101-2109, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Structured paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 3 downloads\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KrishnanEtAl_aaai17,\n  author    = {Rahul G. Krishnan and\n               Uri Shalit and\n               David Sontag},\n  title     = {Structured Inference Networks for Nonlinear State Space Models},\n  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence},\n  pages     = {2101-2109},\n  year      = {2017},\n  keywords = {Machine learning, Unsupervised learning, Deep learning, Health care, Approximate inference in graphical models},\n  url_Paper = {https://arxiv.org/pdf/1609.09869.pdf},\n  abstract = {Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.}\n}\n\n
\n
\n\n\n
\n Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Character-Aware Neural Language Models.\n \n \n \n \n\n\n \n Kim, Y.; Jernite, Y.; Sontag, D.; and Rush, A. M.\n\n\n \n\n\n\n In Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence, pages 2741-2749, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Character-Aware paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KimEtAl_aaai16,\n  author    = {Yoon Kim and\n               Yacine Jernite and\n               David Sontag and\n               Alexander M. Rush},\n  title     = {Character-Aware Neural Language Models},\n  booktitle = {Proceedings of the Thirtieth {AAAI} Conference on Artificial Intelligence},\n  pages     = {2741-2749},\n  year      = {2016},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {http://arxiv.org/pdf/1508.06615.pdf},\n  abstract = {We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60\\% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.}\n}\n\n
\n
\n\n\n
\n We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.\n
\n\n\n
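The architecture described above is straightforward to assemble. Here is a compact PyTorch sketch with illustrative dimensions (the paper's filter counts, widths, and multi-layer highway/LSTM settings differ): character embeddings feed parallel convolutions, max-over-time pooling produces a word encoding, a highway layer mixes transformed and carried features, and a word-level LSTM predicts the next word.

# Character-aware language model sketch: char CNN -> highway -> word LSTM -> softmax.
import torch
import torch.nn as nn

class CharAwareLM(nn.Module):
    def __init__(self, n_chars=60, n_words=10000, c_emb=15,
                 widths=(2, 3, 4), n_filters=25, hidden=128):
        super().__init__()
        feat = n_filters * len(widths)
        self.char_emb = nn.Embedding(n_chars, c_emb)
        self.convs = nn.ModuleList([nn.Conv1d(c_emb, n_filters, w) for w in widths])
        self.gate = nn.Linear(feat, feat)      # highway: transform gate
        self.proj = nn.Linear(feat, feat)      # highway: nonlinear transform
        self.lstm = nn.LSTM(feat, hidden, batch_first=True)
        self.out = nn.Linear(hidden, n_words)

    def forward(self, chars):                  # chars: (batch, seq_len, word_len)
        B, S, W = chars.shape
        e = self.char_emb(chars.view(B * S, W)).transpose(1, 2)   # (B*S, c_emb, W)
        # One feature per filter via max-over-time pooling, concatenated across widths.
        f = torch.cat([conv(e).max(dim=2).values for conv in self.convs], dim=1)
        t = torch.sigmoid(self.gate(f))        # highway mixing of transform and carry
        f = t * torch.relu(self.proj(f)) + (1 - t) * f
        h, _ = self.lstm(f.view(B, S, -1))
        return self.out(h)                     # (batch, seq_len, vocab) logits

logits = CharAwareLM()(torch.randint(0, 60, (4, 12, 9)))
print(logits.shape)    # torch.Size([4, 12, 10000])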
\n\n\n
\n \n\n \n \n \n \n \n \n Tightness of LP Relaxations for Almost Balanced Models.\n \n \n \n \n\n\n \n Weller, A.; Rowland, M.; and Sontag, D.\n\n\n \n\n\n\n In Gretton, A.; and Robert, C. C., editor(s), Proceedings of the 19th International Conference on Artificial Intelligence and Statistics, volume 51, of Proceedings of Machine Learning Research, pages 47-55, Cadiz, Spain, 09–11 May 2016. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Tightness paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{WellerEtAl_aistats16,\n  title = \t {Tightness of LP Relaxations for Almost Balanced Models},\n  author = \t {Adrian Weller and Mark Rowland and David Sontag},\n  booktitle = \t {Proceedings of the 19th International Conference on Artificial Intelligence and Statistics},\n  pages = \t {47-55},\n  year = \t {2016},\n  editor = \t {Arthur Gretton and Christian C. Robert},\n  volume = \t {51},\n  series = \t {Proceedings of Machine Learning Research},\n  address = \t {Cadiz, Spain},\n  month = \t {09--11 May},\n  publisher = \t {PMLR},\n  keywords = {Machine learning, Approximate inference in graphical models},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/WellerEtAl_uai16.pdf},\n  abstract = {Linear programming (LP) relaxations are widely used to attempt to identify a most likely configuration of a discrete graphical model. In some cases, the LP relaxation attains an optimum vertex at an integral location and thus guarantees an exact solution to the original optimization problem. When this occurs, we say that the LP relaxation is tight. Here we consider binary pairwise models and derive sufficient conditions for guaranteed tightness of (i) the standard LP relaxation on the local polytope LP+LOC, and (ii) the LP relaxation on the triplet-consistent polytope LP+TRI (the next level in the Sherali-Adams hierarchy). We provide simple new proofs of earlier results and derive significant novel results including that LP+TRI is tight for any model where each block is balanced or almost balanced, and a decomposition theorem that may be used to break apart complex models into smaller pieces. An almost balanced (sub-)model is one that contains no frustrated cycles except through one privileged variable.}\n}\n\n
\n
\n\n\n
\n Linear programming (LP) relaxations are widely used to attempt to identify a most likely configuration of a discrete graphical model. In some cases, the LP relaxation attains an optimum vertex at an integral location and thus guarantees an exact solution to the original optimization problem. When this occurs, we say that the LP relaxation is tight. Here we consider binary pairwise models and derive sufficient conditions for guaranteed tightness of (i) the standard LP relaxation on the local polytope LP+LOC, and (ii) the LP relaxation on the triplet-consistent polytope LP+TRI (the next level in the Sherali-Adams hierarchy). We provide simple new proofs of earlier results and derive significant novel results including that LP+TRI is tight for any model where each block is balanced or almost balanced, and a decomposition theorem that may be used to break apart complex models into smaller pieces. An almost balanced (sub-)model is one that contains no frustrated cycles except through one privileged variable.\n
\n\n\n
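A small worked example makes the "frustrated cycle" language concrete. The script below builds the standard local-polytope LP for a three-variable binary model in which every edge prefers disagreement, an odd, frustrated cycle, exactly the structure the balance conditions exclude, and shows the relaxation going fractional: the LP value exceeds the score of every integral labeling. Potentials are arbitrary illustrative choices.

# Local-polytope LP relaxation of MAP on a frustrated 3-cycle (scipy.optimize.linprog).
import numpy as np
from scipy.optimize import linprog

nodes = [0, 1, 2]
edges = [(0, 1), (0, 2), (1, 2)]
nv = {(i, s): 2 * i + s for i in nodes for s in (0, 1)}        # node-marginal index
ev, base = {}, 6
for k, (i, j) in enumerate(edges):                             # edge-marginal index
    for a in (0, 1):
        for b in (0, 1):
            ev[(i, j, a, b)] = base + 4 * k + 2 * a + b

n = base + 4 * len(edges)
c = np.zeros(n)
for (i, j) in edges:                   # each edge rewards disagreement (frustration);
    c[ev[(i, j, 0, 1)]] = -1.0         # linprog minimizes, so rewards are negated
    c[ev[(i, j, 1, 0)]] = -1.0

rows, rhs = [], []
for i in nodes:                        # node marginals normalize to one
    r = np.zeros(n); r[nv[(i, 0)]] = r[nv[(i, 1)]] = 1; rows.append(r); rhs.append(1)
for (i, j) in edges:                   # local (pairwise) consistency constraints
    for a in (0, 1):
        r = np.zeros(n); r[ev[(i, j, a, 0)]] = r[ev[(i, j, a, 1)]] = 1
        r[nv[(i, a)]] = -1; rows.append(r); rhs.append(0)
    for b in (0, 1):
        r = np.zeros(n); r[ev[(i, j, 0, b)]] = r[ev[(i, j, 1, b)]] = 1
        r[nv[(j, b)]] = -1; rows.append(r); rhs.append(0)

res = linprog(c, A_eq=np.array(rows), b_eq=np.array(rhs),
              bounds=[(0, 1)] * n, method="highs")
print(-res.fun)    # LP value 3.0, but the best integral labeling scores only 2
print(res.x[:6])   # node pseudomarginals all 1/2: a fractional vertex

Flipping the couplings to reward agreement makes the model balanced, and the same LP then returns an integral optimum, which is the regime the paper's tightness guarantees address.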
\n\n\n
\n \n\n \n \n \n \n \n \n Train and Test Tightness of LP Relaxations in Structured Prediction.\n \n \n \n \n\n\n \n Meshi, O.; Mahdavi, M.; Weller, A.; and Sontag, D.\n\n\n \n\n\n\n In Balcan, M. F.; and Weinberger, K. Q., editor(s), Proceedings of The 33rd International Conference on Machine Learning, volume 48, of Proceedings of Machine Learning Research, pages 1776–1785, New York, New York, USA, 20–22 Jun 2016. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Train paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{MeshiEtAl_icml16,\n  title = \t {Train and Test Tightness of LP Relaxations in Structured Prediction},\n  author = \t {Ofer Meshi and Mehrdad Mahdavi and Adrian Weller and David Sontag},\n  booktitle = \t {Proceedings of The 33rd International Conference on Machine Learning},\n  pages = \t {1776--1785},\n  year = \t {2016},\n  editor = \t {Maria Florina Balcan and Kilian Q. Weinberger},\n  volume = \t {48},\n  series = \t {Proceedings of Machine Learning Research},\n  address = \t {New York, New York, USA},\n  month = \t {20--22 Jun},\n  publisher = \t {PMLR},\n  keywords = {Machine learning, Structured prediction},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/MeshiEtAl_icml16.pdf},\n  abstract = {Structured prediction is used in areas such as computer vision and natural language processing to predict structured outputs such as segmentations or parse trees. In these settings, prediction is performed by MAP inference or, equivalently, by solving an integer linear program. Because of the complex scoring functions required to obtain accurate predictions, both learning and inference typically require the use of approximate solvers. We propose a theoretical explanation to the striking observation that approximations based on linear programming (LP) relaxations are often tight on real-world instances. In particular, we show that learning with LP relaxed inference encourages integrality of training instances, and that tightness generalizes from train to test data.}\n}\n\n
\n
\n\n\n
\n Structured prediction is used in areas such as computer vision and natural language processing to predict structured outputs such as segmentations or parse trees. In these settings, prediction is performed by MAP inference or, equivalently, by solving an integer linear program. Because of the complex scoring functions required to obtain accurate predictions, both learning and inference typically require the use of approximate solvers. We propose a theoretical explanation to the striking observation that approximations based on linear programming (LP) relaxations are often tight on real-world instances. In particular, we show that learning with LP relaxed inference encourages integrality of training instances, and that tightness generalizes from train to test data.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Representations for Counterfactual Inference.\n \n \n \n \n\n\n \n Johansson, F.; Shalit, U.; and Sontag, D.\n\n\n \n\n\n\n In Balcan, M. F.; and Weinberger, K. Q., editor(s), Proceedings of The 33rd International Conference on Machine Learning, volume 48, of Proceedings of Machine Learning Research, pages 3020–3029, New York, New York, USA, 20–22 Jun 2016. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{JohanssonEtAl_icml16,\n  title = \t {Learning Representations for Counterfactual Inference},\n  author = \t {Fredrik Johansson and Uri Shalit and David Sontag},\n  booktitle = \t {Proceedings of The 33rd International Conference on Machine Learning},\n  pages = \t {3020--3029},\n  year = \t {2016},\n  editor = \t {Maria Florina Balcan and Kilian Q. Weinberger},\n  volume = \t {48},\n  series = \t {Proceedings of Machine Learning Research},\n  address = \t {New York, New York, USA},\n  month = \t {20--22 Jun},\n  publisher = \t {PMLR},\n  keywords = {Machine learning, Causal inference, Deep learning},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JohanssonShalitSontag_icml16.pdf},\n  abstract = {Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, "Would this patient have lower blood sugar had she received a different medication?". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.}\n}\n\n
\n
\n\n\n
\n Observational studies are rising in importance due to the widespread accumulation of data in fields such as healthcare, education, employment and ecology. We consider the task of answering counterfactual questions such as, \"Would this patient have lower blood sugar had she received a different medication?\". We propose a new algorithmic framework for counterfactual inference which brings together ideas from domain adaptation and representation learning. In addition to a theoretical justification, we perform an empirical comparison with previous approaches to causal inference from observational data. Our deep learning algorithm significantly outperforms the previous state-of-the-art.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Barrier Frank-Wolfe for Marginal Inference.\n \n \n \n \n\n\n \n Krishnan, R. G.; Lacoste-Julien, S.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 28th International Conference on Neural Information Processing Systems, of NIPS'15, pages 532–540, Cambridge, MA, USA, 2015. MIT Press\n \n\n\n\n
\n\n\n\n \n \n \"Barrier paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KrishnanEtAl_nips15,\n author = {Krishnan, Rahul G. and Lacoste-Julien, Simon and Sontag, David},\n title = {Barrier Frank-Wolfe for Marginal Inference},\n booktitle = {Proceedings of the 28th International Conference on Neural Information Processing Systems},\n series = {NIPS'15},\n year = {2015},\n location = {Montreal, Canada},\n pages = {532--540},\n numpages = {9},\n publisher = {MIT Press},\n address = {Cambridge, MA, USA},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_nips15.pdf},\n abstract = {We introduce a globally-convergent algorithm for optimizing the tree-reweighted (TRW) variational objective over the marginal polytope. The algorithm is based on the conditional gradient method (Frank-Wolfe) and moves pseudomarginals within the marginal polytope through repeated maximum a posteriori (MAP) calls. This modular structure enables us to leverage black-box MAP solvers (both exact and approximate) for variational inference, and obtains more accurate results than tree-reweighted algorithms that optimize over the local consistency relaxation. Theoretically, we bound the sub-optimality for the proposed algorithm despite the TRW objective having unbounded gradients at the boundary of the marginal polytope. Empirically, we demonstrate the increased quality of results found by tightening the relaxation over the marginal polytope as well as the spanning tree polytope on synthetic and real-world instances.}\n} \n\n
\n
\n\n\n
\n We introduce a globally-convergent algorithm for optimizing the tree-reweighted (TRW) variational objective over the marginal polytope. The algorithm is based on the conditional gradient method (Frank-Wolfe) and moves pseudomarginals within the marginal polytope through repeated maximum a posteriori (MAP) calls. This modular structure enables us to leverage black-box MAP solvers (both exact and approximate) for variational inference, and obtains more accurate results than tree-reweighted algorithms that optimize over the local consistency relaxation. Theoretically, we bound the sub-optimality for the proposed algorithm despite the TRW objective having unbounded gradients at the boundary of the marginal polytope. Empirically, we demonstrate the increased quality of results found by tightening the relaxation over the marginal polytope as well as the spanning tree polytope on synthetic and real-world instances.\n
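The conditional gradient template underlying the algorithm is short enough to show in full. Below is a generic Frank-Wolfe loop in which the linear minimization oracle plays the role the MAP solver plays in the paper; the quadratic toy objective and the simplex oracle are stand-ins of mine, not the TRW objective or the marginal polytope.

import numpy as np

def frank_wolfe(grad, lmo, x0, iters=200):
    # Conditional gradient: repeatedly call the linear minimization
    # oracle on the current gradient (the analogue of a MAP call),
    # then take a convex step toward the returned vertex.
    x = x0
    for t in range(iters):
        s = lmo(grad(x))          # vertex minimizing <grad, s>
        gamma = 2.0 / (t + 2.0)   # standard open-loop step size
        x = (1 - gamma) * x + gamma * s
    return x

# Toy instance: minimize ||x - b||^2 over the probability simplex,
# whose LMO just selects the coordinate vertex with smallest gradient.
b = np.array([0.2, 0.5, 0.3])
grad = lambda x: 2.0 * (x - b)
def simplex_lmo(g):
    s = np.zeros_like(g)
    s[np.argmin(g)] = 1.0
    return s

print(frank_wolfe(grad, simplex_lmo, np.ones(3) / 3))  # approaches b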
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Fast Variational Approach for Learning Markov Random Field Language Models.\n \n \n \n \n\n\n \n Jernite, Y.; Rush, A.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 32nd International Conference on Machine Learning (ICML), volume 37, pages 2209–2217, 2015. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JerniteEtAl_icml15,\n  author    = {Yacine Jernite and Alexander Rush and David Sontag},\n  title     = {A Fast Variational Approach for Learning Markov Random Field Language Models},\n  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},\n  year = {2015},\n publisher = {JMLR: W\&CP},\n volume = {37},\n pages  = {2209--2217},\n keywords = {Machine learning, Natural language processing},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerRusSon_icml15.pdf},\n abstract = {Language modelling is a fundamental building block of natural language processing. However, in practice the size of the vocabulary limits the distributions applicable for this task: specifically, one has to either resort to local optimization methods, such as those used in neural language models, or work with heavily constrained distributions. In this work, we take a step towards overcoming these difficulties. We present a method for global-likelihood optimization of a Markov random field language model exploiting long-range contexts in time independent of the corpus size. We take a variational approach to optimizing the likelihood and exploit underlying symmetries to greatly simplify learning. We demonstrate the efficiency of this method both for language modelling and for part-of-speech tagging.}\n}\n\n
\n
\n\n\n
\n Language modelling is a fundamental building block of natural language processing. However, in practice the size of the vocabulary limits the distributions applicable for this task: specifically, one has to either resort to local optimization methods, such as those used in neural language models, or work with heavily constrained distributions. In this work, we take a step towards overcoming these difficulties. We present a method for global-likelihood optimization of a Markov random field language model exploiting long-range contexts in time independent of the corpus size. We take a variational approach to optimizing the likelihood and exploit underlying symmetries to greatly simplify learning. We demonstrate the efficiency of this method both for language modelling and for part-of-speech tagging.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n How Hard is Inference for Structured Prediction?.\n \n \n \n \n\n\n \n Globerson, A.; Roughgarden, T.; Sontag, D.; and Yildirim, C.\n\n\n \n\n\n\n In Proceedings of the 32nd International Conference on Machine Learning (ICML), volume 37, pages 2181–2190, 2015. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"How paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{GlobersonEtAl_icml15,\n  author    = {Amir Globerson and Tim Roughgarden and David Sontag and Cafer Yildirim},\n  title     = {How Hard is Inference for Structured Prediction?},\n  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},\n  year = {2015},\n publisher = {JMLR: W\&CP},\n volume = {37},\n pages  = {2181--2190},\n keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/GloRouSonYil_icml15.pdf},\n abstract = {Structured prediction tasks in machine learning involve the simultaneous prediction of multiple labels. This is often done by maximizing a score function on the space of labels, which decomposes as a sum of pairwise elements, each depending on two specific labels. The goal of this paper is to develop a theoretical explanation of the empirical effectiveness of heuristic inference algorithms for solving such structured prediction problems. We study the minimum-achievable expected Hamming error in such problems, highlighting the case of 2D grid graphs, which are common in machine vision applications. Our main theorems provide tight upper and lower bounds on this error, as well as a polynomial-time algorithm that achieves the bound.}\n}\n\n
\n
\n\n\n
\n Structured prediction tasks in machine learning involve the simultaneous prediction of multiple labels. This is often done by maximizing a score function on the space of labels, which decomposes as a sum of pairwise elements, each depending on two specific labels. The goal of this paper is to develop a theoretical explanation of the empirical effectiveness of heuristic inference algorithms for solving such structured prediction problems. We study the minimum-achievable expected Hamming error in such problems, highlighting the case of 2D grid graphs, which are common in machine vision applications. Our main theorems provide tight upper and lower bounds on this error, as well as a polynomial-time algorithm that achieves the bound.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Deep Kalman Filters.\n \n \n \n \n\n\n \n Krishnan, R. G.; Shalit, U.; and Sontag, D.\n\n\n \n\n\n\n In arXiv:1511.05121, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Deep paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KriShaSon_arxiv15,\n author = {Rahul G. Krishnan and Uri Shalit and David Sontag},\n title = {Deep Kalman Filters},\n booktitle = {arXiv:1511.05121},\n year = {2015},\n keywords = {Machine learning, Unsupervised learning, Health care, Deep learning},\n url_Paper = {http://arxiv.org/pdf/1511.05121.pdf},\n abstract = {Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the "Healing MNIST" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.}\n}\n\n
\n
\n\n\n
\n Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the \"Healing MNIST\" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.\n
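The generative backbone described here, a state-space model whose transition and emission means come from neural networks, can be sketched in a few lines. The tiny random networks below are placeholders of mine; the paper's contribution is the variational learning of such models, which this sketch does not attempt.

import numpy as np

rng = np.random.default_rng(0)
dz, dx = 2, 3  # latent and observed dimensions (illustrative sizes)

# Random weights stand in for learned transition/emission networks.
W_tr = rng.normal(size=(dz, dz))
W_em = rng.normal(size=(dx, dz))

def transition_mean(z):
    return np.tanh(W_tr @ z)   # mean of p(z_t | z_{t-1})

def emission_mean(z):
    return W_em @ z            # mean of p(x_t | z_t)

z = rng.normal(size=dz)
for t in range(5):
    z = transition_mean(z) + 0.1 * rng.normal(size=dz)  # latent step
    x = emission_mean(z) + 0.1 * rng.normal(size=dx)    # observation
    print(t, np.round(x, 3))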
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Anchored Discrete Factor Analysis.\n \n \n \n \n\n\n \n Halpern, Y.; Horng, S.; and Sontag, D.\n\n\n \n\n\n\n In arXiv:1511.03299, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Anchored paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HalpernEtAl_arxiv15,\n author = {Yoni Halpern and Steven Horng and David Sontag},\n title = {Anchored Discrete Factor Analysis},\n booktitle = {arXiv:1511.03299},\n year = {2015},\n keywords = {Machine learning, Unsupervised learning, Health care},\n url_Paper = {http://arxiv.org/pdf/1511.03299.pdf},\n abstract = {We present a semi-supervised learning algorithm for learning discrete factor analysis models with arbitrary structure on the latent variables. Our algorithm assumes that every latent variable has an "anchor", an observed variable with only that latent variable as its parent. Given such anchors, we show that it is possible to consistently recover moments of the latent variables and use these moments to learn complete models. We also introduce a new technique for improving the robustness of method-of-moment algorithms by optimizing over the marginal polytope or its relaxations. We evaluate our algorithm using two real-world tasks, tag prediction on questions from the Stack Overflow website and medical diagnosis in an emergency department.}\n}\n\n
\n
\n\n\n
\n We present a semi-supervised learning algorithm for learning discrete factor analysis models with arbitrary structure on the latent variables. Our algorithm assumes that every latent variable has an \"anchor\", an observed variable with only that latent variable as its parent. Given such anchors, we show that it is possible to consistently recover moments of the latent variables and use these moments to learn complete models. We also introduce a new technique for improving the robustness of method-of-moment algorithms by optimizing over the marginal polytope or its relaxations. We evaluate our algorithm using two real-world tasks, tag prediction on questions from the Stack Overflow website and medical diagnosis in an emergency department.\n
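The moment-recovery idea behind anchors admits a one-line worked example. If A is an anchor for a binary latent Y (Y is A's only parent) and the noise rates are known, the latent prevalence falls out of the observed marginal of A; the numbers below are made up for illustration.

# P(A=1) = p*P(Y=1) + q*(1 - P(Y=1)), with p = P(A=1|Y=1) and
# q = P(A=1|Y=0), so P(Y=1) = (P(A=1) - q) / (p - q).
p, q = 0.9, 0.05   # assumed anchor reliabilities
p_a = 0.30         # observed marginal of the anchor
p_y = (p_a - q) / (p - q)
print(p_y)         # ~0.294: recovered prevalence of the latent variable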
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Temporal Convolutional Neural Networks for Diagnosis from Lab Tests.\n \n \n \n \n\n\n \n Razavian, N.; and Sontag, D.\n\n\n \n\n\n\n In arXiv:1511.07938, 2015. \n \n\n\n\n
\n\n\n\n \n \n \"Temporal paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{RazavianSontag_arxiv15,\n author = {Narges Razavian and David Sontag},\n title = {Temporal Convolutional Neural Networks for Diagnosis from Lab Tests},\n booktitle = {arXiv:1511.07938},\n year = {2015},\n keywords = {Health care, Machine learning, Deep learning},\n url_Paper = {http://arxiv.org/pdf/1511.07938.pdf},\n abstract = {Early diagnosis of treatable diseases is essential for improving healthcare, and many diseases’ onsets are predictable from annual lab tests and their temporal trends. We introduce a multi-resolution convolutional neural network for early detection of multiple diseases from irregularly measured sparse lab values. Our novel architecture takes as input both an imputed version of the data and a binary observation matrix. For imputing the temporal sparse observations, we develop a flexible, fast-to-train method for differentiable multivariate kernel regression. Our experiments on data from 298K individuals over 8 years, 18 common lab measurements, and 171 diseases show that the temporal signatures learned via convolution are significantly more predictive than baselines commonly used for early disease diagnosis.}\n}\n\n
\n
\n\n\n
\n Early diagnosis of treatable diseases is essential for improving healthcare, and many diseases’ onsets are predictable from annual lab tests and their temporal trends. We introduce a multi-resolution convolutional neural network for early detection of multiple diseases from irregularly measured sparse lab values. Our novel architecture takes as input both an imputed version of the data and a binary observation matrix. For imputing the temporal sparse observations, we develop a flexible, fast-to-train method for differentiable multivariate kernel regression. Our experiments on data from 298K individuals over 8 years, 18 common lab measurements, and 171 diseases show that the temporal signatures learned via convolution are significantly more predictive than baselines commonly used for early disease diagnosis.\n
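The input construction described here, an imputed series alongside a binary observation matrix, is easy to mock up. The sketch below uses forward-fill in place of the paper's differentiable kernel regression and a single random filter in place of the trained network; all sizes and values are illustrative.

import numpy as np

T = 12
values = np.full(T, np.nan)
values[[1, 4, 9]] = [5.0, 5.6, 6.2]            # sparse, irregular lab results
mask = (~np.isnan(values)).astype(float)        # binary observation matrix

# Forward-fill imputation as a crude stand-in for kernel regression.
filled = np.zeros(T)
last = 0.0
for t in range(T):
    if mask[t]:
        last = values[t]
    filled[t] = last

x = np.stack([filled, mask])                    # 2 input channels x T steps
w = np.random.default_rng(0).normal(size=(2, 3)) * 0.1  # one width-3 filter
features = np.array([np.sum(w * x[:, t:t + 3]) for t in range(T - 2)])
print(np.round(features, 3))                    # temporal conv activations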
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Using Anchors to Estimate Clinical State without Labeled Data.\n \n \n \n \n\n\n \n Halpern, Y.; Choi, Y.; Horng, S.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium, pages 606–615, 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Using paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HalpernEtAl_amia14,\n author = {Yoni Halpern and Youngduck Choi and Steven Horng and David Sontag},\n title = {Using Anchors to Estimate Clinical State without Labeled Data},\n booktitle = {Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium},\n pages = {606--615},\n year = {2014},\n keywords = {Health care, Machine learning, Unsupervised learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernEtAl_amia14.pdf},\n abstract = {We present a novel framework for learning to estimate and predict clinical state variables without labeled data. The resulting models can be used for electronic phenotyping, triggering clinical decision support, and cohort selection. The framework relies on key observations which we characterize and term "anchor variables". By specifying anchor variables, an expert encodes a certain amount of domain knowledge about the problem while the rest of learning proceeds in an unsupervised manner. The ability to build anchors upon standardized ontologies and the framework's ability to learn from unlabeled data promote generalizability across institutions. We additionally develop a user interface to enable experts to choose anchor variables in an informed manner. The framework is applied to electronic medical record-based phenotyping to enable real-time decision support in the emergency department. We validate the learned models using a prospectively gathered set of gold-standard responses from emergency physicians for nine clinically relevant variables.}\n}\n\n
\n
\n\n\n
\n We present a novel framework for learning to estimate and predict clinical state variables without labeled data. The resulting models can be used for electronic phenotyping, triggering clinical decision support, and cohort selection. The framework relies on key observations which we characterize and term \"anchor variables\". By specifying anchor variables, an expert encodes a certain amount of domain knowledge about the problem while the rest of learning proceeds in an unsupervised manner. The ability to build anchors upon standardized ontologies and the framework's ability to learn from unlabeled data promote generalizability across institutions. We additionally develop a user interface to enable experts to choose anchor variables in an informed manner. The framework is applied to electronic medical record-based phenotyping to enable real-time decision support in the emergency department. We validate the learned models using a prospectively gathered set of gold-standard responses from emergency physicians for nine clinically relevant variables.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Instance Segmentation of Indoor Scenes using a Coverage Loss.\n \n \n \n \n\n\n \n Silberman, N.; Sontag, D.; and Fergus, R.\n\n\n \n\n\n\n In Fleet, D. J.; Pajdla, T.; Schiele, B.; and Tuytelaars, T., editor(s), Proceedings of the 13th European Conference on Computer Vision (ECCV), volume 8689, of Lecture Notes in Computer Science, pages 616–631, 2014. Springer\n \n\n\n\n
\n\n\n\n \n \n \"Instance paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SilSonFer_ECCV14,\n  author    = {Nathan Silberman and David Sontag and Rob Fergus},\n  title     = {Instance Segmentation of Indoor Scenes using a Coverage Loss},\n  booktitle = {Proceedings of the 13th European Conference on Computer Vision (ECCV)},\n  series    = {Lecture Notes in Computer Science},\n  volume    = {8689},\n  publisher = {Springer},\n  editor    = {David J. Fleet and\n               Tom{\\'{a}}s Pajdla and\n               Bernt Schiele and\n               Tinne Tuytelaars},\n  pages     = {616--631},\n  year      = {2014},\n  keywords = {Computer vision, Machine learning},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/SilSonFer_ECCV14.pdf},\n  abstract = {A major limitation of existing models for semantic segmentation is the inability to identify individual instances of the same class: when labeling pixels with only semantic classes, a set of pixels with the same label could represent a single object or ten. In this work, we introduce a model to perform both semantic and instance segmentation simultaneously. We introduce a new higher-order loss function that directly minimizes the coverage metric and evaluate a variety of region features, including those from a convolutional network. We apply our model to the NYU Depth V2 dataset, obtaining state of the art results.}\n}\n\n
\n
\n\n\n
\n A major limitation of existing models for semantic segmentation is the inability to identify individual instances of the same class: when labeling pixels with only semantic classes, a set of pixels with the same label could represent a single object or ten. In this work, we introduce a model to perform both semantic and instance segmentation simultaneously. We introduce a new higher-order loss function that directly minimizes the coverage metric and evaluate a variety of region features, including those from a convolutional network. We apply our model to the NYU Depth V2 dataset, obtaining state of the art results.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Lifted Tree-Reweighted Variational Inference.\n \n \n \n \n\n\n \n Bui, H. H.; Huynh, T. N.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI-14), 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Lifted paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BuiHuySon_uai14,\n author = {Hung Hai Bui and Tuyen N. Huynh and David Sontag},\n title  = {Lifted Tree-Reweighted Variational Inference},\n booktitle = {Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence ({UAI}-14)},\n year  = {2014},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/BuiHuySon_uai14.pdf},\n abstract = {We analyze variational inference for highly symmetric graphical models such as those arising from first-order probabilistic models. We first show that for these graphical models, the tree-reweighted variational objective lends itself to a compact lifted formulation which can be solved much more efficiently than the standard TRW formulation for the ground graphical model. Compared to earlier work on lifted belief propagation, our formulation leads to a convex optimization problem for lifted marginal inference and provides an upper bound on the partition function. We provide two approaches for improving the lifted TRW upper bound. The first is a method for efficiently computing maximum spanning trees in highly symmetric graphs, which can be used to optimize the TRW edge appearance probabilities. The second is a method for tightening the relaxation of the marginal polytope using lifted cycle inequalities and novel exchangeable cluster consistency constraints.}\n}\n\n
\n
\n\n\n
\n We analyze variational inference for highly symmetric graphical models such as those arising from first-order probabilistic models. We first show that for these graphical models, the tree-reweighted variational objective lends itself to a compact lifted formulation which can be solved much more efficiently than the standard TRW formulation for the ground graphical model. Compared to earlier work on lifted belief propagation, our formulation leads to a convex optimization problem for lifted marginal inference and provides an upper bound on the partition function. We provide two approaches for improving the lifted TRW upper bound. The first is a method for efficiently computing maximum spanning trees in highly symmetric graphs, which can be used to optimize the TRW edge appearance probabilities. The second is a method for tightening the relaxation of the marginal polytope using lifted cycle inequalities and novel exchangeable cluster consistency constraints.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Understanding the Bethe Approximation: When and How can it go Wrong?.\n \n \n \n \n\n\n \n Weller, A.; Tang, K.; Sontag, D.; and Jebara, T.\n\n\n \n\n\n\n In Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence (UAI-14), 2014. \n \n\n\n\n
\n\n\n\n \n \n \"Understanding paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{WellerEtAl_uai14,\n author = {Adrian Weller and Kui Tang and David Sontag and Tony Jebara},\n title  = {Understanding the {B}ethe Approximation: When and How can it go Wrong?},\n booktitle = {Proceedings of the Thirtieth Conference on Uncertainty in Artificial Intelligence ({UAI}-14)},\n year  = {2014},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/WellerEtAl_uai14.pdf},\n abstract = {Belief propagation is a remarkably effective tool for inference, even when applied to networks with cycles. It may be viewed as a way to seek the minimum of the Bethe free energy, though with no convergence guarantee in general. A variational perspective shows that, compared to exact inference, this minimization employs two forms of approximation: (i) the true entropy is approximated by the Bethe entropy, and (ii) the minimization is performed over a relaxation of the marginal polytope termed the local polytope. Here we explore when and how the Bethe approximation can fail for binary pairwise models by examining each aspect of the approximation, deriving results both analytically and with new experimental methods.}\n}\n\n
\n
\n\n\n
\n Belief propagation is a remarkably effective tool for inference, even when applied to networks with cycles. It may be viewed as a way to seek the minimum of the Bethe free energy, though with no convergence guarantee in general. A variational perspective shows that, compared to exact inference, this minimization employs two forms of approximation: (i) the true entropy is approximated by the Bethe entropy, and (ii) the minimization is performed over a relaxation of the marginal polytope termed the local polytope. Here we explore when and how the Bethe approximation can fail for binary pairwise models by examining each aspect of the approximation, deriving results both analytically and with new experimental methods.\n
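For reference, the Bethe entropy in approximation (i) has the standard form below (stated from the general belief-propagation literature, not quoted from this paper); approximation (ii) then replaces the marginal polytope by the local polytope over which the resulting objective is optimized.

\[
H_{\mathrm{Bethe}}(\tau) \;=\; -\sum_{(i,j)\in E}\,\sum_{x_i,x_j} \tau_{ij}(x_i,x_j)\log\tau_{ij}(x_i,x_j) \;+\; \sum_{i\in V} (d_i-1)\sum_{x_i}\tau_i(x_i)\log\tau_i(x_i),
\]

where \(d_i\) is the degree of node \(i\); fixed points of belief propagation correspond to stationary points of \(\theta\cdot\tau + H_{\mathrm{Bethe}}(\tau)\) over the local polytope.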
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests.\n \n \n \n \n\n\n \n Jernite, Y.; Halpern, Y.; and Sontag, D.\n\n\n \n\n\n\n In Advances in Neural Information Processing Systems 26, pages 2355–2363. MIT Press, 2013.\n \n\n\n\n
\n\n\n\n \n \n paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@incollection{JerHalSon_nips13,\n author = {Yacine Jernite and Yoni Halpern and David Sontag},\n title = {Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests},\n booktitle = {Advances in Neural Information Processing Systems 26},\n pages = {2355--2363},\n publisher = {MIT Press},\n year = {2013},\n keywords = {Machine learning, Unsupervised learning, Health care},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerHalSon_nips13.pdf},\n abstract = {We give a polynomial-time algorithm for provably learning the structure and parameters of bipartite noisy-or Bayesian networks of binary variables where the top layer is completely hidden. Unsupervised learning of these models is a form of discrete factor analysis, enabling the discovery of hidden variables and their causal relationships with observed data. We obtain an efficient learning algorithm for a family of Bayesian networks that we call quartet-learnable. For each latent variable, the existence of a singly-coupled quartet allows us to uniquely identify and learn all parameters involving that latent variable. We give a proof of the polynomial sample complexity of our learning algorithm, and experimentally compare it to variational EM.}\n}\n\n
\n
\n\n\n
\n We give a polynomial-time algorithm for provably learning the structure and parameters of bipartite noisy-or Bayesian networks of binary variables where the top layer is completely hidden. Unsupervised learning of these models is a form of discrete factor analysis, enabling the discovery of hidden variables and their causal relationships with observed data. We obtain an efficient learning algorithm for a family of Bayesian networks that we call quartet-learnable. For each latent variable, the existence of a singly-coupled quartet allows us to uniquely identify and learn all parameters involving that latent variable. We give a proof of the polynomial sample complexity of our learning algorithm, and experimentally compare it to variational EM.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n SparsityBoost: A New Scoring Function for Learning Bayesian Network Structure.\n \n \n \n \n\n\n \n Brenner, E.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence (UAI-13), pages 112–121, Corvallis, Oregon, 2013. AUAI Press\n \n\n\n\n
\n\n\n\n \n \n \"SparsityBoost: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{BrennerSontag_uai13,\n author = {Eliot Brenner and David Sontag},\n title = {SparsityBoost: A New Scoring Function for Learning Bayesian Network Structure},\n booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence ({UAI}-13)},\n publisher = {AUAI Press},\n address = {Corvallis, Oregon},\n pages = {112--121},\n year = {2013},\n keywords = {Machine learning, Bayesian network structure learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/BrennerSontag_uai13.pdf},\n abstract = {We give a new consistent scoring function for structure learning of Bayesian networks. In contrast to traditional approaches to score-based structure learning, such as BDeu or MDL, the complexity penalty that we propose is data-dependent and is given by the probability that a conditional independence test correctly shows that an edge cannot exist. What really distinguishes this new scoring function from earlier work is that it has the property of becoming computationally easier to maximize as the amount of data increases. We prove a polynomial sample complexity result, showing that maximizing this score is guaranteed to correctly learn a structure with no false edges and a distribution close to the generating distribution, whenever there exists a Bayesian network which is a perfect map for the data generating distribution. Although the new score can be used with any search algorithm, we give empirical results showing that it is particularly effective when used together with a linear programming relaxation approach to Bayesian network structure learning.}\n}\n\n
\n
\n\n\n
\n We give a new consistent scoring function for structure learning of Bayesian networks. In contrast to traditional approaches to score-based structure learning, such as BDeu or MDL, the complexity penalty that we propose is data-dependent and is given by the probability that a conditional independence test correctly shows that an edge cannot exist. What really distinguishes this new scoring function from earlier work is that it has the property of becoming computationally easier to maximize as the amount of data increases. We prove a polynomial sample complexity result, showing that maximizing this score is guaranteed to correctly learn a structure with no false edges and a distribution close to the generating distribution, whenever there exists a Bayesian network which is a perfect map for the data generating distribution. Although the new score can be used with any search algorithm, we give empirical results showing that it is particularly effective when used together with a linear programming relaxation approach to Bayesian network structure learning.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Unsupervised Learning of Noisy-Or Bayesian Networks.\n \n \n \n \n\n\n \n Halpern, Y.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence (UAI-13), pages 272–281, Corvallis, Oregon, 2013. AUAI Press\n \n\n\n\n
\n\n\n\n \n \n \"Unsupervised paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{HalpernSontag_uai13,\n author = {Yoni Halpern and David Sontag},\n title = {Unsupervised Learning of Noisy-Or Bayesian Networks},\n booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence ({UAI}-13)},\n publisher = {AUAI Press},\n address = {Corvallis, Oregon},\n pages = {272--281},\n year = {2013},\n keywords = {Machine learning, Unsupervised learning, Health care},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernSontag_uai13.pdf},\n abstract = {This paper considers the problem of learning the parameters in Bayesian networks of discrete variables with known structure and hidden variables. Previous approaches in these settings typically use expectation maximization; when the network has high treewidth, the required expectations might be approximated using Monte Carlo or variational methods. We show how to avoid inference altogether during learning by giving a polynomial-time algorithm based on the method-of-moments, building upon recent work on learning discrete-valued mixture models. In particular, we show how to learn the parameters for a family of bipartite noisy-or Bayesian networks. In our experimental results, we demonstrate an application of our algorithm to learning QMR-DT, a large Bayesian network used for medical diagnosis. We show that it is possible to fully learn the parameters of QMR-DT even when only the findings are observed in the training data (ground truth diseases unknown).}\n}\n\n
\n
\n\n\n
\n This paper considers the problem of learning the parameters in Bayesian networks of discrete variables with known structure and hidden variables. Previous approaches in these settings typically use expectation maximization; when the network has high treewidth, the required expectations might be approximated using Monte Carlo or variational methods. We show how to avoid inference altogether during learning by giving a polynomial-time algorithm based on the method-of-moments, building upon recent work on learning discrete-valued mixture models. In particular, we show how to learn the parameters for a family of bipartite noisy-or Bayesian networks. In our experimental results, we demonstrate an application of our algorithm to learning QMR-DT, a large Bayesian network used for medical diagnosis. We show that it is possible to fully learn the parameters of QMR-DT even when only the findings are observed in the training data (ground truth diseases unknown).\n
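The noisy-or parameterization referenced above is definitional and compact enough to state in code; the leak and failure probabilities below are made-up numbers, not QMR-DT parameters.

import numpy as np

# Noisy-or CPD: a finding fires unless the leak and every active
# parent disease all fail to trigger it.
leak = 0.01
fail = np.array([0.2, 0.6, 0.9])   # 1 - P(trigger) for each parent disease
d = np.array([1, 0, 1])            # which diseases are present
p_finding = 1 - (1 - leak) * np.prod(fail ** d)
print(p_finding)                   # 1 - 0.99 * 0.2 * 0.9 = 0.8218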
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Practical Algorithm for Topic Modeling with Provable Guarantees.\n \n \n \n \n\n\n \n Arora, S.; Ge, R.; Halpern, Y.; Mimno, D. M.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M.\n\n\n \n\n\n\n In Proceedings of the International Conference on Machine Learning (ICML), volume 28 (2), pages 280–288, 2013. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{AroraEtAl_icml13,\n  author    = {Sanjeev Arora and Rong Ge and Yoni Halpern and David M. Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},\n  title     = {A Practical Algorithm for Topic Modeling with Provable Guarantees},\n  booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},\n  year = {2013},\n publisher = {JMLR: W\\&CP},\n volume = {28 (2)},\n pages  = {280--288},\n keywords = {Machine learning, Unsupervised learning, Topic models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/AroraEtAl_icml13.pdf},\n abstract = {Topic models provide a useful method for dimensionality reduction and exploratory data analysis in large text corpora. Most approaches to topic model learning have been based on a maximum likelihood objective. Efficient algorithms exist that attempt to approximate this objective, but they have no provable guarantees. Recently, algorithms have been introduced that provide provable bounds, but these algorithms are not practical because they are inefficient and not robust to violations of model assumptions. In this paper we present an algorithm for learning topic models that is both provable and practical. The algorithm produces results comparable to the best MCMC implementations while running orders of magnitude faster.}\n}\n\n
\n
\n\n\n
\n Topic models provide a useful method for dimensionality reduction and exploratory data analysis in large text corpora. Most approaches to topic model learning have been based on a maximum likelihood objective. Efficient algorithms exist that attempt to approximate this objective, but they have no provable guarantees. Recently, algorithms have been introduced that provide provable bounds, but these algorithms are not practical because they are inefficient and not robust to violations of model assumptions. In this paper we present an algorithm for learning topic models that is both provable and practical. The algorithm produces results comparable to the best MCMC implementations while running orders of magnitude faster.\n
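The separability assumption behind this line of work ("anchor" words that occur only under one topic) makes every word's co-occurrence profile a convex combination of the anchors' profiles. The sketch below recovers those mixing weights on synthetic data with nonnegative least squares plus renormalization, a crude stand-in for the paper's recovery step; all numbers are illustrative.

import numpy as np
from scipy.optimize import nnls

# Row-normalized co-occurrence profiles of the anchor words define the
# extreme points; any other word's profile lies in their convex hull.
anchor_profiles = np.array([[0.7, 0.2, 0.1],
                            [0.1, 0.2, 0.7]])
true_c = np.array([0.3, 0.7])                 # mixing weights for one word
word_profile = true_c @ anchor_profiles

# Recover the weights, then renormalize onto the simplex.
c, _ = nnls(anchor_profiles.T, word_profile)
c /= c.sum()
print(np.round(c, 3))  # ~[0.3, 0.7]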
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Efficiently Searching for Frustrated Cycles in MAP Inference.\n \n \n \n \n\n\n \n Sontag, D.; Choe, D. K.; and Li, Y.\n\n\n \n\n\n\n In Proceedings of the Twenty-Eighth Conference on Uncertainty in Artificial Intelligence (UAI-12), pages 795–804, Corvallis, Oregon, 2012. AUAI Press\n \n\n\n\n
\n\n\n\n \n \n \"Efficiently paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SontagChoeLi_uai12,\n author = {David Sontag and Do Kook Choe and Yitao Li},\n title = {Efficiently Searching for Frustrated Cycles in {MAP} Inference},\n booktitle = {Proceedings of the Twenty-Eighth Conference on Uncertainty in Artificial Intelligence ({UAI}-12)},\n publisher = {AUAI Press},\n address = {Corvallis, Oregon},\n pages = {795--804},\n year = {2012},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_uai12.pdf},\n abstract = {Dual decomposition provides a tractable framework for designing algorithms for finding the most probable (MAP) configuration in graphical models. However, for many real-world inference problems, the typical decomposition has a large integrality gap, due to frustrated cycles. One way to tighten the relaxation is to introduce additional constraints that explicitly enforce cycle consistency. Earlier work showed that cluster-pursuit algorithms, which iteratively introduce cycle and other higher-order consistency constraints, allow one to exactly solve many hard inference problems. However, these algorithms explicitly enumerate a candidate set of clusters, limiting them to triplets or other short cycles. We solve the search problem for cycle constraints, giving a nearly linear time algorithm for finding the most frustrated cycle of arbitrary length. We show how to use this search algorithm together with the dual decomposition framework and cluster-pursuit. The new algorithm exactly solves MAP inference problems arising from relational classification and stereo vision.}\n}\n\n
\n
\n\n\n
\n Dual decomposition provides a tractable framework for designing algorithms for finding the most probable (MAP) configuration in graphical models. However, for many real-world inference problems, the typical decomposition has a large integrality gap, due to frustrated cycles. One way to tighten the relaxation is to introduce additional constraints that explicitly enforce cycle consistency. Earlier work showed that cluster-pursuit algorithms, which iteratively introduce cycle and other higher-order consistency constraints, allow one to exactly solve many hard inference problems. However, these algorithms explicitly enumerate a candidate set of clusters, limiting them to triplets or other short cycles. We solve the search problem for cycle constraints, giving a nearly linear time algorithm for finding the most frustrated cycle of arbitrary length. We show how to use this search algorithm together with the dual decomposition framework and cluster-pursuit. The new algorithm exactly solves MAP inference problems arising from relational classification and stereo vision.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Introduction to Dual Decomposition for Inference.\n \n \n \n \n\n\n \n Sontag, D.; Globerson, A.; and Jaakkola, T.\n\n\n \n\n\n\n In Sra, S.; Nowozin, S.; and Wright, S. J., editor(s), Optimization for Machine Learning, pages 219–254. MIT Press, 2012.\n \n\n\n\n
\n\n\n\n \n \n \"Introduction paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@incollection{SonGloJaa_optbook,\n author = {David Sontag and Amir Globerson and Tommi Jaakkola},\n title = {Introduction to Dual Decomposition for Inference},\n booktitle = {Optimization for Machine Learning},\n editor = {Suvrit Sra and Sebastian Nowozin and Stephen J. Wright},\n pages = {219--254},\n publisher = {MIT Press},\n year = {2012},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/SonGloJaa_optbook.pdf},\n abstract = {Many inference problems with discrete variables result in a difficult combinatorial optimization problem. In recent years, the technique of dual decomposition, also called Lagrangian relaxation, has proven to be a powerful means of solving these inference problems by decomposing them into simpler components that are repeatedly solved independently and combined into a global solution. In this chapter, we introduce the general technique of dual decomposition through its application to the problem of finding the most likely (MAP) assignment in Markov random fields. We discuss both subgradient and block coordinate descent approaches to solving the dual problem. The resulting message-passing algorithms are similar to max-product, but can be shown to solve a linear programming relaxation of the MAP problem. We show how many of the MAP algorithms are related to each other, and also quantify when the MAP solution can and cannot be decoded directly from the dual solution.}\n}\n\n
\n
\n\n\n
\n Many inference problems with discrete variables result in a difficult combinatorial optimization problem. In recent years, the technique of dual decomposition, also called Lagrangian relaxation, has proven to be a powerful means of solving these inference problems by decomposing them into simpler components that are repeatedly solved independently and combined into a global solution. In this chapter, we introduce the general technique of dual decomposition through its application to the problem of finding the most likely (MAP) assignment in Markov random fields. We discuss both subgradient and block coordinate descent approaches to solving the dual problem. The resulting message-passing algorithms are similar to max-product, but can be shown to solve a linear programming relaxation of the MAP problem. We show how many of the MAP algorithms are related to each other, and also quantify when the MAP solution can and cannot be decoded directly from the dual solution.\n
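The subgradient form of dual decomposition introduced in this chapter fits in a dozen lines. In the sketch below the two subproblems are deliberately trivial, fully factorized over five binary variables, so each "MAP call" is an independent thresholding; the potentials are made up for illustration.

import numpy as np

# Two subproblems score the same 5 binary variables. Dual decomposition
# maximizes each independently and uses multipliers lam to force the two
# argmaxes to agree; at agreement the decoded assignment is an exact MAP.
theta1 = np.array([1.0, -2.0, 0.5, -0.5, 2.0])
theta2 = np.array([0.5, 1.0, -2.0, 1.5, -1.0])

def map_solve(theta):
    # Exact MAP of a fully factorized subproblem: threshold each variable.
    return (theta > 0).astype(float)

lam = np.zeros(5)
for t in range(100):
    x1 = map_solve(theta1 + lam)
    x2 = map_solve(theta2 - lam)
    if np.array_equal(x1, x2):
        break                        # agreement: certificate of optimality
    lam -= (x1 - x2) / (t + 1.0)     # subgradient step on the dual

print(x1)                                   # [1. 0. 0. 1. 1.]
print((theta1 + theta2 > 0).astype(float))  # exact MAP, identical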
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Complexity of Inference in Latent Dirichlet Allocation.\n \n \n \n \n\n\n \n Sontag, D.; and Roy, D.\n\n\n \n\n\n\n In Shawe-Taylor, J.; Zemel, R.; Bartlett, P.; Pereira, F.; and Weinberger, K., editor(s), Advances in Neural Information Processing Systems 24, pages 1008–1016. MIT Press, 2011.\n \n\n\n\n
\n\n\n\n \n \n \"Complexity paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@incollection{SontagRoy_nips11,\n author = {David Sontag and Dan Roy},\n title = {Complexity of Inference in Latent Dirichlet Allocation},\n booktitle = {Advances in Neural Information Processing Systems 24},\n editor = {J. Shawe-Taylor and R.S. Zemel and P. Bartlett and F.C.N. Pereira and K.Q. Weinberger},\n pages = {1008--1016},\n publisher = {MIT Press},\n year = {2011},\n keywords = {Machine learning, Approximate inference in graphical models, Topic models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/SontagRoy_nips11.pdf},\n abstract = {We consider the computational complexity of probabilistic inference in Latent Dirichlet Allocation (LDA). First, we study the problem of finding the maximum a posteriori (MAP) assignment of topics to words, where the document’s topic distribution is integrated out. We show that, when the effective number of topics per document is small, exact inference takes polynomial time. In contrast, we show that, when a document has a large number of topics, finding the MAP assignment of topics to words in LDA is NP-hard. Next, we consider the problem of finding the MAP topic distribution for a document, where the topic-word assignments are integrated out. We show that this problem is also NP-hard. Finally, we briefly discuss the problem of sampling from the posterior, showing that this is NP-hard in one restricted setting, but leaving open the general question.}\n}\n\n
\n
\n\n\n
\n We consider the computational complexity of probabilistic inference in Latent Dirichlet Allocation (LDA). First, we study the problem of finding the maximum a posteriori (MAP) assignment of topics to words, where the document’s topic distribution is integrated out. We show that, when the effective number of topics per document is small, exact inference takes polynomial time. In contrast, we show that, when a document has a large number of topics, finding the MAP assignment of topics to words in LDA is NP-hard. Next, we consider the problem of finding the MAP topic distribution for a document, where the topic-word assignments are integrated out. We show that this problem is also NP-hard. Finally, we briefly discuss the problem of sampling from the posterior, showing that this is NP-hard in one restricted setting, but leaving open the general question.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n More data means less inference: A pseudo-max approach to structured learning.\n \n \n \n \n\n\n \n Sontag, D.; Meshi, O.; Jaakkola, T.; and Globerson, A.\n\n\n \n\n\n\n In Lafferty, J.; Williams, C.; Shawe-Taylor, J.; Zemel, R.; and Culotta, A., editor(s), Advances in Neural Information Processing Systems 23, pages 2181–2189. MIT Press, 2010.\n \n\n\n\n
\n\n\n\n \n \n \"More paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@incollection{SonMesJaaGlo_nips10,\n author = {David Sontag and Ofer Meshi and Tommi Jaakkola and Amir Globerson},\n title = {More data means less inference: A pseudo-max approach to structured learning},\n booktitle = {Advances in Neural Information Processing Systems 23},\n editor = {J. Lafferty and C.K.I. Williams and J. Shawe-Taylor and R.S. Zemel and A. Culotta},\n pages = {2181--2189},\n publisher = {MIT Press},\n year = {2010},\n keywords = {Machine learning, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/SonMesJaaGlo_nips10.pdf},\n abstract = {The problem of learning to predict structured labels is of key importance in many applications. However, for general graph structure both learning and inference are intractable. Here we show that it is possible to circumvent this difficulty when the distribution of training examples is rich enough, via a method similar in spirit to pseudo-likelihood. We show that our new method achieves consistency, and illustrate empirically that it indeed approaches the performance of exact methods when sufficiently large training sets are used.}\n}\n\n
\n
\n\n\n
\n The problem of learning to predict structured labels is of key importance in many applications. However, for general graph structure both learning and inference are intractable. Here we show that it is possible to circumvent this difficulty when the distribution of training examples is rich enough, via a method similar in spirit to pseudo-likelihood. We show that our new method achieves consistency, and illustrate empirically that it indeed approaches the performance of exact methods when sufficiently large training sets are used.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing.\n \n \n \n \n\n\n \n Rush, A. M.; Sontag, D.; Collins, M.; and Jaakkola, T.\n\n\n \n\n\n\n In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1–11, 2010. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{RusSonColJaa_emnlp10,\n author = {Alexander M. Rush and David Sontag and Michael Collins and Tommi Jaakkola},\n title = {On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing},\n booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},\n pages = {1--11},\n year = {2010},\n keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/RusSonColJaa_emnlp10.pdf},\n abstract = {This paper introduces dual decomposition as a framework for deriving inference algorithms for NLP problems. The approach relies on standard dynamic-programming algorithms as oracle solvers for sub-problems, together with a simple method for forcing agreement between the different oracles. The approach provably solves a linear programming (LP) relaxation of the global inference problem. It leads to algorithms that are simple, in that they use existing decoding algorithms; efficient, in that they avoid exact algorithms for the full model; and often exact, in that empirically they often recover the correct solution in spite of using an LP relaxation. We give experimental results on two problems: 1) the combination of two lexicalized parsing models; and 2) the combination of a lexicalized parsing model and a trigram part-of-speech tagger.}\n}\n\n
\n
\n\n\n
\n This paper introduces dual decomposition as a framework for deriving inference algorithms for NLP problems. The approach relies on standard dynamic-programming algorithms as oracle solvers for sub-problems, together with a simple method for forcing agreement between the different oracles. The approach provably solves a linear programming (LP) relaxation of the global inference problem. It leads to algorithms that are simple, in that they use existing decoding algorithms; efficient, in that they avoid exact algorithms for the full model; and often exact, in that empirically they often recover the correct solution in spite of using an LP relaxation. We give experimental results on two problems: 1) the combination of two lexicalized parsing models; and 2) the combination of a lexicalized parsing model and a trigram part-of-speech tagger.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Dual Decomposition for Parsing with Non-Projective Head Automata.\n \n \n \n \n\n\n \n Koo, T.; Rush, A. M.; Collins, M.; Jaakkola, T.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1288–1298, 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Dual paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KooEtAl_emnlp10,\n author = {Terry Koo and Alexander M. Rush and Michael Collins and Tommi Jaakkola and David Sontag},\n title = {Dual Decomposition for Parsing with Non-Projective Head Automata},\n booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},\n pages = {1288--1298},\n year = {2010},\n keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/KooEtAl_emnlp10.pdf},\n abstract = {This paper introduces algorithms for non-projective parsing based on dual decomposition. We focus on parsing algorithms for non-projective head automata, a generalization of head-automata models to non-projective structures. The dual decomposition algorithms are simple and efficient, relying on standard dynamic programming and minimum spanning tree algorithms. They provably solve an LP relaxation of the non-projective parsing problem. Empirically the LP relaxation is very often tight: for many languages, exact solutions are achieved on over 98\\% of test sentences. The accuracy of our models is higher than previous work on a broad range of datasets.}\n}\n\n
\n
\n\n\n
\n This paper introduces algorithms for non-projective parsing based on dual decomposition. We focus on parsing algorithms for non-projective head automata, a generalization of head-automata models to non-projective structures. The dual decomposition algorithms are simple and efficient, relying on standard dynamic programming and minimum spanning tree algorithms. They provably solve an LP relaxation of the non-projective parsing problem. Empirically the LP relaxation is very often tight: for many languages, exact solutions are achieved on over 98% of test sentences. The accuracy of our models is higher than previous work on a broad range of datasets.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Approximate Inference in Graphical Models using LP Relaxations.\n \n \n \n \n\n\n \n Sontag, D.\n\n\n \n\n\n\n Ph.D. Thesis, Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science, 2010.\n \n\n\n\n
\n\n\n\n \n \n \"Approximate paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@phdthesis{Sontag_thesis10,\n title  = {Approximate Inference in Graphical Models using LP Relaxations},\n author = {David Sontag},\n school = {Massachusetts Institute of Technology},\n address = {Department of Electrical Engineering and Computer Science},\n year   = {2010},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_phd_thesis.pdf},\n abstract = {Graphical models such as Markov random fields have been successfully applied to a wide variety of fields, from computer vision and natural language processing, to computational biology. Exact probabilistic inference is generally intractable in complex models having many dependencies between the variables. We present new approaches to approximate inference based on linear programming (LP) relaxations. Our algorithms optimize over the cycle relaxation of the marginal polytope, which we show to be closely related to the first lifting of the Sherali-Adams hierarchy, and is significantly tighter than the pairwise LP relaxation. We show how to efficiently optimize over the cycle relaxation using a cutting-plane algorithm that iteratively introduces constraints into the relaxation. We provide a criterion to determine which constraints would be most helpful in tightening the relaxation, and give efficient algorithms for solving the search problem of finding the best cycle constraint to add according to this criterion. By solving the LP relaxations in the dual, we obtain efficient message-passing algorithms that, when the relaxations are tight, can provably find the most likely (MAP) configuration. Our algorithms succeed at finding the MAP configuration in protein side-chain placement, protein design, and stereo vision problems.}\n}\n\n
\n
\n\n\n
\n Graphical models such as Markov random fields have been successfully applied to a wide variety of fields, from computer vision and natural language processing, to computational biology. Exact probabilistic inference is generally intractable in complex models having many dependencies between the variables. We present new approaches to approximate inference based on linear programming (LP) relaxations. Our algorithms optimize over the cycle relaxation of the marginal polytope, which we show to be closely related to the first lifting of the Sherali-Adams hierarchy, and is significantly tighter than the pairwise LP relaxation. We show how to efficiently optimize over the cycle relaxation using a cutting-plane algorithm that iteratively introduces constraints into the relaxation. We provide a criterion to determine which constraints would be most helpful in tightening the relaxation, and give efficient algorithms for solving the search problem of finding the best cycle constraint to add according to this criterion. By solving the LP relaxations in the dual, we obtain efficient message-passing algorithms that, when the relaxations are tight, can provably find the most likely (MAP) configuration. Our algorithms succeed at finding the MAP configuration in protein side-chain placement, protein design, and stereo vision problems.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Efficiently with Approximate Inference via Dual Losses.\n \n \n \n \n\n\n \n Meshi, O.; Sontag, D.; Jaakkola, T.; and Globerson, A.\n\n\n \n\n\n\n In Furnkranz, J.; and Joachims, T., editor(s), Proceedings of the 27th International Conference on Machine Learning (ICML), pages 783-790, 2010. Omnipress\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{MesSonJaaGlo_icml10,\n title = {Learning Efficiently with Approximate Inference via Dual Losses},\n author = {Ofer Meshi and David Sontag and Tommi Jaakkola and Amir Globerson},\n booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML)},\n pages = {783-790},\n editor = {Johannes Furnkranz and Thorsten Joachims},\n publisher = {Omnipress},\n year = {2010},\n keywords = {Machine learning, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/MesSonJaaGlo_icml10.pdf},\n abstract = {Many structured prediction tasks involve complex models where inference is computationally intractable, but where it can be well approximated using a linear programming relaxation. Previous approaches for learning for structured prediction (e.g., cutting-plane, subgradient methods, perceptron) repeatedly make predictions for some of the data points. These approaches are computationally demanding because each prediction involves solving a linear program to optimality. We present a scalable algorithm for learning for structured prediction. The main idea is to instead solve the dual of the structured prediction loss. We formulate the learning task as a convex minimization over both the weights and the dual variables corresponding to each data point. As a result, we can begin to optimize the weights even before completely solving any of the individual prediction problems. We show how the dual variables can be efficiently optimized using coordinate descent. Our algorithm is competitive with state-of-the-art methods such as stochastic subgradient and cutting-plane.}\n}\n\n
\n
\n\n\n
\n Many structured prediction tasks involve complex models where inference is computationally intractable, but where it can be well approximated using a linear programming relaxation. Previous approaches for learning for structured prediction (e.g., cutting-plane, subgradient methods, perceptron) repeatedly make predictions for some of the data points. These approaches are computationally demanding because each prediction involves solving a linear program to optimality. We present a scalable algorithm for learning for structured prediction. The main idea is to instead solve the dual of the structured prediction loss. We formulate the learning task as a convex minimization over both the weights and the dual variables corresponding to each data point. As a result, we can begin to optimize the weights even before completely solving any of the individual prediction problems. We show how the dual variables can be efficiently optimized using coordinate descent. Our algorithm is competitive with state-of-the-art methods such as stochastic subgradient and cutting-plane.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Bayesian Network Structure using LP Relaxations.\n \n \n \n \n\n\n \n Jaakkola, T.; Sontag, D.; Globerson, A.; and Meila, M.\n\n\n \n\n\n\n In Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AI-STATS), volume 9, pages 358-365, 2010. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JaaSonGloMei_aistats10,\n title  = {Learning {B}ayesian Network Structure using {LP} Relaxations},\n author = {Tommi Jaakkola and David Sontag and Amir Globerson and Marina Meila},\n booktitle = {Proceedings of the Thirteenth International Conference on Artificial Intelligence and Statistics (AI-STATS)},\n publisher = {JMLR: W\\&CP},\n volume = {9},\n pages  = {358-365},\n year = {2010},\n keywords = {Machine learning, Bayesian network structure learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/structure_aistats10.pdf},\n abstract = {We propose to solve the combinatorial problem of finding the highest scoring Bayesian network structure from data. This structure learning problem can be viewed as an inference problem where the variables specify the choice of parents for each node in the graph. The key combinatorial difficulty arises from the global constraint that the graph structure has to be acyclic. We cast the structure learning problem as a linear program over the polytope defined by valid acyclic structures. In relaxing this problem, we maintain an outer bound approximation to the polytope and iteratively tighten it by searching over a new class of valid constraints. If an integral solution is found, it is guaranteed to be the optimal Bayesian network. When the relaxation is not tight, the fast dual algorithms we develop remain useful in combination with a branch and bound method. Empirical results suggest that the method is competitive or faster than alternative exact methods based on dynamic programming.}\n}\n\n
\n
\n\n\n
\n We propose to solve the combinatorial problem of finding the highest scoring Bayesian network structure from data. This structure learning problem can be viewed as an inference problem where the variables specify the choice of parents for each node in the graph. The key combinatorial difficulty arises from the global constraint that the graph structure has to be acyclic. We cast the structure learning problem as a linear program over the polytope defined by valid acyclic structures. In relaxing this problem, we maintain an outer bound approximation to the polytope and iteratively tighten it by searching over a new class of valid constraints. If an integral solution is found, it is guaranteed to be the optimal Bayesian network. When the relaxation is not tight, the fast dual algorithms we develop remain useful in combination with a branch and bound method. Empirical results suggest that the method is competitive or faster than alternative exact methods based on dynamic programming.\n
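As a hedged illustration of the cutting-plane idea in this paper, consider an LP over parent-set choices that is lazily tightened with cluster constraints (every subset C of nodes must contain a node whose parents all lie outside C) whenever the rounded solution contains a cycle. The brute-force cycle search and score interface below are assumptions for exposition; the paper's dual algorithms are far more efficient.

```python
# A toy sketch of structure learning via an LP with lazily added cluster cuts.
import itertools
import numpy as np
from scipy.optimize import linprog

def learn_structure(scores):
    """scores[i][S]: local score of node i with frozenset S of parents."""
    var = [(i, S) for i in sorted(scores) for S in scores[i]]
    c = np.array([-scores[i][S] for (i, S) in var])      # maximize total score
    A_eq = np.array([[float(i == j) for (j, _) in var]   # one parent set each
                     for i in sorted(scores)])
    cuts = []                                            # cluster constraints
    while True:
        A_ub = -np.array(cuts) if cuts else None         # rows encode sum >= 1
        b_ub = -np.ones(len(cuts)) if cuts else None
        res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq,
                      b_eq=np.ones(len(scores)), bounds=(0, 1), method="highs")
        parents = {}                                     # round per node
        for i in sorted(scores):
            js = [j for j, (i2, _) in enumerate(var) if i2 == i]
            parents[i] = var[max(js, key=lambda j: res.x[j])][1]
        C = find_cycle(parents)
        if C is None:
            return parents, -res.fun
        cuts.append([1.0 if (i in C and not (S & C)) else 0.0
                     for (i, S) in var])                 # forbid this cycle

def find_cycle(parents):
    """Smallest set C in which every node has a parent inside C, if any."""
    for r in range(2, len(parents) + 1):
        for C in map(frozenset, itertools.combinations(parents, r)):
            if all(parents[i] & C for i in C):
                return C
    return None
```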
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Tree Block Coordinate Descent for MAP in Graphical Models.\n \n \n \n \n\n\n \n Sontag, D.; and Jaakkola, T.\n\n\n \n\n\n\n In Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AI-STATS), volume 8, pages 544-551, 2009. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"Tree paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SonJaa_aistats09,\n title  = {Tree Block Coordinate Descent for {MAP} in Graphical Models},\n author = {David Sontag and Tommi Jaakkola},\n booktitle = {Proceedings of the Twelfth International Conference on Artificial Intelligence and Statistics (AI-STATS)},\n publisher = {JMLR: W\\&CP},\n volume = {8},\n pages  = {544-551},\n year = {2009},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_aistats09.pdf},\n abstract = {A number of linear programming relaxations have been proposed for finding most likely settings of the variables (MAP) in large probabilistic models. The relaxations are often succinctly expressed in the dual and reduce to different types of reparameterizations of the original model. The dual objectives are typically solved by performing local block coordinate descent steps. In this work, we show how to perform block coordinate descent on spanning trees of the graphical model. We also show how all of the earlier dual algorithms are related to each other, giving transformations from one type of reparameterization to another while maintaining monotonicity relative to a common objective function. Finally, we quantify when the MAP solution can and cannot be decoded directly from the dual LP relaxation.}\n}\n\n
\n
\n\n\n
\n A number of linear programming relaxations have been proposed for finding most likely settings of the variables (MAP) in large probabilistic models. The relaxations are often succinctly expressed in the dual and reduce to different types of reparameterizations of the original model. The dual objectives are typically solved by performing local block coordinate descent steps. In this work, we show how to perform block coordinate descent on spanning trees of the graphical model. We also show how all of the earlier dual algorithms are related to each other, giving transformations from one type of reparameterization to another while maintaining monotonicity relative to a common objective function. Finally, we quantify when the MAP solution can and cannot be decoded directly from the dual LP relaxation.\n
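Tree block updates hinge on solving MAP exactly on a spanning tree. For intuition, here is the max-product dynamic program on the simplest tree, a chain; the potential format is an assumption for illustration, not the paper's reparameterized updates.

```python
# Exact MAP on a chain via max-product dynamic programming, the kind of tree
# oracle that tree block coordinate descent steps rely on.
import numpy as np

def chain_map(theta_i, theta_ij):
    """theta_i: list of (k,) arrays; theta_ij: list of (k, k) arrays, len n-1."""
    n = len(theta_i)
    msg = [None] * n                 # msg[t][x_t] = best score of suffix t..n-1
    back = [None] * n
    msg[n - 1] = theta_i[n - 1].copy()
    for t in range(n - 2, -1, -1):
        scores = theta_ij[t] + msg[t + 1][None, :]       # (x_t, x_{t+1})
        back[t] = scores.argmax(axis=1)
        msg[t] = theta_i[t] + scores.max(axis=1)
    x = [int(msg[0].argmax())]       # backtrack the optimal labeling
    for t in range(n - 1):
        x.append(int(back[t][x[-1]]))
    return x, float(msg[0].max())
```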
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Clusters and Coarse Partitions in LP Relaxations.\n \n \n \n \n\n\n \n Sontag, D.; Globerson, A.; and Jaakkola, T.\n\n\n \n\n\n\n In Koller, D.; Schuurmans, D.; Bengio, Y.; and Bottou, L., editor(s), Advances in Neural Information Processing Systems 21, pages 1537–1544, 2009. MIT Press\n \n\n\n\n
\n\n\n\n \n \n \"Clusters paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SonGloJaa_nips08,\n title  = {Clusters and Coarse Partitions in {LP} Relaxations},\n author = {David Sontag and Amir Globerson and Tommi Jaakkola},\n booktitle = {Advances in Neural Information Processing Systems 21},\n editor = {D. Koller and D. Schuurmans and Y. Bengio and L. Bottou},\n pages = {1537--1544},\n publisher = {MIT Press},\n year = {2009},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_nips08.pdf},\n abstract = {We propose a new class of consistency constraints for Linear Programming (LP) relaxations for finding the most probable (MAP) configuration in graphical models. Usual cluster-based LP relaxations enforce joint consistency on the beliefs of a cluster of variables, with computational cost increasing exponentially with the size of the clusters. By partitioning the state space of a cluster and enforcing consistency only across partitions, we obtain a class of constraints which, although less tight, are computationally feasible for large clusters. We show how to solve the cluster selection and partitioning problem monotonically in the dual LP, using the current beliefs to guide these choices. We obtain a dual message passing algorithm and apply it to protein design problems where the variables have large state spaces and the usual cluster-based relaxations are very costly. The resulting method solves many of these problems exactly, and significantly faster than a method that does not use partitioning.}\n}\n\n
\n
\n\n\n
\n We propose a new class of consistency constraints for Linear Programming (LP) relaxations for finding the most probable (MAP) configuration in graphical models. Usual cluster-based LP relaxations enforce joint consistency on the beliefs of a cluster of variables, with computational cost increasing exponentially with the size of the clusters. By partitioning the state space of a cluster and enforcing consistency only across partitions, we obtain a class of constraints which, although less tight, are computationally feasible for large clusters. We show how to solve the cluster selection and partitioning problem monotonically in the dual LP, using the current beliefs to guide these choices. We obtain a dual message passing algorithm and apply it to protein design problems where the variables have large state spaces and the usual cluster-based relaxations are very costly. The resulting method solves many of these problems exactly, and significantly faster than a method that does not use partitioning.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Tightening LP Relaxations for MAP using Message-Passing.\n \n \n \n \n\n\n \n Sontag, D.; Meltzer, T.; Globerson, A.; Weiss, Y.; and Jaakkola, T.\n\n\n \n\n\n\n In 24th Conference on Uncertainty in Artificial Intelligence, pages 503-510, 2008. AUAI Press\n \n\n\n\n
\n\n\n\n \n \n \"Tightening paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SontagEtAl_uai08,\n title  = {Tightening {LP} Relaxations for {MAP} using Message-Passing},\n author = {David Sontag and Talya Meltzer and Amir Globerson and Yair Weiss and Tommi Jaakkola},\n pages     = {503-510},\n booktitle = {24th Conference on Uncertainty in Artificial Intelligence},\n publisher = {AUAI Press},\n year = {2008},\n keywords = {Machine learning, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_uai08.pdf},\n abstract = {Linear Programming (LP) relaxations have become powerful tools for finding the most probable (MAP) configuration in graphical models. These relaxations can be solved efficiently using message-passing algorithms such as belief propagation and, when the relaxation is tight, provably find the MAP configuration. The standard LP relaxation is not tight enough in many real-world problems, however, and this has led to the use of higher order cluster-based LP relaxations. The computational cost increases exponentially with the size of the clusters and limits the number and type of clusters we can use. We propose to solve the cluster selection problem monotonically in the dual LP, iteratively selecting clusters with guaranteed improvement, and quickly re-solving with the added clusters by reusing the existing solution. Our dual message-passing algorithm finds the MAP configuration in protein side-chain placement, protein design, and stereo problems, in cases where the standard LP relaxation fails.}\n}\n\n
\n
\n\n\n
\n Linear Programming (LP) relaxations have become powerful tools for finding the most probable (MAP) configuration in graphical models. These relaxations can be solved efficiently using message-passing algorithms such as belief propagation and, when the relaxation is tight, provably find the MAP configuration. The standard LP relaxation is not tight enough in many real-world problems, however, and this has led to the use of higher order cluster-based LP relaxations. The computational cost increases exponentially with the size of the clusters and limits the number and type of clusters we can use. We propose to solve the cluster selection problem monotonically in the dual LP, iteratively selecting clusters with guaranteed improvement, and quickly re-solving with the added clusters by reusing the existing solution. Our dual message-passing algorithm finds the MAP configuration in protein side-chain placement, protein design, and stereo problems, in cases where the standard LP relaxation fails.\n
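For context, the pairwise dual updates that this paper builds on look roughly like the following MPLP-style coordinate descent; the model format, update schedule, and decoding here are illustrative assumptions, not the paper's exact cluster-tightening algorithm.

```python
# A simplified pairwise MPLP-style block coordinate-descent pass in the dual LP.
import numpy as np

def mplp(theta_i, theta_ij, edges, k, iters=100):
    # delta[(e, v)][x_v]: message from edge e to its endpoint v.
    delta = {(e, v): np.zeros(k) for e in edges for v in e}
    for _ in range(iters):
        for e in edges:
            i, j = e
            b_i = theta_i[i] + sum((delta[(f, i)] for f in edges
                                    if i in f and f != e), np.zeros(k))
            b_j = theta_i[j] + sum((delta[(f, j)] for f in edges
                                    if j in f and f != e), np.zeros(k))
            # Closed-form update for both messages of edge e.
            delta[(e, i)] = -0.5 * b_i + 0.5 * (theta_ij[e]
                                                + b_j[None, :]).max(axis=1)
            delta[(e, j)] = -0.5 * b_j + 0.5 * (theta_ij[e].T
                                                + b_i[None, :]).max(axis=1)
    # Decode from single-node beliefs; exact when the relaxation is tight.
    return {v: int((theta_i[v] + sum((delta[(f, v)] for f in edges if v in f),
                                     np.zeros(k))).argmax())
            for v in theta_i}
```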
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n New Outer Bounds on the Marginal Polytope.\n \n \n \n \n\n\n \n Sontag, D.; and Jaakkola, T.\n\n\n \n\n\n\n In Platt, J.; Koller, D.; Singer, Y.; and Roweis, S., editor(s), Advances in Neural Information Processing Systems 20, pages 1393–1400, Cambridge, MA, 2008. MIT Press\n \n\n\n\n
\n\n\n\n \n \n \"New paper\n  \n \n \n \"New link\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{SonJaa_nips08,\n title  = {New Outer Bounds on the Marginal Polytope},\n author = {David Sontag and Tommi Jaakkola}, \n booktitle = {Advances in Neural Information Processing Systems 20},\n editor = {J.C. Platt and D. Koller and Y. Singer and S. Roweis},\n publisher = {MIT Press},\n address = {Cambridge, MA},\n pages = {1393--1400},\n year = {2008},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/sontag_nips2007.pdf},\n url_Link = {http://people.csail.mit.edu/dsontag/papers/sontag_nips2007_addendum.txt},\n keywords = {Machine learning, Approximate inference in graphical models},\n abstract = {We give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints on the marginal polytope are derived through a series of projections onto the cut polytope. As a result, we obtain tighter upper bounds on the log-partition function. We also show empirically that the approximations of the marginals are significantly more accurate when using the tighter outer bounds. Finally, we demonstrate the advantage of the new constraints for finding the MAP assignment in protein structure prediction.}\n}\n\n
\n
\n\n\n
\n We give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints on the marginal polytope are derived through a series of projections onto the cut polytope. As a result, we obtain tighter upper bounds on the log-partition function. We also show empirically that the approximations of the marginals are significantly more accurate when using the tighter outer bounds. Finally, we demonstrate the advantage of the new constraints for finding the MAP assignment in protein structure prediction.\n
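The simplest members of this class of constraints are cycle inequalities on triangles. Below is a hedged sketch of checking them on "cut" pseudomarginals c[(i, j)] = P(x_i != x_j) for binary variables; any violated inequality can then be added to the LP as a cutting plane, which is the loop the paper describes.

```python
# Check triangle (3-cycle) inequalities: around any cycle, an even number of
# edges must be cut, giving the linear constraints tested below.
import itertools

def violated_triangles(c, nodes, tol=1e-9):
    """c: dict of cut pseudomarginals keyed by sorted node pairs."""
    key = lambda i, j: (min(i, j), max(i, j))
    out = []
    for i, j, l in itertools.combinations(sorted(nodes), 3):
        e = [key(i, j), key(i, l), key(j, l)]
        cut = [c[x] for x in e]
        if sum(cut) > 2 + tol:                  # at most two of three edges cut
            out.append((e, "c1 + c2 + c3 <= 2"))
        for a in range(3):                      # each cut <= sum of the others
            if cut[a] > cut[(a + 1) % 3] + cut[(a + 2) % 3] + tol:
                out.append((e, f"cut{e[a]} <= sum of the other two"))
    return out                                  # each entry is a cutting plane
```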
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Cutting Plane Algorithms for Variational Inference in Graphical Models.\n \n \n \n \n\n\n \n Sontag, D.\n\n\n \n\n\n\n Master's thesis, Massachusetts Institute of Technology, Department of Electrical Engineering and Computer Science, 2007.\n \n\n\n\n
\n\n\n\n \n \n \"Cutting paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@mastersthesis{Sontag_thesis07,\n title  = {Cutting Plane Algorithms for Variational Inference in Graphical Models},\n author = {David Sontag},\n school = {Massachusetts Institute of Technology},\n address = {Department of Electrical Engineering and Computer Science},\n year   = {2007},\n abstract = {In this thesis, we give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints are derived for the marginal polytope through a series of projections onto the cut polytope. Projecting onto a larger model gives an efficient separation algorithm for a large class of valid inequalities arising from each of the original projections. As a result, we obtain tighter upper bounds on the log-partition function than possible with previous variational inference algorithms. We also show empirically that our approximations of the marginals are significantly more accurate. This algorithm can also be applied to the problem of finding the Maximum a Posteriori assignment in an MRF, which corresponds to a linear program over the marginal polytope. One of the main contributions of the thesis is to bring together two seemingly different fields, polyhedral combinatorics and probabilistic inference, showing how certain results in either field can carry over to the other.},\n url_Paper = {http://people.csail.mit.edu/dsontag/masters_thesis.pdf},\n keywords = {Machine learning, Approximate inference in graphical models}\n}\n\n
\n
\n\n\n
\n In this thesis, we give a new class of outer bounds on the marginal polytope, and propose a cutting-plane algorithm for efficiently optimizing over these constraints. When combined with a concave upper bound on the entropy, this gives a new variational inference algorithm for probabilistic inference in discrete Markov Random Fields (MRFs). Valid constraints are derived for the marginal polytope through a series of projections onto the cut polytope. Projecting onto a larger model gives an efficient separation algorithm for a large class of valid inequalities arising from each of the original projections. As a result, we obtain tighter upper bounds on the log-partition function than possible with previous variational inference algorithms. We also show empirically that our approximations of the marginals are significantly more accurate. This algorithm can also be applied to the problem of finding the Maximum a Posteriori assignment in an MRF, which corresponds to a linear program over the marginal polytope. One of the main contributions of the thesis is to bring together two seemingly different fields, polyhedral combinatorics and probabilistic inference, showing how certain results in either field can carry over to the other.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n BLOG: probabilistic models with unknown objects.\n \n \n \n \n\n\n \n Milch, B.; Marthi, B.; Russell, S.; Sontag, D.; Ong, D. L.; and Kolobov, A.\n\n\n \n\n\n\n In IJCAI'05: Proceedings of the 19th international joint conference on Artificial intelligence, pages 1352–1359, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"BLOG: paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{MilchEtAl_ijcai05,\n title = {{BLOG}: probabilistic models with unknown objects},\n author = {Brian Milch and Bhaskara Marthi and Stuart Russell and David Sontag and Daniel L. Ong and Andrey Kolobov},\n booktitle = {IJCAI'05: Proceedings of the 19th international joint conference on Artificial intelligence},\n year = {2005},\n pages = {1352--1359},\n keywords = {Machine learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/MilchEtAl_IJCAI05.pdf},\n abstract = {This paper introduces and illustrates BLOG, a formal language for defining probability models over worlds with unknown objects and identity uncertainty. BLOG unifies and extends several existing approaches. Subject to certain acyclicity constraints, every BLOG model specifies a unique probability distribution over first-order model structures that can contain varying and unbounded numbers of objects. Furthermore, complete inference algorithms exist for a large fragment of the language. We also introduce a probabilistic form of Skolemization for handling evidence.}\n}\n% location = {Edinburgh, Scotland},\n% publisher = {Morgan Kaufmann Publishers Inc.},\n% address = {San Francisco, CA, USA},\n\n
\n
\n\n\n
\n This paper introduces and illustrates BLOG, a formal language for defining probability models over worlds with unknown objects and identity uncertainty. BLOG unifies and extends several existing approaches. Subject to certain acyclicity constraints, every BLOG model specifies a unique probability distribution over first-order model structures that can contain varying and unbounded numbers of objects. Furthermore, complete inference algorithms exist for a large fragment of the language. We also introduce a probabilistic form of Skolemization for handling evidence.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Approximate Inference for Infinite Contingent Bayesian Networks.\n \n \n \n \n\n\n \n Milch, B.; Marthi, B.; Sontag, D.; Russell, S.; Ong, D. L.; and Kolobov, A.\n\n\n \n\n\n\n In Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics, pages 238–245, 2005. \n \n\n\n\n
\n\n\n\n \n \n \"Approximate paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n\n\n\n
\n
@inproceedings{MilchEtAl_aistats05,\n title = {Approximate Inference for Infinite Contingent {B}ayesian Networks},\n author = {Brian Milch and Bhaskara Marthi and David Sontag and Stuart Russell and Daniel L. Ong and Andrey Kolobov},\n booktitle = {Proceedings of the Tenth International Workshop on Artificial Intelligence and Statistics},\n year = {2005},\n pages = {238--245},\n keywords = {Machine learning},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/MilchEtAl_AIStats05.pdf},\n abstract = {In many practical problems -- from tracking aircraft based on radar data to building a bibliographic database based on citation lists -- we want to reason about an unbounded number of unseen objects with unknown relations among them. Bayesian networks, which define a fixed dependency structure on a finite set of variables, are not the ideal representation language for this task. This paper introduces contingent Bayesian networks (CBNs), which represent uncertainty about dependencies by labeling each edge with a condition under which it is active. A CBN may contain cycles and have infinitely many variables. Nevertheless, we give general conditions under which such a CBN defines a unique joint distribution over its variables. We also present a likelihood weighting algorithm that performs approximate inference in finite time per sampling step on any CBN that satisfies these conditions.}\n}\n% editor = {Robert G. Cowell and Zoubin Ghahramani},\n% publisher = {Society for Artificial Intelligence and Statistics},\n
\n
\n\n\n
\n In many practical problems – from tracking aircraft based on radar data to building a bibliographic database based on citation lists – we want to reason about an unbounded number of unseen objects with unknown relations among them. Bayesian networks, which define a fixed dependency structure on a finite set of variables, are not the ideal representation language for this task. This paper introduces contingent Bayesian networks (CBNs), which represent uncertainty about dependencies by labeling each edge with a condition under which it is active. A CBN may contain cycles and have infinitely many variables. Nevertheless, we give general conditions under which such a CBN defines a unique joint distribution over its variables. We also present a likelihood weighting algorithm that performs approximate inference in finite time per sampling step on any CBN that satisfies these conditions.\n
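A hedged sketch of the likelihood-weighting idea follows: variables are instantiated lazily, only when a sampled context activates an edge into them. The aircraft/blip model below is an assumption in the spirit of the paper's examples, not its actual benchmark.

```python
# Likelihood weighting with contingent dependencies: sample non-evidence
# variables from the prior (lazily), weight each sample by the evidence
# likelihood, and form a weighted posterior estimate.
import math
import random

def normal_pdf(x, mu, sigma):
    return math.exp(-0.5 * ((x - mu) / sigma) ** 2) / (sigma * math.sqrt(2 * math.pi))

def posterior_mean_num_aircraft(blips, n_samples=20_000):
    total_w, acc = 0.0, 0.0
    for _ in range(n_samples):
        n = random.randint(1, 5)                # unknown number of objects
        pos = {}                                # positions sampled lazily
        w = 1.0
        for b in blips:
            src = random.randrange(n)           # contingent edge: blip -> pos[src]
            if src not in pos:                  # instantiate only if relevant
                pos[src] = random.gauss(0.0, 10.0)
            w *= normal_pdf(b, pos[src], 1.0)   # weight by evidence likelihood
        total_w += w
        acc += w * n
    return acc / total_w

print(posterior_mean_num_aircraft([2.1, 2.3, -7.8]))
```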
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n Natural language processing\n \n \n (7)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Grounded Recurrent Neural Networks.\n \n \n \n \n\n\n \n Vani, A.; Jernite, Y.; and Sontag, D.\n\n\n \n\n\n\n ArXiv e-prints arXiv:1705.08557. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"Grounded paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{VaniEtAl_arxiv17,\n   author = {{Vani}, A. and {Jernite}, Y. and {Sontag}, D.},\n    title = "{Grounded Recurrent Neural Networks}",\n  journal = {ArXiv e-prints arXiv:1705.08557},\narchivePrefix = "arXiv",\n   eprint = {1705.08557},\n primaryClass = "stat.ML",\n     year = 2017,\n  keywords = {Machine learning, Health care, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.08557.pdf},\n  abstract = {In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process "grounding"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.}\n}\n\n
\n
\n\n\n
\n In this work, we present the Grounded Recurrent Neural Network (GRNN), a recurrent neural network architecture for multi-label prediction which explicitly ties labels to specific dimensions of the recurrent hidden state (we call this process \"grounding\"). The approach is particularly well-suited for extracting large numbers of concepts from text. We apply the new model to address an important problem in healthcare of understanding what medical concepts are discussed in clinical text. Using a publicly available dataset derived from Intensive Care Units, we learn to label a patient's diagnoses and procedures from their discharge summary. Our evaluation shows a clear advantage to using our proposed architecture over a variety of strong baselines.\n
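A minimal PyTorch-style sketch of the grounding idea: reserve one recurrent hidden dimension per label and read multi-label probabilities directly off those dimensions. All sizes and the GRU choice are assumptions for illustration, not the paper's architecture details.

```python
import torch
import torch.nn as nn

class GroundedRNN(nn.Module):
    def __init__(self, vocab_size, n_labels, extra_dims=64, emb_dim=100):
        super().__init__()
        self.n_labels = n_labels
        self.emb = nn.Embedding(vocab_size, emb_dim)
        # The first n_labels hidden dimensions are "grounded" to the labels.
        self.rnn = nn.GRU(emb_dim, n_labels + extra_dims, batch_first=True)

    def forward(self, tokens):                   # tokens: (batch, seq_len)
        _, h = self.rnn(self.emb(tokens))        # h: (1, batch, hidden)
        grounded = h[0, :, :self.n_labels]       # one dimension per label
        return torch.sigmoid(grounded)           # multi-label probabilities

model = GroundedRNN(vocab_size=5000, n_labels=32)
probs = model(torch.randint(0, 5000, (4, 120))) # -> (4, 32)
```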
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning.\n \n \n \n \n\n\n \n Jernite, Y.; Bowman, S. R; and Sontag, D.\n\n\n \n\n\n\n arXiv preprint arXiv:1705.00557. 2017.\n \n\n\n\n
\n\n\n\n \n \n \"Discourse-Based paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@article{JerniteEtAl_arxiv17,\n  title={Discourse-Based Objectives for Fast Unsupervised Sentence Representation Learning},\n  author={Jernite, Yacine and Bowman, Samuel R and Sontag, David},\n  journal={arXiv preprint arXiv:1705.00557},\n  year={2017},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1705.00557.pdf},\n  abstract = {This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.}\n}\n\n\n
\n
\n\n\n
\n This work presents a novel objective function for the unsupervised training of neural network sentence encoders. It exploits signals from paragraph-level discourse coherence to train these models to understand text. Our objective is purely discriminative, allowing us to train models many times faster than was possible under prior methods, and it yields models which perform well in extrinsic evaluations.\n
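One discriminative objective in this spirit can be sketched as follows: classify whether two sentences were adjacent in their source paragraph, so training never requires generating text. The averaging encoder and all sizes are assumptions for illustration.

```python
import torch
import torch.nn as nn

class AdjacencyClassifier(nn.Module):
    def __init__(self, vocab_size, dim=128):
        super().__init__()
        self.emb = nn.EmbeddingBag(vocab_size, dim)     # mean of word vectors
        self.clf = nn.Linear(2 * dim, 1)

    def forward(self, sent_a, sent_b):                  # (batch, n_words) each
        z = torch.cat([self.emb(sent_a), self.emb(sent_b)], dim=-1)
        return self.clf(z).squeeze(-1)                  # logit: adjacent or not

model = AdjacencyClassifier(vocab_size=10_000)
loss_fn = nn.BCEWithLogitsLoss()
a = torch.randint(0, 10_000, (8, 20))                   # first sentences
b = torch.randint(0, 10_000, (8, 20))                   # candidate successors
y = torch.randint(0, 2, (8,)).float()                   # 1 = truly consecutive
loss = loss_fn(model(a, b), y)                          # purely discriminative
```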
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Simultaneous Learning of Trees and Representations for Extreme Classification and Density Estimation.\n \n \n \n \n\n\n \n Jernite, Y.; Choromanska, A.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 34th International Conference on Machine Learning, pages 1665-1674, 2017. \n \n\n\n\n
\n\n\n\n \n \n \"Simultaneous paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n  \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JerniteEtAl_icml17,\n  author    = {Yacine Jernite and\n               Anna Choromanska and\n               David Sontag},\n  title     = {Simultaneous Learning of Trees and Representations for Extreme Classification\n               and Density Estimation},\n  booktitle = {Proceedings of the 34th International Conference on Machine Learning},\n  pages     = {1665-1674},\n  year      = {2017},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {https://arxiv.org/pdf/1610.04658.pdf},\n  abstract = {We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it expected that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.}\n}\n\n
\n
\n\n\n
\n We consider multi-class classification where the predictor has a hierarchical structure that allows for a very large number of labels both at train and test time. The predictive power of such models can heavily depend on the structure of the tree, and although past work showed how to learn the tree structure, it expected that the feature vectors remained static. We provide a novel algorithm to simultaneously perform representation learning for the input data and learning of the hierarchical predictor. Our approach optimizes an objective function which favors balanced and easily-separable multi-way node partitions. We theoretically analyze this objective, showing that it gives rise to a boosting style property and a bound on classification error. We next show how to extend the algorithm to conditional density estimation. We empirically validate both variants of the algorithm on text classification and language modeling, respectively, and show that they compare favorably to common baselines in terms of accuracy and running time.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Character-Aware Neural Language Models.\n \n \n \n \n\n\n \n Kim, Y.; Jernite, Y.; Sontag, D.; and Rush, A. M.\n\n\n \n\n\n\n In Proceedings of the Thirtieth AAAI Conference on Artificial Intelligence, pages 2741-2749, 2016. \n \n\n\n\n
\n\n\n\n \n \n \"Character-Aware paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KimEtAl_aaai16,\n  author    = {Yoon Kim and\n               Yacine Jernite and\n               David Sontag and\n               Alexander M. Rush},\n  title     = {Character-Aware Neural Language Models},\n  booktitle = {Proceedings of the Thirtieth {AAAI} Conference on Artificial Intelligence},\n  pages     = {2741-2749},\n  year      = {2016},\n  keywords = {Machine learning, Natural language processing, Deep learning},\n  url_Paper = {http://arxiv.org/pdf/1508.06615.pdf},\n  abstract = {We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60\\% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.}\n}\n\n
\n
\n\n\n
\n We describe a simple neural language model that relies only on character-level inputs. Predictions are still made at the word-level. Our model employs a convolutional neural network (CNN) and a highway network over characters, whose output is given to a long short-term memory (LSTM) recurrent neural network language model (RNN-LM). On the English Penn Treebank the model is on par with the existing state-of-the-art despite having 60% fewer parameters. On languages with rich morphology (Arabic, Czech, French, German, Spanish, Russian), the model outperforms word-level/morpheme-level LSTM baselines, again with fewer parameters. The results suggest that on many languages, character inputs are sufficient for language modeling. Analysis of word representations obtained from the character composition part of the model reveals that the model is able to encode, from characters only, both semantic and orthographic information.\n
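A compact PyTorch-style sketch of the architecture described above: character embeddings, a 1-D convolution with max-over-time pooling, a highway layer, and an LSTM over the resulting word representations. All sizes are assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

class CharAwareLM(nn.Module):
    def __init__(self, n_chars, n_words, char_dim=15, n_filters=100, hidden=300):
        super().__init__()
        self.char_emb = nn.Embedding(n_chars, char_dim)
        self.conv = nn.Conv1d(char_dim, n_filters, kernel_size=3, padding=1)
        self.gate = nn.Linear(n_filters, n_filters)      # highway layer
        self.lin = nn.Linear(n_filters, n_filters)
        self.lstm = nn.LSTM(n_filters, hidden, batch_first=True)
        self.out = nn.Linear(hidden, n_words)            # word-level softmax

    def forward(self, chars):               # chars: (batch, seq_len, word_len)
        b, t, w = chars.shape
        x = self.char_emb(chars.view(b * t, w)).transpose(1, 2)  # (bt, dim, w)
        x = torch.relu(self.conv(x)).max(dim=2).values   # max over time
        g = torch.sigmoid(self.gate(x))                  # highway: gated mix of
        x = g * torch.relu(self.lin(x)) + (1 - g) * x    # transform and carry
        h, _ = self.lstm(x.view(b, t, -1))
        return self.out(h)                  # next-word logits per position

model = CharAwareLM(n_chars=60, n_words=10_000)
logits = model(torch.randint(0, 60, (2, 35, 12)))        # -> (2, 35, 10000)
```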
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n A Fast Variational Approach for Learning Markov Random Field Language Models.\n \n \n \n \n\n\n \n Jernite, Y.; Rush, A.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 32nd International Conference on Machine Learning (ICML), volume 37, pages 2209–2217, 2015. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"A paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{JerniteEtAl_icml15,\n  author    = {Yacine Jernite and Alexander Rush and David Sontag},\n  title     = {A Fast Variational Approach for Learning Markov Random Field Language Models},\n  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},\n  year = {2015},\n publisher = {JMLR: W\\&CP},\n volume = {37},\n pages  = {2209--2217},\n keywords = {Machine learning, Natural language processing},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerRusSon_icml15.pdf},\n abstract = {Language modelling is a fundamental building block of natural language processing. However, in practice the size of the vocabulary limits the distributions applicable for this task: specifically, one has to either resort to local optimization methods, such as those used in neural language models, or work with heavily constrained distributions. In this work, we take a step towards overcoming these difficulties. We present a method for global-likelihood optimization of a Markov random field language model exploiting long-range contexts in time independent of the corpus size. We take a variational approach to optimizing the likelihood and exploit underlying symmetries to greatly simplify learning. We demonstrate the efficiency of this method both for language modelling and for part-of-speech tagging.}\n}\n\n
\n
\n\n\n
\n Language modelling is a fundamental building block of natural language processing. However, in practice the size of the vocabulary limits the distributions applicable for this task: specifically, one has to either resort to local optimization methods, such as those used in neural language models, or work with heavily constrained distributions. In this work, we take a step towards overcoming these difficulties. We present a method for global-likelihood optimization of a Markov random field language model exploiting long-range contexts in time independent of the corpus size. We take a variational approach to optimizing the likelihood and exploit underlying symmetries to greatly simplify learning. We demonstrate the efficiency of this method both for language modelling and for part-of-speech tagging.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing.\n \n \n \n \n\n\n \n Rush, A. M.; Sontag, D.; Collins, M.; and Jaakkola, T.\n\n\n \n\n\n\n In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1-11, 2010. \n \n\n\n\n
\n\n\n\n \n \n \"On paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{RusSonColJaa_emnlp10,\n author = {Alexander M. Rush and David Sontag and Michael Collins and Tommi Jaakkola},\n title = {On Dual Decomposition and Linear Programming Relaxations for Natural Language Processing},\n booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},\n pages = {1-11},\n year = {2010},\n keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/RusSonColJaa_emnlp10.pdf},\n abstract = {This paper introduces dual decomposition as a framework for deriving inference algorithms for NLP problems. The approach relies on standard dynamic-programming algorithms as oracle solvers for sub-problems, together with a simple method for forcing agreement between the different oracles. The approach provably solves a linear programming (LP) relaxation of the global inference problem. It leads to algorithms that are simple, in that they use existing decoding algorithms; efficient, in that they avoid exact algorithms for the full model; and often exact, in that empirically they often recover the correct solution in spite of using an LP relaxation. We give experimental results on two problems: 1) the combination of two lexicalized parsing models; and 2) the combination of a lexicalized parsing model and a trigram part-of-speech tagger.}\n}\n\n
\n
\n\n\n
\n This paper introduces dual decomposition as a framework for deriving inference algorithms for NLP problems. The approach relies on standard dynamic-programming algorithms as oracle solvers for sub-problems, together with a simple method for forcing agreement between the different oracles. The approach provably solves a linear programming (LP) relaxation of the global inference problem. It leads to algorithms that are simple, in that they use existing decoding algorithms; efficient, in that they avoid exact algorithms for the full model; and often exact, in that empirically they often recover the correct solution in spite of using an LP relaxation. We give experimental results on two problems: 1) the combination of two lexicalized parsing models; and 2) the combination of a lexicalized parsing model and a trigram part-of-speech tagger.\n
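The core subgradient loop can be sketched as follows. The two unary score tables stand in for the paper's dynamic-programming oracles (e.g., two parsers, or a parser and a tagger); that simplification, and the step-size schedule, are assumptions for illustration.

```python
# Dual decomposition with subgradient updates: two sub-models score the same
# tag sequence, and Lagrange multipliers u push their argmax solutions to agree.
import numpy as np

def dual_decompose(score_a, score_b, n, k, iters=200):
    """score_a, score_b: (n, k) score tables for the two sub-problems."""
    u = np.zeros((n, k))
    for t in range(1, iters + 1):
        ya = (score_a + u).argmax(axis=1)       # oracle for sub-problem 1
        yb = (score_b - u).argmax(axis=1)       # oracle for sub-problem 2
        if np.array_equal(ya, yb):
            return ya                           # agreement certifies optimality
        g = np.zeros((n, k))                    # subgradient of the dual
        g[np.arange(n), ya] += 1
        g[np.arange(n), yb] -= 1
        u -= (1.0 / t) * g                      # decreasing step size
    return ya                                   # fall back to one sub-solution
```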
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Dual Decomposition for Parsing with Non-Projective Head Automata.\n \n \n \n \n\n\n \n Koo, T.; Rush, A. M.; Collins, M.; Jaakkola, T.; and Sontag, D.\n\n\n \n\n\n\n In Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP), pages 1288-1298, 2010. \n \n\n\n\n
\n\n\n\n \n \n \"Dual paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{KooEtAl_emnlp10,\n author = {Terry Koo and Alexander M. Rush and Michael Collins and Tommi Jaakkola and David Sontag},\n title = {Dual Decomposition for Parsing with Non-Projective Head Automata},\n booktitle = {Proceedings of the 2010 Conference on Empirical Methods in Natural Language Processing (EMNLP)},\n pages = {1288-1298},\n year = {2010},\n keywords = {Machine learning, Natural language processing, Approximate inference in graphical models},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/KooEtAl_emnlp10.pdf},\n abstract = {This paper introduces algorithms for non-projective parsing based on dual decomposition. We focus on parsing algorithms for non-projective head automata, a generalization of head-automata models to non-projective structures. The dual decomposition algorithms are simple and efficient, relying on standard dynamic programming and minimum spanning tree algorithms. They provably solve an LP relaxation of the non-projective parsing problem. Empirically the LP relaxation is very often tight: for many languages, exact solutions are achieved on over 98\\% of test sentences. The accuracy of our models is higher than previous work on a broad range of datasets.}\n}\n\n
\n
\n\n\n
\n This paper introduces algorithms for non-projective parsing based on dual decomposition. We focus on parsing algorithms for non-projective head automata, a generalization of head-automata models to non-projective structures. The dual decomposition algorithms are simple and efficient, relying on standard dynamic programming and minimum spanning tree algorithms. They provably solve an LP relaxation of the non-projective parsing problem. Empirically the LP relaxation is very often tight: for many languages, exact solutions are achieved on over 98% of test sentences. The accuracy of our models is higher than previous work on a broad range of datasets.\n
\n\n\n
\n\n\n\n\n\n
\n
\n\n
\n
\n  \n Structured prediction\n \n \n (5)\n \n \n
\n
\n \n \n
\n \n\n \n \n \n \n \n \n Optimality of Approximate Inference Algorithms on Stable Instances.\n \n \n \n \n\n\n \n Lang, H.; Sontag, D.; and Vijayaraghavan, A.\n\n\n \n\n\n\n In Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS), 2018. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"Optimality paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{LangEtAl_aistats18,\n title = {Optimality of Approximate Inference Algorithms on Stable Instances},\n author = {Hunter Lang and David Sontag and Aravindan Vijayaraghavan},\n booktitle = {Proceedings of the Twenty-First International Conference on Artificial Intelligence and Statistics (AI-STATS)},\n publisher = {JMLR: W\\&CP},\n year = {2018},\n keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},\n url_Paper = {http://proceedings.mlr.press/v84/lang18a.html},\n abstract = {Approximate algorithms for structured prediction problems -- such as LP relaxations and the popular alpha-expansion algorithm (Boykov et al. 2001) -- typically far exceed their theoretical performance guarantees on real-world instances. These algorithms often find solutions that are very close to optimal. The goal of this paper is to partially explain the performance of alpha-expansion and an LP relaxation algorithm on MAP inference in Ferromagnetic Potts models (FPMs). Our main results give stability conditions under which these two algorithms provably recover the optimal MAP solution. These theoretical results complement numerous empirical observations of good performance.}\n}\n\n
\n
\n\n\n
\n Approximate algorithms for structured prediction problems – such as LP relaxations and the popular alpha-expansion algorithm (Boykov et al. 2001) – typically far exceed their theoretical performance guarantees on real-world instances. These algorithms often find solutions that are very close to optimal. The goal of this paper is to partially explain the performance of alpha-expansion and an LP relaxation algorithm on MAP inference in Ferromagnetic Potts models (FPMs). Our main results give stability conditions under which these two algorithms provably recover the optimal MAP solution. These theoretical results complement numerous empirical observations of good performance.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Train and Test Tightness of LP Relaxations in Structured Prediction.\n \n \n \n \n\n\n \n Meshi, O.; Mahdavi, M.; Weller, A.; and Sontag, D.\n\n\n \n\n\n\n In Balcan, M. F.; and Weinberger, K. Q., editor(s), Proceedings of The 33rd International Conference on Machine Learning, volume 48 of Proceedings of Machine Learning Research, pages 1776–1785, New York, New York, USA, 20–22 Jun 2016. PMLR\n \n\n\n\n
\n\n\n\n \n \n \"Train paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@InProceedings{MeshiEtAl_icml16,\n  title = \t {Train and Test Tightness of LP Relaxations in Structured Prediction},\n  author = \t {Ofer Meshi and Mehrdad Mahdavi and Adrian Weller and David Sontag},\n  booktitle = \t {Proceedings of The 33rd International Conference on Machine Learning},\n  pages = \t {1776--1785},\n  year = \t {2016},\n  editor = \t {Maria Florina Balcan and Kilian Q. Weinberger},\n  volume = \t {48},\n  series = \t {Proceedings of Machine Learning Research},\n  address = \t {New York, New York, USA},\n  month = \t {20--22 Jun},\n  publisher = \t {PMLR},\n  keywords = {Machine learning, Structured prediction},\n  url_Paper = {http://people.csail.mit.edu/dsontag/papers/MeshiEtAl_icml16.pdf},\n  abstract = {Structured prediction is used in areas such as computer vision and natural language processing to predict structured outputs such as segmentations or parse trees. In these settings, prediction is performed by MAP inference or, equivalently, by solving an integer linear program. Because of the complex scoring functions required to obtain accurate predictions, both learning and inference typically require the use of approximate solvers. We propose a theoretical explanation for the striking observation that approximations based on linear programming (LP) relaxations are often tight on real-world instances. In particular, we show that learning with LP relaxed inference encourages integrality of training instances, and that tightness generalizes from train to test data.}\n}\n\n
\n
\n\n\n
\n Structured prediction is used in areas such as computer vision and natural language processing to predict structured outputs such as segmentations or parse trees. In these settings, prediction is performed by MAP inference or, equivalently, by solving an integer linear program. Because of the complex scoring functions required to obtain accurate predictions, both learning and inference typically require the use of approximate solvers. We propose a theoretical explanation for the striking observation that approximations based on linear programming (LP) relaxations are often tight on real-world instances. In particular, we show that learning with LP relaxed inference encourages integrality of training instances, and that tightness generalizes from train to test data.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n How Hard is Inference for Structured Prediction?.\n \n \n \n \n\n\n \n Globerson, A.; Roughgarden, T.; Sontag, D.; and Yildirim, C.\n\n\n \n\n\n\n In Proceedings of the 32nd International Conference on Machine Learning (ICML), volume 37, pages 2181–2190, 2015. JMLR: W&CP\n \n\n\n\n
\n\n\n\n \n \n \"How paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{GlobersonEtAl_icml15,\n  author    = {Amir Globerson and Tim Roughgarden and David Sontag and Cafer Yildirim},\n  title     = {How Hard is Inference for Structured Prediction?},\n  booktitle = {Proceedings of the 32nd International Conference on Machine Learning (ICML)},\n  year = {2015},\n publisher = {JMLR: W\\&CP},\n volume = {37},\n pages  = {2181--2190},\n keywords = {Machine learning, Approximate inference in graphical models, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/GloRouSonYil_icml15.pdf},\n abstract = {Structured prediction tasks in machine learning involve the simultaneous prediction of multiple labels. This is often done by maximizing a score function on the space of labels, which decomposes as a sum of pairwise elements, each depending on two specific labels. The goal of this paper is to develop a theoretical explanation of the empirical effectiveness of heuristic inference algorithms for solving such structured prediction problems. We study the minimum-achievable expected Hamming error in such problems, highlighting the case of 2D grid graphs, which are common in machine vision applications. Our main theorems provide tight upper and lower bounds on this error, as well as a polynomial-time algorithm that achieves the bound.}\n}\n\n
\n
\n\n\n
\n Structured prediction tasks in machine learning involve the simultaneous prediction of multiple labels. This is often done by maximizing a score function on the space of labels, which decomposes as a sum of pairwise elements, each depending on two specific labels. The goal of this paper is to develop a theoretical explanation of the empirical effectiveness of heuristic inference algorithms for solving such structured prediction problems. We study the minimum-achievable expected Hamming error in such problems, highlighting the case of 2D grid graphs, which are common in machine vision applications. Our main theorems provide tight upper and lower bounds on this error, as well as a polynomial-time algorithm that achieves the bound.\n
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n More data means less inference: A pseudo-max approach to structured learning.\n \n \n \n \n\n\n \n Sontag, D.; Meshi, O.; Jaakkola, T.; and Globerson, A.\n\n\n \n\n\n\n In Lafferty, J.; Williams, C.; Shawe-Taylor, J.; Zemel, R.; and Culotta, A., editor(s), Advances in Neural Information Processing Systems 23, pages 2181–2189. MIT Press, 2010.\n \n\n\n\n
\n\n\n\n \n \n \"More paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@incollection{SonMesJaaGlo_nips10,\n author = {David Sontag and Ofer Meshi and Tommi Jaakkola and Amir Globerson},\n title = {More data means less inference: A pseudo-max approach to structured learning},\n booktitle = {Advances in Neural Information Processing Systems 23},\n editor = {J. Lafferty and C.K.I. Williams and J. Shawe-Taylor and R.S. Zemel and A. Culotta},\n pages = {2181--2189},\n publisher = {MIT Press},\n year = {2010},\n keywords = {Machine learning, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/SonMesJaaGlo_nips10.pdf},\n abstract = {The problem of learning to predict structured labels is of key importance in many applications. However, for general graph structure both learning and inference are intractable. Here we show that it is possible to circumvent this difficulty when the distribution of training examples is rich enough, via a method similar in spirit to pseudo-likelihood. We show that our new method achieves consistency, and illustrate empirically that it indeed approaches the performance of exact methods when sufficiently large training sets are used.}\n}\n\n
\n
\n\n\n
\n The problem of learning to predict structured labels is of key importance in many applications. However, for general graph structure both learning and inference are intractable. Here we show that it is possible to circumvent this difficulty when the distribution of training examples is rich enough, via a method similar in spirit to pseudo-likelihood. We show that our new method achieves consistency, and illustrate empirically that it indeed approaches the performance of exact methods when sufficiently large training sets are used.\n
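A hedged sketch of the pseudo-max trick: replace loss-augmented MAP over all joint labelings (intractable in general) with one cheap maximization per variable, clamping the remaining variables to their gold labels. The feature interface and step size below are assumptions for illustration.

```python
# One subgradient step of a pseudo-max-style objective for a linear model.
import numpy as np

def pseudo_max_step(w, feats, x, y, k, lr=0.1):
    """feats(x, labeling) -> feature vector; y: gold labeling; k: #labels."""
    grad = np.zeros_like(w)
    for i in range(len(y)):
        def score(lab):                    # margin-augmented score at var i,
            y_alt = list(y)                # all other variables held at gold
            y_alt[i] = lab
            return w @ feats(x, y_alt) + (lab != y[i]), y_alt
        best, y_best = max((score(lab) for lab in range(k)),
                           key=lambda t: t[0])
        if y_best[i] != y[i]:              # per-variable hinge is active
            grad += feats(x, y_best) - feats(x, y)
    return w - lr * grad
```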
\n\n\n
\n\n\n
\n \n\n \n \n \n \n \n \n Learning Efficiently with Approximate Inference via Dual Losses.\n \n \n \n \n\n\n \n Meshi, O.; Sontag, D.; Jaakkola, T.; and Globerson, A.\n\n\n \n\n\n\n In Furnkranz, J.; and Joachims, T., editor(s), Proceedings of the 27th International Conference on Machine Learning (ICML), pages 783-790, 2010. Omnipress\n \n\n\n\n
\n\n\n\n \n \n \"Learning paper\n  \n \n\n \n\n \n link\n  \n \n\n bibtex\n \n\n \n  \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n  \n \n \n \n \n \n \n\n\n\n
\n
@inproceedings{MesSonJaaGlo_icml10,\n title = {Learning Efficiently with Approximate Inference via Dual Losses},\n author = {Ofer Meshi and David Sontag and Tommi Jaakkola and Amir Globerson},\n booktitle = {Proceedings of the 27th International Conference on Machine Learning (ICML)},\n pages = {783-790},\n editor = {Johannes Furnkranz and Thorsten Joachims},\n publisher = {Omnipress},\n year = {2010},\n keywords = {Machine learning, Structured prediction},\n url_Paper = {http://people.csail.mit.edu/dsontag/papers/MesSonJaaGlo_icml10.pdf},\n abstract = {Many structured prediction tasks involve complex models where inference is computationally intractable, but where it can be well approximated using a linear programming relaxation. Previous approaches for learning for structured prediction (e.g., cutting-plane, subgradient methods, perceptron) repeatedly make predictions for some of the data points. These approaches are computationally demanding because each prediction involves solving a linear program to optimality. We present a scalable algorithm for learning for structured prediction. The main idea is to instead solve the dual of the structured prediction loss. We formulate the learning task as a convex minimization over both the weights and the dual variables corresponding to each data point. As a result, we can begin to optimize the weights even before completely solving any of the individual prediction problems. We show how the dual variables can be efficiently optimized using coordinate descent. Our algorithm is competitive with state-of-the-art methods such as stochastic subgradient and cutting-plane.}\n}\n\n
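The computational point in the abstract, that working in the dual lets the weights be updated before any per-example prediction problem is solved to optimality, is easiest to see in an unstructured special case. The sketch below is dual coordinate descent for a plain binary SVM, offered as a simplified analogue rather than the paper's structured algorithm: the weight vector w stays in sync with the dual variables and is usable at any point during optimization.

import numpy as np

def svm_dual_cd(X, y, C=1.0, epochs=10):
    # Dual of the L1-loss linear SVM: maximize sum_i alpha_i
    # - 0.5 * ||sum_i alpha_i y_i x_i||^2 subject to 0 <= alpha_i <= C.
    n, d = X.shape
    alpha = np.zeros(n)
    w = np.zeros(d)
    sqnorm = (X ** 2).sum(axis=1)
    for _ in range(epochs):
        for i in np.random.permutation(n):
            g = y[i] * (w @ X[i]) - 1.0            # gradient in coordinate i
            new = np.clip(alpha[i] - g / sqnorm[i], 0.0, C)
            w += (new - alpha[i]) * y[i] * X[i]    # keep w in sync with alpha
            alpha[i] = new
    return w
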
Topic models (3)

Learning Topic Models - Provably and Efficiently. Arora, S.; Ge, R.; Halpern, Y.; Mimno, D.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M. Communications of the ACM, 61(4): 85–93, 2018.

@article{AroraEtAl_CACM18,
  author = {Sanjeev Arora and Rong Ge and Yoni Halpern and David Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},
  title = {Learning Topic Models - Provably and Efficiently},
  journal = {Communications of the {ACM}},
  year = {2018},
  volume = {61},
  number = {4},
  pages = {85--93},
  keywords = {Machine learning, Unsupervised learning, Topic models},
  url_Paper = {https://cacm.acm.org/magazines/2018/4/226373-learning-topic-models-provably-and-efficiently/fulltext}
}

A Practical Algorithm for Topic Modeling with Provable Guarantees. Arora, S.; Ge, R.; Halpern, Y.; Mimno, D. M.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M. In Proceedings of the International Conference on Machine Learning (ICML), volume 28 (2), pages 280–288, 2013. JMLR: W&CP.

@inproceedings{AroraEtAl_icml13,
  author = {Sanjeev Arora and Rong Ge and Yoni Halpern and David M. Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},
  title = {A Practical Algorithm for Topic Modeling with Provable Guarantees},
  booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
  year = {2013},
  publisher = {JMLR: W\&CP},
  volume = {28 (2)},
  pages = {280--288},
  keywords = {Machine learning, Unsupervised learning, Topic models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/AroraEtAl_icml13.pdf},
  abstract = {Topic models provide a useful method for dimensionality reduction and exploratory data analysis in large text corpora. Most approaches to topic model learning have been based on a maximum likelihood objective. Efficient algorithms exist that attempt to approximate this objective, but they have no provable guarantees. Recently, algorithms have been introduced that provide provable bounds, but these algorithms are not practical because they are inefficient and not robust to violations of model assumptions. In this paper we present an algorithm for learning topic models that is both provable and practical. The algorithm produces results comparable to the best MCMC implementations while running orders of magnitude faster.}
}

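The algorithm behind both topic-modeling papers above can be sketched compactly. Given a word-word co-occurrence matrix Q (the empirical joint distribution over word pairs), anchor words are found greedily, and every other word's row-normalized profile is then expressed as a convex combination of the anchor profiles. The sketch below simplifies in two flagged ways: anchor selection uses span-based Gram-Schmidt deflation rather than the paper's affine-hull FastAnchorWords, and the simplex-constrained regression of RecoverL2 is replaced by non-negative least squares plus renormalization.

import numpy as np
from scipy.optimize import nnls

def find_anchors(Qbar, K):
    # Greedy farthest-point selection: repeatedly take the row farthest
    # from the span of the rows chosen so far, then deflate.
    R = Qbar.astype(float).copy()
    anchors = []
    for _ in range(K):
        i = int(np.argmax(np.linalg.norm(R, axis=1)))
        anchors.append(i)
        u = R[i] / np.linalg.norm(R[i])
        R -= np.outer(R @ u, u)
    return anchors

def recover_topics(Q, K):
    pw = Q.sum(axis=1)              # marginal word probabilities
    Qbar = Q / pw[:, None]          # row-normalized co-occurrence profiles
    anchors = find_anchors(Qbar, K)
    S = Qbar[anchors]               # K anchor profiles
    C = np.zeros((Q.shape[0], K))
    for i in range(Q.shape[0]):
        c, _ = nnls(S.T, Qbar[i])   # approximate simplex regression
        C[i] = c / max(c.sum(), 1e-12)
    A = C * pw[:, None]             # proportional to p(word, topic)
    return A / A.sum(axis=0)        # columns are p(word | topic)
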
Complexity of Inference in Latent Dirichlet Allocation. Sontag, D.; and Roy, D. In Shawe-Taylor, J.; Zemel, R.; Bartlett, P.; Pereira, F.; and Weinberger, K., editors, Advances in Neural Information Processing Systems 24, pages 1008–1016. MIT Press, 2011.

@incollection{SontagRoy_nips11,
  author = {David Sontag and Dan Roy},
  title = {Complexity of Inference in Latent Dirichlet Allocation},
  booktitle = {Advances in Neural Information Processing Systems 24},
  editor = {J. Shawe-Taylor and R.S. Zemel and P. Bartlett and F.C.N. Pereira and K.Q. Weinberger},
  pages = {1008--1016},
  publisher = {MIT Press},
  year = {2011},
  keywords = {Machine learning, Approximate inference in graphical models, Topic models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/SontagRoy_nips11.pdf},
  abstract = {We consider the computational complexity of probabilistic inference in Latent Dirichlet Allocation (LDA). First, we study the problem of finding the maximum a posteriori (MAP) assignment of topics to words, where the document’s topic distribution is integrated out. We show that, when the effective number of topics per document is small, exact inference takes polynomial time. In contrast, we show that, when a document has a large number of topics, finding the MAP assignment of topics to words in LDA is NP-hard. Next, we consider the problem of finding the MAP topic distribution for a document, where the topic-word assignments are integrated out. We show that this problem is also NP-hard. Finally, we briefly discuss the problem of sampling from the posterior, showing that this is NP-hard in one restricted setting, but leaving open the general question.}
}

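The first problem the abstract studies, MAP assignment of topics to words with the document's topic distribution integrated out, has a closed-form objective (the standard Dirichlet-multinomial collapsing), so the hardness results are easy to appreciate by writing down the brute-force search they rule out in general:

import itertools
import numpy as np
from scipy.special import gammaln

def log_joint(z, words, beta, alpha):
    # log p(w, z) for one document with theta ~ Dirichlet(alpha) integrated
    # out: Gamma(K a)/Gamma(K a + N) * prod_k Gamma(a + n_k)/Gamma(a)
    #      * prod_j beta[z_j, w_j].
    K, N = beta.shape[0], len(words)
    counts = np.bincount(z, minlength=K)
    lp = gammaln(K * alpha) - gammaln(K * alpha + N)
    lp += np.sum(gammaln(alpha + counts) - gammaln(alpha))
    lp += sum(np.log(beta[z[j], words[j]]) for j in range(N))
    return lp

def map_assignment(words, beta, alpha=0.1):
    # Exhaustive search over all K^N assignments: exponential in document
    # length, which is exactly the blow-up the paper's results concern.
    K = beta.shape[0]
    return max(itertools.product(range(K), repeat=len(words)),
               key=lambda z: log_joint(np.array(z), words, beta, alpha))
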
Unsupervised learning (13)

Max-margin learning with the Bayes Factor. Krishnan, R. G.; Khandelwal, A.; Ranganath, R.; and Sontag, D. In Proceedings of the Conference on Uncertainty in Artificial Intelligence (UAI), 2018.

@inproceedings{KrishnanEtAl_uai18,
  author = {Rahul G. Krishnan and Arjun Khandelwal and Rajesh Ranganath and David Sontag},
  title = {Max-margin learning with the Bayes Factor},
  booktitle = {Proceedings of the Conference on Uncertainty in Artificial Intelligence ({UAI})},
  year = {2018},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/KrishnanEtAl_UAI18.pdf},
  abstract = {We propose a new way to answer probabilistic queries that span multiple datapoints. We formalize reasoning about the similarity of different datapoints as the evaluation of the Bayes Factor within a hierarchical deep generative model that enforces a separation between the latent variables used for representation learning and those used for reasoning. Under this model, we derive an intuitive estimator for the Bayes Factor that represents similarity as the amount of overlap in representation space shared by different points. The estimator we derive relies on a query-conditional latent reasoning network that parameterizes a distribution over the latent space of the deep generative model. The latent reasoning network is trained to amortize the posterior-predictive distribution under a hierarchical model using supervised data and a max-margin learning algorithm. We explore how the model may be used to focus the data variations captured in the latent space of the deep generative model and how this may be used to build new algorithms for few-shot learning.}
}

Semi-Amortized Variational Autoencoders. Kim, Y.; Wiseman, S.; Miller, A. C.; Sontag, D.; and Rush, A. M. In Proceedings of the 35th International Conference on Machine Learning (ICML), 2018.

@inproceedings{KimEtAl_icml18,
  author = {Yoon Kim and Sam Wiseman and Andrew C. Miller and David Sontag and Alexander M. Rush},
  title = {Semi-Amortized Variational Autoencoders},
  booktitle = {Proceedings of the 35th International Conference on Machine Learning ({ICML})},
  year = {2018},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1802.02550.pdf},
  abstract = {Amortized variational inference (AVI) replaces instance-specific local inference with a global inference network. While AVI has enabled efficient training of deep generative models such as variational autoencoders (VAE), recent empirical work suggests that inference networks can produce suboptimal variational parameters. We propose a hybrid approach, using AVI to initialize the variational parameters and running stochastic variational inference (SVI) to refine them. Crucially, the local SVI procedure is itself differentiable, so the inference network and generative model can be trained end-to-end with gradient-based optimization. This semi-amortized approach enables the use of rich generative models without experiencing the posterior-collapse phenomenon common in training VAEs for problems like text generation. Experiments show this approach outperforms strong autoregressive and variational baselines on standard text and image datasets.}
}

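The hybrid scheme in the abstract is mechanically simple: the inference network proposes variational parameters, and a few SVI gradient steps refine them per instance. A minimal PyTorch sketch of that refinement follows; for brevity it stops gradients at the refinement step, whereas the paper additionally backpropagates through the SVI updates to train the encoder end-to-end.

import torch

def elbo(decoder, x, mu, logvar):
    # Single-sample ELBO with a standard normal prior and a Bernoulli
    # likelihood over binary observations x.
    z = mu + torch.randn_like(mu) * torch.exp(0.5 * logvar)
    rec = -torch.nn.functional.binary_cross_entropy_with_logits(
        decoder(z), x, reduction="sum")
    kl = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    return rec - kl

def refine(decoder, x, mu0, logvar0, steps=10, lr=1e-2):
    # AVI initialization (mu0, logvar0 come from the encoder), followed by
    # a few SVI steps on the local variational parameters.
    mu = mu0.detach().clone().requires_grad_(True)
    logvar = logvar0.detach().clone().requires_grad_(True)
    opt = torch.optim.SGD([mu, logvar], lr=lr)
    for _ in range(steps):
        opt.zero_grad()
        (-elbo(decoder, x, mu, logvar)).backward()
        opt.step()
    return mu.detach(), logvar.detach()
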
Learning Topic Models - Provably and Efficiently. Arora, S.; Ge, R.; Halpern, Y.; Mimno, D.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M. Communications of the ACM, 61(4): 85–93, 2018.

@article{AroraEtAl_CACM18,
  author = {Sanjeev Arora and Rong Ge and Yoni Halpern and David Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},
  title = {Learning Topic Models - Provably and Efficiently},
  journal = {Communications of the {ACM}},
  year = {2018},
  volume = {61},
  number = {4},
  pages = {85--93},
  keywords = {Machine learning, Unsupervised learning, Topic models},
  url_Paper = {https://cacm.acm.org/magazines/2018/4/226373-learning-topic-models-provably-and-efficiently/fulltext}
}

Structured Inference Networks for Nonlinear State Space Models. Krishnan, R. G.; Shalit, U.; and Sontag, D. In Proceedings of the Thirty-First AAAI Conference on Artificial Intelligence, pages 2101–2109, 2017.

@inproceedings{KrishnanEtAl_aaai17,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Structured Inference Networks for Nonlinear State Space Models},
  booktitle = {Proceedings of the Thirty-First {AAAI} Conference on Artificial Intelligence},
  pages = {2101--2109},
  year = {2017},
  keywords = {Machine learning, Unsupervised learning, Deep learning, Health care, Approximate inference in graphical models},
  url_Paper = {https://arxiv.org/pdf/1609.09869.pdf},
  abstract = {Gaussian state space models have been used for decades as generative models of sequential data. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption. We introduce a unified algorithm to efficiently learn a broad class of linear and non-linear state space models, including variants where the emission and transition distributions are modeled by deep neural networks. Our learning algorithm simultaneously learns a compiled inference network and the generative model, leveraging a structured variational approximation parameterized by recurrent neural networks to mimic the posterior distribution. We apply the learning algorithm to both synthetic and real-world datasets, demonstrating its scalability and versatility. We find that using the structured approximation to the posterior results in models with significantly higher held-out likelihood.}
}

Clinical Tagging with Joint Probabilistic Models. Halpern, Y.; Horng, S.; and Sontag, D. In Doshi-Velez, F.; Fackler, J.; Kale, D.; Wallace, B.; and Wiens, J., editors, Proceedings of the 1st Machine Learning for Healthcare Conference, volume 56 of Proceedings of Machine Learning Research, pages 209–225, 2016.

@inproceedings{HalpernEtAl_mlhc16,
  title = {Clinical Tagging with Joint Probabilistic Models},
  author = {Yoni Halpern and Steven Horng and David Sontag},
  booktitle = {Proceedings of the 1st Machine Learning for Healthcare Conference},
  pages = {209--225},
  year = {2016},
  editor = {Finale Doshi-Velez and Jim Fackler and David Kale and Byron Wallace and Jenna Wiens},
  volume = {56},
  series = {Proceedings of Machine Learning Research},
  keywords = {Health care, Unsupervised learning},
  url_Paper = {https://arxiv.org/pdf/1608.00686.pdf},
  abstract = {We describe a method for parameter estimation in bipartite probabilistic graphical models for joint prediction of clinical conditions from the electronic medical record. The method does not rely on the availability of gold-standard labels, but rather uses noisy labels, called anchors, for learning. We provide a likelihood-based objective and a moments-based initialization that are effective at learning the model parameters. The learned model is evaluated in a task of assigning a heldout clinical condition to patients based on retrospective analysis of the records, and outperforms baselines which do not account for the noisiness in the labels or do not model the conditions jointly.}
}

Electronic Medical Record Phenotyping using the Anchor & Learn Framework. Halpern, Y.; Horng, S.; Choi, Y.; and Sontag, D. Journal of the American Medical Informatics Association (JAMIA), 2016.

@article{HalpernEtAl_jamia16,
  author = {Yoni Halpern and Steven Horng and Youngduck Choi and David Sontag},
  title = {Electronic Medical Record Phenotyping using the Anchor \& Learn Framework},
  journal = {Journal of the American Medical Informatics Association ({JAMIA})},
  year = {2016},
  keywords = {Health care, Unsupervised learning},
  doi = {10.1093/jamia/ocw011},
  url_Paper = {http://jamia.oxfordjournals.org/content/early/2016/04/26/jamia.ocw011.full.pdf},
  abstract = {Electronic medical records (EMRs) hold a tremendous amount of information about patients that is relevant to determining the optimal approach to patient care. As medicine becomes increasingly precise, a patient’s electronic medical record phenotype will play an important role in triggering clinical decision support systems that can deliver personalized recommendations in real time. Learning with anchors presents a method of efficiently learning statistically driven phenotypes with minimal manual intervention. We developed a phenotype library that uses both structured and unstructured data from the EMR to represent patients for real-time clinical decision support. Eight of the phenotypes were evaluated using retrospective EMR data on emergency department patients using a set of prospectively gathered gold standard labels. We built the phenotype library with 42 publicly available phenotype definitions. Using information from triage time, the phenotype classifiers have an area under the ROC curve (AUC) of infection 0.89, cancer 0.88, immunosuppressed 0.85, septic shock 0.93, nursing home 0.87, anticoagulated 0.83, cardiac etiology 0.89, and pneumonia 0.90. Using information available at the time of disposition from the emergency department, the AUC values are infection 0.91, cancer 0.95, immunosuppressed 0.90, septic shock 0.97, nursing home 0.91, anticoagulated 0.94, cardiac etiology 0.92, and pneumonia 0.97. The resulting phenotypes are interpretable and fast to build, and perform comparably to statistically learned phenotypes developed with 5000 manually labeled patients. Learning with anchors is an attractive option for building a large public repository of phenotype definitions that can be used for a range of health IT applications, including real-time decision support.}
}

Deep Kalman Filters. Krishnan, R. G.; Shalit, U.; and Sontag, D. arXiv:1511.05121, 2015.

@misc{KriShaSon_arxiv15,
  author = {Rahul G. Krishnan and Uri Shalit and David Sontag},
  title = {Deep Kalman Filters},
  howpublished = {arXiv:1511.05121},
  year = {2015},
  keywords = {Machine learning, Unsupervised learning, Health care, Deep learning},
  url_Paper = {http://arxiv.org/pdf/1511.05121.pdf},
  abstract = {Kalman Filters are one of the most influential models of time-varying phenomena. They admit an intuitive probabilistic interpretation, have a simple functional form, and enjoy widespread adoption in a variety of disciplines. Motivated by recent variational methods for learning deep generative models, we introduce a unified algorithm to efficiently learn a broad spectrum of Kalman filters. Of particular interest is the use of temporal generative models for counterfactual inference. We investigate the efficacy of such models for counterfactual inference, and to that end we introduce the "Healing MNIST" dataset where long-term structure, noise and actions are applied to sequences of digits. We show the efficacy of our method for modeling this dataset. We further show how our model can be used for counterfactual inference for patients, based on electronic health record data of 8,000 patients over 4.5 years.}
}

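The generative skeleton shared by this paper and the AAAI-17 entry above is a Kalman-filter-style state space model whose transition and emission functions are neural networks. The module below is an illustrative PyTorch sketch (names and architecture are placeholders, not the authors' code); the prior over the first state is folded into the transition for brevity.

import torch
import torch.nn as nn

class DeepStateSpaceModel(nn.Module):
    def __init__(self, z_dim, x_dim, h_dim=64):
        super().__init__()
        # Transition: p(z_t | z_{t-1}) = N(mu(z_{t-1}), diag(sigma^2(z_{t-1}))).
        self.trans = nn.Sequential(nn.Linear(z_dim, h_dim), nn.Tanh(),
                                   nn.Linear(h_dim, 2 * z_dim))
        # Emission: p(x_t | z_t), here Gaussian with unit variance.
        self.emit = nn.Sequential(nn.Linear(z_dim, h_dim), nn.Tanh(),
                                  nn.Linear(h_dim, x_dim))
        self.z_dim = z_dim

    def sample(self, T):
        # Ancestral sampling of a length-T sequence.
        z, xs = torch.zeros(self.z_dim), []
        for _ in range(T):
            mu, logvar = self.trans(z).chunk(2)
            z = mu + torch.randn(self.z_dim) * torch.exp(0.5 * logvar)
            mean_x = self.emit(z)
            xs.append(mean_x + torch.randn_like(mean_x))
        return torch.stack(xs)
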
Anchored Discrete Factor Analysis. Halpern, Y.; Horng, S.; and Sontag, D. arXiv:1511.03299, 2015.

@misc{HalpernEtAl_arxiv15,
  author = {Yoni Halpern and Steven Horng and David Sontag},
  title = {Anchored Discrete Factor Analysis},
  howpublished = {arXiv:1511.03299},
  year = {2015},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://arxiv.org/pdf/1511.03299.pdf},
  abstract = {We present a semi-supervised learning algorithm for learning discrete factor analysis models with arbitrary structure on the latent variables. Our algorithm assumes that every latent variable has an "anchor", an observed variable with only that latent variable as its parent. Given such anchors, we show that it is possible to consistently recover moments of the latent variables and use these moments to learn complete models. We also introduce a new technique for improving the robustness of method-of-moment algorithms by optimizing over the marginal polytope or its relaxations. We evaluate our algorithm using two real-world tasks, tag prediction on questions from the Stack Overflow website and medical diagnosis in an emergency department.}
}

Using Anchors to Estimate Clinical State without Labeled Data. Halpern, Y.; Choi, Y.; Horng, S.; and Sontag, D. In Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium, pages 606–615, 2014.

@inproceedings{HalpernEtAl_amia14,
  author = {Yoni Halpern and Youngduck Choi and Steven Horng and David Sontag},
  title = {Using Anchors to Estimate Clinical State without Labeled Data},
  booktitle = {Proceedings of the American Medical Informatics Association (AMIA) Annual Symposium},
  pages = {606--615},
  year = {2014},
  keywords = {Health care, Machine learning, Unsupervised learning},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernEtAl_amia14.pdf},
  abstract = {We present a novel framework for learning to estimate and predict clinical state variables without labeled data. The resulting models can be used for electronic phenotyping, triggering clinical decision support, and cohort selection. The framework relies on key observations which we characterize and term "anchor variables". By specifying anchor variables, an expert encodes a certain amount of domain knowledge about the problem while the rest of learning proceeds in an unsupervised manner. The ability to build anchors upon standardized ontologies and the framework's ability to learn from unlabeled data promote generalizability across institutions. We additionally develop a user interface to enable experts to choose anchor variables in an informed manner. The framework is applied to electronic medical record-based phenotyping to enable real-time decision support in the emergency department. We validate the learned models using a prospectively gathered set of gold-standard responses from emergency physicians for nine clinically relevant variables.}
}

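The anchor framework in this and the surrounding entries treats a highly specific observation (the anchor, e.g., a particular ICD code or medication order) as a noisy positive-only label. A classifier is trained to predict the anchor from the remaining features and its output is then calibrated; the calibration below is the Elkan-Noto positive-unlabeled correction that this line of work builds on. Variable names are illustrative.

import numpy as np
from sklearn.linear_model import LogisticRegression

def anchor_phenotype(X, anchor):
    # X: feature matrix with the anchor itself censored from the features.
    # anchor: 0/1 vector marking patients in whom the anchor was observed.
    clf = LogisticRegression(max_iter=1000).fit(X, anchor)
    p_anchor = clf.predict_proba(X)[:, 1]
    # c estimates P(anchor observed | condition present), taken as the mean
    # predicted probability over the anchored (known-positive) patients.
    c = p_anchor[anchor == 1].mean()
    return np.clip(p_anchor / c, 0.0, 1.0)   # estimate of P(condition | x)
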
Unsupervised Learning of Disease Progression Models. Wang, X.; Sontag, D.; and Wang, F. In Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining (KDD '14), pages 85–94, New York, NY, USA, 2014. ACM.

@inproceedings{WanSonWan_kdd14,
  author = {Xiang Wang and David Sontag and Fei Wang},
  title = {Unsupervised Learning of Disease Progression Models},
  booktitle = {Proceedings of the 20th ACM SIGKDD International Conference on Knowledge Discovery and Data Mining},
  series = {KDD '14},
  isbn = {978-1-4503-2956-9},
  pages = {85--94},
  numpages = {10},
  publisher = {ACM},
  address = {New York, NY, USA},
  year = {2014},
  keywords = {Health care, Unsupervised learning},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/WanSonWan_kdd14.pdf},
  abstract = {Chronic diseases, such as Alzheimer's Disease, Diabetes, and Chronic Obstructive Pulmonary Disease, usually progress slowly over a long period of time, causing increasing burden to the patients, their families, and the healthcare system. A better understanding of their progression is instrumental in early diagnosis and personalized care. Modeling disease progression based on real-world evidence is a very challenging task due to the incompleteness and irregularity of the observations, as well as the heterogeneity of the patient conditions. In this paper, we propose a probabilistic disease progression model that addresses these challenges. As compared to existing disease progression models, the advantage of our model is three-fold: 1) it learns a continuous-time progression model from discrete-time observations with non-equal intervals; 2) it learns the full progression trajectory from a set of incomplete records that only cover short segments of the progression; 3) it learns a compact set of medical concepts as the bridge between the hidden progression process and the observed medical evidence, which are usually extremely sparse and noisy. We demonstrate the capabilities of our model by applying it to a real-world COPD patient cohort and deriving some interesting clinical insights.}
}

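The first advantage claimed in the abstract, learning a continuous-time model from visits with unequal spacing, rests on a standard property of continuous-time Markov chains: a single rate matrix yields transition probabilities for any inter-visit gap via the matrix exponential, so irregular sampling needs no special handling. A small illustration with a hypothetical three-stage rate matrix:

import numpy as np
from scipy.linalg import expm

# Off-diagonal entries are progression rates between disease stages;
# rows sum to zero, and the last stage is absorbing.
Q = np.array([[-0.10,  0.10,  0.00],
              [ 0.00, -0.05,  0.05],
              [ 0.00,  0.00,  0.00]])

def transition_matrix(Q, dt):
    # P(stage at t + dt | stage at t) for an arbitrary gap dt.
    return expm(Q * dt)

print(transition_matrix(Q, 0.5))   # six-month gap
print(transition_matrix(Q, 2.0))   # two-year gap, same model
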
Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests. Jernite, Y.; Halpern, Y.; and Sontag, D. In Advances in Neural Information Processing Systems 26, pages 2355–2363. MIT Press, 2013.

@incollection{JerHalSon_nips13,
  author = {Yacine Jernite and Yoni Halpern and David Sontag},
  title = {Discovering Hidden Variables in Noisy-Or Networks using Quartet Tests},
  booktitle = {Advances in Neural Information Processing Systems 26},
  pages = {2355--2363},
  publisher = {MIT Press},
  year = {2013},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/JerHalSon_nips13.pdf},
  abstract = {We give a polynomial-time algorithm for provably learning the structure and parameters of bipartite noisy-or Bayesian networks of binary variables where the top layer is completely hidden. Unsupervised learning of these models is a form of discrete factor analysis, enabling the discovery of hidden variables and their causal relationships with observed data. We obtain an efficient learning algorithm for a family of Bayesian networks that we call quartet-learnable. For each latent variable, the existence of a singly-coupled quartet allows us to uniquely identify and learn all parameters involving that latent variable. We give a proof of the polynomial sample complexity of our learning algorithm, and experimentally compare it to variational EM.}
}

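This paper and the next both concern bipartite noisy-or networks, whose conditional distribution is compact enough to state directly: with binary latent diseases d, a leak probability per finding, and activation probabilities p_ij, a finding stays off only if the leak and every active parent all fail to trigger it.

import numpy as np

def noisy_or_prob(d, P, leak):
    # P(x_j = 1 | d) for each finding j.
    # d: binary vector of latent diseases (length m)
    # P: m x n matrix, P[i, j] = prob. that disease i activates finding j
    # leak: length-n vector of leak probabilities
    fail = (1.0 - P) ** d[:, None]               # (1 - p_ij)^{d_i}
    return 1.0 - (1.0 - leak) * fail.prod(axis=0)
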
Unsupervised Learning of Noisy-Or Bayesian Networks. Halpern, Y.; and Sontag, D. In Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence (UAI-13), pages 272–281, Corvallis, Oregon, 2013. AUAI Press.

@inproceedings{HalpernSontag_uai13,
  author = {Yoni Halpern and David Sontag},
  title = {Unsupervised Learning of Noisy-Or Bayesian Networks},
  booktitle = {Proceedings of the Twenty-Ninth Conference on Uncertainty in Artificial Intelligence ({UAI}-13)},
  publisher = {AUAI Press},
  address = {Corvallis, Oregon},
  pages = {272--281},
  year = {2013},
  keywords = {Machine learning, Unsupervised learning, Health care},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/HalpernSontag_uai13.pdf},
  abstract = {This paper considers the problem of learning the parameters in Bayesian networks of discrete variables with known structure and hidden variables. Previous approaches in these settings typically use expectation maximization; when the network has high treewidth, the required expectations might be approximated using Monte Carlo or variational methods. We show how to avoid inference altogether during learning by giving a polynomial-time algorithm based on the method-of-moments, building upon recent work on learning discrete-valued mixture models. In particular, we show how to learn the parameters for a family of bipartite noisy-or Bayesian networks. In our experimental results, we demonstrate an application of our algorithm to learning QMR-DT, a large Bayesian network used for medical diagnosis. We show that it is possible to fully learn the parameters of QMR-DT even when only the findings are observed in the training data (ground truth diseases unknown).}
}

A Practical Algorithm for Topic Modeling with Provable Guarantees. Arora, S.; Ge, R.; Halpern, Y.; Mimno, D. M.; Moitra, A.; Sontag, D.; Wu, Y.; and Zhu, M. In Proceedings of the International Conference on Machine Learning (ICML), volume 28 (2), pages 280–288, 2013. JMLR: W&CP.

@inproceedings{AroraEtAl_icml13,
  author = {Sanjeev Arora and Rong Ge and Yoni Halpern and David M. Mimno and Ankur Moitra and David Sontag and Yichen Wu and Michael Zhu},
  title = {A Practical Algorithm for Topic Modeling with Provable Guarantees},
  booktitle = {Proceedings of the International Conference on Machine Learning (ICML)},
  year = {2013},
  publisher = {JMLR: W\&CP},
  volume = {28 (2)},
  pages = {280--288},
  keywords = {Machine learning, Unsupervised learning, Topic models},
  url_Paper = {http://people.csail.mit.edu/dsontag/papers/AroraEtAl_icml13.pdf},
  abstract = {Topic models provide a useful method for dimensionality reduction and exploratory data analysis in large text corpora. Most approaches to topic model learning have been based on a maximum likelihood objective. Efficient algorithms exist that attempt to approximate this objective, but they have no provable guarantees. Recently, algorithms have been introduced that provide provable bounds, but these algorithms are not practical because they are inefficient and not robust to violations of model assumptions. In this paper we present an algorithm for learning topic models that is both provable and practical. The algorithm produces results comparable to the best MCMC implementations while running orders of magnitude faster.}
}