\n

\n \n 2023\n \n \n (3)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Alignment of spatial genomics data using deep Gaussian processes.\n \n \n \n \n\n\n \n Jones, A.; Townes, F. W.; Li, D.; and Engelhardt, B. E.\n\n\n \n\n\n\n Nature Methods, 20(9): 1379–1387. September 2023.\n Number: 9 Publisher: Nature Publishing Group\n\n\n\n
\n\n\n\n \n \n $\"Alignment$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

% Jones, Townes, Li, Engelhardt (2023), Nature Methods 20(9): Gaussian Process Spatial Alignment (GPSA).
% The issue number lives in `number` and the publisher in `publisher`; no free-text note is needed.
@article{jones_alignment_2023,
  author = {Jones, Andrew and Townes, F. William and Li, Didong and Engelhardt, Barbara E.},
  title = {Alignment of spatial genomics data using deep {Gaussian} processes},
  journal = {Nature Methods},
  volume = {20},
  number = {9},
  pages = {1379--1387},
  month = sep,
  year = {2023},
  publisher = {Nature Publishing Group},
  issn = {1548-7105},
  doi = {10.1038/s41592-023-01972-2},
  url = {https://www.nature.com/articles/s41592-023-01972-2},
  urldate = {2023-11-07},
  language = {en},
  copyright = {2023 The Author(s)},
  keywords = {Image processing, Machine learning, Software, Statistical methods, Transcriptomics},
  abstract = {Spatially resolved genomic technologies have allowed us to study the physical organization of cells and tissues, and promise an understanding of local interactions between cells. However, it remains difficult to precisely align spatial observations across slices, samples, scales, individuals and technologies. Here, we propose a probabilistic model that aligns spatially-resolved samples onto a known or unknown common coordinate system (CCS) with respect to phenotypic readouts (for example, gene expression). Our method, Gaussian Process Spatial Alignment (GPSA), consists of a two-layer Gaussian process: the first layer maps observed samples’ spatial locations onto a CCS, and the second layer maps from the CCS to the observed readouts. Our approach enables complex downstream spatially aware analyses that are impossible or inaccurate with unaligned data, including an analysis of variance, creation of a dense three-dimensional (3D) atlas from sparse two-dimensional (2D) slices or association tests across data modalities.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Correcting for heterogeneity in real-time epidemiological indicators.\n \n \n \n \n\n\n \n Rumack, A.; Rosenfeld, R.; and Townes, F. W.\n\n\n \n\n\n\n September 2023.\n arXiv:2309.16546 [cs]\n\n\n\n
\n\n\n\n \n \n $\"Correcting$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n\n\n\n

\n

% Rumack, Rosenfeld, Townes (2023), arXiv preprint.
% The arXiv identifier and class are carried by eprint/archiveprefix/primaryclass.
@misc{rumack_correcting_2023,
  author = {Rumack, Aaron and Rosenfeld, Roni and Townes, F. William},
  title = {Correcting for heterogeneity in real-time epidemiological indicators},
  month = sep,
  year = {2023},
  publisher = {arXiv},
  eprint = {2309.16546},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  doi = {10.48550/arXiv.2309.16546},
  url = {http://arxiv.org/abs/2309.16546},
  urldate = {2023-11-07},
  keywords = {Computer Science - Machine Learning},
  abstract = {Auxiliary data sources have become increasingly important in epidemiological surveillance, as they are often available at a finer spatial and temporal resolution, larger coverage, and lower latency than traditional surveillance signals. We describe the problem of spatial and temporal heterogeneity in these signals derived from these data sources, where spatial and/or temporal biases are present. We present a method to use a ``guiding'' signal to correct for these biases and produce a more reliable signal that can be used for modeling and forecasting. The method assumes that the heterogeneity can be approximated by a low-rank matrix and that the temporal heterogeneity is smooth over time. We also present a hyperparameter selection algorithm to choose the parameters representing the matrix rank and degree of temporal smoothness of the corrections. In the absence of ground truth, we use maps and plots to argue that this method does indeed reduce heterogeneity. Reducing heterogeneity from auxiliary data sources greatly increases their utility in modeling and forecasting epidemics.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Cerebellar contributions to a brainwide network for flexible behavior in mice.\n \n \n \n \n\n\n \n Verpeut, J. L.; Bergeler, S.; Kislin, M.; Townes, F. W.; Klibaite, U.; Dhanerawala, Z. M.; Hoag, A.; Janarthanan, S.; Jung, C.; Lee, J.; Pisano, T. J.; Seagraves, K. M.; Shaevitz, J. W.; and Wang, S. S.\n\n\n \n\n\n\n Communications Biology, 6(1): 1–17. June 2023.\n Number: 1 Publisher: Nature Publishing Group\n\n\n\n
\n\n\n\n \n \n $\"Cerebellar$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\n\n\n

\n

% Verpeut et al. (2023), Communications Biology 6(1).
% Author names use the "Last, First" form throughout so that "Townes" is parsed as the surname.
@article{verpeut_cerebellar_2023,
  author = {Verpeut, Jessica L. and Bergeler, Silke and Kislin, Mikhail and Townes, F. William and Klibaite, Ugne and Dhanerawala, Zahra M. and Hoag, Austin and Janarthanan, Sanjeev and Jung, Caroline and Lee, Junuk and Pisano, Thomas J. and Seagraves, Kelly M. and Shaevitz, Joshua W. and Wang, Samuel S.-H.},
  title = {Cerebellar contributions to a brainwide network for flexible behavior in mice},
  journal = {Communications Biology},
  volume = {6},
  number = {1},
  pages = {1--17},
  month = jun,
  year = {2023},
  publisher = {Nature Publishing Group},
  issn = {2399-3642},
  doi = {10.1038/s42003-023-04920-0},
  url = {https://www.nature.com/articles/s42003-023-04920-0},
  urldate = {2023-06-06},
  language = {en},
  copyright = {2023 The Author(s)},
  keywords = {Decision, Neural circuits},
  abstract = {The cerebellum regulates nonmotor behavior, but the routes of influence are not well characterized. Here we report a necessary role for the posterior cerebellum in guiding a reversal learning task through a network of diencephalic and neocortical structures, and in flexibility of free behavior. After chemogenetic inhibition of lobule VI vermis or hemispheric crus I Purkinje cells, mice could learn a water Y-maze but were impaired in ability to reverse their initial choice. To map targets of perturbation, we imaged c-Fos activation in cleared whole brains using light-sheet microscopy. Reversal learning activated diencephalic and associative neocortical regions. Distinctive subsets of structures were altered by perturbation of lobule VI (including thalamus and habenula) and crus I (including hypothalamus and prelimbic/orbital cortex), and both perturbations influenced anterior cingulate and infralimbic cortex. To identify functional networks, we used correlated variation in c-Fos activation within each group. Lobule VI inactivation weakened within-thalamus correlations, while crus I inactivation divided neocortical activity into sensorimotor and associative subnetworks. In both groups, high-throughput automated analysis of whole-body movement revealed deficiencies in across-day behavioral habituation to an open-field environment. Taken together, these experiments reveal brainwide systems for cerebellar influence that affect multiple flexible responses.},
}

\n

\n\n\n\n\n\n

\n

\n \n 2022\n \n \n (4)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Nonnegative spatial factorization applied to spatial genomics.\n \n \n \n \n\n\n \n Townes, F. W.; and Engelhardt, B. E.\n\n\n \n\n\n\n Nature Methods, 1–10. December 2022.\n Publisher: Nature Publishing Group\n\n\n\n
\n\n\n\n \n \n $\"Nonnegative$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n \n \n 1 download\n \n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

% Townes, Engelhardt (2022), Nature Methods: nonnegative spatial factorization (NSF).
% No volume or issue is recorded for this entry; the publisher is held in `publisher`.
@article{townes_nonnegative_2022,
  author = {Townes, F. William and Engelhardt, Barbara E.},
  title = {Nonnegative spatial factorization applied to spatial genomics},
  journal = {Nature Methods},
  pages = {1--10},
  month = dec,
  year = {2022},
  publisher = {Nature Publishing Group},
  issn = {1548-7105},
  doi = {10.1038/s41592-022-01687-w},
  url = {https://www.nature.com/articles/s41592-022-01687-w},
  urldate = {2022-12-31},
  language = {en},
  copyright = {2022 The Author(s)},
  keywords = {Gene expression analysis, Machine learning, Software, Statistical methods, Transcriptomics},
  abstract = {Nonnegative matrix factorization (NMF) is widely used to analyze high-dimensional count data because, in contrast to real-valued alternatives such as factor analysis, it produces an interpretable parts-based representation. However, in applications such as spatial transcriptomics, NMF fails to incorporate known structure between observations. Here, we present nonnegative spatial factorization (NSF), a spatially-aware probabilistic dimension reduction model based on transformed Gaussian processes that naturally encourages sparsity and scales to tens of thousands of observations. NSF recovers ground truth factors more accurately than real-valued alternatives such as MEFISTO in simulations, and has lower out-of-sample prediction error than probabilistic NMF on three spatial transcriptomics datasets from mouse brain and liver. Since not all patterns of gene expression have spatial correlations, we also propose a hybrid extension of NSF that combines spatial and nonspatial components, enabling quantification of spatial importance for both observations and features. A TensorFlow implementation of NSF is available from https://github.com/willtownes/nsf-paper.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Expression QTLs in single-cell sequencing data.\n \n \n \n \n\n\n \n Gewirtz, A. D.; Townes, F. W.; and Engelhardt, B. E.\n\n\n \n\n\n\n August 2022.\n Pages: 2022.08.14.503915 Section: New Results\n\n\n\n
\n\n\n\n \n \n $\"Expression$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

% Gewirtz, Townes, Engelhardt (2022), bioRxiv preprint: single-cell TBLDA (scTBLDA).
% The first author's initials are written separately ("D. H.") so both survive name abbreviation.
@misc{gewirtz_expression_2022,
  author = {Gewirtz, Ariel D. H. and Townes, F. William and Engelhardt, Barbara E.},
  title = {Expression {QTLs} in single-cell sequencing data},
  month = aug,
  year = {2022},
  publisher = {bioRxiv},
  doi = {10.1101/2022.08.14.503915},
  url = {https://www.biorxiv.org/content/10.1101/2022.08.14.503915v1},
  urldate = {2022-08-30},
  language = {en},
  copyright = {© 2022, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NoDerivs 4.0 International), CC BY-ND 4.0, as described at http://creativecommons.org/licenses/by-nd/4.0/},
  note = {Pages: 2022.08.14.503915 Section: New Results},
  abstract = {Single nucleotide polymorphisms (SNPs) are important drivers of gene expression variation and downstream phenotypes including disease risk. Single-cell RNA-sequencing (scRNA-seq) allows an unprecedented exploration of cell-type specific associations between gene expression levels and genotypes, but current methods rely on pseudobulk approaches that use composite expression values across cells and often use summary statistics within cell types, ignoring information across cell types and assuming cell type labels are accurate. Here, we extend our method, telescoping bimodal latent Dirichlet allocation (TBLDA), that identifies covarying genotypes and gene expression values when the matching from samples to cells is not one-to-one in order to allow cell-type label agnostic discovery of eQTLs in noncomposite scRNA-seq data. In particular, we add GPU-compatibility, sparse priors, and amortization to enable fast inference on large-scale scRNA-seq data. We apply single-cell TBLDA (scTBLDA) to 400K cells from 119 individuals with systemic lupus erythematosus and examine properties of features from each modality across the estimated latent factors. We use linked genes and SNPs to identify 205 cis-eQTLS, 66 trans-eQTLs, and 53 cell type proportion QTLs, which we then compare against prior studies of immune-cell eQTLs. Our results demonstrate the ability of scTBLDA to identify genes involved in cell-type specific regulatory processes associated with SNPs in single-cell data.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Telescoping bimodal latent Dirichlet allocation to identify expression QTLs across tissues.\n \n \n \n \n\n\n \n Gewirtz, A. D.; Townes, F. W.; and Engelhardt, B. E.\n\n\n \n\n\n\n Life Science Alliance, 5(12). December 2022.\n Publisher: Life Science Alliance Section: Research Articles\n\n\n\n
\n\n\n\n \n \n $\"Telescoping$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

% Gewirtz, Townes, Engelhardt (2022), Life Science Alliance 5(12): TBLDA.
% `pages` holds the article number that appears in the URL (e202101297); the journal has no page range.
@article{gewirtz_telescoping_2022,
  author = {Gewirtz, Ariel D. H. and Townes, F. William and Engelhardt, Barbara E.},
  title = {Telescoping bimodal latent {Dirichlet} allocation to identify expression {QTLs} across tissues},
  journal = {Life Science Alliance},
  volume = {5},
  number = {12},
  pages = {e202101297},
  month = dec,
  year = {2022},
  publisher = {Life Science Alliance},
  issn = {2575-1077},
  doi = {10.26508/lsa.202101297},
  pmid = {35977827},
  url = {https://www.life-science-alliance.org/content/5/12/e202101297},
  urldate = {2022-08-18},
  language = {en},
  copyright = {© 2022 Gewirtz et al. https://creativecommons.org/licenses/by/4.0/ This article is available under a Creative Commons License (Attribution 4.0 International, as described at https://creativecommons.org/licenses/by/4.0/).},
  note = {Section: Research Articles},
  abstract = {Expression quantitative trait loci (eQTLs), or single-nucleotide polymorphisms that affect average gene expression levels, provide important insights into context-specific gene regulation. Classic eQTL analyses use one-to-one association tests, which test gene–variant pairs individually and ignore correlations induced by gene regulatory networks and linkage disequilibrium. Probabilistic topic models, such as latent Dirichlet allocation, estimate latent topics for a collection of count observations. Prior multimodal frameworks that bridge genotype and expression data assume matched sample numbers between modalities. However, many data sets have a nested structure where one individual has several associated gene expression samples and a single germline genotype vector. Here, we build a telescoping bimodal latent Dirichlet allocation (TBLDA) framework to learn shared topics across gene expression and genotype data that allows multiple RNA sequencing samples to correspond to a single individual’s genotype. By using raw count data, our model avoids possible adulteration via normalization procedures. Ancestral structure is captured in a genotype-specific latent space, effectively removing it from shared components. Using GTEx v8 expression data across 10 tissues and genotype data, we show that the estimated topics capture meaningful and robust biological signal in both modalities and identify associations within and across tissue types. We identify 4,645 cis-eQTLs and 995 trans-eQTLs by conducting eQTL mapping between the most informative features in each topic. Our TBLDA model is able to identify associations using raw sequencing count data when the samples in two separate data modalities are matched one-to-many, as is often the case in biological data. Our code is freely available at https://github.com/gewirtz/TBLDA.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Contrastive latent variable modeling with application to case-control sequencing experiments.\n \n \n \n \n\n\n \n Jones, A.; Townes, F. W.; Li, D.; and Engelhardt, B. E.\n\n\n \n\n\n\n The Annals of Applied Statistics, 16(3): 1268–1291. September 2022.\n Publisher: Institute of Mathematical Statistics\n\n\n\n
\n\n\n\n \n \n $\"Contrastive$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

% Jones, Townes, Li, Engelhardt (2022), The Annals of Applied Statistics 16(3).
% `issn` lists two ISSNs, exactly as exported; the publisher is held in `publisher`.
@article{jones_contrastive_2022,
  author = {Jones, Andrew and Townes, F. William and Li, Didong and Engelhardt, Barbara E.},
  title = {Contrastive latent variable modeling with application to case-control sequencing experiments},
  journal = {The Annals of Applied Statistics},
  volume = {16},
  number = {3},
  pages = {1268--1291},
  month = sep,
  year = {2022},
  publisher = {Institute of Mathematical Statistics},
  issn = {1932-6157, 1941-7330},
  doi = {10.1214/21-AOAS1534},
  url = {https://projecteuclid.org/journals/annals-of-applied-statistics/volume-16/issue-3/Contrastive-latent-variable-modeling-with-application-to-case-control-sequencing/10.1214/21-AOAS1534.full},
  urldate = {2022-07-22},
  keywords = {RNA sequencing, case-control data, contrastive models, differential expression, latent variable models},
  abstract = {High-throughput RNA-sequencing (RNA-seq) technologies are powerful tools for understanding cellular state. Often, it is of interest to quantify and to summarize changes in cell state that occur between experimental or biological conditions. Differential expression is typically assessed using univariate tests to measure genewise shifts in expression. However, these methods largely ignore changes in transcriptional correlation. Furthermore, there is a need to identify the low-dimensional structure of the gene expression shift to identify collections of genes that change between conditions. Here, we propose contrastive latent variable models designed for count data to create a richer portrait of differential expression in sequencing data. These models disentangle the sources of transcriptional variation in different conditions in the context of an explicit model of variation at baseline. Moreover, we develop a model-based hypothesis testing framework that can test for global and gene subset-specific changes in expression. We evaluate our model through extensive simulations and analyses with count-based gene expression data from perturbation and observational sequencing experiments. We find that our methods effectively summarize and quantify complex transcriptional changes in case-control experimental sequencing data.},
}

\n

\n\n\n\n\n\n

\n

\n \n 2020\n \n \n (4)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Identifying longevity associated genes by integrating gene expression and curated annotations.\n \n \n \n \n\n\n \n Townes, F. W.; Carr, K.; and Miller, J. W.\n\n\n \n\n\n\n PLOS Computational Biology, 16(11): e1008429. November 2020.\n Publisher: Public Library of Science\n\n\n\n
\n\n\n\n \n \n $\"Identifying$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

% Townes, Carr, Miller (2020), PLOS Computational Biology 16(11).
% `pages` holds the article number (e1008429); the publisher is held in `publisher`.
@article{townes_identifying_2020,
  author = {Townes, F. William and Carr, Kareem and Miller, Jeffrey W.},
  title = {Identifying longevity associated genes by integrating gene expression and curated annotations},
  journal = {PLOS Computational Biology},
  volume = {16},
  number = {11},
  pages = {e1008429},
  month = nov,
  year = {2020},
  publisher = {Public Library of Science},
  issn = {1553-7358},
  doi = {10.1371/journal.pcbi.1008429},
  url = {https://journals.plos.org/ploscompbiol/article?id=10.1371/journal.pcbi.1008429},
  urldate = {2021-09-28},
  language = {en},
  keywords = {DNA-binding proteins, Gene expression, Gene ontologies, Gene prediction, Machine learning algorithms, Saccharomyces cerevisiae, Support vector machines, Yeast},
  abstract = {Aging is a complex process with poorly understood genetic mechanisms. Recent studies have sought to classify genes as pro-longevity or anti-longevity using a variety of machine learning algorithms. However, it is not clear which types of features are best for optimizing classification performance and which algorithms are best suited to this task. Further, performance assessments based on held-out test data are lacking. We systematically compare five popular classification algorithms using gene ontology and gene expression datasets as features to predict the pro-longevity versus anti-longevity status of genes for two model organisms (C. elegans and S. cerevisiae) using the GenAge database as ground truth. We find that elastic net penalized logistic regression performs particularly well at this task. Using elastic net, we make novel predictions of pro- and anti-longevity genes that are not currently in the GenAge database.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n A unifying framework for rare variant association testing in family-based designs, including higher criticism approaches, SKATs, and burden tests.\n \n \n \n \n\n\n \n Hecker, J.; Townes, F. W.; Kachroo, P.; Laurie, C.; Lasky-Su, J.; Ziniti, J.; Cho, M. H.; Weiss, S. T.; Laird, N. M.; and Lange, C.\n\n\n \n\n\n\n Bioinformatics, 36(22-23): 5432–5438. December 2020.\n \n\n\n\n
\n\n\n\n \n \n $\"A$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

% Hecker et al. (2020), Bioinformatics 36(22-23): FBAT-based rare variant association testing.
% Middle initials are written with periods, matching the other entries in this listing.
% Underscores in the abstract's URLs are escaped as \_ because the abstract is an ordinary text field.
@article{hecker_unifying_2020,
  author = {Hecker, Julian and Townes, F. William and Kachroo, Priyadarshini and Laurie, Cecelia and Lasky-Su, Jessica and Ziniti, John and Cho, Michael H. and Weiss, Scott T. and Laird, Nan M. and Lange, Christoph},
  title = {A unifying framework for rare variant association testing in family-based designs, including higher criticism approaches, {SKATs}, and burden tests},
  journal = {Bioinformatics},
  volume = {36},
  number = {22-23},
  pages = {5432--5438},
  month = dec,
  year = {2020},
  issn = {1367-4803},
  doi = {10.1093/bioinformatics/btaa1055},
  url = {https://doi.org/10.1093/bioinformatics/btaa1055},
  urldate = {2021-06-28},
  abstract = {Analysis of rare variants in family-based studies remains a challenge. Transmission-based approaches provide robustness against population stratification, but the evaluation of the significance of test statistics based on asymptotic theory can be imprecise. Also, power will depend heavily on the choice of the test statistic and on the underlying genetic architecture of the locus, which will be generally unknown. In our proposed framework, we utilize the FBAT haplotype algorithm to obtain the conditional offspring genotype distribution under the null hypothesis given the sufficient statistic. Based on this conditional offspring genotype distribution, the significance of virtually any association test statistic can be evaluated based on simulations or exact computations, without the need for asymptotic approximations. Besides standard linear burden-type statistics, this enables our approach to also evaluate other test statistics such as variance components statistics, higher criticism approaches, and maximum-single-variant-statistics, where asymptotic theory might be involved or does not provide accurate approximations for rare variant data. Based on these P-values, combined test statistics such as the aggregated Cauchy association test (ACAT) can also be utilized. In simulation studies, we show that our framework outperforms existing approaches for family-based studies in several scenarios. We also applied our methodology to a TOPMed whole-genome sequencing dataset with 897 asthmatic trios from Costa Rica. FBAT software is available at https://sites.google.com/view/fbatwebpage. Simulation code is available at https://github.com/julianhecker/FBAT\_rare\_variant\_test\_simulations. Whole-genome sequencing data for ‘NHLBI TOPMed: The Genetic Epidemiology of Asthma in Costa Rica’ is available at https://www.ncbi.nlm.nih.gov/projects/gap/cgi-bin/study.cgi?study\_id=phs000988.v4.p1. Supplementary data are available at Bioinformatics online.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Quantile normalization of single-cell RNA-seq read counts without unique molecular identifiers.\n \n \n \n \n\n\n \n Townes, F. W.; and Irizarry, R. A.\n\n\n \n\n\n\n Genome Biology, 21(1): 160. July 2020.\n \n\n\n\n
\n\n\n\n \n \n $\"Quantile$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

% Townes, Irizarry (2020), Genome Biology 21(1): quasi-UMI quantile normalization for scRNA-seq read counts.
% `pages` holds the article number (160).
@article{townes_quantile_2020,
  author = {Townes, F. William and Irizarry, Rafael A.},
  title = {Quantile normalization of single-cell {RNA}-seq read counts without unique molecular identifiers},
  journal = {Genome Biology},
  volume = {21},
  number = {1},
  pages = {160},
  month = jul,
  year = {2020},
  issn = {1474-760X},
  doi = {10.1186/s13059-020-02078-0},
  url = {https://doi.org/10.1186/s13059-020-02078-0},
  urldate = {2020-07-03},
  abstract = {Single-cell RNA-seq (scRNA-seq) profiles gene expression of individual cells. Unique molecular identifiers (UMIs) remove duplicates in read counts resulting from polymerase chain reaction, a major source of noise. For scRNA-seq data lacking UMIs, we propose quasi-UMIs: quantile normalization of read counts to a compound Poisson distribution empirically derived from UMI datasets. When applied to ground-truth datasets having both reads and UMIs, quasi-UMI normalization has higher accuracy than competing methods. Using quasi-UMIs enables methods designed specifically for UMI data to be applied to non-UMI scRNA-seq datasets.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Review of Probability Distributions for Modeling Count Data.\n \n \n \n \n\n\n \n Townes, F. W.\n\n\n \n\n\n\n arXiv:2001.04343 [stat]. January 2020.\n arXiv: 2001.04343\n\n\n\n
\n\n\n\n \n \n $\"Review$ Paper\n \n \n\n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\n\n\n

\n

% Townes (2020), arXiv preprint reviewing count-data distributions.
% Typed as misc because there is no journal; the arXiv identifier is carried by the eprint fields.
% The title is stored in Title Case without per-word braces so the bibliography style controls casing.
@misc{townes_review_2020,
  author = {Townes, F. William},
  title = {Review of Probability Distributions for Modeling Count Data},
  month = jan,
  year = {2020},
  eprint = {2001.04343},
  archiveprefix = {arXiv},
  primaryclass = {stat},
  url = {http://arxiv.org/abs/2001.04343},
  urldate = {2020-01-19},
  copyright = {All rights reserved},
  keywords = {Statistics - Machine Learning, Statistics - Methodology},
  abstract = {Count data take on non-negative integer values and are challenging to properly analyze using standard linear-Gaussian methods such as linear regression and principal components analysis. Generalized linear models enable direct modeling of counts in a regression context using distributions such as the Poisson and negative binomial. When counts contain only relative information, multinomial or Dirichlet-multinomial models can be more appropriate. We review some of the fundamental connections between multinomial and count models from probability theory, providing detailed proofs. These relationships are useful for methods development in applications such as topic modeling of text data and genomics.},
}

\n

\n\n\n\n\n\n

\n

\n \n 2019\n \n \n (3)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Feature selection and dimension reduction for single-cell RNA-Seq based on a multinomial model.\n \n \n \n \n\n\n \n Townes, F. W.; Hicks, S. C.; Aryee, M. J.; and Irizarry, R. A.\n\n\n \n\n\n\n Genome Biology, 20(1): 295. December 2019.\n \n\n\n\n
\n\n\n\n \n \n $\"Feature$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

% Townes, Hicks, Aryee, Irizarry (2019), Genome Biology 20(1): multinomial methods and GLM-PCA for scRNA-Seq.
% `pages` holds the article number (295).
@article{townes_feature_2019,
  author = {Townes, F. William and Hicks, Stephanie C. and Aryee, Martin J. and Irizarry, Rafael A.},
  title = {Feature selection and dimension reduction for single-cell {RNA}-{Seq} based on a multinomial model},
  journal = {Genome Biology},
  volume = {20},
  number = {1},
  pages = {295},
  month = dec,
  year = {2019},
  issn = {1474-760X},
  doi = {10.1186/s13059-019-1861-6},
  url = {https://doi.org/10.1186/s13059-019-1861-6},
  urldate = {2020-01-02},
  copyright = {All rights reserved},
  abstract = {Single-cell RNA-Seq (scRNA-Seq) profiles gene expression of individual cells. Recent scRNA-Seq datasets have incorporated unique molecular identifiers (UMIs). Using negative controls, we show UMI counts follow multinomial sampling with no zero inflation. Current normalization procedures such as log of counts per million and feature selection by highly variable genes produce false variability in dimension reduction. We propose simple multinomial methods, including generalized principal component analysis (GLM-PCA) for non-normal distributions, and feature selection using deviance. These methods outperform the current practice in a downstream clustering assessment using ground truth datasets.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Generalized Principal Component Analysis.\n \n \n \n \n\n\n \n Townes, F. W.\n\n\n \n\n\n\n arXiv:1907.02647 [cs, stat]. July 2019.\n arXiv: 1907.02647\n\n\n\n
\n\n\n\n \n \n $\"Generalized$ Paper\n \n \n\n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n\n\n\n

\n

% Townes (2019), arXiv preprint deriving GLM-PCA.
% Typed as misc because there is no journal; the arXiv identifier is carried by the eprint fields.
% The title is stored in Title Case without per-word braces so the bibliography style controls casing.
@misc{townes_generalized_2019,
  author = {Townes, F. William},
  title = {Generalized Principal Component Analysis},
  month = jul,
  year = {2019},
  eprint = {1907.02647},
  archiveprefix = {arXiv},
  primaryclass = {cs},
  url = {http://arxiv.org/abs/1907.02647},
  urldate = {2019-07-22},
  copyright = {All rights reserved},
  keywords = {Computer Science - Machine Learning, Statistics - Machine Learning},
  abstract = {Generalized principal component analysis (GLM-PCA) facilitates dimension reduction of non-normally distributed data. We provide a detailed derivation of GLM-PCA with a focus on optimization. We also demonstrate how to incorporate covariates, and suggest post-processing transformations to improve interpretability of latent factors.},
}

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Thermal Preference and Species Range in Mountaintop Salamanders and Their Widespread Competitors.\n \n \n \n \n\n\n \n Marsh, D. M.; Townes, F. W.; Cotter, K. M.; Farroni, K.; McCreary, K. L.; Petry, R. L.; and Tilghman, J. M.\n\n\n \n\n\n\n Journal of Herpetology, 53(2): 96–103. May 2019.\n \n\n\n\n
\n\n\n\n \n \n $\"Thermal$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

@article{marsh_thermal_2019,\n\ttitle = {Thermal {Preference} and {Species} {Range} in {Mountaintop} {Salamanders} and {Their} {Widespread} {Competitors}},\n\tvolume = {53},\n\tcopyright = {All rights reserved},\n\tissn = {0022-1511},\n\turl = {https://www.journalofherpetology.org/doi/abs/10.1670/18-110},\n\tdoi = {10.1670/18-110},\n\tabstract = {Temperature tolerance can be a critical factor in determining amphibian species range over both latitudinal and elevational gradients; however, range limits across latitudes and elevations are not always congruent. For some mountaintop endemic and widespread salamanders in the Southern Appalachians, elevational distributions suggest the hypothesis that mountaintop species should be more cold tolerant, whereas latitudinal patterns are consistent with the hypothesis that widespread species should be more cold tolerant or tolerate a wider range of temperatures. We tested these hypotheses with year-round surveys of two mountaintop endemic species, Big Levels salamanders (Plethodon sherando) and Peaks of Otter salamanders (Plethodon hubrichti), at high- and low-elevation sites where they overlap with the widespread Eastern Redback salamander (Plethodon cinereus). We fit hierarchical Bayesian models to salamander surface counts across natural variation in soil temperature and moisture to compare temperatures that maximized surface counts (“peak activity temperature”) and the range of temperatures at which each species was active (“activity window”). We found that P. sherando and P. cinereus showed similar peak activity temperatures in areas of overlap, though P. sherando had a wider activity window as compared to P. cinereus. For P. hubrichti, we found a similar to somewhat higher peak activity temperature compared to P. cinereus, though P. cinereus had a wider activity window. We found no consistent differences in responses to soil moisture across species pairs. 
Our results suggest that elevational zonation in salamanders can result from a variety of processes and may not always reflect differences in relative temperature preferences.},\n\tnumber = {2},\n\turldate = {2019-07-22},\n\tjournal = {Journal of Herpetology},\n\tauthor = {Marsh, David M. and Townes, F. William and Cotter, Kerry M. and Farroni, Kara and McCreary, Kathryn L. and Petry, Rachael L. and Tilghman, Joseph M.},\n\tmonth = may,\n\tyear = {2019},\n\tpages = {96--103},\n}\n\n

\n

\n\n\n\n\n\n

\n

\n \n 2018\n \n \n (2)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Family-based tests for associating haplotypes with general phenotype data.\n \n \n \n \n\n\n \n Hecker, J.; Xu, X.; Townes, F. W.; Fier, H. L.; Corcoran, C.; Laird, N.; and Lange, C.\n\n\n \n\n\n\n Genetic Epidemiology, 42(1): 123–126. 2018.\n \n\n\n\n
\n\n\n\n \n \n $\"Family-based$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

@article{hecker_family-based_2018,\n\ttitle = {Family-based tests for associating haplotypes with general phenotype data},\n\tvolume = {42},\n\tcopyright = {© 2017 WILEY PERIODICALS, INC.},\n\tissn = {1098-2272},\n\turl = {https://onlinelibrary.wiley.com/doi/abs/10.1002/gepi.22094},\n\tdoi = {10.1002/gepi.22094},\n\tabstract = {For family-based association studies, Horvath et al. proposed an algorithm for the association analysis between haplotypes and arbitrary phenotypes when the phase of the haplotypes is unknown, that is, genotype data is given. Their approach to haplotype analysis maintains the original features of the TDT/FBAT-approach, that is, complete robustness against genetic confounding and misspecification of the phenotype. The algorithm has been implemented in the FBAT and PBAT software package and has been used in numerous substantive manuscripts. Here, we propose a simplification of the original algorithm that maintains the original approach but reduces the computational burden of the approach substantially and gives valuable insights regarding the conditional distribution. With the modified algorithm, the application to whole-genome sequencing (WGS) studies becomes feasible; for example, in sliding window approaches or spatial-clustering approaches. The reduction of the computational burden that our modification provides is especially dramatic when both parental genotypes are missing. For example, for eight variants and 441 nuclear families with mostly offspring-only families, in a WGS study at the APOE locus, the running time decreased from approximately 21 hr for the original algorithm to 0.11 sec after our modification.},\n\tlanguage = {en},\n\tnumber = {1},\n\turldate = {2019-11-01},\n\tjournal = {Genetic Epidemiology},\n\tauthor = {Hecker, Julian and Xu, Xin and Townes, F. 
William and Fier, Heide Loehlein and Corcoran, Chris and Laird, Nan and Lange, Christoph},\n\tyear = {2018},\n\tkeywords = {FBAT, admixture, candidate region, whole-genome sequencing},\n\tpages = {123--126},\n}\n\n

\n

\n\n\n\n\n\n

\n\n\n

\n \n\n \n \n \n \n \n \n Missing data and technical variability in single-cell RNA-sequencing experiments.\n \n \n \n \n\n\n \n Hicks, S. C.; Townes, F. W.; Teng, M.; and Irizarry, R. A.\n\n\n \n\n\n\n Biostatistics, 19(4): 562–578. October 2018.\n \n\n\n\n
\n\n\n\n \n \n $\"Missing$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

@article{hicks_missing_2018,\n\ttitle = {Missing data and technical variability in single-cell {RNA}-sequencing experiments},\n\tvolume = {19},\n\tcopyright = {All rights reserved},\n\tissn = {1465-4644},\n\turl = {https://academic.oup.com/biostatistics/article/19/4/562/4599254},\n\tdoi = {10.1093/biostatistics/kxx053},\n\tabstract = {SUMMARY.  Until recently, high-throughput gene expression technology, such as RNA-Sequencing (RNA-seq) required hundreds of thousands of cells to produce reliab},\n\tlanguage = {en},\n\tnumber = {4},\n\turldate = {2019-04-24},\n\tjournal = {Biostatistics},\n\tauthor = {Hicks, Stephanie C. and Townes, F. William and Teng, Mingxiang and Irizarry, Rafael A.},\n\tmonth = oct,\n\tyear = {2018},\n\tpages = {562--578},\n}\n

\n

\n\n\n\n\n\n

\n

\n \n 2017\n \n \n (1)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Varying-Censoring Aware Matrix Factorization for Single Cell RNA-Sequencing.\n \n \n \n \n\n\n \n Townes, F. W.; Hicks, S. C.; Aryee, M. J.; and Irizarry, R. A.\n\n\n \n\n\n\n bioRxiv,166736. July 2017.\n \n\n\n\n
\n\n\n\n \n \n $\"Varying-Censoring$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

@article{townes_varying-censoring_2017,\n\ttitle = {Varying-{Censoring} {Aware} {Matrix} {Factorization} for {Single} {Cell} {RNA}-{Sequencing}},\n\tcopyright = {© 2017, Posted by Cold Spring Harbor Laboratory. This pre-print is available under a Creative Commons License (Attribution-NonCommercial-NoDerivs 4.0 International), CC BY-NC-ND 4.0, as described at http://creativecommons.org/licenses/by-nc-nd/4.0/},\n\turl = {https://www.biorxiv.org/content/10.1101/166736v1},\n\tdoi = {10.1101/166736},\n\tabstract = {Single cell RNA-Seq (scRNA-Seq) has become the most widely used high-throughput technology for gene expression profiling of individual cells. The potential of being able to measure cell-to-cell variability at a high-dimensional genomic scale opens numerous new lines of investigation in basic and clinical research. For example, by identifying groups of cells with expression profiles unlike those observed in cells with known phenotypes, new cell types may be discovered. Dimension reduction followed by unsupervised clustering are the quantitative approaches typically used to facilitate such discoveries. However, a challenge for this approach is that most scRNA-Seq datasets are sparse, with the percentages of measurements reported as zero ranging from 35\\% to 99\\% across cells, and these zeros are partially explained by experimental inefficiencies that lead to censored data. Furthermore, the observed across-cell differences in the percentages of zeros are partly due to technical artifacts rather than biological differences. Unfortunately, standard dimension reduction approaches treat these censored values as true zeros, which leads to the identification of distorted low-dimensional factors. When these factors are used for clustering, the distortion leads to incorrect identification of biological groups. 
Here, we propose an approach that accounts for cell-specific censoring with a varying-censoring aware matrix factorization (VAMF) model that permits the identification of factors in the presence of the above described systematic bias. We demonstrate the advantages of our approach on published scRNA-Seq data and confirm these on simulated data.},\n\tlanguage = {en},\n\turldate = {2019-05-02},\n\tjournal = {bioRxiv},\n\tauthor = {Townes, F. William and Hicks, Stephanie C. and Aryee, Martin J. and Irizarry, Rafael A.},\n\tmonth = jul,\n\tyear = {2017},\n\tpages = {166736},\n}\n\n

\n

\n\n\n\n\n\n

\n

\n \n 2016\n \n \n (1)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Predicting Subnational Ebola Virus Disease Epidemic Dynamics from Sociodemographic Indicators.\n \n \n \n \n\n\n \n Valeri, L.; Patterson-Lomba, O.; Gurmu, Y.; Ablorh, A.; Bobb, J.; Townes, F. W.; and Harling, G.\n\n\n \n\n\n\n PLOS ONE, 11(10): e0163544. October 2016.\n \n\n\n\n
\n\n\n\n \n \n $\"Predicting$ Paper\n \n \n\n \n \n doi\n \n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n\n\n\n

\n

@article{valeri_predicting_2016,\n\ttitle = {Predicting {Subnational} {Ebola} {Virus} {Disease} {Epidemic} {Dynamics} from {Sociodemographic} {Indicators}},\n\tvolume = {11},\n\tcopyright = {All rights reserved},\n\tissn = {1932-6203},\n\turl = {https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0163544},\n\tdoi = {10.1371/journal.pone.0163544},\n\tabstract = {Background The recent Ebola virus disease (EVD) outbreak in West Africa has spread wider than any previous human EVD epidemic. While individual-level risk factors that contribute to the spread of EVD have been studied, the population-level attributes of subnational regions associated with outbreak severity have not yet been considered. Methods To investigate the area-level predictors of EVD dynamics, we integrated time series data on cumulative reported cases of EVD from the World Health Organization and covariate data from the Demographic and Health Surveys. We first estimated the early growth rates of epidemics in each second-level administrative district (ADM2) in Guinea, Sierra Leone and Liberia using exponential, logistic and polynomial growth models. We then evaluated how these growth rates, as well as epidemic size within ADM2s, were ecologically associated with several demographic and socio-economic characteristics of the ADM2, using bivariate correlations and multivariable regression models. Results The polynomial growth model appeared to best fit the ADM2 epidemic curves, displaying the lowest residual standard error. Each outcome was associated with various regional characteristics in bivariate models, however in stepwise multivariable models only mean education levels were consistently associated with a worse local epidemic. 
Discussion By combining two common methods—estimation of epidemic parameters using mathematical models, and estimation of associations using ecological regression models—we identified some factors predicting rapid and severe EVD epidemics in West African subnational regions. While care should be taken interpreting such results as anything more than correlational, we suggest that our approach of using data sources that were publicly available in advance of the epidemic or in real-time provides an analytic framework that may assist countries in understanding the dynamics of future outbreaks as they occur.},\n\tlanguage = {en},\n\tnumber = {10},\n\turldate = {2019-11-01},\n\tjournal = {PLOS ONE},\n\tauthor = {Valeri, Linda and Patterson-Lomba, Oscar and Gurmu, Yared and Ablorh, Akweley and Bobb, Jennifer and Townes, F. William and Harling, Guy},\n\tmonth = oct,\n\tyear = {2016},\n\tkeywords = {Curve fitting, Epidemiological statistics, Epidemiology, Guinea, Infectious disease epidemiology, Liberia, Polynomials, Sierra Leone},\n\tpages = {e0163544},\n}\n\n

\n

\n\n\n\n\n\n

\n

\n \n 2010\n \n \n (1)\n \n \n

\n

\n \n \n

\n \n\n \n \n \n \n \n \n Seed dispersal of the genus Leea in forest patches of Bataan, Philippines.\n \n \n \n \n\n\n \n Townes, W.\n\n\n \n\n\n\n Ecotropica, 16(2): 145–148. 2010.\n \n\n\n\n
\n\n\n\n \n \n $\"Seed$ Paper\n \n \n\n \n\n \n link\n \n \n\n bibtex\n \n\n \n \n \n abstract \n \n\n \n\n \n \n \n \n \n \n \n\n \n \n \n\n\n\n

\n

@article{townes_seed_2010,\n\ttitle = {Seed dispersal of the genus {Leea} in forest patches of {Bataan}, {Philippines}},\n\tvolume = {16},\n\tcopyright = {All rights reserved},\n\tissn = {0949-3026},\n\turl = {https://www.cabdirect.org/cabdirect/abstract/20113007384},\n\tabstract = {An account is given on the seed dispersal of Leea trees located along two intermittent streams in Morong, Bataan, Philippines. Observations revealed that none of the 12 fruiting trees monitored were flowering, preventing species identification. However, habitat suggested they were likely Leea guineensis. Philippine bulbuls (Hypsipetes philippinus) were found feeding on Leea fruits along with...},\n\tlanguage = {English},\n\tnumber = {2},\n\turldate = {2019-11-01},\n\tjournal = {Ecotropica},\n\tauthor = {Townes, W.},\n\tyear = {2010},\n\tpages = {145--148},\n}\n\n

\n

\n\n\n\n\n\n

\n