@article{mahendran_quantitative_2022, title = {Quantitative methods for descriptive intersectional analysis with binary health outcomes}, volume = {17}, issn = {2352-8273}, url = {https://www.sciencedirect.com/science/article/pii/S2352827322000118}, doi = {10.1016/j.ssmph.2022.101032}, abstract = {Intersectionality recognizes that in the context of sociohistorically shaped structural power relations, an individual's multiple social positions or identities (e.g., gender, ethnicity) can interact to affect health-related outcomes. Despite limited methodological guidance, intersectionality frameworks have increasingly been incorporated into epidemiological studies, both to describe health disparities and to examine their causes. This study aimed to advance methods in intersectional estimation of binary outcomes in descriptive health disparities research through evaluation of 7 potentially intersectional data analysis methods: cross-classification, regression with interactions, multilevel analysis of individual heterogeneity (MAIHDA), and decision trees (CART, CTree, CHAID, random forest). Accuracy of estimated intersection-specific outcome prevalence was evaluated across 192 intersections using simulated data scenarios. For comparison we included a non-intersectional main effects regression. We additionally assessed variable selection performance amongst decision trees. Example analyses using National Health and Nutrition Examination Study data illustrated differences in results between methods. At larger sample sizes, all methods except for CART performed better than non-intersectional main effects regression. In smaller samples, MAIHDA was the most accurate method but showed no advantage over main effects regression, while random forest, cross-classification, and saturated regression were the least accurate, and CTree and CHAID performed moderately well. CART performed poorly for estimation and variable selection. 
Sensitivity analyses examining the bias-variance tradeoff suggest MAIHDA as the preferred unbiased method for accurate estimation of high-dimensional intersections at smaller sample sizes. Larger sample sizes are more imperative for other methods. Results support the adoption of an intersectional approach to descriptive epidemiology.}, urldate = {2023-11-13}, journal = {SSM - Population Health}, author = {Mahendran, Mayuri and Lizotte, Daniel and Bauer, Greta R.}, month = mar, year = {2022}, keywords = {Biostatistics, Epidemiological studies, Health equity, Intersectionality, Research design}, pages = {101032}, }
@article{erickson_moving_2020, title = {Moving beyond p {$<$} 0.05 in ecotoxicology: {A} guide for practitioners}, volume = {39}, abstract = {Statistical inferences play a critical role in ecotoxicology. Historically, Null Hypothesis Significance Testing (NHST) has been the dominant method for inference in ecotoxicology. As a brief and informal definition of NHST, researchers compare (or "test") an experimental treatment or observation against a hypothesis of no relationship (the "null hypothesis") using the collected data to see if the observed values are statistically "significant" given predefined error rates. The resulting probability of observing a value equal to or greater than the observed value assuming the null hypothesis is true is the p-value. Criticisms of NHST have existed for almost a century and have recently grown to the point where statisticians, including the American Statistical Association (ASA), have felt the need to clarify the role of NHST and p-values beyond their current common use. These limitations also exist in ecotoxicology. For example, a review of the 2010 Environmental Toxicology \& Chemistry (ET\&C) volume found many authors did not correctly report p-values. We repeated this review looking at the 2019 volume of ET\&C. Incorrect reporting of p-values still occurred almost a decade later. 
Problems with NHST and p-values highlight the need for statistical inferences besides NHST, something long known in ecotoxicology and the broader scientific and statistical communities. Furthermore, concerns such as these led the Executive Director of the ASA to recommend against use of "statistical significance" in 2019. In light of these criticisms, ecotoxicologists require alternative methods. In this paper, we describe some alternative methods including confidence intervals, regression analysis, dose-response curves, Bayes factors, survival analysis, and model selection. Lastly, we provide insights for what ecotoxicology might look like in a post-p-value world. This article is protected by copyright. All rights reserved.}, number = {9}, journal = {Environmental Toxicology and Chemistry}, author = {Erickson, Richard A. and Rattner, Barnett A.}, year = {2020}, keywords = {biostatistics, ecotoxicology, environmental toxicology, null hypothesis significance testing, p value}, pages = {1657--1669}, }
@article{chicco_advantages_2020, title = {The advantages of the {Matthews} correlation coefficient ({MCC}) over {F1} score and accuracy in binary classification evaluation}, volume = {21}, issn = {1471-2164}, url = {https://doi.org/10.1186/s12864-019-6413-7}, doi = {10.1186/s12864-019-6413-7}, abstract = {To evaluate binary classifications and their confusion matrices, scientific researchers can employ several statistical rates, accordingly to the goal of the experiment they are investigating. Despite being a crucial issue in machine learning, no widespread consensus has been reached on a unified elective chosen measure yet. Accuracy and F1 score computed on confusion matrices have been (and still are) among the most popular adopted metrics in binary classification tasks. However, these statistical measures can dangerously show overoptimistic inflated results, especially on imbalanced datasets.}, number = {1}, urldate = {2022-02-21}, journal = {BMC Genomics}, author = {Chicco, Davide and Jurman, Giuseppe}, month = jan, year = {2020}, keywords = {Accuracy, Binary classification, Biostatistics, Confusion matrices, Dataset imbalance, F1 score, Genomics, Machine learning, Matthews correlation coefficient}, pages = {6}, }
@article{ranganathan_understanding_2018, title = {Understanding the properties of diagnostic tests - {Part} 2: {Likelihood} ratios}, volume = {9}, issn = {2229-3485}, shorttitle = {Understanding the properties of diagnostic tests - {Part} 2}, doi = {10.4103/picr.PICR_41_18}, abstract = {Diagnostic tests are used to identify subjects with and without disease. In a previous article in this series, we examined some attributes of diagnostic tests - sensitivity, specificity, and predictive values. In this second article, we look at likelihood ratios, which are useful for the interpretation of diagnostic test results in everyday clinical practice.}, language = {eng}, number = {2}, journal = {Perspectives in Clinical Research}, author = {Ranganathan, Priya and Aggarwal, Rakesh}, year = {2018}, pmid = {29862204}, pmcid = {PMC5950618}, keywords = {Biostatistics, diagnostic tests, likelihood ratios}, pages = {99--102}, }
@article{latimer_assessing_2018-1, title = {Assessing methods for dealing with treatment switching in clinical trials: {A} follow-up simulation study}, volume = {27}, issn = {1477-0334}, shorttitle = {Assessing methods for dealing with treatment switching in clinical trials}, doi = {10.1177/0962280216642264}, abstract = {When patients randomised to the control group of a randomised controlled trial are allowed to switch onto the experimental treatment, intention-to-treat analyses of the treatment effect are confounded because the separation of randomised groups is lost. Previous research has investigated statistical methods that aim to estimate the treatment effect that would have been observed had this treatment switching not occurred and has demonstrated their performance in a limited set of scenarios. Here, we investigate these methods in a new range of realistic scenarios, allowing conclusions to be made based upon a broader evidence base. We simulated randomised controlled trials incorporating prognosis-related treatment switching and investigated the impact of sample size, reduced switching proportions, disease severity, and alternative data-generating models on the performance of adjustment methods, assessed through a comparison of bias, mean squared error, and coverage, related to the estimation of true restricted mean survival in the absence of switching in the control group. Rank preserving structural failure time models, inverse probability of censoring weights, and two-stage methods consistently produced less bias than the intention-to-treat analysis. The switching proportion was confirmed to be a key determinant of bias: sample size and censoring proportion were relatively less important. It is critical to determine the size of the treatment effect in terms of an acceleration factor (rather than a hazard ratio) to provide information on the likely bias associated with rank-preserving structural failure time model adjustments. 
In general, inverse probability of censoring weight methods are more volatile than other adjustment methods.}, language = {eng}, number = {3}, journal = {Statistical Methods in Medical Research}, author = {Latimer, N. R. and Abrams, K. R. and Lambert, P. C. and Morden, J. P. and Crowther, M. J.}, year = {2018}, pmid = {27114326}, keywords = {Biostatistics, Clinical Trial Protocols as Topic, Computer Simulation, Cross-Over Studies, Data Interpretation, Follow-Up Studies, health technology assessment, Humans, Kaplan-Meier Estimate, Models, oncology, overall survival, prediction, Proportional Hazards Models, Randomized Controlled Trials as Topic, Sample Size, Statistical, survival analysis, Survival Analysis, time-to-event outcomes, treatment crossover, Treatment switching}, pages = {765--784}, }
@article{ranganathan_common_2017, title = {Common pitfalls in statistical analysis: {Logistic} regression}, volume = {8}, issn = {2229-5488}, url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC5543767/?report=abstract}, doi = {10.4103/picr.PICR_87_17}, abstract = {Logistic regression analysis is a statistical technique to evaluate the relationship between various predictor variables (either categorical or continuous) and an outcome which is binary (dichotomous). In this article, we discuss logistic regression analysis and the limitations of this technique.}, number = {3}, urldate = {2020-08-12}, journal = {Perspectives in Clinical Research}, author = {Ranganathan, Priya and Pramesh, C. and Aggarwal, Rakesh}, month = jul, year = {2017}, pmid = {28828311}, note = {Publisher: Medknow Publications}, keywords = {Biostatistics, logistic models, regression analysis}, pages = {148--151}, }
@article{latimer_adjusting_2017-1, title = {Adjusting for treatment switching in randomised controlled trials - {A} simulation study and a simplified two-stage method}, volume = {26}, issn = {1477-0334}, doi = {10.1177/0962280214557578}, abstract = {Estimates of the overall survival benefit of new cancer treatments are often confounded by treatment switching in randomised controlled trials (RCTs) - whereby patients randomised to the control group are permitted to switch onto the experimental treatment upon disease progression. In health technology assessment, estimates of the unconfounded overall survival benefit associated with the new treatment are needed. Several switching adjustment methods have been advocated in the literature, some of which have been used in health technology assessment. However, it is unclear which methods are likely to produce least bias in realistic RCT-based scenarios. We simulated RCTs in which switching, associated with patient prognosis, was permitted. Treatment effect size and time dependency, switching proportions and disease severity were varied across scenarios. We assessed the performance of alternative adjustment methods based upon bias, coverage and mean squared error, related to the estimation of true restricted mean survival in the absence of switching in the control group. We found that when the treatment effect was not time-dependent, rank preserving structural failure time models (RPSFTM) and iterative parameter estimation methods produced low levels of bias. However, in the presence of a time-dependent treatment effect, these methods produced higher levels of bias, similar to those produced by an inverse probability of censoring weights method. The inverse probability of censoring weights and structural nested models produced high levels of bias when switching proportions exceeded 85\%. 
A simplified two-stage Weibull method produced low bias across all scenarios and provided the treatment switching mechanism is suitable, represents an appropriate adjustment method.}, language = {eng}, number = {2}, journal = {Statistical Methods in Medical Research}, author = {Latimer, N. R. and Abrams, K. R. and Lambert, P. C. and Crowther, M. J. and Wailoo, A. J. and Morden, J. P. and Akehurst, R. L. and Campbell, M. J.}, year = {2017}, pmid = {25416688}, keywords = {Algorithms, Biomedical, Biostatistics, Computer Simulation, Cross-Over Studies, Disease Progression, health technology assessment, Humans, inverse probability of censoring weights, Models, prediction, Randomized Controlled Trials as Topic, Statistical, Survival Analysis, Technology Assessment, time-to-event outcomes, treatment crossover, treatment switching}, pages = {724--751}, }
@article{xia_hypothesis_2017, title = {Hypothesis {Testing} and {Statistical} {Analysis} of {Microbiome}}, volume = {4}, issn = {2352-4820}, doi = {10.1016/j.gendis.2017.06.001}, abstract = {After the initiation of Human Microbiome Project in 2008, various biostatistic and bioinformatic tools for data analysis and computational methods have been developed and applied to microbiome studies. In this review and perspective, we discuss the research and statistical hypotheses in gut microbiome studies, focusing on mechanistic concepts that underlie the complex relationships among host, microbiome, and environment. We review the current available statistic tools and highlight recent progress of newly developed statistical methods and models. Given the current challenges and limitations in biostatistic approaches and tools, we discuss the future direction in developing statistical methods and models for the microbiome studies.}, language = {eng}, number = {3}, journal = {Genes \& Diseases}, author = {Xia, Yinglin and Sun, Jun}, month = sep, year = {2017}, pmid = {30197908}, pmcid = {PMC6128532}, note = {Number: 3}, keywords = {IBD, Vitamin D receptor, bioinformatics, biostatistics, cancer, diet, dysbiosis, hypothesis testing, inflammation, microbiome, obesity, statistical methods and models}, pages = {138--148}, }
@article{crowther_joint_2016-1, title = {Joint modelling of longitudinal and survival data: incorporating delayed entry and an assessment of model misspecification}, volume = {35}, issn = {1097-0258}, shorttitle = {Joint modelling of longitudinal and survival data}, doi = {10.1002/sim.6779}, abstract = {A now common goal in medical research is to investigate the inter-relationships between a repeatedly measured biomarker, measured with error, and the time to an event of interest. This form of question can be tackled with a joint longitudinal-survival model, with the most common approach combining a longitudinal mixed effects model with a proportional hazards survival model, where the models are linked through shared random effects. In this article, we look at incorporating delayed entry (left truncation), which has received relatively little attention. The extension to delayed entry requires a second set of numerical integration, beyond that required in a standard joint model. We therefore implement two sets of fully adaptive Gauss-Hermite quadrature with nested Gauss-Kronrod quadrature (to allow time-dependent association structures), conducted simultaneously, to evaluate the likelihood. We evaluate fully adaptive quadrature compared with previously proposed non-adaptive quadrature through a simulation study, showing substantial improvements, both in terms of minimising bias and reducing computation time. We further investigate, through simulation, the consequences of misspecifying the longitudinal trajectory and its impact on estimates of association. Our scenarios showed the current value association structure to be very robust, compared with the rate of change that we found to be highly sensitive showing that assuming a simpler trend when the truth is more complex can lead to substantial bias. 
With emphasis on flexible parametric approaches, we generalise previous models by proposing the use of polynomials or splines to capture the longitudinal trend and restricted cubic splines to model the baseline log hazard function. The methods are illustrated on a dataset of breast cancer patients, modelling mammographic density jointly with survival, where we show how to incorporate density measurements prior to the at-risk period, to make use of all the available information. User-friendly Stata software is provided.}, language = {eng}, number = {7}, journal = {Statistics in Medicine}, author = {Crowther, M. J. and Andersson, T. M. L. and Lambert, P. C. and Abrams, K. R. and Humphreys, K.}, month = mar, year = {2016}, pmid = {26514596}, pmcid = {PMC5019272}, keywords = {adaptive Gauss-Hermite quadrature, Biostatistics, Breast Density, Breast Neoplasms, Computer Simulation, delayed entry, Female, Humans, joint modelling, left truncation, Likelihood Functions, Longitudinal Studies, mixed effects, Models, oncology, Proportional Hazards Models, Statistical, survival analysis, Survival Analysis}, pages = {1193--1209}, }
@article{bobb_bayesian_2015, title = {Bayesian kernel machine regression for estimating the health effects of multi-pollutant mixtures}, volume = {16}, issn = {1468-4357}, url = {https://pubmed.ncbi.nlm.nih.gov/25532525/}, doi = {10.1093/BIOSTATISTICS/KXU058}, abstract = {Because humans are invariably exposed to complex chemical mixtures, estimating the health effects of multi-pollutant exposures is of critical concern in environmental epidemiology, and to regulatory agencies such as the U.S. Environmental Protection Agency. However, most health effects studies focus on single agents or consider simple two-way interaction models, in part because we lack the statistical methodology to more realistically capture the complexity of mixed exposures. We introduce Bayesian kernel machine regression (BKMR) as a new approach to study mixtures, in which the health outcome is regressed on a flexible function of the mixture (e.g. air pollution or toxic waste) components that is specified using a kernel function. In high-dimensional settings, a novel hierarchical variable selection approach is incorporated to identify important mixture components and account for the correlated structure of the mixture. Simulation studies demonstrate the success of BKMR in estimating the exposure-response function and in identifying the individual components of the mixture responsible for health effects. We demonstrate the features of the method through epidemiology and toxicology applications.}, number = {3}, urldate = {2021-12-16}, journal = {Biostatistics (Oxford, England)}, author = {Bobb, Jennifer F. and Valeri, Linda and Claus Henn, Birgit and Christiani, David C. and Wright, Robert O. and Mazumdar, Maitreyi and Godleski, John J. 
and Coull, Brent A.}, month = sep, year = {2015}, pmid = {25532525}, pmcid = {PMC5963470}, note = {Publisher: Biostatistics}, keywords = {Animals, Bangladesh, Bayes Theorem, Biostatistics, Child, Developmental Disabilities, Dogs, Environmental Health, Environmental Pollutants, Female, Hemodynamics, Humans, Infant, Machine Learning, Metals, Models, Statistical, Neurodevelopmental Disorders, Normal Distribution, Pregnancy, Preschool, Regression Analysis}, pages = {493--508}, }
@article{head_extent_2015, title = {The {Extent} and {Consequences} of {P}-{Hacking} in {Science}}, volume = {13}, issn = {1545-7885}, url = {http://dx.plos.org/10.1371/journal.pbio.1002106}, doi = {10.1371/journal.pbio.1002106}, abstract = {Publication bias resulting from so-called "p-hacking" is pervasive throughout the life sciences; however, its effects on general conclusions made from the literature appear to be weak.}, language = {English}, number = {3}, journal = {PLOS Biology}, author = {Head, Megan L and Holman, Luke and Lanfear, Rob and Kahn, Andrew T and Jennions, Michael D}, month = mar, year = {2015}, keywords = {biostatistics}, pages = {e1002106}, }
@article{crowther_flexible_2012-1, title = {Flexible parametric joint modelling of longitudinal and survival data}, volume = {31}, issn = {1097-0258}, doi = {10.1002/sim.5644}, abstract = {The joint modelling of longitudinal and survival data is a highly active area of biostatistical research. The submodel for the longitudinal biomarker usually takes the form of a linear mixed effects model. We describe a flexible parametric approach for the survival submodel that models the log baseline cumulative hazard using restricted cubic splines. This approach overcomes limitations of standard parametric choices for the survival submodel, which can lack the flexibility to effectively capture the shape of the underlying hazard function. Numerical integration techniques, such as Gauss-Hermite quadrature, are usually required to evaluate both the cumulative hazard and the overall joint likelihood; however, by using a flexible parametric model, the cumulative hazard has an analytically tractable form, providing considerable computational benefits. We conduct an extensive simulation study to assess the proposed model, comparing it with a B-spline formulation, illustrating insensitivity of parameter estimates to the baseline cumulative hazard function specification. Furthermore, we compare non-adaptive and fully adaptive quadrature, showing the superiority of adaptive quadrature in evaluating the joint likelihood. We also describe a useful technique to simulate survival times from complex baseline hazard functions and illustrate the methods using an example data set investigating the association between longitudinal prothrombin index and survival of patients with liver cirrhosis, showing greater flexibility and improved stability with fewer parameters under the proposed model compared with the B-spline approach. We provide user-friendly Stata software.}, language = {eng}, number = {30}, journal = {Statistics in Medicine}, author = {Crowther, M. J. and Abrams, K. R. and Lambert, P. 
C.}, month = dec, year = {2012}, pmid = {23037571}, keywords = {Anti-Inflammatory Agents, Biomarkers, Biostatistics, Computer Simulation, Humans, Kaplan-Meier Estimate, Likelihood Functions, Linear Models, Liver Cirrhosis, Longitudinal Studies, Prednisone, Prognosis, Proportional Hazards Models, Randomized Controlled Trials as Topic, Time Factors}, pages = {4456--4471}, }
@book{lachin_biostatistical_2011, address = {Hoboken, N.J}, edition = {2nd ed}, series = {Wiley series in probability and statistics}, title = {Biostatistical methods: the assessment of relative risks}, isbn = {978-0-470-50822-0}, shorttitle = {Biostatistical methods}, abstract = {"This book focuses on the comparison, contrast, and assessment of risks on the basis of clinical investigations. It develops basic concepts as well as deriving biostatistical methods through both the application of classical mathematical statistical tools and more modern likelihood-based theories. The first half of the book presents methods for the analysis of single and multiple 2x2 tables for cross-sectional, prospective, and retrospective (case-control) sampling, with and without matching using fixed and two-stage random effects models. The text then moves on to present a more modern likelihood- or model-based approach, which includes unconditional and conditional logistic regression; the analysis of count data and the Poisson regression model; the analysis of event time data, including the proportional hazards and multiplicative intensity models; and elements of categorical data analysis (expanded in this edition). SAS subroutines are both showcased in the text and embellished online by way of a dedicated author website. The book contains a technical, but accessible appendix that presents the core mathematical statistical theory used for the development of classical and modern statistical methods"--Provided by publisher}, publisher = {Wiley}, author = {Lachin, John M.}, year = {2011}, note = {OCLC: ocn613993075}, keywords = {Biostatistics, Data Interpretation, Statistical, Health risk assessment, Medical statistics, Medicine, Models, Statistical, Research Statistical methods, Risk Assessment, Statistical methods, methods}, }